diff options
Diffstat (limited to 'backend/tolData/eol')
| -rw-r--r-- | backend/tolData/eol/README.md | 6 | ||||
| -rwxr-xr-x | backend/tolData/eol/genImagesListDb.py | 36 | ||||
| -rwxr-xr-x | backend/tolData/eol/genImagesListDb.sh | 12 |
3 files changed, 39 insertions, 15 deletions
diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md index 8c527a8..1a9dbdf 100644 --- a/backend/tolData/eol/README.md +++ b/backend/tolData/eol/README.md @@ -3,7 +3,7 @@ This directory holds files obtained from/using the [Encyclopedia of Life](https: # Name Data Files - vernacularNames.csv <br> Obtained from <https://opendata.eol.org/dataset/vernacular-names> on 24/04/2022 (last updated on 27/10/2020). - Contains alternative-name data from EOL. + Contains alternative-node-names data from EOL. # Image Metadata Files - imagesList.tgz <br> @@ -11,10 +11,10 @@ This directory holds files obtained from/using the [Encyclopedia of Life](https: Contains metadata for images from EOL. - imagesList/ <br> Extracted from imagesList.tgz. -- genImagesListDb.sh <br> +- genImagesListDb.py <br> Creates a database, and imports imagesList/*.csv files into it. - imagesList.db <br> - Created by running genImagesListDb.sh <br> + Created by running genImagesListDb.py <br> Tables: <br> - `images`: `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT` diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py new file mode 100755 index 0000000..32df10a --- /dev/null +++ b/backend/tolData/eol/genImagesListDb.py @@ -0,0 +1,36 @@ +#!/usr/bin/python3 + +import sys, os, re +import csv +import sqlite3 + +usageInfo = f""" +Usage: {sys.argv[0]} + +Generates a sqlite db from a directory of CSV files holding EOL image data +""" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +imagesListDir = "imagesList/" +dbFile = "imagesList.db" + +print("Creating database") +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE images" \ + " (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)") +print("Reading CSV files") +csvFilenames = os.listdir(imagesListDir) +for filename in csvFilenames: + print(f"Processing {imagesListDir}{filename}") + with open(imagesListDir + filename, newline="") as file: + for (contentId, pageId, sourceUrl, copyUrl, license, owner) in csv.reader(file): + if re.match(r"^[a-zA-Z]", contentId): # Skip header line + continue + dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", + (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner)) +print("Closing database") +dbCon.commit() +dbCon.close() diff --git a/backend/tolData/eol/genImagesListDb.sh b/backend/tolData/eol/genImagesListDb.sh deleted file mode 100755 index 87dd840..0000000 --- a/backend/tolData/eol/genImagesListDb.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -e - -# Combine CSV files into one, skipping header lines -cat imagesList/media_*_{1..58}.csv | tail -n +2 > imagesList.csv -# Create database, and import the CSV file -sqlite3 imagesList.db <<END -CREATE TABLE images ( - content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT); -.mode csv -.import 'imagesList.csv' images -END |
