diff options
Diffstat (limited to 'backend/tolData/eol')
| -rw-r--r-- | backend/tolData/eol/README.md | 26 | ||||
| -rwxr-xr-x | backend/tolData/eol/downloadImgs.py | 164 | ||||
| -rwxr-xr-x | backend/tolData/eol/genImagesListDb.sh | 12 | ||||
| -rwxr-xr-x | backend/tolData/eol/reviewImgs.py | 211 |
4 files changed, 413 insertions, 0 deletions
diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md
new file mode 100644
index 0000000..8c527a8
--- /dev/null
+++ b/backend/tolData/eol/README.md
@@ -0,0 +1,26 @@
+This directory holds files obtained from/using the [Encyclopedia of Life](https://eol.org/).
+
+# Name Data Files
+- vernacularNames.csv <br>
+  Obtained from <https://opendata.eol.org/dataset/vernacular-names> on 24/04/2022 (last updated on 27/10/2020).
+  Contains alternative-name data from EOL.
+
+# Image Metadata Files
+- imagesList.tgz <br>
+  Obtained from <https://opendata.eol.org/dataset/images-list> on 24/04/2022 (last updated on 05/02/2020).
+  Contains metadata for images from EOL.
+- imagesList/ <br>
+  Extracted from imagesList.tgz.
+- genImagesListDb.sh <br>
+  Creates a database, and imports imagesList/*.csv files into it.
+- imagesList.db <br>
+  Created by running genImagesListDb.sh <br>
+  Tables: <br>
+  - `images`:
+    `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT`
+
+# Image Generation Files
+- downloadImgs.py <br>
+  Used to download image files into imgsForReview/.
+- reviewImgs.py <br>
+  Used to review images in imgsForReview/, moving acceptable ones into imgs/.
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
new file mode 100755
index 0000000..96bc085
--- /dev/null
+++ b/backend/tolData/eol/downloadImgs.py
@@ -0,0 +1,164 @@
+#!/usr/bin/python3
+
+import sys, re, os, random
+import bisect
+import sqlite3
+import urllib.parse, requests
+import time
+from threading import Thread, Lock
+import signal
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of EOL IDs, downloads associated images from URLs in
+an image-list database. Uses multiple downloading threads.
+
+May obtain multiple images per ID. The images will get names
+with the form 'eolId1 contentId1.ext1'.
+
+SIGINT causes the program to finish ongoing downloads and exit.
+The program can be re-run to continue downloading. It looks for
+already-downloaded files, and continues after the one with
+highest EOL ID.
+"""
+if len(sys.argv) > 1:
+    print(usageInfo, file=sys.stderr)
+    sys.exit(1)
+# In testing, this downloaded about 70k images, over a few days
+
+imagesListDb = "imagesList.db"
+def getInputEolIds():
+    " Returns the set of IDs held in the eol_ids table of ../data.db "
+    eolIds = set()
+    dbCon = sqlite3.connect("../data.db")
+    dbCur = dbCon.cursor()
+    for (eolId,) in dbCur.execute("SELECT id FROM eol_ids"):
+        eolIds.add(eolId)
+    dbCon.close()
+    return eolIds
+outDir = "imgsForReview/"
+MAX_IMGS_PER_ID = 3
+MAX_THREADS = 5
+POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
+POST_DL_DELAY_MAX = 3
+REQUEST_TIMEOUT = 60 # Seconds to wait for a server response, so a stalled connection can't hang a thread forever
+LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
+
+print("Getting input EOL IDs")
+eolIds = getInputEolIds()
+print("Getting EOL IDs to download for")
+# Get IDs from images-list db
+imgDbCon = sqlite3.connect(imagesListDb)
+imgCur = imgDbCon.cursor()
+imgListIds = set()
+for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
+    imgListIds.add(pageId)
+# Get set intersection, and sort into list
+eolIds = eolIds.intersection(imgListIds)
+eolIds = sorted(eolIds)
+print(f"Result: {len(eolIds)} EOL IDs")
+
+print("Checking output directory")
+if not os.path.exists(outDir):
+    os.mkdir(outDir)
+print("Finding next ID to download for")
+nextIdx = 0
+fileList = os.listdir(outDir)
+ids = [int(filename.split(" ")[0]) for filename in fileList]
+if len(ids) > 0:
+    # Resume after the highest downloaded-for ID. Uses bisect instead of list.index(),
+    # which raises ValueError if that ID is not in eolIds
+    nextIdx = bisect.bisect_right(eolIds, max(ids))
+if nextIdx == len(eolIds):
+    print("No IDs left. Exiting...")
+    sys.exit(0)
+
+print("Starting download threads")
+numThreads = 0
+numThreadsLock = Lock() # Guards numThreads, which is updated by both the main thread and download threads
+threadException = None # Used for ending main thread after a non-main thread exception
+# Handle SIGINT signals
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+    " Records that an interrupt occurred, and restores the previous SIGINT handler "
+    global interrupted
+    interrupted = True
+    signal.signal(signal.SIGINT, oldHandler)
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+# Function for threads to execute
+def downloadImg(url, outFile):
+    " Downloads from a URL into a file, pauses for a random delay, then decrements numThreads "
+    global numThreads, threadException
+    try:
+        response = requests.get(url, timeout=REQUEST_TIMEOUT)
+        if response.ok:
+            with open(outFile, 'wb') as file:
+                file.write(response.content)
+        else: # Avoid saving an HTTP error page as if it were an image
+            print(f"WARNING: Got HTTP status {response.status_code} for {url}. Skipping.", file=sys.stderr)
+        time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
+    except Exception as e:
+        print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
+        threadException = e
+    finally:
+        with numThreadsLock: # 'numThreads -= 1' is a read-modify-write, which threads could interleave
+            numThreads -= 1
+def waitForThreads():
+    " Blocks until every download thread has ended "
+    while numThreads > 0:
+        time.sleep(1)
+# Manage downloading
+for idx in range(nextIdx, len(eolIds)):
+    eolId = eolIds[idx]
+    # Get image urls
+    ownerSet = set() # Used to get images from different owners, for variety
+    exitLoop = False
+    query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
+    for (contentId, url, imgLicense, copyrightOwner) in imgCur.execute(query, (eolId,)):
+        if url.startswith("data/"):
+            url = "https://content.eol.org/" + url
+        urlParts = urllib.parse.urlparse(url)
+        extension = os.path.splitext(urlParts.path)[1]
+        if len(extension) <= 1:
+            print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
+            continue
+        # Check image-quantity limit
+        if len(ownerSet) == MAX_IMGS_PER_ID:
+            break
+        # Check for skip conditions
+        if re.fullmatch(LICENSE_REGEX, imgLicense) is None:
+            continue
+        if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
+            continue
+        if copyrightOwner in ownerSet:
+            continue
+        ownerSet.add(copyrightOwner)
+        # Determine output filename
+        outPath = f"{outDir}{eolId} {contentId}{extension}"
+        if os.path.exists(outPath):
+            print(f"WARNING: {outPath} already exists. Skipping download.")
+            continue
+        # Check thread limit
+        while numThreads >= MAX_THREADS:
+            time.sleep(1)
+        # Wait for threads after an interrupt or thread-exception
+        if interrupted or threadException is not None:
+            print("Waiting for existing threads to end")
+            waitForThreads()
+            exitLoop = True
+            break
+        # Perform download
+        print(f"Downloading image to {outPath}")
+        with numThreadsLock:
+            numThreads += 1
+        thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
+        thread.start()
+    if exitLoop:
+        break
+# Wait for in-progress downloads, as exiting now would kill the daemon threads mid-download
+waitForThreads()
+# Close images-list db
+print("Finished downloading")
+imgDbCon.close()
diff --git a/backend/tolData/eol/genImagesListDb.sh b/backend/tolData/eol/genImagesListDb.sh
new file mode 100755
index 0000000..87dd840
--- /dev/null
+++ b/backend/tolData/eol/genImagesListDb.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+# Combine CSV files into one, skipping each file's header line (FNR is awk's per-file line number)
+awk 'FNR > 1' imagesList/media_*_{1..58}.csv > imagesList.csv
+# Create database, and import the CSV file
+sqlite3 imagesList.db <<END
+CREATE TABLE images (
+    content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT);
+.mode csv
+.import 'imagesList.csv' images
+END
diff --git a/backend/tolData/eol/reviewImgs.py b/backend/tolData/eol/reviewImgs.py
new file mode 100755
index 0000000..ecdf7ab
--- /dev/null
+++ b/backend/tolData/eol/reviewImgs.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python3
+
+import sys, re, os, time
+import sqlite3
+import tkinter as tki
+from tkinter import ttk
+import PIL
+from PIL import ImageTk, Image, ImageOps
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Provides a GUI for reviewing images. Looks in a for-review directory for
+images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to
+choose an image to keep, or reject all. Also provides image rotation.
+Chosen images are placed in another directory, and rejected ones are deleted.
+"""
+if len(sys.argv) > 1:
+    print(usageInfo, file=sys.stderr)
+    sys.exit(1)
+
+imgDir = "imgsForReview/"
+outDir = "imgs/"
+extraInfoDbCon = sqlite3.connect("../data.db")
+extraInfoDbCur = extraInfoDbCon.cursor()
+def getExtraInfo(eolId):
+    " Returns window-title text for an EOL ID, adding an alt-name if the names table has one with pref_alt = 1 "
+    query = "SELECT names.alt_name FROM" \
+        " names INNER JOIN eol_ids ON eol_ids.name = names.name" \
+        " WHERE id = ? and pref_alt = 1"
+    row = extraInfoDbCur.execute(query, (eolId,)).fetchone()
+    if row is not None:
+        return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\""
+    else:
+        return f"Reviewing EOL ID {eolId}"
+IMG_DISPLAY_SZ = 400
+MAX_IMGS_PER_ID = 3
+IMG_BG_COLOR = (88, 28, 135)
+PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
+
+print("Checking output directory")
+if not os.path.exists(outDir):
+    os.mkdir(outDir)
+print("Getting input image list")
+imgList = os.listdir(imgDir)
+imgList.sort(key=lambda s: int(s.split(" ")[0]))
+if len(imgList) == 0:
+    print("No input images found")
+    sys.exit(0)
+
+class EolImgReviewer:
+    " Provides the GUI for reviewing images "
+    def __init__(self, root, imgList):
+        " Builds the window's widgets and key bindings, then shows the first set of images "
+        self.root = root
+        root.title("EOL Image Reviewer")
+        # Setup main frame
+        mainFrame = ttk.Frame(root, padding="5 5 5 5")
+        mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
+        root.columnconfigure(0, weight=1)
+        root.rowconfigure(0, weight=1)
+        # Set up images-to-be-reviewed frames
+        self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
+        self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter
+        # These need a persistent reference for some reason (doesn't display otherwise)
+        self.labels = []
+        for i in range(MAX_IMGS_PER_ID):
+            frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ)
+            frame.grid(column=i, row=0)
+            label = ttk.Label(frame, image=self.photoImgs[i])
+            label.grid(column=0, row=0)
+            self.labels.append(label)
+        # Add padding
+        for child in mainFrame.winfo_children():
+            child.grid_configure(padx=5, pady=5)
+        # Add keyboard bindings
+        root.bind("<q>", self.quit)
+        root.bind("<Key-j>", lambda evt: self.accept(0))
+        root.bind("<Key-k>", lambda evt: self.accept(1))
+        root.bind("<Key-l>", lambda evt: self.accept(2))
+        root.bind("<Key-i>", lambda evt: self.reject())
+        root.bind("<Key-a>", lambda evt: self.rotate(0))
+        root.bind("<Key-s>", lambda evt: self.rotate(1))
+        root.bind("<Key-d>", lambda evt: self.rotate(2))
+        root.bind("<Key-A>", lambda evt: self.rotate(0, True))
+        root.bind("<Key-S>", lambda evt: self.rotate(1, True))
+        root.bind("<Key-D>", lambda evt: self.rotate(2, True))
+        # For displaying extra info
+        # Set before the getNextImgs() call below, as that can call quit(), which reads these
+        self.numReviewed = 0
+        self.startTime = time.time()
+        # Initialise images to review
+        self.imgList = imgList
+        self.imgListIdx = 0
+        self.nextEolId = 0
+        self.nextImgNames = []
+        self.rotations = []
+        self.getNextImgs()
+    def getNextImgs(self):
+        " Updates display with new images to review, or ends program "
+        # Gather names of next images to review
+        for i in range(MAX_IMGS_PER_ID):
+            if self.imgListIdx == len(self.imgList):
+                if i == 0:
+                    self.quit()
+                    return
+                break
+            imgName = self.imgList[self.imgListIdx]
+            eolId = int(re.match(r"(\d+) (\d+)", imgName).group(1))
+            if i == 0:
+                self.nextEolId = eolId
+                self.nextImgNames = [imgName]
+                self.rotations = [0]
+            else:
+                if self.nextEolId != eolId:
+                    break
+                self.nextImgNames.append(imgName)
+                self.rotations.append(0)
+            self.imgListIdx += 1
+        # Update displayed images
+        idx = 0
+        while idx < MAX_IMGS_PER_ID:
+            if idx < len(self.nextImgNames):
+                try:
+                    img = Image.open(imgDir + self.nextImgNames[idx])
+                    img = ImageOps.exif_transpose(img)
+                except PIL.UnidentifiedImageError:
+                    # Delete the unreadable file, and retry this slot with the next name (idx is not advanced)
+                    os.remove(imgDir + self.nextImgNames[idx])
+                    del self.nextImgNames[idx]
+                    del self.rotations[idx]
+                    continue
+                self.imgs[idx] = self.resizeImgForDisplay(img)
+            else:
+                self.imgs[idx] = PLACEHOLDER_IMG
+            self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
+            self.labels[idx].config(image=self.photoImgs[idx])
+            idx += 1
+        # Restart if all image files non-recognisable
+        if len(self.nextImgNames) == 0:
+            self.getNextImgs()
+            return
+        # Update title
+        firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
+        lastImgIdx = self.imgListIdx
+        title = getExtraInfo(self.nextEolId)
+        title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})"
+        self.root.title(title)
+    def accept(self, imgIdx):
+        " React to a user selecting an image "
+        if imgIdx >= len(self.nextImgNames):
+            print("Invalid selection")
+            return
+        for i, imgName in enumerate(self.nextImgNames):
+            inFile = imgDir + imgName
+            if i == imgIdx: # Move accepted image, rotating if needed
+                outFile = outDir + imgName
+                img = Image.open(inFile)
+                img = ImageOps.exif_transpose(img)
+                if self.rotations[i] != 0:
+                    img = img.rotate(self.rotations[i], expand=True)
+                img.save(outFile)
+                os.remove(inFile)
+            else: # Delete non-accepted image
+                os.remove(inFile)
+        self.numReviewed += 1
+        self.getNextImgs()
+    def reject(self):
+        " React to a user rejecting all images of a set "
+        for imgName in self.nextImgNames:
+            os.remove(imgDir + imgName)
+        self.numReviewed += 1
+        self.getNextImgs()
+    def rotate(self, imgIdx, anticlockwise = False):
+        " Respond to a user rotating an image "
+        if imgIdx >= len(self.nextImgNames): # Slot shows a placeholder, which has no self.rotations entry
+            print("Invalid selection")
+            return
+        deg = -90 if not anticlockwise else 90
+        self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
+        self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
+        self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
+        self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
+    def quit(self, e = None):
+        " Prints review statistics, closes the extra-info db, and closes the window "
+        print(f"Number reviewed: {self.numReviewed}")
+        timeElapsed = time.time() - self.startTime
+        print(f"Time elapsed: {timeElapsed:.2f} seconds")
+        if self.numReviewed > 0:
+            print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
+        extraInfoDbCon.close()
+        self.root.destroy()
+    def resizeImgForDisplay(self, img):
+        " Returns a copy of an image, shrunk to fit in its frame (keeps aspect ratio), and with a background "
+        if max(img.width, img.height) > IMG_DISPLAY_SZ:
+            if (img.width > img.height):
+                newHeight = max(1, int(img.height * IMG_DISPLAY_SZ/img.width)) # Avoid a zero-sized dimension for very wide images
+                img = img.resize((IMG_DISPLAY_SZ, newHeight))
+            else:
+                newWidth = max(1, int(img.width * IMG_DISPLAY_SZ / img.height)) # Likewise for very tall images
+                img = img.resize((newWidth, IMG_DISPLAY_SZ))
+        bgImg = PLACEHOLDER_IMG.copy()
+        bgImg.paste(img, box=(
+            int((IMG_DISPLAY_SZ - img.width) / 2),
+            int((IMG_DISPLAY_SZ - img.height) / 2)))
+        return bgImg
+# Create GUI and defer control
+print("Starting GUI")
+root = tki.Tk()
+EolImgReviewer(root, imgList)
+root.mainloop()
|
