diff options
Diffstat (limited to 'backend/tol_data/eol')
| -rw-r--r-- | backend/tol_data/eol/README.md | 31 | ||||
| -rw-r--r-- | backend/tol_data/eol/__init__.py | 0 | ||||
| -rwxr-xr-x | backend/tol_data/eol/download_imgs.py | 152 | ||||
| -rwxr-xr-x | backend/tol_data/eol/gen_images_list_db.py | 39 | ||||
| -rwxr-xr-x | backend/tol_data/eol/review_imgs.py | 213 |
5 files changed, 435 insertions, 0 deletions
diff --git a/backend/tol_data/eol/README.md b/backend/tol_data/eol/README.md new file mode 100644 index 0000000..580310d --- /dev/null +++ b/backend/tol_data/eol/README.md @@ -0,0 +1,31 @@ +This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/). + +# Mapping Files +- `provider_ids.csv.gz` <br> + Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/2022 (last updated on 27/07/2022). + Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorum. + +# Name Data Files +- `vernacularNames.csv` <br> + Obtained from <https://opendata.eol.org/dataset/vernacular-names> on 24/04/2022 (last updated on 27/10/2020). + Contains alternative-node-names data from EOL. + +# Image Metadata Files +- `imagesList.tgz` <br> + Obtained from <https://opendata.eol.org/dataset/images-list> on 24/04/2022 (last updated on 05/02/2020). + Contains metadata for images from EOL. +- `imagesList/` <br> + Extracted from imagesList.tgz. +- `gen_images_list_db.py` <br> + Creates a database, and imports imagesList/*.csv files into it. +- `images_list.db` <br> + Created by running `gen_images_list_db.py` <br> + Tables: <br> + - `images`: + `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT` + +# Image Generation Files +- `download_imgs.py` <br> + Used to download image files into imgs_for_review/. +- `review_imgs.py` <br> + Used to review images in imgs_for_review/, moving acceptable ones into imgs/. 
diff --git a/backend/tol_data/eol/__init__.py b/backend/tol_data/eol/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tol_data/eol/__init__.py diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py new file mode 100755 index 0000000..8454a35 --- /dev/null +++ b/backend/tol_data/eol/download_imgs.py @@ -0,0 +1,152 @@ +#!/usr/bin/python3 + +""" +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. +""" + +import sys, re, os, random +import sqlite3 +import urllib.parse, requests +import time +from threading import Thread +import signal + +IMAGES_LIST_DB = 'images_list.db' +OUT_DIR = 'imgs_for_review' +DB_FILE = os.path.join('..', 'data.db') +# +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 5 +POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) +POST_DL_DELAY_MAX = 3 +LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain' + +def downloadImgs(eolIds, imagesListDb, outDir): + print('Getting EOL IDs to download for') + # Get IDs from images-list db + imgDbCon = sqlite3.connect(imagesListDb) + imgCur = imgDbCon.cursor() + imgListIds: set[int] = set() + for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'): + imgListIds.add(pageId) + # Get set intersection, and sort into list + eolIds = eolIds.intersection(imgListIds) + eolIdList = sorted(eolIds) + nextIdx = 0 + print(f'Result: {len(eolIdList)} EOL IDs') + # + print('Checking output directory') + if not os.path.exists(outDir): + os.mkdir(outDir) + else: + 
print('Finding next ID to download for') + fileList = os.listdir(outDir) + ids = [int(filename.split(' ')[0]) for filename in fileList] + if ids: + ids.sort() + nextIdx = eolIdList.index(ids[-1]) + 1 + if nextIdx == len(eolIdList): + print('No IDs left. Exiting...') + return + # + print('Starting download threads') + numThreads = 0 + threadException: Exception | None = None # Used for ending main thread after a non-main thread exception + # Handle SIGINT signals + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # Function for threads to execute + def downloadImg(url, outFile): + nonlocal numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) + threadException = e + numThreads -= 1 + # Manage downloading + for idx in range(nextIdx, len(eolIdList)): + eolId = eolIdList[idx] + # Get image urls + ownerSet: set[str] = set() # Used to get images from different owners, for variety + exitLoop = False + query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?' 
+ for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)): + if url.startswith('data/'): + url = 'https://content.eol.org/' + url + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) + continue + # Check image-quantity limit + if len(ownerSet) == MAX_IMGS_PER_ID: + break + # Check for skip conditions + if re.fullmatch(LICENSE_REGEX, license) is None: + continue + if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner in ownerSet: + continue + ownerSet.add(copyrightOwner) + # Determine output filename + outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}') + if os.path.exists(outPath): + print(f'WARNING: {outPath} already exists. Skipping download.') + continue + # Check thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException is not None: + print('Waiting for existing threads to end') + while numThreads > 0: + time.sleep(1) + exitLoop = True + break + # Perform download + print(f'Downloading image to {outPath}') + numThreads += 1 + thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) + thread.start() + if exitLoop: + break + # Close images-list db + while numThreads > 0: + time.sleep(1) + print('Finished downloading') + imgDbCon.close() + +def getEolIdsFromDb(dbFile) -> set[int]: + eolIds: set[int] = set() + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + for (id,) in dbCur.execute('SELECT id FROM eol_ids'): + eolIds.add(id) + dbCon.close() + return eolIds +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + eolIds = getEolIdsFromDb(DB_FILE) + 
downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR) diff --git a/backend/tol_data/eol/gen_images_list_db.py b/backend/tol_data/eol/gen_images_list_db.py new file mode 100755 index 0000000..ee57ac6 --- /dev/null +++ b/backend/tol_data/eol/gen_images_list_db.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +""" +Generates a sqlite db from a directory of CSV files holding EOL image data +""" + +import os, glob +import csv, re, sqlite3 + +IMAGE_LISTS_GLOB = os.path.join('imagesList', '*.csv') +DB_FILE = 'images_list.db' + +def genData(imageListsGlob: str, dbFile: str) -> None: + print('Creating database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE images' \ + ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \ + ' copy_url TEXT, license TEXT, copyright_owner TEXT)') + dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)') + print('Reading CSV files') + for filename in glob.glob(imageListsGlob): + print(f'Processing {filename}') + with open(filename, newline='') as file: + for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file): + if re.match(r'^[a-zA-Z]', contentId): # Skip header line (not in all files) + continue + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', + (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner)) + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(IMAGE_LISTS_GLOB, DB_FILE) diff --git a/backend/tol_data/eol/review_imgs.py b/backend/tol_data/eol/review_imgs.py new file mode 100755 index 0000000..9fb462c --- /dev/null +++ b/backend/tol_data/eol/review_imgs.py @@ -0,0 +1,213 @@ +#!/usr/bin/python3 + +""" +Provides a GUI for reviewing images. 
Looks in a for-review directory for +images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to +choose an image to keep, or reject all. Also provides image rotation. +Chosen images are placed in another directory, and rejected ones are deleted. +""" + +import sys, re, os, time +import sqlite3 +import tkinter as tki +from tkinter import ttk +import PIL +from PIL import ImageTk, Image, ImageOps + +IMG_DIR = 'imgs_for_review' +OUT_DIR = 'imgs' +EXTRA_INFO_DB = os.path.join('..', 'data.db') +# +IMG_DISPLAY_SZ = 400 +MAX_IMGS_PER_ID = 3 +IMG_BG_COLOR = (88, 28, 135) +PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) + +class EolImgReviewer: + """ Provides the GUI for reviewing images """ + def __init__(self, root, imgDir, imgList, extraInfoDb, outDir): + self.root = root + root.title('EOL Image Reviewer') + # Setup main frame + mainFrame = ttk.Frame(root, padding='5 5 5 5') + mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S)) + root.columnconfigure(0, weight=1) + root.rowconfigure(0, weight=1) + # Set up images-to-be-reviewed frames + self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation + self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter + # These need a persistent reference for some reason (doesn't display otherwise) + self.labels: list[ttk.Label] = [] + for i in range(MAX_IMGS_PER_ID): + frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ) + frame.grid(column=i, row=0) + label = ttk.Label(frame, image=self.photoImgs[i]) + label.grid(column=0, row=0) + self.labels.append(label) + # Add padding + for child in mainFrame.winfo_children(): + child.grid_configure(padx=5, pady=5) + # Add keyboard bindings + root.bind('<q>', self.quit) + root.bind('<Key-j>', lambda evt: self.accept(0)) + root.bind('<Key-k>', lambda evt: self.accept(1)) + root.bind('<Key-l>', lambda evt: self.accept(2)) + 
root.bind('<Key-i>', lambda evt: self.reject()) + root.bind('<Key-a>', lambda evt: self.rotate(0)) + root.bind('<Key-s>', lambda evt: self.rotate(1)) + root.bind('<Key-d>', lambda evt: self.rotate(2)) + root.bind('<Key-A>', lambda evt: self.rotate(0, True)) + root.bind('<Key-S>', lambda evt: self.rotate(1, True)) + root.bind('<Key-D>', lambda evt: self.rotate(2, True)) + # Initialise fields + self.imgDir = imgDir + self.imgList = imgList + self.outDir = outDir + self.imgListIdx = 0 + self.nextEolId = 0 + self.nextImgNames: list[str] = [] + self.rotations: list[int] = [] + # For displaying extra info + self.extraInfoDbCon = sqlite3.connect(extraInfoDb) + self.extraInfoDbCur = self.extraInfoDbCon.cursor() + self.numReviewed = 0 + self.startTime = time.time() + # + self.getNextImgs() + def getNextImgs(self): + """ Updates display with new images to review, or ends program """ + # Gather names of next images to review + for i in range(MAX_IMGS_PER_ID): + if self.imgListIdx == len(self.imgList): + if i == 0: + self.quit() + return + break + imgName = self.imgList[self.imgListIdx] + eolId = int(re.match(r'(\d+) (\d+)', imgName).group(1)) + if i == 0: + self.nextEolId = eolId + self.nextImgNames = [imgName] + self.rotations = [0] + else: + if self.nextEolId != eolId: + break + self.nextImgNames.append(imgName) + self.rotations.append(0) + self.imgListIdx += 1 + # Update displayed images + idx = 0 + while idx < MAX_IMGS_PER_ID: + if idx < len(self.nextImgNames): + try: + img = Image.open(os.path.join(self.imgDir, self.nextImgNames[idx])) + img = ImageOps.exif_transpose(img) + except PIL.UnidentifiedImageError: + os.remove(os.path.join(self.imgDir, self.nextImgNames[idx])) + del self.nextImgNames[idx] + del self.rotations[idx] + continue + self.imgs[idx] = self.resizeImgForDisplay(img) + else: + self.imgs[idx] = PLACEHOLDER_IMG + self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx]) + self.labels[idx].config(image=self.photoImgs[idx]) + idx += 1 + # Restart if all image 
files non-recognisable + if not self.nextImgNames: + self.getNextImgs() + return + # Update title + firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 + lastImgIdx = self.imgListIdx + title = self.getExtraInfo(self.nextEolId) + title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})' + self.root.title(title) + def accept(self, imgIdx): + """ React to a user selecting an image """ + if imgIdx >= len(self.nextImgNames): + print('Invalid selection') + return + for i in range(len(self.nextImgNames)): + inFile = os.path.join(self.imgDir, self.nextImgNames[i]) + if i == imgIdx: # Move accepted image, rotating if needed + outFile = os.path.join(self.outDir, self.nextImgNames[i]) + img = Image.open(inFile) + img = ImageOps.exif_transpose(img) + if self.rotations[i] != 0: + img = img.rotate(self.rotations[i], expand=True) + img.save(outFile) + os.remove(inFile) + else: # Delete non-accepted image + os.remove(inFile) + self.numReviewed += 1 + self.getNextImgs() + def reject(self): + """ React to a user rejecting all images of a set """ + for i in range(len(self.nextImgNames)): + os.remove(os.path.join(self.imgDir, self.nextImgNames[i])) + self.numReviewed += 1 + self.getNextImgs() + def rotate(self, imgIdx, anticlockwise = False): + """ Respond to a user rotating an image """ + deg = -90 if not anticlockwise else 90 + self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg) + self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) + self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) + self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 + def quit(self, e = None): + print(f'Number reviewed: {self.numReviewed}') + timeElapsed = time.time() - self.startTime + print(f'Time elapsed: {timeElapsed:.2f} seconds') + if self.numReviewed > 0: + print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') + self.extraInfoDbCon.close() + self.root.destroy() + # + def resizeImgForDisplay(self, img): + """ Returns a copy of an image, 
shrunk to fit in its frame (keeps aspect ratio), and with a background """ + if max(img.width, img.height) > IMG_DISPLAY_SZ: + if (img.width > img.height): + newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) + img = img.resize((IMG_DISPLAY_SZ, newHeight)) + else: + newWidth = int(img.width * IMG_DISPLAY_SZ / img.height) + img = img.resize((newWidth, IMG_DISPLAY_SZ)) + bgImg = PLACEHOLDER_IMG.copy() + bgImg.paste(img, box=( + int((IMG_DISPLAY_SZ - img.width) / 2), + int((IMG_DISPLAY_SZ - img.height) / 2))) + return bgImg + def getExtraInfo(self, eolId: int) -> str: + """ Used to display extra EOL ID info """ + query = 'SELECT names.alt_name FROM' \ + ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \ + ' WHERE id = ? and pref_alt = 1' + row = self.extraInfoDbCur.execute(query, (eolId,)).fetchone() + if row is not None: + return f'Reviewing EOL ID {eolId}, aka "{row[0]}"' + else: + return f'Reviewing EOL ID {eolId}' + +def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str): + print('Checking output directory') + if not os.path.exists(outDir): + os.mkdir(outDir) + print('Getting input image list') + imgList = os.listdir(imgDir) + imgList.sort(key=lambda s: int(s.split(' ')[0])) + if not imgList: + print('No input images found') + sys.exit(0) + # Create GUI and defer control + print('Starting GUI') + root = tki.Tk() + EolImgReviewer(root, imgDir, imgList, extraInfoDb, outDir) + root.mainloop() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB) |
