From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 11 Sep 2022 14:55:42 +1000 Subject: Add backend unit tests - Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing --- backend/tolData/eol/README.md | 31 ----- backend/tolData/eol/downloadImgs.py | 142 ----------------------- backend/tolData/eol/genImagesListDb.py | 34 ------ backend/tolData/eol/reviewImgs.py | 202 --------------------------------- 4 files changed, 409 deletions(-) delete mode 100644 backend/tolData/eol/README.md delete mode 100755 backend/tolData/eol/downloadImgs.py delete mode 100755 backend/tolData/eol/genImagesListDb.py delete mode 100755 backend/tolData/eol/reviewImgs.py (limited to 'backend/tolData/eol') diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md deleted file mode 100644 index c07b48e..0000000 --- a/backend/tolData/eol/README.md +++ /dev/null @@ -1,31 +0,0 @@ -This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/). - -# Mapping Files -- `provider_ids.csv.gz`
- Obtained from on 22/08/22 (says last updated 27/07/22). - Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorium. - -# Name Data Files -- vernacularNames.csv
- Obtained from on 24/04/2022 (last updated on 27/10/2020). - Contains alternative-node-names data from EOL. - -# Image Metadata Files -- imagesList.tgz
- Obtained from on 24/04/2022 (last updated on 05/02/2020). - Contains metadata for images from EOL. -- imagesList/
- Extracted from imagesList.tgz. -- genImagesListDb.py
- Creates a database, and imports imagesList/*.csv files into it. -- imagesList.db
- Created by running genImagesListDb.py
- Tables:
- - `images`: - `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT` - -# Image Generation Files -- downloadImgs.py
- Used to download image files into imgsForReview/. -- reviewImgs.py
- Used to review images in imgsForReview/, moving acceptable ones into imgs/. diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py deleted file mode 100755 index 5213aaf..0000000 --- a/backend/tolData/eol/downloadImgs.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os, random -import sqlite3 -import urllib.parse, requests -import time -from threading import Thread -import signal - -import argparse -parser = argparse.ArgumentParser(description=""" -For some set of EOL IDs, downloads associated images from URLs in -an image-list database. Uses multiple downloading threads. - -May obtain multiple images per ID. The images will get names -with the form 'eolId1 contentId1.ext1'. - -SIGINT causes the program to finish ongoing downloads and exit. -The program can be re-run to continue downloading. It looks for -already-downloaded files, and continues after the one with -highest EOL ID. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imagesListDb = 'imagesList.db' -def getInputEolIds() -> set[int]: - eolIds: set[int] = set() - dbCon = sqlite3.connect('../data.db') - dbCur = dbCon.cursor() - for (id,) in dbCur.execute('SELECT id FROM eol_ids'): - eolIds.add(id) - dbCon.close() - return eolIds -outDir = 'imgsForReview/' -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) -POST_DL_DELAY_MAX = 3 -LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain' - -print('Getting input EOL IDs') -eolIds = getInputEolIds() -print('Getting EOL IDs to download for') -# Get IDs from images-list db -imgDbCon = sqlite3.connect(imagesListDb) -imgCur = imgDbCon.cursor() -imgListIds: set[int] = set() -for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'): - imgListIds.add(pageId) -# Get set intersection, and sort into list -eolIds = eolIds.intersection(imgListIds) -eolIdList = sorted(eolIds) -print(f'Result: {len(eolIdList)} EOL IDs') - -print('Checking output directory') -if not os.path.exists(outDir): - os.mkdir(outDir) -print('Finding next ID to download for') -nextIdx = 0 -fileList = os.listdir(outDir) -ids = [int(filename.split(' ')[0]) for filename in fileList] -if ids: - ids.sort() - nextIdx = eolIdList.index(ids[-1]) + 1 -if nextIdx == len(eolIdList): - print('No IDs left. Exiting...') - sys.exit(0) - -print('Starting download threads') -numThreads = 0 -threadException: Exception | None = None # Used for ending main thread after a non-main thread exception -# Handle SIGINT signals -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) -# Function for threads to execute -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) - threadException = e - numThreads -= 1 -# Manage downloading -for idx in range(nextIdx, len(eolIdList)): - eolId = eolIdList[idx] - # Get image urls - ownerSet: set[str] = set() # Used to get images from different owners, for variety - exitLoop = False - query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?' - for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)): - if url.startswith('data/'): - url = 'https://content.eol.org/' + url - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) - continue - # Check image-quantity limit - if len(ownerSet) == MAX_IMGS_PER_ID: - break - # Check for skip conditions - if re.fullmatch(LICENSE_REGEX, license) is None: - continue - if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner in ownerSet: - continue - ownerSet.add(copyrightOwner) - # Determine output filename - outPath = f'{outDir}{eolId} {contentId}{extension}' - if os.path.exists(outPath): - print(f'WARNING: {outPath} already exists. Skipping download.') - continue - # Check thread limit - while numThreads == MAX_THREADS: - time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException is not None: - print('Waiting for existing threads to end') - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - # Perform download - print(f'Downloading image to {outPath}') - numThreads += 1 - thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) - thread.start() - if exitLoop: - break -# Close images-list db -print('Finished downloading') -imgDbCon.close() diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py deleted file mode 100755 index 808292d..0000000 --- a/backend/tolData/eol/genImagesListDb.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/python3 - -import os, re -import csv -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Generates a sqlite db from a directory of CSV files holding EOL image data -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imagesListDir = 'imagesList/' -dbFile = 'imagesList.db' - -print('Creating database') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE images' \ - ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)') -dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)') -print('Reading CSV files') -csvFilenames = os.listdir(imagesListDir) -for filename in csvFilenames: - print(f'Processing {imagesListDir}{filename}') - with open(imagesListDir + filename, newline='') as file: - for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file): - if re.match(r'^[a-zA-Z]', contentId): # Skip header line - continue - dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', - (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner)) -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/eol/reviewImgs.py b/backend/tolData/eol/reviewImgs.py deleted file mode 100755 index e44fb3d..0000000 --- a/backend/tolData/eol/reviewImgs.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os, time -import sqlite3 -import tkinter as tki -from tkinter import ttk -import PIL -from PIL import ImageTk, Image, ImageOps - -import argparse -parser = argparse.ArgumentParser(description=""" -Provides a GUI for reviewing images. Looks in a for-review directory for -images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to -choose an image to keep, or reject all. Also provides image rotation. -Chosen images are placed in another directory, and rejected ones are deleted. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imgDir = 'imgsForReview/' -outDir = 'imgs/' -extraInfoDbCon = sqlite3.connect('../data.db') -extraInfoDbCur = extraInfoDbCon.cursor() -def getExtraInfo(eolId: int) -> str: - global extraInfoDbCur - query = 'SELECT names.alt_name FROM' \ - ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \ - ' WHERE id = ? and pref_alt = 1' - row = extraInfoDbCur.execute(query, (eolId,)).fetchone() - if row is not None: - return f'Reviewing EOL ID {eolId}, aka "{row[0]}"' - else: - return f'Reviewing EOL ID {eolId}' -IMG_DISPLAY_SZ = 400 -MAX_IMGS_PER_ID = 3 -IMG_BG_COLOR = (88, 28, 135) -PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) - -print('Checking output directory') -if not os.path.exists(outDir): - os.mkdir(outDir) -print('Getting input image list') -imgList = os.listdir(imgDir) -imgList.sort(key=lambda s: int(s.split(' ')[0])) -if not imgList: - print('No input images found') - sys.exit(0) - -class EolImgReviewer: - """ Provides the GUI for reviewing images """ - def __init__(self, root, imgList): - self.root = root - root.title('EOL Image Reviewer') - # Setup main frame - mainFrame = ttk.Frame(root, padding='5 5 5 5') - mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S)) - root.columnconfigure(0, weight=1) - root.rowconfigure(0, weight=1) - # Set up images-to-be-reviewed frames - self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation - self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter - # These need a persistent reference for some reason (doesn't display otherwise) - self.labels: list[ttk.Label] = [] - for i in range(MAX_IMGS_PER_ID): - frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ) - frame.grid(column=i, row=0) - label = ttk.Label(frame, image=self.photoImgs[i]) - label.grid(column=0, row=0) - self.labels.append(label) - # Add padding - for child in mainFrame.winfo_children(): - child.grid_configure(padx=5, pady=5) - # Add keyboard bindings - root.bind('', self.quit) - root.bind('', lambda evt: self.accept(0)) - root.bind('', lambda evt: self.accept(1)) - root.bind('', lambda evt: self.accept(2)) - root.bind('', lambda evt: self.reject()) - root.bind('', lambda evt: self.rotate(0)) - root.bind('', lambda evt: self.rotate(1)) - root.bind('', lambda evt: self.rotate(2)) - root.bind('', lambda evt: self.rotate(0, True)) - root.bind('', lambda evt: self.rotate(1, True)) - root.bind('', lambda evt: self.rotate(2, True)) - # Initialise images to review - self.imgList = imgList - self.imgListIdx = 0 - self.nextEolId = 0 - self.nextImgNames: list[str] = [] - self.rotations: list[int] = [] - self.getNextImgs() - # For displaying extra info - self.numReviewed = 0 - self.startTime = time.time() - def getNextImgs(self): - """ Updates display with new images to review, or ends program """ - # Gather names of next images to review - for i in range(MAX_IMGS_PER_ID): - if self.imgListIdx == len(self.imgList): - if i == 0: - self.quit() - return - break - imgName = self.imgList[self.imgListIdx] - eolId = int(re.match(r'(\d+) (\d+)', imgName).group(1)) - if i == 0: - self.nextEolId = eolId - self.nextImgNames = [imgName] - self.rotations = [0] - else: - if self.nextEolId != eolId: - break - self.nextImgNames.append(imgName) - self.rotations.append(0) - self.imgListIdx += 1 - # Update displayed images - idx = 0 - while idx < MAX_IMGS_PER_ID: - if idx < len(self.nextImgNames): - try: - img = Image.open(imgDir + self.nextImgNames[idx]) - img = ImageOps.exif_transpose(img) - except PIL.UnidentifiedImageError: - os.remove(imgDir + self.nextImgNames[idx]) - del self.nextImgNames[idx] - del self.rotations[idx] - continue - self.imgs[idx] = self.resizeImgForDisplay(img) - else: - self.imgs[idx] = PLACEHOLDER_IMG - self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx]) - self.labels[idx].config(image=self.photoImgs[idx]) - idx += 1 - # Restart if all image files non-recognisable - if not self.nextImgNames: - self.getNextImgs() - return - # Update title - firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 - lastImgIdx = self.imgListIdx - title = getExtraInfo(self.nextEolId) - title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})' - self.root.title(title) - def accept(self, imgIdx): - """ React to a user selecting an image """ - if imgIdx >= len(self.nextImgNames): - print('Invalid selection') - return - for i in range(len(self.nextImgNames)): - inFile = imgDir + self.nextImgNames[i] - if i == imgIdx: # Move accepted image, rotating if needed - outFile = outDir + self.nextImgNames[i] - img = Image.open(inFile) - img = ImageOps.exif_transpose(img) - if self.rotations[i] != 0: - img = img.rotate(self.rotations[i], expand=True) - img.save(outFile) - os.remove(inFile) - else: # Delete non-accepted image - os.remove(inFile) - self.numReviewed += 1 - self.getNextImgs() - def reject(self): - """ React to a user rejecting all images of a set """ - for i in range(len(self.nextImgNames)): - os.remove(imgDir + self.nextImgNames[i]) - self.numReviewed += 1 - self.getNextImgs() - def rotate(self, imgIdx, anticlockwise = False): - """ Respond to a user rotating an image """ - deg = -90 if not anticlockwise else 90 - self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg) - self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) - self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) - self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 - def quit(self, e = None): - global extraInfoDbCon - print(f'Number reviewed: {self.numReviewed}') - timeElapsed = time.time() - self.startTime - print(f'Time elapsed: {timeElapsed:.2f} seconds') - if self.numReviewed > 0: - print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') - extraInfoDbCon.close() - self.root.destroy() - def resizeImgForDisplay(self, img): - """ Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """ - if max(img.width, img.height) > IMG_DISPLAY_SZ: - if (img.width > img.height): - newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) - img = img.resize((IMG_DISPLAY_SZ, newHeight)) - else: - newWidth = int(img.width * IMG_DISPLAY_SZ / img.height) - img = img.resize((newWidth, IMG_DISPLAY_SZ)) - bgImg = PLACEHOLDER_IMG.copy() - bgImg.paste(img, box=( - int((IMG_DISPLAY_SZ - img.width) / 2), - int((IMG_DISPLAY_SZ - img.height) / 2))) - return bgImg -# Create GUI and defer control -print('Starting GUI') -root = tki.Tk() -EolImgReviewer(root, imgList) -root.mainloop() -- cgit v1.2.3