about | summary | refs | log | tree | commit | diff
path: root/backend/tolData/eol
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com> 2022-09-11 14:55:42 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-09-11 15:04:14 +1000
commit: 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree: 2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/eol
parent: daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/eol')
-rw-r--r-- backend/tolData/eol/README.md | 31
-rwxr-xr-x backend/tolData/eol/downloadImgs.py | 142
-rwxr-xr-x backend/tolData/eol/genImagesListDb.py | 34
-rwxr-xr-x backend/tolData/eol/reviewImgs.py | 202
4 files changed, 0 insertions, 409 deletions
diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md
deleted file mode 100644
index c07b48e..0000000
--- a/backend/tolData/eol/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/).
-
-# Mapping Files
-- `provider_ids.csv.gz` <br>
- Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/22 (says last updated 27/07/22).
- Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorium.
-
-# Name Data Files
-- vernacularNames.csv <br>
- Obtained from <https://opendata.eol.org/dataset/vernacular-names> on 24/04/2022 (last updated on 27/10/2020).
- Contains alternative-node-names data from EOL.
-
-# Image Metadata Files
-- imagesList.tgz <br>
- Obtained from <https://opendata.eol.org/dataset/images-list> on 24/04/2022 (last updated on 05/02/2020).
- Contains metadata for images from EOL.
-- imagesList/ <br>
- Extracted from imagesList.tgz.
-- genImagesListDb.py <br>
- Creates a database, and imports imagesList/*.csv files into it.
-- imagesList.db <br>
- Created by running genImagesListDb.py <br>
- Tables: <br>
- - `images`:
- `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT`
-
-# Image Generation Files
-- downloadImgs.py <br>
- Used to download image files into imgsForReview/.
-- reviewImgs.py <br>
- Used to review images in imgsForReview/, moving acceptable ones into imgs/.
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
deleted file mode 100755
index 5213aaf..0000000
--- a/backend/tolData/eol/downloadImgs.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os, random
-import sqlite3
-import urllib.parse, requests
-import time
-from threading import Thread
-import signal
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-For some set of EOL IDs, downloads associated images from URLs in
-an image-list database. Uses multiple downloading threads.
-
-May obtain multiple images per ID. The images will get names
-with the form 'eolId1 contentId1.ext1'.
-
-SIGINT causes the program to finish ongoing downloads and exit.
-The program can be re-run to continue downloading. It looks for
-already-downloaded files, and continues after the one with
-highest EOL ID.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-imagesListDb = 'imagesList.db'
-def getInputEolIds() -> set[int]:
- eolIds: set[int] = set()
- dbCon = sqlite3.connect('../data.db')
- dbCur = dbCon.cursor()
- for (id,) in dbCur.execute('SELECT id FROM eol_ids'):
- eolIds.add(id)
- dbCon.close()
- return eolIds
-outDir = 'imgsForReview/'
-MAX_IMGS_PER_ID = 3
-MAX_THREADS = 5
-POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
-POST_DL_DELAY_MAX = 3
-LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
-
-print('Getting input EOL IDs')
-eolIds = getInputEolIds()
-print('Getting EOL IDs to download for')
-# Get IDs from images-list db
-imgDbCon = sqlite3.connect(imagesListDb)
-imgCur = imgDbCon.cursor()
-imgListIds: set[int] = set()
-for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'):
- imgListIds.add(pageId)
-# Get set intersection, and sort into list
-eolIds = eolIds.intersection(imgListIds)
-eolIdList = sorted(eolIds)
-print(f'Result: {len(eolIdList)} EOL IDs')
-
-print('Checking output directory')
-if not os.path.exists(outDir):
- os.mkdir(outDir)
-print('Finding next ID to download for')
-nextIdx = 0
-fileList = os.listdir(outDir)
-ids = [int(filename.split(' ')[0]) for filename in fileList]
-if ids:
- ids.sort()
- nextIdx = eolIdList.index(ids[-1]) + 1
-if nextIdx == len(eolIdList):
- print('No IDs left. Exiting...')
- sys.exit(0)
-
-print('Starting download threads')
-numThreads = 0
-threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
-# Handle SIGINT signals
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
- global interrupted
- interrupted = True
- signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Function for threads to execute
-def downloadImg(url, outFile):
- global numThreads, threadException
- try:
- data = requests.get(url)
- with open(outFile, 'wb') as file:
- file.write(data.content)
- time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
- except Exception as e:
- print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
- threadException = e
- numThreads -= 1
-# Manage downloading
-for idx in range(nextIdx, len(eolIdList)):
- eolId = eolIdList[idx]
- # Get image urls
- ownerSet: set[str] = set() # Used to get images from different owners, for variety
- exitLoop = False
- query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
- for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
- if url.startswith('data/'):
- url = 'https://content.eol.org/' + url
- urlParts = urllib.parse.urlparse(url)
- extension = os.path.splitext(urlParts.path)[1]
- if len(extension) <= 1:
- print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
- continue
- # Check image-quantity limit
- if len(ownerSet) == MAX_IMGS_PER_ID:
- break
- # Check for skip conditions
- if re.fullmatch(LICENSE_REGEX, license) is None:
- continue
- if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
- continue
- if copyrightOwner in ownerSet:
- continue
- ownerSet.add(copyrightOwner)
- # Determine output filename
- outPath = f'{outDir}{eolId} {contentId}{extension}'
- if os.path.exists(outPath):
- print(f'WARNING: {outPath} already exists. Skipping download.')
- continue
- # Check thread limit
- while numThreads == MAX_THREADS:
- time.sleep(1)
- # Wait for threads after an interrupt or thread-exception
- if interrupted or threadException is not None:
- print('Waiting for existing threads to end')
- while numThreads > 0:
- time.sleep(1)
- exitLoop = True
- break
- # Perform download
- print(f'Downloading image to {outPath}')
- numThreads += 1
- thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
- thread.start()
- if exitLoop:
- break
-# Close images-list db
-print('Finished downloading')
-imgDbCon.close()
diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py
deleted file mode 100755
index 808292d..0000000
--- a/backend/tolData/eol/genImagesListDb.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/python3
-
-import os, re
-import csv
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Generates a sqlite db from a directory of CSV files holding EOL image data
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-imagesListDir = 'imagesList/'
-dbFile = 'imagesList.db'
-
-print('Creating database')
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbCur.execute('CREATE TABLE images' \
- ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)')
-dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
-print('Reading CSV files')
-csvFilenames = os.listdir(imagesListDir)
-for filename in csvFilenames:
- print(f'Processing {imagesListDir}{filename}')
- with open(imagesListDir + filename, newline='') as file:
- for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file):
- if re.match(r'^[a-zA-Z]', contentId): # Skip header line
- continue
- dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
- (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
-print('Closing database')
-dbCon.commit()
-dbCon.close()
diff --git a/backend/tolData/eol/reviewImgs.py b/backend/tolData/eol/reviewImgs.py
deleted file mode 100755
index e44fb3d..0000000
--- a/backend/tolData/eol/reviewImgs.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os, time
-import sqlite3
-import tkinter as tki
-from tkinter import ttk
-import PIL
-from PIL import ImageTk, Image, ImageOps
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Provides a GUI for reviewing images. Looks in a for-review directory for
-images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to
-choose an image to keep, or reject all. Also provides image rotation.
-Chosen images are placed in another directory, and rejected ones are deleted.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-imgDir = 'imgsForReview/'
-outDir = 'imgs/'
-extraInfoDbCon = sqlite3.connect('../data.db')
-extraInfoDbCur = extraInfoDbCon.cursor()
-def getExtraInfo(eolId: int) -> str:
- global extraInfoDbCur
- query = 'SELECT names.alt_name FROM' \
- ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \
- ' WHERE id = ? and pref_alt = 1'
- row = extraInfoDbCur.execute(query, (eolId,)).fetchone()
- if row is not None:
- return f'Reviewing EOL ID {eolId}, aka "{row[0]}"'
- else:
- return f'Reviewing EOL ID {eolId}'
-IMG_DISPLAY_SZ = 400
-MAX_IMGS_PER_ID = 3
-IMG_BG_COLOR = (88, 28, 135)
-PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
-
-print('Checking output directory')
-if not os.path.exists(outDir):
- os.mkdir(outDir)
-print('Getting input image list')
-imgList = os.listdir(imgDir)
-imgList.sort(key=lambda s: int(s.split(' ')[0]))
-if not imgList:
- print('No input images found')
- sys.exit(0)
-
-class EolImgReviewer:
- """ Provides the GUI for reviewing images """
- def __init__(self, root, imgList):
- self.root = root
- root.title('EOL Image Reviewer')
- # Setup main frame
- mainFrame = ttk.Frame(root, padding='5 5 5 5')
- mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
- root.columnconfigure(0, weight=1)
- root.rowconfigure(0, weight=1)
- # Set up images-to-be-reviewed frames
- self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
- self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter
- # These need a persistent reference for some reason (doesn't display otherwise)
- self.labels: list[ttk.Label] = []
- for i in range(MAX_IMGS_PER_ID):
- frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ)
- frame.grid(column=i, row=0)
- label = ttk.Label(frame, image=self.photoImgs[i])
- label.grid(column=0, row=0)
- self.labels.append(label)
- # Add padding
- for child in mainFrame.winfo_children():
- child.grid_configure(padx=5, pady=5)
- # Add keyboard bindings
- root.bind('<q>', self.quit)
- root.bind('<Key-j>', lambda evt: self.accept(0))
- root.bind('<Key-k>', lambda evt: self.accept(1))
- root.bind('<Key-l>', lambda evt: self.accept(2))
- root.bind('<Key-i>', lambda evt: self.reject())
- root.bind('<Key-a>', lambda evt: self.rotate(0))
- root.bind('<Key-s>', lambda evt: self.rotate(1))
- root.bind('<Key-d>', lambda evt: self.rotate(2))
- root.bind('<Key-A>', lambda evt: self.rotate(0, True))
- root.bind('<Key-S>', lambda evt: self.rotate(1, True))
- root.bind('<Key-D>', lambda evt: self.rotate(2, True))
- # Initialise images to review
- self.imgList = imgList
- self.imgListIdx = 0
- self.nextEolId = 0
- self.nextImgNames: list[str] = []
- self.rotations: list[int] = []
- self.getNextImgs()
- # For displaying extra info
- self.numReviewed = 0
- self.startTime = time.time()
- def getNextImgs(self):
- """ Updates display with new images to review, or ends program """
- # Gather names of next images to review
- for i in range(MAX_IMGS_PER_ID):
- if self.imgListIdx == len(self.imgList):
- if i == 0:
- self.quit()
- return
- break
- imgName = self.imgList[self.imgListIdx]
- eolId = int(re.match(r'(\d+) (\d+)', imgName).group(1))
- if i == 0:
- self.nextEolId = eolId
- self.nextImgNames = [imgName]
- self.rotations = [0]
- else:
- if self.nextEolId != eolId:
- break
- self.nextImgNames.append(imgName)
- self.rotations.append(0)
- self.imgListIdx += 1
- # Update displayed images
- idx = 0
- while idx < MAX_IMGS_PER_ID:
- if idx < len(self.nextImgNames):
- try:
- img = Image.open(imgDir + self.nextImgNames[idx])
- img = ImageOps.exif_transpose(img)
- except PIL.UnidentifiedImageError:
- os.remove(imgDir + self.nextImgNames[idx])
- del self.nextImgNames[idx]
- del self.rotations[idx]
- continue
- self.imgs[idx] = self.resizeImgForDisplay(img)
- else:
- self.imgs[idx] = PLACEHOLDER_IMG
- self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
- self.labels[idx].config(image=self.photoImgs[idx])
- idx += 1
- # Restart if all image files non-recognisable
- if not self.nextImgNames:
- self.getNextImgs()
- return
- # Update title
- firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
- lastImgIdx = self.imgListIdx
- title = getExtraInfo(self.nextEolId)
- title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})'
- self.root.title(title)
- def accept(self, imgIdx):
- """ React to a user selecting an image """
- if imgIdx >= len(self.nextImgNames):
- print('Invalid selection')
- return
- for i in range(len(self.nextImgNames)):
- inFile = imgDir + self.nextImgNames[i]
- if i == imgIdx: # Move accepted image, rotating if needed
- outFile = outDir + self.nextImgNames[i]
- img = Image.open(inFile)
- img = ImageOps.exif_transpose(img)
- if self.rotations[i] != 0:
- img = img.rotate(self.rotations[i], expand=True)
- img.save(outFile)
- os.remove(inFile)
- else: # Delete non-accepted image
- os.remove(inFile)
- self.numReviewed += 1
- self.getNextImgs()
- def reject(self):
- """ React to a user rejecting all images of a set """
- for i in range(len(self.nextImgNames)):
- os.remove(imgDir + self.nextImgNames[i])
- self.numReviewed += 1
- self.getNextImgs()
- def rotate(self, imgIdx, anticlockwise = False):
- """ Respond to a user rotating an image """
- deg = -90 if not anticlockwise else 90
- self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
- self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
- self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
- self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
- def quit(self, e = None):
- global extraInfoDbCon
- print(f'Number reviewed: {self.numReviewed}')
- timeElapsed = time.time() - self.startTime
- print(f'Time elapsed: {timeElapsed:.2f} seconds')
- if self.numReviewed > 0:
- print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
- extraInfoDbCon.close()
- self.root.destroy()
- def resizeImgForDisplay(self, img):
- """ Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """
- if max(img.width, img.height) > IMG_DISPLAY_SZ:
- if (img.width > img.height):
- newHeight = int(img.height * IMG_DISPLAY_SZ/img.width)
- img = img.resize((IMG_DISPLAY_SZ, newHeight))
- else:
- newWidth = int(img.width * IMG_DISPLAY_SZ / img.height)
- img = img.resize((newWidth, IMG_DISPLAY_SZ))
- bgImg = PLACEHOLDER_IMG.copy()
- bgImg.paste(img, box=(
- int((IMG_DISPLAY_SZ - img.width) / 2),
- int((IMG_DISPLAY_SZ - img.height) / 2)))
- return bgImg
-# Create GUI and defer control
-print('Starting GUI')
-root = tki.Tk()
-EolImgReviewer(root, imgList)
-root.mainloop()