aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/eol
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tol_data/eol')
-rw-r--r--backend/tol_data/eol/README.md31
-rw-r--r--backend/tol_data/eol/__init__.py0
-rwxr-xr-xbackend/tol_data/eol/download_imgs.py152
-rwxr-xr-xbackend/tol_data/eol/gen_images_list_db.py39
-rwxr-xr-xbackend/tol_data/eol/review_imgs.py213
5 files changed, 435 insertions, 0 deletions
diff --git a/backend/tol_data/eol/README.md b/backend/tol_data/eol/README.md
new file mode 100644
index 0000000..580310d
--- /dev/null
+++ b/backend/tol_data/eol/README.md
@@ -0,0 +1,31 @@
+This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/).
+
+# Mapping Files
+- `provider_ids.csv.gz` <br>
+ Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/2022 (last updated on 27/07/2022).
+ Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorium.
+
+# Name Data Files
+- `vernacularNames.csv` <br>
+ Obtained from <https://opendata.eol.org/dataset/vernacular-names> on 24/04/2022 (last updated on 27/10/2020).
+ Contains alternative-node-names data from EOL.
+
+# Image Metadata Files
+- `imagesList.tgz` <br>
+ Obtained from <https://opendata.eol.org/dataset/images-list> on 24/04/2022 (last updated on 05/02/2020).
+ Contains metadata for images from EOL.
+- `imagesList/` <br>
+ Extracted from imagesList.tgz.
+- `gen_images_list_db.py` <br>
+ Creates a database, and imports imagesList/*.csv files into it.
+- `images_list.db` <br>
+ Created by running `gen_images_list_db.py`. <br>
+ Tables: <br>
+ - `images`:
+ `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT`
+
+# Image Generation Files
+- `download_imgs.py` <br>
+ Used to download image files into imgs_for_review/.
+- `review_imgs.py` <br>
+ Used to review images in imgs_for_review/, moving acceptable ones into imgs/.
diff --git a/backend/tol_data/eol/__init__.py b/backend/tol_data/eol/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/tol_data/eol/__init__.py
diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py
new file mode 100755
index 0000000..8454a35
--- /dev/null
+++ b/backend/tol_data/eol/download_imgs.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python3
+
+"""
+For some set of EOL IDs, downloads associated images from URLs in
+an image-list database. Uses multiple downloading threads.
+
+May obtain multiple images per ID. The images will get names
+with the form 'eolId1 contentId1.ext1'.
+
+SIGINT causes the program to finish ongoing downloads and exit.
+The program can be re-run to continue downloading. It looks for
+already-downloaded files, and continues after the one with
+highest EOL ID.
+"""
+
+import sys, re, os, random
+import sqlite3
+import urllib.parse, requests
+import time
+from threading import Thread
+import signal
+
+# Sqlite database whose 'images' table supplies the image URLs, licenses, and owners
+IMAGES_LIST_DB = 'images_list.db'
+# Directory that downloaded images are written to
+OUT_DIR = 'imgs_for_review'
+# Sqlite database whose 'eol_ids' table supplies the EOL IDs to download images for
+DB_FILE = os.path.join('..', 'data.db')
+#
+# Limit on images per EOL ID (each chosen image has a different copyright owner)
+MAX_IMGS_PER_ID = 3
+# Maximum number of download threads running at the same time
+MAX_THREADS = 5
+POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
+# Maximum for that delay; the actual delay is picked at random between the two
+POST_DL_DELAY_MAX = 3
+# An image is only downloaded if its license field fully matches this pattern
+LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
+
+def downloadImgs(eolIds, imagesListDb, outDir, requestTimeout=30):
+    """
+    Downloads images for a set of EOL IDs, using URLs from an images-list database.
+
+    eolIds: set of candidate EOL IDs. Only IDs that also occur as a page_id
+        in the database's 'images' table are downloaded for.
+    imagesListDb: path of the sqlite database holding the 'images' table.
+    outDir: directory to place images in (created if missing). Files get
+        names of the form '{eolId} {contentId}{extension}'.
+    requestTimeout: seconds to wait on each HTTP request before it fails.
+
+    If outDir already holds downloaded files, downloading continues after the
+    highest EOL ID found among them. At most MAX_THREADS downloads run at once.
+    After a SIGINT, or an exception in a download thread, no new downloads are
+    started, and the function returns once the ongoing ones have ended.
+    """
+    from bisect import bisect_right # Local import, only used to find the resume position
+    print('Getting EOL IDs to download for')
+    imgDbCon = sqlite3.connect(imagesListDb)
+    oldHandler = None
+    sigintInstalled = False
+    try:
+        # Get IDs from images-list db, intersect with the requested IDs, and sort into list
+        imgCur = imgDbCon.cursor()
+        imgListIds: set[int] = {pageId for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images')}
+        eolIdList = sorted(eolIds.intersection(imgListIds))
+        nextIdx = 0
+        print(f'Result: {len(eolIdList)} EOL IDs')
+        #
+        print('Checking output directory')
+        if not os.path.exists(outDir):
+            os.mkdir(outDir)
+        else:
+            print('Finding next ID to download for')
+            # Ignore files that don't start with '<digits> ', instead of crashing on them
+            downloadedIds: list[int] = []
+            for filename in os.listdir(outDir):
+                match = re.match(r'(\d+) ', filename)
+                if match is not None:
+                    downloadedIds.append(int(match.group(1)))
+            if downloadedIds:
+                # Index of the first ID above the highest downloaded one
+                # (works even if that ID is no longer in the list)
+                nextIdx = bisect_right(eolIdList, max(downloadedIds))
+        if nextIdx == len(eolIdList):
+            print('No IDs left. Exiting...')
+            return
+        #
+        print('Starting download threads')
+        threads: list[Thread] = [] # Started download threads that may still be running
+        threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
+        # Handle SIGINT signals
+        interrupted = False
+        def onSigint(sig, frame):
+            nonlocal interrupted
+            interrupted = True
+            signal.signal(signal.SIGINT, oldHandler) # A second SIGINT gets the previous behaviour
+        oldHandler = signal.signal(signal.SIGINT, onSigint)
+        sigintInstalled = True
+        # Counts running threads. Only the main thread touches the list, so no locking is needed
+        def numActiveThreads():
+            threads[:] = [t for t in threads if t.is_alive()]
+            return len(threads)
+        # Function for threads to execute
+        def downloadImg(url, outFile):
+            nonlocal threadException
+            try:
+                response = requests.get(url, timeout=requestTimeout)
+                if response.ok:
+                    with open(outFile, 'wb') as file:
+                        file.write(response.content)
+                else: # Don't save an error page as if it were an image
+                    print(f'WARNING: HTTP status {response.status_code} for {url}. Skipping.', file=sys.stderr)
+                time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
+            except Exception as e:
+                print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
+                threadException = e
+        # Manage downloading
+        licenseRegex = re.compile(LICENSE_REGEX)
+        query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
+        stopping = False
+        for eolId in eolIdList[nextIdx:]:
+            ownerSet: set[str] = set() # Used to get images from different owners, for variety
+            for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
+                if url.startswith('data/'):
+                    url = 'https://content.eol.org/' + url
+                urlParts = urllib.parse.urlparse(url)
+                extension = os.path.splitext(urlParts.path)[1]
+                if len(extension) <= 1:
+                    print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
+                    continue
+                # Check image-quantity limit
+                if len(ownerSet) == MAX_IMGS_PER_ID:
+                    break
+                # Check for skip conditions
+                if licenseRegex.fullmatch(license) is None:
+                    continue
+                if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
+                    continue
+                if copyrightOwner in ownerSet:
+                    continue
+                ownerSet.add(copyrightOwner)
+                # Determine output filename
+                outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
+                if os.path.exists(outPath):
+                    print(f'WARNING: {outPath} already exists. Skipping download.')
+                    continue
+                # Check thread limit
+                while numActiveThreads() >= MAX_THREADS:
+                    time.sleep(1)
+                # Stop starting downloads after an interrupt or thread-exception
+                if interrupted or threadException is not None:
+                    print('Waiting for existing threads to end')
+                    stopping = True
+                    break
+                # Perform download
+                print(f'Downloading image to {outPath}')
+                thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
+                threads.append(thread)
+                thread.start()
+            if stopping:
+                break
+        # Wait for remaining downloads (polling, as the original did, rather than joining)
+        while numActiveThreads() > 0:
+            time.sleep(1)
+        print('Finished downloading')
+    finally:
+        # Restore the previous SIGINT handler (None means it wasn't installed from Python)
+        if sigintInstalled and oldHandler is not None:
+            signal.signal(signal.SIGINT, oldHandler)
+        # Close images-list db
+        imgDbCon.close()
+
+def getEolIdsFromDb(dbFile) -> set[int]:
+    """
+    Returns the set of values in the 'id' column of the 'eol_ids' table
+    of the sqlite database at path dbFile.
+    """
+    dbCon = sqlite3.connect(dbFile)
+    try:
+        return {eolId for (eolId,) in dbCon.execute('SELECT id FROM eol_ids')}
+    finally:
+        dbCon.close() # Also runs if the query fails (eg: missing table)
+if __name__ == '__main__':
+ # Script entry point. No options are added to the parser, so parsing
+ # arguments only serves to provide the -h/--help output (the module docstring).
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.parse_args()
+ # Read the EOL IDs from DB_FILE, then download images for them into OUT_DIR
+ eolIds = getEolIdsFromDb(DB_FILE)
+ downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR)
diff --git a/backend/tol_data/eol/gen_images_list_db.py b/backend/tol_data/eol/gen_images_list_db.py
new file mode 100755
index 0000000..ee57ac6
--- /dev/null
+++ b/backend/tol_data/eol/gen_images_list_db.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+"""
+Generates a sqlite db from a directory of CSV files holding EOL image data
+"""
+
+import os, glob
+import csv, re, sqlite3
+
+# Glob pattern matching the CSV files to import
+IMAGE_LISTS_GLOB = os.path.join('imagesList', '*.csv')
+# Path of the sqlite database to create
+DB_FILE = 'images_list.db'
+
+def genData(imageListsGlob: str, dbFile: str) -> None:
+    """
+    Creates an 'images' table in the sqlite database at dbFile, and fills it
+    with the rows of every CSV file matching imageListsGlob.
+
+    Each CSV row is expected to have six fields: content ID, page ID,
+    source URL, copy URL, license, and copyright owner. Rows whose first
+    field starts with a letter are treated as header lines and skipped,
+    as are blank lines.
+
+    Raises sqlite3.OperationalError if the table already exists, and
+    ValueError for a non-blank row that doesn't have six fields.
+    Nothing is committed unless all files were imported.
+    """
+    print('Creating database')
+    dbCon = sqlite3.connect(dbFile)
+    try:
+        dbCur = dbCon.cursor()
+        dbCur.execute('CREATE TABLE images' \
+            ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \
+            ' copy_url TEXT, license TEXT, copyright_owner TEXT)')
+        dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
+        print('Reading CSV files')
+        for filename in glob.glob(imageListsGlob):
+            print(f'Processing {filename}')
+            # NOTE(review): assumes the EOL CSV files are UTF-8 encoded - confirm.
+            # Being explicit keeps the result independent of the system locale.
+            with open(filename, newline='', encoding='utf-8') as file:
+                for row in csv.reader(file):
+                    if not row: # Blank line
+                        continue
+                    contentId, pageId, sourceUrl, copyUrl, license, owner = row
+                    if re.match(r'^[a-zA-Z]', contentId): # Skip header line (not in all files)
+                        continue
+                    dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
+                        (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
+        print('Closing database')
+        dbCon.commit()
+    finally:
+        dbCon.close() # Also runs on failure, so the connection is never left open
+
+if __name__ == '__main__':
+ # Script entry point. No options are added to the parser, so parsing
+ # arguments only serves to provide the -h/--help output (the module docstring).
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.parse_args()
+ # Import the files matching IMAGE_LISTS_GLOB into a new database at DB_FILE
+ genData(IMAGE_LISTS_GLOB, DB_FILE)
diff --git a/backend/tol_data/eol/review_imgs.py b/backend/tol_data/eol/review_imgs.py
new file mode 100755
index 0000000..9fb462c
--- /dev/null
+++ b/backend/tol_data/eol/review_imgs.py
@@ -0,0 +1,213 @@
+#!/usr/bin/python3
+
+"""
+Provides a GUI for reviewing images. Looks in a for-review directory for
+images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to
+choose an image to keep, or reject all. Also provides image rotation.
+Chosen images are placed in another directory, and rejected ones are deleted.
+"""
+
+import sys, re, os, time
+import sqlite3
+import tkinter as tki
+from tkinter import ttk
+import PIL
+from PIL import ImageTk, Image, ImageOps
+
+# Directory holding the images to review
+IMG_DIR = 'imgs_for_review'
+# Directory that accepted images are saved into
+OUT_DIR = 'imgs'
+# Sqlite database queried for a name to show in the window title
+EXTRA_INFO_DB = os.path.join('..', 'data.db')
+#
+# Width and height of the square area each image is displayed in
+IMG_DISPLAY_SZ = 400
+# Number of image slots in the window (one review covers at most this many images)
+MAX_IMGS_PER_ID = 3
+# RGB color shown behind images and in empty slots
+IMG_BG_COLOR = (88, 28, 135)
+# Solid-color square, shown in empty slots and used as the background for displayed images
+PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
+
+class EolImgReviewer:
+    """
+    Provides the GUI for reviewing images.
+
+    Shows up to MAX_IMGS_PER_ID images at a time, all for the same EOL ID.
+    Key bindings: j/k/l accept the 1st/2nd/3rd image (the others are deleted),
+    i rejects (deletes) all shown images, a/s/d rotate the 1st/2nd/3rd image
+    clockwise (A/S/D for anticlockwise), and q quits.
+    """
+    def __init__(self, root, imgDir, imgList, extraInfoDb, outDir):
+        """
+        root: the tkinter root window to build the GUI in.
+        imgDir: directory holding the images to review.
+        imgList: image filenames of the form 'eolId contentId.ext', with names
+            for the same EOL ID adjacent to each other.
+        extraInfoDb: path of the sqlite database used for window-title info.
+        outDir: directory to save accepted images into.
+        """
+        self.root = root
+        root.title('EOL Image Reviewer')
+        # Setup main frame
+        mainFrame = ttk.Frame(root, padding='5 5 5 5')
+        mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
+        root.columnconfigure(0, weight=1)
+        root.rowconfigure(0, weight=1)
+        # Set up images-to-be-reviewed frames
+        self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
+        self.photoImgs = [ImageTk.PhotoImage(img) for img in self.imgs] # Image objects usable by tkinter
+            # These need a persistent reference for some reason (doesn't display otherwise)
+        self.labels: list[ttk.Label] = []
+        for i in range(MAX_IMGS_PER_ID):
+            frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ)
+            frame.grid(column=i, row=0)
+            label = ttk.Label(frame, image=self.photoImgs[i])
+            label.grid(column=0, row=0)
+            self.labels.append(label)
+        # Add padding
+        for child in mainFrame.winfo_children():
+            child.grid_configure(padx=5, pady=5)
+        # Add keyboard bindings (the slot index is bound as a default argument,
+        # so each lambda keeps its own value)
+        root.bind('<q>', self.quit)
+        root.bind('<Key-i>', lambda evt: self.reject())
+        for slot, (acceptKey, rotateKey) in enumerate(zip('jkl', 'asd')):
+            root.bind(f'<Key-{acceptKey}>', lambda evt, i=slot: self.accept(i))
+            root.bind(f'<Key-{rotateKey}>', lambda evt, i=slot: self.rotate(i))
+            root.bind(f'<Key-{rotateKey.upper()}>', lambda evt, i=slot: self.rotate(i, True))
+        # Initialise fields
+        self.imgDir = imgDir
+        self.imgList = imgList
+        self.outDir = outDir
+        self.imgListIdx = 0 # Index in imgList of the next image not yet shown
+        self.nextEolId = 0 # EOL ID of the images being shown
+        self.nextImgNames: list[str] = [] # Filenames of the images being shown
+        self.rotations: list[int] = [] # Rotation in degrees for each shown image
+        # For displaying extra info
+        self.extraInfoDbCon = sqlite3.connect(extraInfoDb)
+        self.extraInfoDbCur = self.extraInfoDbCon.cursor()
+        self.numReviewed = 0
+        self.startTime = time.time()
+        #
+        self.getNextImgs()
+    def getNextImgs(self):
+        """ Updates display with new images to review, or ends program """
+        # Gather names of next images to review
+        for i in range(MAX_IMGS_PER_ID):
+            if self.imgListIdx == len(self.imgList):
+                if i == 0:
+                    self.quit()
+                    return
+                break
+            imgName = self.imgList[self.imgListIdx]
+            eolId = int(re.match(r'(\d+) (\d+)', imgName).group(1))
+            if i == 0:
+                self.nextEolId = eolId
+                self.nextImgNames = [imgName]
+                self.rotations = [0]
+            else:
+                if self.nextEolId != eolId:
+                    break
+                self.nextImgNames.append(imgName)
+                self.rotations.append(0)
+            self.imgListIdx += 1
+        # Remember how many were taken from the list, as unreadable ones get dropped below
+        numGathered = len(self.nextImgNames)
+        # Update displayed images
+        idx = 0
+        while idx < MAX_IMGS_PER_ID:
+            if idx < len(self.nextImgNames):
+                imgPath = os.path.join(self.imgDir, self.nextImgNames[idx])
+                try:
+                    # Close the file once the display copy has been made
+                    with Image.open(imgPath) as img:
+                        self.imgs[idx] = self.resizeImgForDisplay(ImageOps.exif_transpose(img))
+                except PIL.UnidentifiedImageError:
+                    # Not a recognisable image, so delete it and retry this slot with the next name
+                    os.remove(imgPath)
+                    del self.nextImgNames[idx]
+                    del self.rotations[idx]
+                    continue
+            else:
+                self.imgs[idx] = PLACEHOLDER_IMG
+            self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
+            self.labels[idx].config(image=self.photoImgs[idx])
+            idx += 1
+        # Restart if all image files non-recognisable
+        if not self.nextImgNames:
+            self.getNextImgs()
+            return
+        # Update title
+        firstImgIdx = self.imgListIdx - numGathered + 1
+        lastImgIdx = self.imgListIdx
+        title = self.getExtraInfo(self.nextEolId)
+        title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})'
+        self.root.title(title)
+    def accept(self, imgIdx):
+        """ React to a user selecting an image """
+        if imgIdx >= len(self.nextImgNames):
+            print('Invalid selection')
+            return
+        # Save the accepted image first (rotating if needed), so that a failed
+        # save leaves all the input files in place
+        acceptedName = self.nextImgNames[imgIdx]
+        with Image.open(os.path.join(self.imgDir, acceptedName)) as img:
+            outImg = ImageOps.exif_transpose(img)
+            if self.rotations[imgIdx] != 0:
+                outImg = outImg.rotate(self.rotations[imgIdx], expand=True)
+            outImg.save(os.path.join(self.outDir, acceptedName))
+        # Delete all the reviewed images, now that the input file is closed
+        for imgName in self.nextImgNames:
+            os.remove(os.path.join(self.imgDir, imgName))
+        self.numReviewed += 1
+        self.getNextImgs()
+    def reject(self):
+        """ React to a user rejecting all images of a set """
+        for imgName in self.nextImgNames:
+            os.remove(os.path.join(self.imgDir, imgName))
+        self.numReviewed += 1
+        self.getNextImgs()
+    def rotate(self, imgIdx, anticlockwise = False):
+        """ Respond to a user rotating an image """
+        if imgIdx >= len(self.rotations): # Slot only holds the placeholder
+            print('Invalid selection')
+            return
+        deg = -90 if not anticlockwise else 90 # Positive angles rotate anticlockwise
+        self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
+        self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
+        self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
+        self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
+    def quit(self, e = None):
+        """ Prints review statistics, closes the database, and destroys the window """
+        print(f'Number reviewed: {self.numReviewed}')
+        timeElapsed = time.time() - self.startTime
+        print(f'Time elapsed: {timeElapsed:.2f} seconds')
+        if self.numReviewed > 0:
+            print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
+        self.extraInfoDbCon.close()
+        self.root.destroy()
+    #
+    def resizeImgForDisplay(self, img):
+        """ Returns a copy of an image, shrunk to fit in its frame (keeps aspect ratio), and with a background """
+        if max(img.width, img.height) > IMG_DISPLAY_SZ:
+            # The max() calls keep very elongated images from getting a zero-length side
+            if img.width > img.height:
+                newHeight = max(1, img.height * IMG_DISPLAY_SZ // img.width)
+                img = img.resize((IMG_DISPLAY_SZ, newHeight))
+            else:
+                newWidth = max(1, img.width * IMG_DISPLAY_SZ // img.height)
+                img = img.resize((newWidth, IMG_DISPLAY_SZ))
+        # Center the image on a copy of the placeholder
+        bgImg = PLACEHOLDER_IMG.copy()
+        bgImg.paste(img, box=(
+            (IMG_DISPLAY_SZ - img.width) // 2,
+            (IMG_DISPLAY_SZ - img.height) // 2))
+        return bgImg
+    def getExtraInfo(self, eolId: int) -> str:
+        """ Returns title text for an EOL ID, including an alt-name if the extra-info db has one with pref_alt = 1 """
+        query = 'SELECT names.alt_name FROM' \
+            ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \
+            ' WHERE id = ? and pref_alt = 1'
+        row = self.extraInfoDbCur.execute(query, (eolId,)).fetchone()
+        if row is not None:
+            return f'Reviewing EOL ID {eolId}, aka "{row[0]}"'
+        else:
+            return f'Reviewing EOL ID {eolId}'
+
+def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
+    """
+    Starts the image-reviewing GUI, and returns when it is closed.
+
+    imgDir: directory holding images named 'eolId contentId.ext'. Files with
+        other names are skipped with a warning.
+    outDir: directory that accepted images are saved into (created if missing).
+    extraInfoDb: path of the sqlite database used for window-title info.
+
+    Exits the program if imgDir holds no suitably-named files.
+    """
+    print('Checking output directory')
+    if not os.path.exists(outDir):
+        os.mkdir(outDir)
+    print('Getting input image list')
+    # Keep only names of the form the reviewer can parse, keyed by their two numbers
+    namePattern = re.compile(r'(\d+) (\d+)')
+    sortKeys: dict[str, tuple[int, int]] = {}
+    for name in os.listdir(imgDir):
+        match = namePattern.match(name)
+        if match is None:
+            print(f'WARNING: Skipping unexpected file {name}', file=sys.stderr)
+        else:
+            sortKeys[name] = (int(match.group(1)), int(match.group(2)))
+    # Order by EOL ID, then content ID, so the order doesn't depend on the directory listing
+    imgList = sorted(sortKeys, key=sortKeys.__getitem__)
+    if not imgList:
+        print('No input images found')
+        sys.exit(0)
+    # Create GUI and defer control
+    print('Starting GUI')
+    root = tki.Tk()
+    EolImgReviewer(root, imgDir, imgList, extraInfoDb, outDir)
+    root.mainloop()
+
+if __name__ == '__main__':
+ # Script entry point. No options are added to the parser, so parsing
+ # arguments only serves to provide the -h/--help output (the module docstring).
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.parse_args()
+ # Review the images in IMG_DIR, saving accepted ones into OUT_DIR
+ reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB)