diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
| commit | 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch) | |
| tree | 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/data/eol | |
| parent | a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff) | |
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or
symlinked from, public/. That arrangement had to be undone before each
build; otherwise vite would end up copying gigabytes of images into the build output.
Diffstat (limited to 'backend/data/eol')
| -rw-r--r-- | backend/data/eol/README.md | 26 | ||||
| -rwxr-xr-x | backend/data/eol/downloadImgs.py | 147 | ||||
| -rwxr-xr-x | backend/data/eol/genImagesListDb.sh | 12 | ||||
| -rwxr-xr-x | backend/data/eol/reviewImgs.py | 205 |
4 files changed, 0 insertions, 390 deletions
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md deleted file mode 100644 index 8c527a8..0000000 --- a/backend/data/eol/README.md +++ /dev/null @@ -1,26 +0,0 @@ -This directory holds files obtained from/using the [Encyclopedia of Life](https://eol.org/). - -# Name Data Files -- vernacularNames.csv <br> - Obtained from <https://opendata.eol.org/dataset/vernacular-names> on 24/04/2022 (last updated on 27/10/2020). - Contains alternative-name data from EOL. - -# Image Metadata Files -- imagesList.tgz <br> - Obtained from <https://opendata.eol.org/dataset/images-list> on 24/04/2022 (last updated on 05/02/2020). - Contains metadata for images from EOL. -- imagesList/ <br> - Extracted from imagesList.tgz. -- genImagesListDb.sh <br> - Creates a database, and imports imagesList/*.csv files into it. -- imagesList.db <br> - Created by running genImagesListDb.sh <br> - Tables: <br> - - `images`: - `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT` - -# Image Generation Files -- downloadImgs.py <br> - Used to download image files into imgsForReview/. -- reviewImgs.py <br> - Used to review images in imgsForReview/, moving acceptable ones into imgs/. diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py deleted file mode 100755 index 96bc085..0000000 --- a/backend/data/eol/downloadImgs.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os, random -import sqlite3 -import urllib.parse, requests -import time -from threading import Thread -import signal - -usageInfo = f""" -Usage: {sys.argv[0]} - -For some set of EOL IDs, downloads associated images from URLs in -an image-list database. Uses multiple downloading threads. - -May obtain multiple images per ID. The images will get names -with the form 'eolId1 contentId1.ext1'. - -SIGINT causes the program to finish ongoing downloads and exit. -The program can be re-run to continue downloading. 
It looks for -already-downloaded files, and continues after the one with -highest EOL ID. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) -# In testing, this downloaded about 70k images, over a few days - -imagesListDb = "imagesList.db" -def getInputEolIds(): - eolIds = set() - dbCon = sqlite3.connect("../data.db") - dbCur = dbCon.cursor() - for (id,) in dbCur.execute("SELECT id FROM eol_ids"): - eolIds.add(id) - dbCon.close() - return eolIds -outDir = "imgsForReview/" -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) -POST_DL_DELAY_MAX = 3 -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" - -print("Getting input EOL IDs") -eolIds = getInputEolIds() -print("Getting EOL IDs to download for") -# Get IDs from images-list db -imgDbCon = sqlite3.connect(imagesListDb) -imgCur = imgDbCon.cursor() -imgListIds = set() -for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): - imgListIds.add(pageId) -# Get set intersection, and sort into list -eolIds = eolIds.intersection(imgListIds) -eolIds = sorted(eolIds) -print(f"Result: {len(eolIds)} EOL IDs") - -print("Checking output directory") -if not os.path.exists(outDir): - os.mkdir(outDir) -print("Finding next ID to download for") -nextIdx = 0 -fileList = os.listdir(outDir) -ids = [int(filename.split(" ")[0]) for filename in fileList] -if len(ids) > 0: - ids.sort() - nextIdx = eolIds.index(ids[-1]) + 1 -if nextIdx == len(eolIds): - print("No IDs left. 
Exiting...") - sys.exit(0) - -print("Starting download threads") -numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception -# Handle SIGINT signals -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) -# Function for threads to execute -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) - threadException = e - numThreads -= 1 -# Manage downloading -for idx in range(nextIdx, len(eolIds)): - eolId = eolIds[idx] - # Get image urls - imgDataList = [] - ownerSet = set() # Used to get images from different owners, for variety - exitLoop = False - query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" 
- for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): - if url.startswith("data/"): - url = "https://content.eol.org/" + url - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) - continue - # Check image-quantity limit - if len(ownerSet) == MAX_IMGS_PER_ID: - break - # Check for skip conditions - if re.fullmatch(LICENSE_REGEX, license) == None: - continue - if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner in ownerSet: - continue - ownerSet.add(copyrightOwner) - # Determine output filename - outPath = f"{outDir}{eolId} {contentId}{extension}" - if os.path.exists(outPath): - print(f"WARNING: {outPath} already exists. Skipping download.") - continue - # Check thread limit - while numThreads == MAX_THREADS: - time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - # Perform download - print(f"Downloading image to {outPath}") - numThreads += 1 - thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) - thread.start() - if exitLoop: - break -# Close images-list db -print("Finished downloading") -imgDbCon.close() diff --git a/backend/data/eol/genImagesListDb.sh b/backend/data/eol/genImagesListDb.sh deleted file mode 100755 index 87dd840..0000000 --- a/backend/data/eol/genImagesListDb.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -e - -# Combine CSV files into one, skipping header lines -cat imagesList/media_*_{1..58}.csv | tail -n +2 > imagesList.csv -# Create database, and import the CSV file -sqlite3 imagesList.db <<END -CREATE TABLE images ( - content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license 
TEXT, copyright_owner TEXT); -.mode csv -.import 'imagesList.csv' images -END diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py deleted file mode 100755 index ecdf7ab..0000000 --- a/backend/data/eol/reviewImgs.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os, time -import sqlite3 -import tkinter as tki -from tkinter import ttk -import PIL -from PIL import ImageTk, Image, ImageOps - -usageInfo = f""" -Usage: {sys.argv[0]} - -Provides a GUI for reviewing images. Looks in a for-review directory for -images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to -choose an image to keep, or reject all. Also provides image rotation. -Chosen images are placed in another directory, and rejected ones are deleted. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -imgDir = "imgsForReview/" -outDir = "imgs/" -extraInfoDbCon = sqlite3.connect("../data.db") -extraInfoDbCur = extraInfoDbCon.cursor() -def getExtraInfo(eolId): - global extraInfoDbCur - query = "SELECT names.alt_name FROM" \ - " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ - " WHERE id = ? 
and pref_alt = 1" - row = extraInfoDbCur.execute(query, (eolId,)).fetchone() - if row != None: - return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\"" - else: - return f"Reviewing EOL ID {eolId}" -IMG_DISPLAY_SZ = 400 -MAX_IMGS_PER_ID = 3 -IMG_BG_COLOR = (88, 28, 135) -PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) - -print("Checking output directory") -if not os.path.exists(outDir): - os.mkdir(outDir) -print("Getting input image list") -imgList = os.listdir(imgDir) -imgList.sort(key=lambda s: int(s.split(" ")[0])) -if len(imgList) == 0: - print("No input images found") - sys.exit(0) - -class EolImgReviewer: - " Provides the GUI for reviewing images " - def __init__(self, root, imgList): - self.root = root - root.title("EOL Image Reviewer") - # Setup main frame - mainFrame = ttk.Frame(root, padding="5 5 5 5") - mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S)) - root.columnconfigure(0, weight=1) - root.rowconfigure(0, weight=1) - # Set up images-to-be-reviewed frames - self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation - self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter - # These need a persistent reference for some reason (doesn't display otherwise) - self.labels = [] - for i in range(MAX_IMGS_PER_ID): - frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ) - frame.grid(column=i, row=0) - label = ttk.Label(frame, image=self.photoImgs[i]) - label.grid(column=0, row=0) - self.labels.append(label) - # Add padding - for child in mainFrame.winfo_children(): - child.grid_configure(padx=5, pady=5) - # Add keyboard bindings - root.bind("<q>", self.quit) - root.bind("<Key-j>", lambda evt: self.accept(0)) - root.bind("<Key-k>", lambda evt: self.accept(1)) - root.bind("<Key-l>", lambda evt: self.accept(2)) - root.bind("<Key-i>", lambda evt: self.reject()) - root.bind("<Key-a>", lambda evt: self.rotate(0)) - 
root.bind("<Key-s>", lambda evt: self.rotate(1)) - root.bind("<Key-d>", lambda evt: self.rotate(2)) - root.bind("<Key-A>", lambda evt: self.rotate(0, True)) - root.bind("<Key-S>", lambda evt: self.rotate(1, True)) - root.bind("<Key-D>", lambda evt: self.rotate(2, True)) - # Initialise images to review - self.imgList = imgList - self.imgListIdx = 0 - self.nextEolId = 0 - self.nextImgNames = [] - self.rotations = [] - self.getNextImgs() - # For displaying extra info - self.numReviewed = 0 - self.startTime = time.time() - def getNextImgs(self): - " Updates display with new images to review, or ends program " - # Gather names of next images to review - for i in range(MAX_IMGS_PER_ID): - if self.imgListIdx == len(self.imgList): - if i == 0: - self.quit() - return - break - imgName = self.imgList[self.imgListIdx] - eolId = int(re.match(r"(\d+) (\d+)", imgName).group(1)) - if i == 0: - self.nextEolId = eolId - self.nextImgNames = [imgName] - self.rotations = [0] - else: - if self.nextEolId != eolId: - break - self.nextImgNames.append(imgName) - self.rotations.append(0) - self.imgListIdx += 1 - # Update displayed images - idx = 0 - while idx < MAX_IMGS_PER_ID: - if idx < len(self.nextImgNames): - try: - img = Image.open(imgDir + self.nextImgNames[idx]) - img = ImageOps.exif_transpose(img) - except PIL.UnidentifiedImageError: - os.remove(imgDir + self.nextImgNames[idx]) - del self.nextImgNames[idx] - del self.rotations[idx] - continue - self.imgs[idx] = self.resizeImgForDisplay(img) - else: - self.imgs[idx] = PLACEHOLDER_IMG - self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx]) - self.labels[idx].config(image=self.photoImgs[idx]) - idx += 1 - # Restart if all image files non-recognisable - if len(self.nextImgNames) == 0: - self.getNextImgs() - return - # Update title - firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 - lastImgIdx = self.imgListIdx - title = getExtraInfo(self.nextEolId) - title += f" (imgs {firstImgIdx} to {lastImgIdx} out of 
{len(self.imgList)})" - self.root.title(title) - def accept(self, imgIdx): - " React to a user selecting an image " - if imgIdx >= len(self.nextImgNames): - print("Invalid selection") - return - for i in range(len(self.nextImgNames)): - inFile = imgDir + self.nextImgNames[i] - if i == imgIdx: # Move accepted image, rotating if needed - outFile = outDir + self.nextImgNames[i] - img = Image.open(inFile) - img = ImageOps.exif_transpose(img) - if self.rotations[i] != 0: - img = img.rotate(self.rotations[i], expand=True) - img.save(outFile) - os.remove(inFile) - else: # Delete non-accepted image - os.remove(inFile) - self.numReviewed += 1 - self.getNextImgs() - def reject(self): - " React to a user rejecting all images of a set " - for i in range(len(self.nextImgNames)): - os.remove(imgDir + self.nextImgNames[i]) - self.numReviewed += 1 - self.getNextImgs() - def rotate(self, imgIdx, anticlockwise = False): - " Respond to a user rotating an image " - deg = -90 if not anticlockwise else 90 - self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg) - self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) - self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) - self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 - def quit(self, e = None): - global extraInfoDbCon - print(f"Number reviewed: {self.numReviewed}") - timeElapsed = time.time() - self.startTime - print(f"Time elapsed: {timeElapsed:.2f} seconds") - if self.numReviewed > 0: - print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds") - extraInfoDbCon.close() - self.root.destroy() - def resizeImgForDisplay(self, img): - " Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background " - if max(img.width, img.height) > IMG_DISPLAY_SZ: - if (img.width > img.height): - newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) - img = img.resize((IMG_DISPLAY_SZ, newHeight)) - else: - newWidth = int(img.width * IMG_DISPLAY_SZ / img.height) - img = 
img.resize((newWidth, IMG_DISPLAY_SZ)) - bgImg = PLACEHOLDER_IMG.copy() - bgImg.paste(img, box=( - int((IMG_DISPLAY_SZ - img.width) / 2), - int((IMG_DISPLAY_SZ - img.height) / 2))) - return bgImg -# Create GUI and defer control -print("Starting GUI") -root = tki.Tk() -EolImgReviewer(root, imgList) -root.mainloop() |
