diff options
Diffstat (limited to 'backend/data/eol')
| -rw-r--r-- | backend/data/eol/README.md | 5 | ||||
| -rwxr-xr-x | backend/data/eol/downloadImgs.py | 152 | ||||
| -rwxr-xr-x | backend/data/eol/genImagesListDb.sh | 2 | ||||
| -rwxr-xr-x | backend/data/eol/reviewImgs.py | 48 |
4 files changed, 107 insertions, 100 deletions
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md index fbb008d..8c527a8 100644 --- a/backend/data/eol/README.md +++ b/backend/data/eol/README.md @@ -11,9 +11,10 @@ This directory holds files obtained from/using the [Encyclopedia of Life](https: Contains metadata for images from EOL. - imagesList/ <br> Extracted from imagesList.tgz. +- genImagesListDb.sh <br> + Creates a database, and imports imagesList/*.csv files into it. - imagesList.db <br> - Contains data from imagesList/. - Created by running genImagesListDb.sh, which simply imports csv files into a database. <br> + Created by running genImagesListDb.sh <br> Tables: <br> - `images`: `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT` diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py index ac72ea1..96bc085 100755 --- a/backend/data/eol/downloadImgs.py +++ b/backend/data/eol/downloadImgs.py @@ -7,18 +7,24 @@ import time from threading import Thread import signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an image-list database,\n" -usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n" -usageInfo += "the form 'eolId1 contentId1.ext1'.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n" -usageInfo += "The program can be re-run to continue downloading. It looks for\n" -usageInfo += "existing downloaded files, and continues after the one with\n" -usageInfo += "highest EOL ID.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) +# In testing, this downloaded about 70k images, over a few days imagesListDb = "imagesList.db" def getInputEolIds(): @@ -30,44 +36,29 @@ def getInputEolIds(): dbCon.close() return eolIds outDir = "imgsForReview/" -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 +LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" -# Get eol-ids from data db print("Getting input EOL IDs") eolIds = getInputEolIds() -# Get eol-ids from images db -print("Getting images-list-db EOL IDs") +print("Getting EOL IDs to download for") +# Get IDs from images-list db imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() imgListIds = set() -for row in imgCur.execute("SELECT DISTINCT page_id FROM images"): - imgListIds.add(row[0]) -# Get eol-id intersection, and sort into list +for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): + imgListIds.add(pageId) +# Get set intersection, and sort into list eolIds = eolIds.intersection(imgListIds) eolIds = sorted(eolIds) -print(f"Resulted in {len(eolIds)} EOL IDs") +print(f"Result: {len(eolIds)} EOL IDs") -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) - threadException = e - numThreads -= 1 -# Create output directory if not present +print("Checking output directory") if not os.path.exists(outDir): os.mkdir(outDir) -# Find next eol ID to download for print("Finding next ID to download for") nextIdx = 0 fileList = os.listdir(outDir) @@ -78,7 +69,11 @@ if len(ids) > 0: if nextIdx == len(eolIds): print("No IDs left. Exiting...") sys.exit(0) -# Detect SIGINT signals + +print("Starting download threads") +numThreads = 0 +threadException = None # Used for ending main thread after a non-main thread exception +# Handle SIGINT signals interrupted = False oldHandler = None def onSigint(sig, frame): @@ -86,33 +81,27 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) +# Function for threads to execute +def downloadImg(url, outFile): + global numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + threadException = e + numThreads -= 1 # Manage downloading for idx in range(nextIdx, len(eolIds)): eolId = eolIds[idx] # Get image urls imgDataList = [] ownerSet = set() # Used to get images from different owners, for variety - for row in imgCur.execute( - "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)): - license = row[3] - copyrightOwner = row[4] - if re.fullmatch(LICENSE_REGEX, license) == None: - continue - if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner not in ownerSet: - ownerSet.add(copyrightOwner) - imgDataList.append(row) - if len(ownerSet) == MAX_IMGS_PER_ID: - break - if len(imgDataList) == 0: - continue - # Determine output filenames - outFiles = [] - urls = [] - for row in imgDataList: - contentId = row[0] - url = row[2] + exitLoop = False + query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" + for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): if url.startswith("data/"): url = "https://content.eol.org/" + url urlParts = urllib.parse.urlparse(url) @@ -120,28 +109,37 @@ for idx in range(nextIdx, len(eolIds)): if len(extension) <= 1: print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) continue - outFiles.append(str(eolId) + " " + str(contentId) + extension) - urls.append(url) - # Start downloads - exitLoop = False - for i in range(len(outFiles)): - outPath = outDir + outFiles[i] - if not os.path.exists(outPath): - # Enforce thread limit - while numThreads == MAX_THREADS: + # Check image-quantity limit + if len(ownerSet) == MAX_IMGS_PER_ID: + break + # Check for skip conditions + if re.fullmatch(LICENSE_REGEX, license) == None: + continue + if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner in ownerSet: + continue + ownerSet.add(copyrightOwner) + # Determine output filename + outPath = f"{outDir}{eolId} {contentId}{extension}" + if os.path.exists(outPath): + print(f"WARNING: {outPath} already exists. Skipping download.") + continue + # Check thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException != None: + print("Waiting for existing threads to end") + while numThreads > 0: time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - print(f"Downloading image to {outPath}") - # Perform download - numThreads += 1 - thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True) - thread.start() + exitLoop = True + break + # Perform download + print(f"Downloading image to {outPath}") + numThreads += 1 + thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) + thread.start() if exitLoop: break # Close images-list db diff --git a/backend/data/eol/genImagesListDb.sh b/backend/data/eol/genImagesListDb.sh index 3a8ced7..87dd840 100755 --- a/backend/data/eol/genImagesListDb.sh +++ b/backend/data/eol/genImagesListDb.sh @@ -1,7 +1,9 @@ #!/bin/bash set -e +# Combine CSV files into one, skipping header lines cat imagesList/media_*_{1..58}.csv | tail -n +2 > imagesList.csv +# Create database, and import the CSV file sqlite3 imagesList.db <<END CREATE TABLE images ( content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT); diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py index 5290f9e..ecdf7ab 100755 --- a/backend/data/eol/reviewImgs.py +++ b/backend/data/eol/reviewImgs.py @@ -7,11 +7,14 @@ from tkinter import ttk import PIL from PIL import ImageTk, Image, ImageOps -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Provides a GUI for reviewing images. Looks in a for-review directory for\n" -usageInfo += "images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to\n" -usageInfo += "choose an image to keep, or reject all. Also provides image rotation.\n" -usageInfo += "Chosen images are placed in another directory, and rejected ones are deleted.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Provides a GUI for reviewing images. Looks in a for-review directory for +images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to +choose an image to keep, or reject all. Also provides image rotation. +Chosen images are placed in another directory, and rejected ones are deleted. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -21,6 +24,7 @@ outDir = "imgs/" extraInfoDbCon = sqlite3.connect("../data.db") extraInfoDbCur = extraInfoDbCon.cursor() def getExtraInfo(eolId): + global extraInfoDbCur query = "SELECT names.alt_name FROM" \ " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ " WHERE id = ? and pref_alt = 1" @@ -31,21 +35,21 @@ def getExtraInfo(eolId): return f"Reviewing EOL ID {eolId}" IMG_DISPLAY_SZ = 400 MAX_IMGS_PER_ID = 3 -PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) +IMG_BG_COLOR = (88, 28, 135) +PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) -# Create output directory if not present +print("Checking output directory") if not os.path.exists(outDir): os.mkdir(outDir) -# Get images for review -print("Reading input image list") +print("Getting input image list") imgList = os.listdir(imgDir) imgList.sort(key=lambda s: int(s.split(" ")[0])) if len(imgList) == 0: - print("No input images found", file=sys.stderr) - sys.exit(1) + print("No input images found") + sys.exit(0) class EolImgReviewer: - """ Provides the GUI for reviewing images """ + " Provides the GUI for reviewing images " def __init__(self, root, imgList): self.root = root root.title("EOL Image Reviewer") @@ -68,7 +72,7 @@ class EolImgReviewer: # Add padding for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) - # Add bindings + # Add keyboard bindings root.bind("<q>", self.quit) root.bind("<Key-j>", lambda evt: self.accept(0)) root.bind("<Key-k>", lambda evt: self.accept(1)) @@ -87,11 +91,11 @@ class EolImgReviewer: self.nextImgNames = [] self.rotations = [] self.getNextImgs() - # For more info + # For displaying extra info self.numReviewed = 0 self.startTime = time.time() def getNextImgs(self): - """ Updates display with new images to review, or ends program """ + " Updates display with new images to review, or ends program " # Gather names of next images to review for i in range(MAX_IMGS_PER_ID): if self.imgListIdx == len(self.imgList): @@ -123,7 +127,7 @@ class EolImgReviewer: del self.nextImgNames[idx] del self.rotations[idx] continue - self.imgs[idx] = self.resizeForDisplay(img) + self.imgs[idx] = self.resizeImgForDisplay(img) else: self.imgs[idx] = PLACEHOLDER_IMG self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx]) @@ -140,7 +144,7 @@ class EolImgReviewer: title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})" self.root.title(title) def accept(self, imgIdx): - """ React to a user selecting an image """ + " React to a user selecting an image " if imgIdx >= len(self.nextImgNames): print("Invalid selection") return @@ -159,19 +163,20 @@ class EolImgReviewer: self.numReviewed += 1 self.getNextImgs() def reject(self): - """ React to a user rejecting all images of a set """ + " React to a user rejecting all images of a set " for i in range(len(self.nextImgNames)): os.remove(imgDir + self.nextImgNames[i]) self.numReviewed += 1 self.getNextImgs() def rotate(self, imgIdx, anticlockwise = False): - """ Respond to a user rotating an image """ + " Respond to a user rotating an image " deg = -90 if not anticlockwise else 90 self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg) self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 def quit(self, e = None): + global extraInfoDbCon print(f"Number reviewed: {self.numReviewed}") timeElapsed = time.time() - self.startTime print(f"Time elapsed: {timeElapsed:.2f} seconds") @@ -179,8 +184,8 @@ class EolImgReviewer: print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds") extraInfoDbCon.close() self.root.destroy() - def resizeForDisplay(self, img): - """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """ + def resizeImgForDisplay(self, img): + " Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background " if max(img.width, img.height) > IMG_DISPLAY_SZ: if (img.width > img.height): newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) @@ -194,6 +199,7 @@ class EolImgReviewer: int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg # Create GUI and defer control +print("Starting GUI") root = tki.Tk() EolImgReviewer(root, imgList) root.mainloop() |
