From 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Mon, 11 Jul 2022 01:54:08 +1000 Subject: Make backend dev server script serve the image files Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images. --- backend/tolData/eol/downloadImgs.py | 147 ++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100755 backend/tolData/eol/downloadImgs.py (limited to 'backend/tolData/eol/downloadImgs.py') diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py new file mode 100755 index 0000000..96bc085 --- /dev/null +++ b/backend/tolData/eol/downloadImgs.py @@ -0,0 +1,147 @@ +#!/usr/bin/python3 + +import sys, re, os, random +import sqlite3 +import urllib.parse, requests +import time +from threading import Thread +import signal + +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. +""" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) +# In testing, this downloaded about 70k images, over a few days + +imagesListDb = "imagesList.db" +def getInputEolIds(): + eolIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (id,) in dbCur.execute("SELECT id FROM eol_ids"): + eolIds.add(id) + dbCon.close() + return eolIds +outDir = "imgsForReview/" +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 5 +POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) +POST_DL_DELAY_MAX = 3 +LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" + +print("Getting input EOL IDs") +eolIds = getInputEolIds() +print("Getting EOL IDs to download for") +# Get IDs from images-list db +imgDbCon = sqlite3.connect(imagesListDb) +imgCur = imgDbCon.cursor() +imgListIds = set() +for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): + imgListIds.add(pageId) +# Get set intersection, and sort into list +eolIds = eolIds.intersection(imgListIds) +eolIds = sorted(eolIds) +print(f"Result: {len(eolIds)} EOL IDs") + +print("Checking output directory") +if not os.path.exists(outDir): + os.mkdir(outDir) +print("Finding next ID to download for") +nextIdx = 0 +fileList = os.listdir(outDir) +ids = [int(filename.split(" ")[0]) for filename in fileList] +if len(ids) > 0: + ids.sort() + nextIdx = eolIds.index(ids[-1]) + 1 +if nextIdx == len(eolIds): + print("No IDs left. Exiting...") + sys.exit(0) + +print("Starting download threads") +numThreads = 0 +threadException = None # Used for ending main thread after a non-main thread exception +# Handle SIGINT signals +interrupted = False +oldHandler = None +def onSigint(sig, frame): + global interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) +oldHandler = signal.signal(signal.SIGINT, onSigint) +# Function for threads to execute +def downloadImg(url, outFile): + global numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + threadException = e + numThreads -= 1 +# Manage downloading +for idx in range(nextIdx, len(eolIds)): + eolId = eolIds[idx] + # Get image urls + imgDataList = [] + ownerSet = set() # Used to get images from different owners, for variety + exitLoop = False + query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" + for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): + if url.startswith("data/"): + url = "https://content.eol.org/" + url + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) + continue + # Check image-quantity limit + if len(ownerSet) == MAX_IMGS_PER_ID: + break + # Check for skip conditions + if re.fullmatch(LICENSE_REGEX, license) == None: + continue + if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner in ownerSet: + continue + ownerSet.add(copyrightOwner) + # Determine output filename + outPath = f"{outDir}{eolId} {contentId}{extension}" + if os.path.exists(outPath): + print(f"WARNING: {outPath} already exists. Skipping download.") + continue + # Check thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException != None: + print("Waiting for existing threads to end") + while numThreads > 0: + time.sleep(1) + exitLoop = True + break + # Perform download + print(f"Downloading image to {outPath}") + numThreads += 1 + thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) + thread.start() + if exitLoop: + break +# Close images-list db +print("Finished downloading") +imgDbCon.close() -- cgit v1.2.3