From d87bb9bc0991d7ce4eeb895da61c63a204edaa4d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 30 Apr 2022 13:24:26 +1000 Subject: Add scripts for downloading/reviewing/cropping_and_resizing images Also adjust client code to handle new format, and add backend/data/README.md explaining image production process. --- backend/data/downloadImgsForReview.py | 143 ++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100755 backend/data/downloadImgsForReview.py (limited to 'backend/data/downloadImgsForReview.py') diff --git a/backend/data/downloadImgsForReview.py b/backend/data/downloadImgsForReview.py new file mode 100755 index 0000000..12b52ff --- /dev/null +++ b/backend/data/downloadImgsForReview.py @@ -0,0 +1,143 @@ +#!/usr/bin/python3 + +import sys, re, os, random +import sqlite3 +import urllib.parse, requests +import time +from threading import Thread +import signal + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Downloads images from URLs specified in an image-list database, using\n" +usageInfo += "EOL IDs obtained from another database. Downloaded images get names of\n" +usageInfo += "the form 'eolId1 contentId1.ext1'\n" +usageInfo += "\n" +usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n" +usageInfo += "The program can be re-run to continue downloading, and uses\n" +usageInfo += "existing downloaded files to decide where to continue from.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +imagesListDb = "eol/imagesList.db" +dbFile = "data.db" +outDir = "imgsForReview/" +LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) +POST_DL_DELAY_MAX = 3 + +# Get eol-ids from data db +eolIds = set() +print("Reading in EOL IDs") +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +for row in dbCur.execute("SELECT DISTINCT eol_id FROM names"): + eolIds.add(row[0]) +dbCon.close() +# Get eol-ids from images db +imgDbCon = sqlite3.connect(imagesListDb) +imgCur = imgDbCon.cursor() +imgListIds = set() +for row in imgCur.execute("SELECT DISTINCT page_id FROM images"): + imgListIds.add(row[0]) +# Get eol-id intersection, and sort into list +eolIds = eolIds.intersection(imgListIds) +eolIds = sorted(eolIds) + +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 10 +numThreads = 0 +threadException = None # Used for ending main thread after a non-main thread exception +def downloadImg(url, outFile): + global numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print("Error while downloading to {}: {}".format(outFile, str(e)), file=sys.stderr) + threadException = e + numThreads -= 1 +# Create output directory if not present +if not os.path.exists(outDir): + os.mkdir(outDir) +# Find next eol ID to download for +print("Finding next ID to download for") +nextIdx = 0 +fileList = os.listdir(outDir) +ids = list(map(lambda filename: int(filename.split(" ")[0]), fileList)) +if len(ids) > 0: + ids.sort() + nextIdx = eolIds.index(ids[-1]) +if nextIdx == len(eolIds): + print("No IDs left. Exiting...") + sys.exit(0) +# Detect SIGINT signals +interrupted = False +oldHandler = None +def onSigint(sig, frame): + global interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) +oldHandler = signal.signal(signal.SIGINT, onSigint) +# Manage downloading +for idx in range(nextIdx, len(eolIds)): + eolId = eolIds[idx] + # Get image urls + imgDataList = [] + ownerSet = set() # Used to get images from different owners, for variety + for row in imgCur.execute( + "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)): + license = row[3] + copyrightOwner = row[4] + if re.fullmatch(LICENSE_REGEX, license) == None: + continue + if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner not in ownerSet: + ownerSet.add(copyrightOwner) + imgDataList.append(row) + if len(ownerSet) == MAX_IMGS_PER_ID: + break + if len(imgDataList) == 0: + continue + # Determine output filenames + outFiles = [] + urls = [] + for row in imgDataList: + contentId = row[0] + url = row[2] + if url.startswith("data/"): + url = "https://content.eol.org/" + url + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print("WARNING: No filename extension found in URL {}".format(url), file=sys.stderr) + continue + outFiles.append(str(eolId) + " " + str(contentId) + extension) + urls.append(url) + # Start downloads + exitLoop = False + for i in range(len(outFiles)): + outPath = outDir + outFiles[i] + if not os.path.exists(outPath): + # Enforce thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException != None: + print("Waiting for existing threads to end") + while numThreads > 0: + time.sleep(1) + exitLoop = True + break + print("Downloading image to {}".format(outPath)) + # Perform download + numThreads += 1 + thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True) + thread.start() + if exitLoop: + break +# Close images-list db +imgDbCon.close() -- cgit v1.2.3