diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
| commit | abb936f5d76f7fe5cec1e8948d287da86643d504 (patch) | |
| tree | f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f /backend/data/eol/downloadImgs.py | |
| parent | e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff) | |
Refactor backend scripts (branch: extended-db)
Diffstat (limited to 'backend/data/eol/downloadImgs.py')
| -rwxr-xr-x | backend/data/eol/downloadImgs.py | 152 |
1 files changed, 75 insertions, 77 deletions
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py index ac72ea1..96bc085 100755 --- a/backend/data/eol/downloadImgs.py +++ b/backend/data/eol/downloadImgs.py @@ -7,18 +7,24 @@ import time from threading import Thread import signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an image-list database,\n" -usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n" -usageInfo += "the form 'eolId1 contentId1.ext1'.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n" -usageInfo += "The program can be re-run to continue downloading. It looks for\n" -usageInfo += "existing downloaded files, and continues after the one with\n" -usageInfo += "highest EOL ID.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) +# In testing, this downloaded about 70k images, over a few days imagesListDb = "imagesList.db" def getInputEolIds(): @@ -30,44 +36,29 @@ def getInputEolIds(): dbCon.close() return eolIds outDir = "imgsForReview/" -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 +LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" -# Get eol-ids from data db print("Getting input EOL IDs") eolIds = getInputEolIds() -# Get eol-ids from images db -print("Getting images-list-db EOL IDs") +print("Getting EOL IDs to download for") +# Get IDs from images-list db imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() imgListIds = set() -for row in imgCur.execute("SELECT DISTINCT page_id FROM images"): - imgListIds.add(row[0]) -# Get eol-id intersection, and sort into list +for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): + imgListIds.add(pageId) +# Get set intersection, and sort into list eolIds = eolIds.intersection(imgListIds) eolIds = sorted(eolIds) -print(f"Resulted in {len(eolIds)} EOL IDs") +print(f"Result: {len(eolIds)} EOL IDs") -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) - threadException = e - numThreads -= 1 -# Create output directory if not present +print("Checking 
output directory") if not os.path.exists(outDir): os.mkdir(outDir) -# Find next eol ID to download for print("Finding next ID to download for") nextIdx = 0 fileList = os.listdir(outDir) @@ -78,7 +69,11 @@ if len(ids) > 0: if nextIdx == len(eolIds): print("No IDs left. Exiting...") sys.exit(0) -# Detect SIGINT signals + +print("Starting download threads") +numThreads = 0 +threadException = None # Used for ending main thread after a non-main thread exception +# Handle SIGINT signals interrupted = False oldHandler = None def onSigint(sig, frame): @@ -86,33 +81,27 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) +# Function for threads to execute +def downloadImg(url, outFile): + global numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + threadException = e + numThreads -= 1 # Manage downloading for idx in range(nextIdx, len(eolIds)): eolId = eolIds[idx] # Get image urls imgDataList = [] ownerSet = set() # Used to get images from different owners, for variety - for row in imgCur.execute( - "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)): - license = row[3] - copyrightOwner = row[4] - if re.fullmatch(LICENSE_REGEX, license) == None: - continue - if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner not in ownerSet: - ownerSet.add(copyrightOwner) - imgDataList.append(row) - if len(ownerSet) == MAX_IMGS_PER_ID: - break - if len(imgDataList) == 0: - continue - # Determine output filenames - outFiles = [] - urls = [] - for row in imgDataList: - contentId = row[0] - url = row[2] + exitLoop 
= False + query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" + for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): if url.startswith("data/"): url = "https://content.eol.org/" + url urlParts = urllib.parse.urlparse(url) @@ -120,28 +109,37 @@ for idx in range(nextIdx, len(eolIds)): if len(extension) <= 1: print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) continue - outFiles.append(str(eolId) + " " + str(contentId) + extension) - urls.append(url) - # Start downloads - exitLoop = False - for i in range(len(outFiles)): - outPath = outDir + outFiles[i] - if not os.path.exists(outPath): - # Enforce thread limit - while numThreads == MAX_THREADS: + # Check image-quantity limit + if len(ownerSet) == MAX_IMGS_PER_ID: + break + # Check for skip conditions + if re.fullmatch(LICENSE_REGEX, license) == None: + continue + if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner in ownerSet: + continue + ownerSet.add(copyrightOwner) + # Determine output filename + outPath = f"{outDir}{eolId} {contentId}{extension}" + if os.path.exists(outPath): + print(f"WARNING: {outPath} already exists. 
Skipping download.") + continue + # Check thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException != None: + print("Waiting for existing threads to end") + while numThreads > 0: time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - print(f"Downloading image to {outPath}") - # Perform download - numThreads += 1 - thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True) - thread.start() + exitLoop = True + break + # Perform download + print(f"Downloading image to {outPath}") + numThreads += 1 + thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) + thread.start() if exitLoop: break # Close images-list db |
