#!/usr/bin/python3

"""
For some set of EOL IDs, downloads associated images from URLs in
an image-list database. Uses multiple downloading threads.

May obtain multiple images per ID. The images will get names
with the form 'eolId1 contentId1.ext1'.

SIGINT causes the program to finish ongoing downloads and exit.
The program can be re-run to continue downloading. It looks for
already-downloaded files, and continues after the one with highest EOL ID.
"""

import argparse
import bisect
import sys
import re
import os
import random
import sqlite3
import requests
import urllib.parse
import time
from threading import Lock, Thread
import signal

IMAGES_LIST_DB = 'images_list.db'
OUT_DIR = 'imgs_for_review'
DB_FILE = os.path.join('..', 'data.db')

MAX_IMGS_PER_ID = 3
MAX_THREADS = 5
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3
DL_TIMEOUT = 60 # Seconds to wait for a server to connect/respond before giving up on a download
LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'


def _findNextIdx(eolIdList: list[int], outDir: str) -> int:
    """
    Return the index in the sorted eolIdList of the first ID greater than
    the highest ID found among the filenames in outDir (0 if none found).

    Filenames are expected to look like 'eolId contentId.ext'. Entries whose
    first space-separated field is not an integer are ignored.
    """
    highestId: int | None = None
    for filename in os.listdir(outDir):
        try:
            fileId = int(filename.split(' ')[0])
        except ValueError:
            continue # Not one of our files
        if highestId is None or fileId > highestId:
            highestId = fileId
    if highestId is None:
        return 0
    # bisect_right also works when highestId is absent from eolIdList
    return bisect.bisect_right(eolIdList, highestId)


def _downloadPending(imgCur: sqlite3.Cursor, pendingIds: list[int], outDir: str) -> None:
    """
    Download images for each ID in pendingIds (in order) into outDir,
    using up to MAX_THREADS concurrent download threads.

    Stops launching downloads after a SIGINT or after any download thread
    fails, then waits for the running threads before returning.
    """
    print('Starting download threads')
    numThreads = 0 # Number of download threads that have been started and not yet finished
    counterLock = Lock() # Guards updates of numThreads, which happen on several threads
    threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
    interrupted = False
    oldHandler = None
    licensePattern = re.compile(LICENSE_REGEX)
    query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'

    def restoreSigint():
        # signal.signal() returns None for a handler that was not installed from Python,
        # and does not accept None as a handler
        if oldHandler is not None:
            signal.signal(signal.SIGINT, oldHandler)

    def onSigint(sig, frame):
        nonlocal interrupted
        interrupted = True
        restoreSigint() # A second SIGINT gets the previous behaviour

    def downloadImg(url, outFile):
        """ Function for threads to execute """
        nonlocal numThreads, threadException
        try:
            response = requests.get(url, timeout=DL_TIMEOUT)
            response.raise_for_status() # Don't save an HTTP error page as an image
            with open(outFile, 'wb') as file:
                file.write(response.content)
        except Exception as e:
            print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
            threadException = e
            # A partial file would be mistaken for a completed download on the next run
            try:
                os.remove(outFile)
            except OSError:
                pass # Nothing was written, or it can't be removed (the failure was already reported)
        else:
            # The thread slot stays occupied during the delay, which throttles each thread
            time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
        finally:
            with counterLock:
                numThreads -= 1

    def waitForThreads():
        while numThreads > 0:
            time.sleep(1)

    oldHandler = signal.signal(signal.SIGINT, onSigint)
    try:
        stopRequested = False
        for eolId in pendingIds:
            ownerSet: set[str] = set() # Used to get images from different owners, for variety
            for contentId, url, licenseName, copyrightOwner in imgCur.execute(query, (eolId,)):
                if url.startswith('data/'):
                    url = 'https://content.eol.org/' + url
                extension = os.path.splitext(urllib.parse.urlparse(url).path)[1]
                if len(extension) <= 1:
                    print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
                    continue
                # Check image-quantity limit
                if len(ownerSet) == MAX_IMGS_PER_ID:
                    break
                # Check for skip conditions
                if licensePattern.fullmatch(licenseName) is None:
                    continue
                if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
                    continue
                if copyrightOwner in ownerSet:
                    continue
                ownerSet.add(copyrightOwner)
                # Determine output filename
                outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
                if os.path.exists(outPath):
                    print(f'WARNING: {outPath} already exists. Skipping download.')
                    continue
                # Check thread limit
                while numThreads >= MAX_THREADS:
                    time.sleep(1)
                # Wait for threads after an interrupt or thread-exception
                if interrupted or threadException is not None:
                    print('Waiting for existing threads to end')
                    waitForThreads()
                    stopRequested = True
                    break
                # Perform download
                print(f'Downloading image to {outPath}')
                with counterLock:
                    numThreads += 1
                Thread(target=downloadImg, args=(url, outPath), daemon=True).start()
            if stopRequested:
                break
        waitForThreads()
        print('Finished downloading')
    finally:
        restoreSigint()


def downloadImgs(eolIds: set[int], imagesListDb: str, outDir: str) -> None:
    """
    Download images for the given EOL IDs into outDir.

    eolIds: the IDs of interest. Only those that also appear as a page_id in
        the 'images' table of the images-list database are downloaded for.
    imagesListDb: path to the sqlite images-list database.
    outDir: output directory. Created if missing. If it exists, downloading
        continues after the highest EOL ID found among its filenames.

    Up to MAX_IMGS_PER_ID images are selected per ID, each from a different
    copyright owner and with a license matching LICENSE_REGEX. A failed
    download (including a bad HTTP status) is reported on stderr and ends the
    run once the other ongoing downloads have finished.
    """
    print('Getting EOL IDs to download for')
    imgDbCon = sqlite3.connect(imagesListDb)
    try:
        imgCur = imgDbCon.cursor()
        # Get IDs from images-list db
        imgListIds: set[int] = {pageId for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images')}
        # Get set intersection, and sort into list
        eolIdList = sorted(eolIds.intersection(imgListIds))
        print(f'Result: {len(eolIdList)} EOL IDs')

        print('Checking output directory')
        nextIdx = 0
        if not os.path.exists(outDir):
            os.mkdir(outDir)
        else:
            print('Finding next ID to download for')
            nextIdx = _findNextIdx(eolIdList, outDir)
        if nextIdx == len(eolIdList):
            print('No IDs left. Exiting...')
            return

        _downloadPending(imgCur, eolIdList[nextIdx:], outDir)
    finally:
        # Close images-list db, including on early return or error
        imgDbCon.close()


def getEolIdsFromDb(dbFile: str) -> set[int]:
    """ Return the set of values in the 'id' column of the 'eol_ids' table of the sqlite database at dbFile """
    dbCon = sqlite3.connect(dbFile)
    try:
        return {eolId for (eolId,) in dbCon.execute('SELECT id FROM eol_ids')}
    finally:
        dbCon.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.parse_args()

    eolIds = getEolIdsFromDb(DB_FILE)
    downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR)