diff options
Diffstat (limited to 'backend/tolData/eol/downloadImgs.py')
| -rwxr-xr-x | backend/tolData/eol/downloadImgs.py | 76 |
1 files changed, 38 insertions, 38 deletions
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py index 4d658e7..5213aaf 100755 --- a/backend/tolData/eol/downloadImgs.py +++ b/backend/tolData/eol/downloadImgs.py @@ -22,53 +22,53 @@ highest EOL ID. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imagesListDb = "imagesList.db" -def getInputEolIds(): - eolIds = set() - dbCon = sqlite3.connect("../data.db") +imagesListDb = 'imagesList.db' +def getInputEolIds() -> set[int]: + eolIds: set[int] = set() + dbCon = sqlite3.connect('../data.db') dbCur = dbCon.cursor() - for (id,) in dbCur.execute("SELECT id FROM eol_ids"): + for (id,) in dbCur.execute('SELECT id FROM eol_ids'): eolIds.add(id) dbCon.close() return eolIds -outDir = "imgsForReview/" +outDir = 'imgsForReview/' MAX_IMGS_PER_ID = 3 MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain' -print("Getting input EOL IDs") +print('Getting input EOL IDs') eolIds = getInputEolIds() -print("Getting EOL IDs to download for") +print('Getting EOL IDs to download for') # Get IDs from images-list db imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() -imgListIds = set() -for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): +imgListIds: set[int] = set() +for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'): imgListIds.add(pageId) # Get set intersection, and sort into list eolIds = eolIds.intersection(imgListIds) -eolIds = sorted(eolIds) -print(f"Result: {len(eolIds)} EOL IDs") +eolIdList = sorted(eolIds) +print(f'Result: {len(eolIdList)} EOL IDs') -print("Checking output directory") +print('Checking output directory') if not os.path.exists(outDir): os.mkdir(outDir) -print("Finding next ID to download for") +print('Finding next ID to download for') nextIdx = 0 fileList = os.listdir(outDir) -ids = [int(filename.split(" ")[0]) for filename in fileList] -if len(ids) > 0: +ids = [int(filename.split(' ')[0]) for filename in fileList] +if ids: ids.sort() - nextIdx = eolIds.index(ids[-1]) + 1 -if nextIdx == len(eolIds): - print("No IDs left. Exiting...") + nextIdx = eolIdList.index(ids[-1]) + 1 +if nextIdx == len(eolIdList): + print('No IDs left. Exiting...') sys.exit(0) -print("Starting download threads") +print('Starting download threads') numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception +threadException: Exception | None = None # Used for ending main thread after a non-main thread exception # Handle SIGINT signals interrupted = False oldHandler = None @@ -86,29 +86,29 @@ def downloadImg(url, outFile): file.write(data.content) time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) threadException = e numThreads -= 1 # Manage downloading -for idx in range(nextIdx, len(eolIds)): - eolId = eolIds[idx] +for idx in range(nextIdx, len(eolIdList)): + eolId = eolIdList[idx] # Get image urls - ownerSet = set() # Used to get images from different owners, for variety + ownerSet: set[str] = set() # Used to get images from different owners, for variety exitLoop = False - query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" - for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): - if url.startswith("data/"): - url = "https://content.eol.org/" + url + query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?' + for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)): + if url.startswith('data/'): + url = 'https://content.eol.org/' + url urlParts = urllib.parse.urlparse(url) extension = os.path.splitext(urlParts.path)[1] if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) + print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) continue # Check image-quantity limit if len(ownerSet) == MAX_IMGS_PER_ID: break # Check for skip conditions - if re.fullmatch(LICENSE_REGEX, license) == None: + if re.fullmatch(LICENSE_REGEX, license) is None: continue if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic continue @@ -116,27 +116,27 @@ for idx in range(nextIdx, len(eolIds)): continue ownerSet.add(copyrightOwner) # Determine output filename - outPath = f"{outDir}{eolId} {contentId}{extension}" + outPath = f'{outDir}{eolId} {contentId}{extension}' if os.path.exists(outPath): - print(f"WARNING: {outPath} already exists. Skipping download.") + print(f'WARNING: {outPath} already exists. Skipping download.') continue # Check thread limit while numThreads == MAX_THREADS: time.sleep(1) # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") + if interrupted or threadException is not None: + print('Waiting for existing threads to end') while numThreads > 0: time.sleep(1) exitLoop = True break # Perform download - print(f"Downloading image to {outPath}") + print(f'Downloading image to {outPath}') numThreads += 1 thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) thread.start() if exitLoop: break # Close images-list db -print("Finished downloading") +print('Finished downloading') imgDbCon.close() |
