From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 11 Sep 2022 14:55:42 +1000 Subject: Add backend unit tests - Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing --- backend/tolData/eol/downloadImgs.py | 142 ------------------------------------ 1 file changed, 142 deletions(-) delete mode 100755 backend/tolData/eol/downloadImgs.py (limited to 'backend/tolData/eol/downloadImgs.py') diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py deleted file mode 100755 index 5213aaf..0000000 --- a/backend/tolData/eol/downloadImgs.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os, random -import sqlite3 -import urllib.parse, requests -import time -from threading import Thread -import signal - -import argparse -parser = argparse.ArgumentParser(description=""" -For some set of EOL IDs, downloads associated images from URLs in -an image-list database. Uses multiple downloading threads. - -May obtain multiple images per ID. The images will get names -with the form 'eolId1 contentId1.ext1'. - -SIGINT causes the program to finish ongoing downloads and exit. -The program can be re-run to continue downloading. It looks for -already-downloaded files, and continues after the one with -highest EOL ID. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imagesListDb = 'imagesList.db' -def getInputEolIds() -> set[int]: - eolIds: set[int] = set() - dbCon = sqlite3.connect('../data.db') - dbCur = dbCon.cursor() - for (id,) in dbCur.execute('SELECT id FROM eol_ids'): - eolIds.add(id) - dbCon.close() - return eolIds -outDir = 'imgsForReview/' -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) -POST_DL_DELAY_MAX = 3 -LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain' - -print('Getting input EOL IDs') -eolIds = getInputEolIds() -print('Getting EOL IDs to download for') -# Get IDs from images-list db -imgDbCon = sqlite3.connect(imagesListDb) -imgCur = imgDbCon.cursor() -imgListIds: set[int] = set() -for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'): - imgListIds.add(pageId) -# Get set intersection, and sort into list -eolIds = eolIds.intersection(imgListIds) -eolIdList = sorted(eolIds) -print(f'Result: {len(eolIdList)} EOL IDs') - -print('Checking output directory') -if not os.path.exists(outDir): - os.mkdir(outDir) -print('Finding next ID to download for') -nextIdx = 0 -fileList = os.listdir(outDir) -ids = [int(filename.split(' ')[0]) for filename in fileList] -if ids: - ids.sort() - nextIdx = eolIdList.index(ids[-1]) + 1 -if nextIdx == len(eolIdList): - print('No IDs left. Exiting...') - sys.exit(0) - -print('Starting download threads') -numThreads = 0 -threadException: Exception | None = None # Used for ending main thread after a non-main thread exception -# Handle SIGINT signals -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) -# Function for threads to execute -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) - threadException = e - numThreads -= 1 -# Manage downloading -for idx in range(nextIdx, len(eolIdList)): - eolId = eolIdList[idx] - # Get image urls - ownerSet: set[str] = set() # Used to get images from different owners, for variety - exitLoop = False - query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?' - for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)): - if url.startswith('data/'): - url = 'https://content.eol.org/' + url - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) - continue - # Check image-quantity limit - if len(ownerSet) == MAX_IMGS_PER_ID: - break - # Check for skip conditions - if re.fullmatch(LICENSE_REGEX, license) is None: - continue - if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner in ownerSet: - continue - ownerSet.add(copyrightOwner) - # Determine output filename - outPath = f'{outDir}{eolId} {contentId}{extension}' - if os.path.exists(outPath): - print(f'WARNING: {outPath} already exists. Skipping download.') - continue - # Check thread limit - while numThreads == MAX_THREADS: - time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException is not None: - print('Waiting for existing threads to end') - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - # Perform download - print(f'Downloading image to {outPath}') - numThreads += 1 - thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) - thread.start() - if exitLoop: - break -# Close images-list db -print('Finished downloading') -imgDbCon.close() -- cgit v1.2.3