From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001
From: Terry Truong
Date: Sun, 11 Sep 2022 14:55:42 +1000
Subject: Add backend unit tests

- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
---
 backend/tolData/enwiki/downloadImgs.py | 88 ----------------------------------
 1 file changed, 88 deletions(-)
 delete mode 100755 backend/tolData/enwiki/downloadImgs.py

(limited to 'backend/tolData/enwiki/downloadImgs.py')

diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py
deleted file mode 100755
index def4714..0000000
--- a/backend/tolData/enwiki/downloadImgs.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import sqlite3
-import urllib.parse, requests
-import time, signal
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Downloads images from URLs in an image database, into an output directory,
-with names of the form 'pageId1.ext1'.
-
-SIGINT causes the program to finish an ongoing download and exit.
-The program can be re-run to continue downloading, and looks
-in the output directory do decide what to skip.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-imgDb = 'imgData.db' # About 130k image names
-outDir = 'imgs'
-licenseRegex = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
-# In testing, this downloaded about 100k images, over several days
-
-if not os.path.exists(outDir):
-	os.mkdir(outDir)
-print('Checking for already-downloaded images')
-fileList = os.listdir(outDir)
-pageIdsDone: set[int] = set()
-for filename in fileList:
-	basename, extension = os.path.splitext(filename)
-	pageIdsDone.add(int(basename))
-print(f'Found {len(pageIdsDone)}')
-
-# Set SIGINT handler
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-
-print('Opening database')
-dbCon = sqlite3.connect(imgDb)
-dbCur = dbCon.cursor()
-print('Starting downloads')
-iterNum = 0
-query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \
-	' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name'
-for pageId, license, artist, credit, restrictions, url in dbCur.execute(query):
-	if pageId in pageIdsDone:
-		continue
-	if interrupted:
-		print('Exiting loop')
-		break
-	# Check for problematic attributes
-	if license is None or licenseRegex.fullmatch(license) is None:
-		continue
-	if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None:
-		continue
-	if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None:
-		continue
-	if restrictions is not None and restrictions != '':
-		continue
-	# Download image
-	iterNum += 1
-	print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
-	urlParts = urllib.parse.urlparse(url)
-	extension = os.path.splitext(urlParts.path)[1]
-	if len(extension) <= 1:
-		print(f'WARNING: No filename extension found in URL {url}')
-		sys.exit(1)
-	outFile = f'{outDir}/{pageId}{extension}'
-	headers = {
-		'user-agent': 'terryt.dev (terry06890@gmail.com)',
-		'accept-encoding': 'gzip',
-	}
-	try:
-		response = requests.get(url, headers=headers)
-		with open(outFile, 'wb') as file:
-			file.write(response.content)
-		time.sleep(1)
-		# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle self to 1 cache miss per sec'
-		# It's unclear how to properly check for cache misses, so this just aims for 1 per sec
-	except Exception as e:
-		print(f'Error while downloading to {outFile}: {e}')
-print('Closing database')
-dbCon.close()
-- 
cgit v1.2.3