aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki/downloadImgs.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/enwiki/downloadImgs.py
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/enwiki/downloadImgs.py')
-rwxr-xr-xbackend/tolData/enwiki/downloadImgs.py88
1 files changed, 0 insertions, 88 deletions
diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py
deleted file mode 100755
index def4714..0000000
--- a/backend/tolData/enwiki/downloadImgs.py
+++ /dev/null
@@ -1,88 +0,0 @@
#!/usr/bin/python3
"""Download images referenced by an image database into an output directory.

Files are written as '<pageId><ext>' (e.g. '12345.jpg'). A first SIGINT lets
the in-progress download finish, then exits cleanly (a second SIGINT kills the
program via the restored default handler). The script can be re-run to
continue downloading: page-ids that already have a file in the output
directory are skipped.
"""

import argparse
import os
import re
import signal
import sqlite3
import sys
import time
import urllib.parse

import requests

IMG_DB = 'imgData.db'  # About 130k image names
OUT_DIR = 'imgs'
# Accepts CC0 and CC BY / CC BY-SA style license strings (e.g. 'cc-by-sa 3.0 au')
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
# In testing, this downloaded about 100k images, over several days

INTERRUPTED = False  # Set to True by the SIGINT handler; checked once per loop iteration
OLD_HANDLER = None   # Previous SIGINT handler, restored after the first interrupt

def parse_args() -> None:
	"""Parse command-line arguments (the script takes none beyond -h/--help)."""
	parser = argparse.ArgumentParser(description="""
Downloads images from URLs in an image database, into an output directory,
with names of the form 'pageId1.ext1'.

SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
in the output directory to decide what to skip.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()

def find_done_page_ids(out_dir: str) -> 'set[int]':
	"""Return the page-ids that already have a downloaded file in out_dir.

	Creates out_dir if it doesn't exist. Stray files whose basenames aren't
	integers are reported and ignored rather than crashing the run.
	"""
	if not os.path.exists(out_dir):
		os.mkdir(out_dir)
	page_ids_done: 'set[int]' = set()
	for filename in os.listdir(out_dir):
		basename = os.path.splitext(filename)[0]
		try:
			page_ids_done.add(int(basename))
		except ValueError:
			print(f'WARNING: Ignoring unexpected file {filename!r} in {out_dir}')
	return page_ids_done

def on_sigint(sig, frame) -> None:
	"""SIGINT handler: request a clean exit after the current download.

	Restores the previous handler so a second SIGINT terminates immediately.
	"""
	global INTERRUPTED
	INTERRUPTED = True
	signal.signal(signal.SIGINT, OLD_HANDLER)

def download_imgs(db_file: str, out_dir: str, page_ids_done: 'set[int]') -> None:
	"""Download one image per eligible page-id from db_file into out_dir.

	Skips page-ids in page_ids_done and rows with missing/unusable license,
	artist, credit, or restriction values. Exits (status 1) on a URL with no
	filename extension, since the output naming scheme depends on one.
	"""
	print('Opening database')
	db_con = sqlite3.connect(db_file)
	db_cur = db_con.cursor()
	print('Starting downloads')
	iter_num = 0
	query = ('SELECT page_id, license, artist, credit, restrictions, url FROM'
		' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name')
	headers = {  # Hoisted out of the loop: identical for every request
		'user-agent': 'terryt.dev (terry06890@gmail.com)',
		'accept-encoding': 'gzip',
	}
	for page_id, license_str, artist, credit, restrictions, url in db_cur.execute(query):
		if page_id in page_ids_done:
			continue
		if INTERRUPTED:
			print('Exiting loop')
			break
		# Check for problematic attributes (skip rows that fail any filter)
		if license_str is None or LICENSE_REGEX.fullmatch(license_str) is None:
			continue
		if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None:
			continue
		if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None:
			continue
		if restrictions is not None and restrictions != '':
			continue
		# Download image
		iter_num += 1
		print(f'Iteration {iter_num}: Downloading for page-id {page_id}')
		url_parts = urllib.parse.urlparse(url)
		extension = os.path.splitext(url_parts.path)[1]
		if len(extension) <= 1:
			print(f'WARNING: No filename extension found in URL {url}')
			sys.exit(1)
		out_file = os.path.join(out_dir, f'{page_id}{extension}')
		try:
			# timeout keeps a hung connection from stalling the whole run;
			# raise_for_status keeps HTML error pages from being saved as images
			response = requests.get(url, headers=headers, timeout=60)
			response.raise_for_status()
			with open(out_file, 'wb') as file:
				file.write(response.content)
			# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to
			# 'throttle self to 1 cache miss per sec'. It's unclear how to
			# properly check for cache misses, so this just aims for 1 per sec.
			time.sleep(1)
		except Exception as e:
			print(f'Error while downloading to {out_file}: {e}')
	print('Closing database')
	db_con.close()

def main() -> None:
	"""Entry point: parse args, scan existing output, install SIGINT handler, download."""
	global OLD_HANDLER
	parse_args()
	print('Checking for already-downloaded images')
	page_ids_done = find_done_page_ids(OUT_DIR)
	print(f'Found {len(page_ids_done)}')
	OLD_HANDLER = signal.signal(signal.SIGINT, on_sigint)
	download_imgs(IMG_DB, OUT_DIR, page_ids_done)

if __name__ == '__main__':
	main()