From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Sun, 11 Sep 2022 14:55:42 +1000
Subject: Add backend unit tests

- Add unit testing code in backend/tests/
- Change to snake_case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py could dereference null values because it did not always use a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
---
 backend/tolData/eol/downloadImgs.py | 142 ------------------------------------
 1 file changed, 142 deletions(-)
 delete mode 100755 backend/tolData/eol/downloadImgs.py

(limited to 'backend/tolData/eol/downloadImgs.py')

diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
deleted file mode 100755
index 5213aaf..0000000
--- a/backend/tolData/eol/downloadImgs.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os, random
-import sqlite3
-import urllib.parse, requests
-import time
-from threading import Thread
-import signal
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-For some set of EOL IDs, downloads associated images from URLs in
-an image-list database. Uses multiple downloading threads.
-
-May obtain multiple images per ID. The images will get names
-with the form 'eolId1 contentId1.ext1'.
-
-SIGINT causes the program to finish ongoing downloads and exit.
-The program can be re-run to continue downloading. It looks for
-already-downloaded files, and continues after the one with
-highest EOL ID.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-imagesListDb = 'imagesList.db'
-def getInputEolIds() -> set[int]:
-	eolIds: set[int] = set()
-	dbCon = sqlite3.connect('../data.db')
-	dbCur = dbCon.cursor()
-	for (id,) in dbCur.execute('SELECT id FROM eol_ids'):
-		eolIds.add(id)
-	dbCon.close()
-	return eolIds
-outDir = 'imgsForReview/'
-MAX_IMGS_PER_ID = 3
-MAX_THREADS = 5
-POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
-POST_DL_DELAY_MAX = 3
-LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
-
-print('Getting input EOL IDs')
-eolIds = getInputEolIds()
-print('Getting EOL IDs to download for')
-# Get IDs from images-list db
-imgDbCon = sqlite3.connect(imagesListDb)
-imgCur = imgDbCon.cursor()
-imgListIds: set[int] = set()
-for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'):
-	imgListIds.add(pageId)
-# Get set intersection, and sort into list
-eolIds = eolIds.intersection(imgListIds)
-eolIdList = sorted(eolIds)
-print(f'Result: {len(eolIdList)} EOL IDs')
-
-print('Checking output directory')
-if not os.path.exists(outDir):
-	os.mkdir(outDir)
-print('Finding next ID to download for')
-nextIdx = 0
-fileList = os.listdir(outDir)
-ids = [int(filename.split(' ')[0]) for filename in fileList]
-if ids:
-	ids.sort()
-	nextIdx = eolIdList.index(ids[-1]) + 1
-if nextIdx == len(eolIdList):
-	print('No IDs left. Exiting...')
-	sys.exit(0)
-
-print('Starting download threads')
-numThreads = 0
-threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
-# Handle SIGINT signals
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Function for threads to execute
-def downloadImg(url, outFile):
-	global numThreads, threadException
-	try:
-		data = requests.get(url)
-		with open(outFile, 'wb') as file:
-			file.write(data.content)
-		time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
-	except Exception as e:
-		print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
-		threadException = e
-	numThreads -= 1
-# Manage downloading
-for idx in range(nextIdx, len(eolIdList)):
-	eolId = eolIdList[idx]
-	# Get image urls
-	ownerSet: set[str] = set() # Used to get images from different owners, for variety
-	exitLoop = False
-	query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
-	for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
-		if url.startswith('data/'):
-			url = 'https://content.eol.org/' + url
-		urlParts = urllib.parse.urlparse(url)
-		extension = os.path.splitext(urlParts.path)[1]
-		if len(extension) <= 1:
-			print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
-			continue
-		# Check image-quantity limit
-		if len(ownerSet) == MAX_IMGS_PER_ID:
-			break
-		# Check for skip conditions
-		if re.fullmatch(LICENSE_REGEX, license) is None:
-			continue
-		if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
-			continue
-		if copyrightOwner in ownerSet:
-			continue
-		ownerSet.add(copyrightOwner)
-		# Determine output filename
-		outPath = f'{outDir}{eolId} {contentId}{extension}'
-		if os.path.exists(outPath):
-			print(f'WARNING: {outPath} already exists. Skipping download.')
-			continue
-		# Check thread limit
-		while numThreads == MAX_THREADS:
-			time.sleep(1)
-		# Wait for threads after an interrupt or thread-exception
-		if interrupted or threadException is not None:
-			print('Waiting for existing threads to end')
-			while numThreads > 0:
-				time.sleep(1)
-			exitLoop = True
-			break
-		# Perform download
-		print(f'Downloading image to {outPath}')
-		numThreads += 1
-		thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
-		thread.start()
-	if exitLoop:
-		break
-# Close images-list db
-print('Finished downloading')
-imgDbCon.close()
-- 
cgit v1.2.3