1 files changed, 152 insertions, 0 deletions
diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py
new file mode 100755
index 0000000..8454a35
--- /dev/null
+++ b/backend/tol_data/eol/download_imgs.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python3
+
+"""
+For some set of EOL IDs, downloads associated images from URLs in
+an image-list database. Uses multiple downloading threads.
+
+May obtain multiple images per ID. The images will get names
+with the form 'eolId1 contentId1.ext1'.
+
+SIGINT causes the program to finish ongoing downloads and exit.
+The program can be re-run to continue downloading. It looks for
+already-downloaded files, and continues after the one with
+highest EOL ID.
+"""
+
+import sys, re, os, random
+import sqlite3
+import urllib.parse, requests
+import time
+from threading import Thread
+import signal
+
+IMAGES_LIST_DB = 'images_list.db'
+OUT_DIR = 'imgs_for_review'
+DB_FILE = os.path.join('..', 'data.db')
+#
+MAX_IMGS_PER_ID = 3
+MAX_THREADS = 5
+POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
+POST_DL_DELAY_MAX = 3
+LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
+
+def downloadImgs(eolIds, imagesListDb, outDir):
+	print('Getting EOL IDs to download for')
+	# Get IDs from images-list db
+	imgDbCon = sqlite3.connect(imagesListDb)
+	imgCur = imgDbCon.cursor()
+	imgListIds: set[int] = set()
+	for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'):
+		imgListIds.add(pageId)
+	# Get set intersection, and sort into list
+	eolIds = eolIds.intersection(imgListIds)
+	eolIdList = sorted(eolIds)
+	nextIdx = 0
+	print(f'Result: {len(eolIdList)} EOL IDs')
+	#
+	print('Checking output directory')
+	if not os.path.exists(outDir):
+		os.mkdir(outDir)
+	else:
+		print('Finding next ID to download for')
+		fileList = os.listdir(outDir)
+		ids = [int(filename.split(' ')[0]) for filename in fileList]
+		if ids:
+			ids.sort()
+			nextIdx = eolIdList.index(ids[-1]) + 1
+	if nextIdx == len(eolIdList):
+		print('No IDs left. Exiting...')
+		return
+	#
+	print('Starting download threads')
+	numThreads = 0
+	threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
+	# Handle SIGINT signals
+	interrupted = False
+	oldHandler = None
+	def onSigint(sig, frame):
+		nonlocal interrupted
+		interrupted = True
+		signal.signal(signal.SIGINT, oldHandler)
+	oldHandler = signal.signal(signal.SIGINT, onSigint)
+	# Function for threads to execute
+	def downloadImg(url, outFile):
+		nonlocal numThreads, threadException
+		try:
+			data = requests.get(url)
+			with open(outFile, 'wb') as file:
+				file.write(data.content)
+			time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
+		except Exception as e:
+			print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
+			threadException = e
+		numThreads -= 1
+	# Manage downloading
+	for idx in range(nextIdx, len(eolIdList)):
+		eolId = eolIdList[idx]
+		# Get image urls
+		ownerSet: set[str] = set() # Used to get images from different owners, for variety
+		exitLoop = False
+		query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
+		for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
+			if url.startswith('data/'):
+				url = 'https://content.eol.org/' + url
+			urlParts = urllib.parse.urlparse(url)
+			extension = os.path.splitext(urlParts.path)[1]
+			if len(extension) <= 1:
+				print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
+				continue
+			# Check image-quantity limit
+			if len(ownerSet) == MAX_IMGS_PER_ID:
+				break
+			# Check for skip conditions
+			if re.fullmatch(LICENSE_REGEX, license) is None:
+				continue
+			if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
+				continue
+			if copyrightOwner in ownerSet:
+				continue
+			ownerSet.add(copyrightOwner)
+			# Determine output filename
+			outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
+			if os.path.exists(outPath):
+				print(f'WARNING: {outPath} already exists. Skipping download.')
+				continue
+			# Check thread limit
+			while numThreads == MAX_THREADS:
+				time.sleep(1)
+			# Wait for threads after an interrupt or thread-exception
+			if interrupted or threadException is not None:
+				print('Waiting for existing threads to end')
+				while numThreads > 0:
+					time.sleep(1)
+				exitLoop = True
+				break
+			# Perform download
+			print(f'Downloading image to {outPath}')
+			numThreads += 1
+			thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
+			thread.start()
+		if exitLoop:
+			break
+	# Close images-list db
+	while numThreads > 0:
+		time.sleep(1)
+	print('Finished downloading')
+	imgDbCon.close()
+
+def getEolIdsFromDb(dbFile) -> set[int]:
+	eolIds: set[int] = set()
+	dbCon = sqlite3.connect(dbFile)
+	dbCur = dbCon.cursor()
+	for (id,) in dbCur.execute('SELECT id FROM eol_ids'):
+		eolIds.add(id)
+	dbCon.close()
+	return eolIds
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	eolIds = getEolIdsFromDb(DB_FILE)
+	downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR)