1 files changed, 0 insertions, 147 deletions
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py
deleted file mode 100755
index 96bc085..0000000
--- a/backend/data/eol/downloadImgs.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os, random
-import sqlite3
-import urllib.parse, requests
-import time
-from threading import Thread
-import signal
-
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-For some set of EOL IDs, downloads associated images from URLs in
-an image-list database. Uses multiple downloading threads.
-
-May obtain multiple images per ID. The images will get names
-with the form 'eolId1 contentId1.ext1'.
-
-SIGINT causes the program to finish ongoing downloads and exit.
-The program can be re-run to continue downloading. It looks for
-already-downloaded files, and continues after the one with
-highest EOL ID.
-"""
-if len(sys.argv) > 1:
-	print(usageInfo, file=sys.stderr)
-	sys.exit(1)
-# In testing, this downloaded about 70k images, over a few days
-
-imagesListDb = "imagesList.db"
-def getInputEolIds():
-	eolIds = set()
-	dbCon = sqlite3.connect("../data.db")
-	dbCur = dbCon.cursor()
-	for (id,) in dbCur.execute("SELECT id FROM eol_ids"):
-		eolIds.add(id)
-	dbCon.close()
-	return eolIds
-outDir = "imgsForReview/"
-MAX_IMGS_PER_ID = 3
-MAX_THREADS = 5
-POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
-POST_DL_DELAY_MAX = 3
-LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
-
-print("Getting input EOL IDs")
-eolIds = getInputEolIds()
-print("Getting EOL IDs to download for")
-# Get IDs from images-list db
-imgDbCon = sqlite3.connect(imagesListDb)
-imgCur = imgDbCon.cursor()
-imgListIds = set()
-for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
-	imgListIds.add(pageId)
-# Get set intersection, and sort into list
-eolIds = eolIds.intersection(imgListIds)
-eolIds = sorted(eolIds)
-print(f"Result: {len(eolIds)} EOL IDs")
-
-print("Checking output directory")
-if not os.path.exists(outDir):
-	os.mkdir(outDir)
-print("Finding next ID to download for")
-nextIdx = 0
-fileList = os.listdir(outDir)
-ids = [int(filename.split(" ")[0]) for filename in fileList]
-if len(ids) > 0:
-	ids.sort()
-	nextIdx = eolIds.index(ids[-1]) + 1
-if nextIdx == len(eolIds):
-	print("No IDs left. Exiting...")
-	sys.exit(0)
-
-print("Starting download threads")
-numThreads = 0
-threadException = None # Used for ending main thread after a non-main thread exception
-# Handle SIGINT signals
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Function for threads to execute
-def downloadImg(url, outFile):
-	global numThreads, threadException
-	try:
-		data = requests.get(url)
-		with open(outFile, 'wb') as file:
-			file.write(data.content)
-		time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
-	except Exception as e:
-		print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
-		threadException = e
-	numThreads -= 1
-# Manage downloading
-for idx in range(nextIdx, len(eolIds)):
-	eolId = eolIds[idx]
-	# Get image urls
-	imgDataList = []
-	ownerSet = set() # Used to get images from different owners, for variety
-	exitLoop = False
-	query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
-	for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)):
-		if url.startswith("data/"):
-			url = "https://content.eol.org/" + url
-		urlParts = urllib.parse.urlparse(url)
-		extension = os.path.splitext(urlParts.path)[1]
-		if len(extension) <= 1:
-			print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
-			continue
-		# Check image-quantity limit
-		if len(ownerSet) == MAX_IMGS_PER_ID:
-			break
-		# Check for skip conditions
-		if re.fullmatch(LICENSE_REGEX, license) == None:
-			continue
-		if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
-			continue
-		if copyrightOwner in ownerSet:
-			continue
-		ownerSet.add(copyrightOwner)
-		# Determine output filename
-		outPath = f"{outDir}{eolId} {contentId}{extension}"
-		if os.path.exists(outPath):
-			print(f"WARNING: {outPath} already exists. Skipping download.")
-			continue
-		# Check thread limit
-		while numThreads == MAX_THREADS:
-			time.sleep(1)
-		# Wait for threads after an interrupt or thread-exception
-		if interrupted or threadException != None:
-			print("Waiting for existing threads to end")
-			while numThreads > 0:
-				time.sleep(1)
-			exitLoop = True
-			break
-		# Perform download
-		print(f"Downloading image to {outPath}")
-		numThreads += 1
-		thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
-		thread.start()
-	if exitLoop:
-		break
-# Close images-list db
-print("Finished downloading")
-imgDbCon.close()