diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
| commit | abb936f5d76f7fe5cec1e8948d287da86643d504 (patch) | |
| tree | f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f /backend/data/eol/downloadImgs.py | |
| parent | e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff) | |
Refactor backend scripts (branch: extended-db)
Diffstat (limited to 'backend/data/eol/downloadImgs.py')
| -rwxr-xr-x | backend/data/eol/downloadImgs.py | 152 |
1 files changed, 75 insertions, 77 deletions
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py index ac72ea1..96bc085 100755 --- a/backend/data/eol/downloadImgs.py +++ b/backend/data/eol/downloadImgs.py @@ -7,18 +7,24 @@ import time from threading import Thread import signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an image-list database,\n" -usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n" -usageInfo += "the form 'eolId1 contentId1.ext1'.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n" -usageInfo += "The program can be re-run to continue downloading. It looks for\n" -usageInfo += "existing downloaded files, and continues after the one with\n" -usageInfo += "highest EOL ID.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) +# In testing, this downloaded about 70k images, over a few days imagesListDb = "imagesList.db" def getInputEolIds(): @@ -30,44 +36,29 @@ def getInputEolIds(): dbCon.close() return eolIds outDir = "imgsForReview/" -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 +LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" -# Get eol-ids from data db print("Getting input EOL IDs") eolIds = getInputEolIds() -# Get eol-ids from images db -print("Getting images-list-db EOL IDs") +print("Getting EOL IDs to download for") +# Get IDs from images-list db imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() imgListIds = set() -for row in imgCur.execute("SELECT DISTINCT page_id FROM images"): - imgListIds.add(row[0]) -# Get eol-id intersection, and sort into list +for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): + imgListIds.add(pageId) +# Get set intersection, and sort into list eolIds = eolIds.intersection(imgListIds) eolIds = sorted(eolIds) -print(f"Resulted in {len(eolIds)} EOL IDs") +print(f"Result: {len(eolIds)} EOL IDs") -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) - threadException = e - numThreads -= 1 -# Create output directory if not present +print("Checking 
output directory") if not os.path.exists(outDir): os.mkdir(outDir) -# Find next eol ID to download for print("Finding next ID to download for") nextIdx = 0 fileList = os.listdir(outDir) @@ -78,7 +69,11 @@ if len(ids) > 0: if nextIdx == len(eolIds): print("No IDs left. Exiting...") sys.exit(0) -# Detect SIGINT signals + +print("Starting download threads") +numThreads = 0 +threadException = None # Used for ending main thread after a non-main thread exception +# Handle SIGINT signals interrupted = False oldHandler = None def onSigint(sig, frame): @@ -86,33 +81,27 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) +# Function for threads to execute +def downloadImg(url, outFile): + global numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + threadException = e + numThreads -= 1 # Manage downloading for idx in range(nextIdx, len(eolIds)): eolId = eolIds[idx] # Get image urls imgDataList = [] ownerSet = set() # Used to get images from different owners, for variety - for row in imgCur.execute( - "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)): - license = row[3] - copyrightOwner = row[4] - if re.fullmatch(LICENSE_REGEX, license) == None: - continue - if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner not in ownerSet: - ownerSet.add(copyrightOwner) - imgDataList.append(row) - if len(ownerSet) == MAX_IMGS_PER_ID: - break - if len(imgDataList) == 0: - continue - # Determine output filenames - outFiles = [] - urls = [] - for row in imgDataList: - contentId = row[0] - url = row[2] + exitLoop 
= False + query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" + for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): if url.startswith("data/"): url = "https://content.eol.org/" + url urlParts = urllib.parse.urlparse(url) @@ -120,28 +109,37 @@ for idx in range(nextIdx, len(eolIds)): if len(extension) <= 1: print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) continue - outFiles.append(str(eolId) + " " + str(contentId) + extension) - urls.append(url) - # Start downloads - exitLoop = False - for i in range(len(outFiles)): - outPath = outDir + outFiles[i] - if not os.path.exists(outPath): - # Enforce thread limit - while numThreads == MAX_THREADS: + # Check image-quantity limit + if len(ownerSet) == MAX_IMGS_PER_ID: + break + # Check for skip conditions + if re.fullmatch(LICENSE_REGEX, license) == None: + continue + if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner in ownerSet: + continue + ownerSet.add(copyrightOwner) + # Determine output filename + outPath = f"{outDir}{eolId} {contentId}{extension}" + if os.path.exists(outPath): + print(f"WARNING: {outPath} already exists. 
Skipping download.") + continue + # Check thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException != None: + print("Waiting for existing threads to end") + while numThreads > 0: time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - print(f"Downloading image to {outPath}") - # Perform download - numThreads += 1 - thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True) - thread.start() + exitLoop = True + break + # Perform download + print(f"Downloading image to {outPath}") + numThreads += 1 + thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) + thread.start() if exitLoop: break # Close images-list db |
