Add python type annotations

Also use consistent quote symbols. Also use 'is None' instead of '== None'. Also use 'if list1' instead of 'if len(list1) > 0'.
author: Terry Truong <terry06890@gmail.com> 2022-09-07 11:37:37 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-09-07 11:37:37 +1000
commit: daccbbd9c73a5292ea9d6746560d7009e5aa666d (patch)
tree: 9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/eol/downloadImgs.py
parent: 1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff)
1 files changed, 38 insertions, 38 deletions
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
index 4d658e7..5213aaf 100755
--- a/backend/tolData/eol/downloadImgs.py
+++ b/backend/tolData/eol/downloadImgs.py
@@ -22,53 +22,53 @@ highest EOL ID.
 """, formatter_class=argparse.RawDescriptionHelpFormatter)
 parser.parse_args()
 
-imagesListDb = "imagesList.db"
-def getInputEolIds():
-	eolIds = set()
-	dbCon = sqlite3.connect("../data.db")
+imagesListDb = 'imagesList.db'
+def getInputEolIds() -> set[int]:
+	eolIds: set[int] = set()
+	dbCon = sqlite3.connect('../data.db')
 	dbCur = dbCon.cursor()
-	for (id,) in dbCur.execute("SELECT id FROM eol_ids"):
+	for (id,) in dbCur.execute('SELECT id FROM eol_ids'):
 		eolIds.add(id)
 	dbCon.close()
 	return eolIds
-outDir = "imgsForReview/"
+outDir = 'imgsForReview/'
 MAX_IMGS_PER_ID = 3
 MAX_THREADS = 5
 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
 POST_DL_DELAY_MAX = 3
-LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
+LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
 
-print("Getting input EOL IDs")
+print('Getting input EOL IDs')
 eolIds = getInputEolIds()
-print("Getting EOL IDs to download for")
+print('Getting EOL IDs to download for')
 # Get IDs from images-list db
 imgDbCon = sqlite3.connect(imagesListDb)
 imgCur = imgDbCon.cursor()
-imgListIds = set()
-for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
+imgListIds: set[int] = set()
+for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'):
 	imgListIds.add(pageId)
 # Get set intersection, and sort into list
 eolIds = eolIds.intersection(imgListIds)
-eolIds = sorted(eolIds)
-print(f"Result: {len(eolIds)} EOL IDs")
+eolIdList = sorted(eolIds)
+print(f'Result: {len(eolIdList)} EOL IDs')
 
-print("Checking output directory")
+print('Checking output directory')
 if not os.path.exists(outDir):
 	os.mkdir(outDir)
-print("Finding next ID to download for")
+print('Finding next ID to download for')
 nextIdx = 0
 fileList = os.listdir(outDir)
-ids = [int(filename.split(" ")[0]) for filename in fileList]
-if len(ids) > 0:
+ids = [int(filename.split(' ')[0]) for filename in fileList]
+if ids:
 	ids.sort()
-	nextIdx = eolIds.index(ids[-1]) + 1
-if nextIdx == len(eolIds):
-	print("No IDs left. Exiting...")
+	nextIdx = eolIdList.index(ids[-1]) + 1
+if nextIdx == len(eolIdList):
+	print('No IDs left. Exiting...')
 	sys.exit(0)
 
-print("Starting download threads")
+print('Starting download threads')
 numThreads = 0
-threadException = None # Used for ending main thread after a non-main thread exception
+threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
 # Handle SIGINT signals
 interrupted = False
 oldHandler = None
@@ -86,29 +86,29 @@ def downloadImg(url, outFile):
 			file.write(data.content)
 		time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
 	except Exception as e:
-		print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
+		print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
 		threadException = e
 	numThreads -= 1
 # Manage downloading
-for idx in range(nextIdx, len(eolIds)):
-	eolId = eolIds[idx]
+for idx in range(nextIdx, len(eolIdList)):
+	eolId = eolIdList[idx]
 	# Get image urls
-	ownerSet = set() # Used to get images from different owners, for variety
+	ownerSet: set[str] = set() # Used to get images from different owners, for variety
 	exitLoop = False
-	query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
-	for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)):
-		if url.startswith("data/"):
-			url = "https://content.eol.org/" + url
+	query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
+	for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
+		if url.startswith('data/'):
+			url = 'https://content.eol.org/' + url
 		urlParts = urllib.parse.urlparse(url)
 		extension = os.path.splitext(urlParts.path)[1]
 		if len(extension) <= 1:
-			print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
+			print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
 			continue
 		# Check image-quantity limit
 		if len(ownerSet) == MAX_IMGS_PER_ID:
 			break
 		# Check for skip conditions
-		if re.fullmatch(LICENSE_REGEX, license) == None:
+		if re.fullmatch(LICENSE_REGEX, license) is None:
 			continue
 		if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
 			continue
@@ -116,27 +116,27 @@ for idx in range(nextIdx, len(eolIds)):
 			continue
 		ownerSet.add(copyrightOwner)
 		# Determine output filename
-		outPath = f"{outDir}{eolId} {contentId}{extension}"
+		outPath = f'{outDir}{eolId} {contentId}{extension}'
 		if os.path.exists(outPath):
-			print(f"WARNING: {outPath} already exists. Skipping download.")
+			print(f'WARNING: {outPath} already exists. Skipping download.')
 			continue
 		# Check thread limit
 		while numThreads == MAX_THREADS:
 			time.sleep(1)
 		# Wait for threads after an interrupt or thread-exception
-		if interrupted or threadException != None:
-			print("Waiting for existing threads to end")
+		if interrupted or threadException is not None:
+			print('Waiting for existing threads to end')
 			while numThreads > 0:
 				time.sleep(1)
 			exitLoop = True
 			break
 		# Perform download
-		print(f"Downloading image to {outPath}")
+		print(f'Downloading image to {outPath}')
 		numThreads += 1
 		thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
 		thread.start()
 	if exitLoop:
 		break
 # Close images-list db
-print("Finished downloading")
+print('Finished downloading')
 imgDbCon.close()
author	Terry Truong <terry06890@gmail.com>	2022-09-07 11:37:37 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-09-07 11:37:37 +1000
commit	daccbbd9c73a5292ea9d6746560d7009e5aa666d (patch)
tree	9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/eol/downloadImgs.py
parent	1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff)