From daccbbd9c73a5292ea9d6746560d7009e5aa666d Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Wed, 7 Sep 2022 11:37:37 +1000
Subject: Add Python type annotations

Also use consistent quote symbols
Also use 'is None' instead of '== None'
Also use 'if list1' instead of 'if len(list1) > 0'
---
 backend/tolData/enwiki/downloadImgs.py | 50 +++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'backend/tolData/enwiki/downloadImgs.py')

diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py
index 520677f..def4714 100755
--- a/backend/tolData/enwiki/downloadImgs.py
+++ b/backend/tolData/enwiki/downloadImgs.py
@@ -16,20 +16,20 @@ in the output directory to decide what to skip.
 """, formatter_class=argparse.RawDescriptionHelpFormatter)
 parser.parse_args()
 
-imgDb = "imgData.db" # About 130k image names
-outDir = "imgs"
-licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+imgDb = 'imgData.db' # About 130k image names
+outDir = 'imgs'
+licenseRegex = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
 # In testing, this downloaded about 100k images, over several days
 
 if not os.path.exists(outDir):
 	os.mkdir(outDir)
-print("Checking for already-downloaded images")
+print('Checking for already-downloaded images')
 fileList = os.listdir(outDir)
-pageIdsDone = set()
+pageIdsDone: set[int] = set()
 for filename in fileList:
-	(basename, extension) = os.path.splitext(filename)
+	basename, extension = os.path.splitext(filename)
 	pageIdsDone.add(int(basename))
-print(f"Found {len(pageIdsDone)}")
+print(f'Found {len(pageIdsDone)}')
 
 # Set SIGINT handler
 interrupted = False
@@ -40,49 +40,49 @@ def onSigint(sig, frame):
 	signal.signal(signal.SIGINT, oldHandler)
 oldHandler = signal.signal(signal.SIGINT, onSigint)
 
-print("Opening database")
+print('Opening database')
 dbCon = sqlite3.connect(imgDb)
 dbCur = dbCon.cursor()
-print("Starting downloads")
+print('Starting downloads')
 iterNum = 0
-query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
-	" imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
-for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
+query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \
+	' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name'
+for pageId, license, artist, credit, restrictions, url in dbCur.execute(query):
 	if pageId in pageIdsDone:
 		continue
 	if interrupted:
-		print(f"Exiting loop")
+		print('Exiting loop')
 		break
 	# Check for problematic attributes
-	if license == None or licenseRegex.fullmatch(license) == None:
+	if license is None or licenseRegex.fullmatch(license) is None:
 		continue
-	if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
+	if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None:
 		continue
-	if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
+	if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None:
 		continue
-	if restrictions != None and restrictions != "":
+	if restrictions is not None and restrictions != '':
 		continue
 	# Download image
 	iterNum += 1
-	print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
+	print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
 	urlParts = urllib.parse.urlparse(url)
 	extension = os.path.splitext(urlParts.path)[1]
 	if len(extension) <= 1:
-		print(f"WARNING: No filename extension found in URL {url}")
+		print(f'WARNING: No filename extension found in URL {url}')
 		sys.exit(1)
-	outFile = f"{outDir}/{pageId}{extension}"
+	outFile = f'{outDir}/{pageId}{extension}'
 	headers = {
-		"user-agent": "terryt.dev (terry06890@gmail.com)",
-		"accept-encoding": "gzip",
+		'user-agent': 'terryt.dev (terry06890@gmail.com)',
+		'accept-encoding': 'gzip',
 	}
 	try:
 		response = requests.get(url, headers=headers)
 		with open(outFile, 'wb') as file:
 			file.write(response.content)
 		time.sleep(1)
-			# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
+			# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle self to 1 cache miss per sec'
 			# It's unclear how to properly check for cache misses, so this just aims for 1 per sec
 	except Exception as e:
-		print(f"Error while downloading to {outFile}: {e}")
-print("Closing database")
+		print(f'Error while downloading to {outFile}: {e}')
+print('Closing database')
 dbCon.close()
-- 
cgit v1.2.3