From daccbbd9c73a5292ea9d6746560d7009e5aa666d Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Wed, 7 Sep 2022 11:37:37 +1000
Subject: Add python type annotations

Also use consistent quote symbols
Also use 'is None' instead of '== None'
Also use 'if list1' instead of 'if len(list1) > 0'
---
 backend/tolData/enwiki/downloadImgLicenseInfo.py | 136 +++++++++++------------
 1 file changed, 68 insertions(+), 68 deletions(-)

(limited to 'backend/tolData/enwiki/downloadImgLicenseInfo.py')

diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
index dd39d54..ba6317e 100755
--- a/backend/tolData/enwiki/downloadImgLicenseInfo.py
+++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-import sys, re
+import re
 import sqlite3, urllib.parse, html
 import requests
 import time, signal
@@ -16,33 +16,33 @@ at already-processed names to decide what to skip.
 """, formatter_class=argparse.RawDescriptionHelpFormatter)
 parser.parse_args()
 
-imgDb = "imgData.db"
-apiUrl = "https://en.wikipedia.org/w/api.php"
-userAgent = "terryt.dev (terry06890@gmail.com)"
+imgDb = 'imgData.db'
+apiUrl = 'https://en.wikipedia.org/w/api.php'
+userAgent = 'terryt.dev (terry06890@gmail.com)'
 batchSz = 50 # Max 50
-tagRegex = re.compile(r"<[^<]+>")
-whitespaceRegex = re.compile(r"\s+")
+tagRegex = re.compile(r'<[^<]+>')
+whitespaceRegex = re.compile(r'\s+')
 
-print("Opening database")
+print('Opening database')
 dbCon = sqlite3.connect(imgDb)
 dbCur = dbCon.cursor()
 dbCur2 = dbCon.cursor()
-print("Checking for table")
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
-	dbCur.execute("CREATE TABLE imgs(" \
-		"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
+print('Checking for table')
+if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
+	dbCur.execute('CREATE TABLE imgs(' \
+		'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
 
-print("Reading image names")
-imgNames = set()
-for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
+print('Reading image names')
+imgNames: set[str] = set()
+for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
 	imgNames.add(imgName)
-print(f"Found {len(imgNames)}")
+print(f'Found {len(imgNames)}')
 
-print("Checking for already-processed images")
+print('Checking for already-processed images')
 oldSz = len(imgNames)
-for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
+for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
 	imgNames.discard(imgName)
-print(f"Found {oldSz - len(imgNames)}")
+print(f'Found {oldSz - len(imgNames)}')
 
 # Set SIGINT handler
 interrupted = False
@@ -53,95 +53,95 @@ def onSigint(sig, frame):
 	signal.signal(signal.SIGINT, oldHandler)
 oldHandler = signal.signal(signal.SIGINT, onSigint)
 
-print("Iterating through image names")
-imgNames = list(imgNames)
+print('Iterating through image names')
+imgNameList = list(imgNames)
 iterNum = 0
-for i in range(0, len(imgNames), batchSz):
+for i in range(0, len(imgNameList), batchSz):
 	iterNum += 1
 	if iterNum % 1 == 0:
-		print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
+		print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)')
 	if interrupted:
-		print(f"Exiting loop at iteration {iterNum}")
+		print(f'Exiting loop at iteration {iterNum}')
 		break
 	# Get batch
-	imgBatch = imgNames[i:i+batchSz]
-	imgBatch = ["File:" + x for x in imgBatch]
+	imgBatch = imgNameList[i:i+batchSz]
+	imgBatch = ['File:' + x for x in imgBatch]
 	# Make request
 	headers = {
-		"user-agent": userAgent,
-		"accept-encoding": "gzip",
+		'user-agent': userAgent,
+		'accept-encoding': 'gzip',
 	}
 	params = {
-		"action": "query",
-		"format": "json",
-		"prop": "imageinfo",
-		"iiprop": "extmetadata|url",
-		"maxlag": "5",
-		"titles": "|".join(imgBatch),
-		"iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
+		'action': 'query',
+		'format': 'json',
+		'prop': 'imageinfo',
+		'iiprop': 'extmetadata|url',
+		'maxlag': '5',
+		'titles': '|'.join(imgBatch),
+		'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions',
 	}
 	responseObj = None
 	try:
 		response = requests.get(apiUrl, params=params, headers=headers)
 		responseObj = response.json()
 	except Exception as e:
-		print(f"ERROR: Exception while downloading info: {e}")
-		print(f"\tImage batch: " + "|".join(imgBatch))
+		print(f'ERROR: Exception while downloading info: {e}')
+		print('\tImage batch: ' + '|'.join(imgBatch))
 		continue
 	# Parse response-object
-	if "query" not in responseObj or "pages" not in responseObj["query"]:
-		print("WARNING: Response object for doesn't have page data")
-		print("\tImage batch: " + "|".join(imgBatch))
-		if "error" in responseObj:
-			errorCode = responseObj["error"]["code"]
-			print(f"\tError code: {errorCode}")
-			if errorCode == "maxlag":
+	if 'query' not in responseObj or 'pages' not in responseObj['query']:
+		print('WARNING: Response object for doesn\'t have page data')
+		print('\tImage batch: ' + '|'.join(imgBatch))
+		if 'error' in responseObj:
+			errorCode = responseObj['error']['code']
+			print(f'\tError code: {errorCode}')
+			if errorCode == 'maxlag':
 				time.sleep(5)
 		continue
-	pages = responseObj["query"]["pages"]
-	normalisedToInput = {}
-	if "normalized" in responseObj["query"]:
-		for entry in responseObj["query"]["normalized"]:
-			normalisedToInput[entry["to"]] = entry["from"]
-	for (_, page) in pages.items():
+	pages = responseObj['query']['pages']
+	normalisedToInput: dict[str, str] = {}
+	if 'normalized' in responseObj['query']:
+		for entry in responseObj['query']['normalized']:
+			normalisedToInput[entry['to']] = entry['from']
+	for _, page in pages.items():
 		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
 			# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
 			# Artist: author name (might contain complex html, multiple authors, etc)
 			# Credit: 'source'
 				# For image-map-like images, can be quite large/complex html, creditng each sub-image
-				# May be <a href="text1">text2</a>, where the text2 might be non-indicative
+				# May be <a href='text1'>text2</a>, where the text2 might be non-indicative
 			# Restrictions: specifies non-copyright legal restrictions
-		title = page["title"]
+		title: str = page['title']
 		if title in normalisedToInput:
 			title = normalisedToInput[title]
 		title = title[5:] # Remove 'File:'
 		if title not in imgNames:
-			print(f"WARNING: Got title \"{title}\" not in image-name list")
+			print(f'WARNING: Got title "{title}" not in image-name list')
 			continue
-		if "imageinfo" not in page:
-			print(f"WARNING: No imageinfo section for page \"{title}\"")
+		if 'imageinfo' not in page:
+			print(f'WARNING: No imageinfo section for page "{title}"')
 			continue
-		metadata = page["imageinfo"][0]["extmetadata"]
-		url = page["imageinfo"][0]["url"]
-		license = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
-		artist = metadata['Artist']['value'] if 'Artist' in metadata else None
-		credit = metadata['Credit']['value'] if 'Credit' in metadata else None
-		restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+		metadata = page['imageinfo'][0]['extmetadata']
+		url: str = page['imageinfo'][0]['url']
+		license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
+		artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
+		credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
+		restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
 		# Remove markup
-		if artist != None:
-			artist = tagRegex.sub(" ", artist)
-			artist = whitespaceRegex.sub(" ", artist)
+		if artist is not None:
+			artist = tagRegex.sub(' ', artist)
+			artist = whitespaceRegex.sub(' ', artist)
 			artist = html.unescape(artist)
 			artist = urllib.parse.unquote(artist)
-		if credit != None:
-			credit = tagRegex.sub(" ", credit)
-			credit = whitespaceRegex.sub(" ", credit)
+		if credit is not None:
+			credit = tagRegex.sub(' ', credit)
+			credit = whitespaceRegex.sub(' ', credit)
 			credit = html.unescape(credit)
 			credit = urllib.parse.unquote(credit)
 		# Add to db
-		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+		dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
 			(title, license, artist, credit, restrictions, url))
 
-print("Closing database")
+print('Closing database')
 dbCon.commit()
 dbCon.close()
-- 
cgit v1.2.3