From daccbbd9c73a5292ea9d6746560d7009e5aa666d Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Wed, 7 Sep 2022 11:37:37 +1000
Subject: Add python type annotations

Also use consistent quote symbols
Also use 'is None' instead of '== None'
Also use 'if list1' instead of 'if len(list1) > 0'
---
 backend/tolData/enwiki/genImgData.py | 118 +++++++++++++++++------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

(limited to 'backend/tolData/enwiki/genImgData.py')
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
index b5d546d..00140f6 100755
--- a/backend/tolData/enwiki/genImgData.py
+++ b/backend/tolData/enwiki/genImgData.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-import sys, re
+import re
 import bz2, html, urllib.parse
 import sqlite3
 
@@ -15,117 +15,117 @@ will skip already-processed page IDs.
 parser.parse_args()
 
 def getInputPageIds():
-	pageIds = set()
-	dbCon = sqlite3.connect("../data.db")
+	pageIds: set[int] = set()
+	dbCon = sqlite3.connect('../data.db')
 	dbCur = dbCon.cursor()
-	for (pageId,) in dbCur.execute("SELECT id from wiki_ids"):
+	for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
 		pageIds.add(pageId)
 	dbCon.close()
 	return pageIds
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
-indexDb = "dumpIndex.db"
-imgDb = "imgData.db" # The database to create
-idLineRegex = re.compile(r"<id>(.*)</id>")
-imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
-bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
-imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
-cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+indexDb = 'dumpIndex.db'
+imgDb = 'imgData.db' # The database to create
+idLineRegex = re.compile(r'<id>(.*)</id>')
+imageLineRegex = re.compile(r'.*\| *image *= *([^|]*)')
+bracketImageRegex = re.compile(r'\[\[(File:[^|]*).*]]')
+imageNameRegex = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
+cssImgCropRegex = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
 
-print("Getting input page-ids")
+print('Getting input page-ids')
 pageIds = getInputPageIds()
-print(f"Found {len(pageIds)}")
+print(f'Found {len(pageIds)}')
 
-print("Opening databases")
+print('Opening databases')
 indexDbCon = sqlite3.connect(indexDb)
 indexDbCur = indexDbCon.cursor()
 imgDbCon = sqlite3.connect(imgDb)
 imgDbCur = imgDbCon.cursor()
-print("Checking tables")
-if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+print('Checking tables')
+if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
 	# Create tables if not present
-	imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
-	imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+	imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+	imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
 else:
 	# Check for already-processed page IDs
 	numSkipped = 0
-	for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
+	for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
 		if pid in pageIds:
 			pageIds.remove(pid)
 			numSkipped += 1
 		else:
-			print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
-	print(f"Will skip {numSkipped} already-processed page IDs")
+			print(f'WARNING: Found already-processed page ID {pid} which was not in input set')
+	print(f'Will skip {numSkipped} already-processed page IDs')
 
-print("Getting dump-file offsets")
-offsetToPageids = {}
-offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
+print('Getting dump-file offsets')
+offsetToPageids: dict[int, list[int]] = {}
+offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
 iterNum = 0
 for pageId in pageIds:
 	iterNum += 1
 	if iterNum % 1e4 == 0:
-		print(f"At iteration {iterNum}")
+		print(f'At iteration {iterNum}')
 	#
-	query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
-	row = indexDbCur.execute(query, (pageId,)).fetchone()
-	if row == None:
-		print(f"WARNING: Page ID {pageId} not found")
+	query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
+	row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
+	if row is None:
+		print(f'WARNING: Page ID {pageId} not found')
 		continue
-	(chunkOffset, endOffset) = row
+	chunkOffset, endOffset = row
 	offsetToEnd[chunkOffset] = endOffset
 	if chunkOffset not in offsetToPageids:
 		offsetToPageids[chunkOffset] = []
 	offsetToPageids[chunkOffset].append(pageId)
-print(f"Found {len(offsetToEnd)} chunks to check")
+print(f'Found {len(offsetToEnd)} chunks to check')
 
-print("Iterating through chunks in dump file")
-def getImageName(content):
-	" Given an array of text-content lines, tries to return an infoxbox image name, or None "
+print('Iterating through chunks in dump file')
+def getImageName(content: list[str]) -> str | None:
+	""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
 	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
 	for line in content:
 		match = imageLineRegex.match(line)
-		if match != None:
+		if match is not None:
 			imageName = match.group(1).strip()
-			if imageName == "":
+			if imageName == '':
 				return None
 			imageName = html.unescape(imageName)
 			# Account for {{...
-			if imageName.startswith("{"):
+			if imageName.startswith('{'):
 				match = cssImgCropRegex.match(imageName)
-				if match == None:
+				if match is None:
 					return None
 				imageName = match.group(1)
 			# Account for [[File:...|...]]
-			if imageName.startswith("["):
+			if imageName.startswith('['):
 				match = bracketImageRegex.match(imageName)
-				if match == None:
+				if match is None:
 					return None
 				imageName = match.group(1)
 			# Account for <!--
-			if imageName.find("<!--") != -1:
+			if imageName.find('<!--') != -1:
 				return None
 			# Remove an initial 'File:'
-			if imageName.startswith("File:"):
+			if imageName.startswith('File:'):
 				imageName = imageName[5:]
 			# Remove an initial 'Image:'
-			if imageName.startswith("Image:"):
+			if imageName.startswith('Image:'):
 				imageName = imageName[6:]
 			# Check for extension
 			match = imageNameRegex.match(imageName)
-			if match != None:
+			if match is not None:
 				imageName = match.group(0)
 				imageName = urllib.parse.unquote(imageName)
 				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
-				imageName = imageName.replace("_", " ")
+				imageName = imageName.replace('_', ' ')
 				return imageName
 			# Exclude lines like: | image = &lt;imagemap&gt;
 			return None
 	return None
 with open(dumpFile, mode='rb') as file:
 	iterNum = 0
-	for (pageOffset, endOffset) in offsetToEnd.items():
+	for pageOffset, endOffset in offsetToEnd.items():
 		iterNum += 1
 		if iterNum % 100 == 0:
-			print(f"At iteration {iterNum}")
+			print(f'At iteration {iterNum}')
 		#
 		pageIds = offsetToPageids[pageOffset]
 		# Jump to chunk
@@ -137,14 +137,14 @@ with open(dumpFile, mode='rb') as file:
 		lineIdx = 0
 		while lineIdx < len(lines):
 			# Look for <page>
-			if lines[lineIdx].lstrip() != "<page>":
+			if lines[lineIdx].lstrip() != '<page>':
 				lineIdx += 1
 				continue
 			# Check page id
 			lineIdx += 3
 			idLine = lines[lineIdx].lstrip()
 			match = idLineRegex.fullmatch(idLine)
-			if match == None or int(match.group(1)) not in pageIds:
+			if match is None or int(match.group(1)) not in pageIds:
 				lineIdx += 1
 				continue
 			pageId = int(match.group(1))
@@ -152,35 +152,35 @@ with open(dumpFile, mode='rb') as file:
 			# Look for <text> in <page>
 			foundText = False
 			while lineIdx < len(lines):
-				if not lines[lineIdx].lstrip().startswith("<text "):
+				if not lines[lineIdx].lstrip().startswith('<text '):
 					lineIdx += 1
 					continue
 				foundText = True
 				# Get text content
-				content = []
+				content: list[str] = []
 				line = lines[lineIdx]
-				content.append(line[line.find(">") + 1:])
+				content.append(line[line.find('>') + 1:])
 				lineIdx += 1
 				foundTextEnd = False
 				while lineIdx < len(lines):
 					line = lines[lineIdx]
-					if not line.endswith("</text>"):
+					if not line.endswith('</text>'):
 						content.append(line)
 						lineIdx += 1
 						continue
 					foundTextEnd = True
-					content.append(line[:line.rfind("</text>")])
+					content.append(line[:line.rfind('</text>')])
 					# Look for image-filename
 					imageName = getImageName(content)
-					imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
+					imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName))
 					break
 				if not foundTextEnd:
-					print(f"WARNING: Did not find </text> for page id {pageId}")
+					print(f'WARNING: Did not find </text> for page id {pageId}')
 				break
 			if not foundText:
-				print(f"WARNING: Did not find <text> for page id {pageId}")
+				print(f'WARNING: Did not find <text> for page id {pageId}')
 
-print("Closing databases")
+print('Closing databases')
 indexDbCon.close()
 imgDbCon.commit()
 imgDbCon.close()
-- 
cgit v1.2.3