From abb936f5d76f7fe5cec1e8948d287da86643d504 Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Wed, 22 Jun 2022 23:16:42 +1000
Subject: Refactor backend scripts

---
 backend/data/enwiki/README.md                 |  2 +-
 backend/data/enwiki/downloadEnwikiImgs.py     | 88 --------------------------
 backend/data/enwiki/downloadImgLicenseInfo.py | 60 ++++++++++--------
 backend/data/enwiki/downloadImgs.py           | 91 +++++++++++++++++++++++++++
 backend/data/enwiki/genDescData.py            | 43 +++++++------
 backend/data/enwiki/genDumpIndexDb.py         | 26 ++++----
 backend/data/enwiki/genImgData.py             | 72 ++++++++++++---------
 backend/data/enwiki/lookupPage.py             | 22 ++++---
 8 files changed, 219 insertions(+), 185 deletions(-)
 delete mode 100755 backend/data/enwiki/downloadEnwikiImgs.py
 create mode 100755 backend/data/enwiki/downloadImgs.py

(limited to 'backend/data/enwiki')
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index 1c16a2e..90d16c7 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -42,7 +42,7 @@ This directory holds files obtained from/using [English Wikipedia](https://en.wi
         `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
     -   `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br>
         Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
--   downloadEnwikiImgs.py <br>
+-   downloadImgs.py <br>
     Used to download image files into imgs/.
 
 # Other Files
diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py
deleted file mode 100755
index 2929a0d..0000000
--- a/backend/data/enwiki/downloadEnwikiImgs.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import sqlite3
-import urllib.parse, requests
-import time, signal
-
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an sqlite db,\n"
-usageInfo += "into a specified directory.'\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "in the output directory do decide what to skip.\n"
-if len(sys.argv) > 1:
-	print(usageInfo, file=sys.stderr)
-	sys.exit(1)
-
-imgDb = "imgData.db" # About 130k image names
-outDir = "imgs"
-licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
-
-# Create output directory if not present
-if not os.path.exists(outDir):
-	os.mkdir(outDir)
-# Get existing image names
-print("Gettings already-downloaded images")
-fileList = os.listdir(outDir)
-pageIdsDone = set()
-for filename in fileList:
-	(basename, extension) = os.path.splitext(filename)
-	pageIdsDone.add(int(basename))
-print(f"Found {len(pageIdsDone)} already-downloaded images")
-# Set SIGINT handler
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Open db
-dbCon = sqlite3.connect(imgDb)
-dbCur = dbCon.cursor()
-# Start downloads
-print("Starting downloads")
-iterNum = 0
-query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
-	" imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
-for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
-	if pageId in pageIdsDone:
-		continue
-	if interrupted:
-		print(f"Exiting loop")
-		break
-	# Check for problematic attributes
-	if license == None or licenseRegex.fullmatch(license) == None:
-		continue
-	if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
-		continue
-	if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
-		continue
-	if restrictions != None and restrictions != "":
-		continue
-	# Download image
-	iterNum += 1
-	print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
-	urlParts = urllib.parse.urlparse(url)
-	extension = os.path.splitext(urlParts.path)[1]
-	if len(extension) <= 1:
-		print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
-		sys.exit(1)
-	outFile = f"{outDir}/{pageId}{extension}"
-	headers = {
-		"user-agent": "terryt.dev (terry06890@gmail.com)",
-		"accept-encoding": "gzip",
-	}
-	try:
-		response = requests.get(url, headers=headers)
-		with open(outFile, 'wb') as file:
-			file.write(response.content)
-		time.sleep(1)
-			# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
-			# It's unclear how to properly check for cache misses, so just do about <=1 per sec
-	except Exception as e:
-		print(f"Error while downloading to {outFile}: {e}", file=sys.stderr)
-# Close db
-dbCon.close()
diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py
index 097304b..399922e 100755
--- a/backend/data/enwiki/downloadImgLicenseInfo.py
+++ b/backend/data/enwiki/downloadImgLicenseInfo.py
@@ -5,41 +5,48 @@ import sqlite3, urllib.parse, html
 import requests
 import time, signal
 
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads image names from a file, and uses enwiki's API to obtain\n"
-usageInfo += "licensing information for them, adding the info to a sqlite db.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "at names added to the db to decide what to skip.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
 if len(sys.argv) > 1:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
 
-imgDb = "imgData.db" # About 130k image names
+imgDb = "imgData.db"
 apiUrl = "https://en.wikipedia.org/w/api.php"
+userAgent = "terryt.dev (terry06890@gmail.com)"
 batchSz = 50 # Max 50
 tagRegex = re.compile(r"<[^<]+>")
 whitespaceRegex = re.compile(r"\s+")
 
-# Open db
+print("Opening database")
 dbCon = sqlite3.connect(imgDb)
 dbCur = dbCon.cursor()
 dbCur2 = dbCon.cursor()
-# Create table if it doesn't exist
+print("Checking for table")
 if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
 	dbCur.execute("CREATE TABLE imgs(" \
 		"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
-# Get image names
+
 print("Reading image names")
 imgNames = set()
 for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
 	imgNames.add(imgName)
-print(f"Found {len(imgNames)} images")
+print(f"Found {len(imgNames)}")
+
+print("Checking for already-processed images")
 oldSz = len(imgNames)
 for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
 	imgNames.discard(imgName)
-print(f"Skipping {oldSz - len(imgNames)} already-done images")
+print(f"Found {oldSz - len(imgNames)}")
+
 # Set SIGINT handler
 interrupted = False
 oldHandler = None
@@ -48,7 +55,8 @@ def onSigint(sig, frame):
 	interrupted = True
 	signal.signal(signal.SIGINT, oldHandler)
 oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Iterate through image names, making API requests
+
+print("Iterating through image names")
 imgNames = list(imgNames)
 iterNum = 0
 for i in range(0, len(imgNames), batchSz):
@@ -63,7 +71,7 @@ for i in range(0, len(imgNames), batchSz):
 	imgBatch = ["File:" + x for x in imgBatch]
 	# Make request
 	headers = {
-		"user-agent": "terryt.dev (terry06890@gmail.com)",
+		"user-agent": userAgent,
 		"accept-encoding": "gzip",
 	}
 	params = {
@@ -80,16 +88,16 @@ for i in range(0, len(imgNames), batchSz):
 		response = requests.get(apiUrl, params=params, headers=headers)
 		responseObj = response.json()
 	except Exception as e:
-		print(f"Error while downloading info: {e}", file=sys.stderr)
-		print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
+		print(f"ERROR: Exception while downloading info: {e}")
+		print(f"\tImage batch: " + "|".join(imgBatch))
 		continue
 	# Parse response-object
 	if "query" not in responseObj or "pages" not in responseObj["query"]:
-		print("WARNING: Response object for doesn't have page data", file=sys.stderr)
-		print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
+		print("WARNING: Response object for doesn't have page data")
+		print("\tImage batch: " + "|".join(imgBatch))
 		if "error" in responseObj:
 			errorCode = responseObj["error"]["code"]
-			print(f"\tError code: {errorCode}", file=sys.stderr)
+			print(f"\tError code: {errorCode}")
 			if errorCode == "maxlag":
 				time.sleep(5)
 		continue
@@ -111,10 +119,10 @@ for i in range(0, len(imgNames), batchSz):
 			title = normalisedToInput[title]
 		title = title[5:] # Remove 'File:'
 		if title not in imgNames:
-			print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr)
+			print(f"WARNING: Got title \"{title}\" not in image-name list")
 			continue
 		if "imageinfo" not in page:
-			print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr)
+			print(f"WARNING: No imageinfo section for page \"{title}\"")
 			continue
 		metadata = page["imageinfo"][0]["extmetadata"]
 		url = page["imageinfo"][0]["url"]
@@ -122,7 +130,7 @@ for i in range(0, len(imgNames), batchSz):
 		artist = metadata['Artist']['value'] if 'Artist' in metadata else None
 		credit = metadata['Credit']['value'] if 'Credit' in metadata else None
 		restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
-		# Remove newlines
+		# Remove markup
 		if artist != None:
 			artist = tagRegex.sub(" ", artist)
 			artist = whitespaceRegex.sub(" ", artist)
@@ -134,7 +142,9 @@ for i in range(0, len(imgNames), batchSz):
 			credit = html.unescape(credit)
 			credit = urllib.parse.unquote(credit)
 		# Add to db
-		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", (title, license, artist, credit, restrictions, url))
-# Close db
+		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+			(title, license, artist, credit, restrictions, url))
+
+print("Closing database")
 dbCon.commit()
 dbCon.close()
diff --git a/backend/data/enwiki/downloadImgs.py b/backend/data/enwiki/downloadImgs.py
new file mode 100755
index 0000000..8fb605f
--- /dev/null
+++ b/backend/data/enwiki/downloadImgs.py
@@ -0,0 +1,98 @@
+#!/usr/bin/python3
+
+import sys, re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+imgDb = "imgData.db" # About 130k image names
+outDir = "imgs"
+userAgent = "terryt.dev (terry06890@gmail.com)"
+requestTimeout = 60 # Seconds before a stalled request is abandoned
+licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+# In testing, this downloaded about 100k images, over several days
+
+if not os.path.exists(outDir):
+	os.mkdir(outDir)
+print("Checking for already-downloaded images")
+fileList = os.listdir(outDir)
+pageIdsDone = set()
+for filename in fileList:
+	(basename, extension) = os.path.splitext(filename)
+	try:
+		pageIdsDone.add(int(basename))
+	except ValueError:
+		# Not named 'pageId1.ext1', so not an image this script wrote
+		print(f"WARNING: Ignoring unexpected file {filename} in {outDir}")
+print(f"Found {len(pageIdsDone)}")
+
+# Set SIGINT handler, which only sets a flag, so an ongoing download can finish
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+	global interrupted
+	interrupted = True
+	signal.signal(signal.SIGINT, oldHandler) # A second SIGINT gets the original behaviour
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+
+print("Opening database")
+dbCon = sqlite3.connect(imgDb)
+dbCur = dbCon.cursor()
+print("Starting downloads")
+iterNum = 0
+headers = {
+	"user-agent": userAgent,
+	"accept-encoding": "gzip",
+}
+query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
+	" imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
+for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
+	if pageId in pageIdsDone:
+		continue
+	if interrupted:
+		print(f"Exiting loop")
+		break
+	# Check for problematic attributes
+	if license == None or licenseRegex.fullmatch(license) == None:
+		continue
+	if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
+		continue
+	if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
+		continue
+	if restrictions != None and restrictions != "":
+		continue
+	# Download image
+	iterNum += 1
+	print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
+	urlParts = urllib.parse.urlparse(url)
+	extension = os.path.splitext(urlParts.path)[1]
+	if len(extension) <= 1:
+		print(f"WARNING: No filename extension found in URL {url}")
+		sys.exit(1)
+	outFile = f"{outDir}/{pageId}{extension}"
+	try:
+		response = requests.get(url, headers=headers, timeout=requestTimeout)
+		response.raise_for_status() # Don't save an HTTP error page as if it were the image
+		with open(outFile, 'wb') as file:
+			file.write(response.content)
+	except Exception as e:
+		print(f"Error while downloading to {outFile}: {e}")
+	time.sleep(1)
+		# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
+		# It's unclear how to properly check for cache misses, so this just aims for 1 per sec
+print("Closing database")
+dbCon.close()
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
index 032dbed..b0ca272 100755
--- a/backend/data/enwiki/genDescData.py
+++ b/backend/data/enwiki/genDescData.py
@@ -5,31 +5,36 @@ import bz2
 import html, mwxml, mwparserfromhell
 import sqlite3
 
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n"
-usageInfo += "and short-description info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads through the wiki dump, and attempts to
+parse short-descriptions, and add them to a database.
+"""
 if len(sys.argv) > 1:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
 
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages
 enwikiDb = "descData.db"
+# In testing, this script took over 10 hours to run, and generated about 5GB
 
-# Some regexps and functions for parsing wikitext
 descLineRegex = re.compile("^ *[A-Z'\"]")
 embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
 	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
 convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
-parensGrpRegex = re.compile(r" \([^()]*\)")
-leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
 def convertTemplateReplace(match):
 	if match.group(2) == None:
 		return f"{match.group(1)} {match.group(4)}"
 	else:
 		return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}"
+parensGroupRegex = re.compile(r" \([^()]*\)")
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
+
 def parseDesc(text):
-	# Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank
-	# Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, 
+	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+		# and then accumulate lines until a blank one.
+	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, 
 		# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, 
 	lines = []
 	openBraceCount = 0
@@ -74,18 +79,15 @@ def removeMarkup(content):
 	content = embeddedHtmlRegex.sub("", content)
 	content = convertTemplateRegex.sub(convertTemplateReplace, content)
 	content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
-	content = parensGrpRegex.sub("", content)
+	content = parensGroupRegex.sub("", content)
 	content = leftoverBraceRegex.sub("", content)
 	return content
-# Other helper functions
 def convertTitle(title):
 	return html.unescape(title).replace("_", " ")
 
-# Check for existing db
+print("Creating database")
 if os.path.exists(enwikiDb):
-	print(f"ERROR: Existing {enwikiDb}", file=sys.stderr)
-	sys.exit(1)
-# Create db
+	raise Exception(f"ERROR: Existing {enwikiDb}")
 dbCon = sqlite3.connect(enwikiDb)
 dbCur = dbCon.cursor()
 dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
@@ -93,8 +95,8 @@ dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
 dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
 dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)")
 dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
-# Read through dump file
-print("Reading dump file")
+
+print("Iterating through dump file")
 with bz2.open(dumpFile, mode='rt') as file:
 	dump = mwxml.Dump.from_file(file)
 	pageNum = 0
@@ -102,13 +104,13 @@ with bz2.open(dumpFile, mode='rt') as file:
 		pageNum += 1
 		if pageNum % 1e4 == 0:
 			print(f"At page {pageNum}")
 		# Parse page
 		if page.namespace == 0:
 			try:
 				dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
 			except sqlite3.IntegrityError as e:
 				# Accounts for certain pages that have the same title
-				print(f"Failed to add page with title \"{page.title}\": {e}")
+				print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr)
 				continue
 			if page.redirect != None:
 				dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
@@ -117,6 +119,7 @@ with bz2.open(dumpFile, mode='rt') as file:
 				desc = parseDesc(revision.text)
 				if desc != None:
 					dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
-# Close db
+
+print("Closing database")
 dbCon.commit()
 dbCon.close()
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py
index ee3e813..3955885 100755
--- a/backend/data/enwiki/genDumpIndexDb.py
+++ b/backend/data/enwiki/genDumpIndexDb.py
@@ -4,25 +4,26 @@ import sys, os, re
 import bz2
 import sqlite3
 
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a Wikimedia enwiki dump index file,\n"
-usageInfo += "and stores it's offset and title data to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Adds data from the wiki dump index-file into a database.
+"""
 if len(sys.argv) > 1:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
 
-indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines
+indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
 indexDb = "dumpIndex.db"
 
-# Check for existing db
 if os.path.exists(indexDb):
-	print(f"ERROR: Existing {indexDb}", file=sys.stderr)
-	sys.exit(1)
-# Create db
+	raise Exception(f"ERROR: Existing {indexDb}")
+print("Creating database")
 dbCon = sqlite3.connect(indexDb)
 dbCur = dbCon.cursor()
 dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
-# Reading index file
+
+print("Iterating through index file")
 lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
 lastOffset = 0
 lineNum = 0
@@ -42,7 +43,7 @@ with bz2.open(indexFile, mode='rt') as file:
 					dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
 				except sqlite3.IntegrityError as e:
 					# Accounts for certain entries in the file that have the same title
-					print(f"Failed on title \"{t}\": {e}")
+					print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
 			entriesToAdd = []
 			lastOffset = offset
 		entriesToAdd.append([title, pageId])
@@ -50,7 +51,8 @@ for (title, pageId) in entriesToAdd:
 	try:
 		dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
 	except sqlite3.IntegrityError as e:
-		print(f"Failed on title \"{t}\": {e}")
-# Close db
+		print(f"Failed on title \"{title}\": {e}", file=sys.stderr)
+
+print("Closing database")
 dbCon.commit()
 dbCon.close()
diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py
index 9bd28f4..dedfe14 100755
--- a/backend/data/enwiki/genImgData.py
+++ b/backend/data/enwiki/genImgData.py
@@ -4,9 +4,15 @@ import sys, re
 import bz2, html, urllib.parse
 import sqlite3
 
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
-usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
 if len(sys.argv) > 1:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
@@ -21,58 +27,64 @@ def getInputPageIds():
 	return pageIds
 dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
 indexDb = "dumpIndex.db"
-imgDb = "imgData.db" # Output db
+imgDb = "imgData.db" # The database to create
 idLineRegex = re.compile(r"<id>(.*)</id>")
 imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
 bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
 imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
 cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+# In testing, got about 360k image names
 
-# Open dbs
+print("Getting input page-ids")
+pageIds = getInputPageIds()
+print(f"Found {len(pageIds)}")
+
+print("Opening databases")
 indexDbCon = sqlite3.connect(indexDb)
 indexDbCur = indexDbCon.cursor()
 imgDbCon = sqlite3.connect(imgDb)
 imgDbCur = imgDbCon.cursor()
-# Create image-db table
-pidsDone = set()
+print("Checking tables")
 if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+	# Create tables if not present
 	imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
 	imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
 else:
+	# Check for already-processed page IDs
+	numSkipped = 0
 	for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
-		pidsDone.add(pid)
-	print(f"Will skip {len(pidsDone)} already-processed page-ids")
-# Get input pageIds
-print("Getting input page-ids", file=sys.stderr)
-pageIds = getInputPageIds()
-for pid in pidsDone:
-	pageIds.remove(pid)
-print(f"Found {len(pageIds)} page-ids to process")
-# Get page-id dump-file offsets
-print("Getting dump-file offsets", file=sys.stderr)
+		if pid in pageIds:
+			pageIds.remove(pid)
+			numSkipped += 1
+		else:
+			print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
+	print(f"Will skip {numSkipped} already-processed page IDs")
+
+print("Getting dump-file offsets")
 offsetToPageids = {}
-offsetToEnd = {}
+offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
 iterNum = 0
 for pageId in pageIds:
 	iterNum += 1
 	if iterNum % 1e4 == 0:
-		print(f"At iteration {iterNum}", file=sys.stderr)
+		print(f"At iteration {iterNum}")
 	#
 	query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
 	row = indexDbCur.execute(query, (pageId,)).fetchone()
 	if row == None:
-		print(f"WARNING: Page id {pageId} not found", file=sys.stderr)
+		print(f"WARNING: Page ID {pageId} not found")
 		continue
 	(chunkOffset, endOffset) = row
 	offsetToEnd[chunkOffset] = endOffset
 	if chunkOffset not in offsetToPageids:
 		offsetToPageids[chunkOffset] = []
 	offsetToPageids[chunkOffset].append(pageId)
-print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr)
-# Look through dump file, jumping to chunks containing relevant pages
-print("Reading through dump file", file=sys.stderr)
+print(f"Found {len(offsetToEnd)} chunks to check")
+
+print("Iterating through chunks in dump file")
 def getImageName(content):
-	""" Given an array of text-content lines, returns an image-filename, or None """
+	" Given an array of text-content lines, tries to return an infobox image name, or None "
+	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
 	for line in content:
 		match = imageLineRegex.match(line)
 		if match != None:
@@ -109,16 +121,15 @@ def getImageName(content):
 				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
 				imageName = imageName.replace("_", " ")
 				return imageName
-			# Skip lines like: | image = &lt;imagemap&gt;
+			# Exclude lines like: | image = &lt;imagemap&gt;
 			return None
-	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
 	return None
 with open(dumpFile, mode='rb') as file:
 	iterNum = 0
 	for (pageOffset, endOffset) in offsetToEnd.items():
 		iterNum += 1
 		if iterNum % 100 == 0:
-			print(f"At iteration {iterNum}", file=sys.stderr)
+			print(f"At iteration {iterNum}")
 		#
 		pageIds = offsetToPageids[pageOffset]
 		# Jump to chunk
@@ -168,11 +179,12 @@ with open(dumpFile, mode='rb') as file:
 					imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
 					break
 				if not foundTextEnd:
-					print(f"Did not find </text> for page id {pageId}", file=sys.stderr)
+					print(f"WARNING: Did not find </text> for page id {pageId}")
 				break
 			if not foundText:
-				print(f"Did not find <text> for page id {pageId}", file=sys.stderr)
-# Close dbs
+				print(f"WARNING: Did not find <text> for page id {pageId}")
+
+print("Closing databases")
 indexDbCon.close()
 imgDbCon.commit()
 imgDbCon.close()
diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py
index 76f2f95..1a90851 100755
--- a/backend/data/enwiki/lookupPage.py
+++ b/backend/data/enwiki/lookupPage.py
@@ -4,9 +4,12 @@ import sys, re
 import bz2
 import sqlite3
 
-usageInfo =  f"usage: {sys.argv[0]} title1\n"
-usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n"
-usageInfo += "using a dump index db, and prints the corresponding <page>.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]} title1
+
+Looks up a page with title title1 in the wiki dump, using
+the dump-index db, and prints the corresponding <page>.
+"""
 if len(sys.argv) != 2:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
@@ -15,20 +18,19 @@ dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
 indexDb = "dumpIndex.db"
 pageTitle = sys.argv[1].replace("_", " ")
 
-# Searching index file
-print("Lookup offset in index db")
+print("Looking up offset in index db")
 dbCon = sqlite3.connect(indexDb)
 dbCur = dbCon.cursor()
 query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
 row = dbCur.execute(query, (pageTitle,)).fetchone()
 if row == None:
 	print("Title not found")
-	sys.exit(1)
-(_, pageOffset, endOffset) = row
+	sys.exit(0)
+_, pageOffset, endOffset = row
 dbCon.close()
 print(f"Found chunk at offset {pageOffset}")
-# Read dump file
-print("Reading dump file")
+
+print("Reading from wiki dump")
 content = []
 with open(dumpFile, mode='rb') as file:
 	# Get uncompressed chunk
@@ -61,6 +63,6 @@ with open(dumpFile, mode='rb') as file:
 					if line.lstrip() == "</page>":
 						break
 		lineIdx += 1
-# Print content
+
 print("Content: ")
 print("\n".join(content))
-- 
cgit v1.2.3