7 files changed, 39 insertions, 61 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index 29fc2ff..76d33e5 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -33,12 +33,12 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
 
 # Image Files
 -   `gen_img_data.py` <br>
-    Used to find infobox image names for page IDs, and store them into a database.
+    Finds infobox image names for page IDs, and stores them into a database.
 -   `download_img_license_info.py` <br>
-    Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database.
+    Downloads licensing metadata for image names, via Wikipedia's online API, and stores them into a database.
 -   `img_data.db` <br>
-    Used to hold metadata about infobox images for a set of page IDs.
-    Generated using `get_enwiki_img_data.py` and `download_img_license_info.py`. <br>
+    Holds metadata about infobox images for a set of page IDs.
+    Generated using `gen_img_data.py` and `download_img_license_info.py`. <br>
     Tables: <br>
     -   `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br>
         `img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs.
@@ -47,7 +47,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
             <br>
         Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
 -   `download_imgs.py` <br>
-    Used to download image files into imgs/.
+    Downloads image files into `imgs/`.
 
 # Description Files
 -   `gen_desc_data.py` <br>
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 1217caf..43f2c43 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks
 at already-processed names to decide what to skip.
 """
 
-import re
+import argparse
+import re, time, signal
 import sqlite3, urllib.parse, html
 import requests
-import time, signal
 
 IMG_DB = 'img_data.db'
 #
@@ -150,7 +150,6 @@ def downloadInfo(imgDb: str) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index bbd2cda..7dd0771 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks
 in the output directory do decide what to skip.
 """
 
-import re, os
+import argparse
+import re, os, time, signal
 import sqlite3
 import urllib.parse, requests
-import time, signal
 
 IMG_DB = 'img_data.db' # About 130k image names
 OUT_DIR = 'imgs'
@@ -22,7 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
 TIMEOUT = 1
 	# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
 	# It's unclear how to properly check for cache misses, so we just aim for 1 per sec
-BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit)
+EXP_BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit)
 
 def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 	if not os.path.exists(outDir):
@@ -84,7 +84,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 			time.sleep(timeout)
 		except Exception as e:
 			print(f'Error while downloading to {outFile}: {e}')
-			if not BACKOFF:
+			if not EXP_BACKOFF:
 				return
 			else:
 				timeout *= 2
@@ -94,7 +94,6 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index b3fde52..bb2b845 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -7,14 +7,14 @@ and adds them to a database
 
 # In testing, this script took over 10 hours to run, and generated about 5GB
 
+import argparse
 import sys, os, re
-import bz2
-import html, mwxml, mwparserfromhell
+import bz2, html, mwxml, mwparserfromhell
 import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
 DB_FILE = 'desc_data.db'
-
+# Regexps
 DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
 EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
 	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
@@ -119,7 +119,6 @@ def convertTitle(title: str) -> str:
 	return html.unescape(title).replace('_', ' ')
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 5778680..6be8bc5 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -1,11 +1,12 @@
 #!/usr/bin/python3
 
 """
-Adds data from the wiki dump index-file into a database
+Adds data from the wiki-dump index-file into a database
 """
+
+import argparse
 import sys, os, re
-import bz2
-import sqlite3
+import bz2, sqlite3
 
 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
 DB_FILE = 'dump_index.db'
@@ -53,7 +54,6 @@ def genData(indexFile: str, dbFile: str) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index b4ade9f..9aa3863 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -8,17 +8,15 @@ The program can be re-run with an updated set of page IDs, and
 will skip already-processed page IDs.
 """
 
-import re
-import os, bz2, html, urllib.parse
+import os, re
+import bz2, html, urllib.parse
 import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
 INDEX_DB = 'dump_index.db'
-PAGEVIEW_DB = 'pageview_data.db'
 IMG_DB = 'img_data.db' # The database to create
 DB_FILE = os.path.join('..', 'data.db')
-MAX_IMGS_PER_CTG = 20000
-#
+# Regexps
 ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
 IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
 BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
@@ -35,7 +33,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
 		# Create tables if not present
 		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)')
-			# 'img_name' may be NULL
+			# 'img_name' values are set to NULL to indicate page IDs where no image was found
 		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
 	else:
 		# Check for already-processed page IDs
@@ -179,49 +177,28 @@ def getImageName(content: list[str]) -> str | None:
 			return None
 	return None
 
-def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]:
+def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
 	print('Getting event data')
-	titleToCtg: dict[str, str] = {}
+	titles: set[str] = set()
 	dbCon = sqlite3.connect(dbFile)
-	for title, ctg in dbCon.execute('SELECT title, ctg from events'):
-		titleToCtg[title] = ctg
+	for (title,) in dbCon.execute('SELECT title from events'):
+		titles.add(title)
 	dbCon.close()
-	print('Getting top images for each event category')
-	ctgToTitles: dict[str, list[str]] = {}
-	dbCon = sqlite3.connect(pageviewDb)
-	for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'):
-		if title not in titleToCtg:
-			continue
-		ctg = titleToCtg[title]
-		if ctg not in ctgToTitles:
-			ctgToTitles[ctg] = []
-		elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
-			continue
-		ctgToTitles[ctg].append(title)
-		del titleToCtg[title]
-	dbCon.close()
-	for title, ctg in titleToCtg.items(): # Account for titles without view counts
-		if ctg not in ctgToTitles:
-			ctgToTitles[ctg] = []
-		elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
-			continue
-		ctgToTitles[ctg].append(title)
 	print('Getting page IDs')
 	pageIds: set[int] = set()
 	dbCon = sqlite3.connect(indexDb)
 	dbCur = dbCon.cursor()
-	for ctg in ctgToTitles:
-		for title in ctgToTitles[ctg]:
-			row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
-			if row:
-				pageIds.add(row[0])
+	for title in titles:
+		row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+		if row:
+			pageIds.add(row[0])
 	dbCon.close()
-	print(f'Result: {len(pageIds)} out of {len(titleToCtg)}')
+	print(f'Result: {len(pageIds)} out of {len(titles)}')
 	return pageIds
 if __name__ == '__main__':
 	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
-	pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG)
+	pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB)
 	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 90ec925..935b303 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -36,9 +36,13 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 				if not line.startswith(linePrefix):
 					continue
 				# Get second and second-last fields
-				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
-				title = line[:line.find(b' ')].decode('utf-8')
-				viewCount = int(line[line.rfind(b' ')+1:])
+				linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
+				title = linePart[:linePart.find(b' ')].decode('utf-8')
+				try:
+					viewCount = int(linePart[linePart.rfind(b' ')+1:])
+				except ValueError:
+					print(f'Unable to read count in line {lineNum}: {line}')
+					continue
 				if namespaceRegex.match(title) is not None:
 					continue
 				# Update map