From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Sun, 11 Sep 2022 14:55:42 +1000
Subject: Add backend unit tests

- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
---
 backend/tol_data/enwiki/README.md                  |  63 +++++++
 backend/tol_data/enwiki/__init__.py                |   0
 .../tol_data/enwiki/download_img_license_info.py   | 154 ++++++++++++++++
 backend/tol_data/enwiki/download_imgs.py           |  99 +++++++++++
 backend/tol_data/enwiki/gen_desc_data.py           | 126 ++++++++++++++
 backend/tol_data/enwiki/gen_dump_index_db.py       |  60 +++++++
 backend/tol_data/enwiki/gen_img_data.py            | 193 +++++++++++++++++++++
 backend/tol_data/enwiki/gen_pageview_data.py       |  68 ++++++++
 backend/tol_data/enwiki/lookup_page.py             |  71 ++++++++
 9 files changed, 834 insertions(+)
 create mode 100644 backend/tol_data/enwiki/README.md
 create mode 100644 backend/tol_data/enwiki/__init__.py
 create mode 100755 backend/tol_data/enwiki/download_img_license_info.py
 create mode 100755 backend/tol_data/enwiki/download_imgs.py
 create mode 100755 backend/tol_data/enwiki/gen_desc_data.py
 create mode 100755 backend/tol_data/enwiki/gen_dump_index_db.py
 create mode 100755 backend/tol_data/enwiki/gen_img_data.py
 create mode 100755 backend/tol_data/enwiki/gen_pageview_data.py
 create mode 100755 backend/tol_data/enwiki/lookup_page.py

(limited to 'backend/tol_data/enwiki')
diff --git a/backend/tol_data/enwiki/README.md b/backend/tol_data/enwiki/README.md
new file mode 100644
index 0000000..ba1de33
--- /dev/null
+++ b/backend/tol_data/enwiki/README.md
@@ -0,0 +1,63 @@
+This directory holds files obtained/derived from [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page).
+
+# Downloaded Files
+-   `enwiki-20220501-pages-articles-multistream.xml.bz2` <br>
+    Contains text content and metadata for pages in enwiki.
+    Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror).
+    Some file content and format information was available from
+        <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>.
+-   `enwiki-20220501-pages-articles-multistream-index.txt.bz2` <br>
+    Obtained like above. Holds lines of the form offset1:pageId1:title1,
+    providing, for each page, an offset into the dump file of a chunk of
+    100 pages that includes it.
+
+# Dump-Index Files
+-   `gen_dump_index_db.py` <br>
+    Creates a database version of the enwiki-dump index file.
+-   `dumpIndex.db` <br>
+    Generated by `gen_dump_index_db.py`. <br>
+    Tables: <br>
+    -   `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT`
+
+# Description Database Files
+-   `gen_desc_data.py` <br>
+    Reads through pages in the dump file, and adds short-description info to a database.
+-   `desc_data.db` <br>
+    Generated by `gen_desc_data.py`. <br>
+    Tables: <br>
+    -   `pages`:     `id INT PRIMARY KEY, title TEXT UNIQUE`
+    -   `redirects`: `id INT PRIMARY KEY, target TEXT`
+    -   `descs`:     `id INT PRIMARY KEY, desc TEXT`
+
+# Image Database Files
+-   `gen_img_data.py` <br>
+    Used to find infobox image names for page IDs, storing them into a database.
+-   `download_img_license_info.py` <br>
+    Used to download licensing metadata for image names, via wikipedia's online API, storing them into a database.
+-   `img_data.db` <br>
+    Used to hold metadata about infobox images for a set of pageIDs.
+    Generated using `gen_img_data.py` and `download_img_license_info.py`. <br>
+    Tables: <br>
+    -   `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT` <br>
+        `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
+    -   `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br>
+        Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
+-   `download_imgs.py` <br>
+    Used to download image files into imgs/.
+
+# Page View Files
+-   `pageviews/pageviews-*-user.bz2` <br>
+    Each holds wikimedia article page view data for some month.
+    Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>.
+    Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>.
+-   `gen_pageview_data.py` <br>
+    Reads pageviews/*, and creates a database holding average monthly pageview counts.
+-   `pageview_data.db` <br>
+    Generated using `gen_pageview_data.py`. <br>
+    Tables: <br>
+    -   `views`: `title TEXT PRIMARY KEY, id INT, views INT`
+
+# Other Files
+-   `lookup_page.py` <br>
+    Running `lookup_page.py title1` looks in the dump for a page with a given title,
+    and prints the contents to stdout. Uses dumpIndex.db.
diff --git a/backend/tol_data/enwiki/__init__.py b/backend/tol_data/enwiki/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py
new file mode 100755
index 0000000..0a809ac
--- /dev/null
+++ b/backend/tol_data/enwiki/download_img_license_info.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+
+"""
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
+
+import re
+import sqlite3, urllib.parse, html
+import requests
+import time, signal
+
+IMG_DB = 'img_data.db'
+#
+API_URL = 'https://en.wikipedia.org/w/api.php'
+USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
+BATCH_SZ = 50 # Max 50
+TAG_REGEX = re.compile(r'<[^<]+>')
+WHITESPACE_REGEX = re.compile(r'\s+')
+
+def downloadInfo(imgDb: str) -> None:
+	print('Opening database')
+	dbCon = sqlite3.connect(imgDb)
+	dbCur = dbCon.cursor()
+	print('Checking for table')
+	if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
+		dbCur.execute('CREATE TABLE imgs (' \
+			'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
+	#
+	print('Reading image names')
+	imgNames: set[str] = set()
+	for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
+		imgNames.add(imgName)
+	print(f'Found {len(imgNames)}')
+	#
+	print('Checking for already-processed images')
+	oldSz = len(imgNames)
+	for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
+		imgNames.discard(imgName)
+	print(f'Found {oldSz - len(imgNames)}')
+	#
+	# Set SIGINT handler
+	interrupted = False
+	oldHandler = None
+	def onSigint(sig, frame):
+		nonlocal interrupted
+		interrupted = True
+		signal.signal(signal.SIGINT, oldHandler)
+	oldHandler = signal.signal(signal.SIGINT, onSigint)
+	#
+	print('Iterating through image names')
+	imgNameList = list(imgNames)
+	iterNum = 0
+	for i in range(0, len(imgNameList), BATCH_SZ):
+		iterNum += 1
+		if iterNum % 1 == 0:
+			print(f'At iteration {iterNum} (after {(iterNum - 1) * BATCH_SZ} images)')
+		if interrupted:
+			print(f'Exiting loop at iteration {iterNum}')
+			break
+		# Get batch
+		imgBatch = imgNameList[i:i+BATCH_SZ]
+		imgBatch = ['File:' + x for x in imgBatch]
+		# Make request
+		headers = {
+			'user-agent': USER_AGENT,
+			'accept-encoding': 'gzip',
+		}
+		params = {
+			'action': 'query',
+			'format': 'json',
+			'prop': 'imageinfo',
+			'iiprop': 'extmetadata|url',
+			'maxlag': '5',
+			'titles': '|'.join(imgBatch),
+			'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions',
+		}
+		responseObj = None
+		try:
+			response = requests.get(API_URL, params=params, headers=headers)
+			responseObj = response.json()
+		except Exception as e:
+			print(f'ERROR: Exception while downloading info: {e}')
+			print('\tImage batch: ' + '|'.join(imgBatch))
+			continue
+		# Parse response-object
+		if 'query' not in responseObj or 'pages' not in responseObj['query']:
+			print('WARNING: Response object for doesn\'t have page data')
+			print('\tImage batch: ' + '|'.join(imgBatch))
+			if 'error' in responseObj:
+				errorCode = responseObj['error']['code']
+				print(f'\tError code: {errorCode}')
+				if errorCode == 'maxlag':
+					time.sleep(5)
+			continue
+		pages = responseObj['query']['pages']
+		normalisedToInput: dict[str, str] = {}
+		if 'normalized' in responseObj['query']:
+			for entry in responseObj['query']['normalized']:
+				normalisedToInput[entry['to']] = entry['from']
+		for page in pages.values():
+			# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
+				# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
+				# Artist: author name (might contain complex html, multiple authors, etc)
+				# Credit: 'source'
+					# For image-map-like images, can be quite large/complex html, crediting each sub-image
+					# May be <a href='text1'>text2</a>, where the text2 might be non-indicative
+				# Restrictions: specifies non-copyright legal restrictions
+			title: str = page['title']
+			if title in normalisedToInput:
+				title = normalisedToInput[title]
+			title = title[5:] # Remove 'File:'
+			if title not in imgNames:
+				print(f'WARNING: Got title "{title}" not in image-name list')
+				continue
+			if 'imageinfo' not in page:
+				print(f'WARNING: No imageinfo section for page "{title}"')
+				continue
+			metadata = page['imageinfo'][0]['extmetadata']
+			url: str = page['imageinfo'][0]['url']
+			license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
+			artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
+			credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
+			restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+			# Remove markup
+			if artist is not None:
+				artist = TAG_REGEX.sub(' ', artist).strip()
+				artist = WHITESPACE_REGEX.sub(' ', artist)
+				artist = html.unescape(artist)
+				artist = urllib.parse.unquote(artist)
+			if credit is not None:
+				credit = TAG_REGEX.sub(' ', credit).strip()
+				credit = WHITESPACE_REGEX.sub(' ', credit)
+				credit = html.unescape(credit)
+				credit = urllib.parse.unquote(credit)
+			# Add to db
+			print((title, license, artist, credit, restrictions, url))
+			dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
+				(title, license, artist, credit, restrictions, url))
+	#
+	print('Closing database')
+	dbCon.commit()
+	dbCon.close()
+
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	downloadInfo(IMG_DB)
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
new file mode 100755
index 0000000..ba874e1
--- /dev/null
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python3
+
+"""
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
+
+# In testing, this downloaded about 100k images, over several days
+
+import re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+IMG_DB = 'img_data.db' # About 130k image names
+OUT_DIR = 'imgs'
+#
+LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
+USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
+TIMEOUT = 1
+	# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
+	# It's unclear how to properly check for cache misses, so we just aim for 1 per sec
+
+def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
+	if not os.path.exists(outDir):
+		os.mkdir(outDir)
+	print('Checking for already-downloaded images')
+	fileList = os.listdir(outDir)
+	pageIdsDone: set[int] = set()
+	for filename in fileList:
+		pageIdsDone.add(int(os.path.splitext(filename)[0]))
+	print(f'Found {len(pageIdsDone)}')
+	#
+	# Set SIGINT handler
+	interrupted = False
+	oldHandler = None
+	def onSigint(sig, frame):
+		nonlocal interrupted
+		interrupted = True
+		signal.signal(signal.SIGINT, oldHandler)
+	oldHandler = signal.signal(signal.SIGINT, onSigint)
+	#
+	print('Opening database')
+	dbCon = sqlite3.connect(imgDb)
+	dbCur = dbCon.cursor()
+	print('Starting downloads')
+	iterNum = 0
+	query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \
+		' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name'
+	for pageId, license, artist, credit, restrictions, url in dbCur.execute(query):
+		if pageId in pageIdsDone:
+			continue
+		if interrupted:
+			print('Exiting loop')
+			break
+		# Check for problematic attributes
+		if license is None or LICENSE_REGEX.fullmatch(license) is None:
+			continue
+		if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None:
+			continue
+		if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None:
+			continue
+		if restrictions is not None and restrictions != '':
+			continue
+		# Download image
+		iterNum += 1
+		print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
+		urlParts = urllib.parse.urlparse(url)
+		extension = os.path.splitext(urlParts.path)[1]
+		if len(extension) <= 1:
+			print(f'WARNING: No filename extension found in URL {url}')
+			continue
+		outFile = os.path.join(outDir, f'{pageId}{extension}')
+		print(outFile)
+		headers = {
+			'user-agent': USER_AGENT,
+			'accept-encoding': 'gzip',
+		}
+		try:
+			response = requests.get(url, headers=headers)
+			with open(outFile, 'wb') as file:
+				file.write(response.content)
+			time.sleep(timeout)
+		except Exception as e:
+			print(f'Error while downloading to {outFile}: {e}')
+			return
+	print('Closing database')
+	dbCon.close()
+
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py
new file mode 100755
index 0000000..0dca16b
--- /dev/null
+++ b/backend/tol_data/enwiki/gen_desc_data.py
@@ -0,0 +1,126 @@
+#!/usr/bin/python3
+
+"""
+Reads through the wiki dump, and attempts to parse short-descriptions,
+and add them to a database
+"""
+
+# In testing, this script took over 10 hours to run, and generated about 5GB
+
+import sys, os, re
+import bz2
+import html, mwxml, mwparserfromhell
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
+DB_FILE = 'desc_data.db'
+
+DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
+EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
+	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
+CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+def convertTemplateReplace(match):
+	""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
+	if match.group(2) is None:
+		return f'{match.group(1)} {match.group(4)}'
+	else:
+		return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+def genData(dumpFile: str, dbFile: str) -> None:
+	print('Creating database')
+	if os.path.exists(dbFile):
+		raise Exception(f'ERROR: Existing {dbFile}')
+	dbCon = sqlite3.connect(dbFile)
+	dbCur = dbCon.cursor()
+	dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)')
+	dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)')
+	dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
+	dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
+	dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
+	#
+	print('Iterating through dump file')
+	with bz2.open(dumpFile, mode='rt') as file:
+		for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
+			if pageNum % 1e4 == 0:
+				print(f'At page {pageNum}')
+			# Parse page
+			if page.namespace == 0:
+				try:
+					dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
+				except sqlite3.IntegrityError as e:
+					# Accounts for certain pages that have the same title
+					print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr)
+					continue
+				if page.redirect is not None:
+					dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect)))
+				else:
+					revision = next(page)
+					desc = parseDesc(revision.text)
+					if desc is not None:
+						dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
+	#
+	print('Closing database')
+	dbCon.commit()
+	dbCon.close()
+def parseDesc(text: str) -> str | None:
+	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+		# and then accumulate lines until a blank one.
+	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
+		# nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell
+	lines: list[str] = []
+	openBraceCount = 0
+	openBracketCount = 0
+	inComment = False
+	skip = False
+	for line in text.splitlines():
+		line = line.strip()
+		if not lines:
+			if line:
+				if openBraceCount > 0 or line[0] == '{':
+					openBraceCount += line.count('{')
+					openBraceCount -= line.count('}')
+					skip = True
+				if openBracketCount > 0 or line[0] == '[':
+					openBracketCount += line.count('[')
+					openBracketCount -= line.count(']')
+					skip = True
+				if inComment or line.find('<!--') != -1:
+					if line.find('-->') != -1:
+						if inComment:
+							inComment = False
+							skip = True
+					else:
+						inComment = True
+						skip = True
+				if skip:
+					skip = False
+					continue
+				if line[-1] == ':': # Seems to help avoid disambiguation pages
+					return None
+				if DESC_LINE_REGEX.match(line) is not None:
+					lines.append(line)
+		else:
+			if not line:
+				return removeMarkup(' '.join(lines))
+			lines.append(line)
+	if lines:
+		return removeMarkup(' '.join(lines))
+	return None
+def removeMarkup(content: str) -> str:
+	content = EMBEDDED_HTML_REGEX.sub('', content)
+	content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
+	content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
+	content = PARENS_GROUP_REGEX.sub('', content)
+	content = LEFTOVER_BRACE_REGEX.sub('', content)
+	return content
+def convertTitle(title: str) -> str:
+	return html.unescape(title).replace('_', ' ')
+
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	genData(DUMP_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py
new file mode 100755
index 0000000..5f21c9b
--- /dev/null
+++ b/backend/tol_data/enwiki/gen_dump_index_db.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python3
+
+"""
+Adds data from the wiki dump index-file into a database
+"""
+import sys, os, re
+import bz2
+import sqlite3
+
+INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
+DB_FILE = 'dumpIndex.db'
+
+def genData(indexFile: str, dbFile: str) -> None:
+	""" Reads the bz2-compressed index file, and creates dbFile with an 'offsets' table (raises if dbFile exists) """
+	if os.path.exists(dbFile):
+		raise Exception(f'ERROR: Existing {dbFile}')
+	print('Creating database')
+	dbCon = sqlite3.connect(dbFile)
+	dbCur = dbCon.cursor()
+	dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+	print('Iterating through index file')
+	lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') # offset:pageId:title (only the title may contain ':')
+	lastOffset = 0 # Offset of the chunk whose entries are being accumulated
+	lineNum = 0
+	entriesToAdd: list[tuple[str, str]] = [] # (title, pageId) pairs awaiting their chunk's end offset
+	with bz2.open(indexFile, mode='rt') as file:
+		for line in file:
+			lineNum += 1
+			if lineNum % 1e5 == 0:
+				print(f'At line {lineNum}')
+			#
+			match = lineRegex.fullmatch(line.rstrip())
+			assert match is not None
+			offsetStr, pageId, title = match.group(1,2,3)
+			offset = int(offsetStr)
+			if offset > lastOffset: # New chunk started, so the pending entries' next_offset is now known
+				for t, p in entriesToAdd:
+					try:
+						dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset))
+					except sqlite3.IntegrityError as e:
+						# Accounts for certain entries in the file that have the same title
+						print(f'Failed on title "{t}": {e}', file=sys.stderr)
+				entriesToAdd = []
+				lastOffset = offset
+			entriesToAdd.append((title, pageId))
+	for title, pageId in entriesToAdd: # Entries of the last chunk, which has no next offset (stored as -1)
+		try:
+			dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
+		except sqlite3.IntegrityError as e:
+			print(f'Failed on title "{title}": {e}', file=sys.stderr) # Report the entry that actually failed
+	print('Closing database')
+	dbCon.commit()
+	dbCon.close()
+
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	genData(INDEX_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
new file mode 100755
index 0000000..d4696f0
--- /dev/null
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -0,0 +1,193 @@
+#!/usr/bin/python3
+
+"""
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
+
+import re
+import os, bz2, html, urllib.parse
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+INDEX_DB = 'dumpIndex.db'
+IMG_DB = 'img_data.db' # The database to create
+DB_FILE = os.path.join('..', 'data.db')
+#
+ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
+IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
+BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
+IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
+CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+
+def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
+	print('Opening databases')
+	indexDbCon = sqlite3.connect(indexDb)
+	indexDbCur = indexDbCon.cursor()
+	imgDbCon = sqlite3.connect(imgDb)
+	imgDbCur = imgDbCon.cursor()
+	print('Checking tables')
+	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
+		# Create tables if not present
+		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
+	else:
+		# Check for already-processed page IDs
+		numSkipped = 0
+		for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
+			if pid in pageIds:
+				pageIds.remove(pid)
+				numSkipped += 1
+			else:
+				print(f'Found already-processed page ID {pid} which was not in input set')
+		print(f'Will skip {numSkipped} already-processed page IDs')
+	#
+	print('Getting dump-file offsets')
+	offsetToPageids: dict[int, list[int]] = {}
+	offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
+	iterNum = 0
+	for pageId in pageIds:
+		iterNum += 1
+		if iterNum % 1e4 == 0:
+			print(f'At iteration {iterNum}')
+		#
+		query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
+		row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
+		if row is None:
+			print(f'WARNING: Page ID {pageId} not found')
+			continue
+		chunkOffset, endOffset = row
+		offsetToEnd[chunkOffset] = endOffset
+		if chunkOffset not in offsetToPageids:
+			offsetToPageids[chunkOffset] = []
+		offsetToPageids[chunkOffset].append(pageId)
+	print(f'Found {len(offsetToEnd)} chunks to check')
+	#
+	print('Iterating through chunks in dump file')
+	with open(dumpFile, mode='rb') as file:
+		iterNum = 0
+		for pageOffset, endOffset in offsetToEnd.items():
+			iterNum += 1
+			if iterNum % 100 == 0:
+				print(f'At iteration {iterNum}')
+			#
+			chunkPageIds = offsetToPageids[pageOffset]
+			# Jump to chunk
+			file.seek(pageOffset)
+			compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+			data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+			# Look in chunk for pages
+			lines = data.splitlines()
+			lineIdx = 0
+			while lineIdx < len(lines):
+				# Look for <page>
+				if lines[lineIdx].lstrip() != '<page>':
+					lineIdx += 1
+					continue
+				# Check page id
+				lineIdx += 3
+				idLine = lines[lineIdx].lstrip()
+				match = ID_LINE_REGEX.fullmatch(idLine)
+				if match is None or int(match.group(1)) not in chunkPageIds:
+					lineIdx += 1
+					continue
+				pageId = int(match.group(1))
+				lineIdx += 1
+				# Look for <text> in <page>
+				foundText = False
+				while lineIdx < len(lines):
+					if not lines[lineIdx].lstrip().startswith('<text '):
+						lineIdx += 1
+						continue
+					foundText = True
+					# Get text content
+					content: list[str] = []
+					line = lines[lineIdx]
+					content.append(line[line.find('>') + 1:])
+					lineIdx += 1
+					foundTextEnd = False
+					while lineIdx < len(lines):
+						line = lines[lineIdx]
+						if not line.endswith('</text>'):
+							content.append(line)
+							lineIdx += 1
+							continue
+						foundTextEnd = True
+						content.append(line[:line.rfind('</text>')])
+						# Look for image-filename
+						imageName = getImageName(content)
+						imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName))
+						break
+					if not foundTextEnd:
+						print(f'WARNING: Did not find </text> for page id {pageId}')
+					break
+				if not foundText:
+					print(f'WARNING: Did not find <text> for page id {pageId}')
+	#
+	print('Closing databases')
+	indexDbCon.close()
+	imgDbCon.commit()
+	imgDbCon.close()
+def getImageName(content: list[str]) -> str | None:
+	""" Given an array of text-content lines, tries to return an infobox image name, or None """
+	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+	for line in content:
+		match = IMG_LINE_REGEX.match(line)
+		if match is not None:
+			imageName = match.group(1).strip()
+			if imageName == '':
+				return None
+			imageName = html.unescape(imageName)
+			# Account for {{...
+			if imageName.startswith('{'):
+				match = CSS_IMG_CROP_REGEX.match(imageName)
+				if match is None:
+					return None
+				imageName = match.group(1)
+			# Account for [[File:...|...]]
+			if imageName.startswith('['):
+				match = BRACKET_IMG_REGEX.match(imageName)
+				if match is None:
+					return None
+				imageName = match.group(1)
+			# Account for <!--
+			if imageName.find('<!--') != -1:
+				return None
+			# Remove an initial 'File:'
+			if imageName.startswith('File:'):
+				imageName = imageName[5:]
+			# Remove an initial 'Image:'
+			if imageName.startswith('Image:'):
+				imageName = imageName[6:]
+			# Check for extension
+			match = IMG_NAME_REGEX.match(imageName)
+			if match is not None:
+				imageName = match.group(0)
+				imageName = urllib.parse.unquote(imageName)
+				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
+				imageName = imageName.replace('_', ' ')
+				return imageName
+			# Exclude lines like: | image = &lt;imagemap&gt;
+			return None
+	return None
+
+def getInputPageIdsFromDb(dbFile: str) -> set[int]:
+	print('Getting input page-ids')
+	pageIds: set[int] = set()
+	dbCon = sqlite3.connect(dbFile)
+	dbCur = dbCon.cursor()
+	for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
+		pageIds.add(pageId)
+	dbCon.close()
+	print(f'Found {len(pageIds)}')
+	return pageIds
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	pageIds = getInputPageIdsFromDb(DB_FILE)
+	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
new file mode 100755
index 0000000..ce3b674
--- /dev/null
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+"""
+Reads through wikimedia files containing pageview counts,
+computes average counts, and adds them to a database
+"""
+
+# Took about 15min per file (each had about 180e6 lines)
+
+import sys, os, glob, math, re
+from collections import defaultdict
+import bz2, sqlite3
+
+PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
+DUMP_INDEX_DB = 'dumpIndex.db'
+DB_FILE = 'pageview_data.db'
+
+def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
+	# Each pageview file has lines that seem to hold these space-separated fields:
+		# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+		# platform (eg: mobile-web), monthly view count,
+		# daily count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
+	if os.path.exists(dbFile):
+		print('ERROR: Database already exists')
+		sys.exit(1)
+	#
+	namespaceRegex = re.compile(r'[a-zA-Z]+:')
+	titleToViews: dict[str, int] = defaultdict(int)
+	linePrefix = b'en.wikipedia '
+	for filename in pageviewFiles:
+		print(f'Reading from {filename}')
+		with bz2.open(filename, 'rb') as file:
+			for lineNum, line in enumerate(file, 1):
+				if lineNum % 1e6 == 0:
+					print(f'At line {lineNum}')
+				if not line.startswith(linePrefix):
+					continue
+				# Get second and second-last fields
+				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
+				title = line[:line.find(b' ')].decode('utf-8')
+				viewCount = int(line[line.rfind(b' ')+1:])
+				if namespaceRegex.match(title) is not None:
+					continue
+				# Update map
+				titleToViews[title] += viewCount
+	print(f'Found {len(titleToViews)} titles')
+	#
+	print('Writing to db')
+	dbCon = sqlite3.connect(dbFile)
+	dbCur = dbCon.cursor()
+	idbCon = sqlite3.connect(dumpIndexDb)
+	idbCur = idbCon.cursor()
+	dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
+	for title, views in titleToViews.items():
+		row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+		if row is not None:
+			wikiId = int(row[0])
+			dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles))))
+	dbCon.commit()
+	dbCon.close()
+	idbCon.close()
+
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	args = parser.parse_args()
+	#
+	genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
new file mode 100755
index 0000000..8ef1229
--- /dev/null
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+
+"""
+Looks up a page with title title1 in the wiki dump, using the dump-index
+db, and prints the corresponding <page>.
+"""
+
+import sys
+import bz2
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+INDEX_DB = 'dumpIndex.db'
+
+def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
+	""" Prints the dump's <page> element for pageTitle, using indexDb to find the chunk (exits if not indexed) """
+	print('Looking up offset in index db')
+	dbCon = sqlite3.connect(indexDb)
+	dbCur = dbCon.cursor()
+	query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
+	row = dbCur.execute(query, (pageTitle,)).fetchone()
+	dbCon.close() # Done with the index db, whether or not the title was found
+	if row is None:
+		print('Title not found')
+		sys.exit(0)
+	_, pageOffset, endOffset = row
+	print(f'Found chunk at offset {pageOffset}')
+	#
+	print('Reading from wiki dump')
+	content: list[str] = []
+	with open(dumpFile, mode='rb') as file:
+		# Get uncompressed chunk
+		file.seek(pageOffset)
+		compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+		data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+		# Look in chunk for page
+		lines = data.splitlines()
+		lineIdx, pageNum, found = 0, 0, False
+		while not found and lineIdx < len(lines): # Bounded, so a title missing from the chunk can't over-index
+			line = lines[lineIdx]
+			if line.lstrip() == '<page>':
+				pageNum += 1
+				if pageNum > 100:
+					print('ERROR: Did not find title after 100 pages')
+					break
+				lineIdx += 1
+				titleLine = lines[lineIdx]
+				if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
+					found = True
+					print(f'Found title in chunk as page {pageNum}')
+					content.append(line)
+					content.append(titleLine)
+					while True:
+						lineIdx += 1
+						line = lines[lineIdx]
+						content.append(line)
+						if line.lstrip() == '</page>':
+							break
+			lineIdx += 1
+	if not found and pageNum <= 100: # Ran off the end of the chunk (the over-100-pages case is reported above)
+		print('ERROR: Did not find title in chunk')
+	print('Content: ')
+	print('\n'.join(content))
+
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.add_argument('title', help='The title to look up')
+	args = parser.parse_args()
+	#
+	lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' '))
-- 
cgit v1.2.3