aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-07-11 01:54:08 +1000
committerTerry Truong <terry06890@gmail.com>2022-07-11 01:54:08 +1000
commit5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/tolData/enwiki
parenta8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/tolData/enwiki')
-rw-r--r--backend/tolData/enwiki/README.md52
-rwxr-xr-xbackend/tolData/enwiki/downloadImgLicenseInfo.py150
-rwxr-xr-xbackend/tolData/enwiki/downloadImgs.py91
-rwxr-xr-xbackend/tolData/enwiki/genDescData.py127
-rwxr-xr-xbackend/tolData/enwiki/genDumpIndexDb.py58
-rwxr-xr-xbackend/tolData/enwiki/genImgData.py190
-rwxr-xr-xbackend/tolData/enwiki/lookupPage.py68
7 files changed, 736 insertions, 0 deletions
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md
new file mode 100644
index 0000000..90d16c7
--- /dev/null
+++ b/backend/tolData/enwiki/README.md
@@ -0,0 +1,52 @@
+This directory holds files obtained from/using [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page).
+
+# Downloaded Files
+- enwiki-20220501-pages-articles-multistream.xml.bz2 <br>
+ Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror).
+ Contains text content and metadata for pages in enwiki.
+ Some file content and format information was available from
+ <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>.
+- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br>
+ Obtained like above. Holds lines of the form offset1:pageId1:title1,
+ providing, for each page, an offset into the dump file of a chunk of
+ 100 pages that includes it.
+
+# Generated Dump-Index Files
+- genDumpIndexDb.py <br>
+ Creates an sqlite-database version of the enwiki-dump index file.
+- dumpIndex.db <br>
+ Generated by genDumpIndexDb.py. <br>
+ Tables: <br>
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT`
+
+# Description Database Files
+- genDescData.py <br>
+ Reads through pages in the dump file, and adds short-description info to a database.
+- descData.db <br>
+ Generated by genDescData.py. <br>
+ Tables: <br>
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
+ - `redirects`: `id INT PRIMARY KEY, target TEXT`
+ - `descs`: `id INT PRIMARY KEY, desc TEXT`
+
+# Image Database Files
+- genImgData.py <br>
+ Used to find infobox image names for page IDs, storing them into a database.
+- downloadImgLicenseInfo.py <br>
+ Used to download licensing metadata for image names, via wikipedia's online API, storing them into a database.
+- imgData.db <br>
+ Used to hold metadata about infobox images for a set of pageIDs.
+  Generated using genImgData.py and downloadImgLicenseInfo.py. <br>
+ Tables: <br>
+  - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT` <br>
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
+ - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br>
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
+- downloadImgs.py <br>
+ Used to download image files into imgs/.
+
+# Other Files
+- lookupPage.py <br>
+ Running `lookupPage.py title1` looks in the dump for a page with a given title,
+ and prints the contents to stdout. Uses dumpIndex.db.
+
diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
new file mode 100755
index 0000000..399922e
--- /dev/null
+++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3, urllib.parse, html
+import requests
+import time, signal
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+imgDb = "imgData.db"
+apiUrl = "https://en.wikipedia.org/w/api.php"
+userAgent = "terryt.dev (terry06890@gmail.com)"
+batchSz = 50 # Max 50
+tagRegex = re.compile(r"<[^<]+>")
+whitespaceRegex = re.compile(r"\s+")
+
+print("Opening database")
+dbCon = sqlite3.connect(imgDb)
+dbCur = dbCon.cursor()
+dbCur2 = dbCon.cursor()
+print("Checking for table")
+if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
+ dbCur.execute("CREATE TABLE imgs(" \
+ "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
+
+print("Reading image names")
+imgNames = set()
+for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
+ imgNames.add(imgName)
+print(f"Found {len(imgNames)}")
+
+print("Checking for already-processed images")
+oldSz = len(imgNames)
+for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
+ imgNames.discard(imgName)
+print(f"Found {oldSz - len(imgNames)}")
+
+# Set SIGINT handler
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+	# Flag the main loop to stop after its current batch, then restore the
+	# previous handler so a second SIGINT interrupts the process immediately.
+	global interrupted
+	interrupted = True
+	signal.signal(signal.SIGINT, oldHandler)
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+
+print("Iterating through image names")
+imgNames = list(imgNames)
+iterNum = 0
+for i in range(0, len(imgNames), batchSz):
+ iterNum += 1
+ if iterNum % 1 == 0:
+ print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
+ if interrupted:
+ print(f"Exiting loop at iteration {iterNum}")
+ break
+ # Get batch
+ imgBatch = imgNames[i:i+batchSz]
+ imgBatch = ["File:" + x for x in imgBatch]
+ # Make request
+ headers = {
+ "user-agent": userAgent,
+ "accept-encoding": "gzip",
+ }
+ params = {
+ "action": "query",
+ "format": "json",
+ "prop": "imageinfo",
+ "iiprop": "extmetadata|url",
+ "maxlag": "5",
+ "titles": "|".join(imgBatch),
+ "iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
+ }
+ responseObj = None
+ try:
+ response = requests.get(apiUrl, params=params, headers=headers)
+ responseObj = response.json()
+ except Exception as e:
+ print(f"ERROR: Exception while downloading info: {e}")
+ print(f"\tImage batch: " + "|".join(imgBatch))
+ continue
+ # Parse response-object
+ if "query" not in responseObj or "pages" not in responseObj["query"]:
+		print("WARNING: Response object doesn't have page data")
+ print("\tImage batch: " + "|".join(imgBatch))
+ if "error" in responseObj:
+ errorCode = responseObj["error"]["code"]
+ print(f"\tError code: {errorCode}")
+ if errorCode == "maxlag":
+ time.sleep(5)
+ continue
+ pages = responseObj["query"]["pages"]
+ normalisedToInput = {}
+ if "normalized" in responseObj["query"]:
+ for entry in responseObj["query"]["normalized"]:
+ normalisedToInput[entry["to"]] = entry["from"]
+ for (_, page) in pages.items():
+ # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
+ # LicenseShortName: short human-readable license name, apparently more reliable than 'License',
+ # Artist: author name (might contain complex html, multiple authors, etc)
+ # Credit: 'source'
+		# For image-map-like images, can be quite large/complex html, crediting each sub-image
+ # May be <a href="text1">text2</a>, where the text2 might be non-indicative
+ # Restrictions: specifies non-copyright legal restrictions
+ title = page["title"]
+ if title in normalisedToInput:
+ title = normalisedToInput[title]
+ title = title[5:] # Remove 'File:'
+ if title not in imgNames:
+ print(f"WARNING: Got title \"{title}\" not in image-name list")
+ continue
+ if "imageinfo" not in page:
+ print(f"WARNING: No imageinfo section for page \"{title}\"")
+ continue
+ metadata = page["imageinfo"][0]["extmetadata"]
+ url = page["imageinfo"][0]["url"]
+ license = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
+ artist = metadata['Artist']['value'] if 'Artist' in metadata else None
+ credit = metadata['Credit']['value'] if 'Credit' in metadata else None
+ restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+ # Remove markup
+ if artist != None:
+ artist = tagRegex.sub(" ", artist)
+ artist = whitespaceRegex.sub(" ", artist)
+ artist = html.unescape(artist)
+ artist = urllib.parse.unquote(artist)
+ if credit != None:
+ credit = tagRegex.sub(" ", credit)
+ credit = whitespaceRegex.sub(" ", credit)
+ credit = html.unescape(credit)
+ credit = urllib.parse.unquote(credit)
+ # Add to db
+ dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+ (title, license, artist, credit, restrictions, url))
+
+print("Closing database")
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py
new file mode 100755
index 0000000..8fb605f
--- /dev/null
+++ b/backend/tolData/enwiki/downloadImgs.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python3
+
+import sys, re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+imgDb = "imgData.db" # About 130k image names
+outDir = "imgs"
+licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+# In testing, this downloaded about 100k images, over several days
+
+if not os.path.exists(outDir):
+ os.mkdir(outDir)
+print("Checking for already-downloaded images")
+fileList = os.listdir(outDir)
+pageIdsDone = set()
+for filename in fileList:
+ (basename, extension) = os.path.splitext(filename)
+ pageIdsDone.add(int(basename))
+print(f"Found {len(pageIdsDone)}")
+
+# Set SIGINT handler
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+	# Flag the main loop to stop after its current download, then restore the
+	# previous handler so a second SIGINT interrupts the process immediately.
+	global interrupted
+	interrupted = True
+	signal.signal(signal.SIGINT, oldHandler)
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+
+print("Opening database")
+dbCon = sqlite3.connect(imgDb)
+dbCur = dbCon.cursor()
+print("Starting downloads")
+iterNum = 0
+query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
+ " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
+for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
+ if pageId in pageIdsDone:
+ continue
+ if interrupted:
+ print(f"Exiting loop")
+ break
+ # Check for problematic attributes
+ if license == None or licenseRegex.fullmatch(license) == None:
+ continue
+ if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
+ continue
+ if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
+ continue
+ if restrictions != None and restrictions != "":
+ continue
+ # Download image
+ iterNum += 1
+ print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
+ urlParts = urllib.parse.urlparse(url)
+ extension = os.path.splitext(urlParts.path)[1]
+ if len(extension) <= 1:
+ print(f"WARNING: No filename extension found in URL {url}")
+ sys.exit(1)
+ outFile = f"{outDir}/{pageId}{extension}"
+ headers = {
+ "user-agent": "terryt.dev (terry06890@gmail.com)",
+ "accept-encoding": "gzip",
+ }
+ try:
+ response = requests.get(url, headers=headers)
+ with open(outFile, 'wb') as file:
+ file.write(response.content)
+ time.sleep(1)
+ # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
+ # It's unclear how to properly check for cache misses, so this just aims for 1 per sec
+ except Exception as e:
+ print(f"Error while downloading to {outFile}: {e}")
+print("Closing database")
+dbCon.close()
diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py
new file mode 100755
index 0000000..b0ca272
--- /dev/null
+++ b/backend/tolData/enwiki/genDescData.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import bz2
+import html, mwxml, mwparserfromhell
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads through the wiki dump, and attempts to
+parse short-descriptions, and add them to a database.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages
+enwikiDb = "descData.db"
+# In testing, this script took over 10 hours to run, and generated about 5GB
+
+descLineRegex = re.compile("^ *[A-Z'\"]")
+embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
+ # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
+convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
def convertTemplateReplace(match):
	"""Turn a matched {{convert|...}} template into plain text (e.g. '5 km' or '5 to 10 km')."""
	quantity, joiner, quantity2, unit = match.group(1, 2, 3, 4)
	if joiner is None:
		# Single-quantity form: {{convert|5|km|...}}
		return f"{quantity} {unit}"
	# Range form: {{convert|5|to|10|km|...}} or {{convert|5|-|10|km|...}}
	return f"{quantity} {joiner} {quantity2} {unit}"
+parensGroupRegex = re.compile(r" \([^()]*\)")
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
+
+def parseDesc(text):
+	"""Try to extract a short-description/abstract from raw wikitext; returns cleaned text or None."""
+	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+	# and then accumulate lines until a blank one.
+	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
+	# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
+	lines = []
+	openBraceCount = 0
+	openBracketCount = 0
+	inComment = False
+	skip = False
+	for line in text.splitlines():
+		line = line.strip()
+		if len(lines) == 0:
+			if len(line) > 0:
+				# Skip lines inside (possibly multi-line) {{...}} templates, tracking nesting depth
+				if openBraceCount > 0 or line[0] == "{":
+					openBraceCount += line.count("{")
+					openBraceCount -= line.count("}")
+					skip = True
+				# Likewise for [[...]] links/images
+				if openBracketCount > 0 or line[0] == "[":
+					openBracketCount += line.count("[")
+					openBracketCount -= line.count("]")
+					skip = True
+				# Likewise for <!-- ... --> comments (a line may both open and close one)
+				if inComment or line.find("<!--") != -1:
+					if line.find("-->") != -1:
+						if inComment:
+							inComment = False
+							skip = True
+					else:
+						inComment = True
+					skip = True
+				if skip:
+					skip = False
+					continue
+				if line[-1] == ":": # Seems to help avoid disambiguation pages
+					return None
+				# First 'real' abstract line must look like prose (starts with capital/quote)
+				if descLineRegex.match(line) != None:
+					lines.append(line)
+		else:
+			# Already accumulating: a blank line marks the end of the abstract
+			if len(line) == 0:
+				return removeMarkup(" ".join(lines))
+			lines.append(line)
+	if len(lines) > 0:
+		return removeMarkup(" ".join(lines))
+	return None
def removeMarkup(content):
	"""Strip embedded HTML, convert-templates, wikitext markup, parenthesised groups, and leftover braces."""
	cleaned = embeddedHtmlRegex.sub("", content)
	cleaned = convertTemplateRegex.sub(convertTemplateReplace, cleaned)
	# mwparserfromhell removes general wikitext markup ('''bold''', [[links]], etc.)
	cleaned = mwparserfromhell.parse(cleaned).strip_code()
	cleaned = parensGroupRegex.sub("", cleaned)
	return leftoverBraceRegex.sub("", cleaned)
def convertTitle(title):
	"""Return the title with HTML entities unescaped and underscores replaced by spaces."""
	unescaped = html.unescape(title)
	return unescaped.replace("_", " ")
+
+print("Creating database")
+if os.path.exists(enwikiDb):
+ raise Exception(f"ERROR: Existing {enwikiDb}")
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
+dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
+dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
+dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)")
+dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
+
print("Iterating through dump file")
with bz2.open(dumpFile, mode='rt') as file:
	dump = mwxml.Dump.from_file(file)
	pageNum = 0
	for page in dump:
		pageNum += 1
		if pageNum % 1e4 == 0:
			print(f"At page {pageNum}")
		# NOTE(review): removed leftover debug limit (`if pageNum > 3e4: break`) that
		# stopped after 30k pages of a ~22e6-page dump, contradicting the script's purpose.
		# Parse page (namespace 0 == article pages)
		if page.namespace == 0:
			try:
				dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
			except sqlite3.IntegrityError as e:
				# Accounts for certain pages that have the same title
				print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr)
				continue
			if page.redirect != None:
				dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
			else:
				# First revision holds the page's current wikitext
				revision = next(page)
				desc = parseDesc(revision.text)
				if desc != None:
					dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))

print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py
new file mode 100755
index 0000000..3955885
--- /dev/null
+++ b/backend/tolData/enwiki/genDumpIndexDb.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import bz2
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Adds data from the wiki dump index-file into a database.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
+indexDb = "dumpIndex.db"
+
+if os.path.exists(indexDb):
+ raise Exception(f"ERROR: Existing {indexDb}")
+print("Creating database")
+dbCon = sqlite3.connect(indexDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
+
+print("Iterating through index file")
+lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
+lastOffset = 0
+lineNum = 0
+entriesToAdd = []
+with bz2.open(indexFile, mode='rt') as file:
+ for line in file:
+ lineNum += 1
+ if lineNum % 1e5 == 0:
+ print(f"At line {lineNum}")
+ #
+ match = lineRegex.fullmatch(line.rstrip())
+ (offset, pageId, title) = match.group(1,2,3)
+ offset = int(offset)
+ if offset > lastOffset:
+ for (t, p) in entriesToAdd:
+ try:
+ dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
+ except sqlite3.IntegrityError as e:
+ # Accounts for certain entries in the file that have the same title
+ print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+ entriesToAdd = []
+ lastOffset = offset
+ entriesToAdd.append([title, pageId])
# Flush entries from the final chunk; next_offset of -1 marks 'runs to end of file'
for (title, pageId) in entriesToAdd:
	try:
		dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
	except sqlite3.IntegrityError as e:
		# BUG FIX: previously printed undefined name 't' here (loop variable is 'title'),
		# which raised NameError whenever a duplicate title appeared in the final chunk.
		print(f"Failed on title \"{title}\": {e}", file=sys.stderr)
+
+print("Closing database")
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
new file mode 100755
index 0000000..dedfe14
--- /dev/null
+++ b/backend/tolData/enwiki/genImgData.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python3
+
+import sys, re
+import bz2, html, urllib.parse
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
def getInputPageIds():
	"""Return the set of wiki page IDs to process, read from the wiki_ids table of ../data.db."""
	pageIds = set()
	dbCon = sqlite3.connect("../data.db")
	try:
		dbCur = dbCon.cursor()
		for (pageId,) in dbCur.execute("SELECT id from wiki_ids"):
			pageIds.add(pageId)
	finally:
		# Close even if the query raises (original leaked the connection on error)
		dbCon.close()
	return pageIds
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
+indexDb = "dumpIndex.db"
+imgDb = "imgData.db" # The database to create
+idLineRegex = re.compile(r"<id>(.*)</id>")
+imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
+bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
+imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
+cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+# In testing, got about 360k image names
+
+print("Getting input page-ids")
+pageIds = getInputPageIds()
+print(f"Found {len(pageIds)}")
+
+print("Opening databases")
+indexDbCon = sqlite3.connect(indexDb)
+indexDbCur = indexDbCon.cursor()
+imgDbCon = sqlite3.connect(imgDb)
+imgDbCur = imgDbCon.cursor()
+print("Checking tables")
+if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ # Create tables if not present
+ imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
+ imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+else:
+ # Check for already-processed page IDs
+ numSkipped = 0
+ for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
+ if pid in pageIds:
+ pageIds.remove(pid)
+ numSkipped += 1
+ else:
+ print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
+ print(f"Will skip {numSkipped} already-processed page IDs")
+
+print("Getting dump-file offsets")
+offsetToPageids = {}
+offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
+iterNum = 0
+for pageId in pageIds:
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
+ row = indexDbCur.execute(query, (pageId,)).fetchone()
+ if row == None:
+ print(f"WARNING: Page ID {pageId} not found")
+ continue
+ (chunkOffset, endOffset) = row
+ offsetToEnd[chunkOffset] = endOffset
+ if chunkOffset not in offsetToPageids:
+ offsetToPageids[chunkOffset] = []
+ offsetToPageids[chunkOffset].append(pageId)
+print(f"Found {len(offsetToEnd)} chunks to check")
+
+print("Iterating through chunks in dump file")
+def getImageName(content):
+	"""Given a list of text-content lines, tries to return an infobox image name, or None.
+
+	Doesn't try to find images in outside-infobox [[File:...]] and <imagemap> sections.
+	"""
+	for line in content:
+		# Look for an infobox '| image = ...' line
+		match = imageLineRegex.match(line)
+		if match != None:
+			imageName = match.group(1).strip()
+			if imageName == "":
+				return None
+			imageName = html.unescape(imageName)
+			# Account for {{css image crop|image=...}} templates
+			if imageName.startswith("{"):
+				match = cssImgCropRegex.match(imageName)
+				if match == None:
+					return None
+				imageName = match.group(1)
+			# Account for [[File:...|...]]
+			if imageName.startswith("["):
+				match = bracketImageRegex.match(imageName)
+				if match == None:
+					return None
+				imageName = match.group(1)
+			# Account for <!--
+			if imageName.find("<!--") != -1:
+				return None
+			# Remove an initial 'File:'
+			if imageName.startswith("File:"):
+				imageName = imageName[5:]
+			# Remove an initial 'Image:'
+			if imageName.startswith("Image:"):
+				imageName = imageName[6:]
+			# Check for extension (jpg/jpeg/png/gif/tiff/tif); normalise and return if found
+			match = imageNameRegex.match(imageName)
+			if match != None:
+				imageName = match.group(0)
+				imageName = urllib.parse.unquote(imageName)
+				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
+				imageName = imageName.replace("_", " ")
+				return imageName
+			# Exclude lines like: | image = &lt;imagemap&gt;
+			return None
+	return None
+with open(dumpFile, mode='rb') as file:
+ iterNum = 0
+ for (pageOffset, endOffset) in offsetToEnd.items():
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ pageIds = offsetToPageids[pageOffset]
+ # Jump to chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for pages
+ lines = data.splitlines()
+ lineIdx = 0
+ while lineIdx < len(lines):
+ # Look for <page>
+ if lines[lineIdx].lstrip() != "<page>":
+ lineIdx += 1
+ continue
+ # Check page id
+ lineIdx += 3
+ idLine = lines[lineIdx].lstrip()
+ match = idLineRegex.fullmatch(idLine)
+ if match == None or int(match.group(1)) not in pageIds:
+ lineIdx += 1
+ continue
+ pageId = int(match.group(1))
+ lineIdx += 1
+ # Look for <text> in <page>
+ foundText = False
+ while lineIdx < len(lines):
+ if not lines[lineIdx].lstrip().startswith("<text "):
+ lineIdx += 1
+ continue
+ foundText = True
+ # Get text content
+ content = []
+ line = lines[lineIdx]
+ content.append(line[line.find(">") + 1:])
+ lineIdx += 1
+ foundTextEnd = False
+ while lineIdx < len(lines):
+ line = lines[lineIdx]
+ if not line.endswith("</text>"):
+ content.append(line)
+ lineIdx += 1
+ continue
+ foundTextEnd = True
+ content.append(line[:line.rfind("</text>")])
+ # Look for image-filename
+ imageName = getImageName(content)
+ imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
+ break
+ if not foundTextEnd:
+ print(f"WARNING: Did not find </text> for page id {pageId}")
+ break
+ if not foundText:
+ print(f"WARNING: Did not find <text> for page id {pageId}")
+
+print("Closing databases")
+indexDbCon.close()
+imgDbCon.commit()
+imgDbCon.close()
diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
new file mode 100755
index 0000000..1a90851
--- /dev/null
+++ b/backend/tolData/enwiki/lookupPage.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+import sys, re
+import bz2
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]} title1
+
+Looks up a page with title title1 in the wiki dump, using
+the dump-index db, and prints the corresponding <page>.
+"""
+if len(sys.argv) != 2:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
+indexDb = "dumpIndex.db"
+pageTitle = sys.argv[1].replace("_", " ")
+
+print("Looking up offset in index db")
+dbCon = sqlite3.connect(indexDb)
+dbCur = dbCon.cursor()
+query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
+row = dbCur.execute(query, (pageTitle,)).fetchone()
+if row == None:
+ print("Title not found")
+ sys.exit(0)
+_, pageOffset, endOffset = row
+dbCon.close()
+print(f"Found chunk at offset {pageOffset}")
+
+print("Reading from wiki dump")
+content = []
+with open(dumpFile, mode='rb') as file:
+ # Get uncompressed chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for page
+ lines = data.splitlines()
+ lineIdx = 0
+ found = False
+ pageNum = 0
+ while not found:
+ line = lines[lineIdx]
+ if line.lstrip() == "<page>":
+ pageNum += 1
+ if pageNum > 100:
+ print("ERROR: Did not find title after 100 pages")
+ break
+ lineIdx += 1
+ titleLine = lines[lineIdx]
+ if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
+ found = True
+ print(f"Found title in chunk as page {pageNum}")
+ content.append(line)
+ content.append(titleLine)
+ while True:
+ lineIdx += 1
+ line = lines[lineIdx]
+ content.append(line)
+ if line.lstrip() == "</page>":
+ break
+ lineIdx += 1
+
+print("Content: ")
+print("\n".join(content))