From e78c4df403e5f98afa08f7a0841ff233d5f6d05b Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 01:42:41 +1000 Subject: Update backend READMEs, rename some files for consistency --- backend/data/enwiki/README.md | 73 ++++++----- backend/data/enwiki/downloadEnwikiImgs.py | 2 +- backend/data/enwiki/downloadImgLicenseInfo.py | 2 +- backend/data/enwiki/genData.py | 122 ------------------ backend/data/enwiki/genDescData.py | 122 ++++++++++++++++++ backend/data/enwiki/genImgData.py | 178 ++++++++++++++++++++++++++ backend/data/enwiki/getEnwikiImgData.py | 178 -------------------------- 7 files changed, 345 insertions(+), 332 deletions(-) delete mode 100755 backend/data/enwiki/genData.py create mode 100755 backend/data/enwiki/genDescData.py create mode 100755 backend/data/enwiki/genImgData.py delete mode 100755 backend/data/enwiki/getEnwikiImgData.py (limited to 'backend/data/enwiki') diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 6462d7d..1c16a2e 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -1,39 +1,52 @@ -Downloaded Files -================ +This directory holds files obtained from/using [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). + +# Downloaded Files - enwiki-20220501-pages-articles-multistream.xml.bz2
- Obtained via - (site suggests downloading from a mirror). Contains text - content and metadata for pages in English Wikipedia - (current revision only, excludes talk pages). Some file - content and format information was available from - . + Obtained via https://dumps.wikimedia.org/ (the site suggests downloading from a mirror). + Contains text content and metadata for pages in enwiki. + Some file content and format information was available from the accompanying dump documentation. - enwiki-20220501-pages-articles-multistream-index.txt.bz2
Obtained like above. Holds lines of the form offset1:pageId1:title1, - providing offsets, for each page, into the dump file, of a chunk of + providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it. -Generated Files -=============== +# Generated Dump-Index Files +- genDumpIndexDb.py
+ Creates an sqlite-database version of the enwiki-dump index file. - dumpIndex.db
- Holds data from the enwiki dump index file. Generated by - genDumpIndexDb.py, and used by lookupPage.py to get content for a - given page title.
+ Generated by genDumpIndexDb.py.
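For orientation, here is a minimal sketch (not code from this repo) of how dumpIndex.db and the multistream dump file can be combined to pull out the bz2 chunk containing a given page, using the `offsets` table listed below; the function name and the end-of-dump handling are illustrative assumptions.

```python
#!/usr/bin/python3
# Hypothetical helper: look up a title's chunk offsets in dumpIndex.db,
# then decompress that one ~100-page bz2 stream from the multistream dump.
import bz2, sqlite3

def getChunkXml(title, dumpFile="enwiki-20220501-pages-articles-multistream.xml.bz2"):
    dbCon = sqlite3.connect("dumpIndex.db")
    row = dbCon.execute(
        "SELECT offset, next_offset FROM offsets WHERE title = ?", (title,)).fetchone()
    dbCon.close()
    if row is None:
        return None
    offset, nextOffset = row
    with open(dumpFile, "rb") as f:
        f.seek(offset)
        # Assumes next_offset is NULL for the last chunk, so read to EOF in that case
        data = f.read() if nextOffset is None else f.read(nextOffset - offset)
    return bz2.decompress(data).decode("utf-8")  # XML <page> elements for that chunk
```

The returned text still has to be scanned for the `<page>` element whose title matches; lookupPage.py (under Other Files below) covers this lookup end to end.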
Tables:
- - offsets: title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next\_offset INT -- enwikiData.db
- Holds data obtained from the enwiki dump file, in 'pages', - 'redirects', and 'descs' tables. Generated by genData.py, which uses - python packages mwxml and mwparserfromhell.
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` + +# Description Database Files +- genDescData.py
+ Reads through pages in the dump file, and adds short-description info to a database. +- descData.db
+ Generated by genDescData.py.
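As a usage illustration only (not code from this repo), a short description could be looked up by title roughly as follows, using the tables listed below; the single-step redirect handling and the quoting of the `desc` column are my own choices.

```python
#!/usr/bin/python3
# Hypothetical lookup: resolve a title to a page id via 'pages', follow a
# redirect if one exists, then read the short description from 'descs'.
import sqlite3

def getDesc(title):
    dbCon = sqlite3.connect("descData.db")
    cur = dbCon.cursor()
    row = cur.execute("SELECT id FROM pages WHERE title = ?", (title,)).fetchone()
    if row is None:
        dbCon.close()
        return None
    pageId = row[0]
    # If the page is a redirect, switch to the target page's id
    redir = cur.execute("SELECT target FROM redirects WHERE id = ?", (pageId,)).fetchone()
    if redir is not None:
        target = cur.execute("SELECT id FROM pages WHERE title = ?", (redir[0],)).fetchone()
        if target is None:
            dbCon.close()
            return None
        pageId = target[0]
    row = cur.execute('SELECT "desc" FROM descs WHERE id = ?', (pageId,)).fetchone()
    dbCon.close()
    return row[0] if row is not None else None
```

For example, `getDesc("Earth")` would return that page's extracted lead-paragraph summary, if one was stored.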
Tables:
- - pages: id INT PRIMARY KEY, title TEXT UNIQUE - - redirects: id INT PRIMARY KEY, target TEXT - - descs: id INT PRIMARY KEY, desc TEXT -- enwikiImgs.db
- Holds infobox-images obtained for some set of wiki page-ids. - Generated by running getEnwikiImgData.py, which uses the enwiki dump - file and dumpIndex.db.
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` + +# Image Database Files +- genImgData.py
+ Used to find infobox image names for page IDs, storing them in a database. +- downloadImgLicenseInfo.py
+ Used to download licensing metadata for image names via Wikipedia's online API, storing it in a database. +- imgData.db
+ Used to hold metadata about infobox images for a set of page IDs. + Generated using genImgData.py and downloadImgLicenseInfo.py.
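To make the downloadImgLicenseInfo.py entry above concrete, here is a rough single-image sketch of the kind of request it issues against Wikipedia's API (the real script batches up to 50 titles per request and strips HTML from the returned fields); the function name, User-Agent string, and choice of extmetadata fields are illustrative assumptions, not the script's exact behaviour.

```python
#!/usr/bin/python3
# Hypothetical single-title query for image license metadata via the MediaWiki API.
import json, urllib.parse, urllib.request

def getLicenseInfo(imgName):
    params = {
        "action": "query", "format": "json",
        "titles": "File:" + imgName,
        "prop": "imageinfo", "iiprop": "url|extmetadata",
    }
    url = "https://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": "imgDataSketch/0.1 (example)"})
    with urllib.request.urlopen(req) as resp:
        data = json.load(resp)
    page = next(iter(data["query"]["pages"].values()))
    info = page.get("imageinfo")
    if not info:
        return None  # e.g. the file does not exist
    meta = info[0].get("extmetadata", {})
    return {  # returned values may still contain HTML markup
        "license": meta.get("LicenseShortName", {}).get("value"),
        "artist": meta.get("Artist", {}).get("value"),
        "credit": meta.get("Credit", {}).get("value"),
        "restrictions": meta.get("Restrictions", {}).get("value"),
        "url": info[0].get("url"),
    }
```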
Tables:
- - page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT - (img\_name may be null, which is used to avoid re-processing the page-id on a second pass) - - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT - (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info) + - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT`
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. + - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
May lack rows for some `img_name` values in `page_imgs`, where license info could not be obtained. +- downloadEnwikiImgs.py
+ Used to download image files into imgs/. + +# Other Files +- lookupPage.py
+ Running `lookupPage.py title1` looks in the dump for a page with a given title, + and prints the contents to stdout. Uses dumpIndex.db. + diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py index de9b862..2929a0d 100755 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ b/backend/data/enwiki/downloadEnwikiImgs.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names outDir = "imgs" licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 8231fbb..097304b 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names apiUrl = "https://en.wikipedia.org/w/api.php" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py deleted file mode 100755 index 3e60bb5..0000000 --- a/backend/data/enwiki/genData.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" -usageInfo += "and short-description info to an sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages -enwikiDb = "enwikiData.db" - -# Some regexps and functions for parsing wikitext -descLineRegex = re.compile("^ *[A-Z'\"]") -embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") - # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") -parensGrpRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") -def convertTemplateReplace(match): - if match.group(2) == None: - return f"{match.group(1)} {match.group(4)}" - else: - return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" -def parseDesc(text): - # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank - # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, - lines = [] - openBraceCount = 0 - openBracketCount = 0 - inComment = False - skip = False - for line in text.splitlines(): - line = line.strip() - if len(lines) == 0: - if len(line) > 0: - if openBraceCount > 0 or line[0] == "{": - openBraceCount += line.count("{") - openBraceCount -= line.count("}") - skip = True - if openBracketCount > 0 or line[0] == "[": - openBracketCount += line.count("[") - openBracketCount -= line.count("]") - skip = True - if inComment or line.find("") != -1: - if inComment: - inComment = False - skip = True - else: - inComment = True - skip = True - if skip: - skip = False - continue - if line[-1] == ":": # Seems to 
help avoid disambiguation pages - return None - if descLineRegex.match(line) != None: - lines.append(line) - else: - if len(line) == 0: - return removeMarkup(" ".join(lines)) - lines.append(line) - if len(lines) > 0: - return removeMarkup(" ".join(lines)) - return None -def removeMarkup(content): - content = embeddedHtmlRegex.sub("", content) - content = convertTemplateRegex.sub(convertTemplateReplace, content) - content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGrpRegex.sub("", content) - content = leftoverBraceRegex.sub("", content) - return content -# Other helper functions -def convertTitle(title): - return html.unescape(title).replace("_", " ") - -# Check for existing db -if os.path.exists(enwikiDb): - print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") -dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Read through dump file -print("Reading dump file") -with bz2.open(dumpFile, mode='rt') as file: - dump = mwxml.Dump.from_file(file) - pageNum = 0 - for page in dump: - pageNum += 1 - if pageNum % 1e4 == 0: - print(f"At page {pageNum}") - # Parse page - if page.namespace == 0: - try: - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) - except sqlite3.IntegrityError as e: - # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": {e}") - continue - if page.redirect != None: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) - else: - revision = next(page) - desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py new file mode 100755 index 0000000..032dbed --- /dev/null +++ b/backend/data/enwiki/genDescData.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" +usageInfo += "and short-description info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +enwikiDb = "descData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = re.compile("^ *[A-Z'\"]") +embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +parensGrpRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") +def convertTemplateReplace(match): + if match.group(2) == None: + return f"{match.group(1)} {match.group(4)}" + else: + return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" +def parseDesc(text): + # Find first matching line 
outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank + # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if len(lines) == 0: + if len(line) > 0: + if openBraceCount > 0 or line[0] == "{": + openBraceCount += line.count("{") + openBraceCount -= line.count("}") + skip = True + if openBracketCount > 0 or line[0] == "[": + openBracketCount += line.count("[") + openBracketCount -= line.count("]") + skip = True + if inComment or line.find("") != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ":": # Seems to help avoid disambiguation pages + return None + if descLineRegex.match(line) != None: + lines.append(line) + else: + if len(line) == 0: + return removeMarkup(" ".join(lines)) + lines.append(line) + if len(lines) > 0: + return removeMarkup(" ".join(lines)) + return None +def removeMarkup(content): + content = embeddedHtmlRegex.sub("", content) + content = convertTemplateRegex.sub(convertTemplateReplace, content) + content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = parensGrpRegex.sub("", content) + content = leftoverBraceRegex.sub("", content) + return content +# Other helper functions +def convertTitle(title): + return html.unescape(title).replace("_", " ") + +# Check for existing db +if os.path.exists(enwikiDb): + print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Read through dump file +print("Reading dump file") +with bz2.open(dumpFile, mode='rt') as file: + dump = mwxml.Dump.from_file(file) + pageNum = 0 + for page in dump: + pageNum += 1 + if pageNum % 1e4 == 0: + print(f"At page {pageNum}") + # Parse page + if page.namespace == 0: + try: + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print(f"Failed to add page with title \"{page.title}\": {e}") + continue + if page.redirect != None: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py new file mode 100755 index 0000000..9bd28f4 --- /dev/null +++ b/backend/data/enwiki/genImgData.py @@ -0,0 +1,178 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, html, urllib.parse +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" +usageInfo += "trying to get infobox image filenames, adding info 
to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +def getInputPageIds(): + pageIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (pageId,) in dbCur.execute("SELECT id from wiki_ids"): + pageIds.add(pageId) + dbCon.close() + return pageIds +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" +indexDb = "dumpIndex.db" +imgDb = "imgData.db" # Output db +idLineRegex = re.compile(r"(.*)") +imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") +bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") +imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) +cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) + +# Open dbs +indexDbCon = sqlite3.connect(indexDb) +indexDbCur = indexDbCon.cursor() +imgDbCon = sqlite3.connect(imgDb) +imgDbCur = imgDbCon.cursor() +# Create image-db table +pidsDone = set() +if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL + imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +else: + for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"): + pidsDone.add(pid) + print(f"Will skip {len(pidsDone)} already-processed page-ids") +# Get input pageIds +print("Getting input page-ids", file=sys.stderr) +pageIds = getInputPageIds() +for pid in pidsDone: + pageIds.remove(pid) +print(f"Found {len(pageIds)} page-ids to process") +# Get page-id dump-file offsets +print("Getting dump-file offsets", file=sys.stderr) +offsetToPageids = {} +offsetToEnd = {} +iterNum = 0 +for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}", file=sys.stderr) + # + query = "SELECT offset, next_offset FROM offsets WHERE id = ?" + row = indexDbCur.execute(query, (pageId,)).fetchone() + if row == None: + print(f"WARNING: Page id {pageId} not found", file=sys.stderr) + continue + (chunkOffset, endOffset) = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) +print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr) +# Look through dump file, jumping to chunks containing relevant pages +print("Reading through dump file", file=sys.stderr) +def getImageName(content): + """ Given an array of text-content lines, returns an image-filename, or None """ + for line in content: + match = imageLineRegex.match(line) + if match != None: + imageName = match.group(1).strip() + if imageName == "": + return None + imageName = html.unescape(imageName) + # Account for {{... + if imageName.startswith("{"): + match = cssImgCropRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for [[File:...|...]] + if imageName.startswith("["): + match = bracketImageRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for