1 files changed, 193 insertions, 0 deletions
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
new file mode 100755
index 0000000..d4696f0
--- /dev/null
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -0,0 +1,193 @@
+#!/usr/bin/python3
+
+"""
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
+
+import re
+import os, bz2, html, urllib.parse
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+INDEX_DB = 'dumpIndex.db'
+IMG_DB = 'img_data.db' # The database to create
+DB_FILE = os.path.join('..', 'data.db')
+#
+ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
+IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
+BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
+IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
+CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+
+def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
+	print('Opening databases')
+	indexDbCon = sqlite3.connect(indexDb)
+	indexDbCur = indexDbCon.cursor()
+	imgDbCon = sqlite3.connect(imgDb)
+	imgDbCur = imgDbCon.cursor()
+	print('Checking tables')
+	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
+		# Create tables if not present
+		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
+	else:
+		# Check for already-processed page IDs
+		numSkipped = 0
+		for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
+			if pid in pageIds:
+				pageIds.remove(pid)
+				numSkipped += 1
+			else:
+				print(f'Found already-processed page ID {pid} which was not in input set')
+		print(f'Will skip {numSkipped} already-processed page IDs')
+	#
+	print('Getting dump-file offsets')
+	offsetToPageids: dict[int, list[int]] = {}
+	offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
+	iterNum = 0
+	for pageId in pageIds:
+		iterNum += 1
+		if iterNum % 1e4 == 0:
+			print(f'At iteration {iterNum}')
+		#
+		query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
+		row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
+		if row is None:
+			print(f'WARNING: Page ID {pageId} not found')
+			continue
+		chunkOffset, endOffset = row
+		offsetToEnd[chunkOffset] = endOffset
+		if chunkOffset not in offsetToPageids:
+			offsetToPageids[chunkOffset] = []
+		offsetToPageids[chunkOffset].append(pageId)
+	print(f'Found {len(offsetToEnd)} chunks to check')
+	#
+	print('Iterating through chunks in dump file')
+	with open(dumpFile, mode='rb') as file:
+		iterNum = 0
+		for pageOffset, endOffset in offsetToEnd.items():
+			iterNum += 1
+			if iterNum % 100 == 0:
+				print(f'At iteration {iterNum}')
+			#
+			chunkPageIds = offsetToPageids[pageOffset]
+			# Jump to chunk
+			file.seek(pageOffset)
+			compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+			data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+			# Look in chunk for pages
+			lines = data.splitlines()
+			lineIdx = 0
+			while lineIdx < len(lines):
+				# Look for <page>
+				if lines[lineIdx].lstrip() != '<page>':
+					lineIdx += 1
+					continue
+				# Check page id
+				lineIdx += 3
+				idLine = lines[lineIdx].lstrip()
+				match = ID_LINE_REGEX.fullmatch(idLine)
+				if match is None or int(match.group(1)) not in chunkPageIds:
+					lineIdx += 1
+					continue
+				pageId = int(match.group(1))
+				lineIdx += 1
+				# Look for <text> in <page>
+				foundText = False
+				while lineIdx < len(lines):
+					if not lines[lineIdx].lstrip().startswith('<text '):
+						lineIdx += 1
+						continue
+					foundText = True
+					# Get text content
+					content: list[str] = []
+					line = lines[lineIdx]
+					content.append(line[line.find('>') + 1:])
+					lineIdx += 1
+					foundTextEnd = False
+					while lineIdx < len(lines):
+						line = lines[lineIdx]
+						if not line.endswith('</text>'):
+							content.append(line)
+							lineIdx += 1
+							continue
+						foundTextEnd = True
+						content.append(line[:line.rfind('</text>')])
+						# Look for image-filename
+						imageName = getImageName(content)
+						imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName))
+						break
+					if not foundTextEnd:
+						print(f'WARNING: Did not find </text> for page id {pageId}')
+					break
+				if not foundText:
+					print(f'WARNING: Did not find <text> for page id {pageId}')
+	#
+	print('Closing databases')
+	indexDbCon.close()
+	imgDbCon.commit()
+	imgDbCon.close()
+def getImageName(content: list[str]) -> str | None:
+	""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
+	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+	for line in content:
+		match = IMG_LINE_REGEX.match(line)
+		if match is not None:
+			imageName = match.group(1).strip()
+			if imageName == '':
+				return None
+			imageName = html.unescape(imageName)
+			# Account for {{...
+			if imageName.startswith('{'):
+				match = CSS_IMG_CROP_REGEX.match(imageName)
+				if match is None:
+					return None
+				imageName = match.group(1)
+			# Account for [[File:...|...]]
+			if imageName.startswith('['):
+				match = BRACKET_IMG_REGEX.match(imageName)
+				if match is None:
+					return None
+				imageName = match.group(1)
+			# Account for <!--
+			if imageName.find('<!--') != -1:
+				return None
+			# Remove an initial 'File:'
+			if imageName.startswith('File:'):
+				imageName = imageName[5:]
+			# Remove an initial 'Image:'
+			if imageName.startswith('Image:'):
+				imageName = imageName[6:]
+			# Check for extension
+			match = IMG_NAME_REGEX.match(imageName)
+			if match is not None:
+				imageName = match.group(0)
+				imageName = urllib.parse.unquote(imageName)
+				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
+				imageName = imageName.replace('_', ' ')
+				return imageName
+			# Exclude lines like: | image = &lt;imagemap&gt;
+			return None
+	return None
+
+def getInputPageIdsFromDb(dbFile: str) -> set[int]:
+	print('Getting input page-ids')
+	pageIds: set[int] = set()
+	dbCon = sqlite3.connect(dbFile)
+	dbCur = dbCon.cursor()
+	for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
+		pageIds.add(pageId)
+	dbCon.close()
+	print(f'Found {len(pageIds)}')
+	return pageIds
+if __name__ == '__main__':
+	import argparse
+	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+	parser.parse_args()
+	#
+	pageIds = getInputPageIdsFromDb(DB_FILE)
+	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)