aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/enwiki/gen_img_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tol_data/enwiki/gen_img_data.py')
-rwxr-xr-xbackend/tol_data/enwiki/gen_img_data.py193
1 files changed, 193 insertions, 0 deletions
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
new file mode 100755
index 0000000..d4696f0
--- /dev/null
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -0,0 +1,193 @@
+#!/usr/bin/python3
+
+"""
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
+
+import re
+import os, bz2, html, urllib.parse
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+INDEX_DB = 'dumpIndex.db'
+IMG_DB = 'img_data.db' # The database to create
+DB_FILE = os.path.join('..', 'data.db')
+#
+ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
+IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
+BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
+IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
+CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+
+def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
+ print('Opening databases')
+ indexDbCon = sqlite3.connect(indexDb)
+ indexDbCur = indexDbCon.cursor()
+ imgDbCon = sqlite3.connect(imgDb)
+ imgDbCur = imgDbCon.cursor()
+ print('Checking tables')
+ if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
+ # Create tables if not present
+ imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+ imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
+ else:
+ # Check for already-processed page IDs
+ numSkipped = 0
+ for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
+ if pid in pageIds:
+ pageIds.remove(pid)
+ numSkipped += 1
+ else:
+ print(f'Found already-processed page ID {pid} which was not in input set')
+ print(f'Will skip {numSkipped} already-processed page IDs')
+ #
+ print('Getting dump-file offsets')
+ offsetToPageids: dict[int, list[int]] = {}
+ offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
+ iterNum = 0
+ for pageId in pageIds:
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
+ row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
+ if row is None:
+ print(f'WARNING: Page ID {pageId} not found')
+ continue
+ chunkOffset, endOffset = row
+ offsetToEnd[chunkOffset] = endOffset
+ if chunkOffset not in offsetToPageids:
+ offsetToPageids[chunkOffset] = []
+ offsetToPageids[chunkOffset].append(pageId)
+ print(f'Found {len(offsetToEnd)} chunks to check')
+ #
+ print('Iterating through chunks in dump file')
+ with open(dumpFile, mode='rb') as file:
+ iterNum = 0
+ for pageOffset, endOffset in offsetToEnd.items():
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ chunkPageIds = offsetToPageids[pageOffset]
+ # Jump to chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for pages
+ lines = data.splitlines()
+ lineIdx = 0
+ while lineIdx < len(lines):
+ # Look for <page>
+ if lines[lineIdx].lstrip() != '<page>':
+ lineIdx += 1
+ continue
+ # Check page id
+ lineIdx += 3
+ idLine = lines[lineIdx].lstrip()
+ match = ID_LINE_REGEX.fullmatch(idLine)
+ if match is None or int(match.group(1)) not in chunkPageIds:
+ lineIdx += 1
+ continue
+ pageId = int(match.group(1))
+ lineIdx += 1
+ # Look for <text> in <page>
+ foundText = False
+ while lineIdx < len(lines):
+ if not lines[lineIdx].lstrip().startswith('<text '):
+ lineIdx += 1
+ continue
+ foundText = True
+ # Get text content
+ content: list[str] = []
+ line = lines[lineIdx]
+ content.append(line[line.find('>') + 1:])
+ lineIdx += 1
+ foundTextEnd = False
+ while lineIdx < len(lines):
+ line = lines[lineIdx]
+ if not line.endswith('</text>'):
+ content.append(line)
+ lineIdx += 1
+ continue
+ foundTextEnd = True
+ content.append(line[:line.rfind('</text>')])
+ # Look for image-filename
+ imageName = getImageName(content)
+ imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName))
+ break
+ if not foundTextEnd:
+ print(f'WARNING: Did not find </text> for page id {pageId}')
+ break
+ if not foundText:
+ print(f'WARNING: Did not find <text> for page id {pageId}')
+ #
+ print('Closing databases')
+ indexDbCon.close()
+ imgDbCon.commit()
+ imgDbCon.close()
+def getImageName(content: list[str]) -> str | None:
+ """ Given an array of text-content lines, tries to return an infoxbox image name, or None """
+ # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+ for line in content:
+ match = IMG_LINE_REGEX.match(line)
+ if match is not None:
+ imageName = match.group(1).strip()
+ if imageName == '':
+ return None
+ imageName = html.unescape(imageName)
+ # Account for {{...
+ if imageName.startswith('{'):
+ match = CSS_IMG_CROP_REGEX.match(imageName)
+ if match is None:
+ return None
+ imageName = match.group(1)
+ # Account for [[File:...|...]]
+ if imageName.startswith('['):
+ match = BRACKET_IMG_REGEX.match(imageName)
+ if match is None:
+ return None
+ imageName = match.group(1)
+ # Account for <!--
+ if imageName.find('<!--') != -1:
+ return None
+ # Remove an initial 'File:'
+ if imageName.startswith('File:'):
+ imageName = imageName[5:]
+ # Remove an initial 'Image:'
+ if imageName.startswith('Image:'):
+ imageName = imageName[6:]
+ # Check for extension
+ match = IMG_NAME_REGEX.match(imageName)
+ if match is not None:
+ imageName = match.group(0)
+ imageName = urllib.parse.unquote(imageName)
+ imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
+ imageName = imageName.replace('_', ' ')
+ return imageName
+ # Exclude lines like: | image = &lt;imagemap&gt;
+ return None
+ return None
+
+def getInputPageIdsFromDb(dbFile: str) -> set[int]:
+ print('Getting input page-ids')
+ pageIds: set[int] = set()
+ dbCon = sqlite3.connect(dbFile)
+ dbCur = dbCon.cursor()
+ for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
+ pageIds.add(pageId)
+ dbCon.close()
+ print(f'Found {len(pageIds)}')
+ return pageIds
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.parse_args()
+ #
+ pageIds = getInputPageIdsFromDb(DB_FILE)
+ genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)