#!/usr/bin/python3

import re
import bz2, html, urllib.parse
import sqlite3
import argparse

# NOTE(review): this file's newlines were destroyed by an extraction step; the
# structure below is reconstructed from the statement sequence. Angle-bracket
# text (e.g. XML tag literals) also appears to have been stripped throughout —
# confirm suspicious regexes against the original source.

parser = argparse.ArgumentParser(
    description="""
For some set of page IDs, looks up their content in the wiki dump, and tries to
parse infobox image names, storing them into a database.

The program can be re-run with an updated set of page IDs, and will skip
already-processed page IDs.
""",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()

def getInputPageIds() -> set[int]:
    """Reads the set of input page IDs from the wiki_ids table of ../data.db."""
    pageIds: set[int] = set()
    dbCon = sqlite3.connect('../data.db')
    try:
        dbCur = dbCon.cursor()
        for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
            pageIds.add(pageId)
    finally:
        # Close even if the query fails (original leaked the connection on error)
        dbCon.close()
    return pageIds

dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
indexDb = 'dumpIndex.db'
imgDb = 'imgData.db'  # The database to create

# NOTE(review): this pattern looks like it lost literal tag text (presumably
# <id>(.*)</id>) to the angle-bracket stripping noted above — as written it
# matches any line. Verify against the original before relying on it.
idLineRegex = re.compile(r'(.*)')
imageLineRegex = re.compile(r'.*\| *image *= *([^|]*)')
bracketImageRegex = re.compile(r'\[\[(File:[^|]*).*]]')
imageNameRegex = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
cssImgCropRegex = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)

print('Getting input page-ids')
pageIds = getInputPageIds()
print(f'Found {len(pageIds)}')

print('Opening databases')
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()

print('Checking tables')
if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
    # Create tables if not present
    imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)')  # img_name may be NULL
    imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
else:
    # Check for already-processed page IDs, and drop them from the input set
    numSkipped = 0
    for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
        if pid in pageIds:
            pageIds.remove(pid)
            numSkipped += 1
        else:
            print(f'WARNING: Found already-processed page ID {pid} which was not in input set')
    print(f'Will skip {numSkipped} already-processed page IDs')

print('Getting dump-file offsets')
offsetToPageids: dict[int, list[int]] = {}
offsetToEnd: dict[int, int] = {}  # Maps chunk-start offsets to their chunk-end offsets
iterNum = 0
for pageId in pageIds:
    iterNum += 1
    if iterNum % 10000 == 0:  # was `1e4`: integer avoids float modulo
        print(f'At iteration {iterNum}')
    # Look up which bz2 chunk of the dump holds this page
    query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
    row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
    if row is None:
        print(f'WARNING: Page ID {pageId} not found')
        continue
    chunkOffset, endOffset = row
    offsetToEnd[chunkOffset] = endOffset
    # Group page IDs by the chunk that contains them
    offsetToPageids.setdefault(chunkOffset, []).append(pageId)
print(f'Found {len(offsetToEnd)} chunks to check')

print('Iterating through chunks in dump file')

def getImageName(content: list[str]) -> str | None:
    """ Given an array of text-content lines, tries to return an infobox image name, or None """
    # Doesn't try and find images in outside-infobox [[File:...]] and sections
    for line in content:
        match = imageLineRegex.match(line)
        if match is not None:
            imageName = match.group(1).strip()
            if imageName == '':
                return None
            imageName = html.unescape(imageName)
            # Account for {{css image crop|...}} wrappers
            if imageName.startswith('{'):
                match = cssImgCropRegex.match(imageName)
                if match is None:
                    return None
                imageName = match.group(1)
            # Account for [[File:...|...]]
            if imageName.startswith('['):
                match = bracketImageRegex.match(imageName)
                if match is None:
                    return None
                imageName = match.group(1)
            # Account for
            # NOTE(review): the visible source is truncated here — the
            # remainder of getImageName (and any code after it) is not in
            # this chunk; do not assume the function is complete.