From e78c4df403e5f98afa08f7a0841ff233d5f6d05b Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 01:42:41 +1000 Subject: Update backend READMEs, rename some files for consistency --- backend/data/enwiki/genImgData.py | 178 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100755 backend/data/enwiki/genImgData.py (limited to 'backend/data/enwiki/genImgData.py') diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py new file mode 100755 index 0000000..9bd28f4 --- /dev/null +++ b/backend/data/enwiki/genImgData.py @@ -0,0 +1,178 @@ +#!/usr/bin/python3

import sys, re
import bz2, html, urllib.parse
import sqlite3

# The script takes no arguments; passing any argument prints usage and exits.
usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
if len(sys.argv) > 1:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

def getInputPageIds():
	"""Return the set of wikipedia page-ids listed in ../data.db's wiki_ids table."""
	pageIds = set()
	dbCon = sqlite3.connect("../data.db")
	dbCur = dbCon.cursor()
	for (pageId,) in dbCur.execute("SELECT id from wiki_ids"):
		pageIds.add(pageId)
	dbCon.close()
	return pageIds
# Input dump, plus the index db that maps page-ids to byte offsets within it
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
imgDb = "imgData.db" # Output db
# NOTE(review): this pattern looks garbled — it matches any line as written; it
# presumably read something like r"<id>(.*)</id>" before angle-bracket markup was
# stripped in extraction. Verify against the original file.
idLineRegex = re.compile(r"(.*)")
# Infobox "| image = value" line; captures the value up to the next "|"
imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
# Captures "File:..." from a [[File:...|...]] wikilink
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
# Recognized image-filename extensions (case-insensitive)
imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
# Captures the image value from a {{CSS image crop|image=...}} template
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)

# Open dbs
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
# Create image-db table
pidsDone = set()
if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
	imgDbCur.execute("CREATE TABLE 
page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
	imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
else:
	# Table already exists: collect already-processed page-ids so they get skipped
	for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
		pidsDone.add(pid)
	print(f"Will skip {len(pidsDone)} already-processed page-ids")
# Get input pageIds
print("Getting input page-ids", file=sys.stderr)
pageIds = getInputPageIds()
for pid in pidsDone:
	# NOTE(review): set.remove() raises KeyError if a done id is absent from the
	# current input set — presumably inputs are a superset of prior runs; confirm.
	pageIds.remove(pid)
print(f"Found {len(pageIds)} page-ids to process")
# Get page-id dump-file offsets
print("Getting dump-file offsets", file=sys.stderr)
offsetToPageids = {} # chunk start-offset -> list of page-ids stored in that chunk
offsetToEnd = {} # chunk start-offset -> offset of the following chunk
iterNum = 0
for pageId in pageIds:
	iterNum += 1
	if iterNum % 1e4 == 0: # progress message every 10k ids
		print(f"At iteration {iterNum}", file=sys.stderr)
	#
	query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
	row = indexDbCur.execute(query, (pageId,)).fetchone()
	if row == None:
		print(f"WARNING: Page id {pageId} not found", file=sys.stderr)
		continue
	(chunkOffset, endOffset) = row
	offsetToEnd[chunkOffset] = endOffset
	# Group page-ids by chunk so each chunk is decompressed at most once
	if chunkOffset not in offsetToPageids:
		offsetToPageids[chunkOffset] = []
	offsetToPageids[chunkOffset].append(pageId)
print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr)
# Look through dump file, jumping to chunks containing relevant pages
print("Reading through dump file", file=sys.stderr)
def getImageName(content):
	""" Given an array of text-content lines, returns an image-filename, or None """
	for line in content:
		match = imageLineRegex.match(line)
		if match != None:
			imageName = match.group(1).strip()
			if imageName == "": # empty "| image =" field
				return None
			# Decode HTML entities (dump text is XML-escaped)
			imageName = html.unescape(imageName)
			# Account for {{... 
			if imageName.startswith("{"):
				# Value is a {{CSS image crop|image=...}} template: pull out its image field
				match = cssImgCropRegex.match(imageName)
				if match == None:
					return None
				imageName = match.group(1)
			# Account for [[File:...|...]]
			if imageName.startswith("["):
				# Value is a wikilink: keep only the "File:..." target
				match = bracketImageRegex.match(imageName)
				if match == None:
					return None
				imageName = match.group(1)
			# Account for