From abb936f5d76f7fe5cec1e8948d287da86643d504 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 23:16:42 +1000 Subject: Refactor backend scripts --- backend/data/enwiki/README.md | 2 +- backend/data/enwiki/downloadEnwikiImgs.py | 88 -------------------------- backend/data/enwiki/downloadImgLicenseInfo.py | 60 ++++++++++-------- backend/data/enwiki/downloadImgs.py | 91 +++++++++++++++++++++++++++ backend/data/enwiki/genDescData.py | 43 +++++++------ backend/data/enwiki/genDumpIndexDb.py | 26 ++++---- backend/data/enwiki/genImgData.py | 72 ++++++++++++--------- backend/data/enwiki/lookupPage.py | 22 ++++--- 8 files changed, 219 insertions(+), 185 deletions(-) delete mode 100755 backend/data/enwiki/downloadEnwikiImgs.py create mode 100755 backend/data/enwiki/downloadImgs.py (limited to 'backend/data/enwiki') diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 1c16a2e..90d16c7 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -42,7 +42,7 @@ This directory holds files obtained from/using [English Wikipedia](https://en.wi `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. -- downloadEnwikiImgs.py
+- downloadImgs.py
Used to download image files into imgs/. # Other Files diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py deleted file mode 100755 index 2929a0d..0000000 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os -import sqlite3 -import urllib.parse, requests -import time, signal - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an sqlite db,\n" -usageInfo += "into a specified directory.'\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and looks\n" -usageInfo += "in the output directory do decide what to skip.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -imgDb = "imgData.db" # About 130k image names -outDir = "imgs" -licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) - -# Create output directory if not present -if not os.path.exists(outDir): - os.mkdir(outDir) -# Get existing image names -print("Gettings already-downloaded images") -fileList = os.listdir(outDir) -pageIdsDone = set() -for filename in fileList: - (basename, extension) = os.path.splitext(filename) - pageIdsDone.add(int(basename)) -print(f"Found {len(pageIdsDone)} already-downloaded images") -# Set SIGINT handler -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) -# Open db -dbCon = sqlite3.connect(imgDb) -dbCur = dbCon.cursor() -# Start downloads -print("Starting downloads") -iterNum = 0 -query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \ - " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name" -for (pageId, license, artist, credit, restrictions, url) in 
dbCur.execute(query): - if pageId in pageIdsDone: - continue - if interrupted: - print(f"Exiting loop") - break - # Check for problematic attributes - if license == None or licenseRegex.fullmatch(license) == None: - continue - if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None: - continue - if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None: - continue - if restrictions != None and restrictions != "": - continue - # Download image - iterNum += 1 - print(f"Iteration {iterNum}: Downloading for page-id {pageId}") - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) - sys.exit(1) - outFile = f"{outDir}/{pageId}{extension}" - headers = { - "user-agent": "terryt.dev (terry06890@gmail.com)", - "accept-encoding": "gzip", - } - try: - response = requests.get(url, headers=headers) - with open(outFile, 'wb') as file: - file.write(response.content) - time.sleep(1) - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec" - # It's unclear how to properly check for cache misses, so just do about <=1 per sec - except Exception as e: - print(f"Error while downloading to {outFile}: {e}", file=sys.stderr) -# Close db -dbCon.close() diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 097304b..399922e 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -5,41 +5,48 @@ import sqlite3, urllib.parse, html import requests import time, signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads image names from a file, and uses enwiki's API to obtain\n" -usageInfo += "licensing information for them, adding the info to a sqlite db.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to 
finish an ongoing download and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and looks\n" -usageInfo += "at names added to the db to decide what to skip.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads image names from a database, and uses enwiki's online API to obtain +licensing information for them, adding the info to the database. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +at already-processed names to decide what to skip. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "imgData.db" # About 130k image names +imgDb = "imgData.db" apiUrl = "https://en.wikipedia.org/w/api.php" +userAgent = "terryt.dev (terry06890@gmail.com)" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") whitespaceRegex = re.compile(r"\s+") -# Open db +print("Opening database") dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() dbCur2 = dbCon.cursor() -# Create table if it doesn't exist +print("Checking for table") if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None: dbCur.execute("CREATE TABLE imgs(" \ "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)") -# Get image names + print("Reading image names") imgNames = set() for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"): imgNames.add(imgName) -print(f"Found {len(imgNames)} images") +print(f"Found {len(imgNames)}") + +print("Checking for already-processed images") oldSz = len(imgNames) for (imgName,) in dbCur.execute("SELECT name FROM imgs"): imgNames.discard(imgName) -print(f"Skipping {oldSz - len(imgNames)} already-done images") +print(f"Found {oldSz - len(imgNames)}") + # Set SIGINT handler interrupted = False oldHandler = None @@ -48,7 +55,8 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler 
= signal.signal(signal.SIGINT, onSigint) -# Iterate through image names, making API requests + +print("Iterating through image names") imgNames = list(imgNames) iterNum = 0 for i in range(0, len(imgNames), batchSz): @@ -63,7 +71,7 @@ for i in range(0, len(imgNames), batchSz): imgBatch = ["File:" + x for x in imgBatch] # Make request headers = { - "user-agent": "terryt.dev (terry06890@gmail.com)", + "user-agent": userAgent, "accept-encoding": "gzip", } params = { @@ -80,16 +88,16 @@ for i in range(0, len(imgNames), batchSz): response = requests.get(apiUrl, params=params, headers=headers) responseObj = response.json() except Exception as e: - print(f"Error while downloading info: {e}", file=sys.stderr) - print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr) + print(f"ERROR: Exception while downloading info: {e}") + print(f"\tImage batch: " + "|".join(imgBatch)) continue # Parse response-object if "query" not in responseObj or "pages" not in responseObj["query"]: - print("WARNING: Response object for doesn't have page data", file=sys.stderr) - print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr) + print("WARNING: Response object for doesn't have page data") + print("\tImage batch: " + "|".join(imgBatch)) if "error" in responseObj: errorCode = responseObj["error"]["code"] - print(f"\tError code: {errorCode}", file=sys.stderr) + print(f"\tError code: {errorCode}") if errorCode == "maxlag": time.sleep(5) continue @@ -111,10 +119,10 @@ for i in range(0, len(imgNames), batchSz): title = normalisedToInput[title] title = title[5:] # Remove 'File:' if title not in imgNames: - print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr) + print(f"WARNING: Got title \"{title}\" not in image-name list") continue if "imageinfo" not in page: - print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr) + print(f"WARNING: No imageinfo section for page \"{title}\"") continue metadata = page["imageinfo"][0]["extmetadata"] url 
= page["imageinfo"][0]["url"] @@ -122,7 +130,7 @@ for i in range(0, len(imgNames), batchSz): artist = metadata['Artist']['value'] if 'Artist' in metadata else None credit = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None - # Remove newlines + # Remove markup if artist != None: artist = tagRegex.sub(" ", artist) artist = whitespaceRegex.sub(" ", artist) @@ -134,7 +142,9 @@ for i in range(0, len(imgNames), batchSz): credit = html.unescape(credit) credit = urllib.parse.unquote(credit) # Add to db - dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", (title, license, artist, credit, restrictions, url)) -# Close db + dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", + (title, license, artist, credit, restrictions, url)) + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/downloadImgs.py b/backend/data/enwiki/downloadImgs.py new file mode 100755 index 0000000..8fb605f --- /dev/null +++ b/backend/data/enwiki/downloadImgs.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 + +import sys, re, os +import sqlite3 +import urllib.parse, requests +import time, signal + +usageInfo = f""" +Usage: {sys.argv[0]} + +Downloads images from URLs in an image database, into an output directory, +with names of the form 'pageId1.ext1'. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +in the output directory do decide what to skip. 
+""" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +imgDb = "imgData.db" # About 130k image names +outDir = "imgs" +licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) +# In testing, this downloaded about 100k images, over several days + +if not os.path.exists(outDir): + os.mkdir(outDir) +print("Checking for already-downloaded images") +fileList = os.listdir(outDir) +pageIdsDone = set() +for filename in fileList: + (basename, extension) = os.path.splitext(filename) + pageIdsDone.add(int(basename)) +print(f"Found {len(pageIdsDone)}") + +# Set SIGINT handler +interrupted = False +oldHandler = None +def onSigint(sig, frame): + global interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) +oldHandler = signal.signal(signal.SIGINT, onSigint) + +print("Opening database") +dbCon = sqlite3.connect(imgDb) +dbCur = dbCon.cursor() +print("Starting downloads") +iterNum = 0 +query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \ + " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name" +for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query): + if pageId in pageIdsDone: + continue + if interrupted: + print(f"Exiting loop") + break + # Check for problematic attributes + if license == None or licenseRegex.fullmatch(license) == None: + continue + if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. 
)?File:", artist) != None: + continue + if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None: + continue + if restrictions != None and restrictions != "": + continue + # Download image + iterNum += 1 + print(f"Iteration {iterNum}: Downloading for page-id {pageId}") + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print(f"WARNING: No filename extension found in URL {url}") + sys.exit(1) + outFile = f"{outDir}/{pageId}{extension}" + headers = { + "user-agent": "terryt.dev (terry06890@gmail.com)", + "accept-encoding": "gzip", + } + try: + response = requests.get(url, headers=headers) + with open(outFile, 'wb') as file: + file.write(response.content) + time.sleep(1) + # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec" + # It's unclear how to properly check for cache misses, so this just aims for 1 per sec + except Exception as e: + print(f"Error while downloading to {outFile}: {e}") +print("Closing database") +dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py index 032dbed..b0ca272 100755 --- a/backend/data/enwiki/genDescData.py +++ b/backend/data/enwiki/genDescData.py @@ -5,31 +5,36 @@ import bz2 import html, mwxml, mwparserfromhell import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" -usageInfo += "and short-description info to an sqlite db.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads through the wiki dump, and attempts to +parse short-descriptions, and add them to a database. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages enwikiDb = "descData.db" +# In testing, this script took over 10 hours to run, and generated about 5GB -# Some regexps and functions for parsing wikitext descLineRegex = re.compile("^ *[A-Z'\"]") embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") -parensGrpRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") def convertTemplateReplace(match): if match.group(2) == None: return f"{match.group(1)} {match.group(4)}" else: return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" +parensGroupRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") + def parseDesc(text): - # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank - # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, + # and then accumulate lines until a blank one. 
+ # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, lines = [] openBraceCount = 0 @@ -74,18 +79,15 @@ def removeMarkup(content): content = embeddedHtmlRegex.sub("", content) content = convertTemplateRegex.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGrpRegex.sub("", content) + content = parensGroupRegex.sub("", content) content = leftoverBraceRegex.sub("", content) return content -# Other helper functions def convertTitle(title): return html.unescape(title).replace("_", " ") -# Check for existing db +print("Creating database") if os.path.exists(enwikiDb): - print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) - sys.exit(1) -# Create db + raise Exception(f"ERROR: Existing {enwikiDb}") dbCon = sqlite3.connect(enwikiDb) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") @@ -93,8 +95,8 @@ dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Read through dump file -print("Reading dump file") + +print("Iterating through dump file") with bz2.open(dumpFile, mode='rt') as file: dump = mwxml.Dump.from_file(file) pageNum = 0 @@ -102,13 +104,15 @@ with bz2.open(dumpFile, mode='rt') as file: pageNum += 1 if pageNum % 1e4 == 0: print(f"At page {pageNum}") + if pageNum > 3e4: + break # Parse page if page.namespace == 0: try: dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) except sqlite3.IntegrityError as e: # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": 
{e}") + print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr) continue if page.redirect != None: dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) @@ -117,6 +121,7 @@ with bz2.open(dumpFile, mode='rt') as file: desc = parseDesc(revision.text) if desc != None: dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py index ee3e813..3955885 100755 --- a/backend/data/enwiki/genDumpIndexDb.py +++ b/backend/data/enwiki/genDumpIndexDb.py @@ -4,25 +4,26 @@ import sys, os, re import bz2 import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump index file,\n" -usageInfo += "and stores it's offset and title data to an sqlite db.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Adds data from the wiki dump index-file into a database. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines +indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines indexDb = "dumpIndex.db" -# Check for existing db if os.path.exists(indexDb): - print(f"ERROR: Existing {indexDb}", file=sys.stderr) - sys.exit(1) -# Create db + raise Exception(f"ERROR: Existing {indexDb}") +print("Creating database") dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)") -# Reading index file + +print("Iterating through index file") lineRegex = re.compile(r"([^:]+):([^:]+):(.*)") lastOffset = 0 lineNum = 0 @@ -42,7 +43,7 @@ with bz2.open(indexFile, mode='rt') as file: dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset)) except sqlite3.IntegrityError as e: # Accounts for certain entries in the file that have the same title - print(f"Failed on title \"{t}\": {e}") + print(f"Failed on title \"{t}\": {e}", file=sys.stderr) entriesToAdd = [] lastOffset = offset entriesToAdd.append([title, pageId]) @@ -50,7 +51,8 @@ for (title, pageId) in entriesToAdd: try: dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1)) except sqlite3.IntegrityError as e: - print(f"Failed on title \"{t}\": {e}") -# Close db + print(f"Failed on title \"{t}\": {e}", file=sys.stderr) + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py index 9bd28f4..dedfe14 100755 --- a/backend/data/enwiki/genImgData.py +++ b/backend/data/enwiki/genImgData.py @@ -4,9 +4,15 @@ import sys, re import bz2, html, urllib.parse import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" -usageInfo += "trying to get 
infobox image filenames, adding info to an sqlite db.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of page IDs, looks up their content in the wiki dump, +and tries to parse infobox image names, storing them into a database. + +The program can be re-run with an updated set of page IDs, and +will skip already-processed page IDs. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -21,58 +27,64 @@ def getInputPageIds(): return pageIds dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" indexDb = "dumpIndex.db" -imgDb = "imgData.db" # Output db +imgDb = "imgData.db" # The database to create idLineRegex = re.compile(r"(.*)") imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) +# In testing, got about 360k image names -# Open dbs +print("Getting input page-ids") +pageIds = getInputPageIds() +print(f"Found {len(pageIds)}") + +print("Opening databases") indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() -# Create image-db table -pidsDone = set() +print("Checking tables") if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + # Create tables if not present imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") else: + # Check for already-processed page IDs + numSkipped = 0 for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"): - pidsDone.add(pid) - print(f"Will skip {len(pidsDone)} already-processed page-ids") -# Get input pageIds -print("Getting input page-ids", file=sys.stderr) -pageIds = getInputPageIds() -for pid in 
pidsDone: - pageIds.remove(pid) -print(f"Found {len(pageIds)} page-ids to process") -# Get page-id dump-file offsets -print("Getting dump-file offsets", file=sys.stderr) + if pid in pageIds: + pageIds.remove(pid) + numSkipped += 1 + else: + print(f"WARNING: Found already-processed page ID {pid} which was not in input set") + print(f"Will skip {numSkipped} already-processed page IDs") + +print("Getting dump-file offsets") offsetToPageids = {} -offsetToEnd = {} +offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets iterNum = 0 for pageId in pageIds: iterNum += 1 if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}", file=sys.stderr) + print(f"At iteration {iterNum}") # query = "SELECT offset, next_offset FROM offsets WHERE id = ?" row = indexDbCur.execute(query, (pageId,)).fetchone() if row == None: - print(f"WARNING: Page id {pageId} not found", file=sys.stderr) + print(f"WARNING: Page ID {pageId} not found") continue (chunkOffset, endOffset) = row offsetToEnd[chunkOffset] = endOffset if chunkOffset not in offsetToPageids: offsetToPageids[chunkOffset] = [] offsetToPageids[chunkOffset].append(pageId) -print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr) -# Look through dump file, jumping to chunks containing relevant pages -print("Reading through dump file", file=sys.stderr) +print(f"Found {len(offsetToEnd)} chunks to check") + +print("Iterating through chunks in dump file") def getImageName(content): - """ Given an array of text-content lines, returns an image-filename, or None """ + " Given an array of text-content lines, tries to return an infobox image name, or None " + # Doesn't try and find images in outside-infobox [[File:...]] and sections for line in content: match = imageLineRegex.match(line) if match != None: @@ -109,16 +121,15 @@ def getImageName(content): imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases) imageName = imageName.replace("_", " ") return imageName - # Skip lines 
like: | image = <imagemap> + # Exclude lines like: | image = <imagemap> return None - # Doesn't try and find images in outside-infobox [[File:...]] and sections return None with open(dumpFile, mode='rb') as file: iterNum = 0 for (pageOffset, endOffset) in offsetToEnd.items(): iterNum += 1 if iterNum % 100 == 0: - print(f"At iteration {iterNum}", file=sys.stderr) + print(f"At iteration {iterNum}") # pageIds = offsetToPageids[pageOffset] # Jump to chunk @@ -168,11 +179,12 @@ with open(dumpFile, mode='rb') as file: imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName)) break if not foundTextEnd: - print(f"Did not find for page id {pageId}", file=sys.stderr) + print(f"WARNING: Did not find for page id {pageId}") break if not foundText: - print(f"Did not find for page id {pageId}", file=sys.stderr) -# Close dbs + print(f"WARNING: Did not find for page id {pageId}") + +print("Closing databases") indexDbCon.close() imgDbCon.commit() imgDbCon.close() diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py index 76f2f95..1a90851 100755 --- a/backend/data/enwiki/lookupPage.py +++ b/backend/data/enwiki/lookupPage.py @@ -4,9 +4,12 @@ import sys, re import bz2 import sqlite3 -usageInfo = f"usage: {sys.argv[0]} title1\n" -usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n" -usageInfo += "using a dump index db, and prints the corresponding .\n" +usageInfo = f""" +Usage: {sys.argv[0]} title1 + +Looks up a page with title title1 in the wiki dump, using +the dump-index db, and prints the corresponding . 
+""" if len(sys.argv) != 2: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -15,20 +18,19 @@ dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" indexDb = "dumpIndex.db" pageTitle = sys.argv[1].replace("_", " ") -# Searching index file -print("Lookup offset in index db") +print("Looking up offset in index db") dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?" row = dbCur.execute(query, (pageTitle,)).fetchone() if row == None: print("Title not found") - sys.exit(1) -(_, pageOffset, endOffset) = row + sys.exit(0) +_, pageOffset, endOffset = row dbCon.close() print(f"Found chunk at offset {pageOffset}") -# Read dump file -print("Reading dump file") + +print("Reading from wiki dump") content = [] with open(dumpFile, mode='rb') as file: # Get uncompressed chunk @@ -61,6 +63,6 @@ with open(dumpFile, mode='rb') as file: if line.lstrip() == "": break lineIdx += 1 -# Print content + print("Content: ") print("\n".join(content)) -- cgit v1.2.3