From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 11 Sep 2022 14:55:42 +1000 Subject: Add backend unit tests - Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing --- backend/tol_data/enwiki/README.md | 63 +++++++ backend/tol_data/enwiki/__init__.py | 0 .../tol_data/enwiki/download_img_license_info.py | 154 ++++++++++++++++ backend/tol_data/enwiki/download_imgs.py | 99 +++++++++++ backend/tol_data/enwiki/gen_desc_data.py | 126 ++++++++++++++ backend/tol_data/enwiki/gen_dump_index_db.py | 60 +++++++ backend/tol_data/enwiki/gen_img_data.py | 193 +++++++++++++++++++++ backend/tol_data/enwiki/gen_pageview_data.py | 68 ++++++++ backend/tol_data/enwiki/lookup_page.py | 71 ++++++++ 9 files changed, 834 insertions(+) create mode 100644 backend/tol_data/enwiki/README.md create mode 100644 backend/tol_data/enwiki/__init__.py create mode 100755 backend/tol_data/enwiki/download_img_license_info.py create mode 100755 backend/tol_data/enwiki/download_imgs.py create mode 100755 backend/tol_data/enwiki/gen_desc_data.py create mode 100755 backend/tol_data/enwiki/gen_dump_index_db.py create mode 100755 backend/tol_data/enwiki/gen_img_data.py create mode 100755 backend/tol_data/enwiki/gen_pageview_data.py create mode 100755 backend/tol_data/enwiki/lookup_page.py (limited to 'backend/tol_data/enwiki') diff --git a/backend/tol_data/enwiki/README.md b/backend/tol_data/enwiki/README.md new file mode 
100644 index 0000000..ba1de33 --- /dev/null +++ b/backend/tol_data/enwiki/README.md @@ -0,0 +1,63 @@ +This directory holds files obtained/derived from [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). + +# Downloaded Files +- `enwiki-20220501-pages-articles-multistream.xml.bz2`
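+ Contains text content and metadata for pages in enwiki. + Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror). + Some file content and format information was available from + <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>. +- `enwiki-20220501-pages-articles-multistream-index.txt.bz2`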
+ Obtained like above. Holds lines of the form offset1:pageId1:title1, + providing, for each page, an offset into the dump file of a chunk of + 100 pages that includes it. + +# Dump-Index Files +- `gen_dump_index_db.py`
+ Creates a database version of the enwiki-dump index file. +- `dumpIndex.db`
+ Generated by `gen_dump_index_db.py`.
+ Tables:
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` + +# Description Database Files +- `gen_desc_data.py`
+ Reads through pages in the dump file, and adds short-description info to a database. +- `desc_data.db`
+ Generated by `gen_desc_data.py`.
+ Tables:
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` + +# Image Database Files +- `gen_img_data.py`
+ Used to find infobox image names for page IDs, storing them into a database. +- `download_img_license_info.py`
+ Used to download licensing metadata for image names, via wikipedia's online API, storing them into a database. +- `img_data.db`
+ Used to hold metadata about infobox images for a set of page IDs. + Generated using `gen_img_data.py` and `download_img_license_info.py`.
+ Tables:
+ - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT`
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. + - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. +- `download_imgs.py`
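+ Used to download image files into imgs/. + +# Page View Files +- `pageviews/pageviews-*-user.bz2` + Each holds wikimedia article page view data for some month. + Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>. + Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>. +- `gen_pageview_data.py`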
+ Reads pageviews/*, and creates a database holding average monthly pageview counts. +- `pageview_data.db`
+ Generated using `gen_pageview_data.py`.
+ Tables:
+ - `views`: `title TEXT PRIMARY KEY, id INT, views INT` + +# Other Files +- `lookup_page.py`
+ Running `lookup_page.py title1` looks in the dump for a page with a given title, + and prints the contents to stdout. Uses dumpIndex.db. diff --git a/backend/tol_data/enwiki/__init__.py b/backend/tol_data/enwiki/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py new file mode 100755 index 0000000..0a809ac --- /dev/null +++ b/backend/tol_data/enwiki/download_img_license_info.py @@ -0,0 +1,154 @@ +#!/usr/bin/python3 + +""" +Reads image names from a database, and uses enwiki's online API to obtain +licensing information for them, adding the info to the database. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +at already-processed names to decide what to skip. +""" + +import re +import sqlite3, urllib.parse, html +import requests +import time, signal + +IMG_DB = 'img_data.db' +# +API_URL = 'https://en.wikipedia.org/w/api.php' +USER_AGENT = 'terryt.dev (terry06890@gmail.com)' +BATCH_SZ = 50 # Max 50 +TAG_REGEX = re.compile(r'<[^<]+>') +WHITESPACE_REGEX = re.compile(r'\s+') + +def downloadInfo(imgDb: str) -> None: + print('Opening database') + dbCon = sqlite3.connect(imgDb) + dbCur = dbCon.cursor() + print('Checking for table') + if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: + dbCur.execute('CREATE TABLE imgs (' \ + 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') + # + print('Reading image names') + imgNames: set[str] = set() + for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): + imgNames.add(imgName) + print(f'Found {len(imgNames)}') + # + print('Checking for already-processed images') + oldSz = len(imgNames) + for (imgName,) in dbCur.execute('SELECT name FROM imgs'): + imgNames.discard(imgName) + 
print(f'Found {oldSz - len(imgNames)}') + # + # Set SIGINT handler + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # + print('Iterating through image names') + imgNameList = list(imgNames) + iterNum = 0 + for i in range(0, len(imgNameList), BATCH_SZ): + iterNum += 1 + if iterNum % 1 == 0: + print(f'At iteration {iterNum} (after {(iterNum - 1) * BATCH_SZ} images)') + if interrupted: + print(f'Exiting loop at iteration {iterNum}') + break + # Get batch + imgBatch = imgNameList[i:i+BATCH_SZ] + imgBatch = ['File:' + x for x in imgBatch] + # Make request + headers = { + 'user-agent': USER_AGENT, + 'accept-encoding': 'gzip', + } + params = { + 'action': 'query', + 'format': 'json', + 'prop': 'imageinfo', + 'iiprop': 'extmetadata|url', + 'maxlag': '5', + 'titles': '|'.join(imgBatch), + 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions', + } + responseObj = None + try: + response = requests.get(API_URL, params=params, headers=headers) + responseObj = response.json() + except Exception as e: + print(f'ERROR: Exception while downloading info: {e}') + print('\tImage batch: ' + '|'.join(imgBatch)) + continue + # Parse response-object + if 'query' not in responseObj or 'pages' not in responseObj['query']: + print('WARNING: Response object for doesn\'t have page data') + print('\tImage batch: ' + '|'.join(imgBatch)) + if 'error' in responseObj: + errorCode = responseObj['error']['code'] + print(f'\tError code: {errorCode}') + if errorCode == 'maxlag': + time.sleep(5) + continue + pages = responseObj['query']['pages'] + normalisedToInput: dict[str, str] = {} + if 'normalized' in responseObj['query']: + for entry in responseObj['query']['normalized']: + normalisedToInput[entry['to']] = entry['from'] + for page in pages.values(): + # Some fields // More info at 
https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data + # LicenseShortName: short human-readable license name, apparently more reliable than 'License', + # Artist: author name (might contain complex html, multiple authors, etc) + # Credit: 'source' + # For image-map-like images, can be quite large/complex html, creditng each sub-image + # May be text2, where the text2 might be non-indicative + # Restrictions: specifies non-copyright legal restrictions + title: str = page['title'] + if title in normalisedToInput: + title = normalisedToInput[title] + title = title[5:] # Remove 'File:' + if title not in imgNames: + print(f'WARNING: Got title "{title}" not in image-name list') + continue + if 'imageinfo' not in page: + print(f'WARNING: No imageinfo section for page "{title}"') + continue + metadata = page['imageinfo'][0]['extmetadata'] + url: str = page['imageinfo'][0]['url'] + license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None + artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None + credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None + restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup + if artist is not None: + artist = TAG_REGEX.sub(' ', artist).strip() + artist = WHITESPACE_REGEX.sub(' ', artist) + artist = html.unescape(artist) + artist = urllib.parse.unquote(artist) + if credit is not None: + credit = TAG_REGEX.sub(' ', credit).strip() + credit = WHITESPACE_REGEX.sub(' ', credit) + credit = html.unescape(credit) + credit = urllib.parse.unquote(credit) + # Add to db + print((title, license, artist, credit, restrictions, url)) + dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', + (title, license, artist, credit, restrictions, url)) + # + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = 
argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + downloadInfo(IMG_DB) diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py new file mode 100755 index 0000000..ba874e1 --- /dev/null +++ b/backend/tol_data/enwiki/download_imgs.py @@ -0,0 +1,99 @@ +#!/usr/bin/python3 + +""" +Downloads images from URLs in an image database, into an output directory, +with names of the form 'pageId1.ext1'. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +in the output directory do decide what to skip. +""" + +# In testing, this downloaded about 100k images, over several days + +import re, os +import sqlite3 +import urllib.parse, requests +import time, signal + +IMG_DB = 'img_data.db' # About 130k image names +OUT_DIR = 'imgs' +# +LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) +USER_AGENT = 'terryt.dev (terry06890@gmail.com)' +TIMEOUT = 1 + # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + +def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: + if not os.path.exists(outDir): + os.mkdir(outDir) + print('Checking for already-downloaded images') + fileList = os.listdir(outDir) + pageIdsDone: set[int] = set() + for filename in fileList: + pageIdsDone.add(int(os.path.splitext(filename)[0])) + print(f'Found {len(pageIdsDone)}') + # + # Set SIGINT handler + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # + print('Opening database') + dbCon = sqlite3.connect(imgDb) + dbCur = dbCon.cursor() + print('Starting downloads') + 
iterNum = 0 + query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \ + ' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name' + for pageId, license, artist, credit, restrictions, url in dbCur.execute(query): + if pageId in pageIdsDone: + continue + if interrupted: + print('Exiting loop') + break + # Check for problematic attributes + if license is None or LICENSE_REGEX.fullmatch(license) is None: + continue + if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None: + continue + if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None: + continue + if restrictions is not None and restrictions != '': + continue + # Download image + iterNum += 1 + print(f'Iteration {iterNum}: Downloading for page-id {pageId}') + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print(f'WARNING: No filename extension found in URL {url}') + continue + outFile = os.path.join(outDir, f'{pageId}{extension}') + print(outFile) + headers = { + 'user-agent': USER_AGENT, + 'accept-encoding': 'gzip', + } + try: + response = requests.get(url, headers=headers) + with open(outFile, 'wb') as file: + file.write(response.content) + time.sleep(timeout) + except Exception as e: + print(f'Error while downloading to {outFile}: {e}') + return + print('Closing database') + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py new file mode 100755 index 0000000..0dca16b --- /dev/null +++ b/backend/tol_data/enwiki/gen_desc_data.py @@ -0,0 +1,126 @@ +#!/usr/bin/python3 + +""" +Reads through the wiki dump, and attempts to parse short-descriptions, +and add them to a 
database +""" + +# In testing, this script took over 10 hours to run, and generated about 5GB + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages +DB_FILE = 'desc_data.db' + +DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') +EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$') + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag +CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +def convertTemplateReplace(match): + """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ + if match.group(2) is None: + return f'{match.group(1)} {match.group(4)}' + else: + return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +def genData(dumpFile: str, dbFile: str) -> None: + print('Creating database') + if os.path.exists(dbFile): + raise Exception(f'ERROR: Existing {dbFile}') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)') + dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)') + dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') + dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') + dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') + # + print('Iterating through dump file') + with bz2.open(dumpFile, mode='rt') as file: + for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): + if pageNum % 1e4 == 0: + print(f'At page {pageNum}') + # Parse page + if page.namespace == 0: + try: + dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts 
for certain pages that have the same title + print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr) + continue + if page.redirect is not None: + dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc is not None: + dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) + # + print('Closing database') + dbCon.commit() + dbCon.close() +def parseDesc(text: str) -> str | None: + # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, + # and then accumulate lines until a blank one. + # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines: list[str] = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if not lines: + if line: + if openBraceCount > 0 or line[0] == '{': + openBraceCount += line.count('{') + openBraceCount -= line.count('}') + skip = True + if openBracketCount > 0 or line[0] == '[': + openBracketCount += line.count('[') + openBracketCount -= line.count(']') + skip = True + if inComment or line.find('') != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ':': # Seems to help avoid disambiguation pages + return None + if DESC_LINE_REGEX.match(line) is not None: + lines.append(line) + else: + if not line: + return removeMarkup(' '.join(lines)) + lines.append(line) + if lines: + return removeMarkup(' '.join(lines)) + return None +def removeMarkup(content: str) -> str: + content = EMBEDDED_HTML_REGEX.sub('', content) + content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) + content = 
mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = PARENS_GROUP_REGEX.sub('', content) + content = LEFTOVER_BRACE_REGEX.sub('', content) + return content +def convertTitle(title: str) -> str: + return html.unescape(title).replace('_', ' ') + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py new file mode 100755 index 0000000..5f21c9b --- /dev/null +++ b/backend/tol_data/enwiki/gen_dump_index_db.py @@ -0,0 +1,60 @@ +#!/usr/bin/python3 + +""" +Adds data from the wiki dump index-file into a database +""" +import sys, os, re +import bz2 +import sqlite3 + +INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines +DB_FILE = 'dumpIndex.db' + +def genData(indexFile: str, dbFile: str) -> None: + """ Reads the index file and creates the db """ + if os.path.exists(dbFile): + raise Exception(f'ERROR: Existing {dbFile}') + print('Creating database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') + lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') + lastOffset = 0 + lineNum = 0 + entriesToAdd: list[tuple[str, str]] = [] + with bz2.open(indexFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + # + match = lineRegex.fullmatch(line.rstrip()) + assert match is not None + offsetStr, pageId, title = match.group(1,2,3) + offset = int(offsetStr) + if offset > lastOffset: + for t, p in entriesToAdd: + try: + dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset)) + except sqlite3.IntegrityError as e: + # 
Accounts for certain entries in the file that have the same title + print(f'Failed on title "{t}": {e}', file=sys.stderr) + entriesToAdd = [] + lastOffset = offset + entriesToAdd.append((title, pageId)) + for title, pageId in entriesToAdd: + try: + dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) + except sqlite3.IntegrityError as e: + print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py new file mode 100755 index 0000000..d4696f0 --- /dev/null +++ b/backend/tol_data/enwiki/gen_img_data.py @@ -0,0 +1,193 @@ +#!/usr/bin/python3 + +""" +For some set of page IDs, looks up their content in the wiki dump, +and tries to parse infobox image names, storing them into a database. + +The program can be re-run with an updated set of page IDs, and +will skip already-processed page IDs. 
+""" + +import re +import os, bz2, html, urllib.parse +import sqlite3 + +DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' +INDEX_DB = 'dumpIndex.db' +IMG_DB = 'img_data.db' # The database to create +DB_FILE = os.path.join('..', 'data.db') +# +ID_LINE_REGEX = re.compile(r'(.*)') +IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') +BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') +IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) +CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) + +def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: + print('Opening databases') + indexDbCon = sqlite3.connect(indexDb) + indexDbCur = indexDbCon.cursor() + imgDbCon = sqlite3.connect(imgDb) + imgDbCur = imgDbCon.cursor() + print('Checking tables') + if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: + # Create tables if not present + imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL + imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') + else: + # Check for already-processed page IDs + numSkipped = 0 + for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'): + if pid in pageIds: + pageIds.remove(pid) + numSkipped += 1 + else: + print(f'Found already-processed page ID {pid} which was not in input set') + print(f'Will skip {numSkipped} already-processed page IDs') + # + print('Getting dump-file offsets') + offsetToPageids: dict[int, list[int]] = {} + offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets + iterNum = 0 + for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') + # + query = 'SELECT offset, next_offset FROM offsets WHERE id = ?' 
+ row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone() + if row is None: + print(f'WARNING: Page ID {pageId} not found') + continue + chunkOffset, endOffset = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) + print(f'Found {len(offsetToEnd)} chunks to check') + # + print('Iterating through chunks in dump file') + with open(dumpFile, mode='rb') as file: + iterNum = 0 + for pageOffset, endOffset in offsetToEnd.items(): + iterNum += 1 + if iterNum % 100 == 0: + print(f'At iteration {iterNum}') + # + chunkPageIds = offsetToPageids[pageOffset] + # Jump to chunk + file.seek(pageOffset) + compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) + data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for pages + lines = data.splitlines() + lineIdx = 0 + while lineIdx < len(lines): + # Look for + if lines[lineIdx].lstrip() != '': + lineIdx += 1 + continue + # Check page id + lineIdx += 3 + idLine = lines[lineIdx].lstrip() + match = ID_LINE_REGEX.fullmatch(idLine) + if match is None or int(match.group(1)) not in chunkPageIds: + lineIdx += 1 + continue + pageId = int(match.group(1)) + lineIdx += 1 + # Look for in + foundText = False + while lineIdx < len(lines): + if not lines[lineIdx].lstrip().startswith('') + 1:]) + lineIdx += 1 + foundTextEnd = False + while lineIdx < len(lines): + line = lines[lineIdx] + if not line.endswith(''): + content.append(line) + lineIdx += 1 + continue + foundTextEnd = True + content.append(line[:line.rfind('')]) + # Look for image-filename + imageName = getImageName(content) + imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName)) + break + if not foundTextEnd: + print(f'WARNING: Did not find for page id {pageId}') + break + if not foundText: + print(f'WARNING: Did not find for page id {pageId}') + # + print('Closing databases') + 
indexDbCon.close() + imgDbCon.commit() + imgDbCon.close() +def getImageName(content: list[str]) -> str | None: + """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ + # Doesn't try and find images in outside-infobox [[File:...]] and sections + for line in content: + match = IMG_LINE_REGEX.match(line) + if match is not None: + imageName = match.group(1).strip() + if imageName == '': + return None + imageName = html.unescape(imageName) + # Account for {{... + if imageName.startswith('{'): + match = CSS_IMG_CROP_REGEX.match(imageName) + if match is None: + return None + imageName = match.group(1) + # Account for [[File:...|...]] + if imageName.startswith('['): + match = BRACKET_IMG_REGEX.match(imageName) + if match is None: + return None + imageName = match.group(1) + # Account for