From 8781fdb2b8c530a6c1531ae9e82221eb062e34fb Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 29 Jan 2023 11:30:47 +1100 Subject: Adjust backend coding style Add line spacing, section comments, and import consistency --- .../tol_data/enwiki/download_img_license_info.py | 30 ++++++++++----- backend/tol_data/enwiki/download_imgs.py | 24 ++++++++---- backend/tol_data/enwiki/gen_desc_data.py | 45 +++++++++++++++------- backend/tol_data/enwiki/gen_dump_index_db.py | 16 +++++--- backend/tol_data/enwiki/gen_img_data.py | 36 +++++++++++------ backend/tol_data/enwiki/gen_pageview_data.py | 28 +++++++++----- backend/tol_data/enwiki/lookup_page.py | 9 +++-- 7 files changed, 127 insertions(+), 61 deletions(-) (limited to 'backend/tol_data/enwiki') diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py index 17e15b4..6efc7a4 100755 --- a/backend/tol_data/enwiki/download_img_license_info.py +++ b/backend/tol_data/enwiki/download_img_license_info.py @@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. """ +import argparse import re -import sqlite3, urllib.parse, html +import sqlite3 + import requests -import time, signal +import urllib.parse +import html + +import time +import signal IMG_DB = 'img_data.db' -# + API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' BATCH_SZ = 50 # Max 50 @@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None: if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: dbCur.execute('CREATE TABLE imgs (' \ 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - # + print('Reading image names') imgNames: set[str] = set() for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) print(f'Found {len(imgNames)}') - # + print('Checking for already-processed images') oldSz = len(imgNames) for (imgName,) in dbCur.execute('SELECT name FROM imgs'): imgNames.discard(imgName) print(f'Found {oldSz - len(imgNames)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Iterating through image names') imgNameList = list(imgNames) iterNum = 0 @@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None: if interrupted: print(f'Exiting loop at iteration {iterNum}') break + # Get batch imgBatch = imgNameList[i:i+BATCH_SZ] imgBatch = ['File:' + x for x in imgBatch] + # Make request headers = { 'user-agent': USER_AGENT, @@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None: print(f'ERROR: Exception while downloading info: {e}') print('\tImage batch: ' + '|'.join(imgBatch)) continue + # Parse response-object if 'query' not in responseObj or 'pages' not in responseObj['query']: print('WARNING: Response object doesn\'t have page data') @@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None: artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup if artist is not None: artist = TAG_REGEX.sub(' ', artist).strip() @@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None: credit = WHITESPACE_REGEX.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) + # Add to db dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', (title, license, artist, credit, restrictions, url)) - # + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadInfo(IMG_DB) diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py index c6a1c21..164289d 100755 --- a/backend/tol_data/enwiki/download_imgs.py +++ b/backend/tol_data/enwiki/download_imgs.py @@ -11,14 +11,20 @@ in the output directory do decide what to skip. # In testing, this downloaded about 100k images, over several days -import re, os +import argparse +import re +import os import sqlite3 -import urllib.parse, requests -import time, signal + +import requests +import urllib.parse + +import time +import signal IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 @@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: for filename in fileList: pageIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(pageIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for page-id {pageId}') @@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: except Exception as e: print(f'Error while downloading to {outFile}: {e}') return + print('Closing database') dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py index b3fde52..44e4d6f 100755 --- a/backend/tol_data/enwiki/gen_desc_data.py +++ b/backend/tol_data/enwiki/gen_desc_data.py @@ -7,10 +7,16 @@ and adds them to a database # In testing, this script took over 10 hours to run, and generated about 5GB -import sys, os, re +import argparse +import sys +import os +import re import bz2 -import html, mwxml, mwparserfromhell import sqlite3 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' @@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: print('Creating database') @@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. + + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) @@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str: content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py index 5778680..12a8a10 100755 --- a/backend/tol_data/enwiki/gen_dump_index_db.py +++ b/backend/tol_data/enwiki/gen_dump_index_db.py @@ -1,9 +1,13 @@ #!/usr/bin/python3 """ -Adds data from the wiki dump index-file into a database +Converts data from the wiki-dump index-file into a database """ -import sys, os, re + +import argparse +import sys +import os +import re import bz2 import sqlite3 @@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None: """ Reads the index file and creates the db """ if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') + print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') lastOffset = 0 @@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None: lineNum += 1 if lineNum % 1e5 == 0: print(f'At line {lineNum}') - # + match = lineRegex.fullmatch(line.rstrip()) assert match is not None offsetStr, pageId, title = match.group(1,2,3) @@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None: dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) except sqlite3.IntegrityError as e: print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py index 040f223..2c243f3 100755 --- a/backend/tol_data/enwiki/gen_img_data.py +++ b/backend/tol_data/enwiki/gen_img_data.py @@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. """ +import argparse import re -import os, bz2, html, urllib.parse +import os +import bz2 +import html +import urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# + ID_LINE_REGEX = re.compile(r'(.*)') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) +# ========== For data generation ========== + def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: print('Opening databases') indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + print('Checking tables') if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present - imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL + imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') + # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') else: # Check for already-processed page IDs @@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: else: print(f'Found already-processed page ID {pid} which was not in input set') print(f'Will skip {numSkipped} already-processed page IDs') - # + print('Getting dump-file offsets') offsetToPageids: dict[int, list[int]] = {} offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets @@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') - # + query = 'SELECT offset, next_offset FROM offsets WHERE id = ?' row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone() if row is None: @@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: offsetToPageids[chunkOffset] = [] offsetToPageids[chunkOffset].append(pageId) print(f'Found {len(offsetToEnd)} chunks to check') - # + print('Iterating through chunks in dump file') with open(dumpFile, mode='rb') as file: iterNum = 0 @@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 100 == 0: print(f'At iteration {iterNum}') - # + chunkPageIds = offsetToPageids[pageOffset] # Jump to chunk file.seek(pageOffset) @@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: break if not foundText: print(f'WARNING: Did not find for page id {pageId}') - # + print('Closing databases') indexDbCon.close() imgDbCon.commit() imgDbCon.close() + def getImageName(content: list[str]) -> str | None: """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ - # Doesn't try and find images in outside-infobox [[File:...]] and sections + # Note: Doesn't try and find images in outside-infobox [[File:...]] and sections for line in content: match = IMG_LINE_REGEX.match(line) if match is not None: @@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None: return None return None +# ========== For getting input page IDs ========== + def getInputPageIdsFromDb(dbFile: str) -> set[int]: print('Getting input page-ids') pageIds: set[int] = set() @@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]: for (pageId,) in dbCur.execute('SELECT id from wiki_ids'): pageIds.add(pageId) dbCon.close() + print(f'Found {len(pageIds)}') return pageIds + +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + pageIds = getInputPageIdsFromDb(DB_FILE) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py index 8aee1cc..95b4a60 100755 --- a/backend/tol_data/enwiki/gen_pageview_data.py +++ b/backend/tol_data/enwiki/gen_pageview_data.py @@ -3,27 +3,34 @@ """ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database + +Each pageview file has lines that seem to hold these space-separated fields: + wiki code (eg: en.wikipedia), article title, page ID (may be: null), + platform (eg: mobile-web), monthly view count, + hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) """ # Took about 15min per file (each had about 180e6 lines) -import sys, os, glob, math, re +import argparse +import sys +import os +import glob +import math +import re from collections import defaultdict -import bz2, sqlite3 +import bz2 +import sqlite3 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') DUMP_INDEX_DB = 'dump_index.db' DB_FILE = 'pageview_data.db' def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: - # Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) - # + namespaceRegex = re.compile(r'[a-zA-Z]+:') titleToViews: dict[str, int] = defaultdict(int) linePrefix = b'en.wikipedia ' @@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: print(f'At line {lineNum}') if not line.startswith(linePrefix): continue + # Get second and second-last fields line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields title = line[:line.find(b' ')].decode('utf-8') viewCount = int(line[line.rfind(b' ')+1:]) if namespaceRegex.match(title) is not None: continue + # Update map title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: idbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py index f744818..c4d0932 100755 --- a/backend/tol_data/enwiki/lookup_page.py +++ b/backend/tol_data/enwiki/lookup_page.py @@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index db, and prints the corresponding . """ +import argparse import sys import bz2 import sqlite3 @@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None: _, pageOffset, endOffset = row dbCon.close() print(f'Found chunk at offset {pageOffset}') - # + print('Reading from wiki dump') content: list[str] = [] with open(dumpFile, mode='rb') as file: @@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None: file.seek(pageOffset) compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for page lines = data.splitlines() lineIdx = 0 @@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None: if line.lstrip() == '': break lineIdx += 1 - # + print('Content: ') print('\n'.join(content)) if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('title', help='The title to look up') args = parser.parse_args() - # + lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' ')) -- cgit v1.2.3