diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:21:03 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:32:01 +1100 |
| commit | 0a9b2c2e5eca8a04e37fbdd423379882863237c2 (patch) | |
| tree | 1812bdb6bb13e4f76fdd7ef04075b291f775c213 /backend/hist_data/enwiki/gen_desc_data.py | |
| parent | 8321e2f92dbc073b8f1de87895d6620a2021b22e (diff) | |
Adjust backend coding style
Increase line spacing, add section comments, etc
Diffstat (limited to 'backend/hist_data/enwiki/gen_desc_data.py')
| -rwxr-xr-x | backend/hist_data/enwiki/gen_desc_data.py | 51 |
1 file changed, 37 insertions, 14 deletions
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index bb2b845..194afe8 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ -5,30 +5,40 @@ Reads through the wiki dump, attempts to parse short-descriptions, and adds them to a database """ -# In testing, this script took over 10 hours to run, and generated about 5GB +# Note: In testing, this script took over 10 hours to run, and generated about 5GB import argparse -import sys, os, re -import bz2, html, mwxml, mwparserfromhell +import sys +import os +import re import sqlite3 +import bz2 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' -# Regexps + DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: + """ Reads dump, parses descriptions, and writes to db """ print('Creating database') if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') @@ -39,13 +49,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE 
TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +70,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. 
+ + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,18 +125,24 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: + """ Tries to remove markup from wikitext content """ content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: + """ Replaces underscores in wiki item title """ return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) |
