diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
| commit | 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch) | |
| tree | ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/enwiki/gen_desc_data.py | |
| parent | f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff) | |
Adjust backend coding style
Add line spacing, section comments, and import consistency
Diffstat (limited to 'backend/tol_data/enwiki/gen_desc_data.py')
| -rwxr-xr-x | backend/tol_data/enwiki/gen_desc_data.py | 45 |
1 files changed, 32 insertions, 13 deletions
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py index b3fde52..44e4d6f 100755 --- a/backend/tol_data/enwiki/gen_desc_data.py +++ b/backend/tol_data/enwiki/gen_desc_data.py @@ -7,10 +7,16 @@ and adds them to a database # In testing, this script took over 10 hours to run, and generated about 5GB -import sys, os, re +import argparse +import sys +import os +import re import bz2 -import html, mwxml, mwparserfromhell import sqlite3 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' @@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: print('Creating database') @@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. + + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) @@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str: content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) |
