From daccbbd9c73a5292ea9d6746560d7009e5aa666d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 7 Sep 2022 11:37:37 +1000 Subject: Add python type annotations Also use consistent quote symbols Also use 'is None' instead of '== None' Also use 'if list1' instead of 'if len(list1) > 0' --- backend/tolData/enwiki/genDescData.py | 100 +++++++++++++++++----------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'backend/tolData/enwiki/genDescData.py') diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py index 0085d70..1698f5c 100755 --- a/backend/tolData/enwiki/genDescData.py +++ b/backend/tolData/enwiki/genDescData.py @@ -12,46 +12,46 @@ and add them to a database """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages -enwikiDb = "descData.db" +dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages +enwikiDb = 'descData.db' # In testing, this script took over 10 hours to run, and generated about 5GB -descLineRegex = re.compile("^ *[A-Z'\"]") -embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") +descLineRegex = re.compile('^ *[A-Z\'"]') +embeddedHtmlRegex = re.compile(r'<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +convertTemplateRegex = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') def convertTemplateReplace(match): - if match.group(2) == None: - return f"{match.group(1)} {match.group(4)}" + if match.group(2) is None: + return f'{match.group(1)} {match.group(4)}' else: - return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" -parensGroupRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") + return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' +parensGroupRegex = re.compile(r' \([^()]*\)') +leftoverBraceRegex = re.compile(r'(?:{\||{{).*') -def parseDesc(text): +def parseDesc(text: str) -> str | None: # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, # and then accumulate lines until a blank one. # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, - lines = [] + lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 inComment = False skip = False for line in text.splitlines(): line = line.strip() - if len(lines) == 0: - if len(line) > 0: - if openBraceCount > 0 or line[0] == "{": - openBraceCount += line.count("{") - openBraceCount -= line.count("}") + if not lines: + if line: + if openBraceCount > 0 or line[0] == '{': + openBraceCount += line.count('{') + openBraceCount -= line.count('}') skip = True - if openBracketCount > 0 or line[0] == "[": - openBracketCount += line.count("[") - openBracketCount -= line.count("]") + if openBracketCount > 0 or line[0] == '[': + openBracketCount += line.count('[') + openBracketCount -= line.count(']') skip = True - if inComment or line.find("") != -1: + if inComment or line.find('') != -1: if inComment: inComment = False skip = True @@ -61,64 +61,64 @@ def parseDesc(text): if skip: skip = False continue - if line[-1] == ":": # Seems to help avoid disambiguation pages + if line[-1] == ':': # Seems to help avoid disambiguation pages return None - if descLineRegex.match(line) != None: + if descLineRegex.match(line) is not None: lines.append(line) else: - if len(line) == 0: - return removeMarkup(" ".join(lines)) + if not line: + return removeMarkup(' '.join(lines)) lines.append(line) - if len(lines) > 0: - return removeMarkup(" ".join(lines)) + if lines: + return removeMarkup(' '.join(lines)) return None -def removeMarkup(content): - content = embeddedHtmlRegex.sub("", content) +def removeMarkup(content: str) -> str: + content = embeddedHtmlRegex.sub('', content) content = convertTemplateRegex.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGroupRegex.sub("", content) - content = leftoverBraceRegex.sub("", content) + content = parensGroupRegex.sub('', content) + content = leftoverBraceRegex.sub('', content) return content -def convertTitle(title): - return html.unescape(title).replace("_", " ") +def convertTitle(title: str) -> str: + return html.unescape(title).replace('_', ' ') -print("Creating database") +print('Creating database') if os.path.exists(enwikiDb): - raise Exception(f"ERROR: Existing {enwikiDb}") + raise Exception(f'ERROR: Existing {enwikiDb}') dbCon = sqlite3.connect(enwikiDb) dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") -dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)') +dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)') +dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') +dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') +dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') -print("Iterating through dump file") +print('Iterating through dump file') with bz2.open(dumpFile, mode='rt') as file: dump = mwxml.Dump.from_file(file) pageNum = 0 for page in dump: pageNum += 1 if pageNum % 1e4 == 0: - print(f"At page {pageNum}") + print(f'At page {pageNum}') if pageNum > 3e4: break # Parse page if page.namespace == 0: try: - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) except sqlite3.IntegrityError as e: # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr) + print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr) continue - if page.redirect != None: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) + if page.redirect is not None: + dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect))) else: revision = next(page) desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) + if desc is not None: + dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) -print("Closing database") +print('Closing database') dbCon.commit() dbCon.close() -- cgit v1.2.3