diff options
Diffstat (limited to 'backend/data/enwiki/genDescData.py')
| -rwxr-xr-x | backend/data/enwiki/genDescData.py | 68 |
1 files changed, 0 insertions, 68 deletions
#!/usr/bin/python3
"""Extract short descriptions for enwiki pages and store them in a sqlite db.

Reads the Wikimedia enwiki pages-articles multistream XML dump files matched
by ``wikiDumpGlob``, derives a one-line description for each non-redirect
page in namespace 0, and inserts ``(page id, description)`` rows into the
``descs`` table of the ``enwikiDb`` database.

The script takes no arguments; passing any prints the usage text and exits
with status 1.
"""

import glob
import os.path
import re
import sqlite3
import sys

usageInfo = (
    f"usage: {sys.argv[0]}\n"
    "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
    "descriptions for page-ids, and adds them to a sqlite db.\n"
)

wikiDumpGlob = "enwiki_content/enwiki-*-pages-articles-multistream*.xml"
enwikiDb = "enwikiData.db"

# Some regexps and functions for parsing wikitext
# A candidate description line: optional spaces, then a capital letter or quote
descLineRegex = "^ *[A-Z'\"]"
# Recognises a self-closing HTML tag, an HTML comment, a tag with 0 children,
# a tag with 1 child that itself has 0 children, or an unclosed tag
embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
# {{convert|<num>|<unit>...}} or {{convert|<num>|to/-|<num>|<unit>...}}
convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
# A parenthesised group (with no nested parentheses) preceded by a space
parenGrpRegex = r" \([^()]*\)"

# Compiled once up front: parseDesc() runs these against every line of every page
_descLinePattern = re.compile(descLineRegex)
_embeddedHtmlPattern = re.compile(embeddedHtmlRegex)
_convertTemplatePattern = re.compile(convertTemplateRegex)
_parenGrpPattern = re.compile(parenGrpRegex)
_dumpIndexPattern = re.compile(r"multistream(\d+)")


def convertTemplateReplace(match):
    """Return the plain-text replacement for a ``convertTemplateRegex`` match.

    Produces "<value> <unit>" for a single value, or
    "<value> <to|-> <value> <unit>" when the template holds a range.
    """
    if match.group(2) is None:
        return "{} {}".format(match.group(1), match.group(4))
    return "{} {} {} {}".format(
        match.group(1), match.group(2), match.group(3), match.group(4)
    )


def _stripWikitext(line):
    """Return `line` with wikitext markup removed by mwparserfromhell."""
    # Imported here rather than at module level so that the usage message and
    # the regex-only helpers work without the third-party parser installed.
    import mwparserfromhell

    return mwparserfromhell.parse(line).strip_code()


def _cleanDescLine(line):
    """Strip embedded HTML, convert-templates, markup and parenthesised groups."""
    line = _embeddedHtmlPattern.sub("", line)
    line = _convertTemplatePattern.sub(convertTemplateReplace, line)
    line = _stripWikitext(line)
    return _parenGrpPattern.sub("", line)


def parseDesc(text):
    """Return a one-line description extracted from page wikitext, or None.

    A line is a candidate when it matches ``descLineRegex``. A candidate is
    accepted only when it is followed by a blank line, by another candidate
    line, or by the end of the text; otherwise it is discarded and scanning
    continues. The accepted line is returned in cleaned form.

    `text` may be None or empty, in which case None is returned.
    """
    if not text:
        # Also covers a revision whose text is None, which would otherwise
        # crash on .splitlines()
        return None
    prevLine = None
    for line in text.splitlines():
        isDescLine = _descLinePattern.match(line) is not None
        if prevLine is not None:
            if isDescLine or line.strip() == "":
                return prevLine
            prevLine = None
        if isDescLine:
            prevLine = _cleanDescLine(line)
    return prevLine


def _dumpFileIndex(fileName):
    """Return the number following "multistream" in `fileName`, or -1 if absent."""
    found = _dumpIndexPattern.search(fileName)
    # The glob also accepts names without a number; order those first
    # instead of failing on a missing match
    return int(found.group(1)) if found is not None else -1


def main():
    """Parse every dump file and fill the ``descs`` table."""
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)

    import mwxml  # deferred for the same reason as mwparserfromhell

    wikiDumpFiles = sorted(glob.glob(wikiDumpGlob), key=_dumpFileIndex)

    # Open db
    dbCon = sqlite3.connect(enwikiDb)
    try:
        dbCur = dbCon.cursor()
        dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
        # Parse data
        iterationNum = 0
        for fileName in wikiDumpFiles:
            print(f"Processing file {fileName}")
            # Explicit UTF-8 so decoding does not depend on the locale, and a
            # context manager so each dump file is closed once consumed
            with open(fileName, encoding="utf-8") as dumpFile:
                dump = mwxml.Dump.from_file(dumpFile)
                for page in dump:
                    iterationNum += 1
                    if iterationNum % 10000 == 0:
                        print(f"At iteration {iterationNum}")
                    # Parse page: only non-redirect pages in namespace 0
                    if page.namespace != 0 or page.redirect is not None:
                        continue
                    # Only the first revision yielded by the page is used
                    revision = next(page, None)
                    if revision is None:
                        continue
                    desc = parseDesc(revision.text)
                    if desc is not None:
                        dbCur.execute(
                            "INSERT INTO descs VALUES (?, ?)", (page.id, desc)
                        )
        dbCon.commit()
    finally:
        # Close db, even when processing fails part-way
        dbCon.close()


if __name__ == "__main__":
    main()
