diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-04 01:17:06 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-04 01:17:06 +1000 |
| commit | 90a5e15bb824b84e5bb60337d6a57a1394090dc6 (patch) | |
| tree | 661ea356c8d83b74d16f19d3555b0a1d3eb6eb56 /backend/data/enwiki/genDescData.py | |
| parent | ec29e5731136c74a1991e2f93b5e233747f2a230 (diff) | |
Add scripts for obtaining/sending/displaying Wikipedia descriptions
Add backend/data/enwiki/ directory containing scripts and instructive
READMEs. Adjust some other scripts to generate 'eol_ids' sqlite table
separate from 'names'. Make server respond to /data/desc requests,
and have client TileInfo component display response data.
Also adjust .gitignore entries to be root-relative.
Diffstat (limited to 'backend/data/enwiki/genDescData.py')
| -rwxr-xr-x | backend/data/enwiki/genDescData.py | 68 |
1 files changed, 68 insertions, 0 deletions
#!/usr/bin/python3
"""Generate a sqlite table of short descriptions for enwiki pages.

Reads the Wikimedia enwiki pages-articles XML dump files matched under
enwiki_content/, picks a one-line description out of the wikitext of each
non-redirect page in namespace 0, and stores it, keyed by page id, in the
'descs' table of enwikiData.db.
"""

import re
import sys, os.path, glob
import mwxml, mwparserfromhell
import sqlite3

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n"
if len(sys.argv) > 1:
    # The script takes no arguments; any argument is treated as a help request
    print(usageInfo, file=sys.stderr)
    sys.exit(1)


def dumpFileOrder(fileName):
    """Return the sort key for a dump file name.

    The key is the number that follows 'multistream' in the name. A name
    without such a number (which the glob below can still match) gets -1, so
    it sorts first instead of raising AttributeError on a failed regex search.
    """
    match = re.search(r"multistream(\d+)", fileName)
    return int(match.group(1)) if match is not None else -1


wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml")
wikiDumpFiles.sort(key=dumpFileOrder)
enwikiDb = "enwikiData.db"

# Some regexps and functions for parsing wikitext
descLineRegex = "^ *[A-Z'\"]"
    # Recognises a line starting (after optional spaces) with a capital letter or quote character
embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
    # Recognises a self-closing HTML tag, an HTML comment, a tag with 0 children,
    # a tag with 1 child with 0 children, or an unclosed tag
convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
    # Recognises a {{convert|...}} template holding a single value or a 'to'/'-' range, then a unit
parenGrpRegex = r" \([^()]*\)"
    # Recognises a space followed by a parenthesised group with no nested parentheses


def convertTemplateReplace(match):
    """Return the plain-text replacement for a convertTemplateRegex match.

    Produces 'VALUE UNIT' for a single value, or 'VALUE1 SEP VALUE2 UNIT' when
    the template holds a range (SEP being 'to' or '-').
    """
    if match.group(2) is None:
        return "{} {}".format(match.group(1), match.group(4))
    else:
        return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))


def parseDesc(text):
    """Return a one-line description extracted from wikitext, or None.

    A candidate is a line matching descLineRegex, cleaned of embedded HTML,
    convert templates, wikitext markup and parenthesised groups. The candidate
    is returned once it is followed by a blank line, by another line matching
    descLineRegex, or by the end of the text; any other following line
    discards it and the search continues.

    Returns None if text is None or empty, or if no candidate is accepted.
    """
    if not text:
        # No text to scan (a missing revision text would otherwise crash on splitlines())
        return None
    prevLine = None
    for line in text.splitlines():
        if prevLine is not None:
            if line.strip() == "" or re.match(descLineRegex, line) is not None:
                return prevLine
            else:
                prevLine = None
        if re.match(descLineRegex, line) is not None:
            line = re.sub(embeddedHtmlRegex, "", line)
            line = re.sub(convertTemplateRegex, convertTemplateReplace, line)
            line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup
            prevLine = re.sub(parenGrpRegex, "", line)
    if prevLine is not None:
        return prevLine
    return None


# Open db
dbCon = sqlite3.connect(enwikiDb)
try:
    dbCur = dbCon.cursor()
    dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
    # Parse data
    iterationNum = 0
    for fileName in wikiDumpFiles:
        print("Processing file {}".format(fileName))
        # Close each dump file once it has been consumed, and read it as UTF-8
        # regardless of the locale's default encoding
        with open(fileName, encoding="utf-8") as dumpFile:
            dump = mwxml.Dump.from_file(dumpFile)
            for page in dump:
                iterationNum += 1
                if iterationNum % 10000 == 0:
                    print("At iteration {}".format(iterationNum))
                # Parse page
                if page.namespace != 0 or page.redirect is not None:
                    continue
                revision = next(page, None)
                if revision is None:
                    # Page has no revisions to read text from
                    continue
                desc = parseDesc(revision.text)
                if desc is not None:
                    dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
    # Commit only after every file was processed, as before
    dbCon.commit()
finally:
    # Close db, even if parsing failed part-way through
    dbCon.close()
