aboutsummaryrefslogtreecommitdiff
path: root/backend/data/enwiki/genDescData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/enwiki/genDescData.py')
-rwxr-xr-xbackend/data/enwiki/genDescData.py68
1 files changed, 0 insertions, 68 deletions
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
deleted file mode 100755
index 3602138..0000000
--- a/backend/data/enwiki/genDescData.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import sys, os.path, glob
-import mwxml, mwparserfromhell
-import sqlite3
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
-usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml")
-wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1)))
-enwikiDb = "enwikiData.db"
-
-# Some regexps and functions for parsing wikitext
-descLineRegex = "^ *[A-Z'\"]"
-embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
- # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag
-convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
-def convertTemplateReplace(match):
- if match.group(2) == None:
- return "{} {}".format(match.group(1), match.group(4))
- else:
- return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))
-parenGrpRegex = r" \([^()]*\)"
-def parseDesc(text):
- prevLine = None
- for line in text.splitlines():
- if prevLine != None:
- if line.strip() == "" or re.match(descLineRegex, line) != None:
- return prevLine
- else:
- prevLine = None
- if re.match(descLineRegex, line) != None:
- line = re.sub(embeddedHtmlRegex, "", line)
- line = re.sub(convertTemplateRegex, convertTemplateReplace, line)
- line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup
- prevLine = re.sub(parenGrpRegex, "", line)
- if prevLine != None:
- return prevLine
- return None
-
-# Open db
-dbCon = sqlite3.connect(enwikiDb)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
-# Parse data
-iterationNum = 0
-for fileName in wikiDumpFiles:
- print("Processing file {}".format(fileName))
- dump = mwxml.Dump.from_file(open(fileName))
- for page in dump:
- iterationNum += 1
- if iterationNum % 10000 == 0:
- print("At iteration {}".format(iterationNum))
- # Parse page
- if page.namespace == 0 and page.redirect == None:
- revision = next(page)
- desc = parseDesc(revision.text)
- if desc != None:
- dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
-# Close db
-dbCon.commit()
-dbCon.close()