#!/usr/bin/python3
import re
import sys, os.path, glob
import mwxml, mwparserfromhell
import sqlite3
# Command-line handling: the script takes no arguments, so passing any
# argument prints the usage text to stderr and exits with status 1.
usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n"
if len(sys.argv) > 1:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)


def _dumpStreamNum(path):
    """Return the number that follows "multistream" in path, or -1 if there is none."""
    match = re.search(r"multistream(\d+)", path)
    return int(match.group(1)) if match is not None else -1


# Input dump files, processed in ascending order of their multistream number.
# The glob also accepts names with no digits after "multistream"; such files
# sort first instead of aborting the script with an AttributeError.
wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml")
wikiDumpFiles.sort(key=_dumpStreamNum)
# Path of the sqlite database that receives the descriptions
enwikiDb = "enwikiData.db"
# Some regexps and functions for parsing wikitext
# descLineRegex matches a line that begins with optional spaces followed by an
# uppercase ASCII letter, an apostrophe, or a double quote. parseDesc uses it
# to pick out lines that may hold a description.
descLineRegex = "^ *[A-Z'\"]"
embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
# Recognises a self-closing HTML tag, an HTML comment, a tag with 0 children,
# a tag with 1 child that itself has 0 children, or an unclosed tag that runs
# to the end of the line
# convertTemplateRegex matches a lowercase "{{convert|...}}" template. Groups:
#   1: first value (starts with a digit)
#   2: optional range separator, "to" or "-"
#   3: second value (only present together with group 2)
#   4: unit (starts with a lowercase letter)
# Any further template parameters before the closing "}}" are matched but not captured.
convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
def convertTemplateReplace(match):
    """Return the plain-text replacement for a convertTemplateRegex match.

    Intended as the replacement callback for re.sub.

    Args:
        match: a match object with four groups: first value, optional range
            separator ("to" or "-"), optional second value, and unit.

    Returns:
        "<value> <unit>" when the template has no range separator, otherwise
        "<value> <separator> <second value> <unit>".
    """
    value, separator, endValue, unit = match.group(1, 2, 3, 4)
    if separator is None:
        return f"{value} {unit}"
    return f"{value} {separator} {endValue} {unit}"
# Matches a space followed by a parenthesised group that contains no nested parentheses
parenGrpRegex = r" \([^()]*\)"


def parseDesc(text):
    """Extract a one-line description from a page's wikitext.

    A candidate is a line matching descLineRegex. It is accepted when the
    line after it is blank or is itself a candidate line, or when it is the
    last line of the text. A candidate followed by any other kind of line is
    discarded and the search continues.

    Each candidate is cleaned before being stored: embedded HTML is removed,
    {{convert}} templates are replaced by plain text, remaining wikitext
    markup is stripped, and parenthesised groups are dropped.

    Args:
        text: the page's wikitext, or None/empty when there is none.

    Returns:
        The cleaned description line, or None if no line qualifies.
    """
    if not text:
        # A missing or empty text cannot contain a description
        return None
    candidate = None
    for line in text.splitlines():
        # Evaluate the match once; both checks below depend on it
        isDescLine = re.match(descLineRegex, line) is not None
        if candidate is not None:
            if isDescLine or line.strip() == "":
                return candidate
            # The candidate was followed by some other kind of line, so drop it
            candidate = None
        if isDescLine:
            line = re.sub(embeddedHtmlRegex, "", line)
            line = re.sub(convertTemplateRegex, convertTemplateReplace, line)
            line = mwparserfromhell.parse(line).strip_code()  # Remove wikitext markup
            candidate = re.sub(parenGrpRegex, "", line)
    # A candidate on the final line has nothing after it to reject it
    return candidate
# Open db
dbCon = sqlite3.connect(enwikiDb)
try:
    dbCur = dbCon.cursor()
    dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
    # Parse data
    iterationNum = 0  # Counts every page seen, across all dump files
    for fileName in wikiDumpFiles:
        print("Processing file {}".format(fileName))
        # Close each dump file once it has been read, and decode it as UTF-8
        # rather than with the platform's default encoding
        with open(fileName, encoding="utf-8") as dumpFile:
            dump = mwxml.Dump.from_file(dumpFile)
            for page in dump:
                iterationNum += 1
                if iterationNum % 10000 == 0:
                    print("At iteration {}".format(iterationNum))
                # Parse page: only namespace-0 pages that are not redirects
                if page.namespace != 0 or page.redirect is not None:
                    continue
                # Take the first revision the page yields; skip pages that
                # have no revision or whose revision carries no text
                revision = next(page, None)
                if revision is None or revision.text is None:
                    continue
                desc = parseDesc(revision.text)
                if desc is not None:
                    dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
    # Commit only after every file has been processed successfully
    dbCon.commit()
finally:
    # Close db, even when processing fails part-way through
    dbCon.close()