1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
#!/usr/bin/python3
import sys, re
import sqlite3
usageInfo = f"""
Usage: {sys.argv[0]}
Reads from a database containing data from Wikipdia, along with
node and wiki-id information from the database, and use wikipedia
page-redirect information to add additional alt-name data.
"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
enwikiDb = "enwiki/descData.db"
dbFile = "data.db"
altNameRegex = re.compile(r"[a-zA-Z]+")
# Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)',
print("Opening databases")
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
print("Getting nodes with wiki IDs")
nodeToWikiId = {}
for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"):
nodeToWikiId[nodeName] = wikiId
print(f"Found {len(nodeToWikiId)}")
print("Iterating through nodes, finding names that redirect to them")
nodeToAltNames = {}
numAltNames = 0
iterNum = 0
for (nodeName, wikiId) in nodeToWikiId.items():
iterNum += 1
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
nodeToAltNames[nodeName] = set()
query = "SELECT p1.title FROM pages p1" \
" INNER JOIN redirects r1 ON p1.id = r1.id" \
" INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
for (name,) in enwikiCur.execute(query, (wikiId,)):
if altNameRegex.fullmatch(name) != None and name.lower() != nodeName:
nodeToAltNames[nodeName].add(name.lower())
numAltNames += 1
print(f"Found {numAltNames} alt-names")
print("Excluding existing alt-names from the set")
query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
iterNum = 0
for (nodeName, altNames) in nodeToAltNames.items():
iterNum += 1
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
existingNames = set()
for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
existingNames.add(name)
numAltNames -= len(existingNames)
altNames.difference_update(existingNames)
print(f"Left with {numAltNames} alt-names")
print("Adding alt-names to database")
for (nodeName, altNames) in nodeToAltNames.items():
for altName in altNames:
dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0))
print("Closing databases")
dbCon.commit()
dbCon.close()
enwikiCon.close()
|