1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#!/usr/bin/python3
import sys, re
import sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads Wikimedia enwiki redirect data from enwiki/, and node and wiki-id\n"
usageInfo += "data from a sqlite database, and adds supplmenentary alt-name data.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
enwikiDb = "enwiki/enwikiData.db"
dbFile = "data.db"
altNameRegex = re.compile(r"[a-zA-Z]+")
# Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)',
# Open dbs
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Get nodes with wiki-ids
print("Getting nodes with wiki IDs")
nodeToWikiId = {}
for row in dbCur.execute("SELECT name, wiki_id from descs"):
nodeToWikiId[row[0]] = row[1]
print(f"Found {len(nodeToWikiId)} nodes")
# Find wiki-ids that redirect to each node
print("Finding redirecter names")
nodeToAltNames = {}
numAltNames = 0
iterNum = 0
for (nodeName, wikiId) in nodeToWikiId.items():
iterNum += 1
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
nodeToAltNames[nodeName] = set()
query = "SELECT p1.title FROM pages p1" \
" INNER JOIN redirects r1 ON p1.id = r1.id" \
" INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
for (name,) in enwikiCur.execute(query, (wikiId,)):
if altNameRegex.fullmatch(name) != None and name.lower() != nodeName:
nodeToAltNames[nodeName].add(name.lower())
numAltNames += 1
print(f"Found {numAltNames} alt-names")
# Remove existing alt-names
print("Removing existing alt-names")
query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
iterNum = 0
for (nodeName, altNames) in nodeToAltNames.items():
iterNum += 1
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
existingNames = set()
for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
existingNames.add(name)
numAltNames -= len(existingNames)
altNames.difference_update(existingNames)
print(f"Left with {numAltNames} alt-names")
# Add alt-names
print("Adding alt-names")
for (nodeName, altNames) in nodeToAltNames.items():
for altName in altNames:
dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (nodeName, altName, 0))
# Close dbs
dbCon.commit()
dbCon.close()
enwikiCon.close()
|