aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/genEnwikiNameData.py
blob: 7ad61d1bba4b8feef0552ac51fb13fbccb7570ee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python3

import sys, re
import sqlite3

# Usage text, printed when the script is invoked with unexpected arguments
usageInfo = f"""
Usage: {sys.argv[0]}

Reads from a database containing data from Wikipdia, along with
node and wiki-id information from the database, and use wikipedia
page-redirect information to add additional alt-name data.
"""
# The script takes no arguments: any extra argument prints the usage text
# to stderr and ends the run with exit status 1
if sys.argv[1:]:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Paths of the two SQLite databases used by this script
enwikiDb = "enwiki/descData.db"
dbFile = "data.db"
# A redirect title is only accepted as an alt-name if it consists solely of
# ASCII letters. This avoids names like 'Evolution of Elephants',
# 'Banana fiber', and 'Fish (zoology)'.
altNameRegex = re.compile(r"[a-zA-Z]+")

print("Opening databases")
# Connect to both databases first, then create one cursor per connection
enwikiCon = sqlite3.connect(enwikiDb)
dbCon = sqlite3.connect(dbFile)
enwikiCur = enwikiCon.cursor()
dbCur = dbCon.cursor()

print("Getting nodes with wiki IDs")
# Maps each name in the wiki_ids table to the id stored alongside it
nodeToWikiId = {
	name: pageId
	for (name, pageId) in dbCur.execute("SELECT name, id from wiki_ids")
}
print(f"Found {len(nodeToWikiId)}")

print("Iterating through nodes, finding names that redirect to them")
# Selects the titles of pages that redirect to the page with a given id.
# Built once, since it does not change between iterations.
query = "SELECT p1.title FROM pages p1" \
	" INNER JOIN redirects r1 ON p1.id = r1.id" \
	" INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
# Maps each node name to the set of lower-cased redirect titles found for it
nodeToAltNames = {}
for (iterNum, (nodeName, wikiId)) in enumerate(nodeToWikiId.items(), 1):
	if iterNum % 10000 == 0:
		print(f"At iteration {iterNum}")
	#
	redirectNames = set()
	for (name,) in enwikiCur.execute(query, (wikiId,)):
		# Keep purely alphabetic titles that differ from the node's own name
		if altNameRegex.fullmatch(name) is not None and name.lower() != nodeName:
			redirectNames.add(name.lower())
	nodeToAltNames[nodeName] = redirectNames
# Count from the sets themselves, so that redirect titles which differ only
# by case (and collapse to one lower-cased entry) are counted once
numAltNames = sum(len(names) for names in nodeToAltNames.values())
print(f"Found {numAltNames} alt-names")

print("Excluding existing alt-names from the set")
# The '{}' is filled with one '?' placeholder per candidate name
query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
for (iterNum, altNames) in enumerate(nodeToAltNames.values(), 1):
	if iterNum % 10000 == 0:
		print(f"At iteration {iterNum}")
	#
	if not altNames:
		# No candidates for this node, so there is nothing to look up
		continue
	placeholders = ",".join(["?"] * len(altNames))
	existingNames = {
		name for (name,) in dbCur.execute(query.format(placeholders), list(altNames))
	}
	# Drop candidates that are already present in the names table
	altNames.difference_update(existingNames)
# Recompute the total from the remaining sets, rather than adjusting a running
# counter, so the reported figure always matches what will be inserted
numAltNames = sum(len(names) for names in nodeToAltNames.values())
print(f"Left with {numAltNames} alt-names")

print("Adding alt-names to database")
# One row per (node, alt-name) pair, with a constant 0 as the third value,
# produced in the same order as iterating the mapping and each of its sets
newRows = (
	(node, alias, 0)
	for (node, aliases) in nodeToAltNames.items()
	for alias in aliases
)
dbCur.executemany("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", newRows)

print("Closing databases")
# Commit the inserted rows before closing the connections
dbCon.commit()
dbCon.close()
enwikiCon.close()