1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
#!/usr/bin/python3
import sys, re, os
import html, csv, sqlite3
import argparse
parser = argparse.ArgumentParser(description='''
Maps nodes to vernacular names, using data from EOL, enwiki, and a
picked-names file, and stores results in the database.
''', formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
eolNamesFile = 'eol/vernacularNames.csv'
enwikiDb = 'enwiki/descData.db'
pickedNamesFile = 'pickedNames.txt'
dbFile = 'data.db'
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
print('Getting node mappings')
nodeToTips = {}
eolIdToNode = {} # Maps eol ID to node name (if there are multiple, choose one with most tips)
wikiIdToNode = {}
for name, tips in dbCur.execute('SELECT name, tips from nodes'):
nodeToTips[name] = tips
for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
eolIdToNode[eolId] = name
for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
wikiIdToNode[wikiId] = name
print('Creating table')
dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
dbCur.execute('CREATE INDEX names_idx ON names(name)')
dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
print('Getting names from EOL')
# The CSV file has a header line, then lines with these fields:
# page_id, canonical_form (canonical name, not always unique to page ID),
# vernacular_string (vernacular name), language_code,
# resource_name, is_preferred_by_resource, is_preferred_by_eol
namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
with open(eolNamesFile, newline='') as file:
for lineNum, fields in enumerate(csv.reader(file), 1):
if lineNum % 1e5 == 0:
print(f'At line {lineNum}') # Reached about 2.8e6
# Skip header line
if lineNum == 1:
continue
# Parse line
eolId = int(fields[0])
name = html.unescape(fields[2]).lower()
lang = fields[3]
isPreferred = 1 if fields[6] == 'preferred' else 0
# Add to db
if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')' # The 'OR IGNORE' accounts for duplicate lines
dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
print('Getting names from Wikipedia')
altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
iterNum = 0
for wikiId, nodeName in wikiIdToNode.items():
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}') # Reached about 3.6e5
#
query = 'SELECT p1.title FROM pages p1' \
' INNER JOIN redirects r1 ON p1.id = r1.id' \
' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
for (name,) in enwikiCur.execute(query, (wikiId,)):
name = name.lower()
if altNameRegex.fullmatch(name) != None and name != nodeName and name not in nodeToTips:
dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
print('Getting picked names')
# File format:
# nodename1|altName1|isPreferred1 -> Add an alt-name
# nodename1|altName1| -> Remove an alt-name
# nodename1|nodeName1| -> Remove any preferred-alt status
altNamesToSkip = {} # Maps node names to alt-names to exclude
if os.path.exists(pickedNamesFile):
with open(pickedNamesFile) as file:
for line in file:
nodeName, altName, isPreferred = line.lower().rstrip().split('|')
if nodeName not in nodeToTips:
print(f"Skipping \"{nodeName}\", as no such node exists")
continue
if isPreferred:
isPreferred = 1 if isPreferred == '1' else 0
if isPreferred == 1:
# Remove any existing preferred-alt status
cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? AND pref_alt = 1'
dbCur.execute(cmd, (nodeName, altName))
# Remove any existing record
dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
# Add record
dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'picked')", (nodeName, altName, isPreferred))
elif nodeName != altName: # Remove any matching record
dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
else: # Remove any preferred-alt status
cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? AND pref_alt = 1'
dbCur.execute(cmd, (nodeName, altName))
print('Closing database')
dbCon.commit()
dbCon.close()
|