1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
|
#!/usr/bin/python3
import sys, re, os
import html, csv, sqlite3
# Help text printed when the script is invoked with unexpected arguments
usageInfo = f"""
Usage: {sys.argv[0]}
Reads files describing name data from the 'Encyclopedia of Life' site,
tries to associate names with nodes in the database, and adds tables
to represent associated names.
Reads a vernacularNames.csv file:
Starts with a header line containing:
page_id, canonical_form, vernacular_string, language_code,
resource_name, is_preferred_by_resource, is_preferred_by_eol
The canonical_form and vernacular_string fields contain names
associated with the page ID. Names are not always unique to
particular page IDs.
"""
# The script takes no arguments, so anything after the program name is an error
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    raise SystemExit(1)
# Files read and written by this script
dbFile = "data.db"
vnamesFile = "eol/vernacularNames.csv"  # Had about 2.8e6 entries
pickedIdsFile = "pickedEolIds.txt"
altsToSkipFile = "pickedEolAltsToSkip.txt"
# Names that updateMaps() ignores entirely
namesToSkip = {"unidentified species", "unknown species", "unknown"}

print("Reading in vernacular-names data")
# Lookup tables filled in by updateMaps() ('pid' means 'Page ID')
pidToNames = {}  # pid -> set of all names recorded for it
pidToPreferred = {}  # pid -> its 'preferred' name
nameToPids = {}  # name -> set of pids it was recorded for
canonicalNameToPids = {}  # like nameToPids, but only for canonical names
def updateMaps(name, pid, canonical, preferredAlt):
    """Record that a name belongs to a page ID in the module-level lookup maps.

    name: the name to record; ignored if it is in namesToSkip
    pid: the page ID the name is associated with
    canonical: if true, the name is also recorded in canonicalNameToPids
    preferredAlt: if true, the name becomes the preferred name for pid,
        replacing any earlier one
    """
    if name in namesToSkip:
        return
    # The maps are only mutated, never rebound, so no 'global' is required
    nameToPids.setdefault(name, set()).add(pid)
    if canonical:
        canonicalNameToPids.setdefault(name, set()).add(pid)
    pidToNames.setdefault(pid, set()).add(name)
    if preferredAlt:
        pidToPreferred[pid] = name
with open(vnamesFile, newline="") as csvfile:
    lineNum = 0
    for lineNum, row in enumerate(csv.reader(csvfile), start=1):
        # Progress report
        if lineNum % 1e5 == 0:
            print(f"At line {lineNum}")
        # The first line is the header, and carries no data
        if lineNum == 1:
            continue
        # Extract the fields of interest
        pid = int(row[0])
        canonicalName = re.sub(r"<[^>]+>", "", row[1].lower())  # Markup tags are dropped
        vernacularName = html.unescape(row[2]).lower()
        isEnglish = row[3] == "eng"
        isPreferred = row[6] == "preferred"
        # The canonical name is always recorded; the vernacular name only
        # when it is a non-empty English one
        updateMaps(canonicalName, pid, True, False)
        if isEnglish and vernacularName:
            updateMaps(vernacularName, pid, False, isPreferred)
print("Checking for manually-picked pids")
# Maps a name to its hand-picked page ID, or to None when the line gives no ID
nameToPickedPid = {}
if os.path.exists(pickedIdsFile):
    with open(pickedIdsFile) as pickedFile:
        for entry in pickedFile:
            # Each line has the form 'name|pid'; the '|pid' part may be absent
            pickedName, _sep, idText = entry.rstrip().partition("|")
            nameToPickedPid[pickedName] = int(idText) if idText != "" else None
print(f"Found {len(nameToPickedPid)}")
print("Checking for alt-names to skip")
# Maps a name to the list of alt-names that must not be attached to it
nameToAltsToSkip = {}
numToSkip = 0
if os.path.exists(altsToSkipFile):
    with open(altsToSkipFile) as skipFile:
        for entry in skipFile:
            # Each line has the form 'name|alt-name'
            skipName, _sep, altToSkip = entry.rstrip().partition("|")
            nameToAltsToSkip.setdefault(skipName, []).append(altToSkip)
            numToSkip += 1
print(f"Found {numToSkip} alt-names to skip")
print("Creating database tables")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Schema statements, run in the listed order
for ddlStatement in (
    "CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))",
    "CREATE INDEX names_idx ON names(name)",
    "CREATE INDEX names_alt_idx ON names(alt_name)",
    "CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)",
    "CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)",
    "CREATE INDEX eol_name_idx ON eol_ids(name)",
):
    dbCur.execute(ddlStatement)

print("Associating nodes with names")
# Second cursor, used for iterating over nodes while dbCur runs other statements
dbCur2 = dbCon.cursor()
usedPids = set()  # Page IDs that have already been assigned to a node
unresolvedNodeNames = set()  # Node names with no canonical-name match, retried later
def addToDb(nodeName, pidToUse):
    """Add page-ID-associated name data for a node to the database.

    Inserts a (pidToUse, nodeName) row into eol_ids, then inserts one row
    into names for each acceptable alt-name recorded for the page ID.
    An alt-name is rejected if it has more than 3 words, if it already
    names a node in the nodes table, or if it is listed in nameToAltsToSkip
    for this node.

    nodeName: name of the node being associated with the page ID
    pidToUse: the page ID whose names should be attached to the node
    """
    dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
    # A manually-picked page ID comes from a separate file, and so might have
    # no entry in the vernacular-names data. Treat that as 'no alt-names'
    # instead of failing with a KeyError.
    candidateNames = pidToNames.get(pidToUse)
    if candidateNames is None:
        print(f"No name data found for page ID {pidToUse} (node {nodeName})")
        return
    # Get alt-names
    altsToSkip = nameToAltsToSkip.get(nodeName, ())  # Loop-invariant, so looked up once
    altNames = set()
    for n in candidateNames:
        # Avoid alt-names with >3 words
        if len(n.split(" ")) > 3:
            continue
        # Avoid alt-names that already name a node in the database
        if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() is not None:
            continue
        # Check for picked alt-name-to-skip
        if n in altsToSkip:
            print(f"Excluding alt-name {n} for node {nodeName}")
            continue
        altNames.add(n)
    # Add alt-names to db, flagging the one (if any) that is the preferred name
    preferredName = pidToPreferred.get(pidToUse)
    for n in altNames:
        isPreferred = 1 if n == preferredName else 0
        dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))
print("Adding picked IDs")
for pickedName, pickedPid in nameToPickedPid.items():
    # Names picked without a page ID get no name data
    if pickedPid is None:
        continue
    addToDb(pickedName, pickedPid)
    usedPids.add(pickedPid)
print("Associating nodes with canonical names")
iterNum = 0
for (nodeName,) in dbCur2.execute("SELECT name FROM nodes"):
    iterNum += 1
    if iterNum % 1e5 == 0:
        print(f"At iteration {iterNum}")
    # Manually-picked names have already been dealt with
    if nodeName in nameToPickedPid:
        continue
    candidatePids = canonicalNameToPids.get(nodeName)
    if candidatePids is None:
        # No canonical-name match; retry later if some other name matches
        if nodeName in nameToPids:
            unresolvedNodeNames.add(nodeName)
        continue
    # Pick an associated page ID: unused, favouring ones that have a
    # preferred name, and otherwise the smallest
    pidToUse = None
    for pid in candidatePids:
        pidHasPreferred = pid in pidToPreferred
        chosenHasPreferred = pidToUse in pidToPreferred  # False while nothing is chosen
        if chosenHasPreferred and not pidHasPreferred:
            continue
        if pid in usedPids:
            continue
        outranksChosen = pidHasPreferred and not chosenHasPreferred
        if pidToUse is None or pid < pidToUse or outranksChosen:
            pidToUse = pid
    if pidToUse is not None:
        addToDb(nodeName, pidToUse)
        usedPids.add(pidToUse)
print("Associating leftover nodes with other names")
iterNum = 0
for iterNum, nodeName in enumerate(unresolvedNodeNames, start=1):
    if iterNum % 100 == 0:
        print(f"At iteration {iterNum}")
    # Pick the smallest page ID matching this name that no node has claimed yet
    unclaimedPids = (pid for pid in nameToPids[nodeName] if pid not in usedPids)
    pidToUse = min(unclaimedPids, default=None)
    if pidToUse is not None:
        addToDb(nodeName, pidToUse)
        usedPids.add(pidToUse)

print("Closing database")
dbCon.commit()
dbCon.close()
|