aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genEolNameData.py
blob: aa3905ef2e3ad6361e8392daffb3560f45f5d7ca (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/python3

import sys, re, os
import html, csv, sqlite3

# Help text, built from adjacent string literals
usageInfo = (
	f"usage: {sys.argv[0]}\n"
	"Reads vernacular-names CSV data (from the Encyclopedia of Life site),\n"
	"makes associations with node data in a sqlite database, and writes\n"
	"name data to that database.\n"
	"\n"
	"Expects a CSV header describing lines with format:\n"
	"    page_id, canonical_form, vernacular_string, language_code,\n"
	"    resource_name, is_preferred_by_resource, is_preferred_by_eol\n"
)
# The script takes no arguments, so being given any results in the help text and a failure exit
if sys.argv[1:]:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Input and output locations
vnamesFile = "eol/vernacularNames.csv"
dbFile = "data.db"
pickedIdsFile = "genEolNameDataPickedIds.txt"
badAltsFile = "genEolNameDataBadAlts.txt"
# Names that updateMaps() refuses to record
NAMES_TO_SKIP = {
	"unknown",
	"unknown species",
	"unidentified species",
}

# Read in vernacular-names data
	# Note: Canonical-names may have multiple pids
	# Note: A canonical-name's associated pids might all have other associated names
print("Reading in vernacular-names data")
# Lookup maps, filled in by updateMaps()
nameToPids = {} # Maps a name (canonical or vernacular) to a set of pids
canonicalNameToPids = {} # Like nameToPids, but only holds canonical names
pidToNames = {} # Maps a pid to a set of names
pidToPreferred = {} # Maps a pid to the most recently seen name marked as preferred
def updateMaps(name, pid, canonical, preferredAlt):
	""" Records an association between a name and a pid in the module-level lookup maps.
	Does nothing if 'name' is in NAMES_TO_SKIP.
	Otherwise adds 'pid' to nameToPids[name], and 'name' to pidToNames[pid].
	If 'canonical' is true, also adds 'pid' to canonicalNameToPids[name].
	If 'preferredAlt' is true, sets pidToPreferred[pid] to 'name', replacing any earlier value. """
	if name in NAMES_TO_SKIP:
		return
	# setdefault() creates the set the first time a key is seen
	nameToPids.setdefault(name, set()).add(pid)
	if canonical:
		canonicalNameToPids.setdefault(name, set()).add(pid)
	pidToNames.setdefault(pid, set()).add(name)
	if preferredAlt:
		pidToPreferred[pid] = name
# Matches anything of the form <...>, compiled once as it's applied to every line
tagRegex = re.compile(r"<[^>]+>")
# The encoding is given explicitly, so decoding doesn't depend on the platform's locale
with open(vnamesFile, newline="", encoding="utf-8") as csvfile:
	reader = csv.reader(csvfile)
	next(reader, None) # Skip the header line (a no-op if the file is empty)
	for row in reader:
		if not row: # csv.reader produces an empty list for a blank line
			continue
		# Parse line
		pid = int(row[0])
		name1 = tagRegex.sub("", row[1].lower()) # Remove tags
		name2 = html.unescape(row[2]).lower()
		lang = row[3]
		preferred = row[6] == "preferred"
		# Add to maps
			# The canonical name is always recorded, but the vernacular name only if english and non-empty
		updateMaps(name1, pid, True, False)
		if lang == "eng" and name2 != "":
			updateMaps(name2, pid, False, preferred)
# Check for manually-picked pids
	# The file is optional, and each line has the form 'name|pid'
	# A line with nothing after the name maps that name to None
print("Checking for manually-picked pids")
nameToPickedPid = {}
if os.path.exists(pickedIdsFile):
	with open(pickedIdsFile) as pickedFile:
		for entry in pickedFile:
			fields = entry.rstrip().partition("|")
			pickedName, pidText = fields[0], fields[2]
			if pidText == "":
				nameToPickedPid[pickedName] = None
			else:
				nameToPickedPid[pickedName] = int(pidText)
print(f"Found {len(nameToPickedPid)}")
# Read in node-alt_names to avoid
	# The file is optional, and each line has the form 'name|altName'
	# A name may appear on several lines, and its alt-names are kept in file order
print("Checking for bad-alt-names")
nameToBadAlts = {}
if os.path.exists(badAltsFile):
	with open(badAltsFile) as altsFile:
		for entry in altsFile:
			fields = entry.rstrip().partition("|")
			nameToBadAlts.setdefault(fields[0], []).append(fields[2])
print(f"Found bad-alts for {len(nameToBadAlts)} nodes")
# Open db connection
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Create tables
	# 'names' holds alt-names for nodes, and 'eol_ids' associates EOL page IDs with nodes
	# The statements are run in the order listed, so each table precedes its indexes
schemaStatements = (
	"CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))",
	"CREATE INDEX names_idx ON names(name)",
	"CREATE INDEX names_alt_idx ON names(alt_name)",
	"CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)",
	"CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)",
	"CREATE INDEX eol_name_idx ON eol_ids(name)",
)
for statement in schemaStatements:
	dbCur.execute(statement)
# State used when iterating through the 'nodes' table, resolving to canonical-names
usedPids = set() # Pids that have already been assigned to a node
unresolvedNodeNames = set() # Node names that only matched a vernacular-name
dbCur2 = dbCon.cursor() # Used to iterate through 'nodes' while dbCur runs other queries
def addToDb(nodeName, pidToUse):
	""" Associates a node with an EOL page ID, and stores that page's names as alt-names of the node.
	Adds the row (pidToUse, nodeName) to 'eol_ids', then adds a row to 'names' for each name
	in pidToNames[pidToUse], other than those that:
	- Have more than three space-separated words
	- Are the name of a row in the 'nodes' table
	- Are listed for this node in nameToBadAlts (these are reported on stdout)
	A row's pref_alt is 1 if its alt-name equals pidToPreferred[pidToUse], and 0 otherwise. """
	dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
	preferredName = pidToPreferred.get(pidToUse)
	badAlts = nameToBadAlts.get(nodeName, ())
	# Pick out the usable names
	altNames = set()
	for candidate in pidToNames[pidToUse]:
		if candidate.count(" ") >= 3: # Four or more words
			continue
		isNodeName = dbCur.execute("SELECT name FROM nodes WHERE name = ?", (candidate,)).fetchone() is not None
		if isNodeName:
			continue
		if candidate in badAlts:
			print(f"Excluding bad-alt {candidate} for node {nodeName}")
			continue
		altNames.add(candidate)
	# Store them
	for altName in altNames:
		prefFlag = int(altName == preferredName)
		dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, altName, prefFlag))
# Add manually-picked pids
	# A name picked as having no pid (None) gets no entries, but is still skipped by the later loop over 'nodes'
	# Only actual pids are recorded as used, so usedPids never holds None
for (name, pickedPid) in nameToPickedPid.items():
	if pickedPid is not None:
		usedPids.add(pickedPid)
		addToDb(name, pickedPid)
# Resolve each node whose name matches a canonical-name
for (iterationNum, (name,)) in enumerate(dbCur2.execute("SELECT name FROM nodes"), 1):
	if iterationNum % 10000 == 0:
		print(f"Loop 1 iteration {iterationNum}")
	if name in nameToPickedPid:
		continue
	if name not in canonicalNameToPids:
		# A name only known as a vernacular-name is left for the next loop
		if name in nameToPids:
			unresolvedNodeNames.add(name)
		continue
	# If name matches a canonical-name, add alt-name entries to 'names' table
		# Of the pids not yet used, one with a preferred name beats one without, then the smaller pid wins
	candidatePids = [pid for pid in canonicalNameToPids[name] if pid not in usedPids]
	if candidatePids:
		pidToUse = min(candidatePids, key=lambda pid: (pid not in pidToPreferred, pid))
		usedPids.add(pidToUse)
		addToDb(name, pidToUse)
# Iterate through unresolved nodes, resolving to vernacular-names
	# The names are visited in sorted order, as the iteration order of a set of strings can
	# differ between runs, which would change which of two nodes competing for a pid gets it
for (iterationNum, name) in enumerate(sorted(unresolvedNodeNames), 1):
	if iterationNum % 100 == 0:
		print(f"Loop 2 iteration {iterationNum}")
	# Add alt-name entries to 'names' table for the smallest corresponding pid that is still unused
	pidToUse = min((pid for pid in nameToPids[name] if pid not in usedPids), default=None)
	if pidToUse is not None:
		usedPids.add(pidToUse)
		addToDb(name, pidToUse)
# Close db
dbCon.commit()
dbCon.close()