aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/genMappingData.py
blob: d562d7e2966e857a0512d773844d5593d46a62bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/python3

import sys, re, os
from collections import defaultdict
import gzip, bz2, csv, sqlite3

import argparse
parser = argparse.ArgumentParser(description='''
Maps otol IDs to EOL and enwiki titles, using IDs from various
other sources (like NCBI).

Reads otol taxonomy data to get source IDs for otol IDs,
then looks up those IDs in an EOL provider_ids file,
and in a wikidata dump, and stores results in the database.

Based on code from https://github.com/OneZoom/OZtree, located in
OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
''', formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args() # No options yet; parsing still provides --help and rejects stray args

# Input files
taxonomyFile = 'otol/taxonomy.tsv' # Open Tree of Life taxonomy (fields separated by tab-pipe-tab)
eolIdsFile = 'eol/provider_ids.csv.gz' # EOL csv mapping external-source IDs to EOL page IDs
wikidataDb = 'wikidata/taxonSrcs.db' # Sqlite db with src_id_to_title and title_iucn tables
enwikiDumpIndexDb = 'enwiki/dumpIndex.db' # Sqlite db with an offsets table mapping titles to page IDs
# Optional manual-override files, holding 'otolId|value' lines
pickedMappings = {
	'eol': ['pickedEolIds.txt'],
	'enwiki': ['pickedWikiIds.txt', 'pickedWikiIdsRough.txt']
}
# Output database (expected to already contain a 'nodes' table)
dbFile = 'data.db'

print('Reading taxonomy file')
# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
	# uid (otol-id, eg: 93302), parent_uid, name, rank, 
	# sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
nodeToSrcIds = defaultdict(dict) # Maps otol ID to {src1: id1, src2: id2, ...}
usedSrcIds = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
with open(taxonomyFile) as file: # Had about 4.5e6 lines
	for lineNum, line in enumerate(file, 1):
		if lineNum % 100_000 == 0:
			print(f'At line {lineNum}')
		# Skip header line
		if lineNum == 1:
			continue
		# Parse line
		fields = line.split('\t|\t')
		try:
			otolId = int(fields[0])
		except ValueError:
			print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
			continue
		srcInfo = fields[4]
		# Add source IDs
		for srcPair in srcInfo.split(','):
			# Skip malformed specifiers (eg: an empty sourceinfo field yields one empty pair)
			if ':' not in srcPair:
				continue
			src, srcId = srcPair.split(':', 1)
			# Only the first ID seen for each recognized source is kept
			if srcId.isdecimal() and src in OTOL_SRCS and src not in nodeToSrcIds[otolId]:
				srcId = int(srcId)
				nodeToSrcIds[otolId][src] = srcId
				usedSrcIds.add((src, srcId))
print(f'- Result has {sum(len(v) for v in nodeToSrcIds.values()):,} entries') # Was about 6.7e6

print('Reading EOL provider_ids file')
# The CSV file has a header line, then lines that hold these fields:
	# node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
	# page_id (eol ID), preferred_canonical_for_page
EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps ints to external-source names
srcToEolId = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
	for lineNum, row in enumerate(csv.reader(file), 1):
		if lineNum % 1e6 == 0:
			print(f'At line {lineNum}')
		if lineNum == 1: # Skip header line
			continue
		# Unpack the fields we care about
		pageId = int(row[3])
		providerNum = int(row[2])
		extId = row[1]
		# Only keep rows from recognized providers, with numeric external IDs
		if not extId.isdecimal() or providerNum not in EOL_SRCS:
			continue
		extId = int(extId)
		srcName = EOL_SRCS[providerNum]
		# Ignore IDs that no otol node references
		if (srcName, extId) not in usedSrcIds:
			continue
		idMap = srcToEolId[srcName]
		if extId in idMap:
			print(f'Found {srcName} ID {extId} with multiple EOL IDs {idMap[extId]} and {pageId}')
			continue
		idMap[extId] = pageId
print(f'- Result has {sum(len(v) for v in srcToEolId.values()):,} entries')
	# Was about 3.5e6 (4.2e6 without usedSrcIds)

print('Resolving candidate EOL IDs')
# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
nodeToEolId = {} # Maps otol ID to eol ID
for otolId, srcMap in nodeToSrcIds.items():
	# Count how many sources support each candidate EOL ID
	candidateCounts = defaultdict(int)
	for srcName, extId in srcMap.items():
		idMap = srcToEolId.get(srcName)
		if idMap is not None and extId in idMap:
			candidateCounts[idMap[extId]] += 1
	if not candidateCounts:
		continue
	# Prefer candidates with the most sources, breaking ties by picking the lowest ID
	# (a single candidate trivially satisfies both criteria)
	bestCount = max(candidateCounts.values())
	nodeToEolId[otolId] = min(e for e, c in candidateCounts.items() if c == bestCount)
print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6

print('Reading from Wikidata db')
srcToWikiTitle = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
wikiTitles = set() # All titles seen, for filtering the IUCN query below
titleToIucnStatus = {} # Maps wiki title to IUCN status string
dbCon = sqlite3.connect(wikidataDb)
dbCur = dbCon.cursor()
for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
	# Keep EOL rows unconditionally (used later for extra mappings);
	# other sources only if some otol node actually references them
	if src == 'eol' or (src, srcId) in usedSrcIds:
		srcToWikiTitle[src][srcId] = title
		wikiTitles.add(title)
for title, status in dbCur.execute('SELECT title, status from title_iucn'):
	if title in wikiTitles:
		titleToIucnStatus[title] = status
print(f'- Source-to-title map has {sum(len(v) for v in srcToWikiTitle.values()):,} entries')
	# Was about 1.1e6 (1.2e6 without usedSrcIds)
print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
dbCon.close()

print('Resolving candidate Wikidata items')
# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
nodeToWikiTitle = {}
for otolId, srcMap in nodeToSrcIds.items():
	# Collect candidate titles, each with the list of sources that support it
	candidates = defaultdict(list) # Maps title to [src1, src2, ...]
	for srcName, extId in srcMap.items():
		titleMap = srcToWikiTitle.get(srcName)
		if titleMap is not None and extId in titleMap:
			candidates[titleMap[extId]].append(srcName)
	if not candidates:
		continue
	# Narrow down to titles supported by the most sources (test example: otol ID 621052)
	mostSrcs = max(len(srcs) for srcs in candidates.values())
	candidates = {t: srcs for t, srcs in candidates.items() if len(srcs) == mostSrcs}
	if len(candidates) == 1:
		nodeToWikiTitle[otolId] = next(iter(candidates))
	else: # Test example: otol ID 4235272
		# Break the tie using source priority (order within OTOL_SRCS)
		srcToTitle = {s: t for t, srcs in candidates.items() for s in srcs}
		for srcName in OTOL_SRCS:
			if srcName in srcToTitle:
				nodeToWikiTitle[otolId] = srcToTitle[srcName]
				break
print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5

print('Adding extra EOL mappings from Wikidata')
# Wikidata items can carry an EOL ID directly; use those to fill in EOL IDs for
# otol nodes that got a wiki title but no EOL ID from the provider_ids file.
# (The reverse map eolIdToNode that used to be built here was never used.)
wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
addedEntries = {}
for eolId, title in srcToWikiTitle['eol'].items():
	if title in wikiTitleToNode:
		otolId = wikiTitleToNode[title]
		if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
			nodeToEolId[otolId] = eolId
			addedEntries[otolId] = eolId
print(f'- Added {len(addedEntries):,} entries') # Was about 3e3

print('Reading picked mappings')
# Apply manual overrides: each line holds 'otolId|value', where an empty
# value means 'remove any existing mapping for this otol ID'.
for src, filenames in pickedMappings.items():
	for filename in filenames:
		if not os.path.exists(filename): # The override files are optional
			continue
		with open(filename) as file:
			for line in file:
				line = line.rstrip()
				if not line: # Skip blank lines (eg: a trailing newline at EOF)
					continue
				otolIdStr, mappedVal = line.split('|')
				otolId = int(otolIdStr)
				if src == 'eol':
					if mappedVal:
						nodeToEolId[otolId] = int(mappedVal)
					elif otolId in nodeToEolId:
						del nodeToEolId[otolId]
				else: # src == 'enwiki'
					if mappedVal:
						nodeToWikiTitle[otolId] = mappedVal
					elif otolId in nodeToWikiTitle:
						del nodeToWikiTitle[otolId]

print('Getting enwiki page IDs')
# Look up each chosen title's enwiki page ID in the dump-index db
titleToPageId = {} # Maps wiki title to enwiki page ID
numNotFound = 0
dbCon = sqlite3.connect(enwikiDumpIndexDb)
dbCur = dbCon.cursor()
for title in nodeToWikiTitle.values():
	if title in titleToPageId: # Skip re-querying titles already resolved
		continue
	row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
	if row is not None:
		titleToPageId[title] = row[0]
	else:
		numNotFound += 1
dbCon.close()
print(f'Unable to find IDs for {numNotFound} titles') # Was 2913

print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Build an otol-ID-to-name map from the existing nodes table (IDs look like 'ott93302')
otolIdToName = {}
for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
	if nodeId.startswith('ott'):
		otolIdToName[int(nodeId[3:])] = nodeName
# Store eol mappings, keyed by node name
dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
for otolId, eolId in nodeToEolId.items():
	nodeName = otolIdToName.get(otolId)
	if nodeName is not None:
		dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (nodeName, eolId))
# Store enwiki mappings (as page IDs) and IUCN statuses, keyed by node name
dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
for otolId, title in nodeToWikiTitle.items():
	nodeName = otolIdToName.get(otolId)
	if nodeName is None or title not in titleToPageId:
		continue
	dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (nodeName, titleToPageId[title]))
	if title in titleToIucnStatus:
		dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (nodeName, titleToIucnStatus[title]))
dbCon.commit()
dbCon.close()