aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/gen_mapping_data.py
blob: 1ab577bf8793e51c207adb108c3a7e71f4eaed91 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/python3

"""
Maps otol IDs to EOL and enwiki titles, using IDs from various
other sources (like NCBI).

Reads otol taxonomy data to get source IDs for otol IDs,
then looks up those IDs in an EOL provider_ids file,
and in a wikidata dump, and stores results in the database.

Based on code from https://github.com/OneZoom/OZtree, located in
OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
"""

import argparse
import os
from collections import defaultdict
import gzip
import csv
import sqlite3

# Input data locations, relative to the working directory
TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
WIKIDATA_DB = os.path.join('wikidata', 'taxon_srcs.db')
ENWIKI_DUMP_INDEX_DB = os.path.join('enwiki', 'dump_index.db')
# Manually-picked mapping overrides, per target ('eol' or 'enwiki');
# missing files are silently skipped by readPickedMappings()
PICKED_MAPPINGS: dict[str, list[str]] = {
	'eol': ['picked_eol_ids.txt'],
	'enwiki': ['picked_wiki_ids.txt', 'picked_wiki_ids_rough.txt']
}
# Output database (must already contain a 'nodes' table — see genData())
DB_FILE = 'data.db'

OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps external-source int-identifiers to names

def genData(
		taxonomyFile: str,
		eolIdsFile: str,
		wikidataDb: str,
		pickedMappings: dict[str, list[str]],
		enwikiDumpIndexDb: str,
		dbFile: str) -> None:
	""" Reads the files and enwiki db and creates the db """
	# Intermediate mappings, populated by the read* helpers below
	nodeToSrcIds: dict[int, dict[str, int]] = {} # otol ID -> {src1: id1, src2: id2, ...}
	usedSrcIds: set[tuple[str, int]] = set() # {(src1, id1), ...}, limits what later stages store
	nodeToEolId: dict[int, int] = {} # otol ID -> EOL ID
	nodeToWikiTitle: dict[int, str] = {} # otol ID -> wikipedia title
	titleToIucnStatus: dict[str, str] = {} # wikipedia title -> IUCN string
	titleToPageId: dict[str, int] = {} # wikipedia title -> page ID

	# Gather mappings from each data source in turn
	readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
	readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
	readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
	readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
	getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)

	print('Writing to db')
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()

	# Build an otol-ID-to-name map from the pre-existing 'nodes' table
	otolIdToName: dict[int, str] = {}
	for name, nodeIdStr in dbCur.execute('SELECT name, id from nodes'):
		if nodeIdStr.startswith('ott'):
			otolIdToName[int(nodeIdStr[3:])] = name

	# Store EOL mappings, keyed by node name
	dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
	dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
	dbCur.executemany(
		'INSERT INTO eol_ids VALUES (?, ?)',
		((otolIdToName[otolId], eolId) for otolId, eolId in nodeToEolId.items() if otolId in otolIdToName))

	# Store enwiki mappings, plus IUCN statuses for titles that have one
	dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
	dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
	dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
	for otolId, title in nodeToWikiTitle.items():
		if otolId not in otolIdToName or title not in titleToPageId:
			continue
		dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
		if title in titleToIucnStatus:
			dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))

	dbCon.commit()
	dbCon.close()

def readTaxonomyFile(
		taxonomyFile: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]]) -> None:
	""" Reads taxonomy file, and maps OTOL node IDs to external-source IDs """
	# File format: a header line, then lines of fields each followed by a tab-pipe-tab sequence:
	#   uid (otol-id, eg: 93302), parent_uid, name, rank,
	#   sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
	print('Reading taxonomy file')
	with open(taxonomyFile) as file: # Had about 4.5e6 lines
		for lineNum, line in enumerate(file, 1):
			if lineNum % 1e5 == 0:
				print(f'At line {lineNum}')
			if lineNum == 1: # Header line
				continue

			fields = line.split('\t|\t')
			try:
				otolId = int(fields[0])
			except ValueError:
				print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
				continue

			# Record each recognized source's ID, keeping only the first seen per source
			for specifier in fields[4].split(','):
				srcName, srcIdStr = specifier.split(':', 1)
				if not srcIdStr.isdecimal() or srcName not in OTOL_SRCS:
					continue
				srcMap = nodeToSrcIds.setdefault(otolId, {})
				if srcName not in srcMap:
					extId = int(srcIdStr)
					srcMap[srcName] = extId
					usedSrcIds.add((srcName, extId))
	numEntries = sum(len(srcMap) for srcMap in nodeToSrcIds.values())
	print(f'- Result has {numEntries:,} entries') # Was about 6.7e6

def readEolIdsFile(
		eolIdsFile: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]],
		nodeToEolId: dict[int, int]) -> None:
	""" Reads EOL provider IDs file, and maps EOL IDs to external-source IDs """
	# The file is a CSV with a header line, then lines holding these fields:
	#   node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
	#   page_id (eol ID), preferred_canonical_for_page
	print('Reading EOL provider IDs file')
	srcToEolId: dict[str, dict[int, int]] = {name: {} for name in EOL_SRCS.values()} # src -> {srcId: eolId, ...}
	with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
		for lineNum, row in enumerate(csv.reader(file), 1):
			if lineNum % 1e6 == 0:
				print(f'At line {lineNum}')
			if lineNum == 1: # Header line
				continue

			eolId = int(row[3])
			srcInt = int(row[2])
			srcIdStr = row[1]
			if not srcIdStr.isdecimal() or srcInt not in EOL_SRCS:
				continue
			src = EOL_SRCS[srcInt]
			srcId = int(srcIdStr)
			if (src, srcId) not in usedSrcIds:
				continue
			if srcId in srcToEolId[src]:
				print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
				continue
			srcToEolId[src][srcId] = eolId
	numEntries = sum(len(idMap) for idMap in srcToEolId.values())
	print(f'- Result has {numEntries:,} entries')
		# Was about 3.5e6 (4.2e6 without usedSrcIds)

	print('Resolving candidate EOL IDs')
	# For each otol ID, find eol IDs with matching sources, and choose the 'best' one:
	# prefer candidates matched by the most sources, breaking ties with the lowest ID
	for otolId, srcInfo in nodeToSrcIds.items():
		candidateCounts: dict[int, int] = defaultdict(int)
		for src, srcId in srcInfo.items():
			idMap = srcToEolId.get(src)
			if idMap is not None and srcId in idMap:
				candidateCounts[idMap[srcId]] += 1
		if candidateCounts:
			maxCount = max(candidateCounts.values())
			nodeToEolId[otolId] = min(
				eolId for eolId, count in candidateCounts.items() if count == maxCount)
	print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6

def readWikidataDb(
		wikidataDb: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]],
		nodeToWikiTitle: dict[int, str],
		titleToIucnStatus: dict[str, str],
		nodeToEolId: dict[int, int]) -> None:
	""" Reads db holding ID and IUCN mappings from wikidata, and maps otol IDs to Wikipedia titles and EOL IDs """
	print('Reading from Wikidata db')
	srcToWikiTitle: dict[str, dict[int, str]] = defaultdict(dict) # 'eol'/etc -> {srcId1: title1, ...}
	wikiTitles: set[str] = set()
	dbCon = sqlite3.connect(wikidataDb)
	dbCur = dbCon.cursor()
	for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
		# EOL IDs are kept even when unused, for the extra-mappings step below
		if src == 'eol' or (src, srcId) in usedSrcIds:
			srcToWikiTitle[src][srcId] = title
			wikiTitles.add(title)
	for title, status in dbCur.execute('SELECT title, status from title_iucn'):
		if title in wikiTitles:
			titleToIucnStatus[title] = status
	numEntries = sum(len(idMap) for idMap in srcToWikiTitle.values())
	print(f'- Source-to-title map has {numEntries:,} entries')
		# Was about 1.1e6 (1.2e6 without usedSrcIds)
	print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
	dbCon.close()

	print('Resolving candidate Wikidata items')
	# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
	for otolId, srcInfo in nodeToSrcIds.items():
		titleToSrcs: dict[str, list[str]] = defaultdict(list) # Candidate title -> list of sources
		for src, srcId in srcInfo.items():
			idMap = srcToWikiTitle.get(src)
			if idMap is not None and srcId in idMap:
				titleToSrcs[idMap[srcId]].append(src)
		if not titleToSrcs:
			continue
		if len(titleToSrcs) > 1: # Test example: otol ID 621052
			# Narrow to titles with the most sources
			maxSrcCnt = max(len(srcs) for srcs in titleToSrcs.values())
			titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
		if len(titleToSrcs) == 1:
			nodeToWikiTitle[otolId] = next(iter(titleToSrcs))
		else:
			# Still tied: take a title backed by the highest-priority source
			srcToTitle = {s: t for t, srcs in titleToSrcs.items() for s in srcs}
			for src in OTOL_SRCS:
				if src in srcToTitle:
					nodeToWikiTitle[otolId] = srcToTitle[src]
					break
	print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5

	print('Adding extra EOL mappings from Wikidata')
	wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
	numAdded = 0
	for eolId, title in srcToWikiTitle['eol'].items():
		otolId = wikiTitleToNode.get(title)
		if otolId is not None and otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
			nodeToEolId[otolId] = eolId
			numAdded += 1
	print(f'- Added {numAdded:,} entries') # Was about 3e3

def readPickedMappings(
		pickedMappings: dict[str, list[str]],
		nodeToEolId: dict[int, int],
		nodeToWikiTitle: dict[int, str]) -> None:
	""" Read mappings from OTOL IDs to EOL IDs and Wikipedia titles """
	# Each line in a mappings file looks like 'otolId|value'; an empty
	# value means 'remove any existing mapping for that otol ID'
	print('Reading picked mappings')
	for src, filenames in pickedMappings.items():
		target = nodeToEolId if src == 'eol' else nodeToWikiTitle
		for filename in filenames:
			if not os.path.exists(filename): # Override files are optional
				continue
			with open(filename) as file:
				for line in file:
					otolIdStr, mappedVal = line.rstrip().split('|')
					otolId = int(otolIdStr)
					if not mappedVal:
						target.pop(otolId, None)
					elif src == 'eol':
						nodeToEolId[otolId] = int(mappedVal)
					else: # src == 'enwiki'
						nodeToWikiTitle[otolId] = mappedVal

def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
	""" Read a db for mappings from enwiki titles to page IDs """
	print('Getting enwiki page IDs')
	numNotFound = 0
	dbCon = sqlite3.connect(enwikiDumpIndexDb)
	dbCur = dbCon.cursor()
	# Query each distinct title only once: multiple otol IDs can map to the same
	# title, and repeating the lookup would both waste queries and double-count
	# missing titles in numNotFound
	for title in set(nodeToWikiTitle.values()):
		record = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
		if record is not None: # Identity comparison per PEP 8 (was '!= None')
			titleToPageId[title] = record[0]
		else:
			numNotFound += 1
	dbCon.close()
	print(f'Unable to find IDs for {numNotFound} titles') # Was 2913

if __name__ == '__main__':
	# The script takes no arguments; argparse is used only so that
	# --help prints the module docstring
	parser = argparse.ArgumentParser(
		description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()

	genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)