1 files changed, 229 insertions, 0 deletions
diff --git a/backend/tolData/genMappingData.py b/backend/tolData/genMappingData.py
new file mode 100755
index 0000000..d562d7e
--- /dev/null
+++ b/backend/tolData/genMappingData.py
@@ -0,0 +1,229 @@
+#!/usr/bin/python3
+
+import sys, re, os
+from collections import defaultdict
+import gzip, bz2, csv, sqlite3
+
+import argparse
+parser = argparse.ArgumentParser(description='''
+Maps otol IDs to EOL and enwiki titles, using IDs from various
+other sources (like NCBI).
+
+Reads otol taxonomy data to get source IDs for otol IDs,
+then looks up those IDs in an EOL provider_ids file,
+and in a wikidata dump, and stores results in the database.
+
+Based on code from https://github.com/OneZoom/OZtree, located in
+OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+taxonomyFile = 'otol/taxonomy.tsv'
+eolIdsFile = 'eol/provider_ids.csv.gz'
+wikidataDb = 'wikidata/taxonSrcs.db'
+enwikiDumpIndexDb = 'enwiki/dumpIndex.db'
+pickedMappings = {
+	'eol': ['pickedEolIds.txt'],
+	'enwiki': ['pickedWikiIds.txt', 'pickedWikiIdsRough.txt']
+}
+dbFile = 'data.db'
+
+print('Reading taxonomy file')
+# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
+	# uid (otol-id, eg: 93302), parent_uid, name, rank, 
+	# sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
+OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
+nodeToSrcIds = defaultdict(dict) # Maps otol ID to {src1: id1, src2: id2, ...}
+usedSrcIds = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
+with open(taxonomyFile) as file: # Had about 4.5e6 lines
+	lineNum = 0
+	for line in file:
+		lineNum += 1
+		if lineNum % 1e5 == 0:
+			print(f'At line {lineNum}')
+		# Skip header line
+		if lineNum == 1:
+			continue
+		# Parse line
+		fields = line.split('\t|\t')
+		try:
+			otolId = int(fields[0])
+		except ValueError:
+			print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
+			continue
+		srcInfo = fields[4]
+		# Add source IDs
+		for srcPair in srcInfo.split(','):
+			src, srcId = srcPair.split(':', 1)
+			if srcId.isdecimal() and src in OTOL_SRCS and src not in nodeToSrcIds[otolId]:
+				srcId = int(srcId)
+				nodeToSrcIds[otolId][src] = srcId
+				usedSrcIds.add((src, srcId))
+print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
+print('Reading EOL provider_ids file')
+# The CSV file has a header line, then lines that hold these fields:
+	# node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
+	# page_id (eol ID), preferred_canonical_for_page
+EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps ints to external-source names
+srcToEolId = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
+with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
+	for lineNum, row in enumerate(csv.reader(file), 1):
+		if lineNum % 1e6 == 0:
+			print(f'At line {lineNum}')
+		# Skip header line
+		if lineNum == 1:
+			continue
+		# Parse line
+		eolId = int(row[3])
+		srcVal = int(row[2])
+		srcId = row[1]
+		if srcId.isdecimal() and srcVal in EOL_SRCS:
+			srcId = int(srcId)
+			src = EOL_SRCS[srcVal]
+			if (src, srcId) not in usedSrcIds:
+				continue
+			if srcId in srcToEolId[src]:
+				print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
+				continue
+			srcToEolId[src][srcId] = eolId
+print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
+	# Was about 3.5e6 (4.2e6 without usedSrcIds)
+
+print('Resolving candidate EOL IDs')
+# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
+nodeToEolId = {} # Maps otol ID to eol ID
+for otolId, srcInfo in nodeToSrcIds.items():
+	eolIdToCount = defaultdict(int)
+	for src, srcId in srcInfo.items():
+		if src in srcToEolId and srcId in srcToEolId[src]:
+			eolId = srcToEolId[src][srcId]
+			eolIdToCount[eolId] += 1
+	if len(eolIdToCount) == 1:
+		nodeToEolId[otolId] = list(eolIdToCount)[0]
+	elif len(eolIdToCount) > 1:
+		# For multiple candidates, prefer those with most sources, and break ties by picking the lowest
+		maxCount = max(eolIdToCount.values())
+		eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
+		nodeToEolId[otolId] = min(eolIds)
+print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
+print('Reading from Wikidata db')
+srcToWikiTitle = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
+wikiTitles = set()
+titleToIucnStatus = {}
+dbCon = sqlite3.connect(wikidataDb)
+dbCur = dbCon.cursor()
+for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
+	if (src, srcId) not in usedSrcIds and src != 'eol': # Keep EOL IDs for later use 
+		continue
+	srcToWikiTitle[src][srcId] = title
+	wikiTitles.add(title)
+for title, status in dbCur.execute('SELECT title, status from title_iucn'):
+	if title in wikiTitles:
+		titleToIucnStatus[title] = status
+print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries')
+	# Was about 1.1e6 (1.2e6 without usedSrcIds)
+print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
+dbCon.close()
+
+print('Resolving candidate Wikidata items')
+# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
+nodeToWikiTitle = {}
+for otolId, srcInfo in nodeToSrcIds.items():
+	titleToSrcs = defaultdict(list) # Maps candidate titles to {src1: srcId1, ...}
+	for src, srcId in srcInfo.items():
+		if src in srcToWikiTitle and srcId in srcToWikiTitle[src]:
+			title = srcToWikiTitle[src][srcId]
+			titleToSrcs[title].append(src)
+	# Choose title to use
+	if len(titleToSrcs) == 1:
+		nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
+	elif len(titleToSrcs) > 1: # Test example: otol ID 621052
+		# Get titles with most sources
+		maxSrcCnt = max([len(srcs) for srcs in titleToSrcs.values()])
+		titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
+		if len(titleToSrcs) == 1:
+			nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
+		else: # Test example: otol ID 4235272
+			# Get a title with a source with highest priority
+			srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]}
+			for src in OTOL_SRCS:
+				if src in srcToTitle:
+					nodeToWikiTitle[otolId] = srcToTitle[src]
+					break
+print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
+
+print('Adding extra EOL mappings from Wikidata')
+eolIdToNode = {eolId: node for node, eolId in nodeToEolId.items()}
+wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
+addedEntries = {}
+for eolId, title in srcToWikiTitle['eol'].items():
+	if title in wikiTitleToNode:
+		otolId = wikiTitleToNode[title]
+		if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
+			nodeToEolId[otolId] = eolId
+			addedEntries[otolId] = eolId
+print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
+print('Reading picked mappings')
+for src in pickedMappings:
+	for filename in pickedMappings[src]:
+		if not os.path.exists(filename):
+			continue
+		with open(filename) as file:
+			for line in file:
+				otolId, mappedVal = line.rstrip().split('|')
+				otolId = int(otolId)
+				if src == 'eol':
+					if mappedVal:
+						nodeToEolId[otolId] = int(mappedVal)
+					else:
+						if otolId in nodeToEolId:
+							del nodeToEolId[otolId]
+				else: # src == 'enwiki'
+					if mappedVal:
+						nodeToWikiTitle[otolId] = mappedVal
+					else:
+						if otolId in nodeToWikiTitle:
+							del nodeToWikiTitle[otolId]
+
+print(f'Getting enwiki page IDs')
+titleToPageId = {}
+numNotFound = 0
+dbCon = sqlite3.connect(enwikiDumpIndexDb)
+dbCur = dbCon.cursor()
+for title in nodeToWikiTitle.values():
+	row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+	if row != None:
+		titleToPageId[title] = row[0]
+	else:
+		numNotFound += 1
+dbCon.close()
+print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
+
+print('Writing to db')
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get otol id-to-name map
+otolIdToName = {}
+for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
+	if nodeId.startswith('ott'):
+		otolIdToName[int(nodeId[3:])] = nodeName
+# Add eol mappings
+dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
+dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
+for otolId, eolId in nodeToEolId.items():
+	if otolId in otolIdToName:
+		dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+# Add enwiki mappings
+dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
+dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
+dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
+for otolId, title in nodeToWikiTitle.items():
+	if otolId in otolIdToName and title in titleToPageId:
+		dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
+		if title in titleToIucnStatus:
+			dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+dbCon.commit()
+dbCon.close()