#!/usr/bin/python3

"""
Maps otol IDs to EOL and enwiki titles, using IDs from various
other sources (like NCBI).

Reads otol taxonomy data to get source IDs for otol IDs,
then looks up those IDs in an EOL provider_ids file,
and in a wikidata dump, and stores results in the database.

Based on code from https://github.com/OneZoom/OZtree, located in
OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
"""

import os
from collections import defaultdict
import gzip, csv, sqlite3

# Input files
TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
WIKIDATA_DB = os.path.join('wikidata', 'taxon_srcs.db')
ENWIKI_DUMP_INDEX_DB = os.path.join('enwiki', 'dumpIndex.db')
# Files of manually-picked mappings, grouped by the kind of mapping they hold (files that don't exist get skipped)
PICKED_MAPPINGS = dict(
	eol=['picked_eol_ids.txt'],
	enwiki=['picked_wiki_ids.txt', 'picked_wiki_ids_rough.txt'],
)
# Database that the mapping tables get added to
DB_FILE = 'data.db'

# Sources to use from the otol taxonomy (earlier sources will get higher priority)
OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif']
# Maps external-source int-identifiers (as found in the EOL provider IDs file) to names
EOL_SRCS = {
	676: 'ncbi',
	459: 'worms',
	767: 'gbif',
}

def genData(
		taxonomyFile: str,
		eolIdsFile: str,
		wikidataDb: str,
		pickedMappings: dict[str, list[str]],
		enwikiDumpIndexDb: str,
		dbFile: str) -> None:
	"""
	Reads the files and enwiki db and creates the db

	Collects otol-ID-to-EOL-ID and otol-ID-to-enwiki-title mappings from the inputs,
	then adds the tables 'eol_ids', 'wiki_ids', and 'node_iucn' to the database at dbFile.
	That database is expected to have a 'nodes' table with 'name' and 'id' columns,
	and only nodes whose id starts with 'ott' are given mappings.
	The tables are created without 'IF NOT EXISTS', so an sqlite3 error is raised if they already exist.
	"""
	nodeToSrcIds: dict[int, dict[str, int]] = {} # Maps otol ID to {src1: id1, src2: id2, ...}
	usedSrcIds: set[tuple[str, int]] = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
	nodeToEolId: dict[int, int] = {} # Maps otol ID to eol ID
	nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
	titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
	titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
	# Get mappings from data input
	# (the order matters: picked mappings override automatic ones, and page IDs are looked up for the final titles)
	readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
	readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
	readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
	readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
	getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
	#
	print('Writing to db')
	dbCon = sqlite3.connect(dbFile)
	try: # Makes sure the connection gets closed, even if one of the statements fails
		dbCur = dbCon.cursor()
		# Get otol id-to-name map
		otolIdToName: dict[int, str] = {}
		for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
			if nodeId.startswith('ott'):
				otolIdToName[int(nodeId[3:])] = nodeName
		# Add eol mappings
		dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
		dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
		for otolId, eolId in nodeToEolId.items():
			if otolId in otolIdToName:
				dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
		# Add enwiki mappings (a node only gets an IUCN status if it also gets a wiki ID)
		dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
		dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
		dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
		for otolId, title in nodeToWikiTitle.items():
			if otolId in otolIdToName and title in titleToPageId:
				nodeName = otolIdToName[otolId]
				dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (nodeName, titleToPageId[title]))
				if title in titleToIucnStatus:
					dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (nodeName, titleToIucnStatus[title]))
		dbCon.commit()
	finally:
		dbCon.close()
def readTaxonomyFile(
		taxonomyFile: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]]) -> None:
	"""
	Reads taxonomy file, and maps OTOL node IDs to external-source IDs

	Adds to nodeToSrcIds (otol ID to {src1: id1, ...}) and usedSrcIds ({(src1, id1), ...}) in place.
	Only sources in OTOL_SRCS with decimal IDs are used, and if a node has
	multiple IDs for the same source, only the first one is used.
	Lines with a non-integral ID or without a sourceinfo field get skipped with a message,
	and source specifiers that lack a ':' (including an empty sourceinfo field) get ignored.
	"""
	# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
		# uid (otol-id, eg: 93302), parent_uid, name, rank,
		# sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
	print('Reading taxonomy file')
	with open(taxonomyFile) as file: # Had about 4.5e6 lines
		for lineNum, line in enumerate(file, 1):
			if lineNum % 1e5 == 0:
				print(f'At line {lineNum}')
			# Skip header line
			if lineNum == 1:
				continue
			# Parse line
			fields = line.split('\t|\t')
			try:
				otolId = int(fields[0])
			except ValueError:
				print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
				continue
			if len(fields) < 5:
				print(f'Skipping line {lineNum}, which has no sourceinfo field')
				continue
			# Add source IDs
			for srcPair in fields[4].split(','):
				# With no ':' present, partition() leaves srcIdStr empty, which fails the isdecimal() test below
				src, _, srcIdStr = srcPair.partition(':')
				if srcIdStr.isdecimal() and src in OTOL_SRCS:
					srcIds = nodeToSrcIds.setdefault(otolId, {})
					if src in srcIds: # Keep the first ID seen for a source
						continue
					srcId = int(srcIdStr)
					srcIds[src] = srcId
					usedSrcIds.add((src, srcId))
	print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
def readEolIdsFile(
		eolIdsFile: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]],
		nodeToEolId: dict[int, int]) -> None:
	"""
	Reads the EOL provider IDs file, and uses shared external-source IDs to map otol IDs to EOL IDs

	Results are added to nodeToEolId. Only rows for sources in EOL_SRCS,
	with decimal IDs that are present in usedSrcIds, are considered.
	"""
	# The file is a gzipped CSV with a header line, then lines that hold these fields:
		# node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
		# page_id (eol ID), preferred_canonical_for_page
	print('Reading EOL provider IDs file')
	srcToEolId: dict[str, dict[int, int]] = {} # Maps src1 to {id1: eolId1, ...}
	for srcName in EOL_SRCS.values():
		srcToEolId[srcName] = {}
	with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
		for lineNum, row in enumerate(csv.reader(file), 1):
			if lineNum % 1e6 == 0:
				print(f'At line {lineNum}')
			if lineNum == 1: # Header line
				continue
			eolId, srcInt, srcIdStr = int(row[3]), int(row[2]), row[1]
			if not srcIdStr.isdecimal() or srcInt not in EOL_SRCS:
				continue
			src, srcId = EOL_SRCS[srcInt], int(srcIdStr)
			if (src, srcId) not in usedSrcIds:
				continue
			if srcId in srcToEolId[src]:
				# The first EOL ID seen for a source ID is the one that gets kept
				print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
				continue
			srcToEolId[src][srcId] = eolId
	print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
		# Was about 3.5e6 (4.2e6 without usedSrcIds)
	#
	print('Resolving candidate EOL IDs')
	for otolId, srcInfo in nodeToSrcIds.items():
		# Count how many of the node's sources point to each candidate EOL ID
		numSrcsFor: dict[int, int] = {}
		for src, srcId in srcInfo.items():
			eolIdsForSrc = srcToEolId.get(src)
			if eolIdsForSrc is not None and srcId in eolIdsForSrc:
				candidate = eolIdsForSrc[srcId]
				numSrcsFor[candidate] = numSrcsFor.get(candidate, 0) + 1
		if not numSrcsFor:
			continue
		# Prefer candidates with the most sources, and break ties by picking the lowest ID
		maxCount = max(numSrcsFor.values())
		nodeToEolId[otolId] = min(eolId for eolId, count in numSrcsFor.items() if count == maxCount)
	print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
def readWikidataDb(
		wikidataDb: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]],
		nodeToWikiTitle: dict[int, str],
		titleToIucnStatus: dict[str, str],
		nodeToEolId: dict[int, int]) -> None:
	"""
	Uses a db of mappings from wikidata to associate otol IDs with Wikipedia titles

	Adds to nodeToWikiTitle and titleToIucnStatus. Also adds to nodeToEolId, for nodes
	that have no EOL ID yet, when the db links an EOL ID to the title chosen for a node.
	"""
	print('Reading from Wikidata db')
	srcToWikiTitle: dict[str, dict[int, str]] = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
	wikiTitles = set()
	dbCon = sqlite3.connect(wikidataDb)
	dbCur = dbCon.cursor()
	for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
		# Entries for EOL are always kept, as they are needed further below
		if src != 'eol' and (src, srcId) not in usedSrcIds:
			continue
		srcToWikiTitle[src][srcId] = title
		wikiTitles.add(title)
	titleToIucnStatus.update(
		(title, status)
		for title, status in dbCur.execute('SELECT title, status from title_iucn')
		if title in wikiTitles)
	print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries')
		# Was about 1.1e6 (1.2e6 without usedSrcIds)
	print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
	dbCon.close()
	#
	print('Resolving candidate Wikidata items')
	for otolId, srcInfo in nodeToSrcIds.items():
		# Group the node's sources by the title they lead to
		candidates: dict[str, list[str]] = {}
		for src, srcId in srcInfo.items():
			titlesForSrc = srcToWikiTitle.get(src)
			if titlesForSrc is None or srcId not in titlesForSrc:
				continue
			candidates.setdefault(titlesForSrc[srcId], []).append(src)
		if not candidates:
			continue
		# Narrow down to the titles with the most sources (test example with multiple titles: otol ID 621052)
		maxSrcCnt = max(len(srcs) for srcs in candidates.values())
		topTitles = [title for title, srcs in candidates.items() if len(srcs) == maxSrcCnt]
		if len(topTitles) == 1:
			nodeToWikiTitle[otolId] = topTitles[0]
			continue
		# Still tied, so use the title that has the source with highest priority
		srcToTitle = {}
		for title in topTitles:
			for src in candidates[title]:
				srcToTitle[src] = title
		for src in OTOL_SRCS:
			if src in srcToTitle:
				nodeToWikiTitle[otolId] = srcToTitle[src]
				break
	print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
	#
	print('Adding extra EOL mappings from Wikidata')
	wikiTitleToNode = {}
	for node, title in nodeToWikiTitle.items():
		wikiTitleToNode[title] = node
	addedEntries = [] # Holds otol IDs that were given an EOL ID here
	for eolId, title in srcToWikiTitle['eol'].items():
		if title not in wikiTitleToNode:
			continue
		otolId = wikiTitleToNode[title]
		if otolId in nodeToEolId: # Only add if the otol ID has no EOL ID
			continue
		nodeToEolId[otolId] = eolId
		addedEntries.append(otolId)
	print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
def readPickedMappings(
		pickedMappings: dict[str, list[str]],
		nodeToEolId: dict[int, int],
		nodeToWikiTitle: dict[int, str]) -> None:
	"""
	Read mappings from OTOL IDs to EOL IDs and Wikipedia titles

	pickedMappings maps a mapping kind to a list of files to read, where 'eol' files
	update nodeToEolId, and files for any other kind update nodeToWikiTitle.
	Each non-blank line should look like 'otolId|value', with an empty value meaning
	that the node's existing mapping (if any) should be removed.
	Files that don't exist, and blank lines, are skipped.
	"""
	print('Reading picked mappings')
	for src, filenames in pickedMappings.items():
		for filename in filenames:
			if not os.path.exists(filename):
				continue
			with open(filename) as file:
				for line in file:
					line = line.rstrip()
					if not line: # Tolerate blank lines (eg: at the end of a hand-edited file)
						continue
					otolIdStr, mappedVal = line.split('|')
					otolId = int(otolIdStr)
					if src == 'eol':
						if mappedVal:
							nodeToEolId[otolId] = int(mappedVal)
						else:
							nodeToEolId.pop(otolId, None)
					else: # src == 'enwiki'
						if mappedVal:
							nodeToWikiTitle[otolId] = mappedVal
						else:
							nodeToWikiTitle.pop(otolId, None)
def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
	"""
	Looks up the titles in nodeToWikiTitle in a db's 'offsets' table,
	and adds title-to-page-ID mappings to titleToPageId
	"""
	print('Getting enwiki page IDs')
	dbCon = sqlite3.connect(enwikiDumpIndexDb)
	dbCur = dbCon.cursor()
	query = 'SELECT id FROM offsets WHERE title = ?'
	numNotFound = 0 # Counts failed lookups (a title used by multiple nodes is counted for each)
	for title in nodeToWikiTitle.values():
		record = dbCur.execute(query, (title,)).fetchone()
		if record is None:
			numNotFound += 1
			continue
		titleToPageId[title] = record[0]
	dbCon.close()
	print(f'Unable to find IDs for {numNotFound} titles') # Was 2913

if __name__ == '__main__':
	import argparse
	# No options are defined, but parsing still provides '--help' (which shows the module docstring),
	# and rejects unexpected arguments
	argparse.ArgumentParser(
		description=__doc__,
		formatter_class=argparse.RawDescriptionHelpFormatter,
	).parse_args()
	# Run using the module-level default paths
	genData(
		taxonomyFile=TAXONOMY_FILE,
		eolIdsFile=EOL_IDS_FILE,
		wikidataDb=WIKIDATA_DB,
		pickedMappings=PICKED_MAPPINGS,
		enwikiDumpIndexDb=ENWIKI_DUMP_INDEX_DB,
		dbFile=DB_FILE)