#!/usr/bin/python3

import re, os
import json, sqlite3

import argparse
parser = argparse.ArgumentParser(description="""
Reads files describing a tree-of-life from an 'Open Tree of Life' release,
and stores tree info in a database.

Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format:
    The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6
        The root node is named n6, and has children n1, n2, and n5.
    Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', 
        'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'.
        The node with ID 'ott770315' will get the name 'homo sapiens'.
        A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]).
    It is possible for multiple nodes to have the same name.
        In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc.
Reads an annotations.json file, which is assumed to have this format:
    Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node,
    such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that
    support/conflict with the node's placement.
Reads from a picked-names file, if present, which specifies name and node ID pairs.
    These help resolve cases where multiple nodes share the same name.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()

class Node:
	' Represents a tree-of-life node '
	def __init__(self, name, childIds, parentId, tips, pSupport):
		self.name = name
		self.childIds = childIds
		self.parentId = parentId
		self.tips = tips
		self.pSupport = pSupport

treeFile = 'otol/labelled_supertree_ottnames.tre' # Had about 2.5e9 nodes
annFile = 'otol/annotations.json'
dbFile = 'data.db'
nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
pickedNamesFile = 'pickedOtolNames.txt'

print('Parsing tree file')
# Read file
data: str
with open(treeFile) as file:
	data = file.read()
dataIdx = 0
# Parse content
iterNum = 0
def parseNewick() -> str:
	""" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
	global data, dataIdx, iterNum
	iterNum += 1
	if iterNum % 1e5 == 0:
		print(f'At iteration {iterNum}')
	# Check for EOF
	if dataIdx == len(data):
		raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
	# Check for node
	if data[dataIdx] == '(': # parse inner node
		dataIdx += 1
		childIds: list[str] = []
		while True:
			# Read child
			childId = parseNewick()
			childIds.append(childId)
			if (dataIdx == len(data)):
				raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
			# Check for next child
			if (data[dataIdx] == ','):
				dataIdx += 1
				continue
			else:
				# Get node name and id
				dataIdx += 1 # Consume an expected ')'
				name, id = parseNewickName()
				updateNameMaps(name, id)
				# Get child num-tips total
				tips = 0
				for childId in childIds:
					tips += nodeMap[childId].tips
				# Add node to nodeMap
				nodeMap[id] = Node(name, childIds, None, tips, False)
				# Update childrens' parent reference
				for childId in childIds:
					nodeMap[childId].parentId = id
				return id
	else: # Parse node name
		name, id = parseNewickName()
		updateNameMaps(name, id)
		nodeMap[id] = Node(name, [], None, 1, False)
		return id
def parseNewickName() -> tuple[str, str]:
	""" Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair """
	global data, dataIdx
	name: str
	end = dataIdx
	# Get name
	if (end < len(data) and data[end] == "'"): # Check for quoted name
		end += 1
		inQuote = True
		while end < len(data):
			if (data[end] == "'"):
				if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote
					end += 2
					continue
				else:
					end += 1
					inQuote = False
					break
			end += 1
		if inQuote:
			raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
		name = data[dataIdx:end]
		dataIdx = end
	else:
		while end < len(data) and not re.match(r'[(),]', data[end]):
			end += 1
		if (end == dataIdx):
			raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
		name = data[dataIdx:end].rstrip()
		if end == len(data): # Ignore trailing input semicolon
			name = name[:-1]
		dataIdx = end
	# Convert to (name, id)
	name = name.lower()
	if name.startswith('mrca'):
		return (name, name)
	elif name[0] == "'":
		match = re.fullmatch(r"'([^\\\']+) (ott\d+)'", name)
		if match is None:
			raise Exception(f'ERROR: invalid name \'{name}\'')
		name = match.group(1).replace("''", "'")
		return (name, match.group(2))
	else:
		match = re.fullmatch(r"([^\\\']+)_(ott\d+)", name)
		if match is None:
			raise Exception(f'ERROR: invalid name \'{name}\'')
		return (match.group(1).replace('_', ' '), match.group(2))
def updateNameMaps(name, id):
	global nameToFirstId, dupNameToIds
	if name not in nameToFirstId:
		nameToFirstId[name] = id
	else:
		if name not in dupNameToIds:
			dupNameToIds[name] = [nameToFirstId[name], id]
		else:
			dupNameToIds[name].append(id)
rootId: str = parseNewick()

print('Resolving duplicate names')
# Read picked-names file
nameToPickedId: dict[str, str] = {}
if os.path.exists(pickedNamesFile):
	with open(pickedNamesFile) as file:
		for line in file:
			name, _, otolId = line.rstrip().partition('|')
			nameToPickedId[name] = otolId
# Resolve duplicates
for dupName, ids in dupNameToIds.items():
	# Check for picked id
	if dupName in nameToPickedId:
		idToUse = nameToPickedId[dupName]
	else:
		# Get conflicting node with most tips
		tipNums = [nodeMap[id].tips for id in ids]
		maxIdx = tipNums.index(max(tipNums))
		idToUse = ids[maxIdx]
	# Adjust name of other conflicting nodes
	counter = 2
	for id in ids:
		if id != idToUse:
			nodeMap[id].name += f' [{counter}]'
			counter += 1

print('Changing mrca* names')
def convertMrcaName(id: str):
	node = nodeMap[id]
	name = node.name
	childIds = node.childIds
	if len(childIds) < 2:
		print(f'WARNING: MRCA node \'{name}\' has less than 2 children')
		return
	# Get 2 children with most tips
	childTips = [nodeMap[id].tips for id in childIds]
	maxIdx1 = childTips.index(max(childTips))
	childTips[maxIdx1] = 0
	maxIdx2 = childTips.index(max(childTips))
	childId1 = childIds[maxIdx1]
	childId2 = childIds[maxIdx2]
	childName1 = nodeMap[childId1].name
	childName2 = nodeMap[childId2].name
	# Check for mrca* child names
	if childName1.startswith('mrca'):
		childName1 = convertMrcaName(childId1)
	if childName2.startswith('mrca'):
		childName2 = convertMrcaName(childId2)
	# Check for composite names
	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
	if match is not None:
		childName1 = match.group(1)
	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
	if match is not None:
		childName2 = match.group(1)
	# Create composite name
	node.name = f'[{childName1} + {childName2}]'
	return childName1
for id, node in nodeMap.items():
	if node.name.startswith('mrca'):
		convertMrcaName(id)

print('Parsing annotations file')
# Read file
with open(annFile) as file:
	data = file.read()
obj = json.loads(data)
nodeAnnsMap = obj['nodes']
# Find relevant annotations
for id, node in nodeMap.items():
	# Set has-support value using annotations
	if id in nodeAnnsMap:
		nodeAnns = nodeAnnsMap[id]
		supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
		conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
		node.pSupport = supportQty > 0 and conflictQty == 0

print('Creating nodes and edges tables')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)')
dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)')
dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))')
dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)')
for otolId, node in nodeMap.items():
	dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips))
	for childId in node.childIds:
		childNode = nodeMap[childId]
		dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
			(node.name, childNode.name, 1 if childNode.pSupport else 0))
print('Closing database')
dbCon.commit()
dbCon.close()