#!/usr/bin/python3

import sys, re, json, sqlite3
import os.path

# Help text, shown when the script is run with any command-line arguments.
# Built as one join over the individual lines, each of which carries its own newline.
usageInfo = "".join((
	f"usage: {sys.argv[0]}\n",
	"Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release), \n",
	"and creates an sqlite database otol.db, which holds entries of the form (name text, data text).\n",
	"Each row holds a tree-of-life node name, and a JSON string with the form \n",
	"{\"children\": [name1, ...], \"parent\": name1, \"tips\": int1, \"pSupport\": bool1}, holding \n",
	"child names, a parent name or null, descendant 'tips', and a phylogeny-support indicator\n",
	"\n",
	"This script was adapted to handle Open Tree of Life version 13.4.\n",
	"Link: https://tree.opentreeoflife.org/about/synthesis-release/v13.4\n",
	"\n",
	"labelled_supertree_ottnames.tre format:\n",
	"    Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n",
	"    where root node is named n6, and has children n1, n2, and n5.\n",
	"    Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n",
	"    Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n",
	"annotations.json format:\n",
	"    JSON object holding information about the tree-of-life release.\n",
	"    The object's 'nodes' field maps node IDs to objects holding information about that node,\n",
	"    such as phylogenetic trees that support/conflict with it's placement.\n",
))

# The script takes no arguments, so any argument just gets the usage text
if len(sys.argv) >= 2:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Input and output locations
treeFile = "otol/labelled_supertree_ottnames.tre"
annFile = "otol/annotations.json"
dbFile = "otol.db"
# Lookup tables that get filled in while parsing treeFile
nodeMap = dict() # Maps node names to node objects
idToName = dict() # Maps node IDs to names

# Refuse to run if the output database is already present
dbAlreadyExists = os.path.exists(dbFile)
if dbAlreadyExists:
	print("ERROR: Existing {} file".format(dbFile), file=sys.stderr)
	sys.exit(1)

# Parse treeFile
# The whole Newick string is read up front; parseNewick() and parseNewickName()
# then walk it using 'dataIdx' as a cursor.
# The encoding is given explicitly so that the result does not depend on the
# platform's default encoding (assumes the release file is UTF-8).
with open(treeFile, encoding="utf-8") as file:
	data = file.read()
dataIdx = 0
def _addNewickNode(name, nodeId, childNames, tips):
	"""Stores a parsed node in nodeMap under a unique name, updates idToName, and returns that name"""
	# Names are not guaranteed to be unique, so a clash gets a " [N]" suffix (N = 2, 3, ...)
	if name in nodeMap:
		count = 2
		while "{} [{}]".format(name, count) in nodeMap:
			count += 1
		name = "{} [{}]".format(name, count)
	nodeMap[name] = {
		"n": name, "id": nodeId, "children": childNames, "parent": None, "tips": tips, "pSupport": False
	}
	# Record the final name, so the ID never refers to another node that owns the un-suffixed name
	idToName[nodeId] = name
	# Update childrens' parent reference
	for childName in childNames:
		nodeMap[childName]["parent"] = name
	return name
def parseNewick():
	"""
	Parses a node (a leaf, or an inner node along with its subtree) using 'data' and 'dataIdx'.
	Adds each parsed node to nodeMap and idToName, and leaves 'dataIdx' just past the node's name.
	A leaf counts as 1 tip, and an inner node's tips value is the sum of its children's.
	Both leaf and inner-node names are disambiguated if they clash with an existing nodeMap entry.
	Returns the node's name as stored in nodeMap, or None (after printing an error) if the input
	ends early or a child list is not closed by ')'.
	Exceptions raised by parseNewickName() for invalid names are not caught here.
	"""
	global dataIdx
	# Check for EOF
	if dataIdx >= len(data):
		print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
		return None
	# Check for leaf node
	if data[dataIdx] != "(":
		[name, nodeId] = parseNewickName()
		return _addNewickNode(name, nodeId, [], 1)
	# Parse inner node
	dataIdx += 1
	childNames = []
	while True:
		# Read child
		childName = parseNewick()
		if childName is None:
			return None
		childNames.append(childName)
		if dataIdx >= len(data):
			print("ERROR: Unexpected EOF", file=sys.stderr)
			return None
		# Check for next child
		if data[dataIdx] == ",":
			dataIdx += 1
			continue
		# Anything other than ')' here means the input is malformed, so don't silently skip it
		if data[dataIdx] != ")":
			print("ERROR: Expected ')' at index " + str(dataIdx), file=sys.stderr)
			return None
		dataIdx += 1
		break
	# Get node name
	[name, nodeId] = parseNewickName()
	# Get child num-tips total
	tips = sum(nodeMap[childName]["tips"] for childName in childNames)
	return _addNewickNode(name, nodeId, childNames, tips)
def parseNewickName():
	"""
	Helper that parses the node name starting at data[dataIdx], moves dataIdx past it,
	and returns a [name, id] pair.

	A name is either quoted, like 'Oxalis san-miguelii ott5748753' (with '' as an escaped quote),
	or unquoted, in which case it runs up to the next '(', ')', ',' or the end of input.
	A ';' that ends the input is not treated as part of an unquoted name.
	The name is lowercased. An mrca* name is returned as both name and id. Other names must end
	with an ott* id, which is split off, and underscores in unquoted names become spaces.
	Raises Exception if the name is missing, is an unterminated quoted name, or lacks the expected form.
	"""
	global dataIdx
	end = dataIdx
	# Get name
	if end < len(data) and data[end] == "'": # Check for quoted name
		end += 1
		inQuote = True
		while end < len(data):
			if data[end] == "'":
				if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
					end += 2
					continue
				end += 1
				inQuote = False
				break
			end += 1
		if inQuote:
			raise Exception("ERROR: Unexpected EOF")
		name = data[dataIdx:end]
	else:
		while end < len(data) and data[end] not in "(),":
			end += 1
		if dataIdx >= len(data):
			raise Exception("ERROR: Unexpected EOF")
		name = data[dataIdx:end].rstrip()
		# Ignore trailing input semicolon (only if there actually is one)
		if end == len(data) and name.endswith(";"):
			name = name[:-1]
		if name == "":
			raise Exception("ERROR: Missing node name at index {}".format(dataIdx))
	dataIdx = end
	# Convert to [name, id]
	name = name.lower()
	if name.startswith("mrca"):
		return [name, name]
	if name[0] == "'":
		match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
		if match is None:
			raise Exception("ERROR: invalid name \"{}\"".format(name))
		return [match.group(1).replace("''", "'"), match.group(2)]
	match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
	if match is None:
		raise Exception("ERROR: invalid name \"{}\"".format(name))
	return [match.group(1).replace("_", " "), match.group(2)]
rootName = parseNewick()
if rootName is None:
	# parseNewick() has already printed the reason; don't build a database from a partial tree
	sys.exit(1)

# Parse annFile
data = None # Drop the reference to the tree text before reading the annotations
# The encoding is given explicitly so the result does not depend on the platform default
with open(annFile, encoding="utf-8") as file:
	data = file.read()
obj = json.loads(data)
nodeAnnsMap = obj['nodes']

# Change mrca* names
def applyMrcaNameConvert(name, namesToSwap):
	"""
	Adds an entry to namesToSwap that maps the mrca* node 'name' to a composite name
	of the form [childName1 + childName2], using its two children with the most tips.
	If a chosen child is itself an mrca* node, it is converted first (recursively),
	and is represented in the composite by the name which that call returns.
	Returns the name used for the highest-tips child.
	If the node has fewer than 2 children, prints a warning, adds nothing, and returns 'name'.
	"""
	children = nodeMap[name]["children"]
	if len(children) >= 2:
		tipCounts = [nodeMap[child]["tips"] for child in children]
		# Pick the child with the most tips, then blank out its count to find the runner-up
		firstIdx = tipCounts.index(max(tipCounts))
		tipCounts[firstIdx] = 0
		secondIdx = tipCounts.index(max(tipCounts))
		# Resolve each pick to a displayable name, expanding nested mrca* nodes along the way
		parts = []
		for idx in (firstIdx, secondIdx):
			part = children[idx]
			if part.startswith("mrca"):
				part = applyMrcaNameConvert(part, namesToSwap)
			parts.append(part)
		# Create composite name
		namesToSwap[name] = "[{} + {}]".format(parts[0], parts[1])
		return parts[0]
	print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr)
	return name
namesToSwap = {} # Maps mrca* names to replacement names
# Get a replacement for each mrca* node that wasn't already covered by an earlier call's recursion
for mrcaName in [node["n"] for node in nodeMap.values()]:
	if mrcaName not in namesToSwap and mrcaName.startswith("mrca"):
		applyMrcaNameConvert(mrcaName, namesToSwap)
# Move each renamed node to its new key
for oldName, newName in namesToSwap.items():
	nodeMap[newName] = nodeMap.pop(oldName)
# Make parent and child references use the new names
for node in nodeMap.values():
	node["parent"] = namesToSwap.get(node["parent"], node["parent"])
	# Slice assignment keeps the same list object, just with swapped entries
	node["children"][:] = [namesToSwap.get(child, child) for child in node["children"]]

# Add annotations data, and delete certain fields
for node in nodeMap.values():
	nodeId = node["id"]
	# A node has support if at least one source tree supports it, and none conflict with it
	if nodeId in nodeAnnsMap:
		nodeAnns = nodeAnnsMap[nodeId]
		supportQty = 0
		if "supported_by" in nodeAnns:
			supportQty = len(nodeAnns["supported_by"])
		conflictQty = 0
		if "conflicts_with" in nodeAnns:
			conflictQty = len(nodeAnns["conflicts_with"])
		node["pSupport"] = conflictQty == 0 and supportQty > 0
	# The root (the only node without a parent) always counts as supported
	if node["parent"] is None:
		node["pSupport"] = True
	# These fields were only needed while building the tree, and shouldn't end up in the db
	for field in ("n", "id"):
		del node[field]

# Create db
con = sqlite3.connect(dbFile)
try:
	# Using the connection as a context manager commits if the block succeeds,
	# and rolls back if it raises
	with con:
		cur = con.cursor()
		cur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, data TEXT)")
		# One row per node: the node's name, and its remaining fields as a JSON string
		cur.executemany(
			"INSERT INTO nodes VALUES (?, ?)",
			((name, json.dumps(node)) for name, node in nodeMap.items()))
finally:
	# Close the connection even if creating or filling the table failed
	con.close()