From 04e9444746d3ba8ddcc96d0fd16f1c02adce1389 Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Tue, 26 Apr 2022 13:53:46 +1000
Subject: Have tol data in sqlite db, and add server script that accesses it

Adapt otol-data-converting script to generate otol.db, add server.py
script that provides access to that db, and adapt the app to query the
server for tol data when needed.
---
 data_otol/namedTreeToJSON.py | 181 -------------------------------------------
 1 file changed, 181 deletions(-)
 delete mode 100755 data_otol/namedTreeToJSON.py

(limited to 'data_otol/namedTreeToJSON.py')

diff --git a/data_otol/namedTreeToJSON.py b/data_otol/namedTreeToJSON.py
deleted file mode 100755
index 30b8033..0000000
--- a/data_otol/namedTreeToJSON.py
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, json
-
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release), \n"
-usageInfo += "and prints a JSON object, which maps node names to objects of the form \n"
-usageInfo += "{\"children\": [name1, ...], \"parent\": name1, \"tips\": int1, \"pSupport\": bool1}, which holds \n"
-usageInfo += "child names, a parent name or null, descendant 'tips', and a phylogeny-support indicator\n"
-usageInfo += "\n"
-usageInfo += "This script was adapted to handle Open Tree of Life version 13.4.\n"
-usageInfo += "Link: https://tree.opentreeoflife.org/about/synthesis-release/v13.4\n"
-usageInfo += "\n"
-usageInfo += "labelled_supertree_ottnames.tre format:\n"
-usageInfo += "    Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n"
-usageInfo += "    where root node is named n6, and has children n1, n2, and n5.\n"
-usageInfo += "    Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n"
-usageInfo += "    Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n"
-usageInfo += "annotations.json format:\n"
-usageInfo += "    JSON object holding information about the tree-of-life release.\n"
-usageInfo += "    The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
-usageInfo += "    such as phylogenetic trees that support/conflict with it's placement.\n"
-
-if len(sys.argv) > 1:
-	print(usageInfo, file=sys.stderr)
-	sys.exit(1)
-
-nodeMap = {} # The JSON object to output
-idToName = {} # Maps node IDs to names
-
-# Parse labelled_supertree_ottnames.tre
-data = None
-with open("labelled_supertree_ottnames.tre") as file:
-	data = file.read()
-dataIdx = 0
-def parseNewick():
-	"""Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None"""
-	global dataIdx
-	# Check for EOF
-	if dataIdx == len(data):
-		print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
-		return None
-	# Check for inner-node start
-	if data[dataIdx] == "(":
-		dataIdx += 1
-		childNames = []
-		while True:
-			# Read child
-			childName = parseNewick()
-			if childName == None:
-				return None
-			childNames.append(childName)
-			if (dataIdx == len(data)):
-				print("ERROR: Unexpected EOF", file=sys.stderr)
-				return None
-			# Check for next child
-			if (data[dataIdx] == ","):
-				dataIdx += 1
-				continue
-			else:
-				# Get node name
-				dataIdx += 1 # Consume an expected ')'
-				[name, id] = parseNewickName()
-				idToName[id] = name
-				# Get child num-tips total
-				tips = 0
-				for childName in childNames:
-					tips += nodeMap[childName]["tips"]
-				# Add node to nodeMap
-				nodeMap[name] = {
-					"n": name, "id": id, "children": childNames, "parent": None, "tips": tips, "pSupport": False
-				}
-				# Update childrens' parent reference
-				for childName in childNames:
-					nodeMap[childName]["parent"] = name
-				return name
-	else:
-		[name, id] = parseNewickName()
-		idToName[id] = name
-		nodeMap[name] = {"n": name, "id": id, "children": [], "parent": None, "tips": 1, "pSupport": False}
-		return name
-def parseNewickName():
-	"""Helper that parses an input node name, and returns a [name,id] pair"""
-	global data, dataIdx
-	name = None
-	end = dataIdx
-	# Get name
-	if (end < len(data) and data[end] == "'"): # Check for quoted name
-		end += 1
-		inQuote = True
-		while end < len(data):
-			if (data[end] == "'"):
-				if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
-					end += 2
-					continue
-				else:
-					end += 1
-					inQuote = False
-					break
-			end += 1
-		if inQuote:
-			raise Exception("ERROR: Unexpected EOF")
-		name = data[dataIdx:end]
-		dataIdx = end
-	else:
-		while end < len(data) and not re.match(r"[(),]", data[end]):
-			end += 1
-		if (end == dataIdx):
-			raise Exception("ERROR: Unexpected EOF")
-		name = data[dataIdx:end].rstrip()
-		if end == len(data): # Ignore trailing input semicolon
-			name = name[:-1]
-		dataIdx = end
-	# Convert to [name, id]
-	if name.startswith("mrca"):
-		return [name, name]
-	elif name[0] == "'":
-		match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
-		if match == None:
-			raise Exception("ERROR: invalid name \"{}\"".format(name))
-		name = match.group(1).replace("''", "'")
-		return [name, match.group(2)]
-	else:
-		match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
-		if match == None:
-			raise Exception("ERROR: invalid name \"{}\"".format(name))
-		return [match.group(1).replace("_", " "), match.group(2)]
-rootName = parseNewick()
-
-# Parse annotations.json
-data = None
-with open("annotations.json") as file:
-	data = file.read()
-obj = json.loads(data)
-nodeAnnsMap = obj['nodes']
-
-# Do some more postprocessing on each node
-def convertMrcaName(name):
-	"""Given an mrca* name, returns an expanded version with the form [name1 + name2]"""
-	match = re.fullmatch(r"mrca(ott\d+)(ott\d+)", name)
-	if match == None:
-		print("ERROR: Invalid name \"{}\"".format(name), file=sys.stderr)
-	else:
-		subName1 = match.group(1)
-		subName2 = match.group(2)
-		if subName1 not in idToName:
-			print("ERROR: MRCA name \"{}\" sub-name \"{}\" not found".format(subName1), file=sys.stderr)
-		elif subName2 not in idToName:
-			print("ERROR: MRCA name \"{}\" sub-name \"{}\" not found".format(subName2), file=sys.stderr)
-		else:
-			return "[{} + {}]".format(idToName[subName1], idToName[subName2])
-namesToSwap = [] # Will hold [oldName, newName] pairs, for renaming nodes in nodeMap
-for node in nodeMap.values():
-	# Set has-support value using annotations
-	id = node["id"]
-	if id in nodeAnnsMap:
-		nodeAnns = nodeAnnsMap[id]
-		supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0
-		conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0
-		node["pSupport"] = supportQty > 0 and conflictQty == 0
-	# Change mrca* names
-	name = node["n"]
-	if (name.startswith("mrca")):
-		namesToSwap.append([name, convertMrcaName(name)])
-	parentName = node["parent"]
-	if (parentName != None and parentName.startswith("mrca")):
-		node["parent"] = convertMrcaName(parentName)
-	childNames = node["children"]
-	for i in range(len(childNames)):
-		if (childNames[i].startswith("mrca")):
-			childNames[i] = convertMrcaName(childNames[i])
-	# Delete some no-longer-needed fields
-	del node["n"]
-	del node["id"]
-# Finish mrca* renamings
-for [oldName, newName] in namesToSwap:
-	nodeMap[newName] = nodeMap[oldName]
-	del nodeMap[oldName]
-
-# Output JSON
-print(json.dumps(nodeMap))
-- 
cgit v1.2.3