#!/usr/bin/python3 import sys, os.path, re import json, sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release),\n" usageInfo += "and creates a sqlite database, which holds entries of the form (name text, data text).\n" usageInfo += "Each row holds a tree-of-life node's name, JSON-encoded child name array, a parent name or '',\n" usageInfo += "number of descendant 'tips', and a 1 or 0 indicating phylogenetic-support.\n" usageInfo += "\n" usageInfo += "Expected labelled_supertree_ottnames.tre format:\n" usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n" usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n" usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n" usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n" usageInfo += "Expected annotations.json format:\n" usageInfo += " JSON object holding information about the tree-of-life release.\n" usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n" usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) treeFile = "otol/labelled_supertree_ottnames.tre" annFile = "otol/annotations.json" dbFile = "data.db" nodeMap = {} # Maps node IDs to node objects idToName = {} # Maps node IDs to names nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs # Parse treeFile print("Parsing tree file") data = None with open(treeFile) as file: data = file.read() dataIdx = 0 def parseNewick(): """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None""" global dataIdx # Check for EOF if dataIdx == len(data): print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr) return None # Check for node if data[dataIdx] == "(": # parse inner node dataIdx += 1 childIds = [] while True: # Read child childId = parseNewick() if childId == None: return None childIds.append(childId) if (dataIdx == len(data)): print("ERROR: Unexpected EOF", file=sys.stderr) return None # Check for next child if (data[dataIdx] == ","): dataIdx += 1 continue else: # Get node name and id dataIdx += 1 # Consume an expected ')' [name, id] = parseNewickName() updateNameMaps(name, id) # Get child num-tips total tips = 0 for childId in childIds: tips += nodeMap[childId]["tips"] # Add node to nodeMap nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False} # Update childrens' parent reference for childId in childIds: nodeMap[childId]["parent"] = id return id else: # Parse node name [name, id] = parseNewickName() updateNameMaps(name, id) nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False} return id def updateNameMaps(name, id): idToName[id] = name if name not in nameToFirstId: nameToFirstId[name] = id else: if name not in dupNameToIds: dupNameToIds[name] = [nameToFirstId[name], id] else: dupNameToIds[name].append(id) def parseNewickName(): """Helper that parses an input node name, and returns a [name,id] pair""" global data, dataIdx name = None end = dataIdx # Get name if (end < len(data) and data[end] == "'"): # Check for quoted name end += 1 inQuote = True while end < len(data): if (data[end] == "'"): if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote end += 2 continue else: end += 1 inQuote = False break end += 1 if inQuote: raise Exception("ERROR: Unexpected EOF") name = data[dataIdx:end] dataIdx = end else: while end < len(data) and not re.match(r"[(),]", data[end]): end += 1 if (end == dataIdx): raise Exception("ERROR: Unexpected EOF") name = data[dataIdx:end].rstrip() if end == len(data): # Ignore trailing input semicolon name = name[:-1] dataIdx = end # Convert to [name, id] name = name.lower() if name.startswith("mrca"): return [name, name] elif name[0] == "'": match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) if match == None: raise Exception("ERROR: invalid name \"{}\"".format(name)) name = match.group(1).replace("''", "'") return [name, match.group(2)] else: match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) if match == None: raise Exception("ERROR: invalid name \"{}\"".format(name)) return [match.group(1).replace("_", " "), match.group(2)] rootName = parseNewick() # Resolve duplicate names print("Resolving duplicates") for [dupName, ids] in dupNameToIds.items(): # Get conflicting node with most tips tipNums = [nodeMap[id]["tips"] for id in ids] maxIdx = tipNums.index(max(tipNums)) maxId = ids[maxIdx] # Adjust name of other conflicting nodes counter = 2 for id in ids: if id != maxId: nodeMap[id]["name"] += " [" + str(counter)+ "]" counter += 1 # Change mrca* names print("Changing mrca* names") def convertMrcaName(id): node = nodeMap[id] name = node["name"] childIds = node["children"] if len(childIds) < 2: print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr) return # Get 2 children with most tips childTips = [nodeMap[id]["tips"] for id in childIds] maxIdx = childTips.index(max(childTips)) childTips[maxIdx] = 0 maxIdx2 = childTips.index(max(childTips)) childId1 = childIds[maxIdx] childId2 = childIds[maxIdx2] childName1 = nodeMap[childId1]["name"] childName2 = nodeMap[childId2]["name"] # Check for mrca* child names if childName1.startswith("mrca"): childName1 = convertMrcaName(childId1) if childName2.startswith("mrca"): childName2 = convertMrcaName(childId2) # Check for composite names match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1) if match != None: childName1 = match.group(1) match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2) if match != None: childName2 = match.group(1) # Create composite name node["name"] = "[{} + {}]".format(childName1, childName2) return childName1 for [id, node] in nodeMap.items(): if node["name"].startswith("mrca"): convertMrcaName(id) # Parse annFile print("Parsing annotations file") data = None with open(annFile) as file: data = file.read() obj = json.loads(data) nodeAnnsMap = obj['nodes'] # Add annotations data print("Adding annotation data") for [id, node] in nodeMap.items(): # Set has-support value using annotations if id in nodeAnnsMap: nodeAnns = nodeAnnsMap[id] supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0 conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0 node["pSupport"] = supportQty > 0 and conflictQty == 0 # Root node gets support if node["parent"] == None: node["pSupport"] = True # Create db print("Creating nodes and edges tables") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, tips INT)") dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)") dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)") for node in nodeMap.values(): dbCur.execute("INSERT INTO nodes VALUES (?, ?)", (node["name"], node["tips"])) childIds = node["children"] for childId in childIds: childNode = nodeMap[childId] dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)", (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0)) dbCon.commit() dbCon.close()