diff options
Diffstat (limited to 'backend/data/otolToSqlite.py')
| -rwxr-xr-x | backend/data/otolToSqlite.py | 228 |
1 files changed, 228 insertions, 0 deletions
#!/usr/bin/python3
# Reconstructed from the diff of backend/data/otolToSqlite.py (new file, mode 100755)

import json
import os.path
import re
import sqlite3
import sys

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release), \n"
usageInfo += "and creates an sqlite database otol.db, which holds entries of the form (name text, data text).\n"
usageInfo += "Each row holds a tree-of-life node name, and a JSON string with the form \n"
usageInfo += "{\"children\": [name1, ...], \"parent\": name1, \"tips\": int1, \"pSupport\": bool1}, holding \n"
usageInfo += "child names, a parent name or null, descendant 'tips', and a phylogeny-support indicator\n"
usageInfo += "\n"
usageInfo += "This script was adapted to handle Open Tree of Life version 13.4.\n"
usageInfo += "Link: https://tree.opentreeoflife.org/about/synthesis-release/v13.4\n"
usageInfo += "\n"
usageInfo += "labelled_supertree_ottnames.tre format:\n"
usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n"
usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n"
usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n"
usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n"
usageInfo += "annotations.json format:\n"
usageInfo += " JSON object holding information about the tree-of-life release.\n"
usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n"

treeFile = "otol/labelled_supertree_ottnames.tre"
annFile = "otol/annotations.json"
dbFile = "otol.db"
nodeMap = {} # Maps node names to node objects
idToName = {} # Maps node IDs to names
data = None # Newick text currently being parsed
dataIdx = 0 # Index of the next unread character in 'data'

# Patterns for names that carry an ott ID (applied to lower-cased names)
quotedNameRegex = re.compile(r"'([^\\\"]+) (ott\d+)'")
plainNameRegex = re.compile(r"([^\\\"]+)_(ott\d+)")

def uniqueNodeName(name):
	"""Returns 'name' if it is unused in nodeMap, else the first unused 'name [N]' variant (N >= 2)"""
	if name not in nodeMap: # Turns out the names might not actually be unique
		return name
	count = 2
	candidate = f"{name} [{count}]"
	while candidate in nodeMap:
		count += 1
		candidate = f"{name} [{count}]"
	return candidate

def parseNewick():
	"""Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None"""
	# NOTE(review): recurses once per nesting level, so a very deep tree could hit the recursion limit
	global dataIdx
	# Check for EOF
	if dataIdx == len(data):
		print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
		return None
	# Check for leaf node
	if data[dataIdx] != "(":
		name, nodeId = parseNewickName()
		name = uniqueNodeName(name) # Leaves can share a simple name too, and must not overwrite each other
		idToName[nodeId] = name
		nodeMap[name] = {"n": name, "id": nodeId, "children": [], "parent": None, "tips": 1, "pSupport": False}
		return name
	# Parse inner node
	dataIdx += 1
	childNames = []
	while True:
		# Read child
		childName = parseNewick()
		if childName is None:
			return None
		childNames.append(childName)
		if dataIdx == len(data):
			print("ERROR: Unexpected EOF", file=sys.stderr)
			return None
		# Check for next child
		if data[dataIdx] == ",":
			dataIdx += 1
			continue
		# Only a ')' can legitimately end the child list
		if data[dataIdx] != ")":
			print("ERROR: Expected ',' or ')' at index " + str(dataIdx), file=sys.stderr)
			return None
		dataIdx += 1
		break
	# Get node name, making it unique before it is recorded anywhere
	name, nodeId = parseNewickName()
	name = uniqueNodeName(name)
	idToName[nodeId] = name
	# Get child num-tips total
	tips = sum(nodeMap[childName]["tips"] for childName in childNames)
	# Add node to nodeMap
	nodeMap[name] = {
		"n": name, "id": nodeId, "children": childNames, "parent": None, "tips": tips, "pSupport": False
	}
	# Update childrens' parent reference
	for childName in childNames:
		nodeMap[childName]["parent"] = name
	return name

def parseNewickName():
	"""Helper that parses an input node name, and returns a [name,id] pair. Raises Exception on a malformed name"""
	global dataIdx
	end = dataIdx
	# Get name
	if end < len(data) and data[end] == "'": # Check for quoted name
		end += 1
		inQuote = True
		while end < len(data):
			if data[end] == "'":
				if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
					end += 2
					continue
				end += 1
				inQuote = False
				break
			end += 1
		if inQuote:
			raise Exception("ERROR: Unexpected EOF")
		name = data[dataIdx:end]
	else:
		while end < len(data) and data[end] not in "(),":
			end += 1
		if end == dataIdx:
			raise Exception("ERROR: Unexpected EOF")
		name = data[dataIdx:end].rstrip()
		if end == len(data) and name.endswith(";"): # Ignore trailing input semicolon, if there is one
			name = name[:-1]
	dataIdx = end
	return splitNodeName(name)

def splitNodeName(rawName):
	"""Converts a raw Newick name into a lower-cased [name, id] pair. Raises Exception if it has no recognised form"""
	name = rawName.lower()
	if name.startswith("mrca"): # mrca* names serve as both name and id
		return [name, name]
	if name.startswith("'"):
		match = quotedNameRegex.fullmatch(name)
		if match is None:
			raise Exception("ERROR: invalid name \"{}\"".format(name))
		return [match.group(1).replace("''", "'"), match.group(2)]
	# Also reached for an empty name, which fails the match instead of raising an IndexError
	match = plainNameRegex.fullmatch(name)
	if match is None:
		raise Exception("ERROR: invalid name \"{}\"".format(name))
	return [match.group(1).replace("_", " "), match.group(2)]

def parseTree(text):
	"""Parses Newick 'text' into nodeMap and idToName, and returns the root node name or None on a parse error"""
	global data, dataIdx
	data = text
	dataIdx = 0
	try:
		return parseNewick()
	finally:
		data = None # Let the (potentially large) text be freed

def applyMrcaNameConvert(name, namesToSwap):
	"""
	Given an mrca* name, makes namesToSwap map it to an expanded version with the form [childName1 + childName2].
	May recurse on child nodes with mrca* names.
	Also returns the name of the highest-tips child (used when recursing).
	"""
	childNames = nodeMap[name]["children"]
	if len(childNames) < 2:
		print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr)
		return name
	# Get 2 children with most tips (ties go to the earlier child)
	childTips = [nodeMap[n]["tips"] for n in childNames]
	maxIdx = childTips.index(max(childTips))
	childTips[maxIdx] = 0 # Exclude the top child from the second search
	maxIdx2 = childTips.index(max(childTips))
	#
	childName1 = childNames[maxIdx]
	childName2 = childNames[maxIdx2]
	if childName1.startswith("mrca"):
		childName1 = applyMrcaNameConvert(childName1, namesToSwap)
	if childName2.startswith("mrca"):
		childName2 = applyMrcaNameConvert(childName2, namesToSwap)
	# Create composite name
	namesToSwap[name] = "[{} + {}]".format(childName1, childName2)
	return childName1

def convertMrcaNames():
	"""Renames every mrca* node in nodeMap to its composite name, and returns the old-to-new name map"""
	namesToSwap = {} # Maps mrca* names to replacement names
	for node in nodeMap.values():
		name = node["n"]
		if name.startswith("mrca") and name not in namesToSwap:
			applyMrcaNameConvert(name, namesToSwap)
	# Re-key the renamed nodes
	for oldName, newName in namesToSwap.items():
		nodeMap[newName] = nodeMap.pop(oldName)
	# Update parent and child references to the renamed nodes
	for node in nodeMap.values():
		node["parent"] = namesToSwap.get(node["parent"], node["parent"])
		childNames = node["children"]
		childNames[:] = [namesToSwap.get(childName, childName) for childName in childNames]
	return namesToSwap

def addAnnotations(nodeAnnsMap):
	"""Sets each node's pSupport using nodeAnnsMap (maps node IDs to annotation objects), and deletes certain fields"""
	for node in nodeMap.values():
		# Set has-support value using annotations
		nodeAnns = nodeAnnsMap.get(node["id"])
		if nodeAnns is not None:
			supportQty = len(nodeAnns.get("supported_by", ()))
			conflictQty = len(nodeAnns.get("conflicts_with", ()))
			node["pSupport"] = supportQty > 0 and conflictQty == 0
		# Root node gets support
		if node["parent"] is None:
			node["pSupport"] = True
		# Delete some no-longer-needed fields
		del node["n"]
		del node["id"]

def writeDb():
	"""Creates dbFile, with one (name, JSON data) row per nodeMap entry"""
	con = sqlite3.connect(dbFile)
	try:
		cur = con.cursor()
		cur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, data TEXT)")
		cur.executemany(
			"INSERT INTO nodes VALUES (?, ?)",
			((name, json.dumps(node)) for name, node in nodeMap.items())
		)
		# The primary key already makes 'name' unique; the index is kept so the schema is unchanged
		cur.execute("CREATE UNIQUE INDEX nodes_idx on nodes(name)")
		con.commit()
	finally:
		con.close() # Close the connection even if a statement fails

def main():
	"""Script entry point: reads the tree and annotations files, and writes the database"""
	if len(sys.argv) > 1:
		print(usageInfo, file=sys.stderr)
		sys.exit(1)
	# Check for existing db
	if os.path.exists(dbFile):
		print("ERROR: Existing {} file".format(dbFile), file=sys.stderr)
		sys.exit(1)
	# Parse treeFile
	with open(treeFile, encoding="utf-8") as file:
		rootName = parseTree(file.read())
	if rootName is None: # Don't build a database from a partially-parsed tree
		print("ERROR: Unable to parse {}".format(treeFile), file=sys.stderr)
		sys.exit(1)
	# Parse annFile
	with open(annFile, encoding="utf-8") as file:
		nodeAnnsMap = json.load(file)["nodes"]
	# Change mrca* names
	convertMrcaNames()
	# Add annotations data, and delete certain fields
	addAnnotations(nodeAnnsMap)
	# Create db
	writeDb()

if __name__ == "__main__":
	main()
