diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
| commit | abb936f5d76f7fe5cec1e8948d287da86643d504 (patch) | |
| tree | f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f /backend/data/genOtolData.py | |
| parent | e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff) | |
Refactor backend scripts (extended-db)
Diffstat (limited to 'backend/data/genOtolData.py')
| -rwxr-xr-x | backend/data/genOtolData.py | 181 |
1 files changed, 99 insertions, 82 deletions
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py index 87b35c3..36b6197 100755 --- a/backend/data/genOtolData.py +++ b/backend/data/genOtolData.py @@ -3,29 +3,33 @@ import sys, re, os import json, sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release),\n" -usageInfo += "and creates a sqlite database, which holds entries of the form (name text, data text).\n" -usageInfo += "Each row holds a tree-of-life node's name, JSON-encoded child name array, a parent name or '',\n" -usageInfo += "number of descendant 'tips', and a 1 or 0 indicating phylogenetic-support.\n" -usageInfo += "\n" -usageInfo += "Expected labelled_supertree_ottnames.tre format:\n" -usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n" -usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n" -usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n" -usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n" -usageInfo += "Expected annotations.json format:\n" -usageInfo += " JSON object holding information about the tree-of-life release.\n" -usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n" -usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n" -usageInfo += "\n" -usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n" -usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads files describing a tree-of-life from an 'Open Tree of Life' release, +and stores tree information in a database. 
+ +Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: + The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 + The root node is named n6, and has children n1, n2, and n5. + Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', + 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. + The node with ID 'ott770315' will get the name 'homo sapiens'. + A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). + It is possible for multiple nodes to have the same name. + In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. +Reads an annotations.json file, which is assumed to have this format: + Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, + such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that + support/conflict with the node's placement. +Reads from a picked-names file, if present, which specifies name and node ID pairs. + These help resolve cases where multiple nodes share the same name. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -treeFile = "otol/labelled_supertree_ottnames.tre" +treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes annFile = "otol/annotations.json" dbFile = "data.db" nodeMap = {} # Maps node IDs to node objects @@ -33,19 +37,32 @@ nameToFirstId = {} # Maps node names to first found ID (names might have multipl dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs pickedNamesFile = "pickedOtolNames.txt" -# Parse treeFile +class Node: + " Represents a tree-of-life node " + def __init__(self, name, childIds, parentId, tips, pSupport): + self.name = name + self.childIds = childIds + self.parentId = parentId + self.tips = tips + self.pSupport = pSupport + print("Parsing tree file") +# Read file data = None with open(treeFile) as file: data = file.read() dataIdx = 0 +# Parse content +iterNum = 0 def parseNewick(): - """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None""" - global dataIdx + " Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID " + global data, dataIdx, iterNum + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"At iteration {iterNum}") # Check for EOF if dataIdx == len(data): - print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr) - return None + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") # Check for node if data[dataIdx] == "(": # parse inner node dataIdx += 1 @@ -53,12 +70,9 @@ def parseNewick(): while True: # Read child childId = parseNewick() - if childId == None: - return None childIds.append(childId) if (dataIdx == len(data)): - print("ERROR: Unexpected EOF", file=sys.stderr) - return None + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") # Check for next child if (data[dataIdx] == ","): dataIdx += 1 @@ -66,33 +80,25 @@ def parseNewick(): else: # Get node name and id dataIdx += 1 # Consume an expected 
')' - [name, id] = parseNewickName() + name, id = parseNewickName() updateNameMaps(name, id) # Get child num-tips total tips = 0 for childId in childIds: - tips += nodeMap[childId]["tips"] + tips += nodeMap[childId].tips # Add node to nodeMap - nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False} + nodeMap[id] = Node(name, childIds, None, tips, False) # Update childrens' parent reference for childId in childIds: - nodeMap[childId]["parent"] = id + nodeMap[childId].parentId = id return id else: # Parse node name - [name, id] = parseNewickName() + name, id = parseNewickName() updateNameMaps(name, id) - nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False} + nodeMap[id] = Node(name, [], None, 1, False) return id -def updateNameMaps(name, id): - if name not in nameToFirstId: - nameToFirstId[name] = id - else: - if name not in dupNameToIds: - dupNameToIds[name] = [nameToFirstId[name], id] - else: - dupNameToIds[name].append(id) def parseNewickName(): - """Helper that parses an input node name, and returns a [name,id] pair""" + " Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair " global data, dataIdx name = None end = dataIdx @@ -102,7 +108,7 @@ def parseNewickName(): inQuote = True while end < len(data): if (data[end] == "'"): - if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote + if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote end += 2 continue else: @@ -111,75 +117,86 @@ def parseNewickName(): break end += 1 if inQuote: - raise Exception("ERROR: Unexpected EOF") + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") name = data[dataIdx:end] dataIdx = end else: while end < len(data) and not re.match(r"[(),]", data[end]): end += 1 if (end == dataIdx): - raise Exception("ERROR: Unexpected EOF") + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") name = data[dataIdx:end].rstrip() 
if end == len(data): # Ignore trailing input semicolon name = name[:-1] dataIdx = end - # Convert to [name, id] + # Convert to (name, id) name = name.lower() if name.startswith("mrca"): - return [name, name] + return (name, name) elif name[0] == "'": match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) if match == None: raise Exception(f"ERROR: invalid name \"{name}\"") name = match.group(1).replace("''", "'") - return [name, match.group(2)] + return (name, match.group(2)) else: match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) if match == None: raise Exception(f"ERROR: invalid name \"{name}\"") - return [match.group(1).replace("_", " "), match.group(2)] + return (match.group(1).replace("_", " "), match.group(2)) +def updateNameMaps(name, id): + global nameToFirstId, dupNameToIds + if name not in nameToFirstId: + nameToFirstId[name] = id + else: + if name not in dupNameToIds: + dupNameToIds[name] = [nameToFirstId[name], id] + else: + dupNameToIds[name].append(id) rootId = parseNewick() -# Resolve duplicate names -print("Resolving duplicates") + +print("Resolving duplicate names") +# Read picked-names file nameToPickedId = {} if os.path.exists(pickedNamesFile): with open(pickedNamesFile) as file: for line in file: (name, _, otolId) = line.rstrip().partition("|") nameToPickedId[name] = otolId -for [dupName, ids] in dupNameToIds.items(): +# Resolve duplicates +for (dupName, ids) in dupNameToIds.items(): # Check for picked id if dupName in nameToPickedId: idToUse = nameToPickedId[dupName] else: # Get conflicting node with most tips - tipNums = [nodeMap[id]["tips"] for id in ids] + tipNums = [nodeMap[id].tips for id in ids] maxIdx = tipNums.index(max(tipNums)) idToUse = ids[maxIdx] # Adjust name of other conflicting nodes counter = 2 for id in ids: if id != idToUse: - nodeMap[id]["name"] += " [" + str(counter)+ "]" + nodeMap[id].name += f" [{counter}]" counter += 1 -# Change mrca* names + print("Changing mrca* names") def convertMrcaName(id): node = nodeMap[id] - name 
= node["name"] - childIds = node["children"] + name = node.name + childIds = node.childIds if len(childIds) < 2: - print(f"WARNING: MRCA node \"{name}\" has less than 2 children", file=sys.stderr) + print(f"WARNING: MRCA node \"{name}\" has less than 2 children") return # Get 2 children with most tips - childTips = [nodeMap[id]["tips"] for id in childIds] - maxIdx = childTips.index(max(childTips)) - childTips[maxIdx] = 0 + childTips = [nodeMap[id].tips for id in childIds] + maxIdx1 = childTips.index(max(childTips)) + childTips[maxIdx1] = 0 maxIdx2 = childTips.index(max(childTips)) - childId1 = childIds[maxIdx] + childId1 = childIds[maxIdx1] childId2 = childIds[maxIdx2] - childName1 = nodeMap[childId1]["name"] - childName2 = nodeMap[childId2]["name"] + childName1 = nodeMap[childId1].name + childName2 = nodeMap[childId2].name # Check for mrca* child names if childName1.startswith("mrca"): childName1 = convertMrcaName(childId1) @@ -193,44 +210,44 @@ def convertMrcaName(id): if match != None: childName2 = match.group(1) # Create composite name - node["name"] = f"[{childName1} + {childName2}]" + node.name = f"[{childName1} + {childName2}]" return childName1 -for [id, node] in nodeMap.items(): - if node["name"].startswith("mrca"): +for (id, node) in nodeMap.items(): + if node.name.startswith("mrca"): convertMrcaName(id) -# Parse annFile + print("Parsing annotations file") +# Read file data = None with open(annFile) as file: data = file.read() obj = json.loads(data) -nodeAnnsMap = obj['nodes'] -# Add annotations data -print("Adding annotation data") -for [id, node] in nodeMap.items(): +nodeAnnsMap = obj["nodes"] +# Find relevant annotations +for (id, node) in nodeMap.items(): # Set has-support value using annotations if id in nodeAnnsMap: nodeAnns = nodeAnnsMap[id] supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0 conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0 - node["pSupport"] = supportQty > 0 and 
conflictQty == 0 + node.pSupport = supportQty > 0 and conflictQty == 0 # Root node gets support - if node["parent"] == None: - node["pSupport"] = True -# Create db + if node.parentId == None: + node.pSupport = True + print("Creating nodes and edges tables") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") +dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)") for (otolId, node) in nodeMap.items(): - dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node["name"], otolId, node["tips"])) - childIds = node["children"] - for childId in childIds: + dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips)) + for childId in node.childIds: childNode = nodeMap[childId] dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)", - (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0)) + (node.name, childNode.name, 1 if childNode.pSupport else 0)) +print("Closing database") dbCon.commit() dbCon.close() |
