about | summary | refs | log | tree | commit | diff
path: root/backend/data/genOtolData.py
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com> 2022-06-22 23:16:42 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-06-22 23:16:42 +1000
commit: abb936f5d76f7fe5cec1e8948d287da86643d504 (patch)
tree: f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f /backend/data/genOtolData.py
parent: e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff)
Refactor backend scripts (extended-db)
Diffstat (limited to 'backend/data/genOtolData.py')
-rwxr-xr-x backend/data/genOtolData.py 181
1 files changed, 99 insertions, 82 deletions
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 87b35c3..36b6197 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -3,29 +3,33 @@
import sys, re, os
import json, sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release),\n"
-usageInfo += "and creates a sqlite database, which holds entries of the form (name text, data text).\n"
-usageInfo += "Each row holds a tree-of-life node's name, JSON-encoded child name array, a parent name or '',\n"
-usageInfo += "number of descendant 'tips', and a 1 or 0 indicating phylogenetic-support.\n"
-usageInfo += "\n"
-usageInfo += "Expected labelled_supertree_ottnames.tre format:\n"
-usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n"
-usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n"
-usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n"
-usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n"
-usageInfo += "Expected annotations.json format:\n"
-usageInfo += " JSON object holding information about the tree-of-life release.\n"
-usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
-usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n"
-usageInfo += "\n"
-usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n"
-usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads files describing a tree-of-life from an 'Open Tree of Life' release,
+and stores tree information in a database.
+
+Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format:
+ The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6
+ The root node is named n6, and has children n1, n2, and n5.
+ Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753',
+ 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'.
+ The node with ID 'ott770315' will get the name 'homo sapiens'.
+ A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]).
+ It is possible for multiple nodes to have the same name.
+ In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc.
+Reads an annotations.json file, which is assumed to have this format:
+ Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node,
+ such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that
+ support/conflict with the node's placement.
+Reads from a picked-names file, if present, which specifies name and node ID pairs.
+ These help resolve cases where multiple nodes share the same name.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-treeFile = "otol/labelled_supertree_ottnames.tre"
+treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes
annFile = "otol/annotations.json"
dbFile = "data.db"
nodeMap = {} # Maps node IDs to node objects
@@ -33,19 +37,32 @@ nameToFirstId = {} # Maps node names to first found ID (names might have multipl
dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs
pickedNamesFile = "pickedOtolNames.txt"
-# Parse treeFile
+class Node:
+ " Represents a tree-of-life node "
+ def __init__(self, name, childIds, parentId, tips, pSupport):
+ self.name = name
+ self.childIds = childIds
+ self.parentId = parentId
+ self.tips = tips
+ self.pSupport = pSupport
+
print("Parsing tree file")
+# Read file
data = None
with open(treeFile) as file:
data = file.read()
dataIdx = 0
+# Parse content
+iterNum = 0
def parseNewick():
- """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None"""
- global dataIdx
+ " Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID "
+ global data, dataIdx, iterNum
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f"At iteration {iterNum}")
# Check for EOF
if dataIdx == len(data):
- print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
- return None
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
# Check for node
if data[dataIdx] == "(": # parse inner node
dataIdx += 1
@@ -53,12 +70,9 @@ def parseNewick():
while True:
# Read child
childId = parseNewick()
- if childId == None:
- return None
childIds.append(childId)
if (dataIdx == len(data)):
- print("ERROR: Unexpected EOF", file=sys.stderr)
- return None
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
# Check for next child
if (data[dataIdx] == ","):
dataIdx += 1
@@ -66,33 +80,25 @@ def parseNewick():
else:
# Get node name and id
dataIdx += 1 # Consume an expected ')'
- [name, id] = parseNewickName()
+ name, id = parseNewickName()
updateNameMaps(name, id)
# Get child num-tips total
tips = 0
for childId in childIds:
- tips += nodeMap[childId]["tips"]
+ tips += nodeMap[childId].tips
# Add node to nodeMap
- nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False}
+ nodeMap[id] = Node(name, childIds, None, tips, False)
# Update childrens' parent reference
for childId in childIds:
- nodeMap[childId]["parent"] = id
+ nodeMap[childId].parentId = id
return id
else: # Parse node name
- [name, id] = parseNewickName()
+ name, id = parseNewickName()
updateNameMaps(name, id)
- nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False}
+ nodeMap[id] = Node(name, [], None, 1, False)
return id
-def updateNameMaps(name, id):
- if name not in nameToFirstId:
- nameToFirstId[name] = id
- else:
- if name not in dupNameToIds:
- dupNameToIds[name] = [nameToFirstId[name], id]
- else:
- dupNameToIds[name].append(id)
def parseNewickName():
- """Helper that parses an input node name, and returns a [name,id] pair"""
+ " Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair "
global data, dataIdx
name = None
end = dataIdx
@@ -102,7 +108,7 @@ def parseNewickName():
inQuote = True
while end < len(data):
if (data[end] == "'"):
- if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
+ if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote
end += 2
continue
else:
@@ -111,75 +117,86 @@ def parseNewickName():
break
end += 1
if inQuote:
- raise Exception("ERROR: Unexpected EOF")
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
name = data[dataIdx:end]
dataIdx = end
else:
while end < len(data) and not re.match(r"[(),]", data[end]):
end += 1
if (end == dataIdx):
- raise Exception("ERROR: Unexpected EOF")
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
name = data[dataIdx:end].rstrip()
if end == len(data): # Ignore trailing input semicolon
name = name[:-1]
dataIdx = end
- # Convert to [name, id]
+ # Convert to (name, id)
name = name.lower()
if name.startswith("mrca"):
- return [name, name]
+ return (name, name)
elif name[0] == "'":
match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
if match == None:
raise Exception(f"ERROR: invalid name \"{name}\"")
name = match.group(1).replace("''", "'")
- return [name, match.group(2)]
+ return (name, match.group(2))
else:
match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
if match == None:
raise Exception(f"ERROR: invalid name \"{name}\"")
- return [match.group(1).replace("_", " "), match.group(2)]
+ return (match.group(1).replace("_", " "), match.group(2))
+def updateNameMaps(name, id):
+ global nameToFirstId, dupNameToIds
+ if name not in nameToFirstId:
+ nameToFirstId[name] = id
+ else:
+ if name not in dupNameToIds:
+ dupNameToIds[name] = [nameToFirstId[name], id]
+ else:
+ dupNameToIds[name].append(id)
rootId = parseNewick()
-# Resolve duplicate names
-print("Resolving duplicates")
+
+print("Resolving duplicate names")
+# Read picked-names file
nameToPickedId = {}
if os.path.exists(pickedNamesFile):
with open(pickedNamesFile) as file:
for line in file:
(name, _, otolId) = line.rstrip().partition("|")
nameToPickedId[name] = otolId
-for [dupName, ids] in dupNameToIds.items():
+# Resolve duplicates
+for (dupName, ids) in dupNameToIds.items():
# Check for picked id
if dupName in nameToPickedId:
idToUse = nameToPickedId[dupName]
else:
# Get conflicting node with most tips
- tipNums = [nodeMap[id]["tips"] for id in ids]
+ tipNums = [nodeMap[id].tips for id in ids]
maxIdx = tipNums.index(max(tipNums))
idToUse = ids[maxIdx]
# Adjust name of other conflicting nodes
counter = 2
for id in ids:
if id != idToUse:
- nodeMap[id]["name"] += " [" + str(counter)+ "]"
+ nodeMap[id].name += f" [{counter}]"
counter += 1
-# Change mrca* names
+
print("Changing mrca* names")
def convertMrcaName(id):
node = nodeMap[id]
- name = node["name"]
- childIds = node["children"]
+ name = node.name
+ childIds = node.childIds
if len(childIds) < 2:
- print(f"WARNING: MRCA node \"{name}\" has less than 2 children", file=sys.stderr)
+ print(f"WARNING: MRCA node \"{name}\" has less than 2 children")
return
# Get 2 children with most tips
- childTips = [nodeMap[id]["tips"] for id in childIds]
- maxIdx = childTips.index(max(childTips))
- childTips[maxIdx] = 0
+ childTips = [nodeMap[id].tips for id in childIds]
+ maxIdx1 = childTips.index(max(childTips))
+ childTips[maxIdx1] = 0
maxIdx2 = childTips.index(max(childTips))
- childId1 = childIds[maxIdx]
+ childId1 = childIds[maxIdx1]
childId2 = childIds[maxIdx2]
- childName1 = nodeMap[childId1]["name"]
- childName2 = nodeMap[childId2]["name"]
+ childName1 = nodeMap[childId1].name
+ childName2 = nodeMap[childId2].name
# Check for mrca* child names
if childName1.startswith("mrca"):
childName1 = convertMrcaName(childId1)
@@ -193,44 +210,44 @@ def convertMrcaName(id):
if match != None:
childName2 = match.group(1)
# Create composite name
- node["name"] = f"[{childName1} + {childName2}]"
+ node.name = f"[{childName1} + {childName2}]"
return childName1
-for [id, node] in nodeMap.items():
- if node["name"].startswith("mrca"):
+for (id, node) in nodeMap.items():
+ if node.name.startswith("mrca"):
convertMrcaName(id)
-# Parse annFile
+
print("Parsing annotations file")
+# Read file
data = None
with open(annFile) as file:
data = file.read()
obj = json.loads(data)
-nodeAnnsMap = obj['nodes']
-# Add annotations data
-print("Adding annotation data")
-for [id, node] in nodeMap.items():
+nodeAnnsMap = obj["nodes"]
+# Find relevant annotations
+for (id, node) in nodeMap.items():
# Set has-support value using annotations
if id in nodeAnnsMap:
nodeAnns = nodeAnnsMap[id]
supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0
conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0
- node["pSupport"] = supportQty > 0 and conflictQty == 0
+ node.pSupport = supportQty > 0 and conflictQty == 0
# Root node gets support
- if node["parent"] == None:
- node["pSupport"] = True
-# Create db
+ if node.parentId == None:
+ node.pSupport = True
+
print("Creating nodes and edges tables")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))")
dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)")
for (otolId, node) in nodeMap.items():
- dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node["name"], otolId, node["tips"]))
- childIds = node["children"]
- for childId in childIds:
+ dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips))
+ for childId in node.childIds:
childNode = nodeMap[childId]
dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)",
- (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0))
+ (node.name, childNode.name, 1 if childNode.pSupport else 0))
+print("Closing database")
dbCon.commit()
dbCon.close()