Refactor backend scripts (extended-db)

author: Terry Truong <terry06890@gmail.com> 2022-06-22 23:16:42 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-06-22 23:16:42 +1000
commit: abb936f5d76f7fe5cec1e8948d287da86643d504 (patch)
tree: f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f /backend/data/genOtolData.py
parent: e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff)
1 files changed, 99 insertions, 82 deletions
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 87b35c3..36b6197 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -3,29 +3,33 @@
 import sys, re, os
 import json, sqlite3
 
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release),\n"
-usageInfo += "and creates a sqlite database, which holds entries of the form (name text, data text).\n"
-usageInfo += "Each row holds a tree-of-life node's name, JSON-encoded child name array, a parent name or '',\n"
-usageInfo += "number of descendant 'tips', and a 1 or 0 indicating phylogenetic-support.\n"
-usageInfo += "\n"
-usageInfo += "Expected labelled_supertree_ottnames.tre format:\n"
-usageInfo += "    Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n"
-usageInfo += "    where root node is named n6, and has children n1, n2, and n5.\n"
-usageInfo += "    Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n"
-usageInfo += "    Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n"
-usageInfo += "Expected annotations.json format:\n"
-usageInfo += "    JSON object holding information about the tree-of-life release.\n"
-usageInfo += "    The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
-usageInfo += "    such as phylogenetic trees that support/conflict with it's placement.\n"
-usageInfo += "\n"
-usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n"
-usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads files describing a tree-of-life from an 'Open Tree of Life' release,
+and stores tree information in a database.
+
+Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format:
+    The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6
+		The root node is named n6, and has children n1, n2, and n5.
+    Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753'.
+		'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'.
+		The node with ID 'ott770315' will get the name 'homo sapiens'.
+		A compound node will get a name composed from its sub-nodes (eg: [name1 + name2]).
+	It is possible for multiple nodes to have the same name.
+		In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc.
+Reads an annotations.json file, which is assumed to have this format:
+    Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node,
+    such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that
+	support/conflict with the node's placement.
+Reads from a picked-names file, if present, which specifies name and node ID pairs.
+	These help resolve cases where multiple nodes share the same name.
+"""
 if len(sys.argv) > 1:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
 
-treeFile = "otol/labelled_supertree_ottnames.tre"
+treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes
 annFile = "otol/annotations.json"
 dbFile = "data.db"
 nodeMap = {} # Maps node IDs to node objects
@@ -33,19 +37,32 @@ nameToFirstId = {} # Maps node names to first found ID (names might have multipl
 dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs
 pickedNamesFile = "pickedOtolNames.txt"
 
-# Parse treeFile
+class Node:
+	" Represents a tree-of-life node "
+	def __init__(self, name, childIds, parentId, tips, pSupport):
+		self.name = name
+		self.childIds = childIds
+		self.parentId = parentId
+		self.tips = tips
+		self.pSupport = pSupport
+
 print("Parsing tree file")
+# Read file
 data = None
 with open(treeFile) as file:
 	data = file.read()
 dataIdx = 0
+# Parse content
+iterNum = 0
 def parseNewick():
-	"""Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None"""
-	global dataIdx
+	" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID "
+	global data, dataIdx, iterNum
+	iterNum += 1
+	if iterNum % 1e5 == 0:
+		print(f"At iteration {iterNum}")
 	# Check for EOF
 	if dataIdx == len(data):
-		print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
-		return None
+		raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
 	# Check for node
 	if data[dataIdx] == "(": # parse inner node
 		dataIdx += 1
@@ -53,12 +70,9 @@ def parseNewick():
 		while True:
 			# Read child
 			childId = parseNewick()
-			if childId == None:
-				return None
 			childIds.append(childId)
 			if (dataIdx == len(data)):
-				print("ERROR: Unexpected EOF", file=sys.stderr)
-				return None
+				raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
 			# Check for next child
 			if (data[dataIdx] == ","):
 				dataIdx += 1
@@ -66,33 +80,25 @@ def parseNewick():
 			else:
 				# Get node name and id
 				dataIdx += 1 # Consume an expected ')'
-				[name, id] = parseNewickName()
+				name, id = parseNewickName()
 				updateNameMaps(name, id)
 				# Get child num-tips total
 				tips = 0
 				for childId in childIds:
-					tips += nodeMap[childId]["tips"]
+					tips += nodeMap[childId].tips
 				# Add node to nodeMap
-				nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False}
+				nodeMap[id] = Node(name, childIds, None, tips, False)
 				# Update childrens' parent reference
 				for childId in childIds:
-					nodeMap[childId]["parent"] = id
+					nodeMap[childId].parentId = id
 				return id
 	else: # Parse node name
-		[name, id] = parseNewickName()
+		name, id = parseNewickName()
 		updateNameMaps(name, id)
-		nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False}
+		nodeMap[id] = Node(name, [], None, 1, False)
 		return id
-def updateNameMaps(name, id):
-	if name not in nameToFirstId:
-		nameToFirstId[name] = id
-	else:
-		if name not in dupNameToIds:
-			dupNameToIds[name] = [nameToFirstId[name], id]
-		else:
-			dupNameToIds[name].append(id)
 def parseNewickName():
-	"""Helper that parses an input node name, and returns a [name,id] pair"""
+	" Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair "
 	global data, dataIdx
 	name = None
 	end = dataIdx
@@ -102,7 +108,7 @@ def parseNewickName():
 		inQuote = True
 		while end < len(data):
 			if (data[end] == "'"):
-				if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
+				if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote
 					end += 2
 					continue
 				else:
@@ -111,75 +117,86 @@ def parseNewickName():
 					break
 			end += 1
 		if inQuote:
-			raise Exception("ERROR: Unexpected EOF")
+			raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
 		name = data[dataIdx:end]
 		dataIdx = end
 	else:
 		while end < len(data) and not re.match(r"[(),]", data[end]):
 			end += 1
 		if (end == dataIdx):
-			raise Exception("ERROR: Unexpected EOF")
+			raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
 		name = data[dataIdx:end].rstrip()
 		if end == len(data): # Ignore trailing input semicolon
 			name = name[:-1]
 		dataIdx = end
-	# Convert to [name, id]
+	# Convert to (name, id)
 	name = name.lower()
 	if name.startswith("mrca"):
-		return [name, name]
+		return (name, name)
 	elif name[0] == "'":
 		match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
 		if match == None:
 			raise Exception(f"ERROR: invalid name \"{name}\"")
 		name = match.group(1).replace("''", "'")
-		return [name, match.group(2)]
+		return (name, match.group(2))
 	else:
 		match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
 		if match == None:
 			raise Exception(f"ERROR: invalid name \"{name}\"")
-		return [match.group(1).replace("_", " "), match.group(2)]
+		return (match.group(1).replace("_", " "), match.group(2))
+def updateNameMaps(name, id):
+	global nameToFirstId, dupNameToIds
+	if name not in nameToFirstId:
+		nameToFirstId[name] = id
+	else:
+		if name not in dupNameToIds:
+			dupNameToIds[name] = [nameToFirstId[name], id]
+		else:
+			dupNameToIds[name].append(id)
 rootId = parseNewick()
-# Resolve duplicate names
-print("Resolving duplicates")
+
+print("Resolving duplicate names")
+# Read picked-names file
 nameToPickedId = {}
 if os.path.exists(pickedNamesFile):
 	with open(pickedNamesFile) as file:
 		for line in file:
 			(name, _, otolId) = line.rstrip().partition("|")
 			nameToPickedId[name] = otolId
-for [dupName, ids] in dupNameToIds.items():
+# Resolve duplicates
+for (dupName, ids) in dupNameToIds.items():
 	# Check for picked id
 	if dupName in nameToPickedId:
 		idToUse = nameToPickedId[dupName]
 	else:
 		# Get conflicting node with most tips
-		tipNums = [nodeMap[id]["tips"] for id in ids]
+		tipNums = [nodeMap[id].tips for id in ids]
 		maxIdx = tipNums.index(max(tipNums))
 		idToUse = ids[maxIdx]
 	# Adjust name of other conflicting nodes
 	counter = 2
 	for id in ids:
 		if id != idToUse:
-			nodeMap[id]["name"] += " [" + str(counter)+ "]"
+			nodeMap[id].name += f" [{counter}]"
 			counter += 1
-# Change mrca* names
+
 print("Changing mrca* names")
 def convertMrcaName(id):
 	node = nodeMap[id]
-	name = node["name"]
-	childIds = node["children"]
+	name = node.name
+	childIds = node.childIds
 	if len(childIds) < 2:
-		print(f"WARNING: MRCA node \"{name}\" has less than 2 children", file=sys.stderr)
+		print(f"WARNING: MRCA node \"{name}\" has less than 2 children")
 		return
 	# Get 2 children with most tips
-	childTips = [nodeMap[id]["tips"] for id in childIds]
-	maxIdx = childTips.index(max(childTips))
-	childTips[maxIdx] = 0
+	childTips = [nodeMap[id].tips for id in childIds]
+	maxIdx1 = childTips.index(max(childTips))
+	childTips[maxIdx1] = 0
 	maxIdx2 = childTips.index(max(childTips))
-	childId1 = childIds[maxIdx]
+	childId1 = childIds[maxIdx1]
 	childId2 = childIds[maxIdx2]
-	childName1 = nodeMap[childId1]["name"]
-	childName2 = nodeMap[childId2]["name"]
+	childName1 = nodeMap[childId1].name
+	childName2 = nodeMap[childId2].name
 	# Check for mrca* child names
 	if childName1.startswith("mrca"):
 		childName1 = convertMrcaName(childId1)
@@ -193,44 +210,44 @@ def convertMrcaName(id):
 	if match != None:
 		childName2 = match.group(1)
 	# Create composite name
-	node["name"] = f"[{childName1} + {childName2}]"
+	node.name = f"[{childName1} + {childName2}]"
 	return childName1
-for [id, node] in nodeMap.items():
-	if node["name"].startswith("mrca"):
+for (id, node) in nodeMap.items():
+	if node.name.startswith("mrca"):
 		convertMrcaName(id)
-# Parse annFile
+
 print("Parsing annotations file")
+# Read file
 data = None
 with open(annFile) as file:
 	data = file.read()
 obj = json.loads(data)
-nodeAnnsMap = obj['nodes']
-# Add annotations data
-print("Adding annotation data")
-for [id, node] in nodeMap.items():
+nodeAnnsMap = obj["nodes"]
+# Find relevant annotations
+for (id, node) in nodeMap.items():
 	# Set has-support value using annotations
 	if id in nodeAnnsMap:
 		nodeAnns = nodeAnnsMap[id]
 		supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0
 		conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0
-		node["pSupport"] = supportQty > 0 and conflictQty == 0
+		node.pSupport = supportQty > 0 and conflictQty == 0
 	# Root node gets support
-	if node["parent"] == None:
-		node["pSupport"] = True
-# Create db
+	if node.parentId == None:
+		node.pSupport = True
+
 print("Creating nodes and edges tables")
 dbCon = sqlite3.connect(dbFile)
 dbCur = dbCon.cursor()
 dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
 dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))")
 dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)")
 for (otolId, node) in nodeMap.items():
-	dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node["name"], otolId, node["tips"]))
-	childIds = node["children"]
-	for childId in childIds:
+	dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips))
+	for childId in node.childIds:
 		childNode = nodeMap[childId]
 		dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)",
-			(node["name"], childNode["name"], 1 if childNode["pSupport"] else 0))
+			(node.name, childNode.name, 1 if childNode.pSupport else 0))
+print("Closing database")
 dbCon.commit()
 dbCon.close()
author	Terry Truong <terry06890@gmail.com>	2022-06-22 23:16:42 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-06-22 23:16:42 +1000
commit	abb936f5d76f7fe5cec1e8948d287da86643d504 (patch)
tree	f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f /backend/data/genOtolData.py
parent	e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff)