aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r--backend/data/eol/README.md0
-rw-r--r--backend/data/otol/README.md0
-rwxr-xr-xbackend/data/otolToSqlite.py228
-rwxr-xr-xbackend/server.py142
4 files changed, 370 insertions, 0 deletions
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/data/eol/README.md
diff --git a/backend/data/otol/README.md b/backend/data/otol/README.md
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/data/otol/README.md
diff --git a/backend/data/otolToSqlite.py b/backend/data/otolToSqlite.py
new file mode 100755
index 0000000..187e224
--- /dev/null
+++ b/backend/data/otolToSqlite.py
@@ -0,0 +1,228 @@
+#!/usr/bin/python3
+
+import sys, re, json, sqlite3
+import os.path
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release), \n"
+usageInfo += "and creates an sqlite database otol.db, which holds entries of the form (name text, data text).\n"
+usageInfo += "Each row holds a tree-of-life node name, and a JSON string with the form \n"
+usageInfo += "{\"children\": [name1, ...], \"parent\": name1, \"tips\": int1, \"pSupport\": bool1}, holding \n"
+usageInfo += "child names, a parent name or null, descendant 'tips', and a phylogeny-support indicator\n"
+usageInfo += "\n"
+usageInfo += "This script was adapted to handle Open Tree of Life version 13.4.\n"
+usageInfo += "Link: https://tree.opentreeoflife.org/about/synthesis-release/v13.4\n"
+usageInfo += "\n"
+usageInfo += "labelled_supertree_ottnames.tre format:\n"
+usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n"
+usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n"
+usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n"
+usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n"
+usageInfo += "annotations.json format:\n"
+usageInfo += " JSON object holding information about the tree-of-life release.\n"
+usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
+usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n"
+
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+treeFile = "otol/labelled_supertree_ottnames.tre"
+annFile = "otol/annotations.json"
+dbFile = "otol.db"
+nodeMap = {} # Maps node names to node objects
+idToName = {} # Maps node IDs to names
+
+# Check for existing db
+if os.path.exists(dbFile):
+ print("ERROR: Existing {} file".format(dbFile), file=sys.stderr)
+ sys.exit(1)
+
+# Parse treeFile
+data = None
+with open(treeFile) as file:
+ data = file.read()
+dataIdx = 0
+def parseNewick():
+ """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None"""
+ global dataIdx
+ # Check for EOF
+ if dataIdx == len(data):
+ print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
+ return None
+ # Check for inner-node start
+ if data[dataIdx] == "(":
+ dataIdx += 1
+ childNames = []
+ while True:
+ # Read child
+ childName = parseNewick()
+ if childName == None:
+ return None
+ childNames.append(childName)
+ if (dataIdx == len(data)):
+ print("ERROR: Unexpected EOF", file=sys.stderr)
+ return None
+ # Check for next child
+ if (data[dataIdx] == ","):
+ dataIdx += 1
+ continue
+ else:
+ # Get node name
+ dataIdx += 1 # Consume an expected ')'
+ [name, id] = parseNewickName()
+ idToName[id] = name
+ # Get child num-tips total
+ tips = 0
+ for childName in childNames:
+ tips += nodeMap[childName]["tips"]
+ # Add node to nodeMap
+ if name in nodeMap: # Turns out the names might not actually be unique
+ count = 2
+ name2 = name + " [" + str(count) + "]"
+ while name2 in nodeMap:
+ count += 1
+ name2 = name + " [" + str(count) + "]"
+ name = name2
+ nodeMap[name] = {
+ "n": name, "id": id, "children": childNames, "parent": None, "tips": tips, "pSupport": False
+ }
+ # Update childrens' parent reference
+ for childName in childNames:
+ nodeMap[childName]["parent"] = name
+ return name
+ else:
+ [name, id] = parseNewickName()
+ idToName[id] = name
+ nodeMap[name] = {"n": name, "id": id, "children": [], "parent": None, "tips": 1, "pSupport": False}
+ return name
+def parseNewickName():
+ """Helper that parses an input node name, and returns a [name,id] pair"""
+ global data, dataIdx
+ name = None
+ end = dataIdx
+ # Get name
+ if (end < len(data) and data[end] == "'"): # Check for quoted name
+ end += 1
+ inQuote = True
+ while end < len(data):
+ if (data[end] == "'"):
+ if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
+ end += 2
+ continue
+ else:
+ end += 1
+ inQuote = False
+ break
+ end += 1
+ if inQuote:
+ raise Exception("ERROR: Unexpected EOF")
+ name = data[dataIdx:end]
+ dataIdx = end
+ else:
+ while end < len(data) and not re.match(r"[(),]", data[end]):
+ end += 1
+ if (end == dataIdx):
+ raise Exception("ERROR: Unexpected EOF")
+ name = data[dataIdx:end].rstrip()
+ if end == len(data): # Ignore trailing input semicolon
+ name = name[:-1]
+ dataIdx = end
+ # Convert to [name, id]
+ name = name.lower()
+ if name.startswith("mrca"):
+ return [name, name]
+ elif name[0] == "'":
+ match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
+ if match == None:
+ raise Exception("ERROR: invalid name \"{}\"".format(name))
+ name = match.group(1).replace("''", "'")
+ return [name, match.group(2)]
+ else:
+ match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
+ if match == None:
+ raise Exception("ERROR: invalid name \"{}\"".format(name))
+ return [match.group(1).replace("_", " "), match.group(2)]
+rootName = parseNewick()
+
+# Parse annFile
+data = None
+with open(annFile) as file:
+ data = file.read()
+obj = json.loads(data)
+nodeAnnsMap = obj['nodes']
+
+# Change mrca* names
+def applyMrcaNameConvert(name, namesToSwap):
+ """
+ Given an mrca* name, makes namesToSwap map it to an expanded version with the form [childName1 + childName2].
+ May recurse on child nodes with mrca* names.
+ Also returns the name of the highest-tips child (used when recursing).
+ """
+ node = nodeMap[name]
+ childNames = node["children"]
+ if len(childNames) < 2:
+ print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr)
+ return name
+ # Get 2 children with most tips
+ childTips = []
+ for n in childNames:
+ childTips.append(nodeMap[n]["tips"])
+ maxTips = max(childTips)
+ maxIdx = childTips.index(maxTips)
+ childTips[maxIdx] = 0
+ maxTips2 = max(childTips)
+ maxIdx2 = childTips.index(maxTips2)
+ #
+ childName1 = node["children"][maxIdx]
+ childName2 = node["children"][maxIdx2]
+ if childName1.startswith("mrca"):
+ childName1 = applyMrcaNameConvert(childName1, namesToSwap)
+ if childName2.startswith("mrca"):
+ childName2 = applyMrcaNameConvert(childName2, namesToSwap)
+ # Create composite name
+ namesToSwap[name] = "[{} + {}]".format(childName1, childName2)
+ return childName1
+namesToSwap = {} # Maps mrca* names to replacement names
+for node in nodeMap.values():
+ name = node["n"]
+ if (name.startswith("mrca") and name not in namesToSwap):
+ applyMrcaNameConvert(name, namesToSwap)
+for [oldName, newName] in namesToSwap.items():
+ nodeMap[newName] = nodeMap[oldName]
+ del nodeMap[oldName]
+for node in nodeMap.values():
+ parentName = node["parent"]
+ if (parentName in namesToSwap):
+ node["parent"] = namesToSwap[parentName]
+ childNames = node["children"]
+ for i in range(len(childNames)):
+ childName = childNames[i]
+ if (childName in namesToSwap):
+ childNames[i] = namesToSwap[childName]
+
+# Add annotations data, and delete certain fields
+for node in nodeMap.values():
+ # Set has-support value using annotations
+ id = node["id"]
+ if id in nodeAnnsMap:
+ nodeAnns = nodeAnnsMap[id]
+ supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0
+ conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0
+ node["pSupport"] = supportQty > 0 and conflictQty == 0
+ # Root node gets support
+ if node["parent"] == None:
+ node["pSupport"] = True
+ # Delete some no-longer-needed fields
+ del node["n"]
+ del node["id"]
+
+# Create db
+con = sqlite3.connect(dbFile)
+cur = con.cursor()
+cur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, data TEXT)")
+for name in nodeMap.keys():
+ cur.execute("INSERT INTO nodes VALUES (?, ?)", (name, json.dumps(nodeMap[name])))
+cur.execute("CREATE UNIQUE INDEX nodes_idx on nodes(name)")
+con.commit()
+con.close()
diff --git a/backend/server.py b/backend/server.py
new file mode 100755
index 0000000..2ff7e74
--- /dev/null
+++ b/backend/server.py
@@ -0,0 +1,142 @@
+#!/usr/bin/python3
+
+import sys, re, sqlite3, json
+from http.server import HTTPServer, BaseHTTPRequestHandler
+import urllib.parse
+
+hostname = "localhost"
+port = 8000
+dbFile = "data/otol.db"
+tolnodeReqDepth = 2
+ # For a /tolnode/name1 request, respond with name1's node, and descendent nodes in a subtree to some depth
+ # A depth of 0 means only respond with one node
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Starts a server that listens for GET requests to " + hostname + ":" + str(port) + "/tolnode/name1,\n"
+usageInfo += "and responds with JSON representing an object mapping names to node objects.\n"
+usageInfo += "Normally, the response includes node name1, and child nodes up to depth " + str(tolnodeReqDepth) + ".\n"
+usageInfo += "\n"
+usageInfo += "A query section ?type=type1 can be used to affect the result.\n"
+usageInfo += "If type1 is 'children', the operation acts on node name1's children, and the results combined.\n"
+usageInfo += "If type1 is 'chain', the response includes nodes from name1 up to the root, and their direct children.\n"
+
+dbCon = sqlite3.connect(dbFile)
+def lookupName(name):
+ cur = dbCon.cursor()
+ cur.execute("SELECT name, data FROM nodes WHERE name = ?", (name,))
+ row = cur.fetchone()
+ return row[1] if row != None else None
+#def lookupNameLike(name):
+# cur = dbCon.cursor()
+# found = False
+# cur.execute("SELECT name, data FROM nodes WHERE name like ?", ("%{}%".format(name),))
+# rows = cur.fetchall()
+# if len(rows) == 0:
+# return None
+# else:
+# jsonStr = "{"
+# for i in range(len(rows)):
+# jsonStr += json.dumps(rows[i][0]) + ":" + rows[i][1]
+# if i < len(njList) - 1:
+# jsonStr += ","
+# jsonStr += "}"
+# return jsonFromNameJsonList(rows)
+
+class DbServer(BaseHTTPRequestHandler):
+ def do_GET(self):
+ # Parse URL
+ urlParts = urllib.parse.urlparse(self.path)
+ path = urllib.parse.unquote(urlParts.path)
+ queryDict = urllib.parse.parse_qs(urlParts.query)
+ # Check first element of path
+ match = re.match(r"/([^/]+)/(.+)", path)
+ if match != None:
+ reqType = match.group(1)
+ if reqType == "tolnode":
+ name = match.group(2)
+ # Check query string
+ if "type" not in queryDict:
+ nodeJson = lookupName(name)
+ if nodeJson != None:
+ results = []
+ getResultsUntilDepth(name, nodeJson, tolnodeReqDepth, results)
+ self.respondJson(nodeResultsToJSON(results))
+ return
+ elif queryDict["type"][0] == "children":
+ nodeJson = lookupName(name)
+ if nodeJson != None:
+ obj = json.loads(nodeJson)
+ results = []
+ for childName in obj["children"]:
+ nodeJson = lookupName(childName)
+ if nodeJson != None:
+ getResultsUntilDepth(childName, nodeJson, tolnodeReqDepth, results)
+ self.respondJson(nodeResultsToJSON(results))
+ return
+ elif queryDict["type"][0] == "chain":
+ results = []
+ ranOnce = False
+ while True:
+ jsonResult = lookupName(name)
+ if jsonResult == None:
+ if ranOnce:
+ print("ERROR: Parent-chain node {} not found".format(name), file=sys.stderr)
+ break
+ results.append([name, jsonResult])
+ obj = json.loads(jsonResult)
+ # Add children
+ if not ranOnce:
+ ranOnce = True
+ else:
+ internalFail = False
+ for childName in obj["children"]:
+ jsonResult = lookupName(childName)
+ if jsonResult == None:
+ print("ERROR: Parent-chain-child node {} not found".format(name), file=sys.stderr)
+ internalFail = True
+ break
+ results.append([childName, jsonResult])
+ if internalFail:
+ break
+ # Check if root
+ if obj["parent"] == None:
+ self.respondJson(nodeResultsToJSON(results))
+ return
+ else:
+ name = obj["parent"]
+ self.send_response(404)
+ self.end_headers()
+ self.end_headers()
+ def respondJson(self, jsonStr):
+ self.send_response(200)
+ self.send_header("Content-type", "application/json")
+ self.end_headers()
+ self.wfile.write(jsonStr.encode("utf-8"))
+def getResultsUntilDepth(name, nodeJson, depth, results):
+ """Given a node [name, nodeJson] pair, adds child node pairs to 'results', up until 'depth'"""
+ results.append([name, nodeJson])
+ if depth > 0:
+ obj = json.loads(nodeJson)
+ for childName in obj["children"]:
+ childJson = lookupName(childName)
+ if childJson != None:
+ getResultsUntilDepth(childName, childJson, depth-1, results)
+def nodeResultsToJSON(results):
+ """Given a list of [name, nodeJson] pairs, returns a representative JSON string"""
+ jsonStr = "{"
+ for i in range(len(results)):
+ jsonStr += json.dumps(results[i][0]) + ":" + results[i][1]
+ if i < len(results) - 1:
+ jsonStr += ","
+ jsonStr += "}"
+ return jsonStr
+
+server = HTTPServer((hostname, port), DbServer)
+print("Server started at http://{}:{}".format(hostname, port))
+try:
+ server.serve_forever()
+except KeyboardInterrupt:
+ pass
+server.server_close()
+dbCon.close()
+print("Server stopped")