diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
| commit | 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch) | |
| tree | 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/tolData/genOtolData.py | |
| parent | a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff) | |
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or
symlinked from, public/. This needed to be changed before each
build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/tolData/genOtolData.py')
| -rwxr-xr-x | backend/tolData/genOtolData.py | 250 |
1 files changed, 250 insertions, 0 deletions
diff --git a/backend/tolData/genOtolData.py b/backend/tolData/genOtolData.py new file mode 100755 index 0000000..b5e0055 --- /dev/null +++ b/backend/tolData/genOtolData.py @@ -0,0 +1,250 @@ +#!/usr/bin/python3 + +import sys, re, os +import json, sqlite3 + +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads files describing a tree-of-life from an 'Open Tree of Life' release, +and stores tree information in a database. + +Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: + The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 + The root node is named n6, and has children n1, n2, and n5. + Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', + 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. + The node with ID 'ott770315' will get the name 'homo sapiens'. + A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). + It is possible for multiple nodes to have the same name. + In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. +Reads an annotations.json file, which is assumed to have this format: + Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, + such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that + support/conflict with the node's placement. +Reads from a picked-names file, if present, which specifies name and node ID pairs. + These help resolve cases where multiple nodes share the same name. +""" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes +annFile = "otol/annotations.json" +dbFile = "data.db" +nodeMap = {} # Maps node IDs to node objects +nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs) +dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs +pickedNamesFile = "pickedOtolNames.txt" + +class Node: + " Represents a tree-of-life node " + def __init__(self, name, childIds, parentId, tips, pSupport): + self.name = name + self.childIds = childIds + self.parentId = parentId + self.tips = tips + self.pSupport = pSupport + +print("Parsing tree file") +# Read file +data = None +with open(treeFile) as file: + data = file.read() +dataIdx = 0 +# Parse content +iterNum = 0 +def parseNewick(): + " Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID " + global data, dataIdx, iterNum + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"At iteration {iterNum}") + # Check for EOF + if dataIdx == len(data): + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") + # Check for node + if data[dataIdx] == "(": # parse inner node + dataIdx += 1 + childIds = [] + while True: + # Read child + childId = parseNewick() + childIds.append(childId) + if (dataIdx == len(data)): + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") + # Check for next child + if (data[dataIdx] == ","): + dataIdx += 1 + continue + else: + # Get node name and id + dataIdx += 1 # Consume an expected ')' + name, id = parseNewickName() + updateNameMaps(name, id) + # Get child num-tips total + tips = 0 + for childId in childIds: + tips += nodeMap[childId].tips + # Add node to nodeMap + nodeMap[id] = Node(name, childIds, None, tips, False) + # Update childrens' parent reference + for childId in childIds: + nodeMap[childId].parentId = id + return id + else: # Parse node name + name, id = parseNewickName() + updateNameMaps(name, id) + nodeMap[id] = Node(name, [], None, 1, False) + return id +def parseNewickName(): + " Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair " + global data, dataIdx + name = None + end = dataIdx + # Get name + if (end < len(data) and data[end] == "'"): # Check for quoted name + end += 1 + inQuote = True + while end < len(data): + if (data[end] == "'"): + if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote + end += 2 + continue + else: + end += 1 + inQuote = False + break + end += 1 + if inQuote: + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") + name = data[dataIdx:end] + dataIdx = end + else: + while end < len(data) and not re.match(r"[(),]", data[end]): + end += 1 + if (end == dataIdx): + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") + name = data[dataIdx:end].rstrip() + if end == len(data): # Ignore trailing input semicolon + name = name[:-1] + dataIdx = end + # Convert to (name, id) + name = name.lower() + if name.startswith("mrca"): + return (name, name) + elif name[0] == "'": + match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) + if match == None: + raise Exception(f"ERROR: invalid name \"{name}\"") + name = match.group(1).replace("''", "'") + return (name, match.group(2)) + else: + match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) + if match == None: + raise Exception(f"ERROR: invalid name \"{name}\"") + return (match.group(1).replace("_", " "), match.group(2)) +def updateNameMaps(name, id): + global nameToFirstId, dupNameToIds + if name not in nameToFirstId: + nameToFirstId[name] = id + else: + if name not in dupNameToIds: + dupNameToIds[name] = [nameToFirstId[name], id] + else: + dupNameToIds[name].append(id) +rootId = parseNewick() + +print("Resolving duplicate names") +# Read picked-names file +nameToPickedId = {} +if os.path.exists(pickedNamesFile): + with open(pickedNamesFile) as file: + for line in file: + (name, _, otolId) = line.rstrip().partition("|") + nameToPickedId[name] = otolId +# Resolve duplicates +for (dupName, ids) in dupNameToIds.items(): + # Check for picked id + if dupName in nameToPickedId: + idToUse = nameToPickedId[dupName] + else: + # Get conflicting node with most tips + tipNums = [nodeMap[id].tips for id in ids] + maxIdx = tipNums.index(max(tipNums)) + idToUse = ids[maxIdx] + # Adjust name of other conflicting nodes + counter = 2 + for id in ids: + if id != idToUse: + nodeMap[id].name += f" [{counter}]" + counter += 1 + +print("Changing mrca* names") +def convertMrcaName(id): + node = nodeMap[id] + name = node.name + childIds = node.childIds + if len(childIds) < 2: + print(f"WARNING: MRCA node \"{name}\" has less than 2 children") + return + # Get 2 children with most tips + childTips = [nodeMap[id].tips for id in childIds] + maxIdx1 = childTips.index(max(childTips)) + childTips[maxIdx1] = 0 + maxIdx2 = childTips.index(max(childTips)) + childId1 = childIds[maxIdx1] + childId2 = childIds[maxIdx2] + childName1 = nodeMap[childId1].name + childName2 = nodeMap[childId2].name + # Check for mrca* child names + if childName1.startswith("mrca"): + childName1 = convertMrcaName(childId1) + if childName2.startswith("mrca"): + childName2 = convertMrcaName(childId2) + # Check for composite names + match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1) + if match != None: + childName1 = match.group(1) + match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2) + if match != None: + childName2 = match.group(1) + # Create composite name + node.name = f"[{childName1} + {childName2}]" + return childName1 +for (id, node) in nodeMap.items(): + if node.name.startswith("mrca"): + convertMrcaName(id) + +print("Parsing annotations file") +# Read file +data = None +with open(annFile) as file: + data = file.read() +obj = json.loads(data) +nodeAnnsMap = obj["nodes"] +# Find relevant annotations +for (id, node) in nodeMap.items(): + # Set has-support value using annotations + if id in nodeAnnsMap: + nodeAnns = nodeAnnsMap[id] + supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0 + conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0 + node.pSupport = supportQty > 0 and conflictQty == 0 + +print("Creating nodes and edges tables") +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") +dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)") +dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") +dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)") +for (otolId, node) in nodeMap.items(): + dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips)) + for childId in node.childIds: + childNode = nodeMap[childId] + dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)", + (node.name, childNode.name, 1 if childNode.pSupport else 0)) +print("Closing database") +dbCon.commit() +dbCon.close() |
