From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 11 Sep 2022 14:55:42 +1000 Subject: Add backend unit tests - Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing --- backend/tol_data/gen_otol_data.py | 267 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100755 backend/tol_data/gen_otol_data.py (limited to 'backend/tol_data/gen_otol_data.py') diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py new file mode 100755 index 0000000..eba8779 --- /dev/null +++ b/backend/tol_data/gen_otol_data.py @@ -0,0 +1,267 @@ +#!/usr/bin/python3 + +""" +Reads files describing a tree-of-life from an 'Open Tree of Life' release, +and stores tree info in a database. + +Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: + The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 + The root node is named n6, and has children n1, n2, and n5. + Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', + 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. + The node with ID 'ott770315' will get the name 'homo sapiens'. + A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). + It is possible for multiple nodes to have the same name. + In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. +Reads an annotations.json file, which is assumed to have this format: + Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, + such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that + support/conflict with the node's placement. +Reads from a picked-names file, if present, which specifies name and node ID pairs. + These help resolve cases where multiple nodes share the same name. +""" + +import re, os +import json, sqlite3 + +TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes +ANN_FILE = os.path.join('otol', 'annotations.json') +DB_FILE = 'data.db' +PICKED_NAMES_FILE = 'picked_otol_names.txt' + +class Node: + """ Represents a tree-of-life node """ + def __init__(self, name, childIds, parentId, tips, pSupport): + self.name = name + self.childIds = childIds + self.parentId = parentId + self.tips = tips + self.pSupport = pSupport +class BasicStream: + """ Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """ + def __init__(self, data, idx=0): + self.data = data + self.idx = idx + def hasNext(self) -> bool: + return self.idx < len(self.data) + def next(self) -> str: + if self.hasNext(): + char = self.data[self.idx] + self.idx += 1 + return char; + else: + return ''; + def peek(self) -> str: + if self.hasNext(): + return self.data[self.idx] + else: + return ''; + def skipWhitespace(self) -> None: + while self.hasNext() and self.data[self.idx].isspace(): + self.idx += 1 + def progress(self) -> float: + return (self.idx / len(self.data)) + +def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None: + """ Reads the files and stores the tree info """ + nodeMap: dict[str, Node] = {} # Maps node IDs to node objects + nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) + dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs + # + print('Parsing tree file') + treeStream: BasicStream + with open(treeFile) as file: + treeStream = BasicStream(file.read()) + # Parse content + parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds) + print('Resolving duplicate names') + # Read picked-names file + nameToPickedId: dict[str, str] = {} + if os.path.exists(pickedNamesFile): + with open(pickedNamesFile) as file: + for line in file: + name, _, otolId = line.strip().partition('|') + nameToPickedId[name] = otolId + # Resolve duplicates + for dupName, ids in dupNameToIds.items(): + # Check for picked id + if dupName in nameToPickedId: + idToUse = nameToPickedId[dupName] + else: + # Get conflicting node with most tips + tipNums = [nodeMap[id].tips for id in ids] + maxIdx = tipNums.index(max(tipNums)) + idToUse = ids[maxIdx] + # Adjust name of other conflicting nodes + counter = 2 + for id in ids: + if id != idToUse: + nodeMap[id].name += f' [{counter}]' + counter += 1 + print('Changing mrca* names') + for id, node in nodeMap.items(): + if node.name.startswith('mrca'): + convertMrcaName(id, nodeMap) + print('Parsing annotations file') + # Read file + with open(annFile) as file: + data = file.read() + obj = json.loads(data) + nodeAnnsMap = obj['nodes'] + # Find relevant annotations + for id, node in nodeMap.items(): + # Set has-support value using annotations + if id in nodeAnnsMap: + nodeAnns = nodeAnnsMap[id] + supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 + conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 + node.pSupport = supportQty > 0 and conflictQty == 0 + print('Creating nodes and edges tables') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)') + dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)') + dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))') + dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)') + for otolId, node in nodeMap.items(): + dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips)) + for childId in node.childIds: + childNode = nodeMap[childId] + dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', + (node.name, childNode.name, 1 if childNode.pSupport else 0)) + print('Closing database') + dbCon.commit() + dbCon.close() +def parseNewick( + stream: BasicStream, + nodeMap: dict[str, Node], + nameToFirstId: dict[str, str], + dupNameToIds: dict[str, list[str]]) -> str: + """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ + if stream.idx % 1e5 == 0: + print(f'Progress: {stream.progress() * 100:.2f}%') + # Find node + stream.skipWhitespace() + if stream.peek() == '': + raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') + elif stream.peek() == '(': # Start of inner node + stream.next() + childIds: list[str] = [] + while True: + # Read child + childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds) + childIds.append(childId) + # Check for next child or end of node + stream.skipWhitespace() + if stream.peek() == '': + raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') + elif stream.peek() == ',': # Expect another child + stream.next() + continue + else: # End of child list + # Get node name and id + stream.next() # Consume an expected ')' + stream.skipWhitespace() + name, id = parseNewickName(stream) + updateNameMaps(name, id, nameToFirstId, dupNameToIds) + # Get child num-tips total + tips = 0 + for childId in childIds: + tips += nodeMap[childId].tips + # Add node to nodeMap + nodeMap[id] = Node(name, childIds, None, tips, False) + # Update childrens' parent reference + for childId in childIds: + nodeMap[childId].parentId = id + return id + else: # Parse node name + name, id = parseNewickName(stream) + updateNameMaps(name, id, nameToFirstId, dupNameToIds) + nodeMap[id] = Node(name, [], None, 1, False) + return id +def parseNewickName(stream: BasicStream) -> tuple[str, str]: + """ Parses a node name from 'stream', and returns a (name, id) pair """ + name: str + nameChars = [] + if stream.peek() == '': + raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') + elif stream.peek() == "'": # Quoted name + nameChars.append(stream.next()) + while True: + if stream.peek() == '': + raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') + elif stream.peek() == "'": + nameChars.append(stream.next()) + if stream.peek() == "'": # '' is escaped-quote + nameChars.append(stream.next()) + continue + break + nameChars.append(stream.next()) + else: + while stream.hasNext() and not re.match(r'[(),;]', stream.peek()): + nameChars.append(stream.next()) + if stream.peek() == ';': # Ignore trailing input semicolon + stream.next() + # Convert to (name, id) + name = ''.join(nameChars).rstrip().lower() + if name.startswith('mrca'): + return (name, name) + elif name[0] == "'": + match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) + if match is None: + raise Exception(f'ERROR: invalid name \'{name}\'') + name = match.group(1).replace("''", "'") + return (name, match.group(2)) + else: + match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) + if match is None: + raise Exception(f'ERROR: invalid name \'{name}\'') + return (match.group(1).replace('_', ' '), match.group(2)) +def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None: + """ Update maps upon a newly parsed name """ + if name not in nameToFirstId: + nameToFirstId[name] = id + else: + if name not in dupNameToIds: + dupNameToIds[name] = [nameToFirstId[name], id] + else: + dupNameToIds[name].append(id) +def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: + """ Update a node in a tree to be named after 2 descendants. + Returns the name of one such descendant, for use during recursion. """ + node = nodeMap[id] + name = node.name + childIds = node.childIds + if len(childIds) < 2: + raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children') + # Get 2 children with most tips + childTips = [nodeMap[id].tips for id in childIds] + maxIdx1 = childTips.index(max(childTips)) + childTips[maxIdx1] = 0 + maxIdx2 = childTips.index(max(childTips)) + childId1 = childIds[maxIdx1] + childId2 = childIds[maxIdx2] + childName1 = nodeMap[childId1].name + childName2 = nodeMap[childId2].name + # Check for mrca* child names + if childName1.startswith('mrca'): + childName1 = convertMrcaName(childId1, nodeMap) + if childName2.startswith('mrca'): + childName2 = convertMrcaName(childId2, nodeMap) + # Check for composite names + match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) + if match is not None: + childName1 = match.group(1) + match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) + if match is not None: + childName2 = match.group(1) + # Create composite name + node.name = f'[{childName1} + {childName2}]' + return childName1 + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE) -- cgit v1.2.3