#!/usr/bin/python3 """ Reads files describing a tree-of-life from an 'Open Tree of Life' release, and stores tree info in a database. Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 The root node is named n6, and has children n1, n2, and n5. Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. The node with ID 'ott770315' will get the name 'homo sapiens'. A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). It is possible for multiple nodes to have the same name. In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. Reads an annotations.json file, which is assumed to have this format: Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that support/conflict with the node's placement. Reads from a picked-names file, if present, which specifies name and node ID pairs. These help resolve cases where multiple nodes share the same name. """ import re, os import json, sqlite3 TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes ANN_FILE = os.path.join('otol', 'annotations.json') DB_FILE = 'data.db' PICKED_NAMES_FILE = 'picked_otol_names.txt' class Node: """ Represents a tree-of-life node """ def __init__(self, name, childIds, parentId, tips, pSupport): self.name = name self.childIds = childIds self.parentId = parentId self.tips = tips self.pSupport = pSupport class BasicStream: """ Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """ def __init__(self, data, idx=0): self.data = data self.idx = idx def hasNext(self) -> bool: return self.idx < len(self.data) def next(self) -> str: if self.hasNext(): char = self.data[self.idx] self.idx += 1 return char; else: return ''; def peek(self) -> str: if self.hasNext(): return self.data[self.idx] else: return ''; def skipWhitespace(self) -> None: while self.hasNext() and self.data[self.idx].isspace(): self.idx += 1 def progress(self) -> float: return (self.idx / len(self.data)) def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None: """ Reads the files and stores the tree info """ nodeMap: dict[str, Node] = {} # Maps node IDs to node objects nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs # print('Parsing tree file') treeStream: BasicStream with open(treeFile) as file: treeStream = BasicStream(file.read()) # Parse content parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds) print('Resolving duplicate names') # Read picked-names file nameToPickedId: dict[str, str] = {} if os.path.exists(pickedNamesFile): with open(pickedNamesFile) as file: for line in file: name, _, otolId = line.strip().partition('|') nameToPickedId[name] = otolId # Resolve duplicates for dupName, ids in dupNameToIds.items(): # Check for picked id if dupName in nameToPickedId: idToUse = nameToPickedId[dupName] else: # Get conflicting node with most tips tipNums = [nodeMap[id].tips for id in ids] maxIdx = tipNums.index(max(tipNums)) idToUse = ids[maxIdx] # Adjust name of other conflicting nodes counter = 2 for id in ids: if id != idToUse: nodeMap[id].name += f' [{counter}]' counter += 1 print('Changing mrca* names') for id, node in nodeMap.items(): if node.name.startswith('mrca'): convertMrcaName(id, nodeMap) print('Parsing annotations file') # Read file with open(annFile) as file: data = file.read() obj = json.loads(data) nodeAnnsMap = obj['nodes'] # Find relevant annotations for id, node in nodeMap.items(): # Set has-support value using annotations if id in nodeAnnsMap: nodeAnns = nodeAnnsMap[id] supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 node.pSupport = supportQty > 0 and conflictQty == 0 print('Creating nodes and edges tables') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)') dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)') dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))') dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)') for otolId, node in nodeMap.items(): dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips)) for childId in node.childIds: childNode = nodeMap[childId] dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', (node.name, childNode.name, 1 if childNode.pSupport else 0)) print('Closing database') dbCon.commit() dbCon.close() def parseNewick( stream: BasicStream, nodeMap: dict[str, Node], nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> str: """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ if stream.idx % 1e5 == 0: print(f'Progress: {stream.progress() * 100:.2f}%') # Find node stream.skipWhitespace() if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == '(': # Start of inner node stream.next() childIds: list[str] = [] while True: # Read child childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds) childIds.append(childId) # Check for next child or end of node stream.skipWhitespace() if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == ',': # Expect another child stream.next() continue else: # End of child list # Get node name and id stream.next() # Consume an expected ')' stream.skipWhitespace() name, id = parseNewickName(stream) updateNameMaps(name, id, nameToFirstId, dupNameToIds) # Get child num-tips total tips = 0 for childId in childIds: tips += nodeMap[childId].tips # Add node to nodeMap nodeMap[id] = Node(name, childIds, None, tips, False) # Update childrens' parent reference for childId in childIds: nodeMap[childId].parentId = id return id else: # Parse node name name, id = parseNewickName(stream) updateNameMaps(name, id, nameToFirstId, dupNameToIds) nodeMap[id] = Node(name, [], None, 1, False) return id def parseNewickName(stream: BasicStream) -> tuple[str, str]: """ Parses a node name from 'stream', and returns a (name, id) pair """ name: str nameChars = [] if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == "'": # Quoted name nameChars.append(stream.next()) while True: if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == "'": nameChars.append(stream.next()) if stream.peek() == "'": # '' is escaped-quote nameChars.append(stream.next()) continue break nameChars.append(stream.next()) else: while stream.hasNext() and not re.match(r'[(),;]', stream.peek()): nameChars.append(stream.next()) if stream.peek() == ';': # Ignore trailing input semicolon stream.next() # Convert to (name, id) name = ''.join(nameChars).rstrip().lower() if name.startswith('mrca'): return (name, name) elif name[0] == "'": match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') name = match.group(1).replace("''", "'") return (name, match.group(2)) else: match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') return (match.group(1).replace('_', ' '), match.group(2)) def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None: """ Update maps upon a newly parsed name """ if name not in nameToFirstId: nameToFirstId[name] = id else: if name not in dupNameToIds: dupNameToIds[name] = [nameToFirstId[name], id] else: dupNameToIds[name].append(id) def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: """ Update a node in a tree to be named after 2 descendants. Returns the name of one such descendant, for use during recursion. """ node = nodeMap[id] name = node.name childIds = node.childIds if len(childIds) < 2: raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children') # Get 2 children with most tips childTips = [nodeMap[id].tips for id in childIds] maxIdx1 = childTips.index(max(childTips)) childTips[maxIdx1] = 0 maxIdx2 = childTips.index(max(childTips)) childId1 = childIds[maxIdx1] childId2 = childIds[maxIdx2] childName1 = nodeMap[childId1].name childName2 = nodeMap[childId2].name # Check for mrca* child names if childName1.startswith('mrca'): childName1 = convertMrcaName(childId1, nodeMap) if childName2.startswith('mrca'): childName2 = convertMrcaName(childId2, nodeMap) # Check for composite names match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) if match is not None: childName1 = match.group(1) match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) if match is not None: childName2 = match.group(1) # Create composite name node.name = f'[{childName1} + {childName2}]' return childName1 if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)