#!/usr/bin/python3 """ Reads files describing a tree-of-life from an 'Open Tree of Life' release, and stores tree info in a database. Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 The root node is named n6, and has children n1, n2, and n5. Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. The node with ID 'ott770315' will get the name 'homo sapiens'. A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). It is possible for multiple nodes to have the same name. In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. Reads an annotations.json file, which is assumed to have this format: Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that support/conflict with the node's placement. Reads from a picked-names file, if present, which specifies name and node ID pairs. These help resolve cases where multiple nodes share the same name. """ import argparse import re import os import json import sqlite3 TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes ANN_FILE = os.path.join('otol', 'annotations.json') DB_FILE = 'data.db' PICKED_NAMES_FILE = 'picked_otol_names.txt' # ========== Classes ========== class Node: """ Represents a tree-of-life node """ def __init__(self, name, childIds, parentId, tips, pSupport): self.name = name self.childIds = childIds self.parentId = parentId self.tips = tips self.pSupport = pSupport class BasicStream: """ Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """ def __init__(self, data, idx=0): self.data = data self.idx = idx def hasNext(self) -> bool: return self.idx < len(self.data) def next(self) -> str: if self.hasNext(): char = self.data[self.idx] self.idx += 1 return char; else: return ''; def peek(self) -> str: if self.hasNext(): return self.data[self.idx] else: return ''; def skipWhitespace(self) -> None: while self.hasNext() and self.data[self.idx].isspace(): self.idx += 1 def progress(self) -> float: return (self.idx / len(self.data)) # ========== For data generation ========== def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None: """ Reads the files and stores the tree info """ nodeMap: dict[str, Node] = {} # Maps node IDs to node objects nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs print('Parsing tree file') treeStream: BasicStream with open(treeFile) as file: treeStream = BasicStream(file.read()) # Parse content parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds) print('Resolving duplicate names') # Read picked-names file nameToPickedId: dict[str, str] = {} if os.path.exists(pickedNamesFile): with open(pickedNamesFile) as file: for line in file: name, _, otolId = line.strip().partition('|') nameToPickedId[name] = otolId # Resolve duplicates for dupName, ids in dupNameToIds.items(): # Check for picked id if dupName in nameToPickedId: idToUse = nameToPickedId[dupName] else: # Get conflicting node with most tips tipNums = [nodeMap[id].tips for id in ids] maxIdx = tipNums.index(max(tipNums)) idToUse = ids[maxIdx] # Adjust name of other conflicting nodes counter = 2 for id in ids: if id != idToUse: nodeMap[id].name += f' [{counter}]' counter += 1 print('Changing mrca* names') for id, node in nodeMap.items(): if node.name.startswith('mrca'): convertMrcaName(id, nodeMap) print('Parsing annotations file') # Read file with open(annFile) as file: data = file.read() obj = json.loads(data) nodeAnnsMap = obj['nodes'] # Find relevant annotations for id, node in nodeMap.items(): # Set has-support value using annotations if id in nodeAnnsMap: nodeAnns = nodeAnnsMap[id] supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 node.pSupport = supportQty > 0 and conflictQty == 0 print('Creating nodes and edges tables') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)') dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)') dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))') dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)') for otolId, node in nodeMap.items(): dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips)) for childId in node.childIds: childNode = nodeMap[childId] dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', (node.name, childNode.name, 1 if childNode.pSupport else 0)) print('Closing database') dbCon.commit() dbCon.close() def parseNewick( stream: BasicStream, nodeMap: dict[str, Node], nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> str: """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ if stream.idx % 1e5 == 0: print(f'Progress: {stream.progress() * 100:.2f}%') # Find node stream.skipWhitespace() if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == '(': # Start of inner node stream.next() childIds: list[str] = [] while True: # Read child childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds) childIds.append(childId) # Check for next child or end of node stream.skipWhitespace() if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == ',': # Expect another child stream.next() continue else: # End of child list # Get node name and id stream.next() # Consume an expected ')' stream.skipWhitespace() name, id = parseNewickName(stream) updateNameMaps(name, id, nameToFirstId, dupNameToIds) # Get child num-tips total tips = 0 for childId in childIds: tips += nodeMap[childId].tips # Add node to nodeMap nodeMap[id] = Node(name, childIds, None, tips, False) # Update childrens' parent reference for childId in childIds: nodeMap[childId].parentId = id return id else: # Parse node name name, id = parseNewickName(stream) updateNameMaps(name, id, nameToFirstId, dupNameToIds) nodeMap[id] = Node(name, [], None, 1, False) return id def parseNewickName(stream: BasicStream) -> tuple[str, str]: """ Parses a node name from 'stream', and returns a (name, id) pair """ name: str nameChars = [] if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == "'": # Quoted name nameChars.append(stream.next()) while True: if stream.peek() == '': raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}') elif stream.peek() == "'": nameChars.append(stream.next()) if stream.peek() == "'": # '' is escaped-quote nameChars.append(stream.next()) continue break nameChars.append(stream.next()) else: while stream.hasNext() and not re.match(r'[(),;]', stream.peek()): nameChars.append(stream.next()) if stream.peek() == ';': # Ignore trailing input semicolon stream.next() # Convert to (name, id) name = ''.join(nameChars).rstrip().lower() if name.startswith('mrca'): return (name, name) elif name[0] == "'": match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') name = match.group(1).replace("''", "'") return (name, match.group(2)) else: match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') return (match.group(1).replace('_', ' '), match.group(2)) def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None: """ Update maps upon a newly parsed name """ if name not in nameToFirstId: nameToFirstId[name] = id else: if name not in dupNameToIds: dupNameToIds[name] = [nameToFirstId[name], id] else: dupNameToIds[name].append(id) def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: """ Update a node in a tree to be named after 2 descendants. Returns the name of one such descendant, for use during recursion. """ node = nodeMap[id] name = node.name childIds = node.childIds if len(childIds) < 2: raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children') # Get 2 children with most tips childTips = [nodeMap[id].tips for id in childIds] maxIdx1 = childTips.index(max(childTips)) childTips[maxIdx1] = 0 maxIdx2 = childTips.index(max(childTips)) childId1 = childIds[maxIdx1] childId2 = childIds[maxIdx2] childName1 = nodeMap[childId1].name childName2 = nodeMap[childId2].name # Check for mrca* child names if childName1.startswith('mrca'): childName1 = convertMrcaName(childId1, nodeMap) if childName2.startswith('mrca'): childName2 = convertMrcaName(childId2, nodeMap) # Check for composite names match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) if match is not None: childName1 = match.group(1) match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) if match is not None: childName2 = match.group(1) # Create composite name node.name = f'[{childName1} + {childName2}]' return childName1 # ========== Main block ========== if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)