#!/usr/bin/python3 import re, os import json, sqlite3 import argparse parser = argparse.ArgumentParser(description=""" Reads files describing a tree-of-life from an 'Open Tree of Life' release, and stores tree info in a database. Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 The root node is named n6, and has children n1, n2, and n5. Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. The node with ID 'ott770315' will get the name 'homo sapiens'. A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). It is possible for multiple nodes to have the same name. In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. Reads an annotations.json file, which is assumed to have this format: Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that support/conflict with the node's placement. Reads from a picked-names file, if present, which specifies name and node ID pairs. These help resolve cases where multiple nodes share the same name. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() class Node: ' Represents a tree-of-life node ' def __init__(self, name, childIds, parentId, tips, pSupport): self.name = name self.childIds = childIds self.parentId = parentId self.tips = tips self.pSupport = pSupport treeFile = 'otol/labelled_supertree_ottnames.tre' # Had about 2.5e9 nodes annFile = 'otol/annotations.json' dbFile = 'data.db' nodeMap: dict[str, Node] = {} # Maps node IDs to node objects nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs pickedNamesFile = 'pickedOtolNames.txt' print('Parsing tree file') # Read file data: str with open(treeFile) as file: data = file.read() dataIdx = 0 # Parse content iterNum = 0 def parseNewick() -> str: """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ global data, dataIdx, iterNum iterNum += 1 if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') # Check for EOF if dataIdx == len(data): raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') # Check for node if data[dataIdx] == '(': # parse inner node dataIdx += 1 childIds: list[str] = [] while True: # Read child childId = parseNewick() childIds.append(childId) if (dataIdx == len(data)): raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') # Check for next child if (data[dataIdx] == ','): dataIdx += 1 continue else: # Get node name and id dataIdx += 1 # Consume an expected ')' name, id = parseNewickName() updateNameMaps(name, id) # Get child num-tips total tips = 0 for childId in childIds: tips += nodeMap[childId].tips # Add node to nodeMap nodeMap[id] = Node(name, childIds, None, tips, False) # Update childrens' parent reference for childId in childIds: nodeMap[childId].parentId = id return id else: # Parse node name name, id = parseNewickName() updateNameMaps(name, id) nodeMap[id] = Node(name, [], None, 1, False) return id def parseNewickName() -> tuple[str, str]: """ Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair """ global data, dataIdx name: str end = dataIdx # Get name if (end < len(data) and data[end] == "'"): # Check for quoted name end += 1 inQuote = True while end < len(data): if (data[end] == "'"): if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote end += 2 continue else: end += 1 inQuote = False break end += 1 if inQuote: raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') name = data[dataIdx:end] dataIdx = end else: while end < len(data) and not re.match(r'[(),]', data[end]): end += 1 if (end == dataIdx): raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') name = data[dataIdx:end].rstrip() if end == len(data): # Ignore trailing input semicolon name = name[:-1] dataIdx = end # Convert to (name, id) name = name.lower() if name.startswith('mrca'): return (name, name) elif name[0] == "'": match = re.fullmatch(r"'([^\\\']+) (ott\d+)'", name) if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') name = match.group(1).replace("''", "'") return (name, match.group(2)) else: match = re.fullmatch(r"([^\\\']+)_(ott\d+)", name) if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') return (match.group(1).replace('_', ' '), match.group(2)) def updateNameMaps(name, id): global nameToFirstId, dupNameToIds if name not in nameToFirstId: nameToFirstId[name] = id else: if name not in dupNameToIds: dupNameToIds[name] = [nameToFirstId[name], id] else: dupNameToIds[name].append(id) rootId: str = parseNewick() print('Resolving duplicate names') # Read picked-names file nameToPickedId: dict[str, str] = {} if os.path.exists(pickedNamesFile): with open(pickedNamesFile) as file: for line in file: name, _, otolId = line.rstrip().partition('|') nameToPickedId[name] = otolId # Resolve duplicates for dupName, ids in dupNameToIds.items(): # Check for picked id if dupName in nameToPickedId: idToUse = nameToPickedId[dupName] else: # Get conflicting node with most tips tipNums = [nodeMap[id].tips for id in ids] maxIdx = tipNums.index(max(tipNums)) idToUse = ids[maxIdx] # Adjust name of other conflicting nodes counter = 2 for id in ids: if id != idToUse: nodeMap[id].name += f' [{counter}]' counter += 1 print('Changing mrca* names') def convertMrcaName(id: str): node = nodeMap[id] name = node.name childIds = node.childIds if len(childIds) < 2: print(f'WARNING: MRCA node \'{name}\' has less than 2 children') return # Get 2 children with most tips childTips = [nodeMap[id].tips for id in childIds] maxIdx1 = childTips.index(max(childTips)) childTips[maxIdx1] = 0 maxIdx2 = childTips.index(max(childTips)) childId1 = childIds[maxIdx1] childId2 = childIds[maxIdx2] childName1 = nodeMap[childId1].name childName2 = nodeMap[childId2].name # Check for mrca* child names if childName1.startswith('mrca'): childName1 = convertMrcaName(childId1) if childName2.startswith('mrca'): childName2 = convertMrcaName(childId2) # Check for composite names match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) if match is not None: childName1 = match.group(1) match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) if match is not None: childName2 = match.group(1) # Create composite name node.name = f'[{childName1} + {childName2}]' return childName1 for id, node in nodeMap.items(): if node.name.startswith('mrca'): convertMrcaName(id) print('Parsing annotations file') # Read file with open(annFile) as file: data = file.read() obj = json.loads(data) nodeAnnsMap = obj['nodes'] # Find relevant annotations for id, node in nodeMap.items(): # Set has-support value using annotations if id in nodeAnnsMap: nodeAnns = nodeAnnsMap[id] supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 node.pSupport = supportQty > 0 and conflictQty == 0 print('Creating nodes and edges tables') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)') dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)') dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))') dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)') for otolId, node in nodeMap.items(): dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips)) for childId in node.childIds: childNode = nodeMap[childId] dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', (node.name, childNode.name, 1 if childNode.pSupport else 0)) print('Closing database') dbCon.commit() dbCon.close()