diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
| commit | 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch) | |
| tree | ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/gen_otol_data.py | |
| parent | f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff) | |
Adjust backend coding style
Add line spacing, section comments, and import consistency
Diffstat (limited to 'backend/tol_data/gen_otol_data.py')
| -rwxr-xr-x | backend/tol_data/gen_otol_data.py | 45 |
1 file changed, 40 insertions, 5 deletions
diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py index eba8779..a67ea4b 100755 --- a/backend/tol_data/gen_otol_data.py +++ b/backend/tol_data/gen_otol_data.py @@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai These help resolve cases where multiple nodes share the same name. """ -import re, os -import json, sqlite3 +import argparse +import re +import os +import json +import sqlite3 TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes ANN_FILE = os.path.join('otol', 'annotations.json') DB_FILE = 'data.db' PICKED_NAMES_FILE = 'picked_otol_names.txt' +# ========== Classes ========== + class Node: """ Represents a tree-of-life node """ def __init__(self, name, childIds, parentId, tips, pSupport): @@ -37,13 +42,16 @@ class Node: self.parentId = parentId self.tips = tips self.pSupport = pSupport + class BasicStream: """ Represents a basic data stream, using a string and index. Used for parsing text with lookahead. 
""" def __init__(self, data, idx=0): self.data = data self.idx = idx + def hasNext(self) -> bool: return self.idx < len(self.data) + def next(self) -> str: if self.hasNext(): char = self.data[self.idx] @@ -51,30 +59,37 @@ class BasicStream: return char; else: return ''; + def peek(self) -> str: if self.hasNext(): return self.data[self.idx] else: return ''; + def skipWhitespace(self) -> None: while self.hasNext() and self.data[self.idx].isspace(): self.idx += 1 + def progress(self) -> float: return (self.idx / len(self.data)) +# ========== For data generation ========== + def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None: """ Reads the files and stores the tree info """ nodeMap: dict[str, Node] = {} # Maps node IDs to node objects nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs - # + print('Parsing tree file') treeStream: BasicStream with open(treeFile) as file: treeStream = BasicStream(file.read()) + # Parse content parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds) print('Resolving duplicate names') + # Read picked-names file nameToPickedId: dict[str, str] = {} if os.path.exists(pickedNamesFile): @@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N for line in file: name, _, otolId = line.strip().partition('|') nameToPickedId[name] = otolId + # Resolve duplicates for dupName, ids in dupNameToIds.items(): # Check for picked id @@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N if id != idToUse: nodeMap[id].name += f' [{counter}]' counter += 1 + print('Changing mrca* names') for id, node in nodeMap.items(): if node.name.startswith('mrca'): convertMrcaName(id, nodeMap) + print('Parsing annotations file') # Read file with open(annFile) as file: @@ -116,6 +134,7 @@ def 
genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 node.pSupport = supportQty > 0 and conflictQty == 0 + print('Creating nodes and edges tables') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N childNode = nodeMap[childId] dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', (node.name, childNode.name, 1 if childNode.pSupport else 0)) + print('Closing database') dbCon.commit() dbCon.close() + def parseNewick( stream: BasicStream, nodeMap: dict[str, Node], @@ -140,6 +161,7 @@ def parseNewick( """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ if stream.idx % 1e5 == 0: print(f'Progress: {stream.progress() * 100:.2f}%') + # Find node stream.skipWhitespace() if stream.peek() == '': @@ -151,6 +173,7 @@ def parseNewick( # Read child childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds) childIds.append(childId) + # Check for next child or end of node stream.skipWhitespace() if stream.peek() == '': @@ -164,12 +187,15 @@ def parseNewick( stream.skipWhitespace() name, id = parseNewickName(stream) updateNameMaps(name, id, nameToFirstId, dupNameToIds) + # Get child num-tips total tips = 0 for childId in childIds: tips += nodeMap[childId].tips + # Add node to nodeMap nodeMap[id] = Node(name, childIds, None, tips, False) + # Update childrens' parent reference for childId in childIds: nodeMap[childId].parentId = id @@ -179,6 +205,7 @@ def parseNewick( updateNameMaps(name, id, nameToFirstId, dupNameToIds) nodeMap[id] = Node(name, [], None, 1, False) return id + def parseNewickName(stream: BasicStream) -> tuple[str, str]: """ Parses a node name from 'stream', and returns a (name, id) pair """ name: str @@ -202,6 +229,7 @@ 
def parseNewickName(stream: BasicStream) -> tuple[str, str]: nameChars.append(stream.next()) if stream.peek() == ';': # Ignore trailing input semicolon stream.next() + # Convert to (name, id) name = ''.join(nameChars).rstrip().lower() if name.startswith('mrca'): @@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]: if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') return (match.group(1).replace('_', ' '), match.group(2)) + def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None: """ Update maps upon a newly parsed name """ if name not in nameToFirstId: @@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI dupNameToIds[name] = [nameToFirstId[name], id] else: dupNameToIds[name].append(id) + def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: """ Update a node in a tree to be named after 2 descendants. Returns the name of one such descendant, for use during recursion. 
""" @@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: childIds = node.childIds if len(childIds) < 2: raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children') + # Get 2 children with most tips childTips = [nodeMap[id].tips for id in childIds] maxIdx1 = childTips.index(max(childTips)) @@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: childId2 = childIds[maxIdx2] childName1 = nodeMap[childId1].name childName2 = nodeMap[childId2].name + # Check for mrca* child names if childName1.startswith('mrca'): childName1 = convertMrcaName(childId1, nodeMap) if childName2.startswith('mrca'): childName2 = convertMrcaName(childId2, nodeMap) + # Check for composite names match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) if match is not None: @@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) if match is not None: childName2 = match.group(1) + # Create composite name node.name = f'[{childName1} + {childName2}]' return childName1 +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE) |
