aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/gen_otol_data.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-01-29 11:30:47 +1100
committerTerry Truong <terry06890@gmail.com>2023-01-29 11:30:47 +1100
commit8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
treeffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/gen_otol_data.py
parentf5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)
Adjust backend coding style
Add line spacing, section comments, and import consistency
Diffstat (limited to 'backend/tol_data/gen_otol_data.py')
-rwxr-xr-xbackend/tol_data/gen_otol_data.py45
1 files changed, 40 insertions, 5 deletions
diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py
index eba8779..a67ea4b 100755
--- a/backend/tol_data/gen_otol_data.py
+++ b/backend/tol_data/gen_otol_data.py
@@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai
These help resolve cases where multiple nodes share the same name.
"""
-import re, os
-import json, sqlite3
+import argparse
+import re
+import os
+import json
+import sqlite3
TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes
ANN_FILE = os.path.join('otol', 'annotations.json')
DB_FILE = 'data.db'
PICKED_NAMES_FILE = 'picked_otol_names.txt'
+# ========== Classes ==========
+
class Node:
""" Represents a tree-of-life node """
def __init__(self, name, childIds, parentId, tips, pSupport):
@@ -37,13 +42,16 @@ class Node:
self.parentId = parentId
self.tips = tips
self.pSupport = pSupport
+
class BasicStream:
""" Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """
def __init__(self, data, idx=0):
    """ Stores the text to read from ('data') and the index to start reading at ('idx') """
    self.data, self.idx = data, idx
+
def hasNext(self) -> bool:
    """ Returns True if the current index is still before the end of the data """
    remaining = len(self.data) - self.idx
    return remaining > 0
+
def next(self) -> str:
if self.hasNext():
char = self.data[self.idx]
@@ -51,30 +59,37 @@ class BasicStream:
return char;
else:
return '';
+
def peek(self) -> str:
    """ Returns the character at the current index without advancing, or '' if no input remains """
    return self.data[self.idx] if self.hasNext() else ''
+
def skipWhitespace(self) -> None:
    """ Moves the index forward until it reaches a non-whitespace character or the end of the data """
    while True:
        if not self.hasNext():
            break
        if not self.data[self.idx].isspace():
            break
        self.idx += 1
+
def progress(self) -> float:
    """ Returns the current index as a fraction of the data's length.
    For empty data, returns 1.0 (nothing is left to read) instead of dividing by zero. """
    total = len(self.data)
    if total == 0:
        # An empty stream has no remaining input, so report it as fully consumed
        return 1.0
    return self.idx / total
+# ========== For data generation ==========
+
def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None:
""" Reads the files and stores the tree info """
nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
- #
+
print('Parsing tree file')
treeStream: BasicStream
with open(treeFile) as file:
treeStream = BasicStream(file.read())
+
# Parse content
parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds)
print('Resolving duplicate names')
+
# Read picked-names file
nameToPickedId: dict[str, str] = {}
if os.path.exists(pickedNamesFile):
@@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
for line in file:
name, _, otolId = line.strip().partition('|')
nameToPickedId[name] = otolId
+
# Resolve duplicates
for dupName, ids in dupNameToIds.items():
# Check for picked id
@@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
if id != idToUse:
nodeMap[id].name += f' [{counter}]'
counter += 1
+
print('Changing mrca* names')
for id, node in nodeMap.items():
if node.name.startswith('mrca'):
convertMrcaName(id, nodeMap)
+
print('Parsing annotations file')
# Read file
with open(annFile) as file:
@@ -116,6 +134,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
node.pSupport = supportQty > 0 and conflictQty == 0
+
print('Creating nodes and edges tables')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
childNode = nodeMap[childId]
dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
(node.name, childNode.name, 1 if childNode.pSupport else 0))
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseNewick(
stream: BasicStream,
nodeMap: dict[str, Node],
@@ -140,6 +161,7 @@ def parseNewick(
""" Parses a node from 'stream', updates nodeMap accordingly, and returns the node's ID """
if stream.idx % 1e5 == 0:
print(f'Progress: {stream.progress() * 100:.2f}%')
+
# Find node
stream.skipWhitespace()
if stream.peek() == '':
@@ -151,6 +173,7 @@ def parseNewick(
# Read child
childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds)
childIds.append(childId)
+
# Check for next child or end of node
stream.skipWhitespace()
if stream.peek() == '':
@@ -164,12 +187,15 @@ def parseNewick(
stream.skipWhitespace()
name, id = parseNewickName(stream)
updateNameMaps(name, id, nameToFirstId, dupNameToIds)
+
# Get child num-tips total
tips = 0
for childId in childIds:
tips += nodeMap[childId].tips
+
# Add node to nodeMap
nodeMap[id] = Node(name, childIds, None, tips, False)
+
# Update childrens' parent reference
for childId in childIds:
nodeMap[childId].parentId = id
@@ -179,6 +205,7 @@ def parseNewick(
updateNameMaps(name, id, nameToFirstId, dupNameToIds)
nodeMap[id] = Node(name, [], None, 1, False)
return id
+
def parseNewickName(stream: BasicStream) -> tuple[str, str]:
""" Parses a node name from 'stream', and returns a (name, id) pair """
name: str
@@ -202,6 +229,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
nameChars.append(stream.next())
if stream.peek() == ';': # Ignore trailing input semicolon
stream.next()
+
# Convert to (name, id)
name = ''.join(nameChars).rstrip().lower()
if name.startswith('mrca'):
@@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
if match is None:
raise Exception(f'ERROR: invalid name \'{name}\'')
return (match.group(1).replace('_', ' '), match.group(2))
+
def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None:
""" Update maps upon a newly parsed name """
if name not in nameToFirstId:
@@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI
dupNameToIds[name] = [nameToFirstId[name], id]
else:
dupNameToIds[name].append(id)
+
def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
""" Update a node in a tree to be named after 2 descendants.
Returns the name of one such descendant, for use during recursion. """
@@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
childIds = node.childIds
if len(childIds) < 2:
raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children')
+
# Get 2 children with most tips
childTips = [nodeMap[id].tips for id in childIds]
maxIdx1 = childTips.index(max(childTips))
@@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
childId2 = childIds[maxIdx2]
childName1 = nodeMap[childId1].name
childName2 = nodeMap[childId2].name
+
# Check for mrca* child names
if childName1.startswith('mrca'):
childName1 = convertMrcaName(childId1, nodeMap)
if childName2.startswith('mrca'):
childName2 = convertMrcaName(childId2, nodeMap)
+
# Check for composite names
match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
if match is not None:
@@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
if match is not None:
childName2 = match.group(1)
+
# Create composite name
node.name = f'[{childName1} + {childName2}]'
return childName1
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)