Adjust backend coding style

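Add blank lines between definitions and logical sections, add section-header comments, and split imports onto one module per line (moving argparse to the top of the file)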
author: Terry Truong <terry06890@gmail.com> 2023-01-29 11:30:47 +1100
committer: Terry Truong <terry06890@gmail.com> 2023-01-29 11:30:47 +1100
commit: 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
tree: ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/gen_otol_data.py
parent: f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)
1 files changed, 40 insertions, 5 deletions
diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py
index eba8779..a67ea4b 100755
--- a/backend/tol_data/gen_otol_data.py
+++ b/backend/tol_data/gen_otol_data.py
@@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai
     These help resolve cases where multiple nodes share the same name.
 """
 
-import re, os
-import json, sqlite3
+import argparse
+import re
+import os
+import json
+import sqlite3
 
 TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes
 ANN_FILE = os.path.join('otol', 'annotations.json')
 DB_FILE = 'data.db'
 PICKED_NAMES_FILE = 'picked_otol_names.txt'
 
+# ========== Classes ==========
+
 class Node:
 	""" Represents a tree-of-life node """
 	def __init__(self, name, childIds, parentId, tips, pSupport):
@@ -37,13 +42,16 @@ class Node:
 		self.parentId = parentId
 		self.tips = tips
 		self.pSupport = pSupport
+
 class BasicStream:
 	""" Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """
 	def __init__(self, data, idx=0):
 		self.data = data
 		self.idx = idx
+
 	def hasNext(self) -> bool:
 		return self.idx < len(self.data)
+
 	def next(self) -> str:
 		if self.hasNext():
 			char = self.data[self.idx]
@@ -51,30 +59,37 @@ class BasicStream:
 			return char;
 		else:
 			return '';
+
 	def peek(self) -> str:
 		if self.hasNext():
 			return self.data[self.idx]
 		else:
 			return '';
+
 	def skipWhitespace(self) -> None:
 		while self.hasNext() and self.data[self.idx].isspace():
 			self.idx += 1
+
 	def progress(self) -> float:
 		return (self.idx / len(self.data))
 
+# ========== For data generation ==========
+
 def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None:
 	""" Reads the files and stores the tree info """
 	nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
 	nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
 	dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
-	#
+
 	print('Parsing tree file')
 	treeStream: BasicStream
 	with open(treeFile) as file:
 		treeStream = BasicStream(file.read())
+
 	# Parse content
 	parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds)
 	print('Resolving duplicate names')
+
 	# Read picked-names file
 	nameToPickedId: dict[str, str] = {}
 	if os.path.exists(pickedNamesFile):
@@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			for line in file:
 				name, _, otolId = line.strip().partition('|')
 				nameToPickedId[name] = otolId
+
 	# Resolve duplicates
 	for dupName, ids in dupNameToIds.items():
 		# Check for picked id
@@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			if id != idToUse:
 				nodeMap[id].name += f' [{counter}]'
 				counter += 1
+
 	print('Changing mrca* names')
 	for id, node in nodeMap.items():
 		if node.name.startswith('mrca'):
 			convertMrcaName(id, nodeMap)
+
 	print('Parsing annotations file')
 	# Read file
 	with open(annFile) as file:
@@ -116,6 +134,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
 			conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
 			node.pSupport = supportQty > 0 and conflictQty == 0
+
 	print('Creating nodes and edges tables')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
@@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			childNode = nodeMap[childId]
 			dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
 				(node.name, childNode.name, 1 if childNode.pSupport else 0))
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
+
 def parseNewick(
 		stream: BasicStream,
 		nodeMap: dict[str, Node],
@@ -140,6 +161,7 @@ def parseNewick(
 	""" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
 	if stream.idx % 1e5 == 0:
 		print(f'Progress: {stream.progress() * 100:.2f}%')
+
 	# Find node
 	stream.skipWhitespace()
 	if stream.peek() == '':
@@ -151,6 +173,7 @@ def parseNewick(
 			# Read child
 			childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds)
 			childIds.append(childId)
+
 			# Check for next child or end of node
 			stream.skipWhitespace()
 			if stream.peek() == '':
@@ -164,12 +187,15 @@ def parseNewick(
 				stream.skipWhitespace()
 				name, id = parseNewickName(stream)
 				updateNameMaps(name, id, nameToFirstId, dupNameToIds)
+
 				# Get child num-tips total
 				tips = 0
 				for childId in childIds:
 					tips += nodeMap[childId].tips
+
 				# Add node to nodeMap
 				nodeMap[id] = Node(name, childIds, None, tips, False)
+
 				# Update childrens' parent reference
 				for childId in childIds:
 					nodeMap[childId].parentId = id
@@ -179,6 +205,7 @@ def parseNewick(
 		updateNameMaps(name, id, nameToFirstId, dupNameToIds)
 		nodeMap[id] = Node(name, [], None, 1, False)
 		return id
+
 def parseNewickName(stream: BasicStream) -> tuple[str, str]:
 	""" Parses a node name from 'stream', and returns a (name, id) pair """
 	name: str
@@ -202,6 +229,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
 			nameChars.append(stream.next())
 		if stream.peek() == ';': # Ignore trailing input semicolon
 			stream.next()
+
 	# Convert to (name, id)
 	name = ''.join(nameChars).rstrip().lower()
 	if name.startswith('mrca'):
@@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
 		if match is None:
 			raise Exception(f'ERROR: invalid name \'{name}\'')
 		return (match.group(1).replace('_', ' '), match.group(2))
+
 def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None:
 	""" Update maps upon a newly parsed name """
 	if name not in nameToFirstId:
@@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI
 			dupNameToIds[name] = [nameToFirstId[name], id]
 		else:
 			dupNameToIds[name].append(id)
+
 def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	""" Update a node in a tree to be named after 2 descendants.
 		Returns the name of one such descendant, for use during recursion. """
@@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	childIds = node.childIds
 	if len(childIds) < 2:
 		raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children')
+
 	# Get 2 children with most tips
 	childTips = [nodeMap[id].tips for id in childIds]
 	maxIdx1 = childTips.index(max(childTips))
@@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	childId2 = childIds[maxIdx2]
 	childName1 = nodeMap[childId1].name
 	childName2 = nodeMap[childId2].name
+
 	# Check for mrca* child names
 	if childName1.startswith('mrca'):
 		childName1 = convertMrcaName(childId1, nodeMap)
 	if childName2.startswith('mrca'):
 		childName2 = convertMrcaName(childId2, nodeMap)
+
 	# Check for composite names
 	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
 	if match is not None:
@@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
 	if match is not None:
 		childName2 = match.group(1)
+
 	# Create composite name
 	node.name = f'[{childName1} + {childName2}]'
 	return childName1
 
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	# 
+
 	genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)
author	Terry Truong <terry06890@gmail.com>	2023-01-29 11:30:47 +1100
committer	Terry Truong <terry06890@gmail.com>	2023-01-29 11:30:47 +1100
commit	8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
tree	ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/gen_otol_data.py
parent	f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)