Add backend unit tests

- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
author: Terry Truong <terry06890@gmail.com> 2022-09-11 14:55:42 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-09-11 15:04:14 +1000
commit: 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree: 2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/genOtolData.py
parent: daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
1 files changed, 0 insertions, 246 deletions
diff --git a/backend/tolData/genOtolData.py b/backend/tolData/genOtolData.py
deleted file mode 100755
index d4d6ee8..0000000
--- a/backend/tolData/genOtolData.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/python3
-
-import re, os
-import json, sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads files describing a tree-of-life from an 'Open Tree of Life' release,
-and stores tree info in a database.
-
-Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format:
-    The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6
-        The root node is named n6, and has children n1, n2, and n5.
-    Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', 
-        'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'.
-        The node with ID 'ott770315' will get the name 'homo sapiens'.
-        A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]).
-    It is possible for multiple nodes to have the same name.
-        In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc.
-Reads an annotations.json file, which is assumed to have this format:
-    Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node,
-    such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that
-    support/conflict with the node's placement.
-Reads from a picked-names file, if present, which specifies name and node ID pairs.
-    These help resolve cases where multiple nodes share the same name.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-class Node:
-	' Represents a tree-of-life node '
-	def __init__(self, name, childIds, parentId, tips, pSupport):
-		self.name = name
-		self.childIds = childIds
-		self.parentId = parentId
-		self.tips = tips
-		self.pSupport = pSupport
-
-treeFile = 'otol/labelled_supertree_ottnames.tre' # Had about 2.5e9 nodes
-annFile = 'otol/annotations.json'
-dbFile = 'data.db'
-nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
-nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
-dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
-pickedNamesFile = 'pickedOtolNames.txt'
-
-print('Parsing tree file')
-# Read file
-data: str
-with open(treeFile) as file:
-	data = file.read()
-dataIdx = 0
-# Parse content
-iterNum = 0
-def parseNewick() -> str:
-	""" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
-	global data, dataIdx, iterNum
-	iterNum += 1
-	if iterNum % 1e5 == 0:
-		print(f'At iteration {iterNum}')
-	# Check for EOF
-	if dataIdx == len(data):
-		raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
-	# Check for node
-	if data[dataIdx] == '(': # parse inner node
-		dataIdx += 1
-		childIds: list[str] = []
-		while True:
-			# Read child
-			childId = parseNewick()
-			childIds.append(childId)
-			if (dataIdx == len(data)):
-				raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
-			# Check for next child
-			if (data[dataIdx] == ','):
-				dataIdx += 1
-				continue
-			else:
-				# Get node name and id
-				dataIdx += 1 # Consume an expected ')'
-				name, id = parseNewickName()
-				updateNameMaps(name, id)
-				# Get child num-tips total
-				tips = 0
-				for childId in childIds:
-					tips += nodeMap[childId].tips
-				# Add node to nodeMap
-				nodeMap[id] = Node(name, childIds, None, tips, False)
-				# Update childrens' parent reference
-				for childId in childIds:
-					nodeMap[childId].parentId = id
-				return id
-	else: # Parse node name
-		name, id = parseNewickName()
-		updateNameMaps(name, id)
-		nodeMap[id] = Node(name, [], None, 1, False)
-		return id
-def parseNewickName() -> tuple[str, str]:
-	""" Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair """
-	global data, dataIdx
-	name: str
-	end = dataIdx
-	# Get name
-	if (end < len(data) and data[end] == "'"): # Check for quoted name
-		end += 1
-		inQuote = True
-		while end < len(data):
-			if (data[end] == "'"):
-				if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote
-					end += 2
-					continue
-				else:
-					end += 1
-					inQuote = False
-					break
-			end += 1
-		if inQuote:
-			raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
-		name = data[dataIdx:end]
-		dataIdx = end
-	else:
-		while end < len(data) and not re.match(r'[(),]', data[end]):
-			end += 1
-		if (end == dataIdx):
-			raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}')
-		name = data[dataIdx:end].rstrip()
-		if end == len(data): # Ignore trailing input semicolon
-			name = name[:-1]
-		dataIdx = end
-	# Convert to (name, id)
-	name = name.lower()
-	if name.startswith('mrca'):
-		return (name, name)
-	elif name[0] == "'":
-		match = re.fullmatch(r"'([^\\\']+) (ott\d+)'", name)
-		if match is None:
-			raise Exception(f'ERROR: invalid name \'{name}\'')
-		name = match.group(1).replace("''", "'")
-		return (name, match.group(2))
-	else:
-		match = re.fullmatch(r"([^\\\']+)_(ott\d+)", name)
-		if match is None:
-			raise Exception(f'ERROR: invalid name \'{name}\'')
-		return (match.group(1).replace('_', ' '), match.group(2))
-def updateNameMaps(name, id):
-	global nameToFirstId, dupNameToIds
-	if name not in nameToFirstId:
-		nameToFirstId[name] = id
-	else:
-		if name not in dupNameToIds:
-			dupNameToIds[name] = [nameToFirstId[name], id]
-		else:
-			dupNameToIds[name].append(id)
-rootId: str = parseNewick()
-
-print('Resolving duplicate names')
-# Read picked-names file
-nameToPickedId: dict[str, str] = {}
-if os.path.exists(pickedNamesFile):
-	with open(pickedNamesFile) as file:
-		for line in file:
-			name, _, otolId = line.rstrip().partition('|')
-			nameToPickedId[name] = otolId
-# Resolve duplicates
-for dupName, ids in dupNameToIds.items():
-	# Check for picked id
-	if dupName in nameToPickedId:
-		idToUse = nameToPickedId[dupName]
-	else:
-		# Get conflicting node with most tips
-		tipNums = [nodeMap[id].tips for id in ids]
-		maxIdx = tipNums.index(max(tipNums))
-		idToUse = ids[maxIdx]
-	# Adjust name of other conflicting nodes
-	counter = 2
-	for id in ids:
-		if id != idToUse:
-			nodeMap[id].name += f' [{counter}]'
-			counter += 1
-
-print('Changing mrca* names')
-def convertMrcaName(id: str):
-	node = nodeMap[id]
-	name = node.name
-	childIds = node.childIds
-	if len(childIds) < 2:
-		print(f'WARNING: MRCA node \'{name}\' has less than 2 children')
-		return
-	# Get 2 children with most tips
-	childTips = [nodeMap[id].tips for id in childIds]
-	maxIdx1 = childTips.index(max(childTips))
-	childTips[maxIdx1] = 0
-	maxIdx2 = childTips.index(max(childTips))
-	childId1 = childIds[maxIdx1]
-	childId2 = childIds[maxIdx2]
-	childName1 = nodeMap[childId1].name
-	childName2 = nodeMap[childId2].name
-	# Check for mrca* child names
-	if childName1.startswith('mrca'):
-		childName1 = convertMrcaName(childId1)
-	if childName2.startswith('mrca'):
-		childName2 = convertMrcaName(childId2)
-	# Check for composite names
-	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
-	if match is not None:
-		childName1 = match.group(1)
-	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
-	if match is not None:
-		childName2 = match.group(1)
-	# Create composite name
-	node.name = f'[{childName1} + {childName2}]'
-	return childName1
-for id, node in nodeMap.items():
-	if node.name.startswith('mrca'):
-		convertMrcaName(id)
-
-print('Parsing annotations file')
-# Read file
-with open(annFile) as file:
-	data = file.read()
-obj = json.loads(data)
-nodeAnnsMap = obj['nodes']
-# Find relevant annotations
-for id, node in nodeMap.items():
-	# Set has-support value using annotations
-	if id in nodeAnnsMap:
-		nodeAnns = nodeAnnsMap[id]
-		supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
-		conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
-		node.pSupport = supportQty > 0 and conflictQty == 0
-
-print('Creating nodes and edges tables')
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)')
-dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)')
-dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))')
-dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)')
-for otolId, node in nodeMap.items():
-	dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips))
-	for childId in node.childIds:
-		childNode = nodeMap[childId]
-		dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
-			(node.name, childNode.name, 1 if childNode.pSupport else 0))
-print('Closing database')
-dbCon.commit()
-dbCon.close()
author	Terry Truong <terry06890@gmail.com>	2022-09-11 14:55:42 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-09-11 15:04:14 +1000
commit	5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree	2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/genOtolData.py
parent	daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)