diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-09-11 14:55:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-09-11 15:04:14 +1000 |
| commit | 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch) | |
| tree | 2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/genOtolData.py | |
| parent | daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff) | |
Add backend unit tests
- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps
Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/genOtolData.py')
| -rwxr-xr-x | backend/tolData/genOtolData.py | 246 |
1 files changed, 0 insertions, 246 deletions
#!/usr/bin/python3

import re
import os
import json
import sqlite3

import argparse
parser = argparse.ArgumentParser(description="""
Reads files describing a tree-of-life from an 'Open Tree of Life' release,
and stores tree info in a database.

Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format:
    The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6
    The root node is named n6, and has children n1, n2, and n5.
    Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753',
    'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'.
    The node with ID 'ott770315' will get the name 'homo sapiens'.
    A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]).
    It is possible for multiple nodes to have the same name.
    In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc.
Reads an annotations.json file, which is assumed to have this format:
    Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node,
    such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that
    support/conflict with the node's placement.
Reads from a picked-names file, if present, which specifies name and node ID pairs.
    These help resolve cases where multiple nodes share the same name.
""", formatter_class=argparse.RawDescriptionHelpFormatter)

class Node:
    """ Represents a tree-of-life node """
    # One instance is created per parsed node, so avoid a per-instance __dict__
    __slots__ = ('name', 'childIds', 'parentId', 'tips', 'pSupport')
    def __init__(self, name, childIds, parentId, tips, pSupport):
        self.name = name # Display name (may later get a ' [N]' suffix or a '[a + b]' composite form)
        self.childIds = childIds # IDs of child nodes, in file order
        self.parentId = parentId # ID of the parent node, or None until the parent has been parsed
        self.tips = tips # Number of leaf nodes at or below this node
        self.pSupport = pSupport # True if annotations list supporting trees and no conflicting ones

treeFile = 'otol/labelled_supertree_ottnames.tre' # NOTE(review): original comment said 'Had about 2.5e9 nodes' - confirm magnitude
annFile = 'otol/annotations.json'
dbFile = 'data.db'
nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
pickedNamesFile = 'pickedOtolNames.txt'

# Parser state, shared by parseNewick() and parseNewickName()
data: str = '' # Text being parsed
dataIdx = 0 # Index of the next unread character in 'data'
iterNum = 0 # Number of parseNewick() calls so far (used for progress output)

# A quoted label: a name (in which '' stands for a literal quote), a space, then an ott ID
QUOTED_NAME_REGEX = re.compile(r"'((?:[^\\']|'')+) (ott\d+)'")
# An unquoted label: a name using underscores for spaces, an underscore, then an ott ID
PLAIN_NAME_REGEX = re.compile(r"([^\\\']+)_(ott\d+)")
# A composite name, as produced by convertMrcaName()
COMPOSITE_NAME_REGEX = re.compile(r'\[(.+) \+ (.+)]')

def parseTree(text: str) -> str:
    """ Parses Newick-formatted 'text', (re)populating nodeMap, nameToFirstId, and dupNameToIds.
        Returns the root node's ID. Raises ValueError if the text is malformed. """
    global data, dataIdx, iterNum
    data = text
    dataIdx = 0
    iterNum = 0
    # Clear in place, so other references to these dicts stay valid
    nodeMap.clear()
    nameToFirstId.clear()
    dupNameToIds.clear()
    try:
        return parseNewick()
    finally:
        data = '' # Allow the (potentially large) input text to be freed

def parseNewick() -> str:
    """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
    global dataIdx, iterNum
    iterNum += 1
    if iterNum % 1e5 == 0:
        print(f'At iteration {iterNum}')
    # Check for EOF
    if dataIdx == len(data):
        raise ValueError(f'ERROR: Unexpected EOF at index {dataIdx}')
    # Check for leaf node
    if data[dataIdx] != '(':
        name, nodeId = parseNewickName()
        updateNameMaps(name, nodeId)
        nodeMap[nodeId] = Node(name, [], None, 1, False)
        return nodeId
    # Parse inner node
    dataIdx += 1
    childIds: list[str] = []
    while True:
        childIds.append(parseNewick())
        if dataIdx == len(data):
            raise ValueError(f'ERROR: Unexpected EOF at index {dataIdx}')
        if data[dataIdx] != ',':
            break
        dataIdx += 1
    # The child list must be closed before the node's own label
    if data[dataIdx] != ')':
        raise ValueError(f"ERROR: Expected ')' at index {dataIdx}")
    dataIdx += 1
    # Get node name and id
    name, nodeId = parseNewickName()
    updateNameMaps(name, nodeId)
    # Add node to nodeMap, with a tip count totalled from the children
    tips = sum(nodeMap[childId].tips for childId in childIds)
    nodeMap[nodeId] = Node(name, childIds, None, tips, False)
    # Update childrens' parent reference
    for childId in childIds:
        nodeMap[childId].parentId = nodeId
    return nodeId

def parseNewickName() -> tuple[str, str]:
    """ Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair.
        Raises ValueError on a missing, unterminated, or unrecognised name. """
    global dataIdx
    name: str
    end = dataIdx
    # Get name
    if end < len(data) and data[end] == "'": # Check for quoted name
        end += 1
        closed = False
        while end < len(data):
            if data[end] == "'":
                if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote
                    end += 2
                    continue
                end += 1
                closed = True
                break
            end += 1
        if not closed:
            raise ValueError(f'ERROR: Unexpected EOF at index {dataIdx}')
        name = data[dataIdx:end]
    else:
        while end < len(data) and data[end] not in '(),':
            end += 1
        if end == dataIdx:
            raise ValueError(f'ERROR: Unexpected EOF at index {dataIdx}')
        name = data[dataIdx:end].rstrip()
        # Ignore a trailing input semicolon, but only if one is actually present
        if end == len(data) and name.endswith(';'):
            name = name[:-1]
    dataIdx = end
    # Convert to (name, id)
    name = name.lower()
    if name.startswith('mrca'):
        return (name, name)
    if name.startswith("'"):
        match = QUOTED_NAME_REGEX.fullmatch(name)
        if match is None:
            raise ValueError(f"ERROR: invalid name '{name}'")
        return (match.group(1).replace("''", "'"), match.group(2))
    match = PLAIN_NAME_REGEX.fullmatch(name)
    if match is None:
        raise ValueError(f"ERROR: invalid name '{name}'")
    return (match.group(1).replace('_', ' '), match.group(2))

def updateNameMaps(name, id):
    """ Records that 'name' is used by the node with ID 'id', updating nameToFirstId and dupNameToIds """
    if name not in nameToFirstId:
        nameToFirstId[name] = id
        return
    # Name was seen before, so track every ID that uses it (starting with the first)
    dupNameToIds.setdefault(name, [nameToFirstId[name]]).append(id)

def readPickedNames(path: str) -> dict[str, str]:
    """ Reads 'name|id' lines from the file at 'path', if it exists, and returns a name-to-ID map """
    nameToPickedId: dict[str, str] = {}
    if os.path.exists(path):
        with open(path) as file:
            for line in file:
                name, _, otolId = line.rstrip().partition('|')
                nameToPickedId[name] = otolId
    return nameToPickedId

def resolveDupNames(nameToPickedId: dict[str, str]) -> None:
    """ For each name shared by multiple nodes, lets one node keep it, and appends ' [2]', ' [3]', etc, to the rest.
        The node kept is the picked one, if specified, or otherwise the one with the most tips. """
    for dupName, ids in dupNameToIds.items():
        if dupName in nameToPickedId:
            idToUse = nameToPickedId[dupName]
        else:
            # max() returns the first of any tied IDs
            idToUse = max(ids, key=lambda nodeId: nodeMap[nodeId].tips)
        # Adjust name of other conflicting nodes
        counter = 2
        for nodeId in ids:
            if nodeId != idToUse:
                nodeMap[nodeId].name += f' [{counter}]'
                counter += 1

def convertMrcaName(id: str):
    """ Renames the mrca* node with ID 'id' to '[name1 + name2]', using its two children with the most tips.
        Returns the first name used. If the node has less than 2 children, prints a warning, leaves the name
        unchanged, and returns that name (so a recursive caller always gets a string). """
    node = nodeMap[id]
    name = node.name
    childIds = node.childIds
    if len(childIds) < 2:
        print(f"WARNING: MRCA node '{name}' has less than 2 children")
        return name
    # Get 2 children with most tips (the sort is stable, so ties keep file order)
    childId1, childId2 = sorted(childIds, key=lambda childId: nodeMap[childId].tips, reverse=True)[:2]
    childName1 = _getLeadName(childId1)
    childName2 = _getLeadName(childId2)
    # Create composite name
    node.name = f'[{childName1} + {childName2}]'
    return childName1

def _getLeadName(childId: str) -> str:
    """ Returns the name a child contributes to a composite: its own name, or the first part if composite """
    childName = nodeMap[childId].name
    # Check for mrca* child name
    if childName.startswith('mrca'):
        childName = convertMrcaName(childId)
    # Check for composite name
    match = COMPOSITE_NAME_REGEX.fullmatch(childName)
    return childName if match is None else match.group(1)

def convertMrcaNames() -> None:
    """ Converts the name of every node in nodeMap that still has an mrca* name """
    for nodeId, node in nodeMap.items():
        if node.name.startswith('mrca'):
            convertMrcaName(nodeId)

def applyAnnotations(nodeAnnsMap: dict) -> None:
    """ Sets each node's pSupport using 'nodeAnnsMap', which maps node IDs to annotation objects.
        A node has support if it lists at least one 'supported_by' tree and no 'conflicts_with' trees. """
    for nodeId, node in nodeMap.items():
        nodeAnns = nodeAnnsMap.get(nodeId)
        if nodeAnns is None:
            continue
        supportQty = len(nodeAnns.get('supported_by', ()))
        conflictQty = len(nodeAnns.get('conflicts_with', ()))
        node.pSupport = supportQty > 0 and conflictQty == 0

def writeDb(path: str) -> None:
    """ Creates 'nodes' and 'edges' tables in the database at 'path', and fills them from nodeMap """
    dbCon = sqlite3.connect(path)
    try:
        dbCur = dbCon.cursor()
        dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)')
        dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)')
        dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))')
        dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)')
        for otolId, node in nodeMap.items():
            dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips))
            for childId in node.childIds:
                childNode = nodeMap[childId]
                dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
                    (node.name, childNode.name, 1 if childNode.pSupport else 0))
        print('Closing database')
        dbCon.commit()
    finally:
        # Release the connection even if table creation or an insert failed
        dbCon.close()

def main() -> None:
    """ Runs the whole conversion, from the input files to the database """
    parser.parse_args()
    print('Parsing tree file')
    with open(treeFile) as file:
        parseTree(file.read())
    print('Resolving duplicate names')
    resolveDupNames(readPickedNames(pickedNamesFile))
    print('Changing mrca* names')
    convertMrcaNames()
    print('Parsing annotations file')
    with open(annFile) as file:
        applyAnnotations(json.load(file)['nodes'])
    print('Creating nodes and edges tables')
    writeDb(dbFile)

if __name__ == '__main__':
    main()
