diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
| commit | 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch) | |
| tree | 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/data/genOtolData.py | |
| parent | a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff) | |
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or
symlinked from, public/. This needed to be changed before each
build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/data/genOtolData.py')
| -rwxr-xr-x | backend/data/genOtolData.py | 250 |
1 files changed, 0 insertions, 250 deletions
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py deleted file mode 100755 index b5e0055..0000000 --- a/backend/data/genOtolData.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os -import json, sqlite3 - -usageInfo = f""" -Usage: {sys.argv[0]} - -Reads files describing a tree-of-life from an 'Open Tree of Life' release, -and stores tree information in a database. - -Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: - The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 - The root node is named n6, and has children n1, n2, and n5. - Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', - 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. - The node with ID 'ott770315' will get the name 'homo sapiens'. - A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). - It is possible for multiple nodes to have the same name. - In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. -Reads an annotations.json file, which is assumed to have this format: - Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, - such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that - support/conflict with the node's placement. -Reads from a picked-names file, if present, which specifies name and node ID pairs. - These help resolve cases where multiple nodes share the same name. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes -annFile = "otol/annotations.json" -dbFile = "data.db" -nodeMap = {} # Maps node IDs to node objects -nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs) -dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs -pickedNamesFile = "pickedOtolNames.txt" - -class Node: - " Represents a tree-of-life node " - def __init__(self, name, childIds, parentId, tips, pSupport): - self.name = name - self.childIds = childIds - self.parentId = parentId - self.tips = tips - self.pSupport = pSupport - -print("Parsing tree file") -# Read file -data = None -with open(treeFile) as file: - data = file.read() -dataIdx = 0 -# Parse content -iterNum = 0 -def parseNewick(): - " Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID " - global data, dataIdx, iterNum - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"At iteration {iterNum}") - # Check for EOF - if dataIdx == len(data): - raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") - # Check for node - if data[dataIdx] == "(": # parse inner node - dataIdx += 1 - childIds = [] - while True: - # Read child - childId = parseNewick() - childIds.append(childId) - if (dataIdx == len(data)): - raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") - # Check for next child - if (data[dataIdx] == ","): - dataIdx += 1 - continue - else: - # Get node name and id - dataIdx += 1 # Consume an expected ')' - name, id = parseNewickName() - updateNameMaps(name, id) - # Get child num-tips total - tips = 0 - for childId in childIds: - tips += nodeMap[childId].tips - # Add node to nodeMap - nodeMap[id] = Node(name, childIds, None, tips, False) - # Update childrens' parent reference - for childId in childIds: - nodeMap[childId].parentId = id - return id - else: # Parse node name - name, id = parseNewickName() - updateNameMaps(name, id) - nodeMap[id] = Node(name, [], None, 1, False) - return id -def parseNewickName(): - " Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair " - global data, dataIdx - name = None - end = dataIdx - # Get name - if (end < len(data) and data[end] == "'"): # Check for quoted name - end += 1 - inQuote = True - while end < len(data): - if (data[end] == "'"): - if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote - end += 2 - continue - else: - end += 1 - inQuote = False - break - end += 1 - if inQuote: - raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") - name = data[dataIdx:end] - dataIdx = end - else: - while end < len(data) and not re.match(r"[(),]", data[end]): - end += 1 - if (end == dataIdx): - raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") - name = data[dataIdx:end].rstrip() - if end == len(data): # Ignore trailing input semicolon - name = name[:-1] - dataIdx = end - # Convert to (name, id) - name = name.lower() - if name.startswith("mrca"): - return (name, name) - elif name[0] == "'": - match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) - if match == None: - raise Exception(f"ERROR: invalid name \"{name}\"") - name = match.group(1).replace("''", "'") - return (name, match.group(2)) - else: - match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) - if match == None: - raise Exception(f"ERROR: invalid name \"{name}\"") - return (match.group(1).replace("_", " "), match.group(2)) -def updateNameMaps(name, id): - global nameToFirstId, dupNameToIds - if name not in nameToFirstId: - nameToFirstId[name] = id - else: - if name not in dupNameToIds: - dupNameToIds[name] = [nameToFirstId[name], id] - else: - dupNameToIds[name].append(id) -rootId = parseNewick() - -print("Resolving duplicate names") -# Read picked-names file -nameToPickedId = {} -if os.path.exists(pickedNamesFile): - with open(pickedNamesFile) as file: - for line in file: - (name, _, otolId) = line.rstrip().partition("|") - nameToPickedId[name] = otolId -# Resolve duplicates -for (dupName, ids) in dupNameToIds.items(): - # Check for picked id - if dupName in nameToPickedId: - idToUse = nameToPickedId[dupName] - else: - # Get conflicting node with most tips - tipNums = [nodeMap[id].tips for id in ids] - maxIdx = tipNums.index(max(tipNums)) - idToUse = ids[maxIdx] - # Adjust name of other conflicting nodes - counter = 2 - for id in ids: - if id != idToUse: - nodeMap[id].name += f" [{counter}]" - counter += 1 - -print("Changing mrca* names") -def convertMrcaName(id): - node = nodeMap[id] - name = node.name - childIds = node.childIds - if len(childIds) < 2: - print(f"WARNING: MRCA node \"{name}\" has less than 2 children") - return - # Get 2 children with most tips - childTips = [nodeMap[id].tips for id in childIds] - maxIdx1 = childTips.index(max(childTips)) - childTips[maxIdx1] = 0 - maxIdx2 = childTips.index(max(childTips)) - childId1 = childIds[maxIdx1] - childId2 = childIds[maxIdx2] - childName1 = nodeMap[childId1].name - childName2 = nodeMap[childId2].name - # Check for mrca* child names - if childName1.startswith("mrca"): - childName1 = convertMrcaName(childId1) - if childName2.startswith("mrca"): - childName2 = convertMrcaName(childId2) - # Check for composite names - match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1) - if match != None: - childName1 = match.group(1) - match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2) - if match != None: - childName2 = match.group(1) - # Create composite name - node.name = f"[{childName1} + {childName2}]" - return childName1 -for (id, node) in nodeMap.items(): - if node.name.startswith("mrca"): - convertMrcaName(id) - -print("Parsing annotations file") -# Read file -data = None -with open(annFile) as file: - data = file.read() -obj = json.loads(data) -nodeAnnsMap = obj["nodes"] -# Find relevant annotations -for (id, node) in nodeMap.items(): - # Set has-support value using annotations - if id in nodeAnnsMap: - nodeAnns = nodeAnnsMap[id] - supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0 - conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0 - node.pSupport = supportQty > 0 and conflictQty == 0 - -print("Creating nodes and edges tables") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") -dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") -dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)") -for (otolId, node) in nodeMap.items(): - dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips)) - for childId in node.childIds: - childNode = nodeMap[childId] - dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)", - (node.name, childNode.name, 1 if childNode.pSupport else 0)) -print("Closing database") -dbCon.commit() -dbCon.close() |
