diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | backend/data/README.md | 12 | ||||
| -rwxr-xr-x | backend/data/genOtolData.py | 133 | ||||
| -rw-r--r-- | src/App.vue | 2 |
4 files changed, 112 insertions, 36 deletions
@@ -20,3 +20,4 @@ /backend/data/enwiki/.venv/ /backend/data/dbpedia/*.bz2 /backend/data/dbpedia/*.db +/backend/data/namesToKeep.txt diff --git a/backend/data/README.md b/backend/data/README.md index a1bc287..8cfa960 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -4,7 +4,8 @@ File Generation Process 1 Tree Structure Data 1 Obtain data in otol/, as specified in it's README. 2 Run genOtolData.py, which creates data.db, and adds - 'nodes' and 'edges' tables using data in otol/*. + 'nodes' and 'edges' tables using data in otol/*, as well as + namesToKeep.txt, if present. 2 Name Data for Search 1 Obtain data in eol/, as specified in it's README. 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, @@ -57,3 +58,12 @@ Other Files tries to associate tree-of-life node names wth DBpedia node labels. It writes data about them to conflicts.txt, which can be manually edited to resolve them. +- namesToKeep.txt <br> + Contains names to avoid trimming off the tree data generated by + genOtolData.py. Usage is optional, but, without it, a large amount + of possibly-significant nodes are removed, using a short-sighted + heuristic. <br> + One way to generate this list is to generate the files as usual, + then get node names that have an associated image, linked-image, + description, or presence in r_nodes. Then run the genOtolData.py + and genEolNameData.py scripts again. diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py index d1567d3..252e9f2 100755 --- a/backend/data/genOtolData.py +++ b/backend/data/genOtolData.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, os.path, re +import sys, os, re import json, sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" @@ -18,6 +18,9 @@ usageInfo += "Expected annotations.json format:\n" usageInfo += " JSON object holding information about the tree-of-life release.\n" usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n" usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n" +usageInfo += "\n" +usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n" +usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -26,9 +29,10 @@ treeFile = "otol/labelled_supertree_ottnames.tre" annFile = "otol/annotations.json" dbFile = "data.db" nodeMap = {} # Maps node IDs to node objects -idToName = {} # Maps node IDs to names nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs +softChildLimit = 100 +keptNamesFile = "namesToKeep.txt" # Contains names to keep when doing node trimming # Parse treeFile print("Parsing tree file") @@ -81,7 +85,6 @@ def parseNewick(): nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False} return id def updateNameMaps(name, id): - idToName[id] = name if name not in nameToFirstId: nameToFirstId[name] = id else: @@ -136,7 +139,69 @@ def parseNewickName(): if match == None: raise Exception("ERROR: invalid name \"{}\"".format(name)) return [match.group(1).replace("_", " "), match.group(2)] -rootName = parseNewick() +rootId = parseNewick() +# For nodes with *many* children, remove some of those children +print("Trimming nodes from tree") +namesToKeep = set() +if os.path.exists(keptNamesFile): + with open(keptNamesFile) as file: # Contains names with an image (incl linked), desc, or reduced-tree-presence + for line in file: + namesToKeep.add(line.rstrip()) +else: + print("WARNING: No '{}' file found".format(keptNamesFile)) +def trimChildren(nodeId): + """ Traverse node tree, looking for nodes with too many children """ + nodeObj = nodeMap[nodeId] + tipsRemoved = 0 + if len(nodeObj["children"]) > softChildLimit: + childIds = nodeObj["children"] + # Look for children to delete, excluding 'kept nodes' + idsToKeep, otherIds = [], [] + for id in childIds: + if nodeMap[id]["name"] in namesToKeep: + idsToKeep.append(id) + else: + otherIds.append(id) + if len(idsToKeep) < softChildLimit: + # Order by decreasing number of tips, placing excess children in list + numMoreToKeep = softChildLimit - len(idsToKeep) + otherIds.sort(key = lambda id: nodeMap[id]["tips"], reverse=True) + idsToKeep.extend(otherIds[:numMoreToKeep]) + otherIds = otherIds[numMoreToKeep:] + # Perform deletion + nodeObj["children"] = idsToKeep + for id in otherIds: + tipsRemoved += deleteDownward(id) + # Recurse on children + for childId in nodeObj["children"]: + tipsRemoved += trimChildren(childId) + nodeObj["tips"] -= tipsRemoved + return tipsRemoved +def deleteDownward(nodeId): + """ Deletes a node and it's descendants from the node map, along with associated data """ + nodeObj = nodeMap[nodeId] + name = nodeObj["name"] + # Recurse on children + tipsRemoved = 0 + if len(nodeObj["children"]) == 0: + tipsRemoved = 1 + else: + for childId in nodeObj["children"]: + tipsRemoved += deleteDownward(childId) + # Delete from name maps + if name not in dupNameToIds: + del nameToFirstId[name] + else: + dupNameToIds[name].remove(nodeId) + if nameToFirstId[name] == nodeId: + nameToFirstId[name] = dupNameToIds[name][0] + if len(dupNameToIds[name]) == 1: + del dupNameToIds[name] + # Delete from node map + del nodeMap[nodeId] + # + return tipsRemoved +trimChildren(rootId) # Resolve duplicate names print("Resolving duplicates") for [dupName, ids] in dupNameToIds.items(): @@ -153,36 +218,36 @@ for [dupName, ids] in dupNameToIds.items(): # Change mrca* names print("Changing mrca* names") def convertMrcaName(id): - node = nodeMap[id] - name = node["name"] - childIds = node["children"] - if len(childIds) < 2: - print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr) - return - # Get 2 children with most tips - childTips = [nodeMap[id]["tips"] for id in childIds] - maxIdx = childTips.index(max(childTips)) - childTips[maxIdx] = 0 - maxIdx2 = childTips.index(max(childTips)) - childId1 = childIds[maxIdx] - childId2 = childIds[maxIdx2] - childName1 = nodeMap[childId1]["name"] - childName2 = nodeMap[childId2]["name"] - # Check for mrca* child names - if childName1.startswith("mrca"): - childName1 = convertMrcaName(childId1) - if childName2.startswith("mrca"): - childName2 = convertMrcaName(childId2) - # Check for composite names - match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1) - if match != None: - childName1 = match.group(1) - match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2) - if match != None: - childName2 = match.group(1) - # Create composite name - node["name"] = "[{} + {}]".format(childName1, childName2) - return childName1 + node = nodeMap[id] + name = node["name"] + childIds = node["children"] + if len(childIds) < 2: + print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr) + return + # Get 2 children with most tips + childTips = [nodeMap[id]["tips"] for id in childIds] + maxIdx = childTips.index(max(childTips)) + childTips[maxIdx] = 0 + maxIdx2 = childTips.index(max(childTips)) + childId1 = childIds[maxIdx] + childId2 = childIds[maxIdx2] + childName1 = nodeMap[childId1]["name"] + childName2 = nodeMap[childId2]["name"] + # Check for mrca* child names + if childName1.startswith("mrca"): + childName1 = convertMrcaName(childId1) + if childName2.startswith("mrca"): + childName2 = convertMrcaName(childId2) + # Check for composite names + match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1) + if match != None: + childName1 = match.group(1) + match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2) + if match != None: + childName2 = match.group(1) + # Create composite name + node["name"] = "[{} + {}]".format(childName1, childName2) + return childName1 for [id, node] in nodeMap.items(): if node["name"].startswith("mrca"): convertMrcaName(id) diff --git a/src/App.vue b/src/App.vue index b609204..f545e8f 100644 --- a/src/App.vue +++ b/src/App.vue @@ -65,7 +65,7 @@ const defaultUiOpts = { shadowFocused: '0 0 1px 2px orange', infoIconSz: 18, //px infoIconMargin: 2, //px - tipThresholds: [[1, 'greenyellow'], [100, 'orange'], [1000, 'red']], + tipThresholds: [[1, 'greenyellow'], [30, 'orange'], [100, 'red']], headerColor: '#fafaf9', // For leaf tiles leafTilePadding: 4, //px |
