aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--backend/data/README.md12
-rwxr-xr-xbackend/data/genOtolData.py133
-rw-r--r--src/App.vue2
4 files changed, 112 insertions, 36 deletions
diff --git a/.gitignore b/.gitignore
index 7fa730c..5b7ed15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@
/backend/data/enwiki/.venv/
/backend/data/dbpedia/*.bz2
/backend/data/dbpedia/*.db
+/backend/data/namesToKeep.txt
diff --git a/backend/data/README.md b/backend/data/README.md
index a1bc287..8cfa960 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -4,7 +4,8 @@ File Generation Process
1 Tree Structure Data
1 Obtain data in otol/, as specified in its README.
2 Run genOtolData.py, which creates data.db, and adds
- 'nodes' and 'edges' tables using data in otol/*.
+ 'nodes' and 'edges' tables using data in otol/*, as well as
+ namesToKeep.txt, if present.
2 Name Data for Search
1 Obtain data in eol/, as specified in its README.
2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
@@ -57,3 +58,12 @@ Other Files
tries to associate tree-of-life node names with DBpedia node labels. It
writes data about them to conflicts.txt, which can be manually edited
to resolve them.
+- namesToKeep.txt <br>
+ Contains names to avoid trimming off the tree data generated by
+ genOtolData.py. Usage is optional, but, without it, a large number
+ of possibly-significant nodes are removed, using a short-sighted
+ heuristic. <br>
+ One way to generate this list is to generate the files as usual,
+ then get node names that have an associated image, linked-image,
+ description, or presence in r_nodes. Then run the genOtolData.py
+ and genEolNameData.py scripts again.
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index d1567d3..252e9f2 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, os.path, re
+import sys, os, re
import json, sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
@@ -18,6 +18,9 @@ usageInfo += "Expected annotations.json format:\n"
usageInfo += " JSON object holding information about the tree-of-life release.\n"
usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n"
+usageInfo += "\n"
+usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n"
+usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -26,9 +29,10 @@ treeFile = "otol/labelled_supertree_ottnames.tre"
annFile = "otol/annotations.json"
dbFile = "data.db"
nodeMap = {} # Maps node IDs to node objects
-idToName = {} # Maps node IDs to names
nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs
+softChildLimit = 100
+keptNamesFile = "namesToKeep.txt" # Contains names to keep when doing node trimming
# Parse treeFile
print("Parsing tree file")
@@ -81,7 +85,6 @@ def parseNewick():
nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False}
return id
def updateNameMaps(name, id):
- idToName[id] = name
if name not in nameToFirstId:
nameToFirstId[name] = id
else:
@@ -136,7 +139,69 @@ def parseNewickName():
if match == None:
raise Exception("ERROR: invalid name \"{}\"".format(name))
return [match.group(1).replace("_", " "), match.group(2)]
-rootName = parseNewick()
+rootId = parseNewick()
# For nodes with *many* children, remove some of those children.
# Names listed in keptNamesFile (one per line) are exempt from that trimming.
print("Trimming nodes from tree")
namesToKeep = set()
if not os.path.exists(keptNamesFile):
    # The file is optional; without it no node is exempt from trimming
    print("WARNING: No '{}' file found".format(keptNamesFile))
else:
    # Contains names with an image (incl linked), desc, or reduced-tree-presence
    with open(keptNamesFile) as file:
        namesToKeep.update(line.rstrip() for line in file)
def trimChildren(nodeId):
    """ Walks the subtree rooted at nodeId, cutting down any node with more than softChildLimit children.

    Children whose names are in namesToKeep are always retained. Remaining slots (up to
    softChildLimit) go to the other children with the most tips, and the rest are deleted
    along with their descendants.
    Each visited node's 'tips' count is reduced by the number of tips removed beneath it.
    Returns the number of tips removed from this subtree. """
    node = nodeMap[nodeId]
    removedTips = 0
    if len(node["children"]) > softChildLimit:
        # Split the children into 'kept nodes' and candidates for deletion
        keptIds = [cid for cid in node["children"] if nodeMap[cid]["name"] in namesToKeep]
        spareIds = [cid for cid in node["children"] if nodeMap[cid]["name"] not in namesToKeep]
        # Fill any remaining slots with the candidates that have the most tips
        freeSlots = softChildLimit - len(keptIds)
        if freeSlots > 0:
            spareIds = sorted(spareIds, key=lambda cid: nodeMap[cid]["tips"], reverse=True)
            keptIds += spareIds[:freeSlots]
            spareIds = spareIds[freeSlots:]
        # Detach the excess children before deleting them from the maps
        node["children"] = keptIds
        removedTips += sum(deleteDownward(cid) for cid in spareIds)
    # Recurse on the children that remain
    removedTips += sum(trimChildren(cid) for cid in node["children"])
    node["tips"] -= removedTips
    return removedTips
def deleteDownward(nodeId):
    """ Removes a node and all of its descendants from nodeMap, and updates the name maps to match.

    The node is not removed from its parent's 'children' list; the caller does that.
    Returns the number of tips (childless nodes) that were deleted. """
    node = nodeMap[nodeId]
    nodeName = node["name"]
    kids = node["children"]
    # A childless node counts as one tip; otherwise total the tips deleted under each child
    removedTips = sum(deleteDownward(kid) for kid in kids) if kids else 1
    # Update the name maps (done after recursing, as descendants may share this node's name)
    if nodeName in dupNameToIds:
        sameNameIds = dupNameToIds[nodeName]
        sameNameIds.remove(nodeId)
        # If this node was the 'first' ID for its name, hand that role to a remaining ID
        if nameToFirstId[nodeName] == nodeId:
            nameToFirstId[nodeName] = sameNameIds[0]
        # With only one ID left, the name is no longer a duplicate
        if len(sameNameIds) == 1:
            del dupNameToIds[nodeName]
    else:
        del nameToFirstId[nodeName]
    # Finally drop the node itself
    del nodeMap[nodeId]
    return removedTips
+trimChildren(rootId)
# Resolve duplicate names
print("Resolving duplicates")
for [dupName, ids] in dupNameToIds.items():
@@ -153,36 +218,36 @@ for [dupName, ids] in dupNameToIds.items():
# Change mrca* names
print("Changing mrca* names")
def convertMrcaName(id):
    """ Renames an 'mrca*' node to '[name1 + name2]', built from its two children with the most tips.

    A child whose name starts with 'mrca' is converted first (recursively). A child with a
    composite '[a + b]' name contributes only the first component, 'a'.
    Returns the first component of the new name. If the node has fewer than 2 children,
    a warning is printed to stderr, the node is left as is, and its unchanged name is returned
    (so that a recursive caller still receives a string to work with). """
    node = nodeMap[id]
    name = node["name"]
    childIds = node["children"]
    if len(childIds) < 2:
        print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr)
        return name
    # Get 2 children with most tips
    # (a stable sort keeps the original child order on ties, and always yields two distinct children)
    byTips = sorted(childIds, key=lambda childId: nodeMap[childId]["tips"], reverse=True)
    childId1, childId2 = byTips[0], byTips[1]
    childName1 = nodeMap[childId1]["name"]
    childName2 = nodeMap[childId2]["name"]
    # Check for mrca* child names
    if childName1.startswith("mrca"):
        childName1 = convertMrcaName(childId1)
    if childName2.startswith("mrca"):
        childName2 = convertMrcaName(childId2)
    # Check for composite names, keeping only their first component
    match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1)
    if match is not None:
        childName1 = match.group(1)
    match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2)
    if match is not None:
        childName2 = match.group(1)
    # Create composite name
    node["name"] = "[{} + {}]".format(childName1, childName2)
    return childName1
for [id, node] in nodeMap.items():
if node["name"].startswith("mrca"):
convertMrcaName(id)
diff --git a/src/App.vue b/src/App.vue
index b609204..f545e8f 100644
--- a/src/App.vue
+++ b/src/App.vue
@@ -65,7 +65,7 @@ const defaultUiOpts = {
shadowFocused: '0 0 1px 2px orange',
infoIconSz: 18, //px
infoIconMargin: 2, //px
- tipThresholds: [[1, 'greenyellow'], [100, 'orange'], [1000, 'red']],
+ tipThresholds: [[1, 'greenyellow'], [30, 'orange'], [100, 'red']],
headerColor: '#fafaf9',
// For leaf tiles
leafTilePadding: 4, //px