aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--backend/data/README.md5
-rwxr-xr-xbackend/data/genEolNameData.py1
-rwxr-xr-xbackend/data/genOtolData.py81
-rwxr-xr-xbackend/data/trimTree.py130
4 files changed, 136 insertions, 81 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 0bde721..87cf58d 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -55,13 +55,16 @@ File Generation Process
6 Other
- Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table,
using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+ - Optionally run trimTree.py, which tries to remove some 'low-significance' nodes,
+ for the sake of performance and result-relevance. Without this, jumping to certain
+ nodes within the fungi and moths can take over a minute to render.
data.db Tables
==============
- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT
- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
-- eol\_ids: id INT PRIMARY KEY, name TEXT
- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name)
+- eol\_ids: id INT PRIMARY KEY, name TEXT
- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)
- node\_imgs: id TEXT PRIMARY KEY, img\_id INT, src TEXT
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index 74b0e3d..79e6f4f 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -77,6 +77,7 @@ dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Create tables
dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))")
+dbCur.execute("CREATE INDEX names_idx ON names(name)")
dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)")
dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)")
dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 5a89270..cfb5bed 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, os, re
+import sys, re
import json, sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
@@ -32,8 +32,6 @@ nodeMap = {} # Maps node IDs to node objects
nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs
pickedDupsFile = "genOtolDataPickedDups.txt"
-softChildLimit = 100
-keptNamesFile = "genOtolNamesToKeep.txt" # Contains names to keep when doing node trimming
# Parse treeFile
print("Parsing tree file")
@@ -141,83 +139,6 @@ def parseNewickName():
raise Exception(f"ERROR: invalid name \"{name}\"")
return [match.group(1).replace("_", " "), match.group(2)]
rootId = parseNewick()
-# For nodes with *many* children, remove some of those children
-print("Getting nodes for which to avoid trimming")
-namesToKeep = set()
-if os.path.exists(keptNamesFile):
- with open(keptNamesFile) as file: # Contains names with an image, desc, or reduced-tree-presence
- for line in file:
- namesToKeep.add(line.rstrip())
-else:
- print(f"WARNING: No '{keptNamesFile}' file found")
-print(f"Read in {len(namesToKeep)} nodes")
-keptAncestors = set()
-for name in namesToKeep:
- if name in nameToFirstId:
- ids = [nameToFirstId[name]] if name not in dupNameToIds else dupNameToIds[name]
- for id in ids:
- parentId = nodeMap[id]["parent"]
- while parentId != None:
- parentObj = nodeMap[parentId]
- keptAncestors.add(parentObj["name"])
- parentId = parentObj["parent"]
-oldNamesToKeepSz = len(namesToKeep)
-namesToKeep.update(keptAncestors)
-print("Added {} ancestor nodes".format(len(namesToKeep) - oldNamesToKeepSz))
-print("Trimming nodes from tree")
-def trimChildren(nodeId):
- """ Traverse node tree, looking for nodes with too many children """
- nodeObj = nodeMap[nodeId]
- tipsRemoved = 0
- if len(nodeObj["children"]) > softChildLimit:
- childIds = nodeObj["children"]
- # Look for children to delete, excluding 'kept nodes'
- idsToKeep, otherIds = [], []
- for id in childIds:
- if nodeMap[id]["name"] in namesToKeep:
- idsToKeep.append(id)
- else:
- otherIds.append(id)
- if len(idsToKeep) < softChildLimit:
- # Order by decreasing number of tips, placing excess children in list
- numMoreToKeep = softChildLimit - len(idsToKeep)
- otherIds.sort(key = lambda id: nodeMap[id]["tips"], reverse=True)
- idsToKeep.extend(otherIds[:numMoreToKeep])
- otherIds = otherIds[numMoreToKeep:]
- # Perform deletion
- nodeObj["children"] = idsToKeep
- for id in otherIds:
- tipsRemoved += deleteDownward(id)
- # Recurse on children
- for childId in nodeObj["children"]:
- tipsRemoved += trimChildren(childId)
- nodeObj["tips"] -= tipsRemoved
- return tipsRemoved
-def deleteDownward(nodeId):
- """ Deletes a node and it's descendants from the node map, along with associated data """
- nodeObj = nodeMap[nodeId]
- name = nodeObj["name"]
- # Recurse on children
- tipsRemoved = 0
- if len(nodeObj["children"]) == 0:
- tipsRemoved = 1
- else:
- for childId in nodeObj["children"]:
- tipsRemoved += deleteDownward(childId)
- # Delete from name maps
- if name not in dupNameToIds:
- del nameToFirstId[name]
- else:
- dupNameToIds[name].remove(nodeId)
- if nameToFirstId[name] == nodeId:
- nameToFirstId[name] = dupNameToIds[name][0]
- if len(dupNameToIds[name]) == 1:
- del dupNameToIds[name]
- # Delete from node map
- del nodeMap[nodeId]
- #
- return tipsRemoved
-#trimChildren(rootId)
# Resolve duplicate names
print("Resolving duplicates")
nameToPickedId = {}
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py
new file mode 100755
index 0000000..1291c14
--- /dev/null
+++ b/backend/data/trimTree.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+
+import sys
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Removes certain children from a tol-tree in an sqlite db.\n"
+usageInfo += "Looks for nodes with an amount of children above a threshold,\n"
+usageInfo += "and removes the excess, excluding those with 'significant'\n"
+usageInfo += "associations, like those with descriptions and images.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dbFile = "data.db"
+softChildLimit = 100
+
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get nodes that shouldn't be deleted, along with their ancestors
+print("Finding nodes to keep")
+nodesToKeep = set()
+print("\tFinding nodes with descs")
+for (name,) in dbCur.execute("SELECT name FROM descs"):
+ nodesToKeep.add(name)
+print("\tFinding nodes with images")
+for (name,) in dbCur.execute("SELECT name FROM nodes INNER JOIN node_imgs ON nodes.id = node_imgs.id"):
+ nodesToKeep.add(name)
+print("\tFinding nodes in reduced-tree")
+for (name,) in dbCur.execute("SELECT name from r_nodes"):
+ nodesToKeep.add(name)
+print("\tFinding ancestors")
+ancestors = set()
+iterNum = 0
+for name in nodesToKeep:
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f"\tAt iteration {iterNum}")
+ #
+ while True:
+ row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (name,)).fetchone()
+ if row != None:
+ parent = row[0]
+ if parent not in nodesToKeep and parent not in ancestors:
+ ancestors.add(parent)
+ continue
+ break
+nodesToKeep.update(ancestors)
+print(f"Total of {len(nodesToKeep)} nodes to keep")
+# Find root node
+query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.node IS NULL LIMIT 1"
+(rootName,) = dbCur.execute(query).fetchone()
+print(f"Found root node {rootName}")
+# Traverse tree, looking for trimmable nodes
+print("Looking for trimmable nodes")
+nodeToTipsChg = {}
+nodesToDelete = set()
+iterNum = 0
+def findTrimmables(nodeName):
+ global iterNum
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ tipsRemoved = 0
+ if len(childNames) > softChildLimit:
+ # Look for children to delete, excluding 'kept nodes'
+ childrenToKeep, otherChildren = [], []
+ for n in childNames:
+ if n in nodesToKeep:
+ childrenToKeep.append(n)
+ else:
+ otherChildren.append(n)
+ # Only trim beyond threshold
+ if len(childrenToKeep) < softChildLimit:
+ numMoreToKeep = softChildLimit - len(childrenToKeep)
+ # Prefer keeping nodes with more tips
+ childToTips = {}
+ query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(otherChildren)))
+ for (n, tips) in dbCur.execute(query, otherChildren):
+ childToTips[n] = tips
+ otherChildren.sort(key=lambda n: childToTips[n], reverse=True)
+ childrenToKeep.extend(otherChildren[:numMoreToKeep])
+ otherChildren = otherChildren[numMoreToKeep:]
+ # 'Simulate' deletions
+ childNames = childrenToKeep
+ for n in otherChildren:
+ tipsRemoved += markForDeletion(n)
+ # Recurse on children
+ for n in childNames:
+ tipsRemoved += findTrimmables(n)
+ # Store info for updating num-tips later
+ nodeToTipsChg[nodeName] = tipsRemoved
+ return tipsRemoved
+def markForDeletion(nodeName):
+ nodesToDelete.add(nodeName)
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ if len(childNames) == 0:
+ return 1
+ else:
+ tipsRemoved = 0
+ for n in childNames:
+ tipsRemoved += markForDeletion(n)
+ return tipsRemoved
+findTrimmables(rootName)
+# Delete trimmable nodes
+print(f"Deleting {len(nodesToDelete)} nodes")
+iterNum = 0
+for nodeName in nodesToDelete:
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,))
+ dbCur.execute("DELETE FROM edges WHERE node = ?", (nodeName,))
+ dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,))
+ dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,))
+ dbCur.execute("DELETE FROM eol_ids WHERE name = ?", (nodeName,))
+print(f"Updating num-tips for {len(nodeToTipsChg)} nodes")
+iterNum = 0
+for (nodeName, tipsChg) in nodeToTipsChg.items():
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName))
+# Close db
+dbCon.commit()
+dbCon.close()