aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-07-08 14:19:49 +1000
committerTerry Truong <terry06890@gmail.com>2022-07-08 14:31:46 +1000
commit834dab545931a3f224ef336530a890a7349b100a (patch)
tree1ed5e2a2059bcabd3f8266fd7d52138cc00f026a /backend/data
parentd84a2dab11aa23d56c3213008424872e1a011279 (diff)
Add ancestors_* tables, for faster 'toroot' lookupancestors-tables
Speedup seemed minor, and for a non-wide range of situations. It also roughly quadrupled the database size.
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md9
-rwxr-xr-xbackend/data/genReducedTrees.py32
2 files changed, 31 insertions, 10 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 13aeb89..b4b0745 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -43,6 +43,8 @@ This directory holds files used to generate data.db, which contains tree-of-life
These are like `nodes`, but describe the nodes for various reduced trees.
- `edges_t`, `edges_i`, `edges_p` <br>
Like `edges` but for reduced trees.
+- `ancestors_t`, `ancestors_i`, `ancestors_p` <br>
+ Maps nodes to their ancestors. Used for faster lookup.
# Generating the Database
@@ -146,7 +148,6 @@ Some of the python scripts require third-party packages:
These correspond to entries in the `names` table. `prefAlt` should be 1 or 0.
A line like `name1|name1|1` causes a node to have no preferred alt-name.
3. Run genReducedTrees.py, which generates multiple reduced versions of the tree,
- adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from
- pickedNodes.txt, which lists names of nodes that must be included (1 per line).
- The original tree isn't used for web-queries, as some nodes would have over
- 10k children, which can take a while to render (took over a minute in testing).
+ adding the `nodes_*`, `edges_*`, and `ancestors_*` tables, using `nodes` and
+ `names`. Reads from pickedNodes.txt, which lists names of nodes that must be
+ included (1 per line). The original tree isn't used for web-queries.
diff --git a/backend/data/genReducedTrees.py b/backend/data/genReducedTrees.py
index a921be4..ad73d3c 100755
--- a/backend/data/genReducedTrees.py
+++ b/backend/data/genReducedTrees.py
@@ -88,8 +88,8 @@ def genPickedNodeTree(dbCur, pickedNames, rootName):
print(f"Result has {len(nodeMap)} nodes")
print("Updating 'tips' values")
updateTips(rootName, nodeMap)
- print("Creating table")
- addTreeTables(nodeMap, dbCur, "p")
+ print("Creating tables")
+ addTreeTables(nodeMap, dbCur, "p", rootName)
def genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName):
print("Getting ancestors")
nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4)
@@ -107,8 +107,8 @@ def genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName):
print(f"Result has {len(nodeMap)} nodes")
print(f"Updating 'tips' values")
updateTips(rootName, nodeMap)
- print("Creating table")
- addTreeTables(nodeMap, dbCur, "i")
+ print("Creating tables")
+ addTreeTables(nodeMap, dbCur, "i", rootName)
def genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName):
print("Getting ancestors")
nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5)
@@ -139,7 +139,7 @@ def genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked,
print(f"Updating 'tips' values")
updateTips(rootName, nodeMap)
print("Creating table")
- addTreeTables(nodeMap, dbCur, "t")
+ addTreeTables(nodeMap, dbCur, "t", rootName)
# Helper functions
def genNodeMap(dbCur, nameSet, itersBeforePrint = 1):
" Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map "
@@ -256,8 +256,9 @@ def updateTips(nodeName, nodeMap):
tips = max(1, tips)
node.tips = tips
return tips
-def addTreeTables(nodeMap, dbCur, suffix):
+def addTreeTables(nodeMap, dbCur, suffix, rootName):
" Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix "
+ print("Creating nodes and edges tables")
nodesTbl = f"nodes_{suffix}"
edgesTbl = f"edges_{suffix}"
dbCur.execute(f"CREATE TABLE {nodesTbl} (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
@@ -269,6 +270,25 @@ def addTreeTables(nodeMap, dbCur, suffix):
for childName in node.children:
pSupport = 1 if nodeMap[childName].pSupport else 0
dbCur.execute(f"INSERT INTO {edgesTbl} VALUES (?, ?, ?)", (name, childName, pSupport))
+ print("Creating ancestors table")
+ ancestorsTbl = f"ancestors_{suffix}"
+ dbCur.execute(f"CREATE TABLE {ancestorsTbl} (name TEXT, ancestor TEXT, PRIMARY KEY (name, ancestor))")
+ dbCur.execute(f"CREATE INDEX {ancestorsTbl}_idx ON {ancestorsTbl}(name)")
+ iterNum = 0
+ def addAncestors(nodeName, ancestors = []):
+ nonlocal nodeMap, dbCur, iterNum
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ node = nodeMap[nodeName]
+ for ancestor in ancestors:
+ dbCur.execute(f"INSERT INTO {ancestorsTbl} VALUES (?, ?)", (nodeName, ancestor))
+ ancestors.append(nodeName)
+ for childName in node.children:
+ addAncestors(childName, ancestors)
+ ancestors.pop()
+ addAncestors(rootName)
print(f"Finding root node")
query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1"