diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-08 14:19:49 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-08 14:31:46 +1000 |
| commit | 834dab545931a3f224ef336530a890a7349b100a (patch) | |
| tree | 1ed5e2a2059bcabd3f8266fd7d52138cc00f026a /backend/data | |
| parent | d84a2dab11aa23d56c3213008424872e1a011279 (diff) | |
Add ancestors_* tables, for faster 'toroot' lookupancestors-tables
Speedup seemed minor, and for a non-wide range of situations.
It also roughly quadrupled the database size.
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 9 | ||||
| -rwxr-xr-x | backend/data/genReducedTrees.py | 32 |
2 files changed, 31 insertions, 10 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 13aeb89..b4b0745 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -43,6 +43,8 @@ This directory holds files used to generate data.db, which contains tree-of-life These are like `nodes`, but describe the nodes for various reduced trees. - `edges_t`, `edges_i`, `edges_p` <br> Like `edges` but for reduced trees. +- `ancestors_t`, `ancestors_i`, `ancestors_p` <br> + Maps nodes to their ancestors. Used for faster lookup. # Generating the Database @@ -146,7 +148,6 @@ Some of the python scripts require third-party packages: These correspond to entries in the `names` table. `prefAlt` should be 1 or 0. A line like `name1|name1|1` causes a node to have no preferred alt-name. 3. Run genReducedTrees.py, which generates multiple reduced versions of the tree, - adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from - pickedNodes.txt, which lists names of nodes that must be included (1 per line). - The original tree isn't used for web-queries, as some nodes would have over - 10k children, which can take a while to render (took over a minute in testing). + adding the `nodes_*`, `edges_*`, and `ancestors_*` tables, using `nodes` and + `names`. Reads from pickedNodes.txt, which lists names of nodes that must be + included (1 per line). The original tree isn't used for web-queries. diff --git a/backend/data/genReducedTrees.py b/backend/data/genReducedTrees.py index a921be4..ad73d3c 100755 --- a/backend/data/genReducedTrees.py +++ b/backend/data/genReducedTrees.py @@ -88,8 +88,8 @@ def genPickedNodeTree(dbCur, pickedNames, rootName): print(f"Result has {len(nodeMap)} nodes") print("Updating 'tips' values") updateTips(rootName, nodeMap) - print("Creating table") - addTreeTables(nodeMap, dbCur, "p") + print("Creating tables") + addTreeTables(nodeMap, dbCur, "p", rootName) def genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName): print("Getting ancestors") nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4) @@ -107,8 +107,8 @@ def genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName): print(f"Result has {len(nodeMap)} nodes") print(f"Updating 'tips' values") updateTips(rootName, nodeMap) - print("Creating table") - addTreeTables(nodeMap, dbCur, "i") + print("Creating tables") + addTreeTables(nodeMap, dbCur, "i", rootName) def genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName): print("Getting ancestors") nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5) @@ -139,7 +139,7 @@ def genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, print(f"Updating 'tips' values") updateTips(rootName, nodeMap) print("Creating table") - addTreeTables(nodeMap, dbCur, "t") + addTreeTables(nodeMap, dbCur, "t", rootName) # Helper functions def genNodeMap(dbCur, nameSet, itersBeforePrint = 1): " Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map " @@ -256,8 +256,9 @@ def updateTips(nodeName, nodeMap): tips = max(1, tips) node.tips = tips return tips -def addTreeTables(nodeMap, dbCur, suffix): +def addTreeTables(nodeMap, dbCur, suffix, rootName): " Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix " + print("Creating nodes and edges tables") nodesTbl = f"nodes_{suffix}" edgesTbl = f"edges_{suffix}" dbCur.execute(f"CREATE TABLE {nodesTbl} (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") @@ -269,6 +270,25 @@ def addTreeTables(nodeMap, dbCur, suffix): for childName in node.children: pSupport = 1 if nodeMap[childName].pSupport else 0 dbCur.execute(f"INSERT INTO {edgesTbl} VALUES (?, ?, ?)", (name, childName, pSupport)) + print("Creating ancestors table") + ancestorsTbl = f"ancestors_{suffix}" + dbCur.execute(f"CREATE TABLE {ancestorsTbl} (name TEXT, ancestor TEXT, PRIMARY KEY (name, ancestor))") + dbCur.execute(f"CREATE INDEX {ancestorsTbl}_idx ON {ancestorsTbl}(name)") + iterNum = 0 + def addAncestors(nodeName, ancestors = []): + nonlocal nodeMap, dbCur, iterNum + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}") + # + node = nodeMap[nodeName] + for ancestor in ancestors: + dbCur.execute(f"INSERT INTO {ancestorsTbl} VALUES (?, ?)", (nodeName, ancestor)) + ancestors.append(nodeName) + for childName in node.children: + addAncestors(childName, ancestors) + ancestors.pop() + addAncestors(rootName) print(f"Finding root node") query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" |
