diff options
Diffstat (limited to 'backend')
| -rwxr-xr-x | backend/cgi-bin/data.py | 59 | ||||
| -rw-r--r-- | backend/data/README.md | 23 | ||||
| -rwxr-xr-x | backend/data/genReducedTreeData.py | 177 | ||||
| -rwxr-xr-x | backend/data/genReducedTrees.py | 324 | ||||
| -rwxr-xr-x | backend/data/trimTree.py | 143 |
5 files changed, 366 insertions, 360 deletions
diff --git a/backend/cgi-bin/data.py b/backend/cgi-bin/data.py index b152c14..3815715 100755 --- a/backend/cgi-bin/data.py +++ b/backend/cgi-bin/data.py @@ -24,8 +24,10 @@ Query parameters: - toroot: Used with type=node, and causes inclusion of nodes upward to the root, and their children. If specified, the value should be 'true'. - limit: Used with type=sugg to specify the max number of suggestions. -- rtree: Specifies whether the reduced tree should be used. - If specified, the value should be 'true'. +- tree: Specifies which tree should be used. + May be 'trimmed', 'images', or 'picked', corresponding to the + weakly-trimmed, images-only, and picked-nodes trees. The default + is 'images'. """ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) @@ -81,12 +83,13 @@ class InfoResponse: self.subNodesInfo = subNodesInfo # [] | [NodeInfo, NodeInfo] # For data lookup -def lookupNodes(names, useReducedTree, dbCur): +def lookupNodes(names, tree, dbCur): " For a set of node names, returns a name-to-TolNode map that describes those nodes " # Get node info nameToNodes = {} - nodesTable = "nodes" if not useReducedTree else "r_nodes" - edgesTable = "edges" if not useReducedTree else "r_edges" + tblSuffix = "t" if tree == "trimmed" else "i" if tree == "images" else "p" + nodesTable = f"nodes_{tblSuffix}" + edgesTable = f"edges_{tblSuffix}" queryParamStr = ",".join(["?"] * len(names)) query = f"SELECT name, id, tips FROM {nodesTable} WHERE name IN ({queryParamStr})" for (nodeName, otolId, tips) in dbCur.execute(query, names): @@ -133,23 +136,20 @@ def lookupNodes(names, useReducedTree, dbCur): nameToNodes[name].commonName = altName # return nameToNodes -def lookupSuggs(searchStr, suggLimit, useReducedTree, dbCur): +def lookupSuggs(searchStr, suggLimit, tree, dbCur): " For a search string, returns a SearchSuggResponse describing search suggestions " + time1 = time.time() results = [] hasMore = False # Get node names and alt-names query1, query2 = (None, None) - if not useReducedTree: - query1 = "SELECT DISTINCT name FROM nodes" \ - " WHERE name LIKE ? AND name NOT LIKE '[%' ORDER BY length(name) LIMIT ?" - query2 = "SELECT DISTINCT alt_name, name FROM names" \ - " WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?" - else: - query1 = "SELECT DISTINCT name FROM r_nodes" \ - " WHERE name LIKE ? AND name NOT LIKE '[%' ORDER BY length(name) LIMIT ?" - query2 = "SELECT DISTINCT alt_name, names.name FROM" \ - " names INNER JOIN r_nodes ON names.name = r_nodes.name" \ - " WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?" + tblSuffix = "t" if tree == "trimmed" else "i" if tree == "images" else "p" + nodesTable = f"nodes_{tblSuffix}" + query1 = f"SELECT DISTINCT name FROM {nodesTable}" \ + f" WHERE name LIKE ? AND name NOT LIKE '[%' ORDER BY length(name) LIMIT ?" + query2 = f"SELECT DISTINCT alt_name, names.name FROM" \ + f" names INNER JOIN {nodesTable} ON names.name = {nodesTable}.name" \ + f" WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?" # Join results, and get shortest suggs = [] for (nodeName,) in dbCur.execute(query1, (searchStr + "%", suggLimit + 1)): @@ -178,11 +178,12 @@ def lookupSuggs(searchStr, suggLimit, useReducedTree, dbCur): if len(suggs) > suggLimit: hasMore = True # + print(time.time() - time1, file=sys.stderr) return SearchSuggResponse(results, hasMore) -def lookupInfo(name, useReducedTree, dbCur): +def lookupInfo(name, tree, dbCur): " For a node name, returns an InfoResponse, or None " # Get node info - nameToNodes = lookupNodes([name], useReducedTree, dbCur) + nameToNodes = lookupNodes([name], tree, dbCur) tolNode = nameToNodes[name] if name in nameToNodes else None if tolNode == None: return None @@ -190,7 +191,7 @@ def lookupInfo(name, useReducedTree, dbCur): match = re.fullmatch(r"\[(.+) \+ (.+)]", name) subNames = [match.group(1), match.group(2)] if match != None else [] if len(subNames) > 0: - nameToSubNodes = lookupNodes(subNames, useReducedTree, dbCur) + nameToSubNodes = lookupNodes(subNames, tree, dbCur) if len(nameToSubNodes) < 2: print(f"ERROR: Unable to find sub-names entries for {name}", file=sys.stderr) return None @@ -247,15 +248,19 @@ def handleReq(dbCur): query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" (name,) = dbCur.execute(query).fetchone() reqType = queryDict["type"][0] if "type" in queryDict else None - useReducedTree = "rtree" in queryDict and queryDict["rtree"][0] == "true" + tree = queryDict["tree"][0] if "tree" in queryDict else "images" + # Check for valid 'tree' + if tree != None and re.fullmatch(r"trimmed|images|picked", tree) == None: + respondJson(None) + return # Get data of requested type if reqType == "node": toroot = "toroot" in queryDict and queryDict["toroot"][0] == "true" if not toroot: - tolNodes = lookupNodes([name], useReducedTree, dbCur) + tolNodes = lookupNodes([name], tree, dbCur) if len(tolNodes) > 0: tolNode = tolNodes[name] - childNodeObjs = lookupNodes(tolNode.children, useReducedTree, dbCur) + childNodeObjs = lookupNodes(tolNode.children, tree, dbCur) childNodeObjs[name] = tolNode respondJson(childNodeObjs) return @@ -264,7 +269,7 @@ def handleReq(dbCur): ranOnce = False while True: # Get node - tolNodes = lookupNodes([name], useReducedTree, dbCur) + tolNodes = lookupNodes([name], tree, dbCur) if len(tolNodes) == 0: if not ranOnce: respondJson(results) @@ -281,7 +286,7 @@ def handleReq(dbCur): for childName in tolNode.children: if childName not in results: childNamesToAdd.append(childName) - childNodeObjs = lookupNodes(childNamesToAdd, useReducedTree, dbCur) + childNodeObjs = lookupNodes(childNamesToAdd, tree, dbCur) results.update(childNodeObjs) # Check if root if tolNode.parent == None: @@ -302,10 +307,10 @@ def handleReq(dbCur): print(f"INFO: Invalid limit {suggLimit}", file=sys.stderr) # Get search suggestions if not invalidLimit: - respondJson(lookupSuggs(name, suggLimit, useReducedTree, dbCur)) + respondJson(lookupSuggs(name, suggLimit, tree, dbCur)) return elif reqType == "info": - infoResponse = lookupInfo(name, useReducedTree, dbCur) + infoResponse = lookupInfo(name, tree, dbCur) if infoResponse != None: respondJson(infoResponse) return diff --git a/backend/data/README.md b/backend/data/README.md index 776ff17..13aeb89 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -38,13 +38,11 @@ This directory holds files used to generate data.db, which contains tree-of-life Associates a node with an image from another node. `otol_ids` can be an otol ID, or two comma-separated otol IDs or empty strings. The latter is used for compound nodes. -## Reduced-tree data -- `r_nodes` <br> - Format: `name TEXT PRIMARY KEY, tips INT` <br> - Like `nodes`, but for a reduced tree. -- `r_edges` <br> - Format: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` <br> - Like `edges` but for a reduced tree. +## Reduced tree data +- `nodes_t`, `nodes_i`, `nodes_p` <br> + These are like `nodes`, but describe the nodes for various reduced trees. +- `edges_t`, `edges_i`, `edges_p` <br> + Like `edges` but for reduced trees. # Generating the Database @@ -147,9 +145,8 @@ Some of the python scripts require third-party packages: - pickedNames.txt: Has lines of the form `nodeName1|altName1|prefAlt1`. These correspond to entries in the `names` table. `prefAlt` should be 1 or 0. A line like `name1|name1|1` causes a node to have no preferred alt-name. -3. Run genReducedTreeData.py, which generates a second, reduced version of the tree, - adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from - pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line). -4. Optionally run trimTree.py, which tries to remove some 'low significance' nodes, - for the sake of performance and content-relevance. Otherwise, some nodes may have - over 10k children, which can take a while to render (took over a minute in testing). +3. Run genReducedTrees.py, which generates multiple reduced versions of the tree, + adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from + pickedNodes.txt, which lists names of nodes that must be included (1 per line). + The original tree isn't used for web-queries, as some nodes would have over + 10k children, which can take a while to render (took over a minute in testing). diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py deleted file mode 100755 index 2e56bba..0000000 --- a/backend/data/genReducedTreeData.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path, re -import json, sqlite3 - -usageInfo = f""" -Usage: {sys.argv[0]} - -Creates a reduced version of the tree in the database. -Reads a subset of the node names from a file, and creates a -minimal tree that contains them, possibly with a few extras. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dbFile = "data.db" -nodeNamesFile = "pickedReducedNodes.txt" -minimalNames = set() -nodeMap = {} # Maps node names to node objects -PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit -compNameRegex = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes - -class Node: - " Represents a node from the database " - def __init__(self, id, children, parent, tips, pSupport): - self.id = id - self.children = children - self.parent = parent - self.tips = tips - self.pSupport = pSupport - -print("Opening database") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Getting minimal name set") -iterNum = 0 -with open(nodeNamesFile) as file: - for line in file: - iterNum += 1 - if iterNum % 100 == 0: - print(f"At iteration {iterNum}") - # - name = line.rstrip() - row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone() - if row == None: - row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone() - if row != None: - minimalNames.add(row[0]) -if len(minimalNames) == 0: - print("No names found") - sys.exit(0) -print(f"Result has {len(minimalNames)} names") - -print("Getting ancestor nodes") -rootName = None -iterNum = 0 -for name in minimalNames: - iterNum += 1 - if iterNum % 100 == 0: - print(f"At iteration {iterNum}") - # - prevName = None - while name != None: - if name not in nodeMap: - (id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone() - row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone() - parent = None if row == None or row[0] == "" else row[0] - pSupport = row == None or row[1] == 1 - children = [] if prevName == None else [prevName] - nodeMap[name] = Node(id, children, parent, 0, pSupport) - prevName = name - name = parent - else: - if prevName != None: - nodeMap[name].children.append(prevName) - break - if name == None: - rootName = prevName -print(f"Result has {len(nodeMap)} nodes") - -print("Merging-upward composite nodes") -namesToRemove = set() -for (name, node) in nodeMap.items(): - parent = node.parent - if parent != None and compNameRegex.fullmatch(name) != None: - # Connect children to parent - nodeMap[parent].children.remove(name) - nodeMap[parent].children.extend(node.children) - for n in node.children: - nodeMap[n].parent = parent - nodeMap[n].pSupport &= node.pSupport - # Remember for removal - namesToRemove.add(name) -for name in namesToRemove: - del nodeMap[name] -print(f"Result has {len(nodeMap)} nodes") - -print("Removing 'chain collapsible' nodes") -namesToRemove2 = set() -for (name, node) in nodeMap.items(): - hasOneChild = len(node.children) == 1 - isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1 - if name not in minimalNames and (hasOneChild or isOnlyChild): - parent = node.parent - # Connect parent and children - nodeMap[parent].children.remove(name) - nodeMap[parent].children.extend(node.children) - for n in node.children: - nodeMap[n].parent = parent - nodeMap[n].pSupport &= node.pSupport - # Remember for removal - namesToRemove2.add(name) -for name in namesToRemove2: - del nodeMap[name] - namesToRemove.add(name) -print(f"Result has {len(nodeMap)} nodes") - -print("Adding some additional nearby children") -namesToAdd = [] -iterNum = 0 -for (name, node) in nodeMap.items(): - iterNum += 1 - if iterNum % 100 == 0: - print(f"At iteration {iterNum}") - # - numChildren = len(node.children) - if numChildren < PREF_NUM_CHILDREN: - children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))] - newChildren = [] - for n in children: - if n in nodeMap or n in namesToRemove: - continue - if compNameRegex.fullmatch(name) != None: - continue - if dbCur.execute("SELECT name from node_imgs WHERE name = ?", (n,)).fetchone() == None: - continue - if dbCur.execute("SELECT name from linked_imgs WHERE name = ?", (n,)).fetchone() == None: - continue - newChildren.append(n) - newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)] - node.children.extend(newChildNames) - namesToAdd.extend(newChildNames) -for name in namesToAdd: - parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone() - (id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone() - parent = None if parent == "" else parent - nodeMap[name] = Node(id, [], parent, 0, pSupport == 1) -print(f"Result has {len(nodeMap)} nodes") - -print("Setting 'tips' values") -def setTips(nodeName): - node = nodeMap[nodeName] - if len(node.children) == 0: - node.tips = 1 - return 1 - tips = sum([setTips(childName) for childName in node.children]) - node.tips = tips - return tips -setTips(rootName) - -print("Adding reduced tree to database") -dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") -dbCur.execute("CREATE INDEX r_nodes_idx_nc ON r_nodes(name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE r_edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") -dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)") -for (name, node) in nodeMap.items(): - parentName = "" if node.parent == None else node.parent - dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, node.id, node.tips)) - for childName in node.children: - pSupport = 1 if nodeMap[childName].pSupport else 0 - dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport)) - -print("Closing database") -dbCon.commit() -dbCon.close() diff --git a/backend/data/genReducedTrees.py b/backend/data/genReducedTrees.py new file mode 100755 index 0000000..17a66f2 --- /dev/null +++ b/backend/data/genReducedTrees.py @@ -0,0 +1,324 @@ +#!/usr/bin/python3 + +import sys, os.path, re +import json, sqlite3 + +usageInfo = f""" +Usage: {sys.argv[0]} [tree1] + +Creates reduced versions of the tree in the database: +- A 'picked nodes' tree: + Created from a minimal set of node names read from a file, + possibly with some extra randmly-picked children. +- An 'images only' tree: + Created by removing nodes without an image or presence in the + 'picked' tree. +- A 'weakly trimmed' tree: + Created by removing nodes that lack an image or description, or + presence in the 'picked' tree. And, for nodes with 'many' children, + removing some more, despite any node descriptions. + +If tree1 is specified, as 'picked', 'images', or 'trimmed', only that +tree is generated. +""" +if len(sys.argv) > 2 or len(sys.argv) == 2 and re.fullmatch(r"picked|images|trimmed", sys.argv[1]) == None: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +tree = sys.argv[1] if len(sys.argv) > 1 else None +dbFile = "data.db" +pickedNodesFile = "pickedNodes.txt" +COMP_NAME_REGEX = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes +PREF_NUM_CHILDREN = 3 # When generating the picked-nodes tree, include extra children up to this limit +SOFT_CHILD_LIMIT = 500 # When generating the weakly-trimmed tree, this indicates when a node has 'many' children + +class Node: + def __init__(self, id, children, parent, tips, pSupport): + self.id = id + self.children = children + self.parent = parent + self.tips = tips + self.pSupport = pSupport + +print("Opening database") +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() + +def genPickedNodeTree(dbCur, pickedNames, rootName): + global COMP_NAME_REGEX, PREF_NUM_CHILDREN + nodeMap = {} # Maps node names to Nodes + print("Getting ancestors") + nodeMap = genNodeMap(dbCur, pickedNames, 100) + print(f"Result has {len(nodeMap)} nodes") + print("Removing composite nodes") + removedNames = removeCompositeNodes(nodeMap) + print(f"Result has {len(nodeMap)} nodes") + print("Removing 'chain collapsible' nodes") + temp = removeCollapsibleNodes(nodeMap, pickedNames) + removedNames.update(temp) + print(f"Result has {len(nodeMap)} nodes") + print("Adding some additional nearby children") + namesToAdd = [] + iterNum = 0 + for (name, node) in nodeMap.items(): + iterNum += 1 + if iterNum % 100 == 0: + print(f"At iteration {iterNum}") + # + numChildren = len(node.children) + if numChildren < PREF_NUM_CHILDREN: + children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))] + newChildren = [] + for n in children: + if n in nodeMap or n in removedNames: + continue + if COMP_NAME_REGEX.fullmatch(n) != None: + continue + if dbCur.execute("SELECT name from node_imgs WHERE name = ?", (n,)).fetchone() == None and \ + dbCur.execute("SELECT name from linked_imgs WHERE name = ?", (n,)).fetchone() == None: + continue + newChildren.append(n) + newChildNames = newChildren[:(PREF_NUM_CHILDREN - numChildren)] + node.children.extend(newChildNames) + namesToAdd.extend(newChildNames) + for name in namesToAdd: + parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone() + (id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone() + parent = None if parent == "" else parent + nodeMap[name] = Node(id, [], parent, 0, pSupport == 1) + print(f"Result has {len(nodeMap)} nodes") + print("Setting 'tips' values") + updateTips(rootName, nodeMap) + print("Creating table") + addTreeTables(nodeMap, dbCur, "p") +def genImagesOnlyTree(dbCur, nodesWithImgOrPicked, rootName): + print("Getting ancestors") + nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4) + print(f"Result has {len(nodeMap)} nodes") + print("Removing 'collapsible' nodes") + removeCollapsibleNodes(nodeMap, nodesWithImgOrPicked) + print(f"Result has {len(nodeMap)} nodes") + print(f"Setting 'tips' values") + updateTips(rootName, nodeMap) + print("Creating table") + addTreeTables(nodeMap, dbCur, "i") +def genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName): + global SOFT_CHILD_LIMIT + print("Getting ancestors") + nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5) + print(f"Result has {len(nodeMap)} nodes") + print("Getting nodes to 'strongly keep'") + iterNum = 0 + nodesFromImgOrPicked = set() + for name in nodesWithImgOrPicked: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}") + # + while name != None: + if name not in nodesFromImgOrPicked: + nodesFromImgOrPicked.add(name) + name = nodeMap[name].parent + else: + break + print(f"Node set has {len(nodesFromImgOrPicked)} nodes") + print("Removing 'collapsible' nodes") + removeCollapsibleNodes(nodeMap, nodesWithImgDescOrPicked) + print(f"Result has {len(nodeMap)} nodes") + print(f"Finding nodes with 'many' children") + namesToRemove = set() + iterNum = 0 + def findTrimmables(nodeName): + nonlocal nodeMap, nodesFromImgOrPicked, iterNum + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"At iteration {iterNum}") + # + node = nodeMap[nodeName] + if len(node.children) > SOFT_CHILD_LIMIT: + numToTrim = len(node.children) - SOFT_CHILD_LIMIT + # Try removing weakly-kept nodes, preferring those with less tips + candidatesToTrim = [n for n in node.children if n not in nodesFromImgOrPicked] + childToTips = {n: nodeMap[n].tips for n in candidatesToTrim} + candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True) + childrenToRemove = set(candidatesToTrim[-numToTrim:]) + node.children = [n for n in node.children if n not in childrenToRemove] + # Mark nodes for deletion + for n in childrenToRemove: + markForRemoval(n) + # Recurse on children + for n in node.children: + findTrimmables(n) + def markForRemoval(nodeName): + nonlocal nodeMap, namesToRemove + namesToRemove.add(nodeName) + for child in nodeMap[nodeName].children: + markForRemoval(child) + findTrimmables(rootName) + print(f"Deleting {len(namesToRemove)} nodes") + for nodeName in namesToRemove: + del nodeMap[nodeName] + print(f"Setting 'tips' values") + updateTips(rootName, nodeMap) + print("Creating table") + addTreeTables(nodeMap, dbCur, "t") +# Helper functions +def genNodeMap(dbCur, nameSet, itersBeforePrint = 1): + " Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map " + nodeMap = {} + iterNum = 0 + for name in nameSet: + iterNum += 1 + if iterNum % itersBeforePrint == 0: + print(f"At iteration {iterNum}") + # + prevName = None + while name != None: + if name not in nodeMap: + # Add node + (id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone() + row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone() + parent = None if row == None or row[0] == "" else row[0] + pSupport = row == None or row[1] == 1 + children = [] if prevName == None else [prevName] + nodeMap[name] = Node(id, children, parent, 0, pSupport) + # Iterate to parent + prevName = name + name = parent + else: + # Just add as child + if prevName != None: + nodeMap[name].children.append(prevName) + break + return nodeMap +def removeCompositeNodes(nodeMap): + " Given a tree, removes composite-name nodes, and returns the removed nodes' names " + global COMP_NAME_REGEX + namesToRemove = set() + for (name, node) in nodeMap.items(): + parent = node.parent + if parent != None and COMP_NAME_REGEX.fullmatch(name) != None: + # Connect children to parent + nodeMap[parent].children.remove(name) + nodeMap[parent].children.extend(node.children) + for n in node.children: + nodeMap[n].parent = parent + nodeMap[n].pSupport &= node.pSupport + # Remember for removal + namesToRemove.add(name) + for name in namesToRemove: + del nodeMap[name] + return namesToRemove +def removeCollapsibleNodes(nodeMap, nodesToKeep = {}): + """ Given a tree, removes single-child parents, then only-childs, + with given exceptions, and returns the set of removed nodes' names """ + namesToRemove = set() + # Remove single-child parents + for (name, node) in nodeMap.items(): + if len(node.children) == 1 and node.parent != None and name not in nodesToKeep: + # Connect parent and children + parent = node.parent + child = node.children[0] + nodeMap[parent].children.remove(name) + nodeMap[parent].children.append(child) + nodeMap[child].parent = parent + nodeMap[child].pSupport &= node.pSupport + # Remember for removal + namesToRemove.add(name) + for name in namesToRemove: + del nodeMap[name] + # Remove only-childs (not redundant because 'nodesToKeep' can cause single-child parents to be kept) + namesToRemove.clear() + for (name, node) in nodeMap.items(): + isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1 + if isOnlyChild and name not in nodesToKeep: + # Connect parent and children + parent = node.parent + nodeMap[parent].children = node.children + for n in node.children: + nodeMap[n].parent = parent + nodeMap[n].pSupport &= node.pSupport + # Remember for removal + namesToRemove.add(name) + for name in namesToRemove: + del nodeMap[name] + # + return namesToRemove +def updateTips(nodeName, nodeMap): + " Updates the 'tips' values for a node and it's descendants, returning the node's new 'tips' value " + node = nodeMap[nodeName] + tips = sum([updateTips(childName, nodeMap) for childName in node.children]) + tips = max(1, tips) + node.tips = tips + return tips +def addTreeTables(nodeMap, dbCur, suffix): + " Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix " + nodesTbl = f"nodes_{suffix}" + edgesTbl = f"edges_{suffix}" + dbCur.execute(f"CREATE TABLE {nodesTbl} (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") + dbCur.execute(f"CREATE INDEX {nodesTbl}_idx_nc ON {nodesTbl}(name COLLATE NOCASE)") + dbCur.execute(f"CREATE TABLE {edgesTbl} (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") + dbCur.execute(f"CREATE INDEX {edgesTbl}_child_idx ON {edgesTbl}(child)") + for (name, node) in nodeMap.items(): + dbCur.execute(f"INSERT INTO {nodesTbl} VALUES (?, ?, ?)", (name, node.id, node.tips)) + for childName in node.children: + pSupport = 1 if nodeMap[childName].pSupport else 0 + dbCur.execute(f"INSERT INTO {edgesTbl} VALUES (?, ?, ?)", (name, childName, pSupport)) + +print(f"Finding root node") +query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" +(rootName,) = dbCur.execute(query).fetchone() +print(f"Found \"{rootName}\"") + +print('=== Getting picked-nodes ===') +pickedNames = set() +pickedTreeExists = False +if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='nodes_p'").fetchone() == None: + print(f"Reading from {pickedNodesFile}") + with open(pickedNodesFile) as file: + for line in file: + name = line.rstrip() + row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone() + if row == None: + row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone() + if row != None: + pickedNames.add(row[0]) + if len(pickedNames) == 0: + raise Exception("ERROR: No picked names found") +else: + pickedTreeExists = True + print("Picked-node tree already exists") + if tree == 'picked': + sys.exit() + for (name,) in dbCur.execute("SELECT name FROM nodes_p"): + pickedNames.add(name) +print(f"Found {len(pickedNames)} names") + +if (tree == 'picked' or tree == None) and not pickedTreeExists: + print("=== Generating picked-nodes tree ===") + genPickedNodeTree(dbCur, pickedNames, rootName) +if tree != 'picked': + print("=== Finding 'non-low significance' nodes ===") + nodesWithImgOrPicked = set() + nodesWithImgDescOrPicked = set() + print("Finding nodes with descs") + for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc + nodesWithImgDescOrPicked.add(name) + print("Finding nodes with images") + for (name,) in dbCur.execute("SELECT name FROM node_imgs"): + nodesWithImgDescOrPicked.add(name) + nodesWithImgOrPicked.add(name) + print("Adding picked nodes") + for name in pickedNames: + nodesWithImgDescOrPicked.add(name) + nodesWithImgOrPicked.add(name) + if tree == 'images' or tree == None: + print("=== Generating images-only tree ===") + genImagesOnlyTree(dbCur, nodesWithImgOrPicked, rootName) + if tree == 'trimmed' or tree == None: + print("=== Generating weakly-trimmed tree ===") + genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName) + +print("Closing database") +dbCon.commit() +dbCon.close() diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py deleted file mode 100755 index fa269d8..0000000 --- a/backend/data/trimTree.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/python3 - -import sys -import sqlite3 - -usageInfo = f""" -Usage: {sys.argv[0]} - -Tries to remove 'low significance' nodes from the database. Currently -removes nodes that don't have an image or description, or a presence in -the reduced tree. Also, for nodes with 'many' children, trims some more, -ignoring the presence of node descriptions. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dbFile = "data.db" -softChildLimit = 500 # Used to determine when a node has 'many' children - -print("Opening database") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Finding nodes to keep") -nodesToKeep = set() -nodesToStronglyKeep = set() -print("\tFinding nodes with descs") -for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc - nodesToKeep.add(name) -print("\tFinding nodes with images") -for (name,) in dbCur.execute("SELECT name FROM node_imgs"): - nodesToKeep.add(name) - nodesToStronglyKeep.add(name) -print("\tFinding nodes in reduced-tree") -for (name,) in dbCur.execute("SELECT name from r_nodes"): - nodesToKeep.add(name) - nodesToStronglyKeep.add(name) -print("\tFinding ancestors") -ancestors = set() -iterNum = 0 -for name in nodesToKeep: - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"\tAt iteration {iterNum}") - # - while True: - row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone() - if row != None: - parent = row[0] - if parent not in nodesToKeep and parent not in ancestors: - ancestors.add(parent) - if name not in nodesToStronglyKeep: - nodesToStronglyKeep.add(parent) - name = parent - continue - break -nodesToKeep.update(ancestors) -print(f"Result: {len(nodesToKeep)} nodes to keep") - -# Find root node -query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" -(rootName,) = dbCur.execute(query).fetchone() -print(f"Found root node \"{rootName}\"") - -print("Looking for trimmable nodes") -nodeToTipsChg = {} # Used to update 'tips' values after trimming -nodesToDelete = set() -iterNum = 0 -def findTrimmables(nodeName): - global iterNum - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] - childrenToKeep, otherChildren = set(), set() - for n in childNames: - if n in nodesToKeep: - childrenToKeep.add(n) - else: - otherChildren.add(n) - tipsRemoved = 0 - # Check soft limit - if len(childrenToKeep) > softChildLimit: - numToTrim = len(childrenToKeep) - softChildLimit - # Try removing weakly-kept nodes, preferring those with less tips - candidatesToTrim = [n for n in childrenToKeep if n not in nodesToStronglyKeep] - childToTips = {} - query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(candidatesToTrim))) - for (n, tips) in dbCur.execute(query, candidatesToTrim): - childToTips[n] = tips - candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True) - otherChildren.update(candidatesToTrim[-numToTrim:]) - childrenToKeep.difference_update(candidatesToTrim[-numToTrim:]) - # Mark nodes for deletion - for n in otherChildren: - tipsRemoved += markForDeletion(n) - # Recurse on children - for n in childrenToKeep: - tipsRemoved += findTrimmables(n) - # Store info for updating num-tips later - nodeToTipsChg[nodeName] = tipsRemoved - return tipsRemoved -def markForDeletion(nodeName): - nodesToDelete.add(nodeName) - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] - if len(childNames) == 0: - return 1 - else: - tipsRemoved = 0 - for n in childNames: - tipsRemoved += markForDeletion(n) - return tipsRemoved -findTrimmables(rootName) - -print(f"Deleting {len(nodesToDelete)} nodes") -iterNum = 0 -for nodeName in nodesToDelete: - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,)) - dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,)) - dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,)) - dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,)) - # Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this - # makes it much harder to restore the original data if needed, and - # the memory savings didn't seem significant. - -print(f"Updating num-tips for {len(nodeToTipsChg)} nodes") -iterNum = 0 -for (nodeName, tipsChg) in nodeToTipsChg.items(): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"At iteration {iterNum}") - # - dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName)) - -print("Closing database") -dbCon.commit() -dbCon.close() |
