diff options
Diffstat (limited to 'backend/data/trimTree.py')
| -rwxr-xr-x | backend/data/trimTree.py | 53 |
1 files changed, 31 insertions, 22 deletions
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py index 302ea0d..fa269d8 100755 --- a/backend/data/trimTree.py +++ b/backend/data/trimTree.py @@ -3,21 +3,25 @@ import sys import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Removes certain children from a tol-tree in an sqlite db.\n" -usageInfo += "Looks for nodes with an amount of children above a threshold,\n" -usageInfo += "and removes the excess, excluding those with 'significant'\n" -usageInfo += "associations, like those with descriptions and images.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Tries to remove 'low significance' nodes from the database. Currently +removes nodes that don't have an image or description, or a presence in +the reduced tree. Also, for nodes with 'many' children, trims some more, +ignoring the presence of node descriptions. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) dbFile = "data.db" -softChildLimit = 500 +softChildLimit = 500 # Used to determine when a node has 'many' children +print("Opening database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Get nodes that shouldn't be deleted, along with their ancestors + print("Finding nodes to keep") nodesToKeep = set() nodesToStronglyKeep = set() @@ -41,25 +45,26 @@ for name in nodesToKeep: print(f"\tAt iteration {iterNum}") # while True: - row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (name,)).fetchone() + row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone() if row != None: parent = row[0] if parent not in nodesToKeep and parent not in ancestors: ancestors.add(parent) - if name in nodesToStronglyKeep: + if name not in nodesToStronglyKeep: nodesToStronglyKeep.add(parent) name = parent continue break nodesToKeep.update(ancestors) -print(f"Total of {len(nodesToKeep)} nodes to keep") +print(f"Result: {len(nodesToKeep)} nodes to keep") + # Find root node -query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.node IS NULL LIMIT 1" +query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" (rootName,) = dbCur.execute(query).fetchone() -print(f"Found root node '{rootName}'") -# Traverse tree, looking for trimmable nodes +print(f"Found root node \"{rootName}\"") + print("Looking for trimmable nodes") -nodeToTipsChg = {} +nodeToTipsChg = {} # Used to update 'tips' values after trimming nodesToDelete = set() iterNum = 0 def findTrimmables(nodeName): @@ -68,15 +73,15 @@ def findTrimmables(nodeName): if iterNum % 1e4 == 0: print(f"At iteration {iterNum}") # - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] childrenToKeep, otherChildren = set(), set() for n in childNames: if n in nodesToKeep: childrenToKeep.add(n) else: otherChildren.add(n) - # Check soft limit tipsRemoved = 0 + # Check soft limit if len(childrenToKeep) > softChildLimit: numToTrim = len(childrenToKeep) - softChildLimit # Try removing weakly-kept nodes, preferring those with less tips @@ -88,7 +93,7 @@ def findTrimmables(nodeName): candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True) otherChildren.update(candidatesToTrim[-numToTrim:]) childrenToKeep.difference_update(candidatesToTrim[-numToTrim:]) - # 'Simulate' deletions + # Mark nodes for deletion for n in otherChildren: tipsRemoved += markForDeletion(n) # Recurse on children @@ -99,7 +104,7 @@ def findTrimmables(nodeName): return tipsRemoved def markForDeletion(nodeName): nodesToDelete.add(nodeName) - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] if len(childNames) == 0: return 1 else: @@ -108,7 +113,7 @@ def markForDeletion(nodeName): tipsRemoved += markForDeletion(n) return tipsRemoved findTrimmables(rootName) -# Delete trimmable nodes + print(f"Deleting {len(nodesToDelete)} nodes") iterNum = 0 for nodeName in nodesToDelete: @@ -117,10 +122,13 @@ for nodeName in nodesToDelete: print(f"At iteration {iterNum}") # dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,)) - dbCur.execute("DELETE FROM edges WHERE node = ?", (nodeName,)) + dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,)) dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,)) dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,)) - dbCur.execute("DELETE FROM eol_ids WHERE name = ?", (nodeName,)) + # Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this + # makes it much harder to restore the original data if needed, and + # the memory savings didn't seem significant. + print(f"Updating num-tips for {len(nodeToTipsChg)} nodes") iterNum = 0 for (nodeName, tipsChg) in nodeToTipsChg.items(): @@ -129,6 +137,7 @@ for (nodeName, tipsChg) in nodeToTipsChg.items(): print(f"At iteration {iterNum}") # dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() |
