aboutsummaryrefslogtreecommitdiff
path: root/backend/data/trimTree.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/trimTree.py')
-rwxr-xr-xbackend/data/trimTree.py53
1 files changed, 31 insertions, 22 deletions
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py
index 302ea0d..fa269d8 100755
--- a/backend/data/trimTree.py
+++ b/backend/data/trimTree.py
@@ -3,21 +3,25 @@
import sys
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Removes certain children from a tol-tree in an sqlite db.\n"
-usageInfo += "Looks for nodes with an amount of children above a threshold,\n"
-usageInfo += "and removes the excess, excluding those with 'significant'\n"
-usageInfo += "associations, like those with descriptions and images.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Tries to remove 'low significance' nodes from the database. Currently
+removes nodes that don't have an image or description, or a presence in
+the reduced tree. Also, for nodes with 'many' children, trims some more,
+ignoring the presence of node descriptions.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
dbFile = "data.db"
-softChildLimit = 500
+softChildLimit = 500 # Used to determine when a node has 'many' children
+print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Get nodes that shouldn't be deleted, along with their ancestors
+
print("Finding nodes to keep")
nodesToKeep = set()
nodesToStronglyKeep = set()
@@ -41,25 +45,26 @@ for name in nodesToKeep:
print(f"\tAt iteration {iterNum}")
#
while True:
- row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (name,)).fetchone()
+ row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone()
if row != None:
parent = row[0]
if parent not in nodesToKeep and parent not in ancestors:
ancestors.add(parent)
- if name in nodesToStronglyKeep:
+ if name not in nodesToStronglyKeep:
nodesToStronglyKeep.add(parent)
name = parent
continue
break
nodesToKeep.update(ancestors)
-print(f"Total of {len(nodesToKeep)} nodes to keep")
+print(f"Result: {len(nodesToKeep)} nodes to keep")
+
# Find root node
-query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.node IS NULL LIMIT 1"
+query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1"
(rootName,) = dbCur.execute(query).fetchone()
-print(f"Found root node '{rootName}'")
-# Traverse tree, looking for trimmable nodes
+print(f"Found root node \"{rootName}\"")
+
print("Looking for trimmable nodes")
-nodeToTipsChg = {}
+nodeToTipsChg = {} # Used to update 'tips' values after trimming
nodesToDelete = set()
iterNum = 0
def findTrimmables(nodeName):
@@ -68,15 +73,15 @@ def findTrimmables(nodeName):
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
- childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))]
childrenToKeep, otherChildren = set(), set()
for n in childNames:
if n in nodesToKeep:
childrenToKeep.add(n)
else:
otherChildren.add(n)
- # Check soft limit
tipsRemoved = 0
+ # Check soft limit
if len(childrenToKeep) > softChildLimit:
numToTrim = len(childrenToKeep) - softChildLimit
# Try removing weakly-kept nodes, preferring those with less tips
@@ -88,7 +93,7 @@ def findTrimmables(nodeName):
candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True)
otherChildren.update(candidatesToTrim[-numToTrim:])
childrenToKeep.difference_update(candidatesToTrim[-numToTrim:])
- # 'Simulate' deletions
+ # Mark nodes for deletion
for n in otherChildren:
tipsRemoved += markForDeletion(n)
# Recurse on children
@@ -99,7 +104,7 @@ def findTrimmables(nodeName):
return tipsRemoved
def markForDeletion(nodeName):
nodesToDelete.add(nodeName)
- childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))]
if len(childNames) == 0:
return 1
else:
@@ -108,7 +113,7 @@ def markForDeletion(nodeName):
tipsRemoved += markForDeletion(n)
return tipsRemoved
findTrimmables(rootName)
-# Delete trimmable nodes
+
print(f"Deleting {len(nodesToDelete)} nodes")
iterNum = 0
for nodeName in nodesToDelete:
@@ -117,10 +122,13 @@ for nodeName in nodesToDelete:
print(f"At iteration {iterNum}")
#
dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,))
- dbCur.execute("DELETE FROM edges WHERE node = ?", (nodeName,))
+ dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,))
dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,))
dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,))
- dbCur.execute("DELETE FROM eol_ids WHERE name = ?", (nodeName,))
+ # Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this
+ # makes it much harder to restore the original data if needed, and
+ # the memory savings didn't seem significant.
+
print(f"Updating num-tips for {len(nodeToTipsChg)} nodes")
iterNum = 0
for (nodeName, tipsChg) in nodeToTipsChg.items():
@@ -129,6 +137,7 @@ for (nodeName, tipsChg) in nodeToTipsChg.items():
print(f"At iteration {iterNum}")
#
dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()