aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rwxr-xr-xbackend/data/trimTree.py53
1 files changed, 28 insertions, 25 deletions
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py
index 294aaf1..302ea0d 100755
--- a/backend/data/trimTree.py
+++ b/backend/data/trimTree.py
@@ -13,22 +13,25 @@ if len(sys.argv) > 1:
sys.exit(1)
dbFile = "data.db"
-softChildLimit = 0
+softChildLimit = 500
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Get nodes that shouldn't be deleted, along with their ancestors
print("Finding nodes to keep")
nodesToKeep = set()
+nodesToStronglyKeep = set()
print("\tFinding nodes with descs")
for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc
nodesToKeep.add(name)
print("\tFinding nodes with images")
for (name,) in dbCur.execute("SELECT name FROM node_imgs"):
nodesToKeep.add(name)
+ nodesToStronglyKeep.add(name)
print("\tFinding nodes in reduced-tree")
for (name,) in dbCur.execute("SELECT name from r_nodes"):
nodesToKeep.add(name)
+ nodesToStronglyKeep.add(name)
print("\tFinding ancestors")
ancestors = set()
iterNum = 0
@@ -43,6 +46,8 @@ for name in nodesToKeep:
parent = row[0]
if parent not in nodesToKeep and parent not in ancestors:
ancestors.add(parent)
+ if name in nodesToStronglyKeep:
+ nodesToStronglyKeep.add(parent)
name = parent
continue
break
@@ -64,32 +69,30 @@ def findTrimmables(nodeName):
print(f"At iteration {iterNum}")
#
childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ childrenToKeep, otherChildren = set(), set()
+ for n in childNames:
+ if n in nodesToKeep:
+ childrenToKeep.add(n)
+ else:
+ otherChildren.add(n)
+ # Check soft limit
tipsRemoved = 0
- if len(childNames) > softChildLimit:
- # Look for children to delete, excluding 'kept nodes'
- childrenToKeep, otherChildren = [], []
- for n in childNames:
- if n in nodesToKeep:
- childrenToKeep.append(n)
- else:
- otherChildren.append(n)
- # Only trim beyond threshold
- if len(childrenToKeep) < softChildLimit:
- numMoreToKeep = softChildLimit - len(childrenToKeep)
- # Prefer keeping nodes with more tips
- childToTips = {}
- query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(otherChildren)))
- for (n, tips) in dbCur.execute(query, otherChildren):
- childToTips[n] = tips
- otherChildren.sort(key=lambda n: childToTips[n], reverse=True)
- childrenToKeep.extend(otherChildren[:numMoreToKeep])
- otherChildren = otherChildren[numMoreToKeep:]
- # 'Simulate' deletions
- childNames = childrenToKeep
- for n in otherChildren:
- tipsRemoved += markForDeletion(n)
+ if len(childrenToKeep) > softChildLimit:
+ numToTrim = len(childrenToKeep) - softChildLimit
+ # Try removing weakly-kept nodes, preferring those with less tips
+ candidatesToTrim = [n for n in childrenToKeep if n not in nodesToStronglyKeep]
+ childToTips = {}
+ query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(candidatesToTrim)))
+ for (n, tips) in dbCur.execute(query, candidatesToTrim):
+ childToTips[n] = tips
+ candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True)
+ otherChildren.update(candidatesToTrim[-numToTrim:])
+ childrenToKeep.difference_update(candidatesToTrim[-numToTrim:])
+ # 'Simulate' deletions
+ for n in otherChildren:
+ tipsRemoved += markForDeletion(n)
# Recurse on children
- for n in childNames:
+ for n in childrenToKeep:
tipsRemoved += findTrimmables(n)
# Store info for updating num-tips later
nodeToTipsChg[nodeName] = tipsRemoved