diff options
Diffstat (limited to 'backend/data')
| -rwxr-xr-x | backend/data/trimTree.py | 53 |
1 files changed, 28 insertions, 25 deletions
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py index 294aaf1..302ea0d 100755 --- a/backend/data/trimTree.py +++ b/backend/data/trimTree.py @@ -13,22 +13,25 @@ if len(sys.argv) > 1: sys.exit(1) dbFile = "data.db" -softChildLimit = 0 +softChildLimit = 500 dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() # Get nodes that shouldn't be deleted, along with their ancestors print("Finding nodes to keep") nodesToKeep = set() +nodesToStronglyKeep = set() print("\tFinding nodes with descs") for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc nodesToKeep.add(name) print("\tFinding nodes with images") for (name,) in dbCur.execute("SELECT name FROM node_imgs"): nodesToKeep.add(name) + nodesToStronglyKeep.add(name) print("\tFinding nodes in reduced-tree") for (name,) in dbCur.execute("SELECT name from r_nodes"): nodesToKeep.add(name) + nodesToStronglyKeep.add(name) print("\tFinding ancestors") ancestors = set() iterNum = 0 @@ -43,6 +46,8 @@ for name in nodesToKeep: parent = row[0] if parent not in nodesToKeep and parent not in ancestors: ancestors.add(parent) + if name in nodesToStronglyKeep: + nodesToStronglyKeep.add(parent) name = parent continue break @@ -64,32 +69,30 @@ def findTrimmables(nodeName): print(f"At iteration {iterNum}") # childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + childrenToKeep, otherChildren = set(), set() + for n in childNames: + if n in nodesToKeep: + childrenToKeep.add(n) + else: + otherChildren.add(n) + # Check soft limit tipsRemoved = 0 - if len(childNames) > softChildLimit: - # Look for children to delete, excluding 'kept nodes' - childrenToKeep, otherChildren = [], [] - for n in childNames: - if n in nodesToKeep: - childrenToKeep.append(n) - else: - otherChildren.append(n) - # Only trim beyond threshold - if len(childrenToKeep) < softChildLimit: - numMoreToKeep = softChildLimit - len(childrenToKeep) - # Prefer keeping nodes with more tips - childToTips = {} - query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(otherChildren))) - for (n, tips) in dbCur.execute(query, otherChildren): - childToTips[n] = tips - otherChildren.sort(key=lambda n: childToTips[n], reverse=True) - childrenToKeep.extend(otherChildren[:numMoreToKeep]) - otherChildren = otherChildren[numMoreToKeep:] - # 'Simulate' deletions - childNames = childrenToKeep - for n in otherChildren: - tipsRemoved += markForDeletion(n) + if len(childrenToKeep) > softChildLimit: + numToTrim = len(childrenToKeep) - softChildLimit + # Try removing weakly-kept nodes, preferring those with less tips + candidatesToTrim = [n for n in childrenToKeep if n not in nodesToStronglyKeep] + childToTips = {} + query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(candidatesToTrim))) + for (n, tips) in dbCur.execute(query, candidatesToTrim): + childToTips[n] = tips + candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True) + otherChildren.update(candidatesToTrim[-numToTrim:]) + childrenToKeep.difference_update(candidatesToTrim[-numToTrim:]) + # 'Simulate' deletions + for n in otherChildren: + tipsRemoved += markForDeletion(n) # Recurse on children - for n in childNames: + for n in childrenToKeep: tipsRemoved += findTrimmables(n) # Store info for updating num-tips later nodeToTipsChg[nodeName] = tipsRemoved |
