diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-01 19:28:12 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-01 19:28:12 +1000 |
| commit | 551fbe163b90cc1f318612c167fbdfe738dd7132 (patch) | |
| tree | 00286538d754fdf686751a3d4c1689d799ecd65e /backend/data/trimTree.py | |
| parent | c2b9a8b7a706cdca58dab7f4a980401e1c20a602 (diff) | |
Generate 3 reduced trees, keeping the original, and serve only those
Generate a 'trimmed' reduced tree instead of changing the original.
Generate an 'images-only' reduced tree, and use it as the default.
Combine 'picked' reduced tree code with that of other reduced trees.
Adapt server API to allow selecting between more than 2 trees.
Add client setting for selecting between 3 trees.
Diffstat (limited to 'backend/data/trimTree.py')
| -rwxr-xr-x | backend/data/trimTree.py | 143 |
1 files changed, 0 insertions, 143 deletions
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py deleted file mode 100755 index fa269d8..0000000 --- a/backend/data/trimTree.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/python3 - -import sys -import sqlite3 - -usageInfo = f""" -Usage: {sys.argv[0]} - -Tries to remove 'low significance' nodes from the database. Currently -removes nodes that don't have an image or description, or a presence in -the reduced tree. Also, for nodes with 'many' children, trims some more, -ignoring the presence of node descriptions. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dbFile = "data.db" -softChildLimit = 500 # Used to determine when a node has 'many' children - -print("Opening database") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Finding nodes to keep") -nodesToKeep = set() -nodesToStronglyKeep = set() -print("\tFinding nodes with descs") -for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc - nodesToKeep.add(name) -print("\tFinding nodes with images") -for (name,) in dbCur.execute("SELECT name FROM node_imgs"): - nodesToKeep.add(name) - nodesToStronglyKeep.add(name) -print("\tFinding nodes in reduced-tree") -for (name,) in dbCur.execute("SELECT name from r_nodes"): - nodesToKeep.add(name) - nodesToStronglyKeep.add(name) -print("\tFinding ancestors") -ancestors = set() -iterNum = 0 -for name in nodesToKeep: - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"\tAt iteration {iterNum}") - # - while True: - row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone() - if row != None: - parent = row[0] - if parent not in nodesToKeep and parent not in ancestors: - ancestors.add(parent) - if name not in nodesToStronglyKeep: - nodesToStronglyKeep.add(parent) - name = parent - continue - break -nodesToKeep.update(ancestors) -print(f"Result: {len(nodesToKeep)} nodes to keep") - -# Find root node -query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" -(rootName,) = dbCur.execute(query).fetchone() -print(f"Found root node \"{rootName}\"") - -print("Looking for trimmable nodes") -nodeToTipsChg = {} # Used to update 'tips' values after trimming -nodesToDelete = set() -iterNum = 0 -def findTrimmables(nodeName): - global iterNum - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] - childrenToKeep, otherChildren = set(), set() - for n in childNames: - if n in nodesToKeep: - childrenToKeep.add(n) - else: - otherChildren.add(n) - tipsRemoved = 0 - # Check soft limit - if len(childrenToKeep) > softChildLimit: - numToTrim = len(childrenToKeep) - softChildLimit - # Try removing weakly-kept nodes, preferring those with less tips - candidatesToTrim = [n for n in childrenToKeep if n not in nodesToStronglyKeep] - childToTips = {} - query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(candidatesToTrim))) - for (n, tips) in dbCur.execute(query, candidatesToTrim): - childToTips[n] = tips - candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True) - otherChildren.update(candidatesToTrim[-numToTrim:]) - childrenToKeep.difference_update(candidatesToTrim[-numToTrim:]) - # Mark nodes for deletion - for n in otherChildren: - tipsRemoved += markForDeletion(n) - # Recurse on children - for n in childrenToKeep: - tipsRemoved += findTrimmables(n) - # Store info for updating num-tips later - nodeToTipsChg[nodeName] = tipsRemoved - return tipsRemoved -def markForDeletion(nodeName): - nodesToDelete.add(nodeName) - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] - if len(childNames) == 0: - return 1 - else: - tipsRemoved = 0 - for n in childNames: - tipsRemoved += markForDeletion(n) - return tipsRemoved -findTrimmables(rootName) - -print(f"Deleting {len(nodesToDelete)} nodes") -iterNum = 0 -for nodeName in nodesToDelete: - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,)) - dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,)) - dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,)) - dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,)) - # Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this - # makes it much harder to restore the original data if needed, and - # the memory savings didn't seem significant. - -print(f"Updating num-tips for {len(nodeToTipsChg)} nodes") -iterNum = 0 -for (nodeName, tipsChg) in nodeToTipsChg.items(): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"At iteration {iterNum}") - # - dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName)) - -print("Closing database") -dbCon.commit() -dbCon.close() |
