1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
|
#!/usr/bin/python3
import sys
import sqlite3
usageInfo = f"""
Usage: {sys.argv[0]}
Tries to remove 'low significance' nodes from the database. Currently
removes nodes that don't have an image or description, or a presence in
the reduced tree. Also, for nodes with 'many' children, trims some more,
ignoring the presence of node descriptions.
"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
dbFile = "data.db"
softChildLimit = 500 # Used to determine when a node has 'many' children
print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
print("Finding nodes to keep")
nodesToKeep = set()
nodesToStronglyKeep = set()
ancestorsToTrim = set()
print("\tFinding nodes with descs")
for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc
nodesToKeep.add(name)
print("\tFinding nodes with images")
for (name,) in dbCur.execute("SELECT name FROM node_imgs"):
nodesToKeep.add(name)
nodesToStronglyKeep.add(name)
print("\tFinding nodes in reduced-tree")
for (name,) in dbCur.execute("SELECT name from r_nodes"):
nodesToKeep.add(name)
nodesToStronglyKeep.add(name)
print("\tFinding ancestors")
ancestors = set()
iterNum = 0
for name in nodesToKeep:
iterNum += 1
if iterNum % 1e4 == 0:
print(f"\tAt iteration {iterNum}")
#
while True:
row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone()
if row != None:
parent = row[0]
if parent not in nodesToKeep and parent not in ancestors:
ancestors.add(parent)
if name not in nodesToStronglyKeep:
nodesToStronglyKeep.add(parent)
name = parent
continue
break
# For ancestors that would end up with 1 child, mark for trimming
for n in ancestors:
children = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (n,))]
children = [n for n in children if n in nodesToKeep or n in ancestors]
if len(children) == 1:
ancestorsToTrim.add(n)
print(f"Found {len(ancestorsToTrim)} extra ancestors to trim")
#
nodesToKeep.update(ancestors)
print(f"Result: {len(nodesToKeep) - len(ancestorsToTrim)} nodes to keep")
# Find root node
query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1"
(rootName,) = dbCur.execute(query).fetchone()
print(f"Found root node \"{rootName}\"")
print("Looking for trimmable nodes")
nodeToTipsChg = {} # Used to update 'tips' values after trimming
nodesToDelete = set()
iterNum = 0
def findTrimmables(nodeName):
global iterNum
iterNum += 1
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))]
childrenToKeep, otherChildren = set(), set()
for n in childNames:
if n in nodesToKeep:
childrenToKeep.add(n)
else:
otherChildren.add(n)
tipsRemoved = 0
# Check soft limit
if len(childrenToKeep) > softChildLimit:
numToTrim = len(childrenToKeep) - softChildLimit
# Try removing weakly-kept nodes, preferring those with less tips
candidatesToTrim = [n for n in childrenToKeep if n not in nodesToStronglyKeep]
childToTips = {}
query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(candidatesToTrim)))
for (n, tips) in dbCur.execute(query, candidatesToTrim):
childToTips[n] = tips
candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True)
otherChildren.update(candidatesToTrim[-numToTrim:])
childrenToKeep.difference_update(candidatesToTrim[-numToTrim:])
# Mark nodes for deletion
for n in otherChildren:
tipsRemoved += markForDeletion(n)
# Recurse on children
for n in childrenToKeep:
tipsRemoved += findTrimmables(n)
# Store info for updating num-tips later
nodeToTipsChg[nodeName] = tipsRemoved
return tipsRemoved
def markForDeletion(nodeName):
nodesToDelete.add(nodeName)
ancestorsToTrim.discard(nodeName)
childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))]
if len(childNames) == 0:
return 1
else:
tipsRemoved = 0
for n in childNames:
tipsRemoved += markForDeletion(n)
return tipsRemoved
findTrimmables(rootName)
print(f"Deleting {len(nodesToDelete)} nodes")
iterNum = 0
for nodeName in nodesToDelete:
iterNum += 1
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,))
dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,))
dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,))
dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,))
# Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this
# makes it much harder to restore the original data if needed, and
# the memory savings didn't seem significant.
print(f"Updating num-tips for {len(nodeToTipsChg)} nodes")
iterNum = 0
for (nodeName, tipsChg) in nodeToTipsChg.items():
iterNum += 1
if iterNum % 1e5 == 0:
print(f"At iteration {iterNum}")
#
dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName))
print(f"Deleting {len(ancestorsToTrim)} single-child ancestor nodes")
iterNum = 0
for nodeName in ancestorsToTrim:
iterNum += 1
if iterNum % 1e2 == 0:
print(f"At iteration {iterNum}")
# Get parent and child
row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (nodeName,)).fetchone()
if row == None:
print("ERROR: Root node was marked for deletion")
sys.exit()
parent = row[0]
(child,) = dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,)).fetchone()
# Connect parent and child
dbCur.execute("UPDATE edges SET parent = ? WHERE child = ?", (parent, child))
dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,))
# Delete
dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,))
dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,))
print("Closing database")
dbCon.commit()
dbCon.close()
|