aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genReducedTreeData.py
blob: 2e56bba3936d51bef819d903dfcc09978c240d63 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/python3

import sys, os.path, re
import json, sqlite3

usageInfo = f"""
Usage: {sys.argv[0]}

Creates a reduced version of the tree in the database.
Reads a subset of the node names from a file, and creates a
minimal tree that contains them, possibly with a few extras.
"""
if len(sys.argv) > 1:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

dbFile = "data.db"
nodeNamesFile = "pickedReducedNodes.txt"
minimalNames = set()
nodeMap = {} # Maps node names to node objects
PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit
compNameRegex = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes

class Node:
	" Represents a node from the database "
	def __init__(self, id, children, parent, tips, pSupport):
		self.id = id
		self.children = children
		self.parent = parent
		self.tips = tips
		self.pSupport = pSupport

print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()

print("Getting minimal name set")
iterNum = 0
with open(nodeNamesFile) as file:
	for line in file:
		iterNum += 1
		if iterNum % 100 == 0:
			print(f"At iteration {iterNum}")
		#
		name = line.rstrip()
		row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone()
		if row == None:
			row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone()
		if row != None:
			minimalNames.add(row[0])
if len(minimalNames) == 0:
	print("No names found")
	sys.exit(0)
print(f"Result has {len(minimalNames)} names")

print("Getting ancestor nodes")
rootName = None
iterNum = 0
for name in minimalNames:
	iterNum += 1
	if iterNum % 100 == 0:
		print(f"At iteration {iterNum}")
	#
	prevName = None
	while name != None:
		if name not in nodeMap:
			(id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone()
			row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone()
			parent = None if row == None or row[0] == "" else row[0]
			pSupport = row == None or row[1] == 1
			children = [] if prevName == None else [prevName]
			nodeMap[name] = Node(id, children, parent, 0, pSupport)
			prevName = name
			name = parent
		else:
			if prevName != None:
				nodeMap[name].children.append(prevName)
			break
	if name == None:
		rootName = prevName
print(f"Result has {len(nodeMap)} nodes")

print("Merging-upward composite nodes")
namesToRemove = set()
for (name, node) in nodeMap.items():
	parent = node.parent
	if parent != None and compNameRegex.fullmatch(name) != None:
		# Connect children to parent
		nodeMap[parent].children.remove(name)
		nodeMap[parent].children.extend(node.children)
		for n in node.children:
			nodeMap[n].parent = parent
			nodeMap[n].pSupport &= node.pSupport
		# Remember for removal
		namesToRemove.add(name)
for name in namesToRemove:
	del nodeMap[name]
print(f"Result has {len(nodeMap)} nodes")

print("Removing 'chain collapsible' nodes")
namesToRemove2 = set()
for (name, node) in nodeMap.items():
	hasOneChild = len(node.children) == 1
	isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1
	if name not in minimalNames and (hasOneChild or isOnlyChild):
		parent = node.parent
		# Connect parent and children
		nodeMap[parent].children.remove(name)
		nodeMap[parent].children.extend(node.children)
		for n in node.children:
			nodeMap[n].parent = parent
			nodeMap[n].pSupport &= node.pSupport
		# Remember for removal
		namesToRemove2.add(name)
for name in namesToRemove2:
	del nodeMap[name]
	namesToRemove.add(name)
print(f"Result has {len(nodeMap)} nodes")

print("Adding some additional nearby children")
namesToAdd = []
iterNum = 0
for (name, node) in nodeMap.items():
	iterNum += 1
	if iterNum % 100 == 0:
		print(f"At iteration {iterNum}")
	#
	numChildren = len(node.children)
	if numChildren < PREF_NUM_CHILDREN:
		children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))]
		newChildren = []
		for n in children:
			if n in nodeMap or n in namesToRemove:
				continue
			if compNameRegex.fullmatch(name) != None:
				continue
			if dbCur.execute("SELECT name from node_imgs WHERE name = ?", (n,)).fetchone() == None:
				continue
			if dbCur.execute("SELECT name from linked_imgs WHERE name = ?", (n,)).fetchone() == None:
				continue
			newChildren.append(n)
		newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)]
		node.children.extend(newChildNames)
		namesToAdd.extend(newChildNames)
for name in namesToAdd:
	parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone()
	(id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone()
	parent = None if parent == "" else parent
	nodeMap[name] = Node(id, [], parent, 0, pSupport == 1)
print(f"Result has {len(nodeMap)} nodes")

print("Setting 'tips' values")
def setTips(nodeName):
	node = nodeMap[nodeName]
	if len(node.children) == 0:
		node.tips = 1
		return 1
	tips = sum([setTips(childName) for childName in node.children])
	node.tips = tips
	return tips
setTips(rootName)

print("Adding reduced tree to database")
dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
dbCur.execute("CREATE INDEX r_nodes_idx_nc ON r_nodes(name COLLATE NOCASE)")
dbCur.execute("CREATE TABLE r_edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))")
dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)")
for (name, node) in nodeMap.items():
	parentName = "" if node.parent == None else node.parent
	dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, node.id, node.tips))
	for childName in node.children:
		pSupport = 1 if nodeMap[childName].pSupport else 0
		dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport))

print("Closing database")
dbCon.commit()
dbCon.close()