aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genReducedTreeData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/genReducedTreeData.py')
-rwxr-xr-xbackend/data/genReducedTreeData.py167
1 files changed, 85 insertions, 82 deletions
diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py
index b475794..2e56bba 100755
--- a/backend/data/genReducedTreeData.py
+++ b/backend/data/genReducedTreeData.py
@@ -3,123 +3,131 @@
import sys, os.path, re
import json, sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads \n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Creates a reduced version of the tree in the database.
+Reads a subset of the node names from a file, and creates a
+minimal tree that contains them, possibly with a few extras.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
dbFile = "data.db"
-nodeNamesFile = "reducedTreeNodes.txt"
+nodeNamesFile = "pickedReducedNodes.txt"
minimalNames = set()
nodeMap = {} # Maps node names to node objects
PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit
-compNameRegex = re.compile(r"\[.+ \+ .+]")
+compNameRegex = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes
+
+class Node:
+ " Represents a node from the database "
+ def __init__(self, id, children, parent, tips, pSupport):
+ self.id = id
+ self.children = children
+ self.parent = parent
+ self.tips = tips
+ self.pSupport = pSupport
-# Connect to db
+print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Read in minimal set of node names
+
print("Getting minimal name set")
iterNum = 0
with open(nodeNamesFile) as file:
for line in file:
iterNum += 1
if iterNum % 100 == 0:
- print(f"Iteration {iterNum}")
+ print(f"At iteration {iterNum}")
#
- row = dbCur.execute("SELECT name from nodes WHERE name = ?", (line.rstrip(),)).fetchone()
+ name = line.rstrip()
+ row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone()
if row == None:
- row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone()
+ row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone()
if row != None:
minimalNames.add(row[0])
if len(minimalNames) == 0:
- print("ERROR: No names found", file=sys.stderr)
- sys.exit(1)
-print(f"Name set has {len(minimalNames)} names")
-# Add nodes that connect up to root
-print("Getting connected nodes set")
-iterNum = 0
+ print("No names found")
+ sys.exit(0)
+print(f"Result has {len(minimalNames)} names")
+
+print("Getting ancestor nodes")
rootName = None
+iterNum = 0
for name in minimalNames:
iterNum += 1
if iterNum % 100 == 0:
- print(f"Iteration {iterNum}")
+ print(f"At iteration {iterNum}")
#
prevName = None
while name != None:
if name not in nodeMap:
(id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone()
- row = dbCur.execute("SELECT node, p_support from edges where child = ?", (name,)).fetchone()
+ row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone()
parent = None if row == None or row[0] == "" else row[0]
- pSupport = 1 if row == None or row[1] == 1 else 0
- nodeMap[name] = {
- "id": id,
- "children": [] if prevName == None else [prevName],
- "parent": parent,
- "tips": 0,
- "pSupport": pSupport,
- }
+ pSupport = row == None or row[1] == 1
+ children = [] if prevName == None else [prevName]
+ nodeMap[name] = Node(id, children, parent, 0, pSupport)
prevName = name
name = parent
else:
if prevName != None:
- nodeMap[name]["children"].append(prevName)
+ nodeMap[name].children.append(prevName)
break
if name == None:
rootName = prevName
-print(f"New node set has {len(nodeMap)} nodes")
-# Merge-upward compsite-named nodes
-print("Merging-upward composite-named nodes")
+print(f"Result has {len(nodeMap)} nodes")
+
+print("Merging-upward composite nodes")
namesToRemove = set()
-for (name, nodeObj) in nodeMap.items():
- parent = nodeObj["parent"]
+for (name, node) in nodeMap.items():
+ parent = node.parent
if parent != None and compNameRegex.fullmatch(name) != None:
# Connect children to parent
- nodeMap[parent]["children"].remove(name)
- nodeMap[parent]["children"].extend(nodeObj["children"])
- for n in nodeObj["children"]:
- nodeMap[n]["parent"] = parent
- nodeMap[n]["pSupport"] &= nodeObj["pSupport"]
+ nodeMap[parent].children.remove(name)
+ nodeMap[parent].children.extend(node.children)
+ for n in node.children:
+ nodeMap[n].parent = parent
+ nodeMap[n].pSupport &= node.pSupport
# Remember for removal
namesToRemove.add(name)
for name in namesToRemove:
del nodeMap[name]
-print(f"New node set has {len(nodeMap)} nodes")
-# Remove certain 'chain collapsible' nodes
+print(f"Result has {len(nodeMap)} nodes")
+
print("Removing 'chain collapsible' nodes")
namesToRemove2 = set()
-for (name, nodeObj) in nodeMap.items():
- hasOneChild = len(nodeObj["children"]) == 1
- isOnlyChild = nodeObj["parent"] != None and len(nodeMap[nodeObj["parent"]]["children"]) == 1
+for (name, node) in nodeMap.items():
+ hasOneChild = len(node.children) == 1
+ isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1
if name not in minimalNames and (hasOneChild or isOnlyChild):
- parentName = nodeObj["parent"]
- children = nodeObj["children"]
+ parent = node.parent
# Connect parent and children
- nodeMap[parentName]["children"].remove(name)
- nodeMap[parentName]["children"].extend(children)
- for n in children:
- nodeMap[n]["parent"] = parentName
- # Adjust child pSupport
- nodeMap[n]["pSupport"] &= nodeObj["pSupport"]
+ nodeMap[parent].children.remove(name)
+ nodeMap[parent].children.extend(node.children)
+ for n in node.children:
+ nodeMap[n].parent = parent
+ nodeMap[n].pSupport &= node.pSupport
# Remember for removal
namesToRemove2.add(name)
for name in namesToRemove2:
del nodeMap[name]
namesToRemove.add(name)
-print(f"New node set has {len(nodeMap)} nodes")
-# Add some connected children
-print("Adding additional nearby children")
+print(f"Result has {len(nodeMap)} nodes")
+
+print("Adding some additional nearby children")
namesToAdd = []
iterNum = 0
-for (name, nodeObj) in nodeMap.items():
+for (name, node) in nodeMap.items():
iterNum += 1
if iterNum % 100 == 0:
- print(f"Iteration {iterNum}")
+ print(f"At iteration {iterNum}")
#
- numChildren = len(nodeObj["children"])
+ numChildren = len(node.children)
if numChildren < PREF_NUM_CHILDREN:
- children = [row[0] for row in dbCur.execute("SELECT child FROM edges where node = ?", (name,))]
+ children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))]
newChildren = []
for n in children:
if n in nodeMap or n in namesToRemove:
@@ -132,43 +140,38 @@ for (name, nodeObj) in nodeMap.items():
continue
newChildren.append(n)
newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)]
- nodeObj["children"].extend(newChildNames)
+ node.children.extend(newChildNames)
namesToAdd.extend(newChildNames)
for name in namesToAdd:
- (parent, pSupport) = dbCur.execute("SELECT node, p_support from edges WHERE child = ?", (name,)).fetchone()
+ parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone()
(id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone()
parent = None if parent == "" else parent
- nodeMap[name] = {
- "id": id,
- "children": [],
- "parent": parent,
- "tips": 0,
- "pSupport": pSupport == 1,
- }
-print(f"New node set has {len(nodeMap)} nodes")
-# set tips vals
-print("Setting tips vals")
+ nodeMap[name] = Node(id, [], parent, 0, pSupport == 1)
+print(f"Result has {len(nodeMap)} nodes")
+
+print("Setting 'tips' values")
def setTips(nodeName):
- nodeObj = nodeMap[nodeName]
- if len(nodeObj["children"]) == 0:
- nodeObj["tips"] = 1
+ node = nodeMap[nodeName]
+ if len(node.children) == 0:
+ node.tips = 1
return 1
- tips = sum([setTips(childName) for childName in nodeObj["children"]])
- nodeObj["tips"] = tips
+ tips = sum([setTips(childName) for childName in node.children])
+ node.tips = tips
return tips
setTips(rootName)
-# Add new nodes to db
-print("Adding to db")
+
+print("Adding reduced tree to database")
dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
dbCur.execute("CREATE INDEX r_nodes_idx_nc ON r_nodes(name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE r_edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE TABLE r_edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))")
dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)")
-for (name, nodeObj) in nodeMap.items():
- parentName = "" if nodeObj["parent"] == None else nodeObj["parent"]
- dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, nodeObj["id"], nodeObj["tips"]))
- for childName in nodeObj["children"]:
- pSupport = 1 if nodeMap[childName]["pSupport"] else 0
+for (name, node) in nodeMap.items():
+ parentName = "" if node.parent == None else node.parent
+ dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, node.id, node.tips))
+ for childName in node.children:
+ pSupport = 1 if nodeMap[childName].pSupport else 0
dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()