diff options
Diffstat (limited to 'backend/data/genReducedTreeData.py')
| -rwxr-xr-x | backend/data/genReducedTreeData.py | 167 |
1 files changed, 85 insertions, 82 deletions
diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py index b475794..2e56bba 100755 --- a/backend/data/genReducedTreeData.py +++ b/backend/data/genReducedTreeData.py @@ -3,123 +3,131 @@ import sys, os.path, re import json, sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads \n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Creates a reduced version of the tree in the database. +Reads a subset of the node names from a file, and creates a +minimal tree that contains them, possibly with a few extras. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) dbFile = "data.db" -nodeNamesFile = "reducedTreeNodes.txt" +nodeNamesFile = "pickedReducedNodes.txt" minimalNames = set() nodeMap = {} # Maps node names to node objects PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit -compNameRegex = re.compile(r"\[.+ \+ .+]") +compNameRegex = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes + +class Node: + " Represents a node from the database " + def __init__(self, id, children, parent, tips, pSupport): + self.id = id + self.children = children + self.parent = parent + self.tips = tips + self.pSupport = pSupport -# Connect to db +print("Opening database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Read in minimal set of node names + print("Getting minimal name set") iterNum = 0 with open(nodeNamesFile) as file: for line in file: iterNum += 1 if iterNum % 100 == 0: - print(f"Iteration {iterNum}") + print(f"At iteration {iterNum}") # - row = dbCur.execute("SELECT name from nodes WHERE name = ?", (line.rstrip(),)).fetchone() + name = line.rstrip() + row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone() if row == None: - row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone() + row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone() if row != None: minimalNames.add(row[0]) if len(minimalNames) == 0: - print("ERROR: No names found", file=sys.stderr) - sys.exit(1) -print(f"Name set has {len(minimalNames)} names") -# Add nodes that connect up to root -print("Getting connected nodes set") -iterNum = 0 + print("No names found") + sys.exit(0) +print(f"Result has {len(minimalNames)} names") + +print("Getting ancestor nodes") rootName = None +iterNum = 0 for name in minimalNames: iterNum += 1 if iterNum % 100 == 0: - print(f"Iteration {iterNum}") + print(f"At iteration {iterNum}") # prevName = None while name != None: if name not in nodeMap: (id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone() - row = dbCur.execute("SELECT node, p_support from edges where child = ?", (name,)).fetchone() + row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone() parent = None if row == None or row[0] == "" else row[0] - pSupport = 1 if row == None or row[1] == 1 else 0 - nodeMap[name] = { - "id": id, - "children": [] if prevName == None else [prevName], - "parent": parent, - "tips": 0, - "pSupport": pSupport, - } + pSupport = row == None or row[1] == 1 + children = [] if prevName == None else [prevName] + nodeMap[name] = Node(id, children, parent, 0, pSupport) prevName = name name = parent else: if prevName != None: - nodeMap[name]["children"].append(prevName) + nodeMap[name].children.append(prevName) break if name == None: rootName = prevName -print(f"New node set has {len(nodeMap)} nodes") -# Merge-upward compsite-named nodes -print("Merging-upward composite-named nodes") +print(f"Result has {len(nodeMap)} nodes") + +print("Merging-upward composite nodes") namesToRemove = set() -for (name, nodeObj) in nodeMap.items(): - parent = nodeObj["parent"] +for (name, node) in nodeMap.items(): + parent = node.parent if parent != None and compNameRegex.fullmatch(name) != None: # Connect children to parent - nodeMap[parent]["children"].remove(name) - nodeMap[parent]["children"].extend(nodeObj["children"]) - for n in nodeObj["children"]: - nodeMap[n]["parent"] = parent - nodeMap[n]["pSupport"] &= nodeObj["pSupport"] + nodeMap[parent].children.remove(name) + nodeMap[parent].children.extend(node.children) + for n in node.children: + nodeMap[n].parent = parent + nodeMap[n].pSupport &= node.pSupport # Remember for removal namesToRemove.add(name) for name in namesToRemove: del nodeMap[name] -print(f"New node set has {len(nodeMap)} nodes") -# Remove certain 'chain collapsible' nodes +print(f"Result has {len(nodeMap)} nodes") + print("Removing 'chain collapsible' nodes") namesToRemove2 = set() -for (name, nodeObj) in nodeMap.items(): - hasOneChild = len(nodeObj["children"]) == 1 - isOnlyChild = nodeObj["parent"] != None and len(nodeMap[nodeObj["parent"]]["children"]) == 1 +for (name, node) in nodeMap.items(): + hasOneChild = len(node.children) == 1 + isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1 if name not in minimalNames and (hasOneChild or isOnlyChild): - parentName = nodeObj["parent"] - children = nodeObj["children"] + parent = node.parent # Connect parent and children - nodeMap[parentName]["children"].remove(name) - nodeMap[parentName]["children"].extend(children) - for n in children: - nodeMap[n]["parent"] = parentName - # Adjust child pSupport - nodeMap[n]["pSupport"] &= nodeObj["pSupport"] + nodeMap[parent].children.remove(name) + nodeMap[parent].children.extend(node.children) + for n in node.children: + nodeMap[n].parent = parent + nodeMap[n].pSupport &= node.pSupport # Remember for removal namesToRemove2.add(name) for name in namesToRemove2: del nodeMap[name] namesToRemove.add(name) -print(f"New node set has {len(nodeMap)} nodes") -# Add some connected children -print("Adding additional nearby children") +print(f"Result has {len(nodeMap)} nodes") + +print("Adding some additional nearby children") namesToAdd = [] iterNum = 0 -for (name, nodeObj) in nodeMap.items(): +for (name, node) in nodeMap.items(): iterNum += 1 if iterNum % 100 == 0: - print(f"Iteration {iterNum}") + print(f"At iteration {iterNum}") # - numChildren = len(nodeObj["children"]) + numChildren = len(node.children) if numChildren < PREF_NUM_CHILDREN: - children = [row[0] for row in dbCur.execute("SELECT child FROM edges where node = ?", (name,))] + children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))] newChildren = [] for n in children: if n in nodeMap or n in namesToRemove: @@ -132,43 +140,38 @@ for (name, nodeObj) in nodeMap.items(): continue newChildren.append(n) newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)] - nodeObj["children"].extend(newChildNames) + node.children.extend(newChildNames) namesToAdd.extend(newChildNames) for name in namesToAdd: - (parent, pSupport) = dbCur.execute("SELECT node, p_support from edges WHERE child = ?", (name,)).fetchone() + parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone() (id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone() parent = None if parent == "" else parent - nodeMap[name] = { - "id": id, - "children": [], - "parent": parent, - "tips": 0, - "pSupport": pSupport == 1, - } -print(f"New node set has {len(nodeMap)} nodes") -# set tips vals -print("Setting tips vals") + nodeMap[name] = Node(id, [], parent, 0, pSupport == 1) +print(f"Result has {len(nodeMap)} nodes") + +print("Setting 'tips' values") def setTips(nodeName): - nodeObj = nodeMap[nodeName] - if len(nodeObj["children"]) == 0: - nodeObj["tips"] = 1 + node = nodeMap[nodeName] + if len(node.children) == 0: + node.tips = 1 return 1 - tips = sum([setTips(childName) for childName in nodeObj["children"]]) - nodeObj["tips"] = tips + tips = sum([setTips(childName) for childName in node.children]) + node.tips = tips return tips setTips(rootName) -# Add new nodes to db -print("Adding to db") + +print("Adding reduced tree to database") dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") dbCur.execute("CREATE INDEX r_nodes_idx_nc ON r_nodes(name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE r_edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") +dbCur.execute("CREATE TABLE r_edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)") -for (name, nodeObj) in nodeMap.items(): - parentName = "" if nodeObj["parent"] == None else nodeObj["parent"] - dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, nodeObj["id"], nodeObj["tips"])) - for childName in nodeObj["children"]: - pSupport = 1 if nodeMap[childName]["pSupport"] else 0 +for (name, node) in nodeMap.items(): + parentName = "" if node.parent == None else node.parent + dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, node.id, node.tips)) + for childName in node.children: + pSupport = 1 if nodeMap[childName].pSupport else 0 dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() |
