diff options
Diffstat (limited to 'backend/tolData/genReducedTrees.py')
| -rwxr-xr-x | backend/tolData/genReducedTrees.py | 270 |
1 files changed, 140 insertions, 130 deletions
diff --git a/backend/tolData/genReducedTrees.py b/backend/tolData/genReducedTrees.py index a954fd3..66fef40 100755 --- a/backend/tolData/genReducedTrees.py +++ b/backend/tolData/genReducedTrees.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 -import sys, os.path, re -import json, sqlite3 +import sys, re +import sqlite3 import argparse parser = argparse.ArgumentParser(description=""" @@ -17,13 +17,13 @@ Creates reduced versions of the tree in the database: presence in the 'picked' tree. And, for nodes with 'many' children, removing some more, despite any node descriptions. """, formatter_class=argparse.RawDescriptionHelpFormatter) -parser.add_argument("--tree", choices=["picked", "images", "trimmed"], help="Only generate the specified tree") +parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree') args = parser.parse_args() tree = args.tree -dbFile = "data.db" -pickedNodesFile = "pickedNodes.txt" -COMP_NAME_REGEX = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes +dbFile = 'data.db' +pickedNodesFile = 'pickedNodes.txt' +COMP_NAME_REGEX = re.compile(r'\[.+ \+ .+]') # Used to recognise composite nodes class Node: def __init__(self, id, children, parent, tips, pSupport): @@ -33,144 +33,153 @@ class Node: self.tips = tips self.pSupport = pSupport -print("Opening database") +print('Opening database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -def genPickedNodeTree(dbCur, pickedNames, rootName): +def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: str) -> None: global COMP_NAME_REGEX PREF_NUM_CHILDREN = 3 # Include extra children up to this limit - nodeMap = {} # Maps node names to Nodes - print("Getting ancestors") + print('Getting ancestors') nodeMap = genNodeMap(dbCur, pickedNames, 100) - print(f"Result has {len(nodeMap)} nodes") - print("Removing composite nodes") + print(f'Result has {len(nodeMap)} nodes') + print('Removing composite nodes') removedNames = removeCompositeNodes(nodeMap) - print(f"Result has {len(nodeMap)} nodes") - print("Removing 'collapsible' nodes") + print(f'Result has {len(nodeMap)} nodes') + print('Removing \'collapsible\' nodes') temp = removeCollapsibleNodes(nodeMap, pickedNames) removedNames.update(temp) - print(f"Result has {len(nodeMap)} nodes") - print("Adding some additional nearby children") - namesToAdd = [] + print(f'Result has {len(nodeMap)} nodes') + print('Adding some additional nearby children') + namesToAdd: list[str] = [] iterNum = 0 - for (name, node) in nodeMap.items(): + for name, node in nodeMap.items(): iterNum += 1 if iterNum % 100 == 0: - print(f"At iteration {iterNum}") + print(f'At iteration {iterNum}') # numChildren = len(node.children) if numChildren < PREF_NUM_CHILDREN: - children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))] - newChildren = [] + children = [row[0] for row in dbCur.execute('SELECT child FROM edges where parent = ?', (name,))] + newChildren: list[str] = [] for n in children: if n in nodeMap or n in removedNames: continue - if COMP_NAME_REGEX.fullmatch(n) != None: + if COMP_NAME_REGEX.fullmatch(n) is not None: continue - if dbCur.execute("SELECT name from node_imgs WHERE name = ?", (n,)).fetchone() == None and \ - dbCur.execute("SELECT name from linked_imgs WHERE name = ?", (n,)).fetchone() == None: + if dbCur.execute('SELECT name from node_imgs WHERE name = ?', (n,)).fetchone() is None and \ + dbCur.execute('SELECT name from linked_imgs WHERE name = ?', (n,)).fetchone() is None: continue newChildren.append(n) newChildNames = newChildren[:(PREF_NUM_CHILDREN - numChildren)] node.children.extend(newChildNames) namesToAdd.extend(newChildNames) for name in namesToAdd: - parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone() - (id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone() - parent = None if parent == "" else parent + parent, pSupport = dbCur.execute('SELECT parent, p_support from edges WHERE child = ?', (name,)).fetchone() + (id,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (name,)).fetchone() + parent = None if parent == '' else parent nodeMap[name] = Node(id, [], parent, 0, pSupport == 1) - print(f"Result has {len(nodeMap)} nodes") - print("Updating 'tips' values") + print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') updateTips(rootName, nodeMap) - print("Creating table") - addTreeTables(nodeMap, dbCur, "p") -def genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName): - print("Getting ancestors") + print('Creating table') + addTreeTables(nodeMap, dbCur, 'p') +def genImagesOnlyTree( + dbCur: sqlite3.Cursor, + nodesWithImgOrPicked: set[str], + pickedNames: set[str], + rootName: str) -> None: + print('Getting ancestors') nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4) - print(f"Result has {len(nodeMap)} nodes") - print("Removing composite nodes") + print(f'Result has {len(nodeMap)} nodes') + print('Removing composite nodes') removeCompositeNodes(nodeMap) - print(f"Result has {len(nodeMap)} nodes") - print("Removing 'collapsible' nodes") + print(f'Result has {len(nodeMap)} nodes') + print('Removing \'collapsible\' nodes') removeCollapsibleNodes(nodeMap, pickedNames) - print(f"Result has {len(nodeMap)} nodes") - print(f"Updating 'tips' values") # Needed for next trimming step + print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') # Needed for next trimming step updateTips(rootName, nodeMap) - print(f"Trimming from nodes with 'many' children") + print('Trimming from nodes with \'many\' children') trimIfManyChildren(nodeMap, rootName, 300, pickedNames) - print(f"Result has {len(nodeMap)} nodes") - print(f"Updating 'tips' values") + print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') updateTips(rootName, nodeMap) - print("Creating table") - addTreeTables(nodeMap, dbCur, "i") -def genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName): - print("Getting ancestors") + print('Creating table') + addTreeTables(nodeMap, dbCur, 'i') +def genWeaklyTrimmedTree( + dbCur: sqlite3.Cursor, + nodesWithImgDescOrPicked: set[str], + nodesWithImgOrPicked: set[str], + rootName: str) -> None: + print('Getting ancestors') nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5) - print(f"Result has {len(nodeMap)} nodes") - print("Getting nodes to 'strongly keep'") + print(f'Result has {len(nodeMap)} nodes') + print('Getting nodes to \'strongly keep\'') iterNum = 0 - nodesFromImgOrPicked = set() + nodesFromImgOrPicked: set[str] = set() for name in nodesWithImgOrPicked: iterNum += 1 if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") + print(f'At iteration {iterNum}') # - while name != None: + while name is not None: if name not in nodesFromImgOrPicked: nodesFromImgOrPicked.add(name) name = nodeMap[name].parent else: break - print(f"Node set has {len(nodesFromImgOrPicked)} nodes") - print("Removing 'collapsible' nodes") + print(f'Node set has {len(nodesFromImgOrPicked)} nodes') + print('Removing \'collapsible\' nodes') removeCollapsibleNodes(nodeMap, nodesWithImgDescOrPicked) - print(f"Result has {len(nodeMap)} nodes") - print(f"Updating 'tips' values") # Needed for next trimming step + print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') # Needed for next trimming step updateTips(rootName, nodeMap) - print(f"Trimming from nodes with 'many' children") + print('Trimming from nodes with \'many\' children') trimIfManyChildren(nodeMap, rootName, 600, nodesFromImgOrPicked) - print(f"Result has {len(nodeMap)} nodes") - print(f"Updating 'tips' values") + print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') updateTips(rootName, nodeMap) - print("Creating table") - addTreeTables(nodeMap, dbCur, "t") + print('Creating table') + addTreeTables(nodeMap, dbCur, 't') # Helper functions -def genNodeMap(dbCur, nameSet, itersBeforePrint = 1): - " Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map " - nodeMap = {} +def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -> dict[str, Node]: + """ Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map """ + nodeMap: dict[str, Node] = {} iterNum = 0 + name: str | None for name in nameSet: iterNum += 1 if iterNum % itersBeforePrint == 0: - print(f"At iteration {iterNum}") + print(f'At iteration {iterNum}') # - prevName = None - while name != None: + prevName: str | None = None + while name is not None: if name not in nodeMap: # Add node - (id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone() - row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone() - parent = None if row == None or row[0] == "" else row[0] - pSupport = row == None or row[1] == 1 - children = [] if prevName == None else [prevName] + id, tips = dbCur.execute('SELECT id, tips from nodes where name = ?', (name,)).fetchone() + row: None | tuple[str, int] = dbCur.execute( + 'SELECT parent, p_support from edges where child = ?', (name,)).fetchone() + parent = None if row is None or row[0] == '' else row[0] + pSupport = row is None or row[1] == 1 + children = [] if prevName is None else [prevName] nodeMap[name] = Node(id, children, parent, 0, pSupport) # Iterate to parent prevName = name name = parent else: # Just add as child - if prevName != None: + if prevName is not None: nodeMap[name].children.append(prevName) break return nodeMap -def removeCompositeNodes(nodeMap): - " Given a tree, removes composite-name nodes, and returns the removed nodes' names " +def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]: + """ Given a tree, removes composite-name nodes, and returns the removed nodes' names """ global COMP_NAME_REGEX - namesToRemove = set() - for (name, node) in nodeMap.items(): + namesToRemove: set[str] = set() + for name, node in nodeMap.items(): parent = node.parent - if parent != None and COMP_NAME_REGEX.fullmatch(name) != None: + if parent is not None and COMP_NAME_REGEX.fullmatch(name) is not None: # Connect children to parent nodeMap[parent].children.remove(name) nodeMap[parent].children.extend(node.children) @@ -182,13 +191,13 @@ def removeCompositeNodes(nodeMap): for name in namesToRemove: del nodeMap[name] return namesToRemove -def removeCollapsibleNodes(nodeMap, nodesToKeep = {}): +def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set()) -> set[str]: """ Given a tree, removes single-child parents, then only-childs, with given exceptions, and returns the set of removed nodes' names """ - namesToRemove = set() + namesToRemove: set[str] = set() # Remove single-child parents - for (name, node) in nodeMap.items(): - if len(node.children) == 1 and node.parent != None and name not in nodesToKeep: + for name, node in nodeMap.items(): + if len(node.children) == 1 and node.parent is not None and name not in nodesToKeep: # Connect parent and children parent = node.parent child = node.children[0] @@ -202,8 +211,8 @@ def removeCollapsibleNodes(nodeMap, nodesToKeep = {}): del nodeMap[name] # Remove only-childs (not redundant because 'nodesToKeep' can cause single-child parents to be kept) namesToRemove.clear() - for (name, node) in nodeMap.items(): - isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1 + for name, node in nodeMap.items(): + isOnlyChild = node.parent is not None and len(nodeMap[node.parent].children) == 1 if isOnlyChild and name not in nodesToKeep: # Connect parent and children parent = node.parent @@ -217,9 +226,10 @@ def removeCollapsibleNodes(nodeMap, nodesToKeep = {}): del nodeMap[name] # return namesToRemove -def trimIfManyChildren(nodeMap, rootName, childThreshold, nodesToKeep = {}): - namesToRemove = set() - def findTrimmables(nodeName): +def trimIfManyChildren( + nodeMap: dict[str, Node], rootName: str, childThreshold: int, nodesToKeep: set[str] = set()) -> None: + namesToRemove: set[str] = set() + def findTrimmables(nodeName: str) -> None: nonlocal nodeMap, nodesToKeep node = nodeMap[nodeName] if len(node.children) > childThreshold: @@ -236,7 +246,7 @@ def trimIfManyChildren(nodeMap, rootName, childThreshold, nodesToKeep = {}): # Recurse on children for n in node.children: findTrimmables(n) - def markForRemoval(nodeName): + def markForRemoval(nodeName: str) -> None: nonlocal nodeMap, namesToRemove namesToRemove.add(nodeName) for child in nodeMap[nodeName].children: @@ -244,81 +254,81 @@ def trimIfManyChildren(nodeMap, rootName, childThreshold, nodesToKeep = {}): findTrimmables(rootName) for nodeName in namesToRemove: del nodeMap[nodeName] -def updateTips(nodeName, nodeMap): - " Updates the 'tips' values for a node and it's descendants, returning the node's new 'tips' value " +def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int: + """ Updates the 'tips' values for a node and it's descendants, returning the node's new 'tips' value """ node = nodeMap[nodeName] tips = sum([updateTips(childName, nodeMap) for childName in node.children]) tips = max(1, tips) node.tips = tips return tips -def addTreeTables(nodeMap, dbCur, suffix): - " Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix " - nodesTbl = f"nodes_{suffix}" - edgesTbl = f"edges_{suffix}" - dbCur.execute(f"CREATE TABLE {nodesTbl} (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") - dbCur.execute(f"CREATE INDEX {nodesTbl}_idx_nc ON {nodesTbl}(name COLLATE NOCASE)") - dbCur.execute(f"CREATE TABLE {edgesTbl} (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") - dbCur.execute(f"CREATE INDEX {edgesTbl}_child_idx ON {edgesTbl}(child)") - for (name, node) in nodeMap.items(): - dbCur.execute(f"INSERT INTO {nodesTbl} VALUES (?, ?, ?)", (name, node.id, node.tips)) +def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str): + """ Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix """ + nodesTbl = f'nodes_{suffix}' + edgesTbl = f'edges_{suffix}' + dbCur.execute(f'CREATE TABLE {nodesTbl} (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)') + dbCur.execute(f'CREATE INDEX {nodesTbl}_idx_nc ON {nodesTbl}(name COLLATE NOCASE)') + dbCur.execute(f'CREATE TABLE {edgesTbl} (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))') + dbCur.execute(f'CREATE INDEX {edgesTbl}_child_idx ON {edgesTbl}(child)') + for name, node in nodeMap.items(): + dbCur.execute(f'INSERT INTO {nodesTbl} VALUES (?, ?, ?)', (name, node.id, node.tips)) for childName in node.children: pSupport = 1 if nodeMap[childName].pSupport else 0 - dbCur.execute(f"INSERT INTO {edgesTbl} VALUES (?, ?, ?)", (name, childName, pSupport)) + dbCur.execute(f'INSERT INTO {edgesTbl} VALUES (?, ?, ?)', (name, childName, pSupport)) -print(f"Finding root node") -query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" +print('Finding root node') +query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1' (rootName,) = dbCur.execute(query).fetchone() -print(f"Found \"{rootName}\"") +print(f'Found \'{rootName}\'') print('=== Getting picked-nodes ===') -pickedNames = set() +pickedNames: set[str] = set() pickedTreeExists = False -if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='nodes_p'").fetchone() == None: - print(f"Reading from {pickedNodesFile}") +if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="nodes_p"').fetchone() is None: + print(f'Reading from {pickedNodesFile}') with open(pickedNodesFile) as file: for line in file: name = line.rstrip() - row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone() - if row == None: - row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone() - if row != None: + row = dbCur.execute('SELECT name from nodes WHERE name = ?', (name,)).fetchone() + if row is None: + row = dbCur.execute('SELECT name from names WHERE alt_name = ?', (name,)).fetchone() + if row is not None: pickedNames.add(row[0]) - if len(pickedNames) == 0: - raise Exception("ERROR: No picked names found") + if not pickedNames: + raise Exception('ERROR: No picked names found') else: pickedTreeExists = True - print("Picked-node tree already exists") + print('Picked-node tree already exists') if tree == 'picked': sys.exit() - for (name,) in dbCur.execute("SELECT name FROM nodes_p"): + for (name,) in dbCur.execute('SELECT name FROM nodes_p'): pickedNames.add(name) -print(f"Found {len(pickedNames)} names") +print(f'Found {len(pickedNames)} names') -if (tree == 'picked' or tree == None) and not pickedTreeExists: - print("=== Generating picked-nodes tree ===") +if (tree == 'picked' or tree is None) and not pickedTreeExists: + print('=== Generating picked-nodes tree ===') genPickedNodeTree(dbCur, pickedNames, rootName) if tree != 'picked': - print("=== Finding 'non-low significance' nodes ===") - nodesWithImgOrPicked = set() - nodesWithImgDescOrPicked = set() - print("Finding nodes with descs") - for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc + print('=== Finding \'non-low significance\' nodes ===') + nodesWithImgOrPicked: set[str] = set() + nodesWithImgDescOrPicked: set[str] = set() + print('Finding nodes with descs') + for (name,) in dbCur.execute('SELECT name FROM wiki_ids'): # Can assume the wiki_id has a desc nodesWithImgDescOrPicked.add(name) - print("Finding nodes with images") - for (name,) in dbCur.execute("SELECT name FROM node_imgs"): + print('Finding nodes with images') + for (name,) in dbCur.execute('SELECT name FROM node_imgs'): nodesWithImgDescOrPicked.add(name) nodesWithImgOrPicked.add(name) - print("Adding picked nodes") + print('Adding picked nodes') for name in pickedNames: nodesWithImgDescOrPicked.add(name) nodesWithImgOrPicked.add(name) - if tree == 'images' or tree == None: - print("=== Generating images-only tree ===") + if tree == 'images' or tree is None: + print('=== Generating images-only tree ===') genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName) - if tree == 'trimmed' or tree == None: - print("=== Generating weakly-trimmed tree ===") + if tree == 'trimmed' or tree is None: + print('=== Generating weakly-trimmed tree ===') genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName) -print("Closing database") +print('Closing database') dbCon.commit() dbCon.close() |
