From e1ef2bf3387769de4edc4a7ec1a6d38c5a21c5e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Thu, 12 May 2022 00:10:12 +1000 Subject: Add reduced-tree data generation+serving+querying+setting Add genReducedTreeData.py, which generates a reduced_nodes table. Adjust server to serve that data for queries with a tree=reduced query param. Adjust client to query for that data depending on a useReducedTree variable. Add a SettingsPane setting to change that useReducedTree variable. --- backend/data/README.md | 7 +- backend/data/genReducedTreeData.py | 152 +++++++++++++++++++++++++++++++++++++ backend/data/reducedTol/README.md | 4 + 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100755 backend/data/genReducedTreeData.py create mode 100644 backend/data/reducedTol/README.md (limited to 'backend/data') diff --git a/backend/data/README.md b/backend/data/README.md index 27619de..c4c46ba 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -7,7 +7,7 @@ File Generation Process table using data in otol/*. 2 Name Data for Search 1 Obtain data in eol/, as specified in it's README. - 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, + 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, using data in eol/vernacularNames.csv and the 'nodes' table. 3 Image Data 1 Use downloadImgsForReview.py to download EOL images into imgsForReview/. @@ -20,6 +20,9 @@ File Generation Process 1 Obtain data in enwiki/, as specified in it's README. 2 Run genEnwikiData.py, which adds a 'descs' table to data.db, using data in enwiki/enwikiData.db, and the 'nodes' table. +5 Reduced Tree Structure Data + 1 Run genReducedTreeData.py, which adds a 'reduced_nodes' table to data.db, + using reducedTol/names.txt, and the 'nodes' and 'names' tables. data.db tables ============== @@ -33,3 +36,5 @@ data.db tables eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT - descs
name TEXT PRIMARY KEY, desc TEXT, redirected INT +- reduced\_nodes
+ name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py new file mode 100755 index 0000000..ed8fae9 --- /dev/null +++ b/backend/data/genReducedTreeData.py @@ -0,0 +1,152 @@ +#!/usr/bin/python3 + +import sys, os.path, re +import json, sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads \n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbFile = "data.db" +nodeNamesFile = "reducedTol/names.txt" +minimalNames = set() +nodeMap = {} # Maps node names to node objects +PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit +compNameRegex = re.compile(r"\[.+ \+ .+]") + +# Connect to db +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Read in minimal set of node names +print("Getting minimal name set") +iterNum = 0 +with open(nodeNamesFile) as file: + for line in file: + iterNum += 1 + if iterNum % 100 == 0: + print("Iteration {}".format(iterNum)) + # + row = dbCur.execute("SELECT name, alt_name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone() + if row != None: + minimalNames.add(row[0]) +if len(minimalNames) == 0: + print("ERROR: No names found", file=sys.stderr) + sys.exit(1) +print("Name set has {} names".format(len(minimalNames))) +# Add nodes that connect up to root +print("Getting connected nodes set") +iterNum = 0 +rootName = None +for name in minimalNames: + iterNum += 1 + if iterNum % 100 == 0: + print("Iteration {}".format(iterNum)) + # + prevName = None + while name != None: + if name not in nodeMap: + (parent, tips, p_support) = dbCur.execute( + "SELECT parent, tips, p_support from nodes WHERE name = ?", (name,)).fetchone() + parent = None if parent == "" else parent + nodeMap[name] = { + "children": [] if prevName == None else [prevName], + "parent": parent, + "tips": 0, + "pSupport": p_support == 1, + } + prevName = name + name = parent + else: + if prevName != None: + 
nodeMap[name]["children"].append(prevName) + break + if name == None: + rootName = prevName +print("New node set has {} nodes".format(len(nodeMap))) +# Remove certain 'chain collapsible' nodes +print("Removing 'chain collapsible' nodes") +namesToRemove = set() +for (name, nodeObj) in nodeMap.items(): + if name not in minimalNames and len(nodeObj["children"]) == 1: + parentName = nodeObj["parent"] + childName = nodeObj["children"][0] + # Connect parent and child + nodeMap[parentName]["children"].remove(name) + nodeMap[parentName]["children"].append(childName) + nodeMap[childName]["parent"] = parentName + # Adjust child pSupport + nodeMap[childName]["pSupport"] &= nodeObj["pSupport"] + # Remember for removal + namesToRemove.add(name) +for name in namesToRemove: + del nodeMap[name] +print("New node set has {} nodes".format(len(nodeMap))) +# Merge-upward composite-named nodes +print("Merging-upward composite-named nodes") +namesToRemove2 = set() +for (name, nodeObj) in nodeMap.items(): + parent = nodeObj["parent"] + if parent != None and compNameRegex.fullmatch(name) != None: + # Connect children to parent + nodeMap[parent]["children"].remove(name) + nodeMap[parent]["children"].extend(nodeObj["children"]) + for n in nodeObj["children"]: + nodeMap[n]["parent"] = parent + nodeMap[n]["pSupport"] &= nodeObj["pSupport"] + # Remember for removal + namesToRemove2.add(name) +for name in namesToRemove2: + del nodeMap[name] + namesToRemove.add(name) +print("New node set has {} nodes".format(len(nodeMap))) +# Add some connected children +print("Adding additional nearby children") +namesToAdd = [] +iterNum = 0 +for (name, nodeObj) in nodeMap.items(): + iterNum += 1 + if iterNum % 100 == 0: + print("Iteration {}".format(iterNum)) + # + numChildren = len(nodeObj["children"]) + if numChildren < PREF_NUM_CHILDREN: + row = dbCur.execute("SELECT children from nodes WHERE name = ?", (name,)).fetchone() + newChildren = [n for n in json.loads(row[0]) if + not (n in nodeMap or n in 
namesToRemove) and + compNameRegex.fullmatch(n) == None] + newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)] + nodeObj["children"].extend(newChildNames) + namesToAdd.extend(newChildNames) +for name in namesToAdd: + (parent, pSupport) = dbCur.execute("SELECT parent, p_support from nodes WHERE name = ?", (name,)).fetchone() + nodeMap[name] = { + "children": [], + "parent": parent, + "tips": 0, + "pSupport": pSupport, + } +print("New node set has {} nodes".format(len(nodeMap))) +# set tips vals +print("Setting tips vals") +def setTips(nodeName): + nodeObj = nodeMap[nodeName] + if len(nodeObj["children"]) == 0: + nodeObj["tips"] = 1 + return 1 + tips = sum([setTips(childName) for childName in nodeObj["children"]]) + nodeObj["tips"] = tips + return tips +setTips(rootName) +# Add new nodes to db +print("Adding to db") +dbCur.execute( + "CREATE TABLE reduced_nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)") +for (name, nodeObj) in nodeMap.items(): + parentName = "" if nodeObj["parent"] == None else nodeObj["parent"] + dbCur.execute("INSERT INTO reduced_nodes VALUES (?, ?, ?, ?, ?)", + (name, json.dumps(nodeObj["children"]), parentName, nodeObj["tips"], 1 if nodeObj["pSupport"] else 0)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/reducedTol/README.md b/backend/data/reducedTol/README.md new file mode 100644 index 0000000..103bffc --- /dev/null +++ b/backend/data/reducedTol/README.md @@ -0,0 +1,4 @@ +Files +===== +- names.txt
+ Contains names of nodes to be kept in a reduced Tree of Life. -- cgit v1.2.3