Add reduced-tree data generation+serving+querying+setting

Add genReducedTreeData.py, which generates a reduced_nodes table. Adjust server to serve that data for queries with a tree=reduced query param. Adjust client to query for that data depending on a useReducedTree variable. Add a SettingsPane setting to change that useReducedTree variable.
author: Terry Truong <terry06890@gmail.com> 2022-05-12 00:10:12 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-05-12 00:10:12 +1000
commit: e1ef2bf3387769de4edc4a7ec1a6d38c5a21c5e7 (patch)
tree: d2a8ee2f6e36cbbc723de774965c9a001b746b0d /backend/data
parent: 4872ce9c22cc3c7024075f66409efdaf8860e9b8 (diff)
3 files changed, 162 insertions, 1 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 27619de..c4c46ba 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -7,7 +7,7 @@ File Generation Process
         table using data in otol/*.
 2   Name Data for Search
     1   Obtain data in eol/, as specified in it's README.
-    2   Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, 
+    2   Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
         using data in eol/vernacularNames.csv and the 'nodes' table.
 3   Image Data
     1   Use downloadImgsForReview.py to download EOL images into imgsForReview/.
@@ -20,6 +20,9 @@ File Generation Process
     1   Obtain data in enwiki/, as specified in it's README.
     2   Run genEnwikiData.py, which adds a 'descs' table to data.db,
         using data in enwiki/enwikiData.db, and the 'nodes' table.
+5   Reduced Tree Structure Data
+    1   Run genReducedTreeData.py, which adds a 'reduced\_nodes' table to data.db,
+        using reducedTol/names.txt, and the 'nodes' and 'names' tables.
 
 data.db tables
 ==============
@@ -33,3 +36,5 @@ data.db tables
     eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
 -   descs <br>
     name TEXT PRIMARY KEY, desc TEXT, redirected INT
+-   reduced\_nodes <br>
+    name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT
diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py
new file mode 100755
index 0000000..ed8fae9
--- /dev/null
+++ b/backend/data/genReducedTreeData.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python3
+
+import sys, os.path, re
+import json, sqlite3
+
+usageInfo =  f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads \n"
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+dbFile = "data.db" # SQLite database that is both read from and written to
+nodeNamesFile = "reducedTol/names.txt" # Each line is looked up as an alt_name in the 'names' table
+minimalNames = set() # Node names found for the lines of nodeNamesFile
+nodeMap = {} # Maps node names to node objects (dicts with 'children', 'parent', 'tips', 'pSupport')
+PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit
+compNameRegex = re.compile(r"\[.+ \+ .+]") # Matches composite names of the form '[X + Y]'
+
+# Connect to db
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Read in minimal set of node names
+print("Getting minimal name set")
+iterNum = 0 # Only used for progress output
+with open(nodeNamesFile) as file:
+	for line in file:
+		iterNum += 1
+		if iterNum % 100 == 0:
+			print("Iteration {}".format(iterNum))
+		# Find a node name that has this line (minus trailing whitespace) as an alt_name
+		row = dbCur.execute("SELECT name, alt_name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone()
+		if row != None: # Lines without a match are silently skipped
+			minimalNames.add(row[0])
+if len(minimalNames) == 0: # Nothing to build a tree from
+	print("ERROR: No names found", file=sys.stderr)
+	sys.exit(1)
+print("Name set has {} names".format(len(minimalNames)))
+# Add nodes that connect up to root
+print("Getting connected nodes set")
+iterNum = 0 # Only used for progress output
+rootName = None # Set when a walk runs past a node that has no parent
+for name in minimalNames:
+	iterNum += 1
+	if iterNum % 100 == 0:
+		print("Iteration {}".format(iterNum))
+	# Walk up the parent chain, adding nodes until an already-added node or the top is reached
+	prevName = None # The node visited just before 'name' on this walk (a child of 'name')
+	while name != None:
+		if name not in nodeMap:
+			(parent, tips, p_support) = dbCur.execute(
+				"SELECT parent, tips, p_support from nodes WHERE name = ?", (name,)).fetchone() # NOTE(review): a name with no 'nodes' row would make this unpacking fail - confirm that cannot happen
+			parent = None if parent == "" else parent # An empty parent string is treated as 'no parent'
+			nodeMap[name] = {
+				"children": [] if prevName == None else [prevName],
+				"parent": parent,
+				"tips": 0, # Placeholder; the 'tips' value read from the db is not used
+				"pSupport": p_support == 1, # Kept as a bool
+			}
+			prevName = name
+			name = parent
+		else: # Node already added, so just link in the node below it and stop walking
+			if prevName != None:
+				nodeMap[name]["children"].append(prevName)
+			break
+	if name == None: # The walk ended at a parentless node, which is now in prevName
+		rootName = prevName
+print("New node set has {} nodes".format(len(nodeMap)))
+# Remove certain 'chain collapsible' nodes
+print("Removing 'chain collapsible' nodes")
+namesToRemove = set()
+for (name, nodeObj) in nodeMap.items():
+	if name not in minimalNames and len(nodeObj["children"]) == 1:
+		parentName = nodeObj["parent"]
+		childName = nodeObj["children"][0]
+		# Connect parent and child
+		nodeMap[parentName]["children"].remove(name)
+		nodeMap[parentName]["children"].append(childName)
+		nodeMap[childName]["parent"] = parentName
+		# Adjust child pSupport
+		nodeMap[childName]["pSupport"] &= nodeObj["pSupport"]
+		# Remember for removal
+		namesToRemove.add(name)
+for name in namesToRemove:
+	del nodeMap[name]
+print("New node set has {} nodes".format(len(nodeMap)))
+# Merge-upward composite-named nodes (their children are handed to their parent)
+print("Merging-upward composite-named nodes")
+namesToRemove2 = set() # Deletion is deferred until after the loop over nodeMap
+for (name, nodeObj) in nodeMap.items():
+	parent = nodeObj["parent"]
+	if parent != None and compNameRegex.fullmatch(name) != None: # A parentless node is never merged
+		# Connect children to parent
+		nodeMap[parent]["children"].remove(name)
+		nodeMap[parent]["children"].extend(nodeObj["children"])
+		for n in nodeObj["children"]:
+			nodeMap[n]["parent"] = parent
+			nodeMap[n]["pSupport"] &= nodeObj["pSupport"] # Stays true only if the merged node's was also true
+		# Remember for removal
+		namesToRemove2.add(name)
+for name in namesToRemove2:
+	del nodeMap[name]
+	namesToRemove.add(name) # Also recorded here, so the next step doesn't add these nodes back
+print("New node set has {} nodes".format(len(nodeMap)))
+# Add some connected children (tops up nodes that have fewer than PREF_NUM_CHILDREN children)
+print("Adding additional nearby children")
+namesToAdd = [] # Insertion is deferred until after the loop over nodeMap
+iterNum = 0 # Only used for progress output
+for (name, nodeObj) in nodeMap.items():
+	iterNum += 1
+	if iterNum % 100 == 0:
+		print("Iteration {}".format(iterNum))
+	# Look for children in the 'nodes' table that are missing from the reduced tree
+	numChildren = len(nodeObj["children"])
+	if numChildren < PREF_NUM_CHILDREN:
+		row = dbCur.execute("SELECT children from nodes WHERE name = ?", (name,)).fetchone()
+		newChildren = [n for n in json.loads(row[0]) if # The 'children' column holds a JSON list of names
+			not (n in nodeMap or n in namesToRemove) and # Skip nodes already present or deliberately removed
+			compNameRegex.fullmatch(n) == None] # Skip composite-named nodes
+		newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)] # Take only enough to reach the limit
+		nodeObj["children"].extend(newChildNames)
+		namesToAdd.extend(newChildNames)
+for name in namesToAdd: # The added nodes get no children of their own
+	(parent, pSupport) = dbCur.execute("SELECT parent, p_support from nodes WHERE name = ?", (name,)).fetchone()
+	nodeMap[name] = {
+		"children": [],
+		"parent": parent,
+		"tips": 0, # Placeholder
+		"pSupport": pSupport, # Holds the raw p_support value here, not a bool as for the earlier nodes
+	}
+print("New node set has {} nodes".format(len(nodeMap)))
+# Set tips vals (recomputed for the reduced tree, by counting its childless nodes)
+print("Setting tips vals")
+def setTips(nodeName): # Recursively sets 'tips' within nodeName's subtree, and returns nodeName's tips value
+	nodeObj = nodeMap[nodeName]
+	if len(nodeObj["children"]) == 0: # A childless node counts as one tip
+		nodeObj["tips"] = 1
+		return 1
+	tips = sum([setTips(childName) for childName in nodeObj["children"]]) # Otherwise, add up the children's tips
+	nodeObj["tips"] = tips
+	return tips
+setTips(rootName) # Start from the parentless node found while connecting nodes
+# Add new nodes to db (the CREATE TABLE has no IF NOT EXISTS, so an existing reduced_nodes table causes an error)
+print("Adding to db")
+dbCur.execute(
+	"CREATE TABLE reduced_nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)")
+for (name, nodeObj) in nodeMap.items():
+	parentName = "" if nodeObj["parent"] == None else nodeObj["parent"] # A missing parent is stored as an empty string
+	dbCur.execute("INSERT INTO reduced_nodes VALUES (?, ?, ?, ?, ?)", # Children are stored as JSON text, and pSupport as 1 or 0
+		(name, json.dumps(nodeObj["children"]), parentName, nodeObj["tips"], 1 if nodeObj["pSupport"] else 0))
+# Close db (committing first, so the new table and rows are saved)
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/reducedTol/README.md b/backend/data/reducedTol/README.md
new file mode 100644
index 0000000..103bffc
--- /dev/null
+++ b/backend/data/reducedTol/README.md
@@ -0,0 +1,4 @@
+Files
+=====
+-   names.txt <br>
+	Contains names of nodes to be kept in a reduced Tree of Life.
author	Terry Truong <terry06890@gmail.com>	2022-05-12 00:10:12 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-05-12 00:10:12 +1000
commit	e1ef2bf3387769de4edc4a7ec1a6d38c5a21c5e7 (patch)
tree	d2a8ee2f6e36cbbc723de774965c9a001b746b0d /backend/data
parent	4872ce9c22cc3c7024075f66409efdaf8860e9b8 (diff)