about | summary | refs | log | tree | commit | diff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r-- backend/data/README.md 7
-rwxr-xr-x backend/data/genReducedTreeData.py 152
-rw-r--r-- backend/data/reducedTol/README.md 4
-rwxr-xr-x backend/server.py 36
4 files changed, 185 insertions, 14 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 27619de..c4c46ba 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -7,7 +7,7 @@ File Generation Process
table using data in otol/*.
2 Name Data for Search
1 Obtain data in eol/, as specified in it's README.
- 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
+ 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
using data in eol/vernacularNames.csv and the 'nodes' table.
3 Image Data
1 Use downloadImgsForReview.py to download EOL images into imgsForReview/.
@@ -20,6 +20,9 @@ File Generation Process
1 Obtain data in enwiki/, as specified in it's README.
2 Run genEnwikiData.py, which adds a 'descs' table to data.db,
using data in enwiki/enwikiData.db, and the 'nodes' table.
+5 Reduced Tree Structure Data
+ 1 Run genReducedTreeData.py, which adds a 'reduced_nodes' table to data.db,
+ using reducedTol/names.txt, and the 'nodes' and 'names' tables.
data.db tables
==============
@@ -33,3 +36,5 @@ data.db tables
eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
- descs <br>
name TEXT PRIMARY KEY, desc TEXT, redirected INT
+- reduced\_nodes <br>
+ name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT
diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py
new file mode 100755
index 0000000..ed8fae9
--- /dev/null
+++ b/backend/data/genReducedTreeData.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python3
+
+import sys, os.path, re
+import json, sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads \n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dbFile = "data.db"
+nodeNamesFile = "reducedTol/names.txt"
+minimalNames = set()
+nodeMap = {} # Maps node names to node objects
+PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit
+compNameRegex = re.compile(r"\[.+ \+ .+]")
+
+# Connect to db
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Read in minimal set of node names
+print("Getting minimal name set")
+iterNum = 0
+with open(nodeNamesFile) as file:
+ for line in file:
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print("Iteration {}".format(iterNum))
+ #
+ row = dbCur.execute("SELECT name, alt_name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone()
+ if row != None:
+ minimalNames.add(row[0])
+if len(minimalNames) == 0:
+ print("ERROR: No names found", file=sys.stderr)
+ sys.exit(1)
+print("Name set has {} names".format(len(minimalNames)))
+# Add nodes that connect up to root
+print("Getting connected nodes set")
+iterNum = 0
+rootName = None
+for name in minimalNames:
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print("Iteration {}".format(iterNum))
+ #
+ prevName = None
+ while name != None:
+ if name not in nodeMap:
+ (parent, tips, p_support) = dbCur.execute(
+ "SELECT parent, tips, p_support from nodes WHERE name = ?", (name,)).fetchone()
+ parent = None if parent == "" else parent
+ nodeMap[name] = {
+ "children": [] if prevName == None else [prevName],
+ "parent": parent,
+ "tips": 0,
+ "pSupport": p_support == 1,
+ }
+ prevName = name
+ name = parent
+ else:
+ if prevName != None:
+ nodeMap[name]["children"].append(prevName)
+ break
+ if name == None:
+ rootName = prevName
+print("New node set has {} nodes".format(len(nodeMap)))
+# Remove certain 'chain collapsible' nodes
+print("Removing 'chain collapsible' nodes")
+namesToRemove = set()
+for (name, nodeObj) in nodeMap.items():
+ if name not in minimalNames and len(nodeObj["children"]) == 1:
+ parentName = nodeObj["parent"]
+ childName = nodeObj["children"][0]
+ # Connect parent and child
+ nodeMap[parentName]["children"].remove(name)
+ nodeMap[parentName]["children"].append(childName)
+ nodeMap[childName]["parent"] = parentName
+ # Adjust child pSupport
+ nodeMap[childName]["pSupport"] &= nodeObj["pSupport"]
+ # Remember for removal
+ namesToRemove.add(name)
+for name in namesToRemove:
+ del nodeMap[name]
+print("New node set has {} nodes".format(len(nodeMap)))
+# Merge-upward compsite-named nodes
+print("Merging-upward composite-named nodes")
+namesToRemove2 = set()
+for (name, nodeObj) in nodeMap.items():
+ parent = nodeObj["parent"]
+ if parent != None and compNameRegex.fullmatch(name) != None:
+ # Connect children to parent
+ nodeMap[parent]["children"].remove(name)
+ nodeMap[parent]["children"].extend(nodeObj["children"])
+ for n in nodeObj["children"]:
+ nodeMap[n]["parent"] = parent
+ nodeMap[n]["pSupport"] &= nodeObj["pSupport"]
+ # Remember for removal
+ namesToRemove2.add(name)
+for name in namesToRemove2:
+ del nodeMap[name]
+ namesToRemove.add(name)
+print("New node set has {} nodes".format(len(nodeMap)))
+# Add some connected children
+print("Adding additional nearby children")
+namesToAdd = []
+iterNum = 0
+for (name, nodeObj) in nodeMap.items():
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print("Iteration {}".format(iterNum))
+ #
+ numChildren = len(nodeObj["children"])
+ if numChildren < PREF_NUM_CHILDREN:
+ row = dbCur.execute("SELECT children from nodes WHERE name = ?", (name,)).fetchone()
+ newChildren = [n for n in json.loads(row[0]) if
+ not (n in nodeMap or n in namesToRemove) and
+ compNameRegex.fullmatch(n) == None]
+ newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)]
+ nodeObj["children"].extend(newChildNames)
+ namesToAdd.extend(newChildNames)
+for name in namesToAdd:
+ (parent, pSupport) = dbCur.execute("SELECT parent, p_support from nodes WHERE name = ?", (name,)).fetchone()
+ nodeMap[name] = {
+ "children": [],
+ "parent": parent,
+ "tips": 0,
+ "pSupport": pSupport,
+ }
+print("New node set has {} nodes".format(len(nodeMap)))
+# set tips vals
+print("Setting tips vals")
+def setTips(nodeName):
+ nodeObj = nodeMap[nodeName]
+ if len(nodeObj["children"]) == 0:
+ nodeObj["tips"] = 1
+ return 1
+ tips = sum([setTips(childName) for childName in nodeObj["children"]])
+ nodeObj["tips"] = tips
+ return tips
+setTips(rootName)
+# Add new nodes to db
+print("Adding to db")
+dbCur.execute(
+ "CREATE TABLE reduced_nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)")
+for (name, nodeObj) in nodeMap.items():
+ parentName = "" if nodeObj["parent"] == None else nodeObj["parent"]
+ dbCur.execute("INSERT INTO reduced_nodes VALUES (?, ?, ?, ?, ?)",
+ (name, json.dumps(nodeObj["children"]), parentName, nodeObj["tips"], 1 if nodeObj["pSupport"] else 0))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/reducedTol/README.md b/backend/data/reducedTol/README.md
new file mode 100644
index 0000000..103bffc
--- /dev/null
+++ b/backend/data/reducedTol/README.md
@@ -0,0 +1,4 @@
+Files
+=====
+- names.txt <br>
+ Contains names of nodes to be kept in a reduced Tree of Life.
diff --git a/backend/server.py b/backend/server.py
index 9c9764b..374fb53 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -14,6 +14,7 @@ SEARCH_SUGG_LIMIT = 5
usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Starts a server that listens for GET requests to http://" + hostname + ":" + str(port) + ".\n"
usageInfo += "Responds to path+query /data/type1?name=name1 with JSON data.\n"
+usageInfo += "An additional query parameter tree=reduced is usable to get reduced-tree data\n"
usageInfo += "\n"
usageInfo += "If type1 is 'node': Responds with map from names to objects representing node name1 and it's children.\n"
usageInfo += "If type1 is 'chain': Like 'node', but gets nodes from name1 up to the root, and their direct children.\n"
@@ -25,12 +26,13 @@ if len(sys.argv) > 1:
# Connect to db, and load spellfix extension
dbCon = sqlite3.connect(dbFile)
# Some functions
-def lookupNodes(names):
+def lookupNodes(names, useReducedTree):
nodeObjs = {}
cur = dbCon.cursor()
# Get node info
- query = "SELECT name, children, parent, tips, p_support FROM nodes WHERE" \
- " name IN ({})".format(",".join(["?"] * len(names)))
+ nodesTable = "nodes" if not useReducedTree else "reduced_nodes"
+ query = "SELECT name, children, parent, tips, p_support FROM {} WHERE" \
+ " name IN ({})".format(nodesTable, ",".join(["?"] * len(names)))
namesForImgs = []
firstSubnames = {}
secondSubnames = {}
@@ -89,13 +91,19 @@ def getNodeImg(name):
if os.path.exists(imgDir + filename):
return filename
return None
-def lookupName(name):
+def lookupName(name, useReducedTree):
cur = dbCon.cursor()
results = []
hasMore = False
- for row in cur.execute(
- "SELECT DISTINCT name, alt_name FROM names WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?",
- (name + "%", SEARCH_SUGG_LIMIT)):
+ query = None
+ if not useReducedTree:
+ query = "SELECT DISTINCT name, alt_name FROM names" \
+ " WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?"
+ else:
+ query = "SELECT DISTINCT names.name, alt_name FROM" \
+ " names INNER JOIN reduced_nodes ON names.name = reduced_nodes.name" \
+ " WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?"
+ for row in cur.execute(query, (name + "%", SEARCH_SUGG_LIMIT)):
results.append({"name": row[0], "altName": row[1]})
if len(results) > SEARCH_SUGG_LIMIT:
hasMore = True
@@ -124,15 +132,17 @@ class DbServer(BaseHTTPRequestHandler):
queryDict = urllib.parse.parse_qs(urlParts.query)
# Check first element of path
match = re.match(r"/([^/]+)/(.+)", path)
- if match != None and match.group(1) == "data" and "name" in queryDict:
+ if match != None and match.group(1) == "data" and "name" in queryDict and \
+ ("tree" not in queryDict or queryDict["tree"][0] == "reduced"):
reqType = match.group(2)
name = queryDict["name"][0]
+ useReducedTree = "tree" in queryDict
# Check query string
if reqType == "node":
- nodeObjs = lookupNodes([name])
+ nodeObjs = lookupNodes([name], useReducedTree)
if len(nodeObjs) > 0:
nodeObj = nodeObjs[name]
- childNodeObjs = lookupNodes(nodeObj["children"])
+ childNodeObjs = lookupNodes(nodeObj["children"], useReducedTree)
childNodeObjs[name] = nodeObj
self.respondJson(childNodeObjs)
return
@@ -141,7 +151,7 @@ class DbServer(BaseHTTPRequestHandler):
ranOnce = False
while True:
# Get node
- nodeObjs = lookupNodes([name])
+ nodeObjs = lookupNodes([name], useReducedTree)
if len(nodeObjs) == 0:
if not ranOnce:
self.respondJson(results)
@@ -158,7 +168,7 @@ class DbServer(BaseHTTPRequestHandler):
for childName in nodeObj["children"]:
if childName not in results:
childNamesToAdd.append(childName)
- childNodeObjs = lookupNodes(childNamesToAdd)
+ childNodeObjs = lookupNodes(childNamesToAdd, useReducedTree)
results.update(childNodeObjs)
# Check if root
if nodeObj["parent"] == None:
@@ -167,7 +177,7 @@ class DbServer(BaseHTTPRequestHandler):
else:
name = nodeObj["parent"]
elif reqType == "search":
- self.respondJson(lookupName(name))
+ self.respondJson(lookupName(name, useReducedTree))
return
elif reqType == "info":
self.respondJson(lookupNodeInfo(name))