aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md22
-rwxr-xr-xbackend/data/genEolNameData.py3
-rwxr-xr-xbackend/data/genOtolData.py20
-rwxr-xr-xbackend/data/genReducedTreeData.py29
-rw-r--r--backend/data/reducedTol/names.txt2
5 files changed, 43 insertions, 33 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index e2b5db7..cb9cd42 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -3,8 +3,8 @@ File Generation Process
1 Tree Structure Data
1 Obtain data in otol/, as specified in it's README.
- 2 Run genOtolData.py, which creates data.db, and adds a 'nodes'
- table using data in otol/*.
+ 2 Run genOtolData.py, which creates data.db, and adds
+ 'nodes' and 'edges' tables using data in otol/*.
2 Name Data for Search
1 Obtain data in eol/, as specified in it's README.
2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
@@ -26,17 +26,19 @@ File Generation Process
2 Run genEnwikiData.py, which adds to the 'descs' table, using data in
enwiki/enwikiData.db, reducedTol/names.txt, and the 'nodes' table.
5 Reduced Tree Structure Data
- 1 Run genReducedTreeData.py, which adds a 'reduced_nodes' table to data.db,
- using reducedTol/names.txt, and the 'nodes' and 'names' tables.
+ 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
+ data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
data.db Tables
==============
-- nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT
-- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
-- eol\_ids: id INT PRIMARY KEY, name TEXT
-- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
-- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT
-- reduced\_nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT
+- nodes: name TEXT PRIMARY KEY, tips INT
+- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
+- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
+- eol\_ids: id INT PRIMARY KEY, name TEXT
+- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
+- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT
+- r\_nodes: name TEXT PRIMARY KEY, tips INT
+- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
Other Files
===========
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index 7f7e499..277f3a7 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -64,7 +64,10 @@ dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Create tables
dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, PRIMARY KEY(name, alt_name))")
+dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)")
+dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)")
dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
+dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)")
# Iterate through 'nodes' table, resolving to canonical-names
usedPids = set()
unresolvedNodeNames = set()
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 9298106..2ae154d 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -30,10 +30,6 @@ idToName = {} # Maps node IDs to names
nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs
-# Check for existing db
-if os.path.exists(dbFile):
- print("ERROR: Existing {} db".format(dbFile), file=sys.stderr)
- sys.exit(1)
# Parse treeFile
print("Parsing tree file")
data = None
@@ -210,14 +206,18 @@ for [id, node] in nodeMap.items():
if node["parent"] == None:
node["pSupport"] = True
# Create db
-print("Creating nodes table")
+print("Creating nodes and edges tables")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)")
+dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, tips INT)")
+dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)")
for node in nodeMap.values():
- childNames = [nodeMap[id]["name"] for id in node["children"]]
- parentName = "" if node["parent"] == None else nodeMap[node["parent"]]["name"]
- dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?, ?, ?)",
- (node["name"], json.dumps(childNames), parentName, node["tips"], 1 if node["pSupport"] else 0))
+ dbCur.execute("INSERT INTO nodes VALUES (?, ?)", (node["name"], node["tips"]))
+ childIds = node["children"]
+ for childId in childIds:
+ childNode = nodeMap[childId]
+ dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)",
+ (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0))
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py
index ed8fae9..508e751 100755
--- a/backend/data/genReducedTreeData.py
+++ b/backend/data/genReducedTreeData.py
@@ -47,14 +47,15 @@ for name in minimalNames:
prevName = None
while name != None:
if name not in nodeMap:
- (parent, tips, p_support) = dbCur.execute(
- "SELECT parent, tips, p_support from nodes WHERE name = ?", (name,)).fetchone()
- parent = None if parent == "" else parent
+ (tips,) = dbCur.execute("SELECT tips from nodes where name = ?", (name,)).fetchone()
+ row = dbCur.execute("SELECT node, p_support from edges where child = ?", (name,)).fetchone()
+ parent = None if row == None or row[0] == "" else row[0]
+ pSupport = 1 if row == None or row[1] == 1 else 0
nodeMap[name] = {
"children": [] if prevName == None else [prevName],
"parent": parent,
"tips": 0,
- "pSupport": p_support == 1,
+ "pSupport": pSupport,
}
prevName = name
name = parent
@@ -112,20 +113,21 @@ for (name, nodeObj) in nodeMap.items():
#
numChildren = len(nodeObj["children"])
if numChildren < PREF_NUM_CHILDREN:
- row = dbCur.execute("SELECT children from nodes WHERE name = ?", (name,)).fetchone()
- newChildren = [n for n in json.loads(row[0]) if
+ children = [row[0] for row in dbCur.execute("SELECT child FROM edges where node = ?", (name,))]
+ newChildren = [n for n in children if
not (n in nodeMap or n in namesToRemove) and
compNameRegex.fullmatch(n) == None]
newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)]
nodeObj["children"].extend(newChildNames)
namesToAdd.extend(newChildNames)
for name in namesToAdd:
- (parent, pSupport) = dbCur.execute("SELECT parent, p_support from nodes WHERE name = ?", (name,)).fetchone()
+ (parent, pSupport) = dbCur.execute("SELECT node, p_support from edges WHERE child = ?", (name,)).fetchone()
+ parent = None if parent == "" else parent
nodeMap[name] = {
"children": [],
"parent": parent,
"tips": 0,
- "pSupport": pSupport,
+ "pSupport": pSupport == 1,
}
print("New node set has {} nodes".format(len(nodeMap)))
# set tips vals
@@ -141,12 +143,15 @@ def setTips(nodeName):
setTips(rootName)
# Add new nodes to db
print("Adding to db")
-dbCur.execute(
- "CREATE TABLE reduced_nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)")
+dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, tips INT)")
+dbCur.execute("CREATE TABLE r_edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)")
for (name, nodeObj) in nodeMap.items():
parentName = "" if nodeObj["parent"] == None else nodeObj["parent"]
- dbCur.execute("INSERT INTO reduced_nodes VALUES (?, ?, ?, ?, ?)",
- (name, json.dumps(nodeObj["children"]), parentName, nodeObj["tips"], 1 if nodeObj["pSupport"] else 0))
+ dbCur.execute("INSERT INTO r_nodes VALUES (?, ?)", (name, nodeObj["tips"]))
+ for childName in nodeObj["children"]:
+ pSupport = 1 if nodeMap[childName]["pSupport"] else 0
+ dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport))
# Close db
dbCon.commit()
dbCon.close()
diff --git a/backend/data/reducedTol/names.txt b/backend/data/reducedTol/names.txt
index 1b6a5d8..6c6f5c1 100644
--- a/backend/data/reducedTol/names.txt
+++ b/backend/data/reducedTol/names.txt
@@ -489,7 +489,7 @@ chlamydosaurus
chondrichthyes
chondrocladia
chondrostei
-chordata
+chordate
chromalveolate
chrysanthemum
chrysididae