diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 22 | ||||
| -rwxr-xr-x | backend/data/genEolNameData.py | 3 | ||||
| -rwxr-xr-x | backend/data/genOtolData.py | 20 | ||||
| -rwxr-xr-x | backend/data/genReducedTreeData.py | 29 | ||||
| -rw-r--r-- | backend/data/reducedTol/names.txt | 2 |
5 files changed, 43 insertions, 33 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index e2b5db7..cb9cd42 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -3,8 +3,8 @@ File Generation Process 1 Tree Structure Data 1 Obtain data in otol/, as specified in it's README. - 2 Run genOtolData.py, which creates data.db, and adds a 'nodes' - table using data in otol/*. + 2 Run genOtolData.py, which creates data.db, and adds + 'nodes' and 'edges' tables using data in otol/*. 2 Name Data for Search 1 Obtain data in eol/, as specified in it's README. 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, @@ -26,17 +26,19 @@ File Generation Process 2 Run genEnwikiData.py, which adds to the 'descs' table, using data in enwiki/enwikiData.db, reducedTol/names.txt, and the 'nodes' table. 5 Reduced Tree Structure Data - 1 Run genReducedTreeData.py, which adds a 'reduced_nodes' table to data.db, - using reducedTol/names.txt, and the 'nodes' and 'names' tables. + 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to + data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. data.db Tables ============== -- nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT -- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) -- eol\_ids: id INT PRIMARY KEY, name TEXT -- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT -- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT -- reduced\_nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT +- nodes: name TEXT PRIMARY KEY, tips INT +- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) +- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) +- eol\_ids: id INT PRIMARY KEY, name TEXT +- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT +- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT +- r\_nodes: name TEXT PRIMARY KEY, tips INT +- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) Other Files =========== diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py index 7f7e499..277f3a7 100755 --- a/backend/data/genEolNameData.py +++ b/backend/data/genEolNameData.py @@ -64,7 +64,10 @@ dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() # Create tables dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, PRIMARY KEY(name, alt_name))") +dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)") +dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)") dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)") +dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)") # Iterate through 'nodes' table, resolving to canonical-names usedPids = set() unresolvedNodeNames = set() diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py index 9298106..2ae154d 100755 --- a/backend/data/genOtolData.py +++ b/backend/data/genOtolData.py @@ -30,10 +30,6 @@ idToName = {} # Maps node IDs to names nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs -# Check for existing db -if os.path.exists(dbFile): - print("ERROR: Existing {} db".format(dbFile), file=sys.stderr) - sys.exit(1) # Parse treeFile print("Parsing tree file") data = None @@ -210,14 +206,18 @@ for [id, node] in nodeMap.items(): if node["parent"] == None: node["pSupport"] = True # Create db -print("Creating nodes table") +print("Creating nodes and edges tables") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)") +dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, tips INT)") +dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") +dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)") for node in nodeMap.values(): - childNames = [nodeMap[id]["name"] for id in node["children"]] - parentName = "" if node["parent"] == None else nodeMap[node["parent"]]["name"] - dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?, ?, ?)", - (node["name"], json.dumps(childNames), parentName, node["tips"], 1 if node["pSupport"] else 0)) + dbCur.execute("INSERT INTO nodes VALUES (?, ?)", (node["name"], node["tips"])) + childIds = node["children"] + for childId in childIds: + childNode = nodeMap[childId] + dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)", + (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0)) dbCon.commit() dbCon.close() diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py index ed8fae9..508e751 100755 --- a/backend/data/genReducedTreeData.py +++ b/backend/data/genReducedTreeData.py @@ -47,14 +47,15 @@ for name in minimalNames: prevName = None while name != None: if name not in nodeMap: - (parent, tips, p_support) = dbCur.execute( - "SELECT parent, tips, p_support from nodes WHERE name = ?", (name,)).fetchone() - parent = None if parent == "" else parent + (tips,) = dbCur.execute("SELECT tips from nodes where name = ?", (name,)).fetchone() + row = dbCur.execute("SELECT node, p_support from edges where child = ?", (name,)).fetchone() + parent = None if row == None or row[0] == "" else row[0] + pSupport = 1 if row == None or row[1] == 1 else 0 nodeMap[name] = { "children": [] if prevName == None else [prevName], "parent": parent, "tips": 0, - "pSupport": p_support == 1, + "pSupport": pSupport, } prevName = name name = parent @@ -112,20 +113,21 @@ for (name, nodeObj) in nodeMap.items(): # numChildren = len(nodeObj["children"]) if numChildren < PREF_NUM_CHILDREN: - row = dbCur.execute("SELECT children from nodes WHERE name = ?", (name,)).fetchone() - newChildren = [n for n in json.loads(row[0]) if + children = [row[0] for row in dbCur.execute("SELECT child FROM edges where node = ?", (name,))] + newChildren = [n for n in children if not (n in nodeMap or n in namesToRemove) and compNameRegex.fullmatch(n) == None] newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)] nodeObj["children"].extend(newChildNames) namesToAdd.extend(newChildNames) for name in namesToAdd: - (parent, pSupport) = dbCur.execute("SELECT parent, p_support from nodes WHERE name = ?", (name,)).fetchone() + (parent, pSupport) = dbCur.execute("SELECT node, p_support from edges WHERE child = ?", (name,)).fetchone() + parent = None if parent == "" else parent nodeMap[name] = { "children": [], "parent": parent, "tips": 0, - "pSupport": pSupport, + "pSupport": pSupport == 1, } print("New node set has {} nodes".format(len(nodeMap))) # set tips vals @@ -141,12 +143,15 @@ def setTips(nodeName): setTips(rootName) # Add new nodes to db print("Adding to db") -dbCur.execute( - "CREATE TABLE reduced_nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)") +dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, tips INT)") +dbCur.execute("CREATE TABLE r_edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") +dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)") for (name, nodeObj) in nodeMap.items(): parentName = "" if nodeObj["parent"] == None else nodeObj["parent"] - dbCur.execute("INSERT INTO reduced_nodes VALUES (?, ?, ?, ?, ?)", - (name, json.dumps(nodeObj["children"]), parentName, nodeObj["tips"], 1 if nodeObj["pSupport"] else 0)) + dbCur.execute("INSERT INTO r_nodes VALUES (?, ?)", (name, nodeObj["tips"])) + for childName in nodeObj["children"]: + pSupport = 1 if nodeMap[childName]["pSupport"] else 0 + dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport)) # Close db dbCon.commit() dbCon.close() diff --git a/backend/data/reducedTol/names.txt b/backend/data/reducedTol/names.txt index 1b6a5d8..6c6f5c1 100644 --- a/backend/data/reducedTol/names.txt +++ b/backend/data/reducedTol/names.txt @@ -489,7 +489,7 @@ chlamydosaurus chondrichthyes chondrocladia chondrostei -chordata +chordate chromalveolate chrysanthemum chrysididae |
