diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 5 | ||||
| -rwxr-xr-x | backend/data/genEolNameData.py | 1 | ||||
| -rwxr-xr-x | backend/data/genOtolData.py | 81 | ||||
| -rwxr-xr-x | backend/data/trimTree.py | 130 |
4 files changed, 136 insertions, 81 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 0bde721..87cf58d 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -55,13 +55,16 @@ File Generation Process 6 Other - Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table, using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. + - Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, + for the sake of performance and result-relevance. Without this, jumping to certain + nodes within the fungi and moths can take over a minute to render. data.db Tables ============== - nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT - edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) -- eol\_ids: id INT PRIMARY KEY, name TEXT - names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name) +- eol\_ids: id INT PRIMARY KEY, name TEXT - descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT - images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) - node\_imgs: id TEXT PRIMARY KEY, img\_id INT, src TEXT diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py index 74b0e3d..79e6f4f 100755 --- a/backend/data/genEolNameData.py +++ b/backend/data/genEolNameData.py @@ -77,6 +77,7 @@ dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() # Create tables dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))") +dbCur.execute("CREATE INDEX names_idx ON names(name)") dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)") dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)") dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)") diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py index 5a89270..cfb5bed 100755 --- a/backend/data/genOtolData.py +++ b/backend/data/genOtolData.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, os, re +import sys, re import json, sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" @@ -32,8 +32,6 @@ nodeMap = {} # Maps node IDs to node objects nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs pickedDupsFile = "genOtolDataPickedDups.txt" -softChildLimit = 100 -keptNamesFile = "genOtolNamesToKeep.txt" # Contains names to keep when doing node trimming # Parse treeFile print("Parsing tree file") @@ -141,83 +139,6 @@ def parseNewickName(): raise Exception(f"ERROR: invalid name \"{name}\"") return [match.group(1).replace("_", " "), match.group(2)] rootId = parseNewick() -# For nodes with *many* children, remove some of those children -print("Getting nodes for which to avoid trimming") -namesToKeep = set() -if os.path.exists(keptNamesFile): - with open(keptNamesFile) as file: # Contains names with an image, desc, or reduced-tree-presence - for line in file: - namesToKeep.add(line.rstrip()) -else: - print(f"WARNING: No '{keptNamesFile}' file found") -print(f"Read in {len(namesToKeep)} nodes") -keptAncestors = set() -for name in namesToKeep: - if name in nameToFirstId: - ids = [nameToFirstId[name]] if name not in dupNameToIds else dupNameToIds[name] - for id in ids: - parentId = nodeMap[id]["parent"] - while parentId != None: - parentObj = nodeMap[parentId] - keptAncestors.add(parentObj["name"]) - parentId = parentObj["parent"] -oldNamesToKeepSz = len(namesToKeep) -namesToKeep.update(keptAncestors) -print("Added {} ancestor nodes".format(len(namesToKeep) - oldNamesToKeepSz)) -print("Trimming nodes from tree") -def trimChildren(nodeId): - """ Traverse node tree, looking for nodes with too many children """ - nodeObj = nodeMap[nodeId] - tipsRemoved = 0 - if len(nodeObj["children"]) > softChildLimit: - childIds = nodeObj["children"] - # Look for children to delete, excluding 'kept nodes' - idsToKeep, otherIds = [], [] - for id in childIds: - if nodeMap[id]["name"] in namesToKeep: - idsToKeep.append(id) - else: - otherIds.append(id) - if len(idsToKeep) < softChildLimit: - # Order by decreasing number of tips, placing excess children in list - numMoreToKeep = softChildLimit - len(idsToKeep) - otherIds.sort(key = lambda id: nodeMap[id]["tips"], reverse=True) - idsToKeep.extend(otherIds[:numMoreToKeep]) - otherIds = otherIds[numMoreToKeep:] - # Perform deletion - nodeObj["children"] = idsToKeep - for id in otherIds: - tipsRemoved += deleteDownward(id) - # Recurse on children - for childId in nodeObj["children"]: - tipsRemoved += trimChildren(childId) - nodeObj["tips"] -= tipsRemoved - return tipsRemoved -def deleteDownward(nodeId): - """ Deletes a node and it's descendants from the node map, along with associated data """ - nodeObj = nodeMap[nodeId] - name = nodeObj["name"] - # Recurse on children - tipsRemoved = 0 - if len(nodeObj["children"]) == 0: - tipsRemoved = 1 - else: - for childId in nodeObj["children"]: - tipsRemoved += deleteDownward(childId) - # Delete from name maps - if name not in dupNameToIds: - del nameToFirstId[name] - else: - dupNameToIds[name].remove(nodeId) - if nameToFirstId[name] == nodeId: - nameToFirstId[name] = dupNameToIds[name][0] - if len(dupNameToIds[name]) == 1: - del dupNameToIds[name] - # Delete from node map - del nodeMap[nodeId] - # - return tipsRemoved -#trimChildren(rootId) # Resolve duplicate names print("Resolving duplicates") nameToPickedId = {} diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py new file mode 100755 index 0000000..1291c14 --- /dev/null +++ b/backend/data/trimTree.py @@ -0,0 +1,130 @@ +#!/usr/bin/python3 + +import sys +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Removes certain children from a tol-tree in an sqlite db.\n" +usageInfo += "Looks for nodes with an amount of children above a threshold,\n" +usageInfo += "and removes the excess, excluding those with 'significant'\n" +usageInfo += "associations, like those with descriptions and images.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbFile = "data.db" +softChildLimit = 100 + +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Get nodes that shouldn't be deleted, along with their ancestors +print("Finding nodes to keep") +nodesToKeep = set() +print("\tFinding nodes with descs") +for (name,) in dbCur.execute("SELECT name FROM descs"): + nodesToKeep.add(name) +print("\tFinding nodes with images") +for (name,) in dbCur.execute("SELECT name FROM nodes INNER JOIN node_imgs ON nodes.id = node_imgs.id"): + nodesToKeep.add(name) +print("\tFinding nodes in reduced-tree") +for (name,) in dbCur.execute("SELECT name from r_nodes"): + nodesToKeep.add(name) +print("\tFinding ancestors") +ancestors = set() +iterNum = 0 +for name in nodesToKeep: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"\tAt iteration {iterNum}") + # + while True: + row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (name,)).fetchone() + if row != None: + parent = row[0] + if parent not in nodesToKeep and parent not in ancestors: + ancestors.add(parent) + continue + break +nodesToKeep.update(ancestors) +print(f"Total of {len(nodesToKeep)} nodes to keep") +# Find root node +query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.node IS NULL LIMIT 1" +(rootName,) = dbCur.execute(query).fetchone() +print(f"Found root node {rootName}") +# Traverse tree, looking for trimmable nodes +print("Looking for trimmable nodes") +nodeToTipsChg = {} +nodesToDelete = set() +iterNum = 0 +def findTrimmables(nodeName): + global iterNum + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}") + # + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + tipsRemoved = 0 + if len(childNames) > softChildLimit: + # Look for children to delete, excluding 'kept nodes' + childrenToKeep, otherChildren = [], [] + for n in childNames: + if n in nodesToKeep: + childrenToKeep.append(n) + else: + otherChildren.append(n) + # Only trim beyond threshold + if len(childrenToKeep) < softChildLimit: + numMoreToKeep = softChildLimit - len(childrenToKeep) + # Prefer keeping nodes with more tips + childToTips = {} + query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(otherChildren))) + for (n, tips) in dbCur.execute(query, otherChildren): + childToTips[n] = tips + otherChildren.sort(key=lambda n: childToTips[n], reverse=True) + childrenToKeep.extend(otherChildren[:numMoreToKeep]) + otherChildren = otherChildren[numMoreToKeep:] + # 'Simulate' deletions + childNames = childrenToKeep + for n in otherChildren: + tipsRemoved += markForDeletion(n) + # Recurse on children + for n in childNames: + tipsRemoved += findTrimmables(n) + # Store info for updating num-tips later + nodeToTipsChg[nodeName] = tipsRemoved + return tipsRemoved +def markForDeletion(nodeName): + nodesToDelete.add(nodeName) + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + if len(childNames) == 0: + return 1 + else: + tipsRemoved = 0 + for n in childNames: + tipsRemoved += markForDeletion(n) + return tipsRemoved +findTrimmables(rootName) +# Delete trimmable nodes +print(f"Deleting {len(nodesToDelete)} nodes") +iterNum = 0 +for nodeName in nodesToDelete: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}") + # + dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,)) + dbCur.execute("DELETE FROM edges WHERE node = ?", (nodeName,)) + dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,)) + dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,)) + dbCur.execute("DELETE FROM eol_ids WHERE name = ?", (nodeName,)) +print(f"Updating num-tips for {len(nodeToTipsChg)} nodes") +iterNum = 0 +for (nodeName, tipsChg) in nodeToTipsChg.items(): + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"At iteration {iterNum}") + # + dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName)) +# Close db +dbCon.commit() +dbCon.close() |
