aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-05-05 20:57:20 +1000
committer Terry Truong <terry06890@gmail.com> 2022-05-05 21:03:31 +1000
commit 58ee74ad06f00f9043a15d9f1ac855ddc61fe3b5 (patch)
tree b83b66f592c7145985db341dfd13009476c6a6d5 /backend
parent 50b10310e4104327a3577d347b1e4ce0a12c7c20 (diff)
Make '... [N]' duplicate node name assignment prefer nodes with fewer tips
Adjust genOtolData.py to use int ids instead of names, reducing memory usage. Small fix for server.py commented-out substring-search code.
Diffstat (limited to 'backend')
-rwxr-xr-x backend/data/genOtolData.py 177
-rwxr-xr-x backend/server.py 2
2 files changed, 90 insertions, 89 deletions
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 57a15d2..7dfac54 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -25,15 +25,17 @@ if len(sys.argv) > 1:
treeFile = "otol/labelled_supertree_ottnames.tre"
annFile = "otol/annotations.json"
dbFile = "data.db"
-nodeMap = {} # Maps node names to node objects
+nodeMap = {} # Maps node IDs to node objects
idToName = {} # Maps node IDs to names
+nameToFirstId = {} # Maps node names to first found ID (names might have multiple IDs)
+dupNameToIds = {} # Maps names of nodes with multiple IDs to those node IDs
-# Check for existing db
-if os.path.exists(dbFile):
- print("ERROR: Existing {} db".format(dbFile), file=sys.stderr)
- sys.exit(1)
-
+## Check for existing db
+#if os.path.exists(dbFile):
+# print("ERROR: Existing {} db".format(dbFile), file=sys.stderr)
+# sys.exit(1)
# Parse treeFile
+print("Parsing tree file")
data = None
with open(treeFile) as file:
data = file.read()
@@ -48,13 +50,13 @@ def parseNewick():
# Check for node
if data[dataIdx] == "(": # parse inner node
dataIdx += 1
- childNames = []
+ childIds = []
while True:
# Read child
- childName = parseNewick()
- if childName == None:
+ childId = parseNewick()
+ if childId == None:
return None
- childNames.append(childName)
+ childIds.append(childId)
if (dataIdx == len(data)):
print("ERROR: Unexpected EOF", file=sys.stderr)
return None
@@ -63,34 +65,34 @@ def parseNewick():
dataIdx += 1
continue
else:
- # Get node name
+ # Get node name and id
dataIdx += 1 # Consume an expected ')'
[name, id] = parseNewickName()
- idToName[id] = name
+ updateNameMaps(name, id)
# Get child num-tips total
tips = 0
- for childName in childNames:
- tips += nodeMap[childName]["tips"]
+ for childId in childIds:
+ tips += nodeMap[childId]["tips"]
# Add node to nodeMap
- if name in nodeMap: # Turns out the names might not actually be unique
- count = 2
- name2 = name + " [" + str(count) + "]"
- while name2 in nodeMap:
- count += 1
- name2 = name + " [" + str(count) + "]"
- name = name2
- nodeMap[name] = {
- "name": name, "id": id, "children": childNames, "parent": None, "tips": tips, "pSupport": False
- }
+ nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False}
# Update childrens' parent reference
- for childName in childNames:
- nodeMap[childName]["parent"] = name
- return name
+ for childId in childIds:
+ nodeMap[childId]["parent"] = id
+ return id
else: # Parse node name
[name, id] = parseNewickName()
- idToName[id] = name
- nodeMap[name] = {"name": name, "id": id, "children": [], "parent": None, "tips": 1, "pSupport": False}
- return name
+ updateNameMaps(name, id)
+ nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False}
+ return id
+def updateNameMaps(name, id):
+ idToName[id] = name
+ if name not in nameToFirstId:
+ nameToFirstId[name] = id
+ else:
+ if name not in dupNameToIds:
+ dupNameToIds[name] = [nameToFirstId[name], id]
+ else:
+ dupNameToIds[name].append(id)
def parseNewickName():
"""Helper that parses an input node name, and returns a [name,id] pair"""
global data, dataIdx
@@ -139,67 +141,66 @@ def parseNewickName():
raise Exception("ERROR: invalid name \"{}\"".format(name))
return [match.group(1).replace("_", " "), match.group(2)]
rootName = parseNewick()
-
+# Resolve duplicate names
+print("Resolving duplicates")
+for [dupName, ids] in dupNameToIds.items():
+ # Get conflicting node with most tips
+ tipNums = [nodeMap[id]["tips"] for id in ids]
+ maxIdx = tipNums.index(max(tipNums))
+ maxId = ids[maxIdx]
+ # Adjust name of other conflicting nodes
+ counter = 2
+ for id in ids:
+ if id != maxId:
+ nodeMap[id]["name"] += " [" + str(counter)+ "]"
+ counter += 1
+# Change mrca* names
+print("Changing mrca* names")
+def convertMrcaName(id):
+ node = nodeMap[id]
+ name = node["name"]
+ childIds = node["children"]
+ if len(childIds) < 2:
+ print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr)
+ return
+ # Get 2 children with most tips
+ childTips = [nodeMap[id]["tips"] for id in childIds]
+ maxIdx = childTips.index(max(childTips))
+ childTips[maxIdx] = 0
+ maxIdx2 = childTips.index(max(childTips))
+ childId1 = childIds[maxIdx]
+ childId2 = childIds[maxIdx2]
+ childName1 = nodeMap[childId1]["name"]
+ childName2 = nodeMap[childId2]["name"]
+ # Check for mrca* child names
+ if childName1.startswith("mrca"):
+ childName1 = convertMrcaName(childId1)
+ if childName2.startswith("mrca"):
+ childName2 = convertMrcaName(childId2)
+ # Check for composite names
+ match = re.fullmatch(r"\[(.+) \+ (.+)]", childName1)
+ if match != None:
+ childName1 = match.group(1)
+ match = re.fullmatch(r"\[(.+) \+ (.+)]", childName2)
+ if match != None:
+ childName2 = match.group(1)
+ # Create composite name
+ node["name"] = "[{} + {}]".format(childName1, childName2)
+ return childName1
+for [id, node] in nodeMap.items():
+ if node["name"].startswith("mrca"):
+ convertMrcaName(id)
# Parse annFile
+print("Parsing annotations file")
data = None
with open(annFile) as file:
data = file.read()
obj = json.loads(data)
nodeAnnsMap = obj['nodes']
-
-# Change mrca* names
-def applyMrcaNameConvert(name, namesToSwap):
- """
- Given an mrca* name, makes namesToSwap map it to an expanded version with the form [childName1 + childName2].
- May recurse on child nodes with mrca* names.
- Also returns the name of the highest-tips child (used when recursing).
- """
- node = nodeMap[name]
- childNames = node["children"]
- if len(childNames) < 2:
- print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr)
- return name
- # Get 2 children with most tips
- childTips = []
- for n in childNames:
- childTips.append(nodeMap[n]["tips"])
- maxTips = max(childTips)
- maxIdx = childTips.index(maxTips)
- childTips[maxIdx] = 0
- maxTips2 = max(childTips)
- maxIdx2 = childTips.index(maxTips2)
- childName1 = node["children"][maxIdx]
- childName2 = node["children"][maxIdx2]
- # Check for composite child names
- if childName1.startswith("mrca"):
- childName1 = applyMrcaNameConvert(childName1, namesToSwap)
- if childName2.startswith("mrca"):
- childName2 = applyMrcaNameConvert(childName2, namesToSwap)
- # Create composite name
- namesToSwap[name] = "[{} + {}]".format(childName1, childName2)
- return childName1
-namesToSwap = {} # Maps mrca* names to replacement names
-for node in nodeMap.values():
- name = node["name"]
- if (name.startswith("mrca") and name not in namesToSwap):
- applyMrcaNameConvert(name, namesToSwap)
-for [oldName, newName] in namesToSwap.items():
- nodeMap[newName] = nodeMap[oldName]
- del nodeMap[oldName]
-for node in nodeMap.values():
- parentName = node["parent"]
- if (parentName in namesToSwap):
- node["parent"] = namesToSwap[parentName]
- childNames = node["children"]
- for i in range(len(childNames)):
- childName = childNames[i]
- if (childName in namesToSwap):
- childNames[i] = namesToSwap[childName]
-
# Add annotations data
-for node in nodeMap.values():
+print("Adding annotation data")
+for [id, node] in nodeMap.items():
# Set has-support value using annotations
- id = node["id"]
if id in nodeAnnsMap:
nodeAnns = nodeAnnsMap[id]
supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0
@@ -208,15 +209,15 @@ for node in nodeMap.values():
# Root node gets support
if node["parent"] == None:
node["pSupport"] = True
-
# Create db
+print("Creating nodes table")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT)")
-for name in nodeMap.keys():
- node = nodeMap[name]
+for node in nodeMap.values():
+ childNames = [nodeMap[id]["name"] for id in node["children"]]
+ parentName = "" if node["parent"] == None else nodeMap[node["parent"]]["name"]
dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?, ?, ?)",
- (name, json.dumps(node["children"]), "" if node["parent"] == None else node["parent"],
- node["tips"], 1 if node["pSupport"] else 0))
+ (node["name"], json.dumps(childNames), parentName, node["tips"], 1 if node["pSupport"] else 0))
dbCon.commit()
dbCon.close()
diff --git a/backend/server.py b/backend/server.py
index c8567ab..1d31b36 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -81,7 +81,7 @@ def lookupName(name):
# "SELECT DISTINCT names.name, names.alt_name, nodes.tips FROM" \
# " names INNER JOIN nodes ON names.name = nodes.name " \
# " WHERE alt_name LIKE ? ORDER BY nodes.tips DESC LIMIT ?",
- # (name, SEARCH_SUGG_LIMIT)):
+ # (name + "%", SEARCH_SUGG_LIMIT)):
# results.append({"name": row[0], "altName": row[1]})
for row in cur.execute(
"SELECT word, alt_name, name FROM" \