From d87bb9bc0991d7ce4eeb895da61c63a204edaa4d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 30 Apr 2022 13:24:26 +1000 Subject: Add scripts for downloading/reviewing/cropping_and_resizing images Also adjust client code to handle new format, and add backend/data/README.md explaining image production process. --- backend/data/otolToSqlite.py | 227 ------------------------------------------- 1 file changed, 227 deletions(-) delete mode 100755 backend/data/otolToSqlite.py (limited to 'backend/data/otolToSqlite.py') diff --git a/backend/data/otolToSqlite.py b/backend/data/otolToSqlite.py deleted file mode 100755 index 2ee47b7..0000000 --- a/backend/data/otolToSqlite.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, json, sqlite3 -import os.path - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release), \n" -usageInfo += "and creates an sqlite database otol.db, which holds entries of the form (name text, data text).\n" -usageInfo += "Each row holds a tree-of-life node name, and a JSON string with the form \n" -usageInfo += "{\"children\": [name1, ...], \"parent\": name1, \"tips\": int1, \"pSupport\": bool1}, holding \n" -usageInfo += "child names, a parent name or null, descendant 'tips', and a phylogeny-support indicator\n" -usageInfo += "\n" -usageInfo += "This script was adapted to handle Open Tree of Life version 13.4.\n" -usageInfo += "Link: https://tree.opentreeoflife.org/about/synthesis-release/v13.4\n" -usageInfo += "\n" -usageInfo += "labelled_supertree_ottnames.tre format:\n" -usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n" -usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n" -usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n" -usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n" -usageInfo += "annotations.json format:\n" -usageInfo += " JSON object holding information about the tree-of-life release.\n" -usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n" -usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n" - -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -treeFile = "otol/labelled_supertree_ottnames.tre" -annFile = "otol/annotations.json" -dbFile = "data.db" -nodeMap = {} # Maps node names to node objects -idToName = {} # Maps node IDs to names - -# Check for existing db -if os.path.exists(dbFile): - print("ERROR: Existing {} db".format(dbFile), file=sys.stderr) - sys.exit(1) - -# Parse treeFile -data = None -with open(treeFile) as file: - data = file.read() -dataIdx = 0 -def parseNewick(): - """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None""" - global dataIdx - # Check for EOF - if dataIdx == len(data): - print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr) - return None - # Check for inner-node start - if data[dataIdx] == "(": - dataIdx += 1 - childNames = [] - while True: - # Read child - childName = parseNewick() - if childName == None: - return None - childNames.append(childName) - if (dataIdx == len(data)): - print("ERROR: Unexpected EOF", file=sys.stderr) - return None - # Check for next child - if (data[dataIdx] == ","): - dataIdx += 1 - continue - else: - # Get node name - dataIdx += 1 # Consume an expected ')' - [name, id] = parseNewickName() - idToName[id] = name - # Get child num-tips total - tips = 0 - for childName in childNames: - tips += nodeMap[childName]["tips"] - # Add node to nodeMap - if name in nodeMap: # Turns out the names might not actually be unique - count = 2 - name2 = name + " [" + str(count) + "]" - while name2 in nodeMap: - count += 1 - name2 = name + " [" + str(count) + "]" - name = name2 - nodeMap[name] = { - "n": name, "id": id, "children": childNames, "parent": None, "tips": tips, "pSupport": False - } - # Update childrens' parent reference - for childName in childNames: - nodeMap[childName]["parent"] = name - return name - else: - [name, id] = parseNewickName() - idToName[id] = name - nodeMap[name] = {"n": name, "id": id, "children": [], "parent": None, "tips": 1, "pSupport": False} - return name -def parseNewickName(): - """Helper that parses an input node name, and returns a [name,id] pair""" - global data, dataIdx - name = None - end = dataIdx - # Get name - if (end < len(data) and data[end] == "'"): # Check for quoted name - end += 1 - inQuote = True - while end < len(data): - if (data[end] == "'"): - if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote - end += 2 - continue - else: - end += 1 - inQuote = False - break - end += 1 - if inQuote: - raise Exception("ERROR: Unexpected EOF") - name = data[dataIdx:end] - dataIdx = end - else: - while end < len(data) and not re.match(r"[(),]", data[end]): - end += 1 - if (end == dataIdx): - raise Exception("ERROR: Unexpected EOF") - name = data[dataIdx:end].rstrip() - if end == len(data): # Ignore trailing input semicolon - name = name[:-1] - dataIdx = end - # Convert to [name, id] - name = name.lower() - if name.startswith("mrca"): - return [name, name] - elif name[0] == "'": - match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) - if match == None: - raise Exception("ERROR: invalid name \"{}\"".format(name)) - name = match.group(1).replace("''", "'") - return [name, match.group(2)] - else: - match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) - if match == None: - raise Exception("ERROR: invalid name \"{}\"".format(name)) - return [match.group(1).replace("_", " "), match.group(2)] -rootName = parseNewick() - -# Parse annFile -data = None -with open(annFile) as file: - data = file.read() -obj = json.loads(data) -nodeAnnsMap = obj['nodes'] - -# Change mrca* names -def applyMrcaNameConvert(name, namesToSwap): - """ - Given an mrca* name, makes namesToSwap map it to an expanded version with the form [childName1 + childName2]. - May recurse on child nodes with mrca* names. - Also returns the name of the highest-tips child (used when recursing). - """ - node = nodeMap[name] - childNames = node["children"] - if len(childNames) < 2: - print("WARNING: MRCA node \"{}\" has less than 2 children".format(name), file=sys.stderr) - return name - # Get 2 children with most tips - childTips = [] - for n in childNames: - childTips.append(nodeMap[n]["tips"]) - maxTips = max(childTips) - maxIdx = childTips.index(maxTips) - childTips[maxIdx] = 0 - maxTips2 = max(childTips) - maxIdx2 = childTips.index(maxTips2) - # - childName1 = node["children"][maxIdx] - childName2 = node["children"][maxIdx2] - if childName1.startswith("mrca"): - childName1 = applyMrcaNameConvert(childName1, namesToSwap) - if childName2.startswith("mrca"): - childName2 = applyMrcaNameConvert(childName2, namesToSwap) - # Create composite name - namesToSwap[name] = "[{} + {}]".format(childName1, childName2) - return childName1 -namesToSwap = {} # Maps mrca* names to replacement names -for node in nodeMap.values(): - name = node["n"] - if (name.startswith("mrca") and name not in namesToSwap): - applyMrcaNameConvert(name, namesToSwap) -for [oldName, newName] in namesToSwap.items(): - nodeMap[newName] = nodeMap[oldName] - del nodeMap[oldName] -for node in nodeMap.values(): - parentName = node["parent"] - if (parentName in namesToSwap): - node["parent"] = namesToSwap[parentName] - childNames = node["children"] - for i in range(len(childNames)): - childName = childNames[i] - if (childName in namesToSwap): - childNames[i] = namesToSwap[childName] - -# Add annotations data, and delete certain fields -for node in nodeMap.values(): - # Set has-support value using annotations - id = node["id"] - if id in nodeAnnsMap: - nodeAnns = nodeAnnsMap[id] - supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0 - conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0 - node["pSupport"] = supportQty > 0 and conflictQty == 0 - # Root node gets support - if node["parent"] == None: - node["pSupport"] = True - # Delete some no-longer-needed fields - del node["n"] - del node["id"] - -# Create db -con = sqlite3.connect(dbFile) -cur = con.cursor() -cur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, data TEXT)") -for name in nodeMap.keys(): - cur.execute("INSERT INTO nodes VALUES (?, ?)", (name, json.dumps(nodeMap[name]))) -con.commit() -con.close() -- cgit v1.2.3