aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-04-27 00:29:20 +1000
committer Terry Truong <terry06890@gmail.com> 2022-04-27 01:38:04 +1000
commit 6f52cd6b07970010c40270003d63aa74f84f6ae9 (patch)
tree f23c0a9ebc50db89463621516ab643d5c76c1590 /backend/data
parent 55e281a57c2ac9acb18836ea7a48f5a553d924e2 (diff)
Use EOL vernacular-names data for searching
Add data/eolNamesToSqlite.py to read EOL vernacular-names data and add to sqlite db. Adjust server.py to handle search requests, and adjust SearchModal to make them.
Diffstat (limited to 'backend/data')
-rw-r--r-- backend/data/eol/README.md 8
-rwxr-xr-x backend/data/eolNamesToSqlite.py 56
-rw-r--r-- backend/data/otol/README.md 6
-rwxr-xr-x backend/data/otolToSqlite.py 2
4 files changed, 71 insertions, 1 deletion
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md
index e69de29..ed970d2 100644
--- a/backend/data/eol/README.md
+++ b/backend/data/eol/README.md
@@ -0,0 +1,8 @@
+Files
+=====
+- images\_list.tgz
+ Obtained from https://opendata.eol.org/dataset/images-list on 24/04/2022
+ Listed as being last updated on 05/02/2020
+- vernacular\_names.csv
+ Obtained from https://opendata.eol.org/dataset/vernacular-names on 24/04/2022
+ Listed as being last updated on 27/10/2020
diff --git a/backend/data/eolNamesToSqlite.py b/backend/data/eolNamesToSqlite.py
new file mode 100755
index 0000000..4013a4c
--- /dev/null
+++ b/backend/data/eolNamesToSqlite.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python3
+
+import sys, re
+import csv, sqlite3
+
+# Relative path of the vernacular-names CSV that this script reads
+vnamesFile = "eol/vernacular_names.csv"
+# Relative path of the sqlite database that this script connects to
+dbFile = "data.db"
+
+# Read in vernacular-names data
+nameToPids = {}  # Maps each name to the set of IDs (pids) it was seen with
+pidToNames = {}  # Maps each ID (pid) to the set of names seen with it
+def updateMaps(name, pid):
+    """Record that 'name' and 'pid' belong together, in both lookup maps."""
+    # setdefault() creates the set on first sight; set.add() ignores repeats
+    nameToPids.setdefault(name, set()).add(pid)
+    pidToNames.setdefault(pid, set()).add(name)
+with open(vnamesFile, newline="") as csvfile:
+    # Rows are numbered from 1 so that the first row can be skipped
+    for lineNum, row in enumerate(csv.reader(csvfile), start=1):
+        if lineNum == 1:
+            continue
+        pid = int(row[0])
+        # Second column: lower-cased, with anything of the form <...> removed
+        name1 = re.sub(r"<[^>]+>", "", row[1].lower())
+        # Third column: lower-cased only
+        name2 = row[2].lower()
+        # Add to maps
+        updateMaps(name1, pid)
+        updateMaps(name2, pid)
+# Open db connection
+dbCon = sqlite3.connect(dbFile)
+cur = dbCon.cursor()
+# Create 'names' table
+cur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, PRIMARY KEY(name, alt_name))")
+# Iterate through 'nodes' table (separate cursor, since 'cur' is used for inserts during the scan)
+cur2 = dbCon.cursor()
+iterationNum = 0
+for row in cur2.execute("SELECT name FROM nodes"):
+    name = row[0]
+    iterationNum += 1
+    if iterationNum % 10000 == 0:
+        print("Iteration {}".format(iterationNum))
+    # If name matches a vernacular-names name, add alt-name entries to the 'names' table
+    if name in nameToPids:
+        # Remember, for each alt-name, the ID that actually links it to 'name'.
+        # Previously the insert reused whatever 'pid' was left over from the loop,
+        # so every row for a node got the same, arbitrarily chosen ID.
+        # IDs are visited in sorted order so the choice is deterministic when an
+        # alt-name is reachable through several IDs; only one row per
+        # (name, alt_name) may exist because of the primary key.
+        altNameToPid = {}
+        for pid in sorted(nameToPids[name]):
+            # updateMaps() fills both maps together, so 'name' itself is always
+            # in pidToNames[pid] and still gets its own row
+            for n in pidToNames[pid]:
+                altNameToPid.setdefault(n, pid)
+        for n, pid in altNameToPid.items():
+            cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pid))
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/otol/README.md b/backend/data/otol/README.md
index e69de29..f720772 100644
--- a/backend/data/otol/README.md
+++ b/backend/data/otol/README.md
@@ -0,0 +1,6 @@
+Files
+=====
+- labelled\_supertree\_ottnames.tre
+ Obtained from https://tree.opentreeoflife.org/about/synthesis-release/v13.4
+- annotations.json
+ Obtained from https://tree.opentreeoflife.org/about/synthesis-release/v13.4
diff --git a/backend/data/otolToSqlite.py b/backend/data/otolToSqlite.py
index 93ed294..2ee47b7 100755
--- a/backend/data/otolToSqlite.py
+++ b/backend/data/otolToSqlite.py
@@ -29,7 +29,7 @@ if len(sys.argv) > 1:
treeFile = "otol/labelled_supertree_ottnames.tre"
annFile = "otol/annotations.json"
-dbFile = "otol.db"
+dbFile = "data.db"
nodeMap = {} # Maps node names to node objects
idToName = {} # Maps node IDs to names