aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r-- backend/data/README.md 23
-rwxr-xr-x backend/data/genEolNameData.py 11
-rwxr-xr-x backend/data/genOtolData.py 1
-rwxr-xr-x backend/data/genSpellfixNameData.py 32
-rwxr-xr-x backend/data/spellfix.so bin 0 -> 86456 bytes
5 files changed, 59 insertions, 8 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 88053b5..b4ee77e 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -5,9 +5,26 @@ File Generation Process
table using data in otol/*.
3 Run genEolNameData.py, which adds a 'names' table to data.db,
using data in eol/vernacularNames.csv and the 'nodes' table.
-4 Use downloadImgsForReview.py to download EOL images into imgsForReview/.
+4 Run genSpellfixNameData.py, which adds a 'spellfix\_alt\_names'
+ table to data.db, using data in the 'names' table.
+5 Use downloadImgsForReview.py to download EOL images into imgsForReview/.
It uses data in eol/imagesList.db, and the 'nodes' table.
-5 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique
+6 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique
images in imgsReviewed/ (uses 'names' to display common names).
-6 Use genImgsForWeb.py to create cropped/resized images in img/, using
+7 Use genImgsForWeb.py to create cropped/resized images in img/, using
images in imgsReviewed, and also to add an 'images' table to data.db.
+
+spellfix.so
+===========
+This file provides the spellfix1 extension for SQLite, and
+is used for responding to fuzzy-search requests.
+
+It was obtained by:
+1 Downloading the SQLite source tree from
+ the GitHub mirror at <https://github.com/sqlite/sqlite>,
+ into a directory sqlite/
+2 After making sure autoconf 2.61+ and libtool are installed,
+ running `mkdir bld; cd bld; ../sqlite/configure;`
+3 Running `make`
+4 Running `cp ../sqlite/ext/misc/spellfix.c .`
+5 Running `gcc -fPIC -shared spellfix.c -o spellfix.so`
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index 5070fd7..46e109a 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -1,7 +1,5 @@
#!/usr/bin/python3
-#
-
import sys, re
import csv, sqlite3
@@ -13,6 +11,9 @@ usageInfo += "\n"
usageInfo += "Expects a CSV header describing lines with format:\n"
usageInfo += " page_id, canonical_form, vernacular_string, language_code,\n"
usageInfo += " resource_name, is_preferred_by_resource, is_preferred_by_eol\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
vnamesFile = "eol/vernacularNames.csv"
dbFile = "data.db"
@@ -54,6 +55,7 @@ with open(vnamesFile, newline="") as csvfile:
# Add to maps
updateMaps(name1, pid, True, False)
updateMaps(name2, pid, False, preferred)
+
# Open db connection
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -62,9 +64,9 @@ dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, pref_alt
# Iterate through 'nodes' table, resolving to canonical-names
usedPids = set()
unresolvedNodeNames = set()
-cur2 = dbCon.cursor()
+dbCur2 = dbCon.cursor()
iterationNum = 0
-for row in cur2.execute("SELECT name FROM nodes"):
+for row in dbCur2.execute("SELECT name FROM nodes"):
name = row[0]
iterationNum += 1
if iterationNum % 10000 == 0:
@@ -108,5 +110,6 @@ for name in unresolvedNodeNames:
for n in altNames:
isPreferred = 1 if (n == preferredName) else 0
dbCur.execute("INSERT INTO names VALUES (?, ?, ?, ?)", (name, n, pidToUse, isPreferred))
+# Close db
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 696d59b..57a15d2 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -18,7 +18,6 @@ usageInfo += "Expected annotations.json format:\n"
usageInfo += " JSON object holding information about the tree-of-life release.\n"
usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n"
-
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
diff --git a/backend/data/genSpellfixNameData.py b/backend/data/genSpellfixNameData.py
new file mode 100755
index 0000000..9a3a7a1
--- /dev/null
+++ b/backend/data/genSpellfixNameData.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+
+import sys
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads alt-names from a 'names' table in a database, and adds a spellfix \n"
+usageInfo += "table 'spellfix_alt_names' usable for fuzzy-searching those names.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dbFile = "data.db"
+
+# Connect to db, and load spellfix extension
+dbCon = sqlite3.connect(dbFile)
+dbCon.enable_load_extension(True)
+dbCon.load_extension('./spellfix')
+# Create spellfix table, and insert alt-names
+spellfixCur = dbCon.cursor()
+spellfixCur.execute("CREATE VIRTUAL TABLE spellfix_alt_names USING spellfix1")
+namesCur = dbCon.cursor()
+iterationNum = 0
+for row in namesCur.execute("SELECT DISTINCT alt_name FROM names"):
+ iterationNum += 1
+ if iterationNum % 10000 == 0:
+ print("Loop {}: {}".format(iterationNum, row[0]))
+ # Insert alt-name
+ spellfixCur.execute("INSERT INTO spellfix_alt_names(word) VALUES (?)", (row[0],))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/spellfix.so b/backend/data/spellfix.so
new file mode 100755
index 0000000..0bc985c
--- /dev/null
+++ b/backend/data/spellfix.so
Binary files differ