diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 23 | ||||
| -rwxr-xr-x | backend/data/genEolNameData.py | 11 | ||||
| -rwxr-xr-x | backend/data/genOtolData.py | 1 | ||||
| -rwxr-xr-x | backend/data/genSpellfixNameData.py | 32 | ||||
| -rwxr-xr-x | backend/data/spellfix.so | bin | 0 -> 86456 bytes |
5 files changed, 59 insertions, 8 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 88053b5..b4ee77e 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -5,9 +5,26 @@ File Generation Process table using data in otol/*. 3 Run genEolNameData.py, which adds a 'names' table to data.db, using data in eol/vernacularNames.csv and the 'nodes' table. -4 Use downloadImgsForReview.py to download EOL images into imgsForReview/. +4 Run genSpellfixNameData.py, which adds a 'spellfix\_alt\_names' + table to data.db, using data in the 'names' table. +5 Use downloadImgsForReview.py to download EOL images into imgsForReview/. It uses data in eol/imagesList.db, and the 'nodes' table. -5 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique +6 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique images in imgsReviewed/ (uses 'names' to display common names). -6 Use genImgsForWeb.py to create cropped/resized images in img/, using +7 Use genImgsForWeb.py to create cropped/resized images in img/, using images in imgsReviewed, and also to add an 'images' table to data.db. + +spellfix.so +=========== +This file provides the spellfix1 extension for Sqlite, and +is used for responding to fuzzy-search requests. 
+ +It was obtained by: +1 Downloading the sqlite source tree from + the github mirror at <https://github.com/sqlite/sqlite>, + into a directory sqlite/ +2 After making sure autoconf 2.61+ and libtool are installed, + running `mkdir bld; cd bld; ../sqlite/configure;` +3 Running `make` +4 Running `cp ../sqlite/ext/misc/spellfix.c .` +5 Running `gcc -fPIC -shared spellfix.c -o spellfix.so` diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py index 5070fd7..46e109a 100755 --- a/backend/data/genEolNameData.py +++ b/backend/data/genEolNameData.py @@ -1,7 +1,5 @@ #!/usr/bin/python3 -# - import sys, re import csv, sqlite3 @@ -13,6 +11,9 @@ usageInfo += "\n" usageInfo += "Expects a CSV header describing lines with format:\n" usageInfo += " page_id, canonical_form, vernacular_string, language_code,\n" usageInfo += " resource_name, is_preferred_by_resource, is_preferred_by_eol\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) vnamesFile = "eol/vernacularNames.csv" dbFile = "data.db" @@ -54,6 +55,7 @@ with open(vnamesFile, newline="") as csvfile: # Add to maps updateMaps(name1, pid, True, False) updateMaps(name2, pid, False, preferred) + # Open db connection dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -62,9 +64,9 @@ dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, pref_alt # Iterate through 'nodes' table, resolving to canonical-names usedPids = set() unresolvedNodeNames = set() -cur2 = dbCon.cursor() +dbCur2 = dbCon.cursor() iterationNum = 0 -for row in cur2.execute("SELECT name FROM nodes"): +for row in dbCur2.execute("SELECT name FROM nodes"): name = row[0] iterationNum += 1 if iterationNum % 10000 == 0: @@ -108,5 +110,6 @@ for name in unresolvedNodeNames: for n in altNames: isPreferred = 1 if (n == preferredName) else 0 dbCur.execute("INSERT INTO names VALUES (?, ?, ?, ?)", (name, n, pidToUse, isPreferred)) +# Close db dbCon.commit() dbCon.close() diff --git a/backend/data/genOtolData.py 
b/backend/data/genOtolData.py index 696d59b..57a15d2 100755 --- a/backend/data/genOtolData.py +++ b/backend/data/genOtolData.py @@ -18,7 +18,6 @@ usageInfo += "Expected annotations.json format:\n" usageInfo += " JSON object holding information about the tree-of-life release.\n" usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n" usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n" - if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) diff --git a/backend/data/genSpellfixNameData.py b/backend/data/genSpellfixNameData.py new file mode 100755 index 0000000..9a3a7a1 --- /dev/null +++ b/backend/data/genSpellfixNameData.py @@ -0,0 +1,32 @@ +#!/usr/bin/python3 + +import sys +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads alt-names from a 'names' table in a database, and adds a spellfix \n" +usageInfo += "table 'spellfix_alt_names' usable for fuzzy-searching those names.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbFile = "data.db" + +# Connect to db, and load spellfix extension +dbCon = sqlite3.connect(dbFile) +dbCon.enable_load_extension(True) +dbCon.load_extension('./spellfix') +# Create spellfix table, and insert alt-names +spellfixCur = dbCon.cursor() +spellfixCur.execute("CREATE VIRTUAL TABLE spellfix_alt_names USING spellfix1") +namesCur = dbCon.cursor() +iterationNum = 0 +for row in namesCur.execute("SELECT DISTINCT alt_name FROM names"): + iterationNum += 1 + if iterationNum % 10000 == 0: + print("Loop {}: {}".format(iterationNum, row[0])) + # Insert alt-name + spellfixCur.execute("INSERT INTO spellfix_alt_names(word) VALUES (?)", (row[0],)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/spellfix.so b/backend/data/spellfix.so Binary files differ new file mode 100755 index 0000000..0bc985c --- /dev/null +++ b/backend/data/spellfix.so |
