diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-04-27 00:29:20 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-04-27 01:38:04 +1000 |
| commit | 6f52cd6b07970010c40270003d63aa74f84f6ae9 (patch) | |
| tree | f23c0a9ebc50db89463621516ab643d5c76c1590 /backend/data/eolNamesToSqlite.py | |
| parent | 55e281a57c2ac9acb18836ea7a48f5a553d924e2 (diff) | |
Use EOL vernacular-names data for searching
Add data/eolNamesToSqlite.py to read EOL vernacular-names data and add
to sqlite db. Adjust server.py to handle search requests, and adjust
SearchModal to make them.
Diffstat (limited to 'backend/data/eolNamesToSqlite.py')
| -rwxr-xr-x | backend/data/eolNamesToSqlite.py | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/backend/data/eolNamesToSqlite.py b/backend/data/eolNamesToSqlite.py new file mode 100755 index 0000000..4013a4c --- /dev/null +++ b/backend/data/eolNamesToSqlite.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 + +import sys, re +import csv, sqlite3 + +vnamesFile = "eol/vernacular_names.csv" +dbFile = "data.db" + +# Read in vernacular-names data +nameToPids = {} +pidToNames = {} +def updateMaps(name, pid): + if name not in nameToPids: + nameToPids[name] = {pid} + elif pid not in nameToPids[name]: + nameToPids[name].add(pid) + if pid not in pidToNames: + pidToNames[pid] = {name} + elif name not in pidToNames[pid]: + pidToNames[pid].add(name) +with open(vnamesFile, newline="") as csvfile: + reader = csv.reader(csvfile) + lineNum = 0 + for row in reader: + lineNum += 1 + if lineNum == 1: + continue + pid = int(row[0]) + name1 = re.sub(r"<[^>]+>", "", row[1].lower()) + name2 = row[2].lower() + # Add to maps + updateMaps(name1, pid) + updateMaps(name2, pid) +# Open db connection +dbCon = sqlite3.connect(dbFile) +cur = dbCon.cursor() +# Create 'names' table +cur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, PRIMARY KEY(name, alt_name))") +# Iterate through 'nodes' table +cur2 = dbCon.cursor() +iterationNum = 0 +for row in cur2.execute("SELECT name FROM nodes"): + name = row[0] + iterationNum += 1 + if iterationNum % 10000 == 0: + print("Iteration {}".format(iterationNum)) + # If name matches a vernacular-names name, add alt-name entries to the 'names' table + if name in nameToPids: + altNames = {name} + for pid in nameToPids[name]: + for n in pidToNames[pid]: + altNames.add(n) + for n in altNames: + cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pid)) +dbCon.commit() +dbCon.close() |
