about summary refs log tree commit diff
path: root/backend/data/eolNamesToSqlite.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-04-30 13:24:26 +1000
committer Terry Truong <terry06890@gmail.com> 2022-04-30 13:24:26 +1000
commit d87bb9bc0991d7ce4eeb895da61c63a204edaa4d (patch)
tree 8a5e51817aba00f4d1a281749764805e2aee618a /backend/data/eolNamesToSqlite.py
parent 565495b1153c87cbf907de31d116c5f89bcffc2a (diff)
Add scripts for downloading/reviewing/cropping_and_resizing images
Also adjust client code to handle new format, and add backend/data/README.md explaining image production process.
Diffstat (limited to 'backend/data/eolNamesToSqlite.py')
-rwxr-xr-x backend/data/eolNamesToSqlite.py 93
1 files changed, 0 insertions, 93 deletions
diff --git a/backend/data/eolNamesToSqlite.py b/backend/data/eolNamesToSqlite.py
deleted file mode 100755
index 1df1c23..0000000
--- a/backend/data/eolNamesToSqlite.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re
-import csv, sqlite3
-
-# Input CSV of vernacular names and the SQLite database file this script writes to.
-# Both are relative paths, so they resolve against the current working directory.
-vnamesFile = "eol/vernacular_names.csv"
-dbFile = "data.db"
-
-# Read in vernacular-names data
- # Note: Canonical-names may have multiple pids
- # Note: A canonical-name's associated pids might all have other associated names
-# Lookup tables filled by updateMaps(); every value is a set:
-#   nameToPids:          any recorded name (canonical or not) -> pids
-#   pidToNames:          pid -> every name recorded for that pid
-#   canonicalNameToPids: names recorded with canonical=True -> pids
-nameToPids = {}
-pidToNames = {}
-canonicalNameToPids = {}
-# Record one (name, pid) association in the module-level lookup tables.
-#   name:      name string used as the dictionary key
-#   pid:       id to associate with the name
-#   canonical: when true, the pair is additionally recorded in canonicalNameToPids
-# Returns nothing; only mutates nameToPids, canonicalNameToPids and pidToNames.
-def updateMaps(name, pid, canonical):
- # Every name, canonical or not, is indexed in nameToPids
- if name not in nameToPids:
- nameToPids[name] = {pid}
- else:
- nameToPids[name].add(pid)
- # Canonical names get a second, canonical-only index
- if canonical:
- if name not in canonicalNameToPids:
- canonicalNameToPids[name] = {pid}
- else:
- canonicalNameToPids[name].add(pid)
- # Reverse index: all names known for this pid
- if pid not in pidToNames:
- pidToNames[pid] = {name}
- else:
- pidToNames[pid].add(name)
-# Load every CSV row into the lookup tables via updateMaps().
-# newline="" is the open() mode the csv module documentation asks for.
-with open(vnamesFile, newline="") as csvfile:
- reader = csv.reader(csvfile)
- lineNum = 0
- for row in reader:
- lineNum += 1
- # Skip the first row (presumably a column-header line -- confirm against the CSV)
- if lineNum == 1:
- continue
- # Column 0 is parsed as the integer pid; int() raises ValueError on a non-numeric cell
- pid = int(row[0])
- # Column 1: lowercased, with any <...> spans (tag-like markup) removed; recorded as canonical
- name1 = re.sub(r"<[^>]+>", "", row[1].lower())
- # Column 2: lowercased only; recorded as a non-canonical name
- name2 = row[2].lower()
- # Add to maps
- updateMaps(name1, pid, True)
- updateMaps(name2, pid, False)
-# Open db connection
-dbCon = sqlite3.connect(dbFile)
-cur = dbCon.cursor()
-# Create 'names' table
-# One row per (node name, alternative name) pair, plus the pid it was resolved to.
-# There is no IF NOT EXISTS, so this statement raises if 'names' is already present.
-cur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, PRIMARY KEY(name, alt_name))")
-# Iterate through 'nodes' table, resolving to canonical-names
-# Pass 1: a node name that equals a canonical name claims one not-yet-used pid.
-# usedPids holds every pid already claimed, so no pid is assigned to two node names.
-usedPids = set()
-# Node names that are known to nameToPids but were not resolved in this pass
-unresolvedNodeNames = set()
-# The SELECT runs on its own cursor while INSERTs go through 'cur'
-# (presumably so the inserts do not disturb the SELECT's result iteration)
-cur2 = dbCon.cursor()
-iterationNum = 0
-for row in cur2.execute("SELECT name FROM nodes"):
- name = row[0]
- iterationNum += 1
- # Progress report every 10000 rows
- if iterationNum % 10000 == 0:
- print("Loop 1 iteration {}".format(iterationNum))
- # If name matches a canonical-name, add alt-name entries to 'names' table
- if name in canonicalNameToPids:
- # 0 is the "no free pid found" sentinel (assumes real pids are > 0 -- confirm)
- pidToUse = 0
- # Take the first unclaimed pid; the candidates are a set, so the order is not defined
- for pid in canonicalNameToPids[name]:
- if pid not in usedPids:
- pidToUse = pid
- break
- if pidToUse > 0:
- usedPids.add(pidToUse)
- # Seed with the node name itself so a (name, name) row is always written
- altNames = {name}
- for n in pidToNames[pidToUse]:
- altNames.add(n)
- for n in altNames:
- cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pidToUse))
- # NOTE(review): indentation is lost in this view; this elif presumably pairs with the
- # canonicalNameToPids check above rather than with 'if pidToUse > 0' -- confirm
- elif name in nameToPids:
- unresolvedNodeNames.add(name)
-# Iterate through unresolved nodes, resolving to vernacular-names
-# Pass 2: same pid-claiming procedure as pass 1, but candidates come from nameToPids.
-iterationNum = 0
-for name in unresolvedNodeNames:
- iterationNum += 1
- # Progress report every 10000 names
- if iterationNum % 10000 == 0:
- print("Loop 2 iteration {}".format(iterationNum))
- # Add alt-name entries to 'names' table for first corresponding pid
- # 0 is the "no free pid found" sentinel (assumes real pids are > 0 -- confirm)
- pidToUse = 0
- # Skip pids already claimed in either pass; the candidates are a set, so the order is not defined
- for pid in nameToPids[name]:
- if pid not in usedPids:
- pidToUse = pid
- break
- # A name whose pids are all claimed gets no rows in 'names'
- if pidToUse > 0:
- usedPids.add(pidToUse)
- # Seed with the node name itself so a (name, name) row is always written
- altNames = {name}
- for n in pidToNames[pidToUse]:
- altNames.add(n)
- for n in altNames:
- cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pidToUse))
-# Commit the pending inserts once, at the very end, then release the connection
-dbCon.commit()
-dbCon.close()