about | summary | refs | log | tree | commit | diff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rwxr-xr-x  backend/data/eol/reviewImgs.py  2
-rwxr-xr-x  backend/data/genEolNameData.py  58
2 files changed, 29 insertions, 31 deletions
diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py
index ba313d9..4fea1c4 100755
--- a/backend/data/eol/reviewImgs.py
+++ b/backend/data/eol/reviewImgs.py
@@ -26,7 +26,7 @@ def getExtraInfo(eolId):
" WHERE id = ? and pref_alt = 1"
row = extraInfoDbCur.execute(query, (eolId,)).fetchone()
if row != None:
- return f"Reviewing EOL ID {eolId}, aka \"row[0]\""
+ return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\""
else:
return f"Reviewing EOL ID {eolId}"
IMG_DISPLAY_SZ = 400
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index e7eccba..4945386 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -17,16 +17,19 @@ if len(sys.argv) > 1:
vnamesFile = "eol/vernacularNames.csv"
dbFile = "data.db"
+NAMES_TO_SKIP = {"unknown", "unknown species", "unidentified species"}
# Read in vernacular-names data
# Note: Canonical-names may have multiple pids
# Note: A canonical-name's associated pids might all have other associated names
print("Reading in vernacular-names data")
nameToPids = {}
-pidToNames = {}
canonicalNameToPids = {}
+pidToNames = {}
pidToPreferred = {}
def updateMaps(name, pid, canonical, preferredAlt):
+ if name in NAMES_TO_SKIP:
+ return
if name not in nameToPids:
nameToPids[name] = {pid}
else:
@@ -57,7 +60,7 @@ with open(vnamesFile, newline="") as csvfile:
preferred = row[6] == "preferred"
# Add to maps
updateMaps(name1, pid, True, False)
- if lang == "eng":
+ if lang == "eng" and name2 != "":
updateMaps(name2, pid, False, preferred)
# Open db connection
dbCon = sqlite3.connect(dbFile)
@@ -72,30 +75,34 @@ dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)")
usedPids = set()
unresolvedNodeNames = set()
dbCur2 = dbCon.cursor()
+def addToDb(nodeName, pidToUse):
+ altNames = set()
+ preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
+ dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
+ for n in pidToNames[pidToUse]:
+ if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None:
+ altNames.add(n)
+ for n in altNames:
+ isPreferred = 1 if (n == preferredName) else 0
+ dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (nodeName, n, isPreferred))
iterationNum = 0
-for row in dbCur2.execute("SELECT name FROM nodes"):
- name = row[0]
+for (name,) in dbCur2.execute("SELECT name FROM nodes"):
iterationNum += 1
if iterationNum % 10000 == 0:
print(f"Loop 1 iteration {iterationNum}")
# If name matches a canonical-name, add alt-name entries to 'names' table
if name in canonicalNameToPids:
- pidToUse = 0
+ pidToUse = None
for pid in canonicalNameToPids[name]:
- if pid not in usedPids:
+ hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred
+ hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred
+ if hasLowerPrio:
+ continue
+ if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio):
pidToUse = pid
- break
- if pidToUse > 0:
+ if pidToUse != None:
usedPids.add(pidToUse)
- altNames = set()
- preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
- dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name))
- for n in pidToNames[pidToUse]:
- if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None:
- altNames.add(n)
- for n in altNames:
- isPreferred = 1 if (n == preferredName) else 0
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred))
+ addToDb(name, pidToUse)
elif name in nameToPids:
unresolvedNodeNames.add(name)
# Iterate through unresolved nodes, resolving to vernacular-names
@@ -105,22 +112,13 @@ for name in unresolvedNodeNames:
if iterationNum % 100 == 0:
print(f"Loop 2 iteration {iterationNum}")
# Add alt-name entries to 'names' table for first corresponding pid
- pidToUse = 0
+ pidToUse = None
for pid in nameToPids[name]:
- if pid not in usedPids:
+ if pid not in usedPids and (pidToUse == None or pid < pidToUse):
pidToUse = pid
- break
- if pidToUse > 0:
+ if pidToUse != None:
usedPids.add(pidToUse)
- altNames = set()
- preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
- dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name))
- for n in pidToNames[pidToUse]:
- if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None:
- altNames.add(n)
- for n in altNames:
- isPreferred = 1 if (n == preferredName) else 0
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred))
+ addToDb(name, pidToUse)
# Close db
dbCon.commit()
dbCon.close()