diff options
Diffstat (limited to 'backend/data')
| -rwxr-xr-x | backend/data/eol/reviewImgs.py | 2 | ||||
| -rwxr-xr-x | backend/data/genEolNameData.py | 58 |
2 files changed, 29 insertions, 31 deletions
diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py index ba313d9..4fea1c4 100755 --- a/backend/data/eol/reviewImgs.py +++ b/backend/data/eol/reviewImgs.py @@ -26,7 +26,7 @@ def getExtraInfo(eolId): " WHERE id = ? and pref_alt = 1" row = extraInfoDbCur.execute(query, (eolId,)).fetchone() if row != None: - return f"Reviewing EOL ID {eolId}, aka \"row[0]\"" + return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\"" else: return f"Reviewing EOL ID {eolId}" IMG_DISPLAY_SZ = 400 diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py index e7eccba..4945386 100755 --- a/backend/data/genEolNameData.py +++ b/backend/data/genEolNameData.py @@ -17,16 +17,19 @@ if len(sys.argv) > 1: vnamesFile = "eol/vernacularNames.csv" dbFile = "data.db" +NAMES_TO_SKIP = {"unknown", "unknown species", "unidentified species"} # Read in vernacular-names data # Note: Canonical-names may have multiple pids # Note: A canonical-name's associated pids might all have other associated names print("Reading in vernacular-names data") nameToPids = {} -pidToNames = {} canonicalNameToPids = {} +pidToNames = {} pidToPreferred = {} def updateMaps(name, pid, canonical, preferredAlt): + if name in NAMES_TO_SKIP: + return if name not in nameToPids: nameToPids[name] = {pid} else: @@ -57,7 +60,7 @@ with open(vnamesFile, newline="") as csvfile: preferred = row[6] == "preferred" # Add to maps updateMaps(name1, pid, True, False) - if lang == "eng": + if lang == "eng" and name2 != "": updateMaps(name2, pid, False, preferred) # Open db connection dbCon = sqlite3.connect(dbFile) @@ -72,30 +75,34 @@ dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)") usedPids = set() unresolvedNodeNames = set() dbCur2 = dbCon.cursor() +def addToDb(nodeName, pidToUse): + altNames = set() + preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None + dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName)) + for n in pidToNames[pidToUse]: + if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None: + altNames.add(n) + for n in altNames: + isPreferred = 1 if (n == preferredName) else 0 + dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (nodeName, n, isPreferred)) iterationNum = 0 -for row in dbCur2.execute("SELECT name FROM nodes"): - name = row[0] +for (name,) in dbCur2.execute("SELECT name FROM nodes"): iterationNum += 1 if iterationNum % 10000 == 0: print(f"Loop 1 iteration {iterationNum}") # If name matches a canonical-name, add alt-name entries to 'names' table if name in canonicalNameToPids: - pidToUse = 0 + pidToUse = None for pid in canonicalNameToPids[name]: - if pid not in usedPids: + hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred + hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred + if hasLowerPrio: + continue + if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio): pidToUse = pid - break - if pidToUse > 0: + if pidToUse != None: usedPids.add(pidToUse) - altNames = set() - preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None - dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name)) - for n in pidToNames[pidToUse]: - if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None: - altNames.add(n) - for n in altNames: - isPreferred = 1 if (n == preferredName) else 0 - dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred)) + addToDb(name, pidToUse) elif name in nameToPids: unresolvedNodeNames.add(name) # Iterate through unresolved nodes, resolving to vernacular-names @@ -105,22 +112,13 @@ for name in unresolvedNodeNames: if iterationNum % 100 == 0: print(f"Loop 2 iteration {iterationNum}") # Add alt-name entries to 'names' table for first corresponding pid - pidToUse = 0 + pidToUse = None for pid in nameToPids[name]: - if pid not in usedPids: + if pid not in usedPids and (pidToUse == None or pid < pidToUse): pidToUse = pid - break - if pidToUse > 0: + if pidToUse != None: usedPids.add(pidToUse) - altNames = set() - preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None - dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name)) - for n in pidToNames[pidToUse]: - if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None: - altNames.add(n) - for n in altNames: - isPreferred = 1 if (n == preferredName) else 0 - dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred)) + addToDb(name, pidToUse) # Close db dbCon.commit() dbCon.close() |
