aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-06-17 21:31:08 +1000
committer Terry Truong <terry06890@gmail.com> 2022-06-17 21:31:08 +1000
commit 71ba62c24e99935c0ba81dd6a07eef5746a86575 (patch)
tree 47bb6d38d6c3b69d641150d0fa66c79d5c19d3fc /backend/data
parent 202a3d211ce5c315b488278f4bbc5c1e3c0e66f6 (diff)
Add code+file for excluding bad-alt-names
Diffstat (limited to 'backend/data')
-rwxr-xr-x backend/data/genEolNameData.py 21
1 files changed, 19 insertions, 2 deletions
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index 79e6f4f..cd3247d 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -19,6 +19,7 @@ vnamesFile = "eol/vernacularNames.csv"
dbFile = "data.db"
NAMES_TO_SKIP = {"unknown", "unknown species", "unidentified species"}
pickedIdsFile = "genEolNameDataPickedIds.txt"
+badAltsFile = "genEolNameDataBadAlts.txt"
# Read in vernacular-names data
# Note: Canonical-names may have multiple pids
@@ -72,6 +73,18 @@ if os.path.exists(pickedIdsFile):
(name, _, eolId) = line.rstrip().partition("|")
nameToPickedPid[name] = None if eolId == "" else int(eolId)
print(f"Found {len(nameToPickedPid)}")
+# Read in node-alt_names to avoid
+print("Checking for bad-alt-names")
+nameToBadAlts = {}
+if os.path.exists(badAltsFile):
+ with open(badAltsFile) as file:
+ for line in file:
+ (name, _, altName) = line.rstrip().partition("|")
+ if name not in nameToBadAlts:
+ nameToBadAlts[name] = [altName]
+ else:
+ nameToBadAlts[name].append(altName)
+print(f"Found bad-alts for {len(nameToBadAlts)} nodes")
# Open db connection
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -91,8 +104,12 @@ def addToDb(nodeName, pidToUse):
preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
for n in pidToNames[pidToUse]:
- if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None:
- altNames.add(n)
+ if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None:
+ continue
+ if nodeName in nameToBadAlts and n in nameToBadAlts[nodeName]:
+ print(f"Excluding bad-alt {n} for node {nodeName}")
+ continue
+ altNames.add(n)
for n in altNames:
isPreferred = 1 if (n == preferredName) else 0
dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))