diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-17 21:31:08 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-17 21:31:08 +1000 |
| commit | 71ba62c24e99935c0ba81dd6a07eef5746a86575 (patch) | |
| tree | 47bb6d38d6c3b69d641150d0fa66c79d5c19d3fc | |
| parent | 202a3d211ce5c315b488278f4bbc5c1e3c0e66f6 (diff) | |
Add code+file for excluding bad-alt-names
| -rw-r--r-- | .gitignore | 1 | ||||
| -rwxr-xr-x | backend/data/genEolNameData.py | 21 |
2 files changed, 20 insertions, 2 deletions
```diff
@@ -23,6 +23,7 @@
 /backend/data/genOtolNamesToKeep.txt
 /backend/data/genOtolDataPickedDups.txt
 /backend/data/genEolNameDataPickedIds.txt
+/backend/data/genEolNameDataBadAlts.txt
 /backend/data/genDescNamesToSkip.txt
 /backend/data/dbpPickedLabels.txt
 /backend/data/genEnwikiDescTitlesToUse.txt
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index 79e6f4f..cd3247d 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -19,6 +19,7 @@ vnamesFile = "eol/vernacularNames.csv"
 dbFile = "data.db"
 NAMES_TO_SKIP = {"unknown", "unknown species", "unidentified species"}
 pickedIdsFile = "genEolNameDataPickedIds.txt"
+badAltsFile = "genEolNameDataBadAlts.txt"
 
 # Read in vernacular-names data
 # Note: Canonical-names may have multiple pids
@@ -72,6 +73,18 @@ if os.path.exists(pickedIdsFile):
 			(name, _, eolId) = line.rstrip().partition("|")
 			nameToPickedPid[name] = None if eolId == "" else int(eolId)
 print(f"Found {len(nameToPickedPid)}")
+# Read in node-alt_names to avoid
+print("Checking for bad-alt-names")
+nameToBadAlts = {}
+if os.path.exists(badAltsFile):
+	with open(badAltsFile) as file:
+		for line in file:
+			(name, _, altName) = line.rstrip().partition("|")
+			if name not in nameToBadAlts:
+				nameToBadAlts[name] = [altName]
+			else:
+				nameToBadAlts[name].append(altName)
+print(f"Found bad-alts for {len(nameToBadAlts)} nodes")
 # Open db connection
 dbCon = sqlite3.connect(dbFile)
 dbCur = dbCon.cursor()
@@ -91,8 +104,12 @@ def addToDb(nodeName, pidToUse):
 	preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
 	dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
 	for n in pidToNames[pidToUse]:
-		if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() == None:
-			altNames.add(n)
+		if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None:
+			continue
+		if nodeName in nameToBadAlts and n in nameToBadAlts[nodeName]:
+			print(f"Excluding bad-alt {n} for node {nodeName}")
+			continue
+		altNames.add(n)
 	for n in altNames:
 		isPreferred = 1 if (n == preferredName) else 0
 		dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))
```
