diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-12 01:58:41 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-12 01:58:41 +1000 |
| commit | 9c5abe160cf11c9d28794e454d41cba5cb6cd637 (patch) | |
| tree | 677ad1957fbec59b486b30243844cf5cc44b4162 | |
| parent | f5b8a9952ed2f82f1099f69715696237d1e1e8a4 (diff) | |
Fix desc-association error due to case-insensitive label-matching
| -rwxr-xr-x | backend/tolData/genDbpData.py | 12 |
1 file changed, 6 insertions, 6 deletions
```diff
diff --git a/backend/tolData/genDbpData.py b/backend/tolData/genDbpData.py
index 606ffcb..68b135e 100755
--- a/backend/tolData/genDbpData.py
+++ b/backend/tolData/genDbpData.py
@@ -19,7 +19,7 @@ namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
 pickedLabelsFile = "pickedDbpLabels.txt"
 dbFile = "data.db"
 rootNodeName = "cellular organisms"
-rootLabel = "organism" # Will be associated with root node
+rootLabel = "Organism" # Will be associated with root node
 # Got about 400k descriptions when testing
 
 print("Opening databases")
@@ -85,7 +85,7 @@ nodeToLabel[rootNodeName] = rootLabel
 if rootNodeName in nameToVariants:
 	del nameToVariants["cellular organisms"]
 
-print("Trying to resolve {len(nameToVariants)} conflicts")
+print(f"Trying to resolve {len(nameToVariants)} conflicts")
 def resolveWithPickedLabels():
 	" Attempts to resolve conflicts using a picked-names file "
 	with open(pickedLabelsFile) as file:
@@ -134,7 +134,7 @@ def resolveWithCategoryList():
 		found = False
 		for label in variants:
 			match = nameVariantRegex.match(label)
-			if match != None and match.group(2) in generalCategories:
+			if match != None and match.group(2).lower() in generalCategories:
 				nodeToLabel[name] = label
 				namesToRemove.add(name)
 				found = True
@@ -142,7 +142,7 @@ def resolveWithCategoryList():
 		if not found:
 			for label in variants:
 				match = nameVariantRegex.match(label)
-				if match != None and match.group(2) in specificCategories:
+				if match != None and match.group(2).lower() in specificCategories:
 					nodeToLabel[name] = label
 					namesToRemove.add(name)
 					break
@@ -196,7 +196,7 @@ def resolveWithTypeData():
 		else:
 			match = nameVariantRegex.fullmatch(name)
 			if match != None:
-				name = match.group(1)
+				name = match.group(1).lower()
 				if name in nameToVariants:
 					nodeToLabel[name] = label
 					del nameToVariants[name]
@@ -208,7 +208,7 @@ print(f"Remaining number of conflicts: {len(nameToVariants)}")
 print("Getting node IRIs")
 nodeToIri = {}
 for (name, label) in nodeToLabel.items():
-	(iri,) = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone()
+	(iri,) = dbpCur.execute("SELECT iri FROM labels where label = ?", (label,)).fetchone()
 	nodeToIri[name] = iri
 
 print("Resolving redirects")
```
