diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-18 20:03:24 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-18 21:06:53 +1000 |
| commit | eaaa97c186a2f8e2ba0768bd208120c0054ec0d3 (patch) | |
| tree | 083850cb6a7d62293a144f037dd5f8f699ed3716 /backend | |
| parent | 992853eeac9ace9f6eb4332516a8ba6ff7545deb (diff) | |
For dbpedia data, prioritise manual conflict resolutions
Diffstat (limited to 'backend')
| -rw-r--r-- | backend/data/dbpPickedLabels.txt | 1 |
| -rwxr-xr-x | backend/data/genDbpData.py | 42 |
2 files changed, 22 insertions, 21 deletions
```diff
diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt
index d8f939e..1fa1e71 100644
--- a/backend/data/dbpPickedLabels.txt
+++ b/backend/data/dbpPickedLabels.txt
@@ -609,6 +609,7 @@ Tawera
 Telphusa
 Termitomyces schimperi
 Theba
+Theria
 Thestor
 Thomomys
 Thria
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 1dce001..0ba1ef1 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -67,6 +67,27 @@ for name in nodeToLabel:
         del nameToVariants[name]
 nodeToLabel["cellular organisms"] = "organism" # Special case for root node
 print("Number of conflicts: {}".format(len(nameToVariants)))
+# Try conflict resolution via picked-labels
+print("Resolving conflicts using picked-labels")
+with open(pickedLabelsFile) as file:
+    for line in file:
+        pickedLabel = line.rstrip()
+        name = pickedLabel.lower()
+        if name in nameToVariants:
+            nodeToLabel[name] = pickedLabel
+            del nameToVariants[name]
+        else:
+            match = nameVariantRegex.match(pickedLabel)
+            if match == None:
+                print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
+            else:
+                name = match.group(1)
+                if name not in nameToVariants:
+                    print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
+                else:
+                    nodeToLabel[name] = pickedLabel
+                    del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
 # Try conflict resolution via category-list
 # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
 print("Resolving conflicts using category-list")
@@ -164,27 +185,6 @@ for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN t
         nodeToLabel[name] = label
         del nameToVariants[name]
 print("Number of conflicts: {}".format(len(nameToVariants)))
-# Try conflict resolution via picked-labels
-print("Resolving conflicts using picked-labels")
-with open(pickedLabelsFile) as file:
-    for line in file:
-        pickedLabel = line.rstrip()
-        name = pickedLabel.lower()
-        if name in nameToVariants:
-            nodeToLabel[name] = pickedLabel
-            del nameToVariants[name]
-        else:
-            match = nameVariantRegex.match(pickedLabel)
-            if match == None:
-                print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
-            else:
-                name = match.group(1)
-                if name not in nameToVariants:
-                    print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
-                else:
-                    nodeToLabel[name] = pickedLabel
-                    del nameToVariants[name]
-print("Number of conflicts: {}".format(len(nameToVariants)))
 # Associate nodes with IRIs
 print("Getting nodes IRIs")
 nodeToIri = {}
```
