about | summary | refs | log | tree | commit | diff
path: root/backend
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com>, 2022-05-18 20:03:24 +1000
committer: Terry Truong <terry06890@gmail.com>, 2022-05-18 21:06:53 +1000
commit: eaaa97c186a2f8e2ba0768bd208120c0054ec0d3 (patch)
tree: 083850cb6a7d62293a144f037dd5f8f699ed3716 /backend
parent: 992853eeac9ace9f6eb4332516a8ba6ff7545deb (diff)
For dbpedia data, prioritise manual conflict resolutions
Diffstat (limited to 'backend')
-rw-r--r-- backend/data/dbpPickedLabels.txt | 1
-rwxr-xr-x backend/data/genDbpData.py | 42
2 files changed, 22 insertions, 21 deletions
diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt
index d8f939e..1fa1e71 100644
--- a/backend/data/dbpPickedLabels.txt
+++ b/backend/data/dbpPickedLabels.txt
@@ -609,6 +609,7 @@ Tawera
Telphusa
Termitomyces schimperi
Theba
+Theria
Thestor
Thomomys
Thria
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 1dce001..0ba1ef1 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -67,6 +67,27 @@ for name in nodeToLabel:
del nameToVariants[name]
nodeToLabel["cellular organisms"] = "organism" # Special case for root node
print("Number of conflicts: {}".format(len(nameToVariants)))
+# Try conflict resolution via picked-labels
+print("Resolving conflicts using picked-labels")
+with open(pickedLabelsFile) as file:
+ for line in file:
+ pickedLabel = line.rstrip()
+ name = pickedLabel.lower()
+ if name in nameToVariants:
+ nodeToLabel[name] = pickedLabel
+ del nameToVariants[name]
+ else:
+ match = nameVariantRegex.match(pickedLabel)
+ if match == None:
+ print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
+ else:
+ name = match.group(1)
+ if name not in nameToVariants:
+ print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
+ else:
+ nodeToLabel[name] = pickedLabel
+ del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
# Try conflict resolution via category-list
# Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
print("Resolving conflicts using category-list")
@@ -164,27 +185,6 @@ for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN t
nodeToLabel[name] = label
del nameToVariants[name]
print("Number of conflicts: {}".format(len(nameToVariants)))
-# Try conflict resolution via picked-labels
-print("Resolving conflicts using picked-labels")
-with open(pickedLabelsFile) as file:
- for line in file:
- pickedLabel = line.rstrip()
- name = pickedLabel.lower()
- if name in nameToVariants:
- nodeToLabel[name] = pickedLabel
- del nameToVariants[name]
- else:
- match = nameVariantRegex.match(pickedLabel)
- if match == None:
- print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
- else:
- name = match.group(1)
- if name not in nameToVariants:
- print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
- else:
- nodeToLabel[name] = pickedLabel
- del nameToVariants[name]
-print("Number of conflicts: {}".format(len(nameToVariants)))
# Associate nodes with IRIs
print("Getting nodes IRIs")
nodeToIri = {}