From eaaa97c186a2f8e2ba0768bd208120c0054ec0d3 Mon Sep 17 00:00:00 2001
From: Terry Truong
Date: Wed, 18 May 2022 20:03:24 +1000
Subject: For dbpedia data, prioritise manual conflict resolutions

---
 backend/data/genDbpData.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'backend/data/genDbpData.py')

diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 1dce001..0ba1ef1 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -67,6 +67,27 @@ for name in nodeToLabel:
     del nameToVariants[name]
 nodeToLabel["cellular organisms"] = "organism" # Special case for root node
 print("Number of conflicts: {}".format(len(nameToVariants)))
+# Try conflict resolution via picked-labels
+print("Resolving conflicts using picked-labels")
+with open(pickedLabelsFile) as file:
+    for line in file:
+        pickedLabel = line.rstrip()
+        name = pickedLabel.lower()
+        if name in nameToVariants:
+            nodeToLabel[name] = pickedLabel
+            del nameToVariants[name]
+        else:
+            match = nameVariantRegex.match(pickedLabel)
+            if match == None:
+                print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
+            else:
+                name = match.group(1)
+                if name not in nameToVariants:
+                    print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
+                else:
+                    nodeToLabel[name] = pickedLabel
+                    del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
 # Try conflict resolution via category-list
 # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
 print("Resolving conflicts using category-list")
@@ -164,27 +185,6 @@ for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN t
         nodeToLabel[name] = label
         del nameToVariants[name]
 print("Number of conflicts: {}".format(len(nameToVariants)))
-# Try conflict resolution via picked-labels
-print("Resolving conflicts using picked-labels")
-with open(pickedLabelsFile) as file:
-    for line in file:
-        pickedLabel = line.rstrip()
-        name = pickedLabel.lower()
-        if name in nameToVariants:
-            nodeToLabel[name] = pickedLabel
-            del nameToVariants[name]
-        else:
-            match = nameVariantRegex.match(pickedLabel)
-            if match == None:
-                print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
-            else:
-                name = match.group(1)
-                if name not in nameToVariants:
-                    print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
-                else:
-                    nodeToLabel[name] = pickedLabel
-                    del nameToVariants[name]
-print("Number of conflicts: {}".format(len(nameToVariants)))
 # Associate nodes with IRIs
 print("Getting nodes IRIs")
 nodeToIri = {}
-- 
cgit v1.2.3