diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-07 23:06:26 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-07 23:13:01 +1000 |
| commit | 27361479b3615a0f1156be3a97579df7f128d993 (patch) | |
| tree | 20b8aa14ac7ba3b409a3c983fd4b04e60e7d2660 /backend/data/genDbpData.py | |
| parent | 1879c7920607feb1df1102a09d2d4f915d9544a5 (diff) | |
Add more manual-correction for dbp-desc generation
Diffstat (limited to 'backend/data/genDbpData.py')
| -rwxr-xr-x | backend/data/genDbpData.py | 244 |
1 files changed, 128 insertions, 116 deletions
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py index 0655344..887e8a8 100755 --- a/backend/data/genDbpData.py +++ b/backend/data/genDbpData.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, re +import sys, os, re import sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" @@ -13,6 +13,7 @@ if len(sys.argv) > 1: sys.exit(1) dbpediaDb = "dbpedia/dbpData.db" +namesToSkipFile = "dbpNamesToSkip.txt" pickedLabelsFile = "dbpPickedLabels.txt" dbFile = "data.db" @@ -24,8 +25,16 @@ dbCur = dbCon.cursor() # Get node names print("Reading node names") nodeNames = set() -for row in dbCur.execute("SELECT name from nodes"): - nodeNames.add(row[0]) +for (name,) in dbCur.execute("SELECT name from nodes"): + nodeNames.add(name) +# Skipping certain names +print("Checking for names to skip") +oldSz = len(nodeNames) +if os.path.exists(namesToSkipFile): + with open(namesToSkipFile) as file: + for line in file: + nodeNames.remove(line.rstrip()) +print(f"Skipping {oldSz - len(nodeNames)} nodes") # Get disambiguation page labels print("Reading disambiguation-page labels") disambigLabels = set() @@ -57,9 +66,9 @@ for (label,) in dbpCur.execute("SELECT label from labels"): subName = match.group(1) if subName in nodeNames and match.group(2) != "disambiguation": if subName not in nameToVariants: - nameToVariants[subName] = [name] # Intentionally ignoring case here + nameToVariants[subName] = [label] elif name not in nameToVariants[subName]: - nameToVariants[subName].append(name) + nameToVariants[subName].append(label) for (name, variants) in nameToVariants.items(): if len(variants) == 1: nodeToLabel[name] = variants[0] @@ -67,126 +76,128 @@ for name in nodeToLabel: del nameToVariants[name] nodeToLabel["cellular organisms"] = "organism" # Special case for root node print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via picked-labels -print("Resolving conflicts using picked-labels") -with open(pickedLabelsFile) as file: - for line in file: - pickedLabel = line.rstrip() - name = pickedLabel.lower() - if name in nameToVariants: - nodeToLabel[name] = pickedLabel - del nameToVariants[name] - else: - match = nameVariantRegex.match(pickedLabel) - if match == None: - print(f"WARNING: Picked label {pickedLabel} not found (1)", file=sys.stderr) +# Try resolving conflicts +def resolveWithPickedLabels(): + # Attempts conflict resolution using a file with lines of the form 'name1|label1', + # where label1 may be absent, indicating that no label should be associated with the name + print("Resolving conflicts using picked-labels") + with open(pickedLabelsFile) as file: + for line in file: + (name, _, label) = line.rstrip().partition("|") + if name not in nameToVariants: + print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr) + continue + if label == "": + del nameToVariants[name] else: - name = match.group(1) - if name not in nameToVariants: - print(f"WARNING: Picked label {pickedLabel} not found (2)", file=sys.stderr) - else: - nodeToLabel[name] = pickedLabel - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via category-list + if label not in nameToVariants[name]: + print(f"WARNING: Picked label \"{label}\" for name \"{name}\" not found", file=sys.stderr) + continue + nodeToLabel[name] = label + del nameToVariants[name] + print(f"Remaining number of conflicts: {len(nameToVariants)}") +def resolveWithCategoryList(): + # Attempts conflict resolution using category-text in labels of the form 'name1 (category1)' # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape) -print("Resolving conflicts using category-list") -generalCategories = { - "species", "genus", - "plant", "fungus", "animal", - "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", - "fish", "amphibian", "reptile", "bird", "mammal", -} -specificCategories = { - "protist", "alveolate", "dinoflagellates", - "orchid", "Poaceae", "fern", "moss", "alga", - "bryozoan", "hydrozoan", - "sponge", "cnidarian", "coral", "polychaete", "echinoderm", - "bivalve", "gastropod", "chiton", - "shrimp", "decapod", "crab", "barnacle", "copepod", - "arachnid", "spider", "harvestman", "mite", - "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", - "beetle", "fly", "butterfly", "moth", "wasp", - "catfish", - "frog", - "lizard", - "horse", "sheep", "cattle", "mouse", -} -namesToRemove = set() -for (name, variants) in nameToVariants.items(): - found = False - for label in variants: - match = nameVariantRegex.match(label) - if match != None and match.group(2) in generalCategories: - nodeToLabel[name] = label - namesToRemove.add(name) - found = True - break - if not found: + print("Resolving conflicts using category-list") + generalCategories = { + "species", "genus", + "plant", "fungus", "animal", + "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", + "fish", "amphibian", "reptile", "bird", "mammal", + } + specificCategories = { + "protist", "alveolate", "dinoflagellates", + "orchid", "Poaceae", "fern", "moss", "alga", + "bryozoan", "hydrozoan", + "sponge", "cnidarian", "coral", "polychaete", "echinoderm", + "bivalve", "gastropod", "chiton", + "shrimp", "decapod", "crab", "barnacle", "copepod", + "arachnid", "spider", "harvestman", "mite", + "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", + "beetle", "fly", "butterfly", "moth", "wasp", + "catfish", + "frog", + "lizard", + "horse", "sheep", "cattle", "mouse", + } + namesToRemove = set() + for (name, variants) in nameToVariants.items(): + found = False for label in variants: match = nameVariantRegex.match(label) - if match != None and match.group(2) in specificCategories: + if match != None and match.group(2) in generalCategories: nodeToLabel[name] = label namesToRemove.add(name) + found = True break -for name in namesToRemove: - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via taxon-type information -print("Resolving conflicts using instance-type data") -taxonTypes = { # Obtained from the DBpedia ontology - "http://dbpedia.org/ontology/Species", - "http://dbpedia.org/ontology/Archaea", - "http://dbpedia.org/ontology/Bacteria", - "http://dbpedia.org/ontology/Eukaryote", - "http://dbpedia.org/ontology/Plant", - "http://dbpedia.org/ontology/ClubMoss", - "http://dbpedia.org/ontology/Conifer", - "http://dbpedia.org/ontology/CultivatedVariety", - "http://dbpedia.org/ontology/Cycad", - "http://dbpedia.org/ontology/Fern", - "http://dbpedia.org/ontology/FloweringPlant", - "http://dbpedia.org/ontology/Grape", - "http://dbpedia.org/ontology/Ginkgo", - "http://dbpedia.org/ontology/Gnetophytes", - "http://dbpedia.org/ontology/GreenAlga", - "http://dbpedia.org/ontology/Moss", - "http://dbpedia.org/ontology/Fungus", - "http://dbpedia.org/ontology/Animal", - "http://dbpedia.org/ontology/Fish", - "http://dbpedia.org/ontology/Crustacean", - "http://dbpedia.org/ontology/Mollusca", - "http://dbpedia.org/ontology/Insect", - "http://dbpedia.org/ontology/Arachnid", - "http://dbpedia.org/ontology/Amphibian", - "http://dbpedia.org/ontology/Reptile", - "http://dbpedia.org/ontology/Bird", - "http://dbpedia.org/ontology/Mammal", - "http://dbpedia.org/ontology/Cat", - "http://dbpedia.org/ontology/Dog", - "http://dbpedia.org/ontology/Horse", -} -iterNum = 0 -for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") - # - if type in taxonTypes: - name = label.lower() - if name in nameToVariants: - nodeToLabel[name] = label - del nameToVariants[name] - else: - match = nameVariantRegex.fullmatch(name) - if match != None: - name = match.group(1) - if name in nameToVariants: + if not found: + for label in variants: + match = nameVariantRegex.match(label) + if match != None and match.group(2) in specificCategories: nodeToLabel[name] = label - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") + namesToRemove.add(name) + break + for name in namesToRemove: + del nameToVariants[name] + print(f"Remaining number of conflicts: {len(nameToVariants)}") +def resolveWithTypeData(): + # Attempts conflict-resolution using dbpedia's instance-type data + print("Resolving conflicts using instance-type data") + taxonTypes = { # Obtained from the DBpedia ontology + "http://dbpedia.org/ontology/Species", + "http://dbpedia.org/ontology/Archaea", + "http://dbpedia.org/ontology/Bacteria", + "http://dbpedia.org/ontology/Eukaryote", + "http://dbpedia.org/ontology/Plant", + "http://dbpedia.org/ontology/ClubMoss", + "http://dbpedia.org/ontology/Conifer", + "http://dbpedia.org/ontology/CultivatedVariety", + "http://dbpedia.org/ontology/Cycad", + "http://dbpedia.org/ontology/Fern", + "http://dbpedia.org/ontology/FloweringPlant", + "http://dbpedia.org/ontology/Grape", + "http://dbpedia.org/ontology/Ginkgo", + "http://dbpedia.org/ontology/Gnetophytes", + "http://dbpedia.org/ontology/GreenAlga", + "http://dbpedia.org/ontology/Moss", + "http://dbpedia.org/ontology/Fungus", + "http://dbpedia.org/ontology/Animal", + "http://dbpedia.org/ontology/Fish", + "http://dbpedia.org/ontology/Crustacean", + "http://dbpedia.org/ontology/Mollusca", + "http://dbpedia.org/ontology/Insect", + "http://dbpedia.org/ontology/Arachnid", + "http://dbpedia.org/ontology/Amphibian", + "http://dbpedia.org/ontology/Reptile", + "http://dbpedia.org/ontology/Bird", + "http://dbpedia.org/ontology/Mammal", + "http://dbpedia.org/ontology/Cat", + "http://dbpedia.org/ontology/Dog", + "http://dbpedia.org/ontology/Horse", + } + iterNum = 0 + for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"Processing line {iterNum}") + # + if type in taxonTypes: + name = label.lower() + if name in nameToVariants: + nodeToLabel[name] = label + del nameToVariants[name] + else: + match = nameVariantRegex.fullmatch(name) + if match != None: + name = match.group(1) + if name in nameToVariants: + nodeToLabel[name] = label + del nameToVariants[name] + print(f"Remaining number of conflicts: {len(nameToVariants)}") +resolveWithPickedLabels() # Associate nodes with IRIs -print("Getting nodes IRIs") +print("Getting node IRIs") nodeToIri = {} iterNum = 0 for (name, label) in nodeToLabel.items(): @@ -212,6 +223,7 @@ for (name, iri) in nodeToIri.items(): # Find descriptions, and add to db print("Adding node description data") dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)") +dbCur.execute("CREATE INDEX descs_id_idx ON descs(wiki_id)") # wiki_id intentionally left non-unique iterNum = 0 for (name, iri) in nodeToIri.items(): iterNum += 1 |
