diff options
Diffstat (limited to 'backend/tolData/genDbpData.py')
| -rwxr-xr-x | backend/tolData/genDbpData.py | 247 |
1 files changed, 247 insertions, 0 deletions
#!/usr/bin/python3

# Matches node names from the 'nodes' table of the node database against
# labels in the DBpedia database, resolves names that match several labels,
# follows redirects, and then stores each matched node's wiki ID and abstract
# in newly created 'wiki_ids' and 'descs' tables of the node database.

import os
import re
import sqlite3
import sys

usageInfo = f"""
Usage: {sys.argv[0]}

Reads a database containing data from DBpedia, and tries to associate
DBpedia IRIs with nodes in a database, adding short-descriptions for them.
"""
if len(sys.argv) > 1:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

dbpediaDb = "dbpedia/descData.db"
namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
pickedLabelsFile = "pickedDbpLabels.txt"
dbFile = "data.db"
rootNodeName = "cellular organisms"
rootLabel = "organism" # Will be associated with root node
# Got about 400k descriptions when testing

print("Opening databases")
dbpCon = sqlite3.connect(dbpediaDb)
dbpCur = dbpCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()

print("Getting node names")
nodeNames = {name for (name,) in dbCur.execute("SELECT name from nodes")}

print("Checking for names to skip")
oldSz = len(nodeNames)
if os.path.exists(namesToSkipFile):
    with open(namesToSkipFile) as file:
        for line in file:
            # discard() instead of remove(): a name that is unknown, repeated,
            # or a blank line must not abort the run with a KeyError
            nodeNames.discard(line.rstrip())
print(f"Skipping {oldSz - len(nodeNames)} nodes")

print("Reading disambiguation-page labels")
# Select the label column (not the IRI): the set is compared against labels below
query = "SELECT labels.label from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
disambigLabels = {label for (label,) in dbpCur.execute(query)}

print("Trying to associate nodes with DBpedia labels")
nodeToLabel = {} # Maps node names to their chosen label
nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)'
nameToVariants = {} # Maps node names to lists of matching labels
def addVariant(name, label):
    " Records a label as a candidate for a node name, ignoring repeated labels "
    variants = nameToVariants.setdefault(name, [])
    if label not in variants:
        variants.append(label)
for (iterNum, (label,)) in enumerate(dbpCur.execute("SELECT label from labels"), 1):
    if iterNum % 100000 == 0:
        print(f"At iteration {iterNum}")
    #
    if label in disambigLabels:
        continue
    name = label.lower()
    if name in nodeNames:
        addVariant(name, label)
        continue
    # Otherwise, check for a label of the form 'name (qualifier)'
    match = nameVariantRegex.fullmatch(name)
    if match is None:
        continue
    subName = match.group(1)
    if subName in nodeNames and match.group(2) != "disambiguation":
        addVariant(subName, label)
# Associate labels without conflicts
for (name, variants) in nameToVariants.items():
    if len(variants) == 1:
        nodeToLabel[name] = variants[0]
for name in nodeToLabel:
    del nameToVariants[name]
# Special case for root node
nodeToLabel[rootNodeName] = rootLabel
nameToVariants.pop(rootNodeName, None)

print(f"Trying to resolve {len(nameToVariants)} conflicts")
def resolveWithPickedLabels():
    """
    Attempts to resolve conflicts using a picked-names file.
    Each line has the form 'name|label'. An empty label drops the conflict
    without associating any label with the node.
    Does nothing if the file is absent.
    """
    if not os.path.exists(pickedLabelsFile):
        print(f"INFO: No picked-labels file \"{pickedLabelsFile}\" found", file=sys.stderr)
        return
    with open(pickedLabelsFile) as file:
        for line in file:
            line = line.rstrip()
            if line == "":
                continue
            (name, _, label) = line.partition("|")
            if name not in nameToVariants:
                print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
                continue
            if label != "":
                if label not in nameToVariants[name]:
                    print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr)
                nodeToLabel[name] = label
            del nameToVariants[name]
def resolveWithCategoryList():
    """
    Attempts to resolve conflicts by looking for labels like 'name1 (category1)',
    and choosing those with a category1 that seems 'biological'.
    Does two passes, using more generic categories first. This helps avoid stuff like
    Pan being classified as a horse instead of an ape.
    """
    generalCategories = {
        "species", "genus",
        "plant", "fungus", "animal",
        "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
        "fish", "amphibian", "reptile", "bird", "mammal",
    }
    specificCategories = {
        "protist", "alveolate", "dinoflagellates",
        "orchid", "poaceae", "fern", "moss", "alga",
        "bryozoan", "hydrozoan",
        "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
        "bivalve", "gastropod", "chiton",
        "shrimp", "decapod", "crab", "barnacle", "copepod",
        "arachnid", "spider", "harvestman", "mite",
        "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
        "beetle", "fly", "butterfly", "moth", "wasp",
        "catfish",
        "frog",
        "lizard",
        "horse", "sheep", "cattle", "mouse",
    }
    def getCategory(label):
        " Returns the lower-cased parenthesised suffix of a label, or None "
        match = nameVariantRegex.fullmatch(label)
        return None if match is None else match.group(2).lower()
    # Collect the choices first, as nameToVariants can't shrink while being iterated over
    resolved = {}
    for (name, variants) in nameToVariants.items():
        for categories in (generalCategories, specificCategories):
            picked = next((label for label in variants if getCategory(label) in categories), None)
            if picked is not None:
                resolved[name] = picked
                break
    for (name, label) in resolved.items():
        nodeToLabel[name] = label
        del nameToVariants[name]
def resolveWithTypeData():
    " Attempts to resolve conflicts using DBpedia's type data "
    taxonTypes = { # Obtained from the DBpedia ontology
        "http://dbpedia.org/ontology/Species",
        "http://dbpedia.org/ontology/Archaea",
        "http://dbpedia.org/ontology/Bacteria",
        "http://dbpedia.org/ontology/Eukaryote",
        "http://dbpedia.org/ontology/Plant",
        "http://dbpedia.org/ontology/ClubMoss",
        "http://dbpedia.org/ontology/Conifer",
        "http://dbpedia.org/ontology/CultivatedVariety",
        "http://dbpedia.org/ontology/Cycad",
        "http://dbpedia.org/ontology/Fern",
        "http://dbpedia.org/ontology/FloweringPlant",
        "http://dbpedia.org/ontology/Grape",
        "http://dbpedia.org/ontology/Ginkgo",
        "http://dbpedia.org/ontology/Gnetophytes",
        "http://dbpedia.org/ontology/GreenAlga",
        "http://dbpedia.org/ontology/Moss",
        "http://dbpedia.org/ontology/Fungus",
        "http://dbpedia.org/ontology/Animal",
        "http://dbpedia.org/ontology/Fish",
        "http://dbpedia.org/ontology/Crustacean",
        "http://dbpedia.org/ontology/Mollusca",
        "http://dbpedia.org/ontology/Insect",
        "http://dbpedia.org/ontology/Arachnid",
        "http://dbpedia.org/ontology/Amphibian",
        "http://dbpedia.org/ontology/Reptile",
        "http://dbpedia.org/ontology/Bird",
        "http://dbpedia.org/ontology/Mammal",
        "http://dbpedia.org/ontology/Cat",
        "http://dbpedia.org/ontology/Dog",
        "http://dbpedia.org/ontology/Horse",
    }
    typeQuery = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
    for (iterNum, (label, typeIri)) in enumerate(dbpCur.execute(typeQuery), 1):
        if iterNum % 100000 == 0:
            print(f"At iteration {iterNum}")
        #
        if typeIri not in taxonTypes:
            continue
        name = label.lower()
        if name not in nameToVariants:
            # Fall back to the part before a parenthesised suffix
            match = nameVariantRegex.fullmatch(name)
            if match is None:
                continue
            name = match.group(1)
            if name not in nameToVariants:
                continue
        nodeToLabel[name] = label
        del nameToVariants[name]
# Only the picked-labels strategy is enabled; the other two are kept as alternatives
#resolveWithTypeData()
#resolveWithCategoryList()
resolveWithPickedLabels()
print(f"Remaining number of conflicts: {len(nameToVariants)}")

print("Getting node IRIs")
nodeToIri = {}
for (name, label) in nodeToLabel.items():
    row = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone()
    if row is None:
        # Possible for the root label, or a picked label from outside the choice set
        print(f"WARNING: No IRI found for label \"{label}\" of node \"{name}\"", file=sys.stderr)
        continue
    nodeToIri[name] = row[0]

print("Resolving redirects")
redirectingIriSet = set() # Holds names of nodes whose IRI was replaced by a redirect target
for (iterNum, (name, iri)) in enumerate(nodeToIri.items(), 1):
    if iterNum % 10000 == 0:
        print(f"At iteration {iterNum}")
    #
    row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
    if row is not None:
        nodeToIri[name] = row[0] # Replaces the value of an existing key, which is safe while iterating
        redirectingIriSet.add(name)

print("Adding description tables")
dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)")
dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)")
dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)")
query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
for (iterNum, (name, iri)) in enumerate(nodeToIri.items(), 1):
    if iterNum % 10000 == 0:
        print(f"At iteration {iterNum}")
    #
    row = dbpCur.execute(query, (iri,)).fetchone()
    if row is not None:
        (desc, wikiId) = row
        dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0))
        # Several nodes may share a wiki ID, so only the first description is kept
        dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))

print("Closing databases")
dbCon.commit()
dbCon.close()
dbpCon.commit()
dbpCon.close()
