#!/usr/bin/python3

"""
Associates tree-of-life nodes with DBpedia labels, and stores short
descriptions for the matched nodes.

Node names are read from the 'nodes' table of the tree-of-life database.
DBpedia labels are matched against them case-insensitively, either directly
or via labels of the form 'Name (category)'. Names with exactly one matching
label are resolved immediately. The rest are 'conflicts', which the
resolveWith*() functions try to settle. The chosen labels are then mapped to
IRIs (following redirects), and the 'wiki_ids' and 'descs' tables are written.
"""

import os
import re
import sqlite3
import sys

usageInfo = f"""
Usage: {sys.argv[0]}

Reads a database containing data from DBpedia, and tries to associate
DBpedia IRIs with nodes in the tree-of-life database, adding
short-descriptions for them.
"""

dbpediaDb = "dbpedia/descData.db"
namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
pickedLabelsFile = "pickedDbpLabels.txt"
dbFile = "data.db"
rootNodeName = "cellular organisms"
rootLabel = "Organism" # Will be associated with root node
# Got about 400k descriptions when testing

nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)'

# State shared between main() and the resolveWith*() functions
nodeToLabel = {} # Maps node names to their chosen DBpedia label
nameToVariants = {} # Maps node names to lists of matching labels (the unresolved conflicts)
dbpCur = None # Cursor on the DBpedia database; set by main(), used by resolveWithTypeData()


def readNodeNames(dbCur):
    """ Returns the set of names in the 'nodes' table reachable via 'dbCur' """
    return {name for (name,) in dbCur.execute("SELECT name from nodes")}


def removeNamesToSkip(nodeNames, skipFile):
    """
    Removes from 'nodeNames' each name listed (one per line) in 'skipFile'.
    Does nothing if the file does not exist. Blank lines are ignored, and a
    listed name that is not a node name only produces a warning, so a stale
    entry does not abort the run.
    """
    if not os.path.exists(skipFile):
        return
    with open(skipFile) as file:
        for line in file:
            name = line.rstrip()
            if name == "":
                continue
            if name in nodeNames:
                nodeNames.remove(name)
            else:
                print(f"WARNING: Name to skip \"{name}\" is not a node name", file=sys.stderr)


def readDisambigLabels(cur):
    """
    Returns the set of labels whose IRI is listed in 'disambiguations'.
    The labels (not the IRIs) are selected, because the result is compared
    against values read from the 'label' column.
    """
    query = "SELECT labels.label from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
    return {label for (label,) in cur.execute(query)}


def collectLabelVariants(labels, nodeNames, disambigLabels):
    """
    Returns a dict mapping node names to lists of labels that match them.

    A label matches a node name if its lowercased form equals the name, or
    has the form 'name (category)' with a category other than
    'disambiguation'. Labels in 'disambigLabels' are skipped, and each label
    is listed at most once per name.
    """
    variants = {}
    for (iterNum, label) in enumerate(labels, 1):
        if iterNum % 100000 == 0:
            print(f"At iteration {iterNum}")
        if label in disambigLabels:
            continue
        name = label.lower()
        if name not in nodeNames:
            match = nameVariantRegex.fullmatch(name)
            if match is None or match.group(2) == "disambiguation":
                continue
            name = match.group(1)
            if name not in nodeNames:
                continue
        candidates = variants.setdefault(name, [])
        # Compare the label itself: the list holds original-case labels
        if label not in candidates:
            candidates.append(label)
    return variants


def takeUnambiguousLabels(variants, chosen):
    """ Moves each name in 'variants' that has exactly one label into 'chosen' """
    resolved = [name for (name, labels) in variants.items() if len(labels) == 1]
    for name in resolved:
        chosen[name] = variants.pop(name)[0]


def resolveWithPickedLabels():
    """
    Attempts to resolve conflicts using a picked-names file.

    Each line of the file has the form 'name|label'. An empty label drops the
    conflict without choosing a label. Blank lines are ignored.
    """
    with open(pickedLabelsFile) as file:
        for line in file:
            line = line.rstrip()
            if line == "":
                continue
            (name, _, label) = line.partition("|")
            if name not in nameToVariants:
                print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
                continue
            if label != "":
                if label not in nameToVariants[name]:
                    print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr)
                nodeToLabel[name] = label
            del nameToVariants[name]


def resolveWithCategoryList():
    """
    Attempts to resolve conflicts by looking for labels like 'name1 (category1)',
    and choosing those with a category1 that seems 'biological'.
    Does two passes, using more generic categories first. This helps avoid stuff
    like Pan being classified as a horse instead of an ape.
    """
    generalCategories = {
        "species", "genus",
        "plant", "fungus", "animal",
        "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
        "fish", "amphibian", "reptile", "bird", "mammal",
    }
    specificCategories = {
        "protist", "alveolate", "dinoflagellates",
        "orchid", "poaceae", "fern", "moss", "alga",
        "bryozoan", "hydrozoan",
        "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
        "bivalve", "gastropod", "chiton",
        "shrimp", "decapod", "crab", "barnacle", "copepod",
        "arachnid", "spider", "harvestman", "mite",
        "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
        "beetle", "fly", "butterfly", "moth", "wasp",
        "catfish",
        "frog",
        "lizard",
        "horse", "sheep", "cattle", "mouse",
    }

    def pickLabel(variants, categories):
        " Returns the first label whose parenthesised category is in 'categories', or None "
        for label in variants:
            match = nameVariantRegex.match(label)
            if match is not None and match.group(2).lower() in categories:
                return label
        return None

    # Collect names first, as the dict cannot shrink while being iterated over
    namesToRemove = set()
    for (name, variants) in nameToVariants.items():
        label = pickLabel(variants, generalCategories)
        if label is None:
            label = pickLabel(variants, specificCategories)
        if label is not None:
            nodeToLabel[name] = label
            namesToRemove.add(name)
    for name in namesToRemove:
        del nameToVariants[name]


def resolveWithTypeData():
    """
    Attempts to resolve conflicts using DBpedia's type data.
    Uses the module-level 'dbpCur', so only works once main() has opened
    the DBpedia database.
    """
    taxonTypes = { # Obtained from the DBpedia ontology
        "http://dbpedia.org/ontology/Species",
        "http://dbpedia.org/ontology/Archaea",
        "http://dbpedia.org/ontology/Bacteria",
        "http://dbpedia.org/ontology/Eukaryote",
        "http://dbpedia.org/ontology/Plant",
        "http://dbpedia.org/ontology/ClubMoss",
        "http://dbpedia.org/ontology/Conifer",
        "http://dbpedia.org/ontology/CultivatedVariety",
        "http://dbpedia.org/ontology/Cycad",
        "http://dbpedia.org/ontology/Fern",
        "http://dbpedia.org/ontology/FloweringPlant",
        "http://dbpedia.org/ontology/Grape",
        "http://dbpedia.org/ontology/Ginkgo",
        "http://dbpedia.org/ontology/Gnetophytes",
        "http://dbpedia.org/ontology/GreenAlga",
        "http://dbpedia.org/ontology/Moss",
        "http://dbpedia.org/ontology/Fungus",
        "http://dbpedia.org/ontology/Animal",
        "http://dbpedia.org/ontology/Fish",
        "http://dbpedia.org/ontology/Crustacean",
        "http://dbpedia.org/ontology/Mollusca",
        "http://dbpedia.org/ontology/Insect",
        "http://dbpedia.org/ontology/Arachnid",
        "http://dbpedia.org/ontology/Amphibian",
        "http://dbpedia.org/ontology/Reptile",
        "http://dbpedia.org/ontology/Bird",
        "http://dbpedia.org/ontology/Mammal",
        "http://dbpedia.org/ontology/Cat",
        "http://dbpedia.org/ontology/Dog",
        "http://dbpedia.org/ontology/Horse",
    }
    query = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
    for (iterNum, (label, typeIri)) in enumerate(dbpCur.execute(query), 1):
        if iterNum % 100000 == 0:
            print(f"At iteration {iterNum}")
        if typeIri not in taxonTypes:
            continue
        name = label.lower()
        if name not in nameToVariants:
            # Fall back to the 'name' part of labels like 'name (category)'
            match = nameVariantRegex.fullmatch(name)
            if match is None:
                continue
            name = match.group(1)
            if name not in nameToVariants:
                continue
        nodeToLabel[name] = label
        del nameToVariants[name]


def lookupNodeIris(cur, chosen):
    """
    Returns a dict mapping each node name in 'chosen' to the IRI of its label.
    A label with no row in 'labels' (possible for the root label, or a picked
    label outside the choice set) is skipped with a warning.
    """
    nodeToIri = {}
    for (name, label) in chosen.items():
        row = cur.execute("SELECT iri FROM labels where label = ?", (label,)).fetchone()
        if row is None:
            print(f"WARNING: No IRI found for label \"{label}\" of node \"{name}\"", file=sys.stderr)
            continue
        nodeToIri[name] = row[0]
    return nodeToIri


def resolveRedirects(cur, nodeToIri):
    """
    Replaces each IRI in 'nodeToIri' that has a row in 'redirects' with its
    target, and returns the set of node names whose IRI was replaced.
    """
    redirectedNames = set()
    for (iterNum, (name, iri)) in enumerate(list(nodeToIri.items()), 1):
        if iterNum % 10000 == 0:
            print(f"At iteration {iterNum}")
        row = cur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
        if row is not None:
            nodeToIri[name] = row[0]
            redirectedNames.add(name)
    return redirectedNames


def addDescriptionTables(dbCur, cur, nodeToIri, redirectedNames):
    """
    Creates the 'wiki_ids' and 'descs' tables via 'dbCur', adding a row to
    each for every node whose IRI has both an abstract and an id in the
    database read via 'cur'.
    """
    dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)")
    dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)")
    dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)")
    query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
    for (iterNum, (name, iri)) in enumerate(nodeToIri.items(), 1):
        if iterNum % 10000 == 0:
            print(f"At iteration {iterNum}")
        row = cur.execute(query, (iri,)).fetchone()
        if row is None:
            continue
        (desc, wikiId) = row
        dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectedNames else 0))
        # Several nodes may share a wiki id, so only the first description is kept
        dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))


def main():
    """ Runs the whole association process. Takes no command-line arguments. """
    global dbpCur
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)

    print("Opening databases")
    dbpCon = sqlite3.connect(dbpediaDb)
    dbCon = sqlite3.connect(dbFile)
    try:
        dbpCur = dbpCon.cursor()
        dbCur = dbCon.cursor()

        print("Getting node names")
        nodeNames = readNodeNames(dbCur)

        print("Checking for names to skip")
        oldSz = len(nodeNames)
        removeNamesToSkip(nodeNames, namesToSkipFile)
        print(f"Skipping {oldSz - len(nodeNames)} nodes")

        print("Reading disambiguation-page labels")
        disambigLabels = readDisambigLabels(dbpCur)

        print("Trying to associate nodes with DBpedia labels")
        labels = (label for (label,) in dbpCur.execute("SELECT label from labels"))
        nameToVariants.update(collectLabelVariants(labels, nodeNames, disambigLabels))
        # Associate labels without conflicts
        takeUnambiguousLabels(nameToVariants, nodeToLabel)
        # Special case for root node
        nodeToLabel[rootNodeName] = rootLabel
        nameToVariants.pop(rootNodeName, None)

        print(f"Trying to resolve {len(nameToVariants)} conflicts")
        # Alternative strategies, currently disabled:
        #resolveWithTypeData()
        #resolveWithCategoryList()
        resolveWithPickedLabels()
        print(f"Remaining number of conflicts: {len(nameToVariants)}")

        print("Getting node IRIs")
        nodeToIri = lookupNodeIris(dbpCur, nodeToLabel)

        print("Resolving redirects")
        redirectedNames = resolveRedirects(dbpCur, nodeToIri)

        print("Adding description tables")
        addDescriptionTables(dbCur, dbpCur, nodeToIri, redirectedNames)

        print("Closing databases")
        dbCon.commit()
    finally:
        # The DBpedia database is only read from, so it needs no commit
        dbCon.close()
        dbpCon.close()


if __name__ == "__main__":
    main()