aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/genDbpData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/genDbpData.py')
-rwxr-xr-xbackend/tolData/genDbpData.py245
1 files changed, 0 insertions, 245 deletions
diff --git a/backend/tolData/genDbpData.py b/backend/tolData/genDbpData.py
deleted file mode 100755
index 9d52e1d..0000000
--- a/backend/tolData/genDbpData.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os, re
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads a database containing data from DBpedia, and tries to associate
-DBpedia IRIs with nodes in the tree-of-life database, adding
-short-descriptions for them.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-dbpediaDb = "dbpedia/descData.db"
-namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
-pickedLabelsFile = "pickedDbpLabels.txt"
-dbFile = "data.db"
-rootNodeName = "cellular organisms"
-rootLabel = "Organism" # Will be associated with root node
-# Got about 400k descriptions when testing
-
-print("Opening databases")
-dbpCon = sqlite3.connect(dbpediaDb)
-dbpCur = dbpCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Getting node names")
-nodeNames = set()
-for (name,) in dbCur.execute("SELECT name from nodes"):
- nodeNames.add(name)
-
-print("Checking for names to skip")
-oldSz = len(nodeNames)
-if os.path.exists(namesToSkipFile):
- with open(namesToSkipFile) as file:
- for line in file:
- nodeNames.remove(line.rstrip())
-print(f"Skipping {oldSz - len(nodeNames)} nodes")
-
-print("Reading disambiguation-page labels")
-disambigLabels = set()
-query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
-for (label,) in dbpCur.execute(query):
- disambigLabels.add(label)
-
-print("Trying to associate nodes with DBpedia labels")
-nodeToLabel = {}
-nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)'
-nameToVariants = {} # Maps node names to lists of matching labels
-iterNum = 0
-for (label,) in dbpCur.execute("SELECT label from labels"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"At iteration {iterNum}")
- #
- if label in disambigLabels:
- continue
- name = label.lower()
- if name in nodeNames:
- if name not in nameToVariants:
- nameToVariants[name] = [label]
- elif label not in nameToVariants[name]:
- nameToVariants[name].append(label)
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- subName = match.group(1)
- if subName in nodeNames and match.group(2) != "disambiguation":
- if subName not in nameToVariants:
- nameToVariants[subName] = [label]
- elif name not in nameToVariants[subName]:
- nameToVariants[subName].append(label)
-# Associate labels without conflicts
-for (name, variants) in nameToVariants.items():
- if len(variants) == 1:
- nodeToLabel[name] = variants[0]
-for name in nodeToLabel:
- del nameToVariants[name]
-# Special case for root node
-nodeToLabel[rootNodeName] = rootLabel
-if rootNodeName in nameToVariants:
- del nameToVariants["cellular organisms"]
-
-print(f"Trying to resolve {len(nameToVariants)} conflicts")
-def resolveWithPickedLabels():
- " Attempts to resolve conflicts using a picked-names file "
- with open(pickedLabelsFile) as file:
- for line in file:
- (name, _, label) = line.rstrip().partition("|")
- if name not in nameToVariants:
- print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
- continue
- if label == "":
- del nameToVariants[name]
- else:
- if label not in nameToVariants[name]:
- print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr)
- nodeToLabel[name] = label
- del nameToVariants[name]
-def resolveWithCategoryList():
- """
- Attempts to resolve conflicts by looking for labels like 'name1 (category1)',
- and choosing those with a category1 that seems 'biological'.
- Does two passes, using more generic categories first. This helps avoid stuff like
- Pan being classified as a horse instead of an ape.
- """
- generalCategories = {
- "species", "genus",
- "plant", "fungus", "animal",
- "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
- "fish", "amphibian", "reptile", "bird", "mammal",
- }
- specificCategories = {
- "protist", "alveolate", "dinoflagellates",
- "orchid", "poaceae", "fern", "moss", "alga",
- "bryozoan", "hydrozoan",
- "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
- "bivalve", "gastropod", "chiton",
- "shrimp", "decapod", "crab", "barnacle", "copepod",
- "arachnid", "spider", "harvestman", "mite",
- "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
- "beetle", "fly", "butterfly", "moth", "wasp",
- "catfish",
- "frog",
- "lizard",
- "horse", "sheep", "cattle", "mouse",
- }
- namesToRemove = set()
- for (name, variants) in nameToVariants.items():
- found = False
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2).lower() in generalCategories:
- nodeToLabel[name] = label
- namesToRemove.add(name)
- found = True
- break
- if not found:
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2).lower() in specificCategories:
- nodeToLabel[name] = label
- namesToRemove.add(name)
- break
- for name in namesToRemove:
- del nameToVariants[name]
-def resolveWithTypeData():
- " Attempts to resolve conflicts using DBpedia's type data "
- taxonTypes = { # Obtained from the DBpedia ontology
- "http://dbpedia.org/ontology/Species",
- "http://dbpedia.org/ontology/Archaea",
- "http://dbpedia.org/ontology/Bacteria",
- "http://dbpedia.org/ontology/Eukaryote",
- "http://dbpedia.org/ontology/Plant",
- "http://dbpedia.org/ontology/ClubMoss",
- "http://dbpedia.org/ontology/Conifer",
- "http://dbpedia.org/ontology/CultivatedVariety",
- "http://dbpedia.org/ontology/Cycad",
- "http://dbpedia.org/ontology/Fern",
- "http://dbpedia.org/ontology/FloweringPlant",
- "http://dbpedia.org/ontology/Grape",
- "http://dbpedia.org/ontology/Ginkgo",
- "http://dbpedia.org/ontology/Gnetophytes",
- "http://dbpedia.org/ontology/GreenAlga",
- "http://dbpedia.org/ontology/Moss",
- "http://dbpedia.org/ontology/Fungus",
- "http://dbpedia.org/ontology/Animal",
- "http://dbpedia.org/ontology/Fish",
- "http://dbpedia.org/ontology/Crustacean",
- "http://dbpedia.org/ontology/Mollusca",
- "http://dbpedia.org/ontology/Insect",
- "http://dbpedia.org/ontology/Arachnid",
- "http://dbpedia.org/ontology/Amphibian",
- "http://dbpedia.org/ontology/Reptile",
- "http://dbpedia.org/ontology/Bird",
- "http://dbpedia.org/ontology/Mammal",
- "http://dbpedia.org/ontology/Cat",
- "http://dbpedia.org/ontology/Dog",
- "http://dbpedia.org/ontology/Horse",
- }
- iterNum = 0
- for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"At iteration {iterNum}")
- #
- if type in taxonTypes:
- name = label.lower()
- if name in nameToVariants:
- nodeToLabel[name] = label
- del nameToVariants[name]
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- name = match.group(1).lower()
- if name in nameToVariants:
- nodeToLabel[name] = label
- del nameToVariants[name]
-#resolveWithTypeData()
-#resolveWithCategoryList()
-resolveWithPickedLabels()
-print(f"Remaining number of conflicts: {len(nameToVariants)}")
-
-print("Getting node IRIs")
-nodeToIri = {}
-for (name, label) in nodeToLabel.items():
- (iri,) = dbpCur.execute("SELECT iri FROM labels where label = ?", (label,)).fetchone()
- nodeToIri[name] = iri
-
-print("Resolving redirects")
-redirectingIriSet = set()
-iterNum = 0
-for (name, iri) in nodeToIri.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
- if row != None:
- nodeToIri[name] = row[0]
- redirectingIriSet.add(name)
-
-print("Adding description tables")
-dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)")
-dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)")
-dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)")
-iterNum = 0
-for (name, iri) in nodeToIri.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
- row = dbpCur.execute(query, (iri,)).fetchone()
- if row != None:
- desc, wikiId = row
- dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0))
- dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-dbpCon.commit()
-dbpCon.close()