aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genDbpData.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-05-14 19:30:43 +1000
committerTerry Truong <terry06890@gmail.com>2022-05-14 19:39:10 +1000
commitc97acf8852e2017fd4776d65069f707121405f43 (patch)
tree1c0d725b6ae496239036b0f1d1c4a2caadf209cf /backend/data/genDbpData.py
parent7003ef7f92f3a8fed059dab2b37c0e203c000dba (diff)
Use DBpedia data for node descriptions
Add backend/data/dbpedia/ directory containing scripts and README for obtaining DBpedia data, storing it into a db, converting/adding description data to data.db, and for resolving tol-node DBpedia-node association conflicts (via DBpedia relations, manual listing, etc). Resulted in fewer (about 3/4 as many) descriptions than when using enwiki, but with notably fewer mis-associations (eg: node Thor is now described as a shrimp instead of a god).
Diffstat (limited to 'backend/data/genDbpData.py')
-rwxr-xr-xbackend/data/genDbpData.py227
1 files changed, 227 insertions, 0 deletions
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
new file mode 100755
index 0000000..6cc8d33
--- /dev/null
+++ b/backend/data/genDbpData.py
@@ -0,0 +1,227 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3
+
+# Usage text, printed when the script is given any command-line argument
+usageInfo = "".join([
+    f"usage: {sys.argv[0]}\n",
+    "Reads DBpedia data from dbpedia/*, along with tree-of-life\n",
+    "node and name data from a sqlite database, associates nodes with\n",
+    "DBpedia IRIs, and adds alt-name and description information for\n",
+    "those nodes.\n",
+])
+# The script accepts no arguments: anything after argv[0] is treated as misuse
+if sys.argv[1:]:
+    print(usageInfo, file=sys.stderr)
+    sys.exit(1)
+
+# Input/output locations, relative to the working directory
+dbpediaDb = "dbpedia/dbpData.db"  # sqlite db holding the DBpedia data
+pickedLabelsFile = "dbpPickedLabels.txt"  # manually picked labels, one per line
+dbFile = "data.db"  # sqlite db holding the node data; also receives the output
+
+# Open dbs
+dbpCon = sqlite3.connect(dbpediaDb)
+dbpCur = dbpCon.cursor()
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get node names
+print("Reading node names")
+# Collected into a set so the per-label membership tests further down are cheap
+nodeNames = {nodeName for (nodeName,) in dbCur.execute("SELECT name from nodes")}
+# Get disambiguation page labels
+print("Reading disambiguation-page labels")
+# Select the label text rather than the IRI: this set is later tested against
+# values read from labels.label, so storing IRIs would make the check never match
+query = "SELECT labels.label from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
+disambigLabels = {label for (label,) in dbpCur.execute(query)}
+# Try associating nodes with IRIs, accounting for disambiguation labels
+print("Trying to associate nodes with labels")
+nodeToLabel = {}  # node name -> the label chosen for it
+nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)")  # splits 'name (qualifier)'
+nameToVariants = {}  # node name -> candidate labels collected for it
+iterNum = 0
+for (label,) in dbpCur.execute("SELECT label from labels"):
+    iterNum += 1
+    if iterNum % 1e5 == 0:
+        print("Processing line {}".format(iterNum))
+    # Entries recorded in disambigLabels are never candidates
+    if label in disambigLabels:
+        continue
+    name = label.lower()
+    if name in nodeNames:
+        # The lowercased label is itself a node name: keep the label's original case
+        candidates = nameToVariants.setdefault(name, [])
+        if label not in candidates:
+            candidates.append(label)
+        continue
+    # Otherwise accept labels of the form 'node-name (qualifier)'
+    match = nameVariantRegex.fullmatch(name)
+    if match is None:
+        continue
+    subName = match.group(1)
+    if subName not in nodeNames or match.group(2) == "disambiguation":
+        continue
+    candidates = nameToVariants.setdefault(subName, [])
+    if name not in candidates:
+        candidates.append(name)  # Intentionally ignoring case here
+# A name with exactly one candidate is settled; every other name stays as a conflict
+for (name, variants) in list(nameToVariants.items()):
+    if len(variants) == 1:
+        nodeToLabel[name] = variants[0]
+        del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
+# Try conflict resolution via category-list
+# Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
+print("Resolving conflicts using category-list")
+generalCategories = {
+    "species", "genus",
+    "plant", "fungus", "animal",
+    "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
+    "fish", "amphibian", "reptile", "bird", "mammal",
+}
+specificCategories = {
+    "protist", "alveolate", "dinoflagellates",
+    "orchid", "Poaceae", "fern", "moss", "alga",
+    "bryozoan", "hydrozoan",
+    "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
+    "bivalve", "gastropod", "chiton",
+    "shrimp", "decapod", "crab", "barnacle", "copepod",
+    "arachnid", "spider", "harvestman", "mite",
+    "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
+    "beetle", "fly", "butterfly", "moth", "wasp",
+    "catfish",
+    "frog",
+    "lizard",
+    "horse", "sheep", "cattle", "mouse",
+}
+
+def pickLabelByCategory(variants, categories):
+    """Return the first label in variants whose '(qualifier)' is in categories.
+
+    The qualifier is lowercased before the lookup, so categories must hold
+    lowercase strings. Returns None when no variant qualifies.
+    """
+    for label in variants:
+        match = nameVariantRegex.match(label)
+        if match is not None and match.group(2).lower() in categories:
+            return label
+    return None
+
+# Variants built from 'name (qualifier)' labels are stored lowercased, so the
+# categories are compared case-insensitively (otherwise "Poaceae" cannot match them)
+generalLowered = {category.lower() for category in generalCategories}
+specificLowered = {category.lower() for category in specificCategories}
+namesToRemove = set()  # collected first: the dict must not change size while iterated
+for (name, variants) in nameToVariants.items():
+    label = pickLabelByCategory(variants, generalLowered)
+    if label is None:
+        label = pickLabelByCategory(variants, specificLowered)
+    if label is not None:
+        nodeToLabel[name] = label
+        namesToRemove.add(name)
+for name in namesToRemove:
+    del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
+# Try conflict resolution via taxon-type information
+print("Resolving conflicts using instance-type data")
+taxonTypes = {  # Obtained from the DBpedia ontology
+    "http://dbpedia.org/ontology/Species",
+    "http://dbpedia.org/ontology/Archaea",
+    "http://dbpedia.org/ontology/Bacteria",
+    "http://dbpedia.org/ontology/Eukaryote",
+    "http://dbpedia.org/ontology/Plant",
+    "http://dbpedia.org/ontology/ClubMoss",
+    "http://dbpedia.org/ontology/Conifer",
+    "http://dbpedia.org/ontology/CultivatedVariety",
+    "http://dbpedia.org/ontology/Cycad",
+    "http://dbpedia.org/ontology/Fern",
+    "http://dbpedia.org/ontology/FloweringPlant",
+    "http://dbpedia.org/ontology/Grape",
+    "http://dbpedia.org/ontology/Ginkgo",
+    "http://dbpedia.org/ontology/Gnetophytes",
+    "http://dbpedia.org/ontology/GreenAlga",
+    "http://dbpedia.org/ontology/Moss",
+    "http://dbpedia.org/ontology/Fungus",
+    "http://dbpedia.org/ontology/Animal",
+    "http://dbpedia.org/ontology/Fish",
+    "http://dbpedia.org/ontology/Crustacean",
+    "http://dbpedia.org/ontology/Mollusca",
+    "http://dbpedia.org/ontology/Insect",
+    "http://dbpedia.org/ontology/Arachnid",
+    "http://dbpedia.org/ontology/Amphibian",
+    "http://dbpedia.org/ontology/Reptile",
+    "http://dbpedia.org/ontology/Bird",
+    "http://dbpedia.org/ontology/Mammal",
+    "http://dbpedia.org/ontology/Cat",
+    "http://dbpedia.org/ontology/Dog",
+    "http://dbpedia.org/ontology/Horse",
+}
+iterNum = 0
+typeQuery = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
+for (label, typeIri) in dbpCur.execute(typeQuery):
+    iterNum += 1
+    if iterNum % 1e5 == 0:
+        print("Processing line {}".format(iterNum))
+    # Only labels whose type is one of the taxon types can settle a conflict
+    if typeIri not in taxonTypes:
+        continue
+    name = label.lower()
+    if name not in nameToVariants:
+        # Fall back to the part before a trailing '(qualifier)'
+        match = nameVariantRegex.fullmatch(name)
+        if match is None:
+            continue
+        name = match.group(1)
+        if name not in nameToVariants:
+            continue
+    # The first qualifying label wins; removing the name skips later rows for it
+    nodeToLabel[name] = label
+    del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
+# Try conflict resolution via picked-labels
+print("Resolving conflicts using picked-labels")
+with open(pickedLabelsFile) as file:
+    for line in file:
+        pickedLabel = line.rstrip()
+        if not pickedLabel:
+            continue  # a blank line holds no label, so it is not worth a warning
+        name = pickedLabel.lower()
+        if name not in nameToVariants:
+            # Fall back to the part before a '(qualifier)' suffix
+            match = nameVariantRegex.match(pickedLabel)
+            if match is None:
+                print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
+                continue
+            # nameToVariants is keyed by lowercased names, so lowercase before the lookup
+            name = match.group(1).lower()
+            if name not in nameToVariants:
+                print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
+                continue
+        nodeToLabel[name] = pickedLabel
+        del nameToVariants[name]
+print("Number of conflicts: {}".format(len(nameToVariants)))
+# Associate nodes with IRIs
+print("Getting nodes IRIs")
+nodeToIri = {}  # node name -> IRI of the label chosen for it
+iterNum = 0
+# NOCASE because some chosen labels were stored lowercased rather than verbatim
+iriQuery = "SELECT iri FROM labels where label = ? COLLATE NOCASE"
+for name in nodeToLabel:
+    label = nodeToLabel[name]
+    row = dbpCur.execute(iriQuery, (label,)).fetchone()
+    if row is None:
+        # Every chosen label is expected to be in the labels table; stop if one is not
+        print("ERROR: Couldn't find label {}".format(label), file=sys.stderr)
+        sys.exit(1)
+    nodeToIri[name] = row[0]
+# Resolve redirects
+print("Resolving redirects")
+redirectingIriSet = set()  # IRIs that were replaced by their redirect target
+redirectedNames = set()  # node names whose IRI was replaced; feeds descs.redirected
+iterNum = 0
+for (name, iri) in nodeToIri.items():
+    iterNum += 1
+    if iterNum % 1e4 == 0:
+        print("At iteration {}".format(iterNum))
+    #
+    row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
+    if row is not None:
+        # Overwriting the value of an existing key is safe while iterating items()
+        nodeToIri[name] = row[0]
+        redirectingIriSet.add(iri)
+        redirectedNames.add(name)
+# Find descriptions, and add to db
+print("Adding node description data")
+dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)")
+iterNum = 0
+for (name, iri) in nodeToIri.items():
+    iterNum += 1
+    if iterNum % 1e4 == 0:
+        print("At iteration {}".format(iterNum))
+    #
+    row = dbpCur.execute("SELECT abstract FROM abstracts where iri = ?", (iri,)).fetchone()
+    if row is not None:
+        # The flag is looked up by node name: redirectingIriSet holds IRIs, so
+        # testing the name against it would never report a redirect
+        redirected = 1 if name in redirectedNames else 0
+        dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], redirected))
+# Close dbs
+# Commit then close each connection, the node db first and the DBpedia db second
+for con in (dbCon, dbpCon):
+    con.commit()
+    con.close()