about | summary | refs | log | tree | commit | diff
path: root/backend/data/genDbpData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/genDbpData.py')
-rwxr-xr-x backend/data/genDbpData.py 244
1 files changed, 128 insertions, 116 deletions
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 0655344..887e8a8 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, re
+import sys, os, re
import sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
@@ -13,6 +13,7 @@ if len(sys.argv) > 1:
sys.exit(1)
dbpediaDb = "dbpedia/dbpData.db"
+namesToSkipFile = "dbpNamesToSkip.txt"
pickedLabelsFile = "dbpPickedLabels.txt"
dbFile = "data.db"
@@ -24,8 +25,16 @@ dbCur = dbCon.cursor()
# Get node names
print("Reading node names")
nodeNames = set()
-for row in dbCur.execute("SELECT name from nodes"):
- nodeNames.add(row[0])
+for (name,) in dbCur.execute("SELECT name from nodes"):
+ nodeNames.add(name)
+# Drop names listed in the optional skip-file (one node name per line) from the node set
+print("Checking for names to skip")
+oldSz = len(nodeNames)
+if os.path.exists(namesToSkipFile):
+ with open(namesToSkipFile) as file:
+ for line in file:
+ nodeNames.discard(line.rstrip()) # discard, not remove: a listed name that is absent (or a blank line) must not raise KeyError
+print(f"Skipping {oldSz - len(nodeNames)} nodes")
# Get disambiguation page labels
print("Reading disambiguation-page labels")
disambigLabels = set()
@@ -57,9 +66,9 @@ for (label,) in dbpCur.execute("SELECT label from labels"):
subName = match.group(1)
if subName in nodeNames and match.group(2) != "disambiguation":
if subName not in nameToVariants:
- nameToVariants[subName] = [name] # Intentionally ignoring case here
+ nameToVariants[subName] = [label]
- elif name not in nameToVariants[subName]:
- nameToVariants[subName].append(name)
+ elif label not in nameToVariants[subName]: # the list now holds labels, so de-duplicate on the label rather than on name
+ nameToVariants[subName].append(label)
for (name, variants) in nameToVariants.items():
if len(variants) == 1:
nodeToLabel[name] = variants[0]
@@ -67,126 +76,128 @@ for name in nodeToLabel:
del nameToVariants[name]
nodeToLabel["cellular organisms"] = "organism" # Special case for root node
print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via picked-labels
-print("Resolving conflicts using picked-labels")
-with open(pickedLabelsFile) as file:
- for line in file:
- pickedLabel = line.rstrip()
- name = pickedLabel.lower()
- if name in nameToVariants:
- nodeToLabel[name] = pickedLabel
- del nameToVariants[name]
- else:
- match = nameVariantRegex.match(pickedLabel)
- if match == None:
- print(f"WARNING: Picked label {pickedLabel} not found (1)", file=sys.stderr)
+# Try resolving conflicts
+def resolveWithPickedLabels():
+ # Attempts conflict resolution using a file with lines of the form 'name1|label1',
+ # where label1 may be absent, indicating that no label should be associated with the name
+ print("Resolving conflicts using picked-labels")
+ with open(pickedLabelsFile) as file:
+ for line in file:
+ (name, _, label) = line.rstrip().partition("|")
+ if name not in nameToVariants:
+ print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
+ continue
+ if label == "":
+ del nameToVariants[name]
else:
- name = match.group(1)
- if name not in nameToVariants:
- print(f"WARNING: Picked label {pickedLabel} not found (2)", file=sys.stderr)
- else:
- nodeToLabel[name] = pickedLabel
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via category-list
+ if label not in nameToVariants[name]:
+ print(f"WARNING: Picked label \"{label}\" for name \"{name}\" not found", file=sys.stderr)
+ continue
+ nodeToLabel[name] = label
+ del nameToVariants[name]
+ print(f"Remaining number of conflicts: {len(nameToVariants)}")
+def resolveWithCategoryList():
+ # Attempts conflict resolution using category-text in labels of the form 'name1 (category1)'
# Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
-print("Resolving conflicts using category-list")
-generalCategories = {
- "species", "genus",
- "plant", "fungus", "animal",
- "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
- "fish", "amphibian", "reptile", "bird", "mammal",
-}
-specificCategories = {
- "protist", "alveolate", "dinoflagellates",
- "orchid", "Poaceae", "fern", "moss", "alga",
- "bryozoan", "hydrozoan",
- "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
- "bivalve", "gastropod", "chiton",
- "shrimp", "decapod", "crab", "barnacle", "copepod",
- "arachnid", "spider", "harvestman", "mite",
- "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
- "beetle", "fly", "butterfly", "moth", "wasp",
- "catfish",
- "frog",
- "lizard",
- "horse", "sheep", "cattle", "mouse",
-}
-namesToRemove = set()
-for (name, variants) in nameToVariants.items():
- found = False
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2) in generalCategories:
- nodeToLabel[name] = label
- namesToRemove.add(name)
- found = True
- break
- if not found:
+ print("Resolving conflicts using category-list")
+ generalCategories = {
+ "species", "genus",
+ "plant", "fungus", "animal",
+ "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
+ "fish", "amphibian", "reptile", "bird", "mammal",
+ }
+ specificCategories = {
+ "protist", "alveolate", "dinoflagellates",
+ "orchid", "Poaceae", "fern", "moss", "alga",
+ "bryozoan", "hydrozoan",
+ "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
+ "bivalve", "gastropod", "chiton",
+ "shrimp", "decapod", "crab", "barnacle", "copepod",
+ "arachnid", "spider", "harvestman", "mite",
+ "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
+ "beetle", "fly", "butterfly", "moth", "wasp",
+ "catfish",
+ "frog",
+ "lizard",
+ "horse", "sheep", "cattle", "mouse",
+ }
+ namesToRemove = set()
+ for (name, variants) in nameToVariants.items():
+ found = False
for label in variants:
match = nameVariantRegex.match(label)
- if match != None and match.group(2) in specificCategories:
+ if match != None and match.group(2) in generalCategories:
nodeToLabel[name] = label
namesToRemove.add(name)
+ found = True
break
-for name in namesToRemove:
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via taxon-type information
-print("Resolving conflicts using instance-type data")
-taxonTypes = { # Obtained from the DBpedia ontology
- "http://dbpedia.org/ontology/Species",
- "http://dbpedia.org/ontology/Archaea",
- "http://dbpedia.org/ontology/Bacteria",
- "http://dbpedia.org/ontology/Eukaryote",
- "http://dbpedia.org/ontology/Plant",
- "http://dbpedia.org/ontology/ClubMoss",
- "http://dbpedia.org/ontology/Conifer",
- "http://dbpedia.org/ontology/CultivatedVariety",
- "http://dbpedia.org/ontology/Cycad",
- "http://dbpedia.org/ontology/Fern",
- "http://dbpedia.org/ontology/FloweringPlant",
- "http://dbpedia.org/ontology/Grape",
- "http://dbpedia.org/ontology/Ginkgo",
- "http://dbpedia.org/ontology/Gnetophytes",
- "http://dbpedia.org/ontology/GreenAlga",
- "http://dbpedia.org/ontology/Moss",
- "http://dbpedia.org/ontology/Fungus",
- "http://dbpedia.org/ontology/Animal",
- "http://dbpedia.org/ontology/Fish",
- "http://dbpedia.org/ontology/Crustacean",
- "http://dbpedia.org/ontology/Mollusca",
- "http://dbpedia.org/ontology/Insect",
- "http://dbpedia.org/ontology/Arachnid",
- "http://dbpedia.org/ontology/Amphibian",
- "http://dbpedia.org/ontology/Reptile",
- "http://dbpedia.org/ontology/Bird",
- "http://dbpedia.org/ontology/Mammal",
- "http://dbpedia.org/ontology/Cat",
- "http://dbpedia.org/ontology/Dog",
- "http://dbpedia.org/ontology/Horse",
-}
-iterNum = 0
-for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
- #
- if type in taxonTypes:
- name = label.lower()
- if name in nameToVariants:
- nodeToLabel[name] = label
- del nameToVariants[name]
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- name = match.group(1)
- if name in nameToVariants:
+ if not found:
+ for label in variants:
+ match = nameVariantRegex.match(label)
+ if match != None and match.group(2) in specificCategories:
nodeToLabel[name] = label
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
+ namesToRemove.add(name)
+ break
+ for name in namesToRemove:
+ del nameToVariants[name]
+ print(f"Remaining number of conflicts: {len(nameToVariants)}")
+def resolveWithTypeData():
+ # Attempts conflict-resolution using dbpedia's instance-type data
+ print("Resolving conflicts using instance-type data")
+ taxonTypes = { # Obtained from the DBpedia ontology
+ "http://dbpedia.org/ontology/Species",
+ "http://dbpedia.org/ontology/Archaea",
+ "http://dbpedia.org/ontology/Bacteria",
+ "http://dbpedia.org/ontology/Eukaryote",
+ "http://dbpedia.org/ontology/Plant",
+ "http://dbpedia.org/ontology/ClubMoss",
+ "http://dbpedia.org/ontology/Conifer",
+ "http://dbpedia.org/ontology/CultivatedVariety",
+ "http://dbpedia.org/ontology/Cycad",
+ "http://dbpedia.org/ontology/Fern",
+ "http://dbpedia.org/ontology/FloweringPlant",
+ "http://dbpedia.org/ontology/Grape",
+ "http://dbpedia.org/ontology/Ginkgo",
+ "http://dbpedia.org/ontology/Gnetophytes",
+ "http://dbpedia.org/ontology/GreenAlga",
+ "http://dbpedia.org/ontology/Moss",
+ "http://dbpedia.org/ontology/Fungus",
+ "http://dbpedia.org/ontology/Animal",
+ "http://dbpedia.org/ontology/Fish",
+ "http://dbpedia.org/ontology/Crustacean",
+ "http://dbpedia.org/ontology/Mollusca",
+ "http://dbpedia.org/ontology/Insect",
+ "http://dbpedia.org/ontology/Arachnid",
+ "http://dbpedia.org/ontology/Amphibian",
+ "http://dbpedia.org/ontology/Reptile",
+ "http://dbpedia.org/ontology/Bird",
+ "http://dbpedia.org/ontology/Mammal",
+ "http://dbpedia.org/ontology/Cat",
+ "http://dbpedia.org/ontology/Dog",
+ "http://dbpedia.org/ontology/Horse",
+ }
+ iterNum = 0
+ for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f"Processing line {iterNum}")
+ #
+ if type in taxonTypes:
+ name = label.lower()
+ if name in nameToVariants:
+ nodeToLabel[name] = label
+ del nameToVariants[name]
+ else:
+ match = nameVariantRegex.fullmatch(name)
+ if match != None:
+ name = match.group(1)
+ if name in nameToVariants:
+ nodeToLabel[name] = label
+ del nameToVariants[name]
+ print(f"Remaining number of conflicts: {len(nameToVariants)}")
+resolveWithPickedLabels()
# Associate nodes with IRIs
-print("Getting nodes IRIs")
+print("Getting node IRIs")
nodeToIri = {}
iterNum = 0
for (name, label) in nodeToLabel.items():
@@ -212,6 +223,7 @@ for (name, iri) in nodeToIri.items():
# Find descriptions, and add to db
print("Adding node description data")
dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)")
+dbCur.execute("CREATE INDEX descs_id_idx ON descs(wiki_id)") # wiki_id intentionally left non-unique
iterNum = 0
for (name, iri) in nodeToIri.items():
iterNum += 1