aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/genDbpData.py
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com> 2022-07-11 01:54:08 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-07-11 01:54:08 +1000
commit: 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree: 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/tolData/genDbpData.py
parent: a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/tolData/genDbpData.py')
-rwxr-xr-x backend/tolData/genDbpData.py 247
1 file changed, 247 insertions, 0 deletions
diff --git a/backend/tolData/genDbpData.py b/backend/tolData/genDbpData.py
new file mode 100755
index 0000000..df3a6be
--- /dev/null
+++ b/backend/tolData/genDbpData.py
@@ -0,0 +1,247 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import sqlite3
+
# Help text, printed when the script is given any command-line arguments
usageInfo = f"""
Usage: {sys.argv[0]}

Reads a database containing data from DBpedia, and tries to associate
DBpedia IRIs with nodes in a database, adding short-descriptions for them.
"""
# The script takes no arguments, so any extra one is treated as a usage error
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

# Input/output locations, relative to the working directory
dbpediaDb = "dbpedia/descData.db"
namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
pickedLabelsFile = "pickedDbpLabels.txt"
dbFile = "data.db"
# The root node gets a fixed label instead of a matched one
rootNodeName = "cellular organisms"
rootLabel = "organism" # Will be associated with root node
# Got about 400k descriptions when testing

print("Opening databases")
dbpCon, dbCon = sqlite3.connect(dbpediaDb), sqlite3.connect(dbFile)
dbpCur, dbCur = dbpCon.cursor(), dbCon.cursor()
+
print("Getting node names")
# Names of all nodes in the 'nodes' table of the node database
nodeNames = {name for (name,) in dbCur.execute("SELECT name from nodes")}

print("Checking for names to skip")
oldSz = len(nodeNames)
# The skip file is optional, and holds one node name per line
if os.path.exists(namesToSkipFile):
    with open(namesToSkipFile) as file:
        for line in file:
            # discard() rather than remove(), so that a blank line, or a name that
            # isn't a node, doesn't abort the script with a KeyError
            nodeNames.discard(line.rstrip())
print(f"Skipping {oldSz - len(nodeNames)} nodes")
+
print("Reading disambiguation-page labels")
# Labels of DBpedia resources that are disambiguation pages.
# The label column is selected (not the IRI), because this set is tested against
# label strings; a set of IRIs would never match a label, which would leave
# disambiguation pages unfiltered.
query = "SELECT labels.label from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
disambigLabels = {label for (label,) in dbpCur.execute(query)}
+
print("Trying to associate nodes with DBpedia labels")
nodeToLabel = {}
nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)'
nameToVariants = {} # Maps node names to lists of matching labels
iterNum = 0
for (label,) in dbpCur.execute("SELECT label from labels"):
    iterNum += 1
    if iterNum % 100000 == 0:
        print(f"At iteration {iterNum}")
    #
    if label in disambigLabels:
        continue
    # Labels are matched against node names case-insensitively
    name = label.lower()
    if name not in nodeNames:
        # Otherwise, try treating the label as 'name (qualifier)'
        match = nameVariantRegex.fullmatch(name)
        if match is None or match.group(2) == "disambiguation":
            continue
        name = match.group(1)
        if name not in nodeNames:
            continue
    # Record the original-case label once per node name
    variants = nameToVariants.setdefault(name, [])
    if label not in variants:
        variants.append(label)
# Associate labels without conflicts
for (name, variants) in nameToVariants.items():
    if len(variants) == 1:
        nodeToLabel[name] = variants[0]
for name in nodeToLabel:
    del nameToVariants[name]
# Special case for root node
nodeToLabel[rootNodeName] = rootLabel
nameToVariants.pop(rootNodeName, None)

print(f"Trying to resolve {len(nameToVariants)} conflicts")
def resolveWithPickedLabels():
    """
    Attempts to resolve conflicts using a picked-names file

    Each line of the file has the form 'name|label'. A non-empty label gets associated
    with the node, and an empty one just drops the conflict, leaving the node without
    a label. Names without a pending conflict are reported and skipped.
    Does nothing if the file doesn't exist.
    """
    # The file is optional, like the names-to-skip file
    if not os.path.exists(pickedLabelsFile):
        return
    with open(pickedLabelsFile) as file:
        for line in file:
            line = line.rstrip()
            if line == "":
                continue # A blank line names no node, so isn't worth a warning
            (name, _, label) = line.partition("|")
            if name not in nameToVariants:
                print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
                continue
            if label != "":
                # A label outside the matched variants is allowed, but gets reported
                if label not in nameToVariants[name]:
                    print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr)
                nodeToLabel[name] = label
            del nameToVariants[name]
def resolveWithCategoryList():
    """
    Attempts to resolve conflicts by looking for labels like 'name1 (category1)',
    and choosing those with a category1 that seems 'biological'.
    For each name, the more generic categories are tried first, and the more specific
    ones only if none of those match. This helps avoid stuff like Pan being classified
    as a horse instead of an ape.
    Resolved names are added to nodeToLabel and removed from nameToVariants.
    """
    generalCategories = {
        "species", "genus",
        "plant", "fungus", "animal",
        "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
        "fish", "amphibian", "reptile", "bird", "mammal",
    }
    specificCategories = {
        "protist", "alveolate", "dinoflagellates",
        "orchid", "poaceae", "fern", "moss", "alga",
        "bryozoan", "hydrozoan",
        "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
        "bivalve", "gastropod", "chiton",
        "shrimp", "decapod", "crab", "barnacle", "copepod",
        "arachnid", "spider", "harvestman", "mite",
        "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
        "beetle", "fly", "butterfly", "moth", "wasp",
        "catfish",
        "frog",
        "lizard",
        "horse", "sheep", "cattle", "mouse",
    }
    def pickLabel(variants, categories):
        " Returns the first label whose parenthesised category is in 'categories', or None "
        for label in variants:
            # fullmatch() is what the rest of the script uses for this regex.
            # The category is lower-cased because labels keep their original case.
            match = nameVariantRegex.fullmatch(label)
            if match is not None and match.group(2).lower() in categories:
                return label
        return None
    # Collect the choices first, as nameToVariants can't shrink while being iterated over
    resolved = {}
    for (name, variants) in nameToVariants.items():
        for categories in (generalCategories, specificCategories):
            label = pickLabel(variants, categories)
            if label is not None:
                resolved[name] = label
                break
    for (name, label) in resolved.items():
        nodeToLabel[name] = label
        del nameToVariants[name]
def resolveWithTypeData():
    """
    Attempts to resolve conflicts using DBpedia's type data

    Goes through each (label, type) pair in the DBpedia database. If the type is a
    taxon type, and the label (or the part of it before a trailing ' (...)') matches
    a name that still has a conflict, the label gets associated with that name.
    The first such label encountered for a name wins.
    """
    taxonTypes = { # Obtained from the DBpedia ontology
        "http://dbpedia.org/ontology/Species",
        "http://dbpedia.org/ontology/Archaea",
        "http://dbpedia.org/ontology/Bacteria",
        "http://dbpedia.org/ontology/Eukaryote",
        "http://dbpedia.org/ontology/Plant",
        "http://dbpedia.org/ontology/ClubMoss",
        "http://dbpedia.org/ontology/Conifer",
        "http://dbpedia.org/ontology/CultivatedVariety",
        "http://dbpedia.org/ontology/Cycad",
        "http://dbpedia.org/ontology/Fern",
        "http://dbpedia.org/ontology/FloweringPlant",
        "http://dbpedia.org/ontology/Grape",
        "http://dbpedia.org/ontology/Ginkgo",
        "http://dbpedia.org/ontology/Gnetophytes",
        "http://dbpedia.org/ontology/GreenAlga",
        "http://dbpedia.org/ontology/Moss",
        "http://dbpedia.org/ontology/Fungus",
        "http://dbpedia.org/ontology/Animal",
        "http://dbpedia.org/ontology/Fish",
        "http://dbpedia.org/ontology/Crustacean",
        "http://dbpedia.org/ontology/Mollusca",
        "http://dbpedia.org/ontology/Insect",
        "http://dbpedia.org/ontology/Arachnid",
        "http://dbpedia.org/ontology/Amphibian",
        "http://dbpedia.org/ontology/Reptile",
        "http://dbpedia.org/ontology/Bird",
        "http://dbpedia.org/ontology/Mammal",
        "http://dbpedia.org/ontology/Cat",
        "http://dbpedia.org/ontology/Dog",
        "http://dbpedia.org/ontology/Horse",
    }
    query = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
    # 'typeIri' rather than 'type', to avoid shadowing the builtin
    for (iterNum, (label, typeIri)) in enumerate(dbpCur.execute(query), 1):
        if iterNum % 100000 == 0:
            print(f"At iteration {iterNum}")
        #
        if typeIri not in taxonTypes:
            continue
        name = label.lower()
        if name not in nameToVariants:
            # Otherwise, try the part of a 'name (qualifier)' label before the qualifier
            match = nameVariantRegex.fullmatch(name)
            if match is None:
                continue
            name = match.group(1)
            if name not in nameToVariants:
                continue
        nodeToLabel[name] = label
        del nameToVariants[name]
# Only the picked-labels resolver is enabled; the other two are left here as toggles
#resolveWithTypeData()
#resolveWithCategoryList()
resolveWithPickedLabels()
print(f"Remaining number of conflicts: {len(nameToVariants)}")

print("Getting node IRIs")
nodeToIri = {}
for (name, label) in nodeToLabel.items():
    row = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone()
    if row is None:
        # Can happen for the root label, or a picked label outside the choice set, as
        # neither is checked against the DBpedia database beforehand
        print(f"WARNING: No IRI found for label \"{label}\" of node \"{name}\"", file=sys.stderr)
        continue
    nodeToIri[name] = row[0]

print("Resolving redirects")
redirectingIriSet = set() # Holds the names of nodes whose IRI got replaced by a redirect target
iterNum = 0
for (name, iri) in nodeToIri.items():
    iterNum += 1
    if iterNum % 10000 == 0:
        print(f"At iteration {iterNum}")
    #
    # Only a single redirect step is followed
    row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
    if row is not None:
        nodeToIri[name] = row[0] # Only replaces an existing key's value, so is safe while iterating
        redirectingIriSet.add(name)
+
print("Adding description tables")
# wiki_ids maps node names to wiki ids, and descs maps wiki ids to descriptions
for statement in (
    "CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)",
    "CREATE INDEX wiki_id_idx ON wiki_ids(id)",
    "CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)",
):
    dbCur.execute(statement)
query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
for (iterNum, (name, iri)) in enumerate(nodeToIri.items(), 1):
    if iterNum % 10000 == 0:
        print(f"At iteration {iterNum}")
    #
    row = dbpCur.execute(query, (iri,)).fetchone()
    if row is None:
        continue # No abstract/id pair for this IRI, so the node gets no description
    (desc, wikiId) = row
    redirected = int(name in redirectingIriSet)
    dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, redirected))
    # OR IGNORE keeps the existing row if this wiki id already has a description
    dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))

print("Closing databases")
dbCon.commit()
dbCon.close()
dbpCon.commit()
dbpCon.close()