aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-06-07 23:06:26 +1000
committerTerry Truong <terry06890@gmail.com>2022-06-07 23:13:01 +1000
commit27361479b3615a0f1156be3a97579df7f128d993 (patch)
tree20b8aa14ac7ba3b409a3c983fd4b04e60e7d2660
parent1879c7920607feb1df1102a09d2d4f915d9544a5 (diff)
Add more manual-correction for dbp-desc generation
-rw-r--r--.gitignore2
-rw-r--r--backend/data/dbpPickedLabels.txt655
-rwxr-xr-xbackend/data/genDbpConflicts.py202
-rwxr-xr-xbackend/data/genDbpData.py244
4 files changed, 130 insertions, 973 deletions
diff --git a/.gitignore b/.gitignore
index 3ac32b0..2493f5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,7 @@
/backend/data/dbpedia/*.db
/backend/data/genOtolNamesToKeep.txt
/backend/data/genOtolDataPickedDups.txt
+/backend/data/dbpNamesToSkip.txt
+/backend/data/dbpPickedLabels.txt
/backend/data/genEnwikiDescNamesToSkip.txt
/backend/data/genEnwikiDescTitlesToUse.txt
diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt
deleted file mode 100644
index 1fa1e71..0000000
--- a/backend/data/dbpPickedLabels.txt
+++ /dev/null
@@ -1,655 +0,0 @@
-Abditomys latidens
-Abdopus aculeatus
-Ablerus
-Abralia
-Abramis brama
-Abraxas sylvata
-Abrostola
-Acidobacteria
-Acontia
-Actiniaria
-Actinobacteria
-Addisonia
-Aenetus
-Agapanthus
-Aglais
-Aglaope
-agrotis infusa (bogong moth)
-agrypnia (caddisfly)
-Agyrium
-Ainoa
-Alauda
-Alces
-Aleurotrachelus atratus
-Alexicles
-Alistra
-Allactaga
-Alligator
-Aloe
-Aloe vera
-Aluta
-Alypia
-Amanita
-Amaranthus
-Amaryllis
-Amazona
-Ambrysus
-ambystomatidae (mole salamanders)
-Amphibia
-Amyris
-Anagyrus
-Anania
-Ancylini
-Anemosa
-Angustia
-anhanguera (pterosaur)
-Aniba
-Annona
-Anthene
-Anticrates
-Aparupa
-Aplysina archeri
-Apollonias
-Arabidopsis thaliana
-Aracus
-Arbutus
-Arca noae
-Arctia
-Arctiini
-Ardices
-Ardipithecus
-Argas
-Argia
-Aristea
-arjuna (elephant)
-Armina
-Artace
-Arthrophyllum
-Asca
-asclera (beetle genus)
-Asperitas
-Astacus
-Asteroidea
-Astia
-Astyra
-Atenia
-Atrax
-Atrina fragilis
-Augustia
-Austrogomphus
-Avena
-Aves
-Axiidae
-Azalea
-Azenia
-Babina holsti
-Bacchini
-balfouria (flatworm)
-ballana (leafhopper)
-Barcella
-Baryonyx
-Begonia
-Belbina
-belisarius (scorpion)
-belzebub (prawn)
-Bembidion
-Benincasa
-beroe (ctenophore)
-Bertmainius tingle
-Betta
-Biastes
-Bigelowia
-Bignonia
-Bitia
-Blumea
-Boa constrictor
-Boea
-Bolax
-Bonia
-Boops
-Bougainvillia
-brevinema (bacterium)
-Briza
-Brunonia
-buchneria (millipede)
-bucolion (cockroach)
-Burara
-Buxbaumia
-Cabello
-Cacatua
-Caedicia
-Calanus
-calchas (scorpion)
-Callia
-callianira (ctenophore)
-Caluromys
-Cambarellus
-Camellia
-campion (lacewing)
-Camptonotus
-Campylopus
-candelabrum (hydrozoa)
-Canella
-canelo (tree)
-Canis lupus
-Canis lupus dingo
-Capitata
-Capito
-Caprona
-Capsicum
-Carallia
-Carcinus
-Carlina
-Carpentaria
-Caryanda elegans
-Cassiopea
-Castela
-Castolus
-Cavia
-Ceiba
-Celestus anelpistus
-Cellana
-cellia (subgenus)
-Centaurea
-Cepa
-Cephoidea
-Cercyra
-Cetacea
-Chaetomium thermophilum
-Chaetonotus
-chane (mayfly)
-chapmania (flatworm)
-charidia (skipper)
-charmion (skipper)
-Charpentiera
-Chilena
-Chiroptera
-Chondrocladia
-Chrysanthemum
-Chumma
-chuniella (worm)
-Cicindela
-Cilnia
-cinchona (shrub family)
-Cispia
-Cisthene
-Citipati
-Citronia vasiformis
-claria (rotifer)
-clava (hydrozoa)
-clostridioides difficile (bacteria)
-Clupea
-Clytus
-Coccinella
-Cochlearia
-Coffea canephora
-Cojoba
-colemaniella (worm)
-coleodesmium (worm)
-colle (grape)
-coloradia (dinosaur)
-confucius (leafhopper)
-Conger
-Conistra
-Conta
-Conus
-Conus marmoreus
-coppa (grape)
-copula (jellyfish)
-Corambis
-Corixa
-Corsia
-Cosmophyllum
-Cossinia
-Cotylea
-Craniata
-Critonia
-Crocus
-Cronius
-Cropia
-Crossosoma
-Cryptophagus
-cryptopora (brachiopod)
-Cryptosporidium
-Curculio
-Curtia
-Cyana
-Cyclamen
-Cyclocotyla
-Cyclostomata
-Cyclostomatida
-Cylindera
-Cynanchum louiseae
-Cynara
-Cyrba
-Dacus
-dalla (skipper)
-Daphne laureola
-Daphnia
-Datura
-Davidiella
-Davidsonia
-Decapoda
-Degeneria
-Deinonychus
-deiopea (ctenophore)
-Desmos
-Diaphorus
-Diaprepes
-Dichanthelium clandestinum
-Digitalis
-dikwa (amphipod)
-Dilipa
-dilong (dinosaur)
-dioxys (bee)
-Dipoena
-Dolichocephala
-Doras
-Dracaena braunii
-Draco mindanensis
-drepanophorus (worm)
-Drimiopsis
-Drobeta
-Drymus
-Dufourea
-Dulzura
-Dynastes
-Echeclus
-edwardsiella (bacterium)
-Eidothea
-Elaenia
-Eleutherodactylus
-Elona
-Elseya
-Encolpius
-endere (millipede)
-Enispa
-Enteropneusta
-Erasinus
-Eresus
-eriosoma (aphid)
-Erodium
-Erythronium
-ethesia (acanthaceae)
-Eucalyptus
-Euphorbia
-Euphrasia
-euryglossa (bee)
-Eusarcus
-Evansia
-Extra extra
-fallax (brachiopod)
-Felidae
-Felis
-Fernandezia
-fimbriaria (flatworm)
-florea (millipede)
-Flos
-Forestiera
-Forsythia
-Fossarina
-Fulcinia
-Fulgora
-Fungi
-Fungia
-Fusinus
-Galaxaura
-Galega
-Gaoligongshania
-gargantua (gorilla)
-Gaura
-Gazella
-Gelae
-Gemina
-Geminia
-Geomalacus
-georgium (caddisfly)
-Gergithus
-Geum coccineum
-Giraffa
-Girella zebra
-Glena
-Gnathostomata
-Gobius niger
-Gongora
-Gongylus
-gratia (mayfly)
-gryphus (brachiopod)
-Gymnopodium
-habeas corpus (pig)
-Halenia
-Halesia
-Hallucigenia
-Harmothoe
-Harpa
-Heliophila
-helleria (woodlouse)
-Helvidia
-Hemiaspis
-Hemithea
-Herpetopoma
-Heteroteuthis
-Hibana
-Hippotion
-Holops
-Homo erectus
-Homo sapiens
-Hoplitosaurus
-Hortensia
-Hostus
-Hottea
-Houttuynia
-Huntia
-hydrosaurus (mosasaur)
-Hypodematium
-iassus (leafhopper)
-Ibana
-Iberus
-Ibis
-Icona
-Ilerda
-Imma
-Impatiens
-Insecta
-Isurus
-Ixora
-Jacaranda
-Junco
-Khaan
-kobus (antelope)
-Koppe
-La paloma
-Labyrinthus
-Laetilia
-lampea (ctenophore)
-Lampetra
-Lanceola
-Laurencia
-laureola (woodlouse)
-Laurus
-Lemuria
-Lepidoptera
-Leptonema
-Lepus
-Lestoidea
-Leuconia
-Ligia
-Lilium
-Limax
-Limia
-Limnornis
-liniscus (nematode)
-Listeria
-Livius
-Lobster
-Loefgrenia
-Lonicera
-Loxodonta africana
-Lucanus cervus
-Lunella
-Lurio
-Lusius
-Lytta vesicatoria
-Macroglossinae
-macrophylla (wych elm cultivar)
-Madia
-Madrepora
-magellania (brachiopod)
-Magnolia
-Magnolia grandiflora
-Magnoliopsida
-malena (stork)
-Mammalia
-manis (orangutan)
-Mantica
-Mapusaurus
-Marchena
-Marmorata
-Marmosa
-Marmota
-Martensia
-Massaga
-Medius
-Megaceros
-Megaphyllum
-Meloe
-Mene
-Menziesia
-Mermessus
-Metabolus
-metallus (sawfly)
-Metasequoia
-migros (turkey)
-Milla
-Mimosa
-Mispila
-Mizuhopecten yessoensis
-Molinia
-Molione
-Molycria
-Monaeses
-Monotonia
-Monticola
-Moritasgus
-Morone
-Mussidia
-Mynes
-Myuchelys bellii
-Myuchelys georgesi
-Myuchelys latisternum
-Myxoderma
-Nagaina
-Naja
-Nala
-Napaeus
-Nardoa
-Naubolus
-nausithoe (jellyfish)
-Naxia
-Nectria
-Neis
-nemertes (worm)
-Neomeris
-Nephus
-Nerine
-Nicodamus
-Nitrospira
-Nitzschia
-Nycteris
-octodon (plant genus)
-odius (amphipod)
-Oeax
-Omphalotropis
-Opas
-Orcinus
-Ornitholestes
-Ornithopus
-Orsinome
-Oxyuris
-Pachyuromys
-padina (algae)
-Pajanelia
-Palisa
-Panthea
-Papilio
-Papilio buddha
-Papilionidae
-parachela (tardigrade)
-Paratheria
-parkinsonia (ammonite)
-Parthenium
-Parus
-Passer
-Patrinia
-Perdix
-Perilla frutescens
-Perizoma
-Persoonia
-Petricola
-Petronia
-Pharnacia
-Phassus
-Pheia
-Pheretima
-Philotis
-Phlegon
-Phlogius
-Phlox
-Pholas dactylus
-Phyllanthus distichus
-Phyllium
-Phytoecia
-Piaya
-Pieridae
-Pilia
-Pirula
-Pisania
-Pistoria
-plasmodium (slime mold)
-Platanus
-Plautia
-Pluteus
-Podalia
-Poliana
-Polistes canadensis
-Polypodium australe
-pompholyx (rotifer)
-Porius
-Posidonia
-Potamanthus yooni
-Potos
-Primates
-Problema
-Procolobus
-Pronous
-Prothoe
-Pseudonaja mengdeni
-Psylla
-Ptelea
-Pteronotus
-Pullimosina
-puncha (snakefly)
-Pusa
-pygora (goat)
-Pyrnus
-Pyrola
-Pyrus
-Pythium
-Quintilia
-Rafflesia
-Rattus norvegicus
-Rattus rattus
-Rhene
-Rhipicephalus
-Rhombodera
-Rhombodera extensicollis
-Rhombodera megaera
-Rosales
-Rotaria
-Roussea
-Rubia
-Rugosa
-Salamandra
-Salpingotus
-Salticus coronatus
-Salvia
-Samanea
-Samaris
-Savarna
-Scarabaeus
-Schiedea
-Schizopetalum
-Scopula gracilis
-seira (springtail)
-Selimus
-serina (grape)
-Setina
-Sibbaldia
-Sicyonia
-Sideroxylon
-Silene
-Sinea
-Siphonaptera
-Sirenia
-Sithon
-Smilax
-soa (barklice)
-Solanum
-Solidago houghtonii
-Sonneratia
-Sorex dispar
-Sorex longirostris
-Sorghum
-Spea
-Spelobia
-sphingidae (hawk moths)
-Squalus clarkae
-Stegonotus
-Stenonia
-Stephania
-Stichius
-Strombus
-strongylus (nematode)
-Struthiomimus
-suca (lacewing)
-sulcia (bacteria)
-superba (elm hybrid)
-Swan
-Sybota
-Sydowia
-Syllis
-Syneta
-Tainia
-taku (whale)
-Tantilla
-tapejara (pterosaur)
-tapia (tree)
-Tarne
-Tawera
-Telphusa
-Termitomyces schimperi
-Theba
-Theria
-Thestor
-Thomomys
-Thria
-Thunbergia
-Thunnus
-Thyasiridae
-Thyreus
-tinerfe (ctenophore)
-Tiso
-Titanophora
-tortricidae (snakes)
-Tortrix
-Triaenophorus
-Triops
-Trochilus
-tunga (flea)
-Tusitala
-Tutelina
-Tympanuchus cupido pinnatus
-Tyrannosaurus rex
-uga (dog)
-Unenlagia
-Uria
-Ursia
-valentines (grape)
-Vanessa atalanta
-Varicella
-velamen (ctenophore)
-Velociraptor
-Venia
-Vinca
-Vipera
-vitrum (tunicate)
-Vitula
-wolga (rotifer)
-Wulfenia
-Yacolla
-Zalmunna
-Zelia
-Zeuxippus
-Zimmermannella
-zinga (leafhopper)
-Zingel
diff --git a/backend/data/genDbpConflicts.py b/backend/data/genDbpConflicts.py
deleted file mode 100755
index c0d3704..0000000
--- a/backend/data/genDbpConflicts.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re
-import sqlite3
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads DBpedia data from dbpedia/dbpData.db, along with tree-of-life\n"
-usageInfo += "node name data from a sqlite database, and looks for potential\n"
-usageInfo += "conflicts in associating node names with DBpedia-node labels. For\n"
-usageInfo += "example, a node named 'homo sapiens' might have conflicting labels\n"
-usageInfo += "'Homo sapiens', 'homo sapiens (novel)', and 'homo sapiens (song)'.\n"
-usageInfo += "\n"
-usageInfo += "Writes conflict information to file. For each conflict, a line is printed,\n"
-usageInfo += "holding comma-separated DBpedia labels. If the labels include no-parentheses elements,\n"
-usageInfo += "additional tab-indented lines are printed, wholding short-abstracts for those labels.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-dbpDb = "dbpedia/dbpData.db"
-dbFile = "data.db"
-outFile = "conflicts.txt"
-
-# Open dbs
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbpCon = sqlite3.connect(dbpDb)
-dbpCur = dbpCon.cursor()
-# Get node names
-print("Reading node names")
-nodeNames = set()
-for row in dbCur.execute("SELECT name from nodes"):
- nodeNames.add(row[0])
-# Get disambiguation page labels
-print("Reading disambiguation-page labels")
-disambigLabels = set()
-query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
-for (label,) in dbpCur.execute(query):
- disambigLabels.add(label)
-# Find labels with conflicts
-print("Finding conflicting labels")
-nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)")
-nameToVariants = {}
-iterNum = 0
-for (label,) in dbpCur.execute("SELECT label from labels"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
- #
- if label in disambigLabels:
- continue
- name = label.lower()
- if name in nodeNames:
- if name not in nameToVariants:
- nameToVariants[name] = [label]
- elif label not in nameToVariants[name]:
- nameToVariants[name].append(label)
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- subName = match.group(1)
- if subName in nodeNames and match.group(2) != "disambiguation":
- if subName not in nameToVariants:
- nameToVariants[subName] = [name] # Intentionally ignoring case here
- elif name not in nameToVariants[subName]:
- nameToVariants[subName].append(name)
-namesToRemove = set()
-for (name, variants) in nameToVariants.items():
- if len(variants) == 1:
- namesToRemove.add(name)
-for name in namesToRemove:
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via taxon-type information
-print("Resolving conflicts using instance-type data")
-taxonTypes = { # Obtained from the DBpedia ontology
- "http://dbpedia.org/ontology/Species",
- "http://dbpedia.org/ontology/Archaea",
- "http://dbpedia.org/ontology/Bacteria",
- "http://dbpedia.org/ontology/Eukaryote",
- "http://dbpedia.org/ontology/Plant",
- "http://dbpedia.org/ontology/ClubMoss",
- "http://dbpedia.org/ontology/Conifer",
- "http://dbpedia.org/ontology/CultivatedVariety",
- "http://dbpedia.org/ontology/Cycad",
- "http://dbpedia.org/ontology/Fern",
- "http://dbpedia.org/ontology/FloweringPlant",
- "http://dbpedia.org/ontology/Grape",
- "http://dbpedia.org/ontology/Ginkgo",
- "http://dbpedia.org/ontology/Gnetophytes",
- "http://dbpedia.org/ontology/GreenAlga",
- "http://dbpedia.org/ontology/Moss",
- "http://dbpedia.org/ontology/Fungus",
- "http://dbpedia.org/ontology/Animal",
- "http://dbpedia.org/ontology/Fish",
- "http://dbpedia.org/ontology/Crustacean",
- "http://dbpedia.org/ontology/Mollusca",
- "http://dbpedia.org/ontology/Insect",
- "http://dbpedia.org/ontology/Arachnid",
- "http://dbpedia.org/ontology/Amphibian",
- "http://dbpedia.org/ontology/Reptile",
- "http://dbpedia.org/ontology/Bird",
- "http://dbpedia.org/ontology/Mammal",
- "http://dbpedia.org/ontology/Cat",
- "http://dbpedia.org/ontology/Dog",
- "http://dbpedia.org/ontology/Horse",
-}
-iterNum = 0
-for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
- #
- if type in taxonTypes:
- name = label.lower()
- if name in nameToVariants:
- del nameToVariants[name]
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- name = match.group(1)
- if name in nameToVariants:
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via category-list
- # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
-print("Resolving conflicts using category-list")
-generalCategories = {
- "species", "genus",
- "plant", "fungus", "animal",
- "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
- "fish", "amphibian", "reptile", "bird", "mammal",
-}
-specificCategories = {
- "protist", "alveolate", "dinoflagellates",
- "orchid", "Poaceae", "fern", "moss", "alga",
- "bryozoan", "hydrozoan",
- "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
- "bivalve", "gastropod", "chiton",
- "shrimp", "decapod", "crab", "barnacle", "copepod",
- "arachnid", "spider", "harvestman", "mite",
- "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
- "beetle", "fly", "butterfly", "moth", "wasp",
- "catfish",
- "frog",
- "lizard",
- "horse", "sheep", "cattle", "mouse",
-}
-namesToRemove = set()
-for (name, variants) in nameToVariants.items():
- found = False
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2) in generalCategories:
- namesToRemove.add(name)
- found = True
- break
- if not found:
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2) in specificCategories:
- namesToRemove.add(name)
- break
-for name in namesToRemove:
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Find descriptions for plain-named labels
-print("Finding descriptions for plain-named labels")
-labelToDesc = {}
-iterNum = 0
-query = "SELECT label, abstract from labels INNER JOIN abstracts ON labels.iri = abstracts.iri"
-for (label, desc,) in dbpCur.execute(query):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
- #
- if label.lower() in nameToVariants:
- labelToDesc[label] = desc
-print("Finding descriptions for redirect-resolved labels")
-iterNum = 0
-query = "SELECT label, abstract from labels" \
- " INNER JOIN redirects ON labels.iri = redirects.iri INNER JOIN abstracts ON redirects.target = abstracts.iri"
-for (label, desc,) in dbpCur.execute(query):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
- #
- if label.lower() in nameToVariants:
- labelToDesc[label] = desc
-#
-print("Writing conflict data to file")
-with open(outFile, "w") as file:
- for (name, variants) in nameToVariants.items():
- for n in variants:
- file.write(n + ", ")
- file.write("\n")
- for n in variants:
- if n in labelToDesc:
- file.write(f"\t{n}: {labelToDesc[n]}\n")
-# Close dbs
-dbCon.close()
-dbpCon.close()
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 0655344..887e8a8 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, re
+import sys, os, re
import sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
@@ -13,6 +13,7 @@ if len(sys.argv) > 1:
sys.exit(1)
dbpediaDb = "dbpedia/dbpData.db"
+namesToSkipFile = "dbpNamesToSkip.txt"
pickedLabelsFile = "dbpPickedLabels.txt"
dbFile = "data.db"
@@ -24,8 +25,16 @@ dbCur = dbCon.cursor()
# Get node names
print("Reading node names")
nodeNames = set()
-for row in dbCur.execute("SELECT name from nodes"):
- nodeNames.add(row[0])
+for (name,) in dbCur.execute("SELECT name from nodes"):
+ nodeNames.add(name)
+# Skipping certain names
+print("Checking for names to skip")
+oldSz = len(nodeNames)
+if os.path.exists(namesToSkipFile):
+ with open(namesToSkipFile) as file:
+ for line in file:
+ nodeNames.remove(line.rstrip())
+print(f"Skipping {oldSz - len(nodeNames)} nodes")
# Get disambiguation page labels
print("Reading disambiguation-page labels")
disambigLabels = set()
@@ -57,9 +66,9 @@ for (label,) in dbpCur.execute("SELECT label from labels"):
subName = match.group(1)
if subName in nodeNames and match.group(2) != "disambiguation":
if subName not in nameToVariants:
- nameToVariants[subName] = [name] # Intentionally ignoring case here
+ nameToVariants[subName] = [label]
elif name not in nameToVariants[subName]:
- nameToVariants[subName].append(name)
+ nameToVariants[subName].append(label)
for (name, variants) in nameToVariants.items():
if len(variants) == 1:
nodeToLabel[name] = variants[0]
@@ -67,126 +76,128 @@ for name in nodeToLabel:
del nameToVariants[name]
nodeToLabel["cellular organisms"] = "organism" # Special case for root node
print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via picked-labels
-print("Resolving conflicts using picked-labels")
-with open(pickedLabelsFile) as file:
- for line in file:
- pickedLabel = line.rstrip()
- name = pickedLabel.lower()
- if name in nameToVariants:
- nodeToLabel[name] = pickedLabel
- del nameToVariants[name]
- else:
- match = nameVariantRegex.match(pickedLabel)
- if match == None:
- print(f"WARNING: Picked label {pickedLabel} not found (1)", file=sys.stderr)
+# Try resolving conflicts
+def resolveWithPickedLabels():
+ # Attempts conflict resolution using a file with lines of the form 'name1|label1',
+ # where label1 may be absent, indicating that no label should be associated with the name
+ print("Resolving conflicts using picked-labels")
+ with open(pickedLabelsFile) as file:
+ for line in file:
+ (name, _, label) = line.rstrip().partition("|")
+ if name not in nameToVariants:
+ print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
+ continue
+ if label == "":
+ del nameToVariants[name]
else:
- name = match.group(1)
- if name not in nameToVariants:
- print(f"WARNING: Picked label {pickedLabel} not found (2)", file=sys.stderr)
- else:
- nodeToLabel[name] = pickedLabel
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via category-list
+ if label not in nameToVariants[name]:
+ print(f"WARNING: Picked label \"{label}\" for name \"{name}\" not found", file=sys.stderr)
+ continue
+ nodeToLabel[name] = label
+ del nameToVariants[name]
+ print(f"Remaining number of conflicts: {len(nameToVariants)}")
+def resolveWithCategoryList():
+ # Attempts conflict resolution using category-text in labels of the form 'name1 (category1)'
# Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
-print("Resolving conflicts using category-list")
-generalCategories = {
- "species", "genus",
- "plant", "fungus", "animal",
- "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
- "fish", "amphibian", "reptile", "bird", "mammal",
-}
-specificCategories = {
- "protist", "alveolate", "dinoflagellates",
- "orchid", "Poaceae", "fern", "moss", "alga",
- "bryozoan", "hydrozoan",
- "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
- "bivalve", "gastropod", "chiton",
- "shrimp", "decapod", "crab", "barnacle", "copepod",
- "arachnid", "spider", "harvestman", "mite",
- "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
- "beetle", "fly", "butterfly", "moth", "wasp",
- "catfish",
- "frog",
- "lizard",
- "horse", "sheep", "cattle", "mouse",
-}
-namesToRemove = set()
-for (name, variants) in nameToVariants.items():
- found = False
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2) in generalCategories:
- nodeToLabel[name] = label
- namesToRemove.add(name)
- found = True
- break
- if not found:
+ print("Resolving conflicts using category-list")
+ generalCategories = {
+ "species", "genus",
+ "plant", "fungus", "animal",
+ "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
+ "fish", "amphibian", "reptile", "bird", "mammal",
+ }
+ specificCategories = {
+ "protist", "alveolate", "dinoflagellates",
+ "orchid", "Poaceae", "fern", "moss", "alga",
+ "bryozoan", "hydrozoan",
+ "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
+ "bivalve", "gastropod", "chiton",
+ "shrimp", "decapod", "crab", "barnacle", "copepod",
+ "arachnid", "spider", "harvestman", "mite",
+ "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
+ "beetle", "fly", "butterfly", "moth", "wasp",
+ "catfish",
+ "frog",
+ "lizard",
+ "horse", "sheep", "cattle", "mouse",
+ }
+ namesToRemove = set()
+ for (name, variants) in nameToVariants.items():
+ found = False
for label in variants:
match = nameVariantRegex.match(label)
- if match != None and match.group(2) in specificCategories:
+ if match != None and match.group(2) in generalCategories:
nodeToLabel[name] = label
namesToRemove.add(name)
+ found = True
break
-for name in namesToRemove:
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try conflict resolution via taxon-type information
-print("Resolving conflicts using instance-type data")
-taxonTypes = { # Obtained from the DBpedia ontology
- "http://dbpedia.org/ontology/Species",
- "http://dbpedia.org/ontology/Archaea",
- "http://dbpedia.org/ontology/Bacteria",
- "http://dbpedia.org/ontology/Eukaryote",
- "http://dbpedia.org/ontology/Plant",
- "http://dbpedia.org/ontology/ClubMoss",
- "http://dbpedia.org/ontology/Conifer",
- "http://dbpedia.org/ontology/CultivatedVariety",
- "http://dbpedia.org/ontology/Cycad",
- "http://dbpedia.org/ontology/Fern",
- "http://dbpedia.org/ontology/FloweringPlant",
- "http://dbpedia.org/ontology/Grape",
- "http://dbpedia.org/ontology/Ginkgo",
- "http://dbpedia.org/ontology/Gnetophytes",
- "http://dbpedia.org/ontology/GreenAlga",
- "http://dbpedia.org/ontology/Moss",
- "http://dbpedia.org/ontology/Fungus",
- "http://dbpedia.org/ontology/Animal",
- "http://dbpedia.org/ontology/Fish",
- "http://dbpedia.org/ontology/Crustacean",
- "http://dbpedia.org/ontology/Mollusca",
- "http://dbpedia.org/ontology/Insect",
- "http://dbpedia.org/ontology/Arachnid",
- "http://dbpedia.org/ontology/Amphibian",
- "http://dbpedia.org/ontology/Reptile",
- "http://dbpedia.org/ontology/Bird",
- "http://dbpedia.org/ontology/Mammal",
- "http://dbpedia.org/ontology/Cat",
- "http://dbpedia.org/ontology/Dog",
- "http://dbpedia.org/ontology/Horse",
-}
-iterNum = 0
-for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
- #
- if type in taxonTypes:
- name = label.lower()
- if name in nameToVariants:
- nodeToLabel[name] = label
- del nameToVariants[name]
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- name = match.group(1)
- if name in nameToVariants:
+ if not found:
+ for label in variants:
+ match = nameVariantRegex.match(label)
+ if match != None and match.group(2) in specificCategories:
nodeToLabel[name] = label
- del nameToVariants[name]
-print(f"Number of conflicts: {len(nameToVariants)}")
+ namesToRemove.add(name)
+ break
+ for name in namesToRemove:
+ del nameToVariants[name]
+ print(f"Remaining number of conflicts: {len(nameToVariants)}")
+def resolveWithTypeData():
+ # Attempts conflict-resolution using dbpedia's instance-type data
+ print("Resolving conflicts using instance-type data")
+ taxonTypes = { # Obtained from the DBpedia ontology
+ "http://dbpedia.org/ontology/Species",
+ "http://dbpedia.org/ontology/Archaea",
+ "http://dbpedia.org/ontology/Bacteria",
+ "http://dbpedia.org/ontology/Eukaryote",
+ "http://dbpedia.org/ontology/Plant",
+ "http://dbpedia.org/ontology/ClubMoss",
+ "http://dbpedia.org/ontology/Conifer",
+ "http://dbpedia.org/ontology/CultivatedVariety",
+ "http://dbpedia.org/ontology/Cycad",
+ "http://dbpedia.org/ontology/Fern",
+ "http://dbpedia.org/ontology/FloweringPlant",
+ "http://dbpedia.org/ontology/Grape",
+ "http://dbpedia.org/ontology/Ginkgo",
+ "http://dbpedia.org/ontology/Gnetophytes",
+ "http://dbpedia.org/ontology/GreenAlga",
+ "http://dbpedia.org/ontology/Moss",
+ "http://dbpedia.org/ontology/Fungus",
+ "http://dbpedia.org/ontology/Animal",
+ "http://dbpedia.org/ontology/Fish",
+ "http://dbpedia.org/ontology/Crustacean",
+ "http://dbpedia.org/ontology/Mollusca",
+ "http://dbpedia.org/ontology/Insect",
+ "http://dbpedia.org/ontology/Arachnid",
+ "http://dbpedia.org/ontology/Amphibian",
+ "http://dbpedia.org/ontology/Reptile",
+ "http://dbpedia.org/ontology/Bird",
+ "http://dbpedia.org/ontology/Mammal",
+ "http://dbpedia.org/ontology/Cat",
+ "http://dbpedia.org/ontology/Dog",
+ "http://dbpedia.org/ontology/Horse",
+ }
+ iterNum = 0
+ for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f"Processing line {iterNum}")
+ #
+ if type in taxonTypes:
+ name = label.lower()
+ if name in nameToVariants:
+ nodeToLabel[name] = label
+ del nameToVariants[name]
+ else:
+ match = nameVariantRegex.fullmatch(name)
+ if match != None:
+ name = match.group(1)
+ if name in nameToVariants:
+ nodeToLabel[name] = label
+ del nameToVariants[name]
+ print(f"Remaining number of conflicts: {len(nameToVariants)}")
+resolveWithPickedLabels()
# Associate nodes with IRIs
-print("Getting nodes IRIs")
+print("Getting node IRIs")
nodeToIri = {}
iterNum = 0
for (name, label) in nodeToLabel.items():
@@ -212,6 +223,7 @@ for (name, iri) in nodeToIri.items():
# Find descriptions, and add to db
print("Adding node description data")
dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)")
+dbCur.execute("CREATE INDEX descs_id_idx ON descs(wiki_id)") # wiki_id intentionally left non-unique
iterNum = 0
for (name, iri) in nodeToIri.items():
iterNum += 1