aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md45
-rw-r--r--backend/data/dbpPickedLabels.txt657
-rw-r--r--backend/data/dbpedia/README.md25
-rwxr-xr-xbackend/data/dbpedia/genData.py122
-rwxr-xr-xbackend/data/genDbpConflicts.py202
-rwxr-xr-xbackend/data/genDbpData.py227
6 files changed, 1262 insertions, 16 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index c4c46ba..b568f90 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -17,24 +17,37 @@ File Generation Process
3 Use genImgsForWeb.py to create cropped/resized images in img/, using
images in imgsReviewed, and also to add an 'images' table to data.db.
4 Node Description Data
- 1 Obtain data in enwiki/, as specified in it's README.
- 2 Run genEnwikiData.py, which adds a 'descs' table to data.db,
- using data in enwiki/enwikiData.db, and the 'nodes' table.
+ - Using DBpedia
+ 1 Obtain data in dbpedia/, as specified in its README.
+ 2 Run genDbpData.py, which adds a 'descs' table to data.db, using
+ data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table.
+ - Using wikipedia dump (old method)
+ 1 Obtain data in enwiki/, as specified in its README.
+ 2 Run genEnwikiData.py, which adds a 'descs' table to data.db,
+ using data in enwiki/enwikiData.db, and the 'nodes' table.
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds a 'reduced_nodes' table to data.db,
using reducedTol/names.txt, and the 'nodes' and 'names' tables.
-data.db tables
+data.db Tables
==============
-- nodes <br>
- name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT
-- names <br>
- name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
-- eol\_ids <br>
- id INT PRIMARY KEY, name TEXT
-- images <br>
- eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
-- descs <br>
- name TEXT PRIMARY KEY, desc TEXT, redirected INT
-- reduced\_nodes <br>
- name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT
+- nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT
+- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
+- eol\_ids: id INT PRIMARY KEY, name TEXT
+- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
+- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT
+- reduced\_nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT
+
+Other Files
+===========
+- dbpPickedLabels.txt <br>
+ Contains DBpedia labels, one per line. Used by genDbpData.py to help
+ resolve conflicts when associating tree-of-life node names with
+ DBpedia node labels. Was generated by manually editing the output
+ of genDbpConflicts.py.
+- genDbpConflicts.py <br>
+ Reads data from dbpedia/dbpData.db, and the 'nodes' table of data.db,
+ and looks for potential conflicts that would arise when genDbpData.py
+ tries to associate tree-of-life node names with DBpedia node labels. It
+ writes data about them to conflicts.txt, which can be manually edited
+ to resolve them.
diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt
new file mode 100644
index 0000000..80a4770
--- /dev/null
+++ b/backend/data/dbpPickedLabels.txt
@@ -0,0 +1,657 @@
+Abditomys latidens
+Abdopus aculeatus
+Ablerus
+Abralia
+Abramis brama
+Abraxas sylvata
+Abrostola
+Acidobacteria
+Acontia
+Actiniaria
+Actinobacteria
+Addisonia
+Aenetus
+Agapanthus
+Aglais
+Aglaope
+agrotis infusa (bogong moth)
+agrypnia (caddisfly)
+Agyrium
+Ainoa
+Alauda
+Alces
+Aleurotrachelus atratus
+Alexicles
+Alistra
+Allactaga
+Alligator
+Aloe
+Aloe vera
+Aluta
+Alypia
+Amanita
+Amaranthus
+Amaryllis
+Amazona
+Ambrysus
+ambystomatidae (mole salamanders)
+Amphibia
+Amyris
+Anagyrus
+Anania
+Ancylini
+Anemosa
+Angustia
+anhanguera (pterosaur)
+Aniba
+Annona
+Anthene
+Anticrates
+Aparupa
+Aplysina archeri
+Apollonias
+Arabidopsis thaliana
+Aracus
+Arbutus
+Arca noae
+Arctia
+Arctiini
+Ardices
+Ardipithecus
+Argas
+Argia
+Aristea
+arjuna (elephant)
+Armina
+Artace
+Arthrophyllum
+Asca
+asclera (beetle genus)
+Asperitas
+Astacus
+Asteroidea
+Astia
+Astyra
+Atenia
+Atrax
+Atrina fragilis
+Augustia
+Austrogomphus
+Avena
+Aves
+Axiidae
+Azalea
+Azenia
+Babina holsti
+Bacchini
+balfouria (flatworm)
+ballana (leafhopper)
+Barcella
+Baryonyx
+basuto (horse)
+Begonia
+Belbina
+belisarius (scorpion)
+belzebub (prawn)
+Bembidion
+Benincasa
+beroe (ctenophore)
+Bertmainius tingle
+Betta
+Biastes
+Bigelowia
+Bignonia
+Bitia
+Blumea
+Boa constrictor
+Boea
+Bolax
+Bonia
+Boops
+Bougainvillia
+brevinema (bacterium)
+Briza
+Brunonia
+buchneria (millipede)
+bucolion (cockroach)
+Burara
+Buxbaumia
+Cabello
+Cacatua
+Caedicia
+Calanus
+calchas (scorpion)
+Callia
+callianira (ctenophore)
+Caluromys
+Cambarellus
+Camellia
+campion (lacewing)
+Camptonotus
+Campylopus
+candelabrum (hydrozoa)
+Canella
+canelo (tree)
+Canis lupus
+Canis lupus dingo
+Capitata
+Capito
+Caprona
+Capsicum
+Carallia
+Carcinus
+Carlina
+Carpentaria
+Caryanda elegans
+Cassiopea
+Castela
+Castolus
+Cavia
+Ceiba
+Celestus anelpistus
+Cellana
+cellia (subgenus)
+Centaurea
+Cepa
+Cephoidea
+Cercyra
+Cetacea
+Chaetomium thermophilum
+Chaetonotus
+chane (mayfly)
+chapmania (flatworm)
+charidia (skipper)
+charmion (skipper)
+Charpentiera
+Chilena
+Chiroptera
+Chondrocladia
+Chrysanthemum
+Chumma
+chuniella (worm)
+Cicindela
+Cilnia
+cinchona (shrub family)
+Cispia
+Cisthene
+Citipati
+Citronia vasiformis
+claria (rotifer)
+clava (hydrozoa)
+clostridioides difficile (bacteria)
+Clupea
+Clytus
+Coccinella
+Cochlearia
+Coffea canephora
+Cojoba
+colemaniella (worm)
+coleodesmium (worm)
+colle (grape)
+coloradia (dinosaur)
+confucius (leafhopper)
+Conger
+Conistra
+Conta
+Conus
+Conus marmoreus
+coppa (grape)
+copula (jellyfish)
+Corambis
+Corixa
+Corsia
+Cosmophyllum
+Cossinia
+Cotylea
+Craniata
+Critonia
+Crocus
+Cronius
+Cropia
+Crossosoma
+Cryptophagus
+cryptopora (brachiopod)
+Cryptosporidium
+Curculio
+Curtia
+Cyana
+Cyclamen
+Cyclocotyla
+Cyclostomata
+Cyclostomatida
+Cylindera
+Cynanchum louiseae
+Cynara
+Cyrba
+Dacus
+dalla (skipper)
+Daphne laureola
+Daphnia
+Datura
+Davidiella
+Davidsonia
+Decapoda
+Degeneria
+Deinonychus
+deiopea (ctenophore)
+Desmos
+Diaphorus
+Diaprepes
+Dichanthelium clandestinum
+Digitalis
+dikwa (amphipod)
+Dilipa
+dilong (dinosaur)
+dioxys (bee)
+Dipoena
+Dolichocephala
+Doras
+Dracaena braunii
+Draco mindanensis
+drepanophorus (worm)
+Drimiopsis
+Drobeta
+Drymus
+Dufourea
+Dulzura
+Dynastes
+Echeclus
+edwardsiella (bacterium)
+Eidothea
+Elaenia
+Eleutherodactylus
+Elona
+Elseya
+Encolpius
+endere (millipede)
+Enispa
+Enteropneusta
+Erasinus
+Eresus
+eriosoma (aphid)
+Erodium
+Erythronium
+ethesia (acanthaceae)
+Eucalyptus
+Euphorbia
+Euphrasia
+euryglossa (bee)
+Eusarcus
+Evansia
+Extra extra
+fallax (brachiopod)
+Felidae
+Felis
+Fernandezia
+fimbriaria (flatworm)
+florea (millipede)
+Flos
+Forestiera
+Forsythia
+Fossarina
+Fulcinia
+Fulgora
+Fungi
+Fungia
+Fusinus
+Galaxaura
+Galega
+Gaoligongshania
+gargantua (gorilla)
+Gaura
+Gazella
+Gelae
+Gemina
+Geminia
+Geomalacus
+georgium (caddisfly)
+Gergithus
+Geum coccineum
+Giraffa
+Girella zebra
+Glena
+Gnathostomata
+Gobius niger
+Gongora
+Gongylus
+gratia (mayfly)
+gryphus (brachiopod)
+Gymnopodium
+habeas corpus (pig)
+Halenia
+Halesia
+halla (horse)
+Hallucigenia
+Harmothoe
+Harpa
+Heliophila
+helleria (woodlouse)
+Helvidia
+Hemiaspis
+Hemithea
+Herpetopoma
+Heteroteuthis
+Hibana
+Hippotion
+Holops
+Homo erectus
+Homo sapiens
+Hoplitosaurus
+Hortensia
+Hostus
+Hottea
+Houttuynia
+Huntia
+hydrosaurus (mosasaur)
+Hypodematium
+iassus (leafhopper)
+Ibana
+Iberus
+Ibis
+Icona
+Ilerda
+Imma
+Impatiens
+Insecta
+Isurus
+Ixora
+Jacaranda
+Junco
+Khaan
+kobus (antelope)
+Koppe
+La paloma
+Labyrinthus
+Laetilia
+lampea (ctenophore)
+Lampetra
+Lanceola
+Laurencia
+laureola (woodlouse)
+Laurus
+Lemuria
+Lepidoptera
+Leptonema
+Lepus
+Lestoidea
+Leuconia
+Ligia
+Lilium
+Limax
+Limia
+Limnornis
+liniscus (nematode)
+Listeria
+Livius
+Lobster
+Loefgrenia
+Lonicera
+Loxodonta africana
+Lucanus cervus
+Lunella
+Lurio
+Lusius
+Lytta vesicatoria
+Macroglossinae
+macrophylla (wych elm cultivar)
+Madia
+Madrepora
+magellania (brachiopod)
+Magnolia
+Magnolia grandiflora
+Magnoliopsida
+malena (stork)
+Mammalia
+manis (orangutan)
+Mantica
+Mapusaurus
+Marchena
+Marmorata
+Marmosa
+Marmota
+Martensia
+Massaga
+Medius
+Megaceros
+Megaphyllum
+Meloe
+Mene
+Menziesia
+Mermessus
+Metabolus
+metallus (sawfly)
+Metasequoia
+migros (turkey)
+Milla
+Mimosa
+Mispila
+Mizuhopecten yessoensis
+Molinia
+Molione
+Molycria
+Monaeses
+Monotonia
+Monticola
+Moritasgus
+Morone
+Mussidia
+Mynes
+Myuchelys bellii
+Myuchelys georgesi
+Myuchelys latisternum
+Myxoderma
+Nagaina
+Naja
+Nala
+Napaeus
+Nardoa
+Naubolus
+nausithoe (jellyfish)
+Naxia
+Nectria
+Neis
+nemertes (worm)
+Neomeris
+Nephus
+Nerine
+Nicodamus
+Nitrospira
+Nitzschia
+Nycteris
+octodon (plant genus)
+odius (amphipod)
+Oeax
+Omphalotropis
+Opas
+Orcinus
+Ornitholestes
+Ornithopus
+Orsinome
+Oxyuris
+Pachyuromys
+padina (algae)
+Pajanelia
+Palisa
+Panthea
+Papilio
+Papilio buddha
+Papilionidae
+parachela (tardigrade)
+Paratheria
+parkinsonia (ammonite)
+Parthenium
+Parus
+Passer
+Patrinia
+Perdix
+Perilla frutescens
+Perizoma
+Persoonia
+Petricola
+Petronia
+Pharnacia
+Phassus
+Pheia
+Pheretima
+Philotis
+Phlegon
+Phlogius
+Phlox
+Pholas dactylus
+Phyllanthus distichus
+Phyllium
+Phytoecia
+Piaya
+Pieridae
+Pilia
+Pirula
+Pisania
+Pistoria
+plasmodium (slime mold)
+Platanus
+Plautia
+Pluteus
+Podalia
+Poliana
+Polistes canadensis
+Polypodium australe
+pompholyx (rotifer)
+Porius
+Posidonia
+Potamanthus yooni
+Potos
+Primates
+Problema
+Procolobus
+Pronous
+Prothoe
+Pseudonaja mengdeni
+Psylla
+Ptelea
+Pteronotus
+Pullimosina
+puncha (snakefly)
+Pusa
+pygora (goat)
+Pyrnus
+Pyrola
+Pyrus
+Pythium
+Quintilia
+Rafflesia
+Rattus norvegicus
+Rattus rattus
+Rhene
+Rhipicephalus
+Rhombodera
+Rhombodera extensicollis
+Rhombodera megaera
+Rosales
+Rotaria
+Roussea
+Rubia
+Rugosa
+Salamandra
+Salpingotus
+Salticus coronatus
+Salvia
+Samanea
+Samaris
+Savarna
+Scarabaeus
+Schiedea
+Schizopetalum
+Scopula gracilis
+seira (springtail)
+Selimus
+serina (grape)
+Setina
+Sibbaldia
+Sicyonia
+Sideroxylon
+Silene
+Sinea
+Siphonaptera
+Sirenia
+Sithon
+Smilax
+soa (barklice)
+Solanum
+Solidago houghtonii
+Sonneratia
+Sorex dispar
+Sorex longirostris
+Sorghum
+Spea
+Spelobia
+sphingidae (hawk moths)
+Squalus clarkae
+Stegonotus
+Stenonia
+Stephania
+Stichius
+Strombus
+strongylus (nematode)
+Struthiomimus
+suca (lacewing)
+sulcia (bacteria)
+superba (elm hybrid)
+Swan
+Sybota
+Sydowia
+Syllis
+Syneta
+Tainia
+taku (whale)
+Tantilla
+tapejara (pterosaur)
+tapia (tree)
+Tarne
+Tawera
+Telphusa
+Termitomyces schimperi
+Theba
+Thestor
+Thomomys
+Thria
+Thunbergia
+Thunnus
+Thyasiridae
+Thyreus
+tinerfe (ctenophore)
+Tiso
+Titanophora
+tokara (horse)
+tortricidae (snakes)
+Tortrix
+Triaenophorus
+Triops
+Trochilus
+tunga (flea)
+Tusitala
+Tutelina
+Tympanuchus cupido pinnatus
+Tyrannosaurus rex
+uga (dog)
+Unenlagia
+Uria
+Ursia
+valentines (grape)
+Vanessa atalanta
+Varicella
+velamen (ctenophore)
+Velociraptor
+Venia
+Vinca
+Vipera
+vitrum (tunicate)
+Vitula
+wolga (rotifer)
+Wulfenia
+Yacolla
+Zalmunna
+Zelia
+Zeuxippus
+Zimmermannella
+zinga (leafhopper)
+Zingel
diff --git a/backend/data/dbpedia/README.md b/backend/data/dbpedia/README.md
new file mode 100644
index 0000000..0e7c266
--- /dev/null
+++ b/backend/data/dbpedia/README.md
@@ -0,0 +1,25 @@
+Downloaded Files
+================
+- labels\_lang=en.ttl.bz2 <br>
+ Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core,
+ using the link <https://databus.dbpedia.org/dbpedia/generic/labels/2022.03.01/labels_lang=en.ttl.bz2>.
+- redirects\_lang=en\_transitive.ttl.bz2 <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/redirects/2022.03.01/redirects_lang=en_transitive.ttl.bz2>.
+- disambiguations\_lang=en.ttl.bz2 <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/disambiguations/2022.03.01/disambiguations_lang=en.ttl.bz2>.
+- instance-types\_lang=en\_specific.ttl.bz2 <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/mappings/instance-types/2022.03.01/instance-types_lang=en_specific.ttl.bz2>.
+- short-abstracts\_lang=en.ttl.bz2 <br>
+ Downloaded from <https://databus.dbpedia.org/vehnem/text/short-abstracts/2021.05.01/short-abstracts_lang=en.ttl.bz2>.
+
+Generated Files
+===============
+- dbpData.db <br>
+ An sqlite database representing data from the ttl files.
+ Generated by running genData.py.
+ Tables
+ - labels: iri TEXT PRIMARY KEY, label TEXT
+ - redirects: iri TEXT PRIMARY KEY, target TEXT
+ - disambiguations: iri TEXT PRIMARY KEY
+ - types: iri TEXT, type TEXT
+ - abstracts: iri TEXT PRIMARY KEY, abstract TEXT
diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py
new file mode 100755
index 0000000..e147641
--- /dev/null
+++ b/backend/data/dbpedia/genData.py
@@ -0,0 +1,122 @@
#!/usr/bin/python3
"""Load the downloaded DBpedia Turtle dumps into a sqlite database.

Each bz2-compressed dump (looked up relative to the current directory) is
parsed line by line and stored in its own table of dbpData.db: labels,
redirects, disambiguations, types and abstracts.
"""

import sys, re
import bz2, sqlite3

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads DBpedia labels+types+redirects+abstracts data,\n"
usageInfo += "and creates a sqlite db containing that data.\n"

labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines
redirectsFile = "redirects_lang=en_transitive.ttl.bz2"
disambigFile = "disambiguations_lang=en.ttl.bz2"
typesFile = "instance-types_lang=en_specific.ttl.bz2"
abstractsFile = "short-abstracts_lang=en.ttl.bz2"
dbFile = "dbpData.db"

# Matches: <subject-iri> <predicate-iri> "literal"@en .
labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n')
# Matches: <subject-iri> <predicate-iri> <object-iri> .
redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')


class FormatError(Exception):
    """Raised when a dump line does not match the expected triple format."""


def readTriples(fileName, lineRegex, skipComments=False):
    """Yield a (subject, object) pair for every line of a bz2-compressed dump.

    fileName: path of the .ttl.bz2 file to read.
    lineRegex: compiled pattern whose groups 1 and 2 are the subject and object.
    skipComments: if true, lines starting with '#' are ignored.

    Raises FormatError for any other line that does not fully match lineRegex.
    """
    with bz2.open(fileName, mode='rt', encoding='utf-8') as file:
        for lineNum, line in enumerate(file, start=1):
            if lineNum % 100000 == 0:
                print("Processing line {}".format(lineNum))
            if skipComments and line.startswith("#"):
                continue
            match = lineRegex.fullmatch(line)
            if match is None:
                raise FormatError("ERROR: Line {} has unexpected format".format(lineNum))
            yield match.group(1), match.group(2)


def storeData(dbCon):
    """Create the five tables in dbCon and fill them from the dump files."""
    dbCur = dbCon.cursor()
    # Read/store labels
    print("Reading/storing label data")
    dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)")
    dbCur.execute("CREATE INDEX labels_idx ON labels(label COLLATE NOCASE)")
    # NOTE(review): labels are stored as they appear in the dump (an escaped \"
    # is kept as-is), unlike abstracts below -- confirm whether that is intended.
    dbCur.executemany("INSERT INTO labels VALUES (?, ?)",
        readTriples(labelsFile, labelLineRegex))
    dbCon.commit()
    # Read/store redirects
    print("Reading/storing redirection data")
    dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)")
    dbCur.executemany("INSERT INTO redirects VALUES (?, ?)",
        readTriples(redirectsFile, redirLineRegex))
    dbCon.commit()
    # Read/store disambiguation-page data
    print("Reading/storing disambiguation-page data")
    # A subject may appear on several lines, so collect the distinct subjects first
    disambigNames = {iri for (iri, _) in readTriples(disambigFile, redirLineRegex)}
    dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)")
    dbCur.executemany("INSERT INTO disambiguations VALUES (?)",
        ((iri,) for iri in disambigNames))
    dbCon.commit()
    # Read/store instance-type
    print("Reading/storing instance-type data")
    dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)")
    dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)")
    dbCur.executemany("INSERT INTO types VALUES (?, ?)",
        readTriples(typesFile, redirLineRegex))
    dbCon.commit()
    # Read/store abstracts
    print("Reading/storing abstracts")
    dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)")
    dbCur.executemany("INSERT INTO abstracts VALUES (?, ?)",
        ((iri, abstract.replace(r'\"', '"'))
            for (iri, abstract) in readTriples(abstractsFile, labelLineRegex, skipComments=True)))
    dbCon.commit()


def main():
    """Script entry point: check arguments, then build the database.

    Exits with status 1 on unexpected arguments or on a malformed dump line.
    Tables committed before a malformed line is met are kept.
    """
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    dbCon = sqlite3.connect(dbFile)
    try:
        storeData(dbCon)
    except FormatError as err:
        print(err, file=sys.stderr)
        sys.exit(1)
    finally:
        # Always release the db file, including on the error path
        dbCon.close()


if __name__ == "__main__":
    main()
diff --git a/backend/data/genDbpConflicts.py b/backend/data/genDbpConflicts.py
new file mode 100755
index 0000000..0ad4e1e
--- /dev/null
+++ b/backend/data/genDbpConflicts.py
@@ -0,0 +1,202 @@
#!/usr/bin/python3
"""Report node names that match more than one DBpedia label.

Reads labels, disambiguations, types, redirects and abstracts from
dbpedia/dbpData.db and node names from the 'nodes' table of data.db, drops the
conflicts that instance-type data or a parenthesised category can settle, and
writes the remaining ones to conflicts.txt.
"""

import sys, re
import sqlite3

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads DBpedia data from dbpedia/dbpData.db, along with tree-of-life\n"
usageInfo += "node name data from a sqlite database, and looks for potential\n"
usageInfo += "conflicts in associating node names with DBpedia-node labels. For\n"
usageInfo += "example, a node named 'homo sapiens' might have conflicting labels\n"
usageInfo += "'Homo sapiens', 'homo sapiens (novel)', and 'homo sapiens (song)'.\n"
usageInfo += "\n"
usageInfo += "Writes conflict information to file. For each conflict, a line is printed,\n"
usageInfo += "holding comma-separated DBpedia labels. If the labels include no-parentheses elements,\n"
usageInfo += "additional tab-indented lines are printed, holding short-abstracts for those labels.\n"

dbpDb = "dbpedia/dbpData.db"
dbFile = "data.db"
outFile = "conflicts.txt"

# Splits 'name (qualifier)' into the name and the qualifier
nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)")

taxonTypes = { # Obtained from the DBpedia ontology
    "http://dbpedia.org/ontology/Species",
    "http://dbpedia.org/ontology/Archaea",
    "http://dbpedia.org/ontology/Bacteria",
    "http://dbpedia.org/ontology/Eukaryote",
    "http://dbpedia.org/ontology/Plant",
    "http://dbpedia.org/ontology/ClubMoss",
    "http://dbpedia.org/ontology/Conifer",
    "http://dbpedia.org/ontology/CultivatedVariety",
    "http://dbpedia.org/ontology/Cycad",
    "http://dbpedia.org/ontology/Fern",
    "http://dbpedia.org/ontology/FloweringPlant",
    "http://dbpedia.org/ontology/Grape",
    "http://dbpedia.org/ontology/Ginkgo",
    "http://dbpedia.org/ontology/Gnetophytes",
    "http://dbpedia.org/ontology/GreenAlga",
    "http://dbpedia.org/ontology/Moss",
    "http://dbpedia.org/ontology/Fungus",
    "http://dbpedia.org/ontology/Animal",
    "http://dbpedia.org/ontology/Fish",
    "http://dbpedia.org/ontology/Crustacean",
    "http://dbpedia.org/ontology/Mollusca",
    "http://dbpedia.org/ontology/Insect",
    "http://dbpedia.org/ontology/Arachnid",
    "http://dbpedia.org/ontology/Amphibian",
    "http://dbpedia.org/ontology/Reptile",
    "http://dbpedia.org/ontology/Bird",
    "http://dbpedia.org/ontology/Mammal",
    "http://dbpedia.org/ontology/Cat",
    "http://dbpedia.org/ontology/Dog",
    "http://dbpedia.org/ontology/Horse",
}
generalCategories = {
    "species", "genus",
    "plant", "fungus", "animal",
    "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
    "fish", "amphibian", "reptile", "bird", "mammal",
}
# NOTE(review): parenthesised variants are stored lower-cased, so the capitalised
# "Poaceae" entry looks like it can hardly ever match -- confirm before changing.
specificCategories = {
    "protist", "alveolate", "dinoflagellates",
    "orchid", "Poaceae", "fern", "moss", "alga",
    "bryozoan", "hydrozoan",
    "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
    "bivalve", "gastropod", "chiton",
    "shrimp", "decapod", "crab", "barnacle", "copepod",
    "arachnid", "spider", "harvestman", "mite",
    "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
    "beetle", "fly", "butterfly", "moth", "wasp",
    "catfish",
    "frog",
    "lizard",
    "horse", "sheep", "cattle", "mouse",
}


def readDisambigLabels(dbpCur):
    """Return the set of labels whose IRI is listed in the disambiguations table."""
    # The label column is selected (not the iri), because the result is
    # compared against label strings by the caller
    query = "SELECT labels.label from labels" \
        " INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
    return {label for (label,) in dbpCur.execute(query)}


def findVariants(dbpCur, nodeNames, disambigLabels):
    """Map each node name to the DBpedia labels that could refer to it.

    A label is a variant of a name if it equals the name ignoring case, or if
    it has the form 'name (qualifier)'. Labels in disambigLabels, and labels
    whose qualifier is 'disambiguation', are ignored. Exact-name labels keep
    their case; qualified ones are stored lower-cased.
    """
    nameToVariants = {}
    rows = dbpCur.execute("SELECT label from labels")
    for iterNum, (label,) in enumerate(rows, start=1):
        if iterNum % 100000 == 0:
            print("Processing line {}".format(iterNum))
        if label in disambigLabels:
            continue
        name = label.lower()
        if name in nodeNames:
            variants = nameToVariants.setdefault(name, [])
            if label not in variants:
                variants.append(label)
            continue
        match = nameVariantRegex.fullmatch(name)
        if match is None:
            continue
        subName = match.group(1)
        if subName in nodeNames and match.group(2) != "disambiguation":
            variants = nameToVariants.setdefault(subName, [])
            if name not in variants: # Intentionally ignoring case here
                variants.append(name)
    return nameToVariants


def dropTypedConflicts(dbpCur, nameToVariants):
    """Remove conflicts for which some matching label has a taxon instance-type."""
    query = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
    for iterNum, (label, typeIri) in enumerate(dbpCur.execute(query), start=1):
        if iterNum % 100000 == 0:
            print("Processing line {}".format(iterNum))
        if typeIri not in taxonTypes:
            continue
        name = label.lower()
        if name not in nameToVariants:
            match = nameVariantRegex.fullmatch(name)
            if match is None:
                continue
            name = match.group(1)
        nameToVariants.pop(name, None)


def pickCategoryLabel(variants):
    """Return the first variant qualified by a known category, or None.

    General categories are tried before specific ones (avoids stuff like Pan
    being classified as a horse instead of an ape).
    """
    for categories in (generalCategories, specificCategories):
        for label in variants:
            match = nameVariantRegex.match(label)
            if match is not None and match.group(2) in categories:
                return label
    return None


def findDescs(dbpCur, nameToVariants):
    """Map plain labels of conflicting names to their (redirect-resolved) abstract."""
    labelToDesc = {}
    print("Finding descriptions for plain-named labels")
    directQuery = "SELECT label, abstract from labels" \
        " INNER JOIN abstracts ON labels.iri = abstracts.iri"
    print_every = 100000
    for iterNum, (label, desc) in enumerate(dbpCur.execute(directQuery), start=1):
        if iterNum % print_every == 0:
            print("Processing line {}".format(iterNum))
        if label.lower() in nameToVariants:
            labelToDesc[label] = desc
    print("Finding descriptions for redirect-resolved labels")
    redirQuery = "SELECT label, abstract from labels" \
        " INNER JOIN redirects ON labels.iri = redirects.iri" \
        " INNER JOIN abstracts ON redirects.target = abstracts.iri"
    for iterNum, (label, desc) in enumerate(dbpCur.execute(redirQuery), start=1):
        if iterNum % print_every == 0:
            print("Processing line {}".format(iterNum))
        if label.lower() in nameToVariants:
            labelToDesc[label] = desc
    return labelToDesc


def main():
    """Script entry point: find unresolved label conflicts and write them to outFile."""
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    # Open dbs
    dbCon = sqlite3.connect(dbFile)
    dbpCon = sqlite3.connect(dbpDb)
    try:
        dbCur = dbCon.cursor()
        dbpCur = dbpCon.cursor()
        # Get node names
        print("Reading node names")
        nodeNames = {row[0] for row in dbCur.execute("SELECT name from nodes")}
        # Get disambiguation page labels
        print("Reading disambiguation-page labels")
        disambigLabels = readDisambigLabels(dbpCur)
        # Find labels with conflicts (names with a single variant are unambiguous)
        print("Finding conflicting labels")
        nameToVariants = {
            name: variants
            for (name, variants) in findVariants(dbpCur, nodeNames, disambigLabels).items()
            if len(variants) > 1
        }
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Try conflict resolution via taxon-type information
        print("Resolving conflicts using instance-type data")
        dropTypedConflicts(dbpCur, nameToVariants)
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Try conflict resolution via category-list
        print("Resolving conflicts using category-list")
        resolved = [name for (name, variants) in nameToVariants.items()
            if pickCategoryLabel(variants) is not None]
        for name in resolved:
            del nameToVariants[name]
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Find descriptions for plain-named labels
        labelToDesc = findDescs(dbpCur, nameToVariants)
        # One line of labels per conflict, followed by tab-indented abstracts
        print("Writing conflict data to file")
        with open(outFile, "w", encoding="utf-8") as file:
            for variants in nameToVariants.values():
                file.write("".join(label + ", " for label in variants) + "\n")
                for label in variants:
                    if label in labelToDesc:
                        file.write("\t{}: {}\n".format(label, labelToDesc[label]))
    finally:
        # Close dbs
        dbCon.close()
        dbpCon.close()


if __name__ == "__main__":
    main()
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
new file mode 100755
index 0000000..6cc8d33
--- /dev/null
+++ b/backend/data/genDbpData.py
@@ -0,0 +1,227 @@
#!/usr/bin/python3
"""Add a 'descs' table of DBpedia short-abstracts to data.db.

Node names from the 'nodes' table of data.db are matched against labels in
dbpedia/dbpData.db. Names matching several labels are resolved, in order, via
parenthesised categories, instance-type data, and dbpPickedLabels.txt. The
chosen label's IRI is followed through the redirects table, and the abstract
found for it is stored together with a flag telling whether a redirect was
followed.
"""

import sys, re
import sqlite3

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads DBpedia data from dbpedia/*, along with tree-of-life\n"
usageInfo += "node and name data from a sqlite database, associates nodes with\n"
usageInfo += "DBpedia IRIs, and adds alt-name and description information for\n"
usageInfo += "those nodes.\n"

dbpediaDb = "dbpedia/dbpData.db"
pickedLabelsFile = "dbpPickedLabels.txt"
dbFile = "data.db"

# Splits 'name (qualifier)' into the name and the qualifier
nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)")

generalCategories = {
    "species", "genus",
    "plant", "fungus", "animal",
    "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
    "fish", "amphibian", "reptile", "bird", "mammal",
}
# NOTE(review): parenthesised variants are stored lower-cased, so the capitalised
# "Poaceae" entry looks like it can hardly ever match -- confirm before changing.
specificCategories = {
    "protist", "alveolate", "dinoflagellates",
    "orchid", "Poaceae", "fern", "moss", "alga",
    "bryozoan", "hydrozoan",
    "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
    "bivalve", "gastropod", "chiton",
    "shrimp", "decapod", "crab", "barnacle", "copepod",
    "arachnid", "spider", "harvestman", "mite",
    "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
    "beetle", "fly", "butterfly", "moth", "wasp",
    "catfish",
    "frog",
    "lizard",
    "horse", "sheep", "cattle", "mouse",
}
taxonTypes = { # Obtained from the DBpedia ontology
    "http://dbpedia.org/ontology/Species",
    "http://dbpedia.org/ontology/Archaea",
    "http://dbpedia.org/ontology/Bacteria",
    "http://dbpedia.org/ontology/Eukaryote",
    "http://dbpedia.org/ontology/Plant",
    "http://dbpedia.org/ontology/ClubMoss",
    "http://dbpedia.org/ontology/Conifer",
    "http://dbpedia.org/ontology/CultivatedVariety",
    "http://dbpedia.org/ontology/Cycad",
    "http://dbpedia.org/ontology/Fern",
    "http://dbpedia.org/ontology/FloweringPlant",
    "http://dbpedia.org/ontology/Grape",
    "http://dbpedia.org/ontology/Ginkgo",
    "http://dbpedia.org/ontology/Gnetophytes",
    "http://dbpedia.org/ontology/GreenAlga",
    "http://dbpedia.org/ontology/Moss",
    "http://dbpedia.org/ontology/Fungus",
    "http://dbpedia.org/ontology/Animal",
    "http://dbpedia.org/ontology/Fish",
    "http://dbpedia.org/ontology/Crustacean",
    "http://dbpedia.org/ontology/Mollusca",
    "http://dbpedia.org/ontology/Insect",
    "http://dbpedia.org/ontology/Arachnid",
    "http://dbpedia.org/ontology/Amphibian",
    "http://dbpedia.org/ontology/Reptile",
    "http://dbpedia.org/ontology/Bird",
    "http://dbpedia.org/ontology/Mammal",
    "http://dbpedia.org/ontology/Cat",
    "http://dbpedia.org/ontology/Dog",
    "http://dbpedia.org/ontology/Horse",
}


def readDisambigLabels(dbpCur):
    """Return the set of labels whose IRI is listed in the disambiguations table."""
    # The label column is selected (not the iri), because the result is
    # compared against label strings by the caller
    query = "SELECT labels.label from labels" \
        " INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
    return {label for (label,) in dbpCur.execute(query)}


def findVariants(dbpCur, nodeNames, disambigLabels):
    """Map each node name to the DBpedia labels that could refer to it.

    A label is a variant of a name if it equals the name ignoring case, or if
    it has the form 'name (qualifier)'. Labels in disambigLabels, and labels
    whose qualifier is 'disambiguation', are ignored. Exact-name labels keep
    their case; qualified ones are stored lower-cased.
    """
    nameToVariants = {}
    rows = dbpCur.execute("SELECT label from labels")
    for iterNum, (label,) in enumerate(rows, start=1):
        if iterNum % 100000 == 0:
            print("Processing line {}".format(iterNum))
        if label in disambigLabels:
            continue
        name = label.lower()
        if name in nodeNames:
            variants = nameToVariants.setdefault(name, [])
            if label not in variants:
                variants.append(label)
            continue
        match = nameVariantRegex.fullmatch(name)
        if match is None:
            continue
        subName = match.group(1)
        if subName in nodeNames and match.group(2) != "disambiguation":
            variants = nameToVariants.setdefault(subName, [])
            if name not in variants: # Intentionally ignoring case here
                variants.append(name)
    return nameToVariants


def pickCategoryLabel(variants):
    """Return the first variant qualified by a known category, or None.

    General categories are tried before specific ones (avoids stuff like Pan
    being classified as a horse instead of an ape).
    """
    for categories in (generalCategories, specificCategories):
        for label in variants:
            match = nameVariantRegex.match(label)
            if match is not None and match.group(2) in categories:
                return label
    return None


def resolveByCategory(nameToVariants, nodeToLabel):
    """Move conflicts that have a category-qualified variant into nodeToLabel."""
    picks = {}
    for (name, variants) in nameToVariants.items():
        label = pickCategoryLabel(variants)
        if label is not None:
            picks[name] = label
    for (name, label) in picks.items():
        nodeToLabel[name] = label
        del nameToVariants[name]


def resolveByType(dbpCur, nameToVariants, nodeToLabel):
    """Move conflicts for which a matching label has a taxon type into nodeToLabel."""
    query = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
    for iterNum, (label, typeIri) in enumerate(dbpCur.execute(query), start=1):
        if iterNum % 100000 == 0:
            print("Processing line {}".format(iterNum))
        if typeIri not in taxonTypes:
            continue
        name = label.lower()
        if name not in nameToVariants:
            match = nameVariantRegex.fullmatch(name)
            if match is None:
                continue
            name = match.group(1)
            if name not in nameToVariants:
                continue
        nodeToLabel[name] = label
        del nameToVariants[name]


def resolveByPickedLabels(nameToVariants, nodeToLabel):
    """Move conflicts named by a line of pickedLabelsFile into nodeToLabel.

    Blank lines are skipped. A warning is printed to stderr for a picked label
    that corresponds to no remaining conflict.
    """
    with open(pickedLabelsFile, encoding="utf-8") as file:
        for line in file:
            pickedLabel = line.rstrip()
            if not pickedLabel:
                continue
            name = pickedLabel.lower()
            if name not in nameToVariants:
                # Not a plain name, so try reading it as 'name (qualifier)'
                match = nameVariantRegex.match(name)
                if match is None:
                    print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
                    continue
                name = match.group(1)
                if name not in nameToVariants:
                    print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
                    continue
            nodeToLabel[name] = pickedLabel
            del nameToVariants[name]


def addDescs(dbCur, dbpCur, nodeToLabel):
    """Create the 'descs' table and add an abstract for each labelled node.

    Exits with status 1 if a label in nodeToLabel is missing from the labels table.
    """
    # Associate nodes with IRIs
    print("Getting nodes IRIs")
    nodeToIri = {}
    for (name, label) in nodeToLabel.items():
        row = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone()
        if row is None:
            print("ERROR: Couldn't find label {}".format(label), file=sys.stderr)
            sys.exit(1)
        nodeToIri[name] = row[0]
    # Resolve redirects
    print("Resolving redirects")
    redirectedNames = set() # Node names (not IRIs), as checked when writing 'descs'
    for iterNum, (name, iri) in enumerate(list(nodeToIri.items()), start=1):
        if iterNum % 10000 == 0:
            print("At iteration {}".format(iterNum))
        row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
        if row is not None:
            nodeToIri[name] = row[0]
            redirectedNames.add(name)
    # Find descriptions, and add to db
    print("Adding node description data")
    dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)")
    for iterNum, (name, iri) in enumerate(nodeToIri.items(), start=1):
        if iterNum % 10000 == 0:
            print("At iteration {}".format(iterNum))
        row = dbpCur.execute("SELECT abstract FROM abstracts where iri = ?", (iri,)).fetchone()
        if row is not None:
            redirected = 1 if name in redirectedNames else 0
            dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], redirected))


def main():
    """Script entry point: associate nodes with DBpedia labels and store descriptions."""
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    # Open dbs
    dbpCon = sqlite3.connect(dbpediaDb)
    dbCon = sqlite3.connect(dbFile)
    try:
        dbpCur = dbpCon.cursor()
        dbCur = dbCon.cursor()
        # Get node names
        print("Reading node names")
        nodeNames = {row[0] for row in dbCur.execute("SELECT name from nodes")}
        # Get disambiguation page labels
        print("Reading disambiguation-page labels")
        disambigLabels = readDisambigLabels(dbpCur)
        # Try associating nodes with IRIs, accounting for disambiguation labels
        print("Trying to associate nodes with labels")
        nameToVariants = findVariants(dbpCur, nodeNames, disambigLabels)
        # Names with exactly one candidate label need no conflict resolution
        nodeToLabel = {name: variants[0]
            for (name, variants) in nameToVariants.items() if len(variants) == 1}
        for name in nodeToLabel:
            del nameToVariants[name]
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Try conflict resolution via category-list
        print("Resolving conflicts using category-list")
        resolveByCategory(nameToVariants, nodeToLabel)
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Try conflict resolution via taxon-type information
        print("Resolving conflicts using instance-type data")
        resolveByType(dbpCur, nameToVariants, nodeToLabel)
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Try conflict resolution via picked-labels
        print("Resolving conflicts using picked-labels")
        resolveByPickedLabels(nameToVariants, nodeToLabel)
        print("Number of conflicts: {}".format(len(nameToVariants)))
        # Look up IRIs and abstracts, and store them
        addDescs(dbCur, dbpCur, nodeToLabel)
        dbCon.commit()
    finally:
        # Close dbs
        dbCon.close()
        dbpCon.close()


if __name__ == "__main__":
    main()