diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 45 | ||||
| -rw-r--r-- | backend/data/dbpPickedLabels.txt | 657 | ||||
| -rw-r--r-- | backend/data/dbpedia/README.md | 25 | ||||
| -rwxr-xr-x | backend/data/dbpedia/genData.py | 122 | ||||
| -rwxr-xr-x | backend/data/genDbpConflicts.py | 202 | ||||
| -rwxr-xr-x | backend/data/genDbpData.py | 227 |
6 files changed, 1262 insertions, 16 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index c4c46ba..b568f90 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -17,24 +17,37 @@ File Generation Process 3 Use genImgsForWeb.py to create cropped/resized images in img/, using images in imgsReviewed, and also to add an 'images' table to data.db. 4 Node Description Data - 1 Obtain data in enwiki/, as specified in it's README. - 2 Run genEnwikiData.py, which adds a 'descs' table to data.db, - using data in enwiki/enwikiData.db, and the 'nodes' table. + - Using DBpedia + 1 Obtain data in dbpedia/, as specified in its README. + 2 Run genDbpData.py, which adds a 'descs' table to data.db, using + data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table. + - Using Wikipedia dump (old method) + 1 Obtain data in enwiki/, as specified in its README. + 2 Run genEnwikiData.py, which adds a 'descs' table to data.db, + using data in enwiki/enwikiData.db, and the 'nodes' table. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds a 'reduced_nodes' table to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. 
-data.db tables +data.db Tables ============== -- nodes <br> - name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT -- names <br> - name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) -- eol\_ids <br> - id INT PRIMARY KEY, name TEXT -- images <br> - eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT -- descs <br> - name TEXT PRIMARY KEY, desc TEXT, redirected INT -- reduced\_nodes <br> - name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p_support INT +- nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT +- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) +- eol\_ids: id INT PRIMARY KEY, name TEXT +- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT +- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT +- reduced\_nodes: name TEXT PRIMARY KEY, children TEXT, parent TEXT, tips INT, p\_support INT + +Other Files +=========== +- dbpPickedLabels.txt <br> + Contains DBpedia labels, one per line. Used by genDbpData.py to help + resolve conflicts when associating tree-of-life node names with + DBpedia node labels. Was generated by manually editing the output + of genDbpConflicts.py. +- genDbpConflicts.py <br> + Reads data from dbpedia/dbpData.db, and the 'nodes' table of data.db, + and looks for potential conflicts that would arise when genDbpData.py + tries to associate tree-of-life node names with DBpedia node labels. It + writes data about them to conflicts.txt, which can be manually edited + to resolve them. 
diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt new file mode 100644 index 0000000..80a4770 --- /dev/null +++ b/backend/data/dbpPickedLabels.txt @@ -0,0 +1,657 @@ +Abditomys latidens +Abdopus aculeatus +Ablerus +Abralia +Abramis brama +Abraxas sylvata +Abrostola +Acidobacteria +Acontia +Actiniaria +Actinobacteria +Addisonia +Aenetus +Agapanthus +Aglais +Aglaope +agrotis infusa (bogong moth) +agrypnia (caddisfly) +Agyrium +Ainoa +Alauda +Alces +Aleurotrachelus atratus +Alexicles +Alistra +Allactaga +Alligator +Aloe +Aloe vera +Aluta +Alypia +Amanita +Amaranthus +Amaryllis +Amazona +Ambrysus +ambystomatidae (mole salamanders) +Amphibia +Amyris +Anagyrus +Anania +Ancylini +Anemosa +Angustia +anhanguera (pterosaur) +Aniba +Annona +Anthene +Anticrates +Aparupa +Aplysina archeri +Apollonias +Arabidopsis thaliana +Aracus +Arbutus +Arca noae +Arctia +Arctiini +Ardices +Ardipithecus +Argas +Argia +Aristea +arjuna (elephant) +Armina +Artace +Arthrophyllum +Asca +asclera (beetle genus) +Asperitas +Astacus +Asteroidea +Astia +Astyra +Atenia +Atrax +Atrina fragilis +Augustia +Austrogomphus +Avena +Aves +Axiidae +Azalea +Azenia +Babina holsti +Bacchini +balfouria (flatworm) +ballana (leafhopper) +Barcella +Baryonyx +basuto (horse) +Begonia +Belbina +belisarius (scorpion) +belzebub (prawn) +Bembidion +Benincasa +beroe (ctenophore) +Bertmainius tingle +Betta +Biastes +Bigelowia +Bignonia +Bitia +Blumea +Boa constrictor +Boea +Bolax +Bonia +Boops +Bougainvillia +brevinema (bacterium) +Briza +Brunonia +buchneria (millipede) +bucolion (cockroach) +Burara +Buxbaumia +Cabello +Cacatua +Caedicia +Calanus +calchas (scorpion) +Callia +callianira (ctenophore) +Caluromys +Cambarellus +Camellia +campion (lacewing) +Camptonotus +Campylopus +candelabrum (hydrozoa) +Canella +canelo (tree) +Canis lupus +Canis lupus dingo +Capitata +Capito +Caprona +Capsicum +Carallia +Carcinus +Carlina +Carpentaria +Caryanda elegans +Cassiopea +Castela +Castolus +Cavia +Ceiba 
+Celestus anelpistus +Cellana +cellia (subgenus) +Centaurea +Cepa +Cephoidea +Cercyra +Cetacea +Chaetomium thermophilum +Chaetonotus +chane (mayfly) +chapmania (flatworm) +charidia (skipper) +charmion (skipper) +Charpentiera +Chilena +Chiroptera +Chondrocladia +Chrysanthemum +Chumma +chuniella (worm) +Cicindela +Cilnia +cinchona (shrub family) +Cispia +Cisthene +Citipati +Citronia vasiformis +claria (rotifer) +clava (hydrozoa) +clostridioides difficile (bacteria) +Clupea +Clytus +Coccinella +Cochlearia +Coffea canephora +Cojoba +colemaniella (worm) +coleodesmium (worm) +colle (grape) +coloradia (dinosaur) +confucius (leafhopper) +Conger +Conistra +Conta +Conus +Conus marmoreus +coppa (grape) +copula (jellyfish) +Corambis +Corixa +Corsia +Cosmophyllum +Cossinia +Cotylea +Craniata +Critonia +Crocus +Cronius +Cropia +Crossosoma +Cryptophagus +cryptopora (brachiopod) +Cryptosporidium +Curculio +Curtia +Cyana +Cyclamen +Cyclocotyla +Cyclostomata +Cyclostomatida +Cylindera +Cynanchum louiseae +Cynara +Cyrba +Dacus +dalla (skipper) +Daphne laureola +Daphnia +Datura +Davidiella +Davidsonia +Decapoda +Degeneria +Deinonychus +deiopea (ctenophore) +Desmos +Diaphorus +Diaprepes +Dichanthelium clandestinum +Digitalis +dikwa (amphipod) +Dilipa +dilong (dinosaur) +dioxys (bee) +Dipoena +Dolichocephala +Doras +Dracaena braunii +Draco mindanensis +drepanophorus (worm) +Drimiopsis +Drobeta +Drymus +Dufourea +Dulzura +Dynastes +Echeclus +edwardsiella (bacterium) +Eidothea +Elaenia +Eleutherodactylus +Elona +Elseya +Encolpius +endere (millipede) +Enispa +Enteropneusta +Erasinus +Eresus +eriosoma (aphid) +Erodium +Erythronium +ethesia (acanthaceae) +Eucalyptus +Euphorbia +Euphrasia +euryglossa (bee) +Eusarcus +Evansia +Extra extra +fallax (brachiopod) +Felidae +Felis +Fernandezia +fimbriaria (flatworm) +florea (millipede) +Flos +Forestiera +Forsythia +Fossarina +Fulcinia +Fulgora +Fungi +Fungia +Fusinus +Galaxaura +Galega +Gaoligongshania +gargantua (gorilla) +Gaura +Gazella +Gelae 
+Gemina +Geminia +Geomalacus +georgium (caddisfly) +Gergithus +Geum coccineum +Giraffa +Girella zebra +Glena +Gnathostomata +Gobius niger +Gongora +Gongylus +gratia (mayfly) +gryphus (brachiopod) +Gymnopodium +habeas corpus (pig) +Halenia +Halesia +halla (horse) +Hallucigenia +Harmothoe +Harpa +Heliophila +helleria (woodlouse) +Helvidia +Hemiaspis +Hemithea +Herpetopoma +Heteroteuthis +Hibana +Hippotion +Holops +Homo erectus +Homo sapiens +Hoplitosaurus +Hortensia +Hostus +Hottea +Houttuynia +Huntia +hydrosaurus (mosasaur) +Hypodematium +iassus (leafhopper) +Ibana +Iberus +Ibis +Icona +Ilerda +Imma +Impatiens +Insecta +Isurus +Ixora +Jacaranda +Junco +Khaan +kobus (antelope) +Koppe +La paloma +Labyrinthus +Laetilia +lampea (ctenophore) +Lampetra +Lanceola +Laurencia +laureola (woodlouse) +Laurus +Lemuria +Lepidoptera +Leptonema +Lepus +Lestoidea +Leuconia +Ligia +Lilium +Limax +Limia +Limnornis +liniscus (nematode) +Listeria +Livius +Lobster +Loefgrenia +Lonicera +Loxodonta africana +Lucanus cervus +Lunella +Lurio +Lusius +Lytta vesicatoria +Macroglossinae +macrophylla (wych elm cultivar) +Madia +Madrepora +magellania (brachiopod) +Magnolia +Magnolia grandiflora +Magnoliopsida +malena (stork) +Mammalia +manis (orangutan) +Mantica +Mapusaurus +Marchena +Marmorata +Marmosa +Marmota +Martensia +Massaga +Medius +Megaceros +Megaphyllum +Meloe +Mene +Menziesia +Mermessus +Metabolus +metallus (sawfly) +Metasequoia +migros (turkey) +Milla +Mimosa +Mispila +Mizuhopecten yessoensis +Molinia +Molione +Molycria +Monaeses +Monotonia +Monticola +Moritasgus +Morone +Mussidia +Mynes +Myuchelys bellii +Myuchelys georgesi +Myuchelys latisternum +Myxoderma +Nagaina +Naja +Nala +Napaeus +Nardoa +Naubolus +nausithoe (jellyfish) +Naxia +Nectria +Neis +nemertes (worm) +Neomeris +Nephus +Nerine +Nicodamus +Nitrospira +Nitzschia +Nycteris +octodon (plant genus) +odius (amphipod) +Oeax +Omphalotropis +Opas +Orcinus +Ornitholestes +Ornithopus +Orsinome +Oxyuris +Pachyuromys +padina (algae) 
+Pajanelia +Palisa +Panthea +Papilio +Papilio buddha +Papilionidae +parachela (tardigrade) +Paratheria +parkinsonia (ammonite) +Parthenium +Parus +Passer +Patrinia +Perdix +Perilla frutescens +Perizoma +Persoonia +Petricola +Petronia +Pharnacia +Phassus +Pheia +Pheretima +Philotis +Phlegon +Phlogius +Phlox +Pholas dactylus +Phyllanthus distichus +Phyllium +Phytoecia +Piaya +Pieridae +Pilia +Pirula +Pisania +Pistoria +plasmodium (slime mold) +Platanus +Plautia +Pluteus +Podalia +Poliana +Polistes canadensis +Polypodium australe +pompholyx (rotifer) +Porius +Posidonia +Potamanthus yooni +Potos +Primates +Problema +Procolobus +Pronous +Prothoe +Pseudonaja mengdeni +Psylla +Ptelea +Pteronotus +Pullimosina +puncha (snakefly) +Pusa +pygora (goat) +Pyrnus +Pyrola +Pyrus +Pythium +Quintilia +Rafflesia +Rattus norvegicus +Rattus rattus +Rhene +Rhipicephalus +Rhombodera +Rhombodera extensicollis +Rhombodera megaera +Rosales +Rotaria +Roussea +Rubia +Rugosa +Salamandra +Salpingotus +Salticus coronatus +Salvia +Samanea +Samaris +Savarna +Scarabaeus +Schiedea +Schizopetalum +Scopula gracilis +seira (springtail) +Selimus +serina (grape) +Setina +Sibbaldia +Sicyonia +Sideroxylon +Silene +Sinea +Siphonaptera +Sirenia +Sithon +Smilax +soa (barklice) +Solanum +Solidago houghtonii +Sonneratia +Sorex dispar +Sorex longirostris +Sorghum +Spea +Spelobia +sphingidae (hawk moths) +Squalus clarkae +Stegonotus +Stenonia +Stephania +Stichius +Strombus +strongylus (nematode) +Struthiomimus +suca (lacewing) +sulcia (bacteria) +superba (elm hybrid) +Swan +Sybota +Sydowia +Syllis +Syneta +Tainia +taku (whale) +Tantilla +tapejara (pterosaur) +tapia (tree) +Tarne +Tawera +Telphusa +Termitomyces schimperi +Theba +Thestor +Thomomys +Thria +Thunbergia +Thunnus +Thyasiridae +Thyreus +tinerfe (ctenophore) +Tiso +Titanophora +tokara (horse) +tortricidae (snakes) +Tortrix +Triaenophorus +Triops +Trochilus +tunga (flea) +Tusitala +Tutelina +Tympanuchus cupido pinnatus +Tyrannosaurus rex +uga (dog) 
+Unenlagia +Uria +Ursia +valentines (grape) +Vanessa atalanta +Varicella +velamen (ctenophore) +Velociraptor +Venia +Vinca +Vipera +vitrum (tunicate) +Vitula +wolga (rotifer) +Wulfenia +Yacolla +Zalmunna +Zelia +Zeuxippus +Zimmermannella +zinga (leafhopper) +Zingel diff --git a/backend/data/dbpedia/README.md b/backend/data/dbpedia/README.md new file mode 100644 index 0000000..0e7c266 --- /dev/null +++ b/backend/data/dbpedia/README.md @@ -0,0 +1,25 @@ +Downloaded Files +================ +- labels\_lang=en.ttl.bz2 <br> + Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core, + using the link <https://databus.dbpedia.org/dbpedia/generic/labels/2022.03.01/labels_lang=en.ttl.bz2>. +- redirects\_lang=en\_transitive.ttl.bz2 <br> + Downloaded from <https://databus.dbpedia.org/dbpedia/generic/redirects/2022.03.01/redirects_lang=en_transitive.ttl.bz2>. +- disambiguations\_lang=en.ttl.bz2 <br> + Downloaded from <https://databus.dbpedia.org/dbpedia/generic/disambiguations/2022.03.01/disambiguations_lang=en.ttl.bz2>. +- instance-types\_lang=en\_specific.ttl.bz2 <br> + Downloaded from <https://databus.dbpedia.org/dbpedia/mappings/instance-types/2022.03.01/instance-types_lang=en_specific.ttl.bz2>. +- short-abstracts\_lang=en.ttl.bz2 <br> + Downloaded from <https://databus.dbpedia.org/vehnem/text/short-abstracts/2021.05.01/short-abstracts_lang=en.ttl.bz2>. + +Generated Files +=============== +- dbpData.db <br> + An sqlite database representing data from the ttl files. + Generated by running genData.py. 
+ Tables + - labels: iri TEXT PRIMARY KEY, label TEXT + - redirects: iri TEXT PRIMARY KEY, target TEXT + - disambiguations: iri TEXT PRIMARY KEY + - types: iri TEXT, type TEXT + - abstracts: iri TEXT PRIMARY KEY, abstract TEXT diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py new file mode 100755 index 0000000..e147641 --- /dev/null +++ b/backend/data/dbpedia/genData.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads DBpedia labels+types+redirects+abstracts data,\n" +usageInfo += "and creates a sqlite db containing that data.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines +redirectsFile = "redirects_lang=en_transitive.ttl.bz2" +disambigFile = "disambiguations_lang=en.ttl.bz2" +typesFile = "instance-types_lang=en_specific.ttl.bz2" +abstractsFile = "short-abstracts_lang=en.ttl.bz2" +dbFile = "dbpData.db" + +# Open db +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Read/store labels +print("Reading/storing label data") +dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)") +dbCur.execute("CREATE INDEX labels_idx ON labels(label COLLATE NOCASE)") +labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') +lineNum = 0 +with bz2.open(labelsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("Processing line {}".format(lineNum)) + # + match = labelLineRegex.fullmatch(line) + if match == None: + print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store redirects +print("Reading/storing redirection data") +dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)") +redirLineRegex = re.compile(r'<([^>]+)> 
<[^>]+> <([^>]+)> \.\n') +lineNum = 0 +with bz2.open(redirectsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("Processing line {}".format(lineNum)) + # + match = redirLineRegex.fullmatch(line) + if match == None: + print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store diambiguation-page data +print("Reading/storing diambiguation-page data") +disambigNames = set() +disambigLineRegex = redirLineRegex +lineNum = 0 +with bz2.open(disambigFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("Processing line {}".format(lineNum)) + # + match = disambigLineRegex.fullmatch(line) + if match == None: + print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr) + sys.exit(1) + else: + disambigNames.add(match.group(1)) +dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") +for name in disambigNames: + dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,)) +dbCon.commit() +# Read/store instance-type +print("Reading/storing instance-type data") +dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)") +dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)") +typeLineRegex = redirLineRegex +lineNum = 0 +with bz2.open(typesFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("Processing line {}".format(lineNum)) + # + match = typeLineRegex.fullmatch(line) + if match == None: + print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store abstracts +print("Reading/storing abstracts") +dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)") +descLineRegex = labelLineRegex 
+lineNum = 0 +with bz2.open(abstractsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("Processing line {}".format(lineNum)) + # + if line[0] == "#": + continue + match = descLineRegex.fullmatch(line) + if match == None: + print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", + (match.group(1), match.group(2).replace(r'\"', '"'))) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/genDbpConflicts.py b/backend/data/genDbpConflicts.py new file mode 100755 index 0000000..0ad4e1e --- /dev/null +++ b/backend/data/genDbpConflicts.py @@ -0,0 +1,202 @@ +#!/usr/bin/python3 + +import sys, re +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads DBpedia data from dbpedia/dbpData.db, along with tree-of-life\n" +usageInfo += "node name data from a sqlite database, and looks for potential\n" +usageInfo += "conflicts in associating node names with DBpedia-node labels. For\n" +usageInfo += "example, a node named 'homo sapiens' might have conflicting labels\n" +usageInfo += "'Homo sapiens', 'homo sapiens (novel)', and 'homo sapiens (song)'.\n" +usageInfo += "\n" +usageInfo += "Writes conflict information to file. For each conflict, a line is printed,\n" +usageInfo += "holding comma-separated DBpedia labels. 
If the labels include no-parentheses elements,\n" +usageInfo += "additional tab-indented lines are printed, wholding short-abstracts for those labels.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbpDb = "dbpedia/dbpData.db" +dbFile = "data.db" +outFile = "conflicts.txt" + +# Open dbs +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +dbpCon = sqlite3.connect(dbpDb) +dbpCur = dbpCon.cursor() +# Get node names +print("Reading node names") +nodeNames = set() +for row in dbCur.execute("SELECT name from nodes"): + nodeNames.add(row[0]) +# Get disambiguation page labels +print("Reading disambiguation-page labels") +disambigLabels = set() +query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri" +for (label,) in dbpCur.execute(query): + disambigLabels.add(label) +# Find labels with conflicts +print("Finding conflicting labels") +nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") +nameToVariants = {} +iterNum = 0 +for (label,) in dbpCur.execute("SELECT label from labels"): + iterNum += 1 + if iterNum % 1e5 == 0: + print("Processing line {}".format(iterNum)) + # + if label in disambigLabels: + continue + name = label.lower() + if name in nodeNames: + if name not in nameToVariants: + nameToVariants[name] = [label] + elif label not in nameToVariants[name]: + nameToVariants[name].append(label) + else: + match = nameVariantRegex.fullmatch(name) + if match != None: + subName = match.group(1) + if subName in nodeNames and match.group(2) != "disambiguation": + if subName not in nameToVariants: + nameToVariants[subName] = [name] # Intentionally ignoring case here + elif name not in nameToVariants[subName]: + nameToVariants[subName].append(name) +namesToRemove = set() +for (name, variants) in nameToVariants.items(): + if len(variants) == 1: + namesToRemove.add(name) +for name in namesToRemove: + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Try conflict 
resolution via taxon-type information +print("Resolving conflicts using instance-type data") +taxonTypes = { # Obtained from the DBpedia ontology + "http://dbpedia.org/ontology/Species", + "http://dbpedia.org/ontology/Archaea", + "http://dbpedia.org/ontology/Bacteria", + "http://dbpedia.org/ontology/Eukaryote", + "http://dbpedia.org/ontology/Plant", + "http://dbpedia.org/ontology/ClubMoss", + "http://dbpedia.org/ontology/Conifer", + "http://dbpedia.org/ontology/CultivatedVariety", + "http://dbpedia.org/ontology/Cycad", + "http://dbpedia.org/ontology/Fern", + "http://dbpedia.org/ontology/FloweringPlant", + "http://dbpedia.org/ontology/Grape", + "http://dbpedia.org/ontology/Ginkgo", + "http://dbpedia.org/ontology/Gnetophytes", + "http://dbpedia.org/ontology/GreenAlga", + "http://dbpedia.org/ontology/Moss", + "http://dbpedia.org/ontology/Fungus", + "http://dbpedia.org/ontology/Animal", + "http://dbpedia.org/ontology/Fish", + "http://dbpedia.org/ontology/Crustacean", + "http://dbpedia.org/ontology/Mollusca", + "http://dbpedia.org/ontology/Insect", + "http://dbpedia.org/ontology/Arachnid", + "http://dbpedia.org/ontology/Amphibian", + "http://dbpedia.org/ontology/Reptile", + "http://dbpedia.org/ontology/Bird", + "http://dbpedia.org/ontology/Mammal", + "http://dbpedia.org/ontology/Cat", + "http://dbpedia.org/ontology/Dog", + "http://dbpedia.org/ontology/Horse", +} +iterNum = 0 +for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): + iterNum += 1 + if iterNum % 1e5 == 0: + print("Processing line {}".format(iterNum)) + # + if type in taxonTypes: + name = label.lower() + if name in nameToVariants: + del nameToVariants[name] + else: + match = nameVariantRegex.fullmatch(name) + if match != None: + name = match.group(1) + if name in nameToVariants: + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Try conflict resolution via category-list + # Does a generic-category pass first 
(avoid stuff like Pan being classified as a horse instead of an ape) +print("Resolving conflicts using category-list") +generalCategories = { + "species", "genus", + "plant", "fungus", "animal", + "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", + "fish", "amphibian", "reptile", "bird", "mammal", +} +specificCategories = { + "protist", "alveolate", "dinoflagellates", + "orchid", "Poaceae", "fern", "moss", "alga", + "bryozoan", "hydrozoan", + "sponge", "cnidarian", "coral", "polychaete", "echinoderm", + "bivalve", "gastropod", "chiton", + "shrimp", "decapod", "crab", "barnacle", "copepod", + "arachnid", "spider", "harvestman", "mite", + "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", + "beetle", "fly", "butterfly", "moth", "wasp", + "catfish", + "frog", + "lizard", + "horse", "sheep", "cattle", "mouse", +} +namesToRemove = set() +for (name, variants) in nameToVariants.items(): + found = False + for label in variants: + match = nameVariantRegex.match(label) + if match != None and match.group(2) in generalCategories: + namesToRemove.add(name) + found = True + break + if not found: + for label in variants: + match = nameVariantRegex.match(label) + if match != None and match.group(2) in specificCategories: + namesToRemove.add(name) + break +for name in namesToRemove: + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Find descriptions for plain-named labels +print("Finding descriptions for plain-named labels") +labelToDesc = {} +iterNum = 0 +query = "SELECT label, abstract from labels INNER JOIN abstracts ON labels.iri = abstracts.iri" +for (label, desc,) in dbpCur.execute(query): + iterNum += 1 + if iterNum % 1e5 == 0: + print("Processing line {}".format(iterNum)) + # + if label.lower() in nameToVariants: + labelToDesc[label] = desc +print("Finding descriptions for redirect-resolved labels") +iterNum = 0 +query = "SELECT label, abstract from labels" \ + " INNER JOIN redirects ON labels.iri = 
redirects.iri INNER JOIN abstracts ON redirects.target = abstracts.iri" +for (label, desc,) in dbpCur.execute(query): + iterNum += 1 + if iterNum % 1e5 == 0: + print("Processing line {}".format(iterNum)) + # + if label.lower() in nameToVariants: + labelToDesc[label] = desc +# +print("Writing conflict data to file") +with open(outFile, "w") as file: + for (name, variants) in nameToVariants.items(): + for n in variants: + file.write(n + ", ") + file.write("\n") + for n in variants: + if n in labelToDesc: + file.write("\t{}: {}\n".format(n, labelToDesc[n])) +# Close dbs +dbCon.close() +dbpCon.close() diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py new file mode 100755 index 0000000..6cc8d33 --- /dev/null +++ b/backend/data/genDbpData.py @@ -0,0 +1,227 @@ +#!/usr/bin/python3 + +import sys, re +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads DBpedia data from dbpedia/*, along with tree-of-life\n" +usageInfo += "node and name data from a sqlite database, associates nodes with\n" +usageInfo += "DBpedia IRIs, and adds alt-name and description information for\n" +usageInfo += "those nodes.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbpediaDb = "dbpedia/dbpData.db" +pickedLabelsFile = "dbpPickedLabels.txt" +dbFile = "data.db" + +# Open dbs +dbpCon = sqlite3.connect(dbpediaDb) +dbpCur = dbpCon.cursor() +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Get node names +print("Reading node names") +nodeNames = set() +for row in dbCur.execute("SELECT name from nodes"): + nodeNames.add(row[0]) +# Get disambiguation page labels +print("Reading disambiguation-page labels") +disambigLabels = set() +query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri" +for (label,) in dbpCur.execute(query): + disambigLabels.add(label) +# Try associating nodes with IRIs, accounting for disambiguation labels +print("Trying to associate nodes with labels") 
+nodeToLabel = {} +nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") +nameToVariants = {} +iterNum = 0 +for (label,) in dbpCur.execute("SELECT label from labels"): + iterNum += 1 + if iterNum % 1e5 == 0: + print("Processing line {}".format(iterNum)) + # + if label in disambigLabels: + continue + name = label.lower() + if name in nodeNames: + if name not in nameToVariants: + nameToVariants[name] = [label] + elif label not in nameToVariants[name]: + nameToVariants[name].append(label) + else: + match = nameVariantRegex.fullmatch(name) + if match != None: + subName = match.group(1) + if subName in nodeNames and match.group(2) != "disambiguation": + if subName not in nameToVariants: + nameToVariants[subName] = [name] # Intentionally ignoring case here + elif name not in nameToVariants[subName]: + nameToVariants[subName].append(name) +for (name, variants) in nameToVariants.items(): + if len(variants) == 1: + nodeToLabel[name] = variants[0] +for name in nodeToLabel: + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Try conflict resolution via category-list + # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape) +print("Resolving conflicts using category-list") +generalCategories = { + "species", "genus", + "plant", "fungus", "animal", + "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", + "fish", "amphibian", "reptile", "bird", "mammal", +} +specificCategories = { + "protist", "alveolate", "dinoflagellates", + "orchid", "Poaceae", "fern", "moss", "alga", + "bryozoan", "hydrozoan", + "sponge", "cnidarian", "coral", "polychaete", "echinoderm", + "bivalve", "gastropod", "chiton", + "shrimp", "decapod", "crab", "barnacle", "copepod", + "arachnid", "spider", "harvestman", "mite", + "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", + "beetle", "fly", "butterfly", "moth", "wasp", + "catfish", + "frog", + "lizard", + "horse", "sheep", "cattle", "mouse", +} 
+namesToRemove = set() +for (name, variants) in nameToVariants.items(): + found = False + for label in variants: + match = nameVariantRegex.match(label) + if match != None and match.group(2) in generalCategories: + nodeToLabel[name] = label + namesToRemove.add(name) + found = True + break + if not found: + for label in variants: + match = nameVariantRegex.match(label) + if match != None and match.group(2) in specificCategories: + nodeToLabel[name] = label + namesToRemove.add(name) + break +for name in namesToRemove: + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Try conflict resolution via taxon-type information +print("Resolving conflicts using instance-type data") +taxonTypes = { # Obtained from the DBpedia ontology + "http://dbpedia.org/ontology/Species", + "http://dbpedia.org/ontology/Archaea", + "http://dbpedia.org/ontology/Bacteria", + "http://dbpedia.org/ontology/Eukaryote", + "http://dbpedia.org/ontology/Plant", + "http://dbpedia.org/ontology/ClubMoss", + "http://dbpedia.org/ontology/Conifer", + "http://dbpedia.org/ontology/CultivatedVariety", + "http://dbpedia.org/ontology/Cycad", + "http://dbpedia.org/ontology/Fern", + "http://dbpedia.org/ontology/FloweringPlant", + "http://dbpedia.org/ontology/Grape", + "http://dbpedia.org/ontology/Ginkgo", + "http://dbpedia.org/ontology/Gnetophytes", + "http://dbpedia.org/ontology/GreenAlga", + "http://dbpedia.org/ontology/Moss", + "http://dbpedia.org/ontology/Fungus", + "http://dbpedia.org/ontology/Animal", + "http://dbpedia.org/ontology/Fish", + "http://dbpedia.org/ontology/Crustacean", + "http://dbpedia.org/ontology/Mollusca", + "http://dbpedia.org/ontology/Insect", + "http://dbpedia.org/ontology/Arachnid", + "http://dbpedia.org/ontology/Amphibian", + "http://dbpedia.org/ontology/Reptile", + "http://dbpedia.org/ontology/Bird", + "http://dbpedia.org/ontology/Mammal", + "http://dbpedia.org/ontology/Cat", + "http://dbpedia.org/ontology/Dog", + 
"http://dbpedia.org/ontology/Horse", +} +iterNum = 0 +for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): + iterNum += 1 + if iterNum % 1e5 == 0: + print("Processing line {}".format(iterNum)) + # + if type in taxonTypes: + name = label.lower() + if name in nameToVariants: + nodeToLabel[name] = label + del nameToVariants[name] + else: + match = nameVariantRegex.fullmatch(name) + if match != None: + name = match.group(1) + if name in nameToVariants: + nodeToLabel[name] = label + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Try conflict resolution via picked-labels +print("Resolving conflicts using picked-labels") +with open(pickedLabelsFile) as file: + for line in file: + pickedLabel = line.rstrip() + name = pickedLabel.lower() + if name in nameToVariants: + nodeToLabel[name] = pickedLabel + del nameToVariants[name] + else: + match = nameVariantRegex.match(pickedLabel) + if match == None: + print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr) + else: + name = match.group(1) + if name not in nameToVariants: + print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr) + else: + nodeToLabel[name] = pickedLabel + del nameToVariants[name] +print("Number of conflicts: {}".format(len(nameToVariants))) +# Associate nodes with IRIs +print("Getting nodes IRIs") +nodeToIri = {} +iterNum = 0 +for (name, label) in nodeToLabel.items(): + row = dbpCur.execute("SELECT iri FROM labels where label = ? 
COLLATE NOCASE", (label,)).fetchone() + if row == None: + print("ERROR: Couldn't find label {}".format(label), file=sys.stderr) + sys.exit(1) + else: + nodeToIri[name] = row[0] +# Resolve redirects +print("Resolving redirects") +redirectingIriSet = set() +iterNum = 0 +for (name, iri) in nodeToIri.items(): + iterNum += 1 + if iterNum % 1e4 == 0: + print("At iteration {}".format(iterNum)) + # + row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone() + if row != None: + nodeToIri[name] = row[0] + redirectingIriSet.add(iri) +# Find descriptions, and add to db +print("Adding node description data") +dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)") +iterNum = 0 +for (name, iri) in nodeToIri.items(): + iterNum += 1 + if iterNum % 1e4 == 0: + print("At iteration {}".format(iterNum)) + # + row = dbpCur.execute("SELECT abstract FROM abstracts where iri = ?", (iri,)).fetchone() + if row != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingIriSet else 0)) +# Close dbs +dbCon.commit() +dbCon.close() +dbpCon.commit() +dbpCon.close() |
