From 27361479b3615a0f1156be3a97579df7f128d993 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Tue, 7 Jun 2022 23:06:26 +1000 Subject: Add more manual-correction for dbp-desc generation --- backend/data/dbpPickedLabels.txt | 655 --------------------------------------- backend/data/genDbpConflicts.py | 202 ------------ backend/data/genDbpData.py | 244 ++++++++------- 3 files changed, 128 insertions(+), 973 deletions(-) delete mode 100644 backend/data/dbpPickedLabels.txt delete mode 100755 backend/data/genDbpConflicts.py (limited to 'backend/data') diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt deleted file mode 100644 index 1fa1e71..0000000 --- a/backend/data/dbpPickedLabels.txt +++ /dev/null @@ -1,655 +0,0 @@ -Abditomys latidens -Abdopus aculeatus -Ablerus -Abralia -Abramis brama -Abraxas sylvata -Abrostola -Acidobacteria -Acontia -Actiniaria -Actinobacteria -Addisonia -Aenetus -Agapanthus -Aglais -Aglaope -agrotis infusa (bogong moth) -agrypnia (caddisfly) -Agyrium -Ainoa -Alauda -Alces -Aleurotrachelus atratus -Alexicles -Alistra -Allactaga -Alligator -Aloe -Aloe vera -Aluta -Alypia -Amanita -Amaranthus -Amaryllis -Amazona -Ambrysus -ambystomatidae (mole salamanders) -Amphibia -Amyris -Anagyrus -Anania -Ancylini -Anemosa -Angustia -anhanguera (pterosaur) -Aniba -Annona -Anthene -Anticrates -Aparupa -Aplysina archeri -Apollonias -Arabidopsis thaliana -Aracus -Arbutus -Arca noae -Arctia -Arctiini -Ardices -Ardipithecus -Argas -Argia -Aristea -arjuna (elephant) -Armina -Artace -Arthrophyllum -Asca -asclera (beetle genus) -Asperitas -Astacus -Asteroidea -Astia -Astyra -Atenia -Atrax -Atrina fragilis -Augustia -Austrogomphus -Avena -Aves -Axiidae -Azalea -Azenia -Babina holsti -Bacchini -balfouria (flatworm) -ballana (leafhopper) -Barcella -Baryonyx -Begonia -Belbina -belisarius (scorpion) -belzebub (prawn) -Bembidion -Benincasa -beroe (ctenophore) -Bertmainius tingle -Betta -Biastes -Bigelowia -Bignonia -Bitia -Blumea -Boa constrictor -Boea -Bolax -Bonia -Boops -Bougainvillia -brevinema (bacterium) -Briza -Brunonia -buchneria (millipede) -bucolion (cockroach) -Burara -Buxbaumia -Cabello -Cacatua -Caedicia -Calanus -calchas (scorpion) -Callia -callianira (ctenophore) -Caluromys -Cambarellus -Camellia -campion (lacewing) -Camptonotus -Campylopus -candelabrum (hydrozoa) -Canella -canelo (tree) -Canis lupus -Canis lupus dingo -Capitata -Capito -Caprona -Capsicum -Carallia -Carcinus -Carlina -Carpentaria -Caryanda elegans -Cassiopea -Castela -Castolus -Cavia -Ceiba -Celestus anelpistus -Cellana -cellia (subgenus) -Centaurea -Cepa -Cephoidea -Cercyra -Cetacea -Chaetomium thermophilum -Chaetonotus -chane (mayfly) -chapmania (flatworm) -charidia (skipper) -charmion (skipper) -Charpentiera -Chilena -Chiroptera -Chondrocladia -Chrysanthemum -Chumma -chuniella (worm) -Cicindela -Cilnia -cinchona (shrub family) -Cispia -Cisthene -Citipati -Citronia vasiformis -claria (rotifer) -clava (hydrozoa) -clostridioides difficile (bacteria) -Clupea -Clytus -Coccinella -Cochlearia -Coffea canephora -Cojoba -colemaniella (worm) -coleodesmium (worm) -colle (grape) -coloradia (dinosaur) -confucius (leafhopper) -Conger -Conistra -Conta -Conus -Conus marmoreus -coppa (grape) -copula (jellyfish) -Corambis -Corixa -Corsia -Cosmophyllum -Cossinia -Cotylea -Craniata -Critonia -Crocus -Cronius -Cropia -Crossosoma -Cryptophagus -cryptopora (brachiopod) -Cryptosporidium -Curculio -Curtia -Cyana -Cyclamen -Cyclocotyla -Cyclostomata -Cyclostomatida -Cylindera -Cynanchum louiseae -Cynara -Cyrba -Dacus -dalla (skipper) -Daphne laureola -Daphnia -Datura -Davidiella -Davidsonia -Decapoda -Degeneria -Deinonychus -deiopea (ctenophore) -Desmos -Diaphorus -Diaprepes -Dichanthelium clandestinum -Digitalis -dikwa (amphipod) -Dilipa -dilong (dinosaur) -dioxys (bee) -Dipoena -Dolichocephala -Doras -Dracaena braunii -Draco mindanensis -drepanophorus (worm) -Drimiopsis -Drobeta -Drymus -Dufourea -Dulzura -Dynastes -Echeclus -edwardsiella (bacterium) -Eidothea -Elaenia -Eleutherodactylus -Elona -Elseya -Encolpius -endere (millipede) -Enispa -Enteropneusta -Erasinus -Eresus -eriosoma (aphid) -Erodium -Erythronium -ethesia (acanthaceae) -Eucalyptus -Euphorbia -Euphrasia -euryglossa (bee) -Eusarcus -Evansia -Extra extra -fallax (brachiopod) -Felidae -Felis -Fernandezia -fimbriaria (flatworm) -florea (millipede) -Flos -Forestiera -Forsythia -Fossarina -Fulcinia -Fulgora -Fungi -Fungia -Fusinus -Galaxaura -Galega -Gaoligongshania -gargantua (gorilla) -Gaura -Gazella -Gelae -Gemina -Geminia -Geomalacus -georgium (caddisfly) -Gergithus -Geum coccineum -Giraffa -Girella zebra -Glena -Gnathostomata -Gobius niger -Gongora -Gongylus -gratia (mayfly) -gryphus (brachiopod) -Gymnopodium -habeas corpus (pig) -Halenia -Halesia -Hallucigenia -Harmothoe -Harpa -Heliophila -helleria (woodlouse) -Helvidia -Hemiaspis -Hemithea -Herpetopoma -Heteroteuthis -Hibana -Hippotion -Holops -Homo erectus -Homo sapiens -Hoplitosaurus -Hortensia -Hostus -Hottea -Houttuynia -Huntia -hydrosaurus (mosasaur) -Hypodematium -iassus (leafhopper) -Ibana -Iberus -Ibis -Icona -Ilerda -Imma -Impatiens -Insecta -Isurus -Ixora -Jacaranda -Junco -Khaan -kobus (antelope) -Koppe -La paloma -Labyrinthus -Laetilia -lampea (ctenophore) -Lampetra -Lanceola -Laurencia -laureola (woodlouse) -Laurus -Lemuria -Lepidoptera -Leptonema -Lepus -Lestoidea -Leuconia -Ligia -Lilium -Limax -Limia -Limnornis -liniscus (nematode) -Listeria -Livius -Lobster -Loefgrenia -Lonicera -Loxodonta africana -Lucanus cervus -Lunella -Lurio -Lusius -Lytta vesicatoria -Macroglossinae -macrophylla (wych elm cultivar) -Madia -Madrepora -magellania (brachiopod) -Magnolia -Magnolia grandiflora -Magnoliopsida -malena (stork) -Mammalia -manis (orangutan) -Mantica -Mapusaurus -Marchena -Marmorata -Marmosa -Marmota -Martensia -Massaga -Medius -Megaceros -Megaphyllum -Meloe -Mene -Menziesia -Mermessus -Metabolus -metallus (sawfly) -Metasequoia -migros (turkey) -Milla -Mimosa -Mispila -Mizuhopecten yessoensis -Molinia -Molione -Molycria -Monaeses -Monotonia -Monticola -Moritasgus -Morone -Mussidia -Mynes -Myuchelys bellii -Myuchelys georgesi -Myuchelys latisternum -Myxoderma -Nagaina -Naja -Nala -Napaeus -Nardoa -Naubolus -nausithoe (jellyfish) -Naxia -Nectria -Neis -nemertes (worm) -Neomeris -Nephus -Nerine -Nicodamus -Nitrospira -Nitzschia -Nycteris -octodon (plant genus) -odius (amphipod) -Oeax -Omphalotropis -Opas -Orcinus -Ornitholestes -Ornithopus -Orsinome -Oxyuris -Pachyuromys -padina (algae) -Pajanelia -Palisa -Panthea -Papilio -Papilio buddha -Papilionidae -parachela (tardigrade) -Paratheria -parkinsonia (ammonite) -Parthenium -Parus -Passer -Patrinia -Perdix -Perilla frutescens -Perizoma -Persoonia -Petricola -Petronia -Pharnacia -Phassus -Pheia -Pheretima -Philotis -Phlegon -Phlogius -Phlox -Pholas dactylus -Phyllanthus distichus -Phyllium -Phytoecia -Piaya -Pieridae -Pilia -Pirula -Pisania -Pistoria -plasmodium (slime mold) -Platanus -Plautia -Pluteus -Podalia -Poliana -Polistes canadensis -Polypodium australe -pompholyx (rotifer) -Porius -Posidonia -Potamanthus yooni -Potos -Primates -Problema -Procolobus -Pronous -Prothoe -Pseudonaja mengdeni -Psylla -Ptelea -Pteronotus -Pullimosina -puncha (snakefly) -Pusa -pygora (goat) -Pyrnus -Pyrola -Pyrus -Pythium -Quintilia -Rafflesia -Rattus norvegicus -Rattus rattus -Rhene -Rhipicephalus -Rhombodera -Rhombodera extensicollis -Rhombodera megaera -Rosales -Rotaria -Roussea -Rubia -Rugosa -Salamandra -Salpingotus -Salticus coronatus -Salvia -Samanea -Samaris -Savarna -Scarabaeus -Schiedea -Schizopetalum -Scopula gracilis -seira (springtail) -Selimus -serina (grape) -Setina -Sibbaldia -Sicyonia -Sideroxylon -Silene -Sinea -Siphonaptera -Sirenia -Sithon -Smilax -soa (barklice) -Solanum -Solidago houghtonii -Sonneratia -Sorex dispar -Sorex longirostris -Sorghum -Spea -Spelobia -sphingidae (hawk moths) -Squalus clarkae -Stegonotus -Stenonia -Stephania -Stichius -Strombus -strongylus (nematode) -Struthiomimus -suca (lacewing) -sulcia (bacteria) -superba (elm hybrid) -Swan -Sybota -Sydowia -Syllis -Syneta -Tainia -taku (whale) -Tantilla -tapejara (pterosaur) -tapia (tree) -Tarne -Tawera -Telphusa -Termitomyces schimperi -Theba -Theria -Thestor -Thomomys -Thria -Thunbergia -Thunnus -Thyasiridae -Thyreus -tinerfe (ctenophore) -Tiso -Titanophora -tortricidae (snakes) -Tortrix -Triaenophorus -Triops -Trochilus -tunga (flea) -Tusitala -Tutelina -Tympanuchus cupido pinnatus -Tyrannosaurus rex -uga (dog) -Unenlagia -Uria -Ursia -valentines (grape) -Vanessa atalanta -Varicella -velamen (ctenophore) -Velociraptor -Venia -Vinca -Vipera -vitrum (tunicate) -Vitula -wolga (rotifer) -Wulfenia -Yacolla -Zalmunna -Zelia -Zeuxippus -Zimmermannella -zinga (leafhopper) -Zingel diff --git a/backend/data/genDbpConflicts.py b/backend/data/genDbpConflicts.py deleted file mode 100755 index c0d3704..0000000 --- a/backend/data/genDbpConflicts.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/python3 - -import sys, re -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads DBpedia data from dbpedia/dbpData.db, along with tree-of-life\n" -usageInfo += "node name data from a sqlite database, and looks for potential\n" -usageInfo += "conflicts in associating node names with DBpedia-node labels. For\n" -usageInfo += "example, a node named 'homo sapiens' might have conflicting labels\n" -usageInfo += "'Homo sapiens', 'homo sapiens (novel)', and 'homo sapiens (song)'.\n" -usageInfo += "\n" -usageInfo += "Writes conflict information to file. For each conflict, a line is printed,\n" -usageInfo += "holding comma-separated DBpedia labels. If the labels include no-parentheses elements,\n" -usageInfo += "additional tab-indented lines are printed, wholding short-abstracts for those labels.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dbpDb = "dbpedia/dbpData.db" -dbFile = "data.db" -outFile = "conflicts.txt" - -# Open dbs -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbpCon = sqlite3.connect(dbpDb) -dbpCur = dbpCon.cursor() -# Get node names -print("Reading node names") -nodeNames = set() -for row in dbCur.execute("SELECT name from nodes"): - nodeNames.add(row[0]) -# Get disambiguation page labels -print("Reading disambiguation-page labels") -disambigLabels = set() -query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri" -for (label,) in dbpCur.execute(query): - disambigLabels.add(label) -# Find labels with conflicts -print("Finding conflicting labels") -nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") -nameToVariants = {} -iterNum = 0 -for (label,) in dbpCur.execute("SELECT label from labels"): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") - # - if label in disambigLabels: - continue - name = label.lower() - if name in nodeNames: - if name not in nameToVariants: - nameToVariants[name] = [label] - elif label not in nameToVariants[name]: - nameToVariants[name].append(label) - else: - match = nameVariantRegex.fullmatch(name) - if match != None: - subName = match.group(1) - if subName in nodeNames and match.group(2) != "disambiguation": - if subName not in nameToVariants: - nameToVariants[subName] = [name] # Intentionally ignoring case here - elif name not in nameToVariants[subName]: - nameToVariants[subName].append(name) -namesToRemove = set() -for (name, variants) in nameToVariants.items(): - if len(variants) == 1: - namesToRemove.add(name) -for name in namesToRemove: - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via taxon-type information -print("Resolving conflicts using instance-type data") -taxonTypes = { # Obtained from the DBpedia ontology - "http://dbpedia.org/ontology/Species", - "http://dbpedia.org/ontology/Archaea", - "http://dbpedia.org/ontology/Bacteria", - "http://dbpedia.org/ontology/Eukaryote", - "http://dbpedia.org/ontology/Plant", - "http://dbpedia.org/ontology/ClubMoss", - "http://dbpedia.org/ontology/Conifer", - "http://dbpedia.org/ontology/CultivatedVariety", - "http://dbpedia.org/ontology/Cycad", - "http://dbpedia.org/ontology/Fern", - "http://dbpedia.org/ontology/FloweringPlant", - "http://dbpedia.org/ontology/Grape", - "http://dbpedia.org/ontology/Ginkgo", - "http://dbpedia.org/ontology/Gnetophytes", - "http://dbpedia.org/ontology/GreenAlga", - "http://dbpedia.org/ontology/Moss", - "http://dbpedia.org/ontology/Fungus", - "http://dbpedia.org/ontology/Animal", - "http://dbpedia.org/ontology/Fish", - "http://dbpedia.org/ontology/Crustacean", - "http://dbpedia.org/ontology/Mollusca", - "http://dbpedia.org/ontology/Insect", - "http://dbpedia.org/ontology/Arachnid", - "http://dbpedia.org/ontology/Amphibian", - "http://dbpedia.org/ontology/Reptile", - "http://dbpedia.org/ontology/Bird", - "http://dbpedia.org/ontology/Mammal", - "http://dbpedia.org/ontology/Cat", - "http://dbpedia.org/ontology/Dog", - "http://dbpedia.org/ontology/Horse", -} -iterNum = 0 -for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") - # - if type in taxonTypes: - name = label.lower() - if name in nameToVariants: - del nameToVariants[name] - else: - match = nameVariantRegex.fullmatch(name) - if match != None: - name = match.group(1) - if name in nameToVariants: - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via category-list - # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape) -print("Resolving conflicts using category-list") -generalCategories = { - "species", "genus", - "plant", "fungus", "animal", - "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", - "fish", "amphibian", "reptile", "bird", "mammal", -} -specificCategories = { - "protist", "alveolate", "dinoflagellates", - "orchid", "Poaceae", "fern", "moss", "alga", - "bryozoan", "hydrozoan", - "sponge", "cnidarian", "coral", "polychaete", "echinoderm", - "bivalve", "gastropod", "chiton", - "shrimp", "decapod", "crab", "barnacle", "copepod", - "arachnid", "spider", "harvestman", "mite", - "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", - "beetle", "fly", "butterfly", "moth", "wasp", - "catfish", - "frog", - "lizard", - "horse", "sheep", "cattle", "mouse", -} -namesToRemove = set() -for (name, variants) in nameToVariants.items(): - found = False - for label in variants: - match = nameVariantRegex.match(label) - if match != None and match.group(2) in generalCategories: - namesToRemove.add(name) - found = True - break - if not found: - for label in variants: - match = nameVariantRegex.match(label) - if match != None and match.group(2) in specificCategories: - namesToRemove.add(name) - break -for name in namesToRemove: - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Find descriptions for plain-named labels -print("Finding descriptions for plain-named labels") -labelToDesc = {} -iterNum = 0 -query = "SELECT label, abstract from labels INNER JOIN abstracts ON labels.iri = abstracts.iri" -for (label, desc,) in dbpCur.execute(query): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") - # - if label.lower() in nameToVariants: - labelToDesc[label] = desc -print("Finding descriptions for redirect-resolved labels") -iterNum = 0 -query = "SELECT label, abstract from labels" \ - " INNER JOIN redirects ON labels.iri = redirects.iri INNER JOIN abstracts ON redirects.target = abstracts.iri" -for (label, desc,) in dbpCur.execute(query): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") - # - if label.lower() in nameToVariants: - labelToDesc[label] = desc -# -print("Writing conflict data to file") -with open(outFile, "w") as file: - for (name, variants) in nameToVariants.items(): - for n in variants: - file.write(n + ", ") - file.write("\n") - for n in variants: - if n in labelToDesc: - file.write(f"\t{n}: {labelToDesc[n]}\n") -# Close dbs -dbCon.close() -dbpCon.close() diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py index 0655344..887e8a8 100755 --- a/backend/data/genDbpData.py +++ b/backend/data/genDbpData.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, re +import sys, os, re import sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" @@ -13,6 +13,7 @@ if len(sys.argv) > 1: sys.exit(1) dbpediaDb = "dbpedia/dbpData.db" +namesToSkipFile = "dbpNamesToSkip.txt" pickedLabelsFile = "dbpPickedLabels.txt" dbFile = "data.db" @@ -24,8 +25,16 @@ dbCur = dbCon.cursor() # Get node names print("Reading node names") nodeNames = set() -for row in dbCur.execute("SELECT name from nodes"): - nodeNames.add(row[0]) +for (name,) in dbCur.execute("SELECT name from nodes"): + nodeNames.add(name) +# Skipping certain names +print("Checking for names to skip") +oldSz = len(nodeNames) +if os.path.exists(namesToSkipFile): + with open(namesToSkipFile) as file: + for line in file: + nodeNames.remove(line.rstrip()) +print(f"Skipping {oldSz - len(nodeNames)} nodes") # Get disambiguation page labels print("Reading disambiguation-page labels") disambigLabels = set() @@ -57,9 +66,9 @@ for (label,) in dbpCur.execute("SELECT label from labels"): subName = match.group(1) if subName in nodeNames and match.group(2) != "disambiguation": if subName not in nameToVariants: - nameToVariants[subName] = [name] # Intentionally ignoring case here + nameToVariants[subName] = [label] elif name not in nameToVariants[subName]: - nameToVariants[subName].append(name) + nameToVariants[subName].append(label) for (name, variants) in nameToVariants.items(): if len(variants) == 1: nodeToLabel[name] = variants[0] @@ -67,126 +76,128 @@ for name in nodeToLabel: del nameToVariants[name] nodeToLabel["cellular organisms"] = "organism" # Special case for root node print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via picked-labels -print("Resolving conflicts using picked-labels") -with open(pickedLabelsFile) as file: - for line in file: - pickedLabel = line.rstrip() - name = pickedLabel.lower() - if name in nameToVariants: - nodeToLabel[name] = pickedLabel - del nameToVariants[name] - else: - match = nameVariantRegex.match(pickedLabel) - if match == None: - print(f"WARNING: Picked label {pickedLabel} not found (1)", file=sys.stderr) +# Try resolving conflicts +def resolveWithPickedLabels(): + # Attempts conflict resolution using a file with lines of the form 'name1|label1', + # where label1 may be absent, indicating that no label should be associated with the name + print("Resolving conflicts using picked-labels") + with open(pickedLabelsFile) as file: + for line in file: + (name, _, label) = line.rstrip().partition("|") + if name not in nameToVariants: + print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr) + continue + if label == "": + del nameToVariants[name] else: - name = match.group(1) - if name not in nameToVariants: - print(f"WARNING: Picked label {pickedLabel} not found (2)", file=sys.stderr) - else: - nodeToLabel[name] = pickedLabel - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via category-list + if label not in nameToVariants[name]: + print(f"WARNING: Picked label \"{label}\" for name \"{name}\" not found", file=sys.stderr) + continue + nodeToLabel[name] = label + del nameToVariants[name] + print(f"Remaining number of conflicts: {len(nameToVariants)}") +def resolveWithCategoryList(): + # Attempts conflict resolution using category-text in labels of the form 'name1 (category1)' # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape) -print("Resolving conflicts using category-list") -generalCategories = { - "species", "genus", - "plant", "fungus", "animal", - "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", - "fish", "amphibian", "reptile", "bird", "mammal", -} -specificCategories = { - "protist", "alveolate", "dinoflagellates", - "orchid", "Poaceae", "fern", "moss", "alga", - "bryozoan", "hydrozoan", - "sponge", "cnidarian", "coral", "polychaete", "echinoderm", - "bivalve", "gastropod", "chiton", - "shrimp", "decapod", "crab", "barnacle", "copepod", - "arachnid", "spider", "harvestman", "mite", - "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", - "beetle", "fly", "butterfly", "moth", "wasp", - "catfish", - "frog", - "lizard", - "horse", "sheep", "cattle", "mouse", -} -namesToRemove = set() -for (name, variants) in nameToVariants.items(): - found = False - for label in variants: - match = nameVariantRegex.match(label) - if match != None and match.group(2) in generalCategories: - nodeToLabel[name] = label - namesToRemove.add(name) - found = True - break - if not found: + print("Resolving conflicts using category-list") + generalCategories = { + "species", "genus", + "plant", "fungus", "animal", + "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", + "fish", "amphibian", "reptile", "bird", "mammal", + } + specificCategories = { + "protist", "alveolate", "dinoflagellates", + "orchid", "Poaceae", "fern", "moss", "alga", + "bryozoan", "hydrozoan", + "sponge", "cnidarian", "coral", "polychaete", "echinoderm", + "bivalve", "gastropod", "chiton", + "shrimp", "decapod", "crab", "barnacle", "copepod", + "arachnid", "spider", "harvestman", "mite", + "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", + "beetle", "fly", "butterfly", "moth", "wasp", + "catfish", + "frog", + "lizard", + "horse", "sheep", "cattle", "mouse", + } + namesToRemove = set() + for (name, variants) in nameToVariants.items(): + found = False for label in variants: match = nameVariantRegex.match(label) - if match != None and match.group(2) in specificCategories: + if match != None and match.group(2) in generalCategories: nodeToLabel[name] = label namesToRemove.add(name) + found = True break -for name in namesToRemove: - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") -# Try conflict resolution via taxon-type information -print("Resolving conflicts using instance-type data") -taxonTypes = { # Obtained from the DBpedia ontology - "http://dbpedia.org/ontology/Species", - "http://dbpedia.org/ontology/Archaea", - "http://dbpedia.org/ontology/Bacteria", - "http://dbpedia.org/ontology/Eukaryote", - "http://dbpedia.org/ontology/Plant", - "http://dbpedia.org/ontology/ClubMoss", - "http://dbpedia.org/ontology/Conifer", - "http://dbpedia.org/ontology/CultivatedVariety", - "http://dbpedia.org/ontology/Cycad", - "http://dbpedia.org/ontology/Fern", - "http://dbpedia.org/ontology/FloweringPlant", - "http://dbpedia.org/ontology/Grape", - "http://dbpedia.org/ontology/Ginkgo", - "http://dbpedia.org/ontology/Gnetophytes", - "http://dbpedia.org/ontology/GreenAlga", - "http://dbpedia.org/ontology/Moss", - "http://dbpedia.org/ontology/Fungus", - "http://dbpedia.org/ontology/Animal", - "http://dbpedia.org/ontology/Fish", - "http://dbpedia.org/ontology/Crustacean", - "http://dbpedia.org/ontology/Mollusca", - "http://dbpedia.org/ontology/Insect", - "http://dbpedia.org/ontology/Arachnid", - "http://dbpedia.org/ontology/Amphibian", - "http://dbpedia.org/ontology/Reptile", - "http://dbpedia.org/ontology/Bird", - "http://dbpedia.org/ontology/Mammal", - "http://dbpedia.org/ontology/Cat", - "http://dbpedia.org/ontology/Dog", - "http://dbpedia.org/ontology/Horse", -} -iterNum = 0 -for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") - # - if type in taxonTypes: - name = label.lower() - if name in nameToVariants: - nodeToLabel[name] = label - del nameToVariants[name] - else: - match = nameVariantRegex.fullmatch(name) - if match != None: - name = match.group(1) - if name in nameToVariants: + if not found: + for label in variants: + match = nameVariantRegex.match(label) + if match != None and match.group(2) in specificCategories: nodeToLabel[name] = label - del nameToVariants[name] -print(f"Number of conflicts: {len(nameToVariants)}") + namesToRemove.add(name) + break + for name in namesToRemove: + del nameToVariants[name] + print(f"Remaining number of conflicts: {len(nameToVariants)}") +def resolveWithTypeData(): + # Attempts conflict-resolution using dbpedia's instance-type data + print("Resolving conflicts using instance-type data") + taxonTypes = { # Obtained from the DBpedia ontology + "http://dbpedia.org/ontology/Species", + "http://dbpedia.org/ontology/Archaea", + "http://dbpedia.org/ontology/Bacteria", + "http://dbpedia.org/ontology/Eukaryote", + "http://dbpedia.org/ontology/Plant", + "http://dbpedia.org/ontology/ClubMoss", + "http://dbpedia.org/ontology/Conifer", + "http://dbpedia.org/ontology/CultivatedVariety", + "http://dbpedia.org/ontology/Cycad", + "http://dbpedia.org/ontology/Fern", + "http://dbpedia.org/ontology/FloweringPlant", + "http://dbpedia.org/ontology/Grape", + "http://dbpedia.org/ontology/Ginkgo", + "http://dbpedia.org/ontology/Gnetophytes", + "http://dbpedia.org/ontology/GreenAlga", + "http://dbpedia.org/ontology/Moss", + "http://dbpedia.org/ontology/Fungus", + "http://dbpedia.org/ontology/Animal", + "http://dbpedia.org/ontology/Fish", + "http://dbpedia.org/ontology/Crustacean", + "http://dbpedia.org/ontology/Mollusca", + "http://dbpedia.org/ontology/Insect", + "http://dbpedia.org/ontology/Arachnid", + "http://dbpedia.org/ontology/Amphibian", + "http://dbpedia.org/ontology/Reptile", + "http://dbpedia.org/ontology/Bird", + "http://dbpedia.org/ontology/Mammal", + "http://dbpedia.org/ontology/Cat", + "http://dbpedia.org/ontology/Dog", + "http://dbpedia.org/ontology/Horse", + } + iterNum = 0 + for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"Processing line {iterNum}") + # + if type in taxonTypes: + name = label.lower() + if name in nameToVariants: + nodeToLabel[name] = label + del nameToVariants[name] + else: + match = nameVariantRegex.fullmatch(name) + if match != None: + name = match.group(1) + if name in nameToVariants: + nodeToLabel[name] = label + del nameToVariants[name] + print(f"Remaining number of conflicts: {len(nameToVariants)}") +resolveWithPickedLabels() # Associate nodes with IRIs -print("Getting nodes IRIs") +print("Getting node IRIs") nodeToIri = {} iterNum = 0 for (name, label) in nodeToLabel.items(): @@ -212,6 +223,7 @@ for (name, iri) in nodeToIri.items(): # Find descriptions, and add to db print("Adding node description data") dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)") +dbCur.execute("CREATE INDEX descs_id_idx ON descs(wiki_id)") # wiki_id intentionally left non-unique iterNum = 0 for (name, iri) in nodeToIri.items(): iterNum += 1 -- cgit v1.2.3