1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
|
#!/usr/bin/python3
import sys, re
import sqlite3
# Usage text describing what the script does
usageInfo = "".join([
    f"usage: {sys.argv[0]}\n",
    "Reads DBpedia data from dbpedia/*, along with tree-of-life\n",
    "node and name data from a sqlite database, associates nodes with\n",
    "DBpedia IRIs, and adds alt-name and description information for\n",
    "those nodes.\n",
])
# The script accepts no arguments: if any is given, print the usage text
# to stderr and exit with status 1
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)
# File locations
dbpediaDb = "dbpedia/dbpData.db" # sqlite db holding the DBpedia tables queried via dbpCur
pickedLabelsFile = "dbpPickedLabels.txt" # text file with one manually picked label per line
dbFile = "data.db" # sqlite db holding the 'nodes' table; receives the 'descs' table
# Open dbs (one connection and one cursor per database)
dbpCon, dbCon = sqlite3.connect(dbpediaDb), sqlite3.connect(dbFile)
dbpCur, dbCur = dbpCon.cursor(), dbCon.cursor()
# Get node names
# Collects every value of nodes.name into a set for fast membership tests
print("Reading node names")
nodeNames = {nodeName for (nodeName,) in dbCur.execute("SELECT name from nodes")}
# Get disambiguation page labels
# The label column is selected (not the IRI): this set is later tested against
# label strings, so a set of IRIs would make that filter never match.
print("Reading disambiguation-page labels")
query = "SELECT labels.label from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
disambigLabels = {label for (label,) in dbpCur.execute(query)}
# Try associating nodes with IRIs, accounting for disambiguation labels
# Builds nameToVariants (node name -> candidate labels), then moves every name
# with exactly one candidate into nodeToLabel; the rest remain as conflicts.
print("Trying to associate nodes with labels")
nodeToLabel = {}
nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Matches 'name (qualifier)'
nameToVariants = {}
iterNum = 0
for iterNum, (label,) in enumerate(dbpCur.execute("SELECT label from labels"), 1):
    if iterNum % 100000 == 0:
        print("Processing line {}".format(iterNum))
    # Labels of disambiguation pages are never candidates
    if label in disambigLabels:
        continue
    name = label.lower()
    if name in nodeNames:
        # Label equals a node name (ignoring case): keep the label as written
        candidates = nameToVariants.setdefault(name, [])
        if label not in candidates:
            candidates.append(label)
        continue
    # Otherwise accept 'name (qualifier)' labels whose base name is a node
    match = nameVariantRegex.fullmatch(name)
    if match is None:
        continue
    subName, qualifier = match.groups()
    if subName not in nodeNames or qualifier == "disambiguation":
        continue
    candidates = nameToVariants.setdefault(subName, [])
    if name not in candidates: # Intentionally ignoring case here
        candidates.append(name)
# Names with a single candidate are settled, and no longer count as conflicts
settledNames = [name for (name, variants) in nameToVariants.items() if len(variants) == 1]
for name in settledNames:
    nodeToLabel[name] = nameToVariants.pop(name)[0]
nodeToLabel["cellular organisms"] = "organism" # Special case for root node
print("Number of conflicts: {}".format(len(nameToVariants)))
# Try conflict resolution via category-list
# Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
print("Resolving conflicts using category-list")
generalCategories = {
    "species", "genus",
    "plant", "fungus", "animal",
    "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
    "fish", "amphibian", "reptile", "bird", "mammal",
}
specificCategories = {
    "protist", "alveolate", "dinoflagellates",
    "orchid", "Poaceae", "fern", "moss", "alga",
    "bryozoan", "hydrozoan",
    "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
    "bivalve", "gastropod", "chiton",
    "shrimp", "decapod", "crab", "barnacle", "copepod",
    "arachnid", "spider", "harvestman", "mite",
    "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
    "beetle", "fly", "butterfly", "moth", "wasp",
    "catfish",
    "frog",
    "lizard",
    "horse", "sheep", "cattle", "mouse",
}
# Categories are compared case-insensitively: variants may have been stored
# lowercased, so a capitalised entry such as "Poaceae" would otherwise never match them
generalCategoriesLower = {category.lower() for category in generalCategories}
specificCategoriesLower = {category.lower() for category in specificCategories}
def _pickByCategory(variants, categoriesLower):
    """Return the first variant shaped like 'name (category)' whose category,
    lowercased, is in categoriesLower. Return None if there is no such variant."""
    for label in variants:
        match = nameVariantRegex.match(label)
        if match is not None and match.group(2).lower() in categoriesLower:
            return label
    return None
# Names are collected first and deleted afterwards, so the dict is not resized while iterated
namesToRemove = set()
for (name, variants) in nameToVariants.items():
    picked = _pickByCategory(variants, generalCategoriesLower)
    if picked is None:
        picked = _pickByCategory(variants, specificCategoriesLower)
    if picked is not None:
        nodeToLabel[name] = picked
        namesToRemove.add(name)
for name in namesToRemove:
    del nameToVariants[name]
print("Number of conflicts: {}".format(len(nameToVariants)))
# Try conflict resolution via taxon-type information
# A conflicting name is resolved to the first label encountered whose IRI has one of the taxon types
print("Resolving conflicts using instance-type data")
taxonTypes = { # Obtained from the DBpedia ontology
    "http://dbpedia.org/ontology/Species",
    "http://dbpedia.org/ontology/Archaea",
    "http://dbpedia.org/ontology/Bacteria",
    "http://dbpedia.org/ontology/Eukaryote",
    "http://dbpedia.org/ontology/Plant",
    "http://dbpedia.org/ontology/ClubMoss",
    "http://dbpedia.org/ontology/Conifer",
    "http://dbpedia.org/ontology/CultivatedVariety",
    "http://dbpedia.org/ontology/Cycad",
    "http://dbpedia.org/ontology/Fern",
    "http://dbpedia.org/ontology/FloweringPlant",
    "http://dbpedia.org/ontology/Grape",
    "http://dbpedia.org/ontology/Ginkgo",
    "http://dbpedia.org/ontology/Gnetophytes",
    "http://dbpedia.org/ontology/GreenAlga",
    "http://dbpedia.org/ontology/Moss",
    "http://dbpedia.org/ontology/Fungus",
    "http://dbpedia.org/ontology/Animal",
    "http://dbpedia.org/ontology/Fish",
    "http://dbpedia.org/ontology/Crustacean",
    "http://dbpedia.org/ontology/Mollusca",
    "http://dbpedia.org/ontology/Insect",
    "http://dbpedia.org/ontology/Arachnid",
    "http://dbpedia.org/ontology/Amphibian",
    "http://dbpedia.org/ontology/Reptile",
    "http://dbpedia.org/ontology/Bird",
    "http://dbpedia.org/ontology/Mammal",
    "http://dbpedia.org/ontology/Cat",
    "http://dbpedia.org/ontology/Dog",
    "http://dbpedia.org/ontology/Horse",
}
iterNum = 0
typeQuery = "SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"
# The type column is bound to 'iriType' so the builtin 'type' is not rebound at module scope
for iterNum, (label, iriType) in enumerate(dbpCur.execute(typeQuery), 1):
    if iterNum % 100000 == 0:
        print("Processing line {}".format(iterNum))
    #
    if iriType not in taxonTypes:
        continue
    name = label.lower()
    if name not in nameToVariants:
        # Fall back to the base name of a 'name (qualifier)' label
        match = nameVariantRegex.fullmatch(name)
        if match is None:
            continue
        name = match.group(1)
        if name not in nameToVariants:
            continue
    nodeToLabel[name] = label
    del nameToVariants[name]
print("Number of conflicts: {}".format(len(nameToVariants)))
# Try conflict resolution via picked-labels
# Each non-blank line of the picked-labels file is a label to use for a still-conflicting node.
# A label that matches no remaining conflict produces a warning on stderr.
print("Resolving conflicts using picked-labels")
with open(pickedLabelsFile, encoding="utf-8") as pickedFile: # Explicit encoding, so reading doesn't depend on the locale
    for line in pickedFile:
        pickedLabel = line.rstrip()
        if not pickedLabel:
            continue # Blank lines carry no label, and shouldn't trigger warnings
        name = pickedLabel.lower()
        if name not in nameToVariants:
            # Fall back to the base name of a 'name (qualifier)' label
            match = nameVariantRegex.match(pickedLabel)
            if match is None:
                print("WARNING: Picked label {} not found (1)".format(pickedLabel), file=sys.stderr)
                continue
            # Conflict keys are lowercased names, so the base name must be lowercased too
            name = match.group(1).lower()
            if name not in nameToVariants:
                print("WARNING: Picked label {} not found (2)".format(pickedLabel), file=sys.stderr)
                continue
        nodeToLabel[name] = pickedLabel
        del nameToVariants[name]
print("Number of conflicts: {}".format(len(nameToVariants)))
# Associate nodes with IRIs
# Looks up each chosen label (case-insensitively) in the labels table, and takes the IRI
# of the first row returned. A label with no row is treated as fatal.
print("Getting nodes IRIs")
nodeToIri = {}
iterNum = 0
iriQuery = "SELECT iri FROM labels where label = ? COLLATE NOCASE"
for (name, label) in nodeToLabel.items():
    row = dbpCur.execute(iriQuery, (label,)).fetchone()
    if row is None:
        print("ERROR: Couldn't find label {}".format(label), file=sys.stderr)
        sys.exit(1)
    nodeToIri[name] = row[0]
# Resolve redirects
# Replaces each node's IRI with its redirect target, if the redirects table has one
print("Resolving redirects")
redirectingIriSet = set() # IRIs that were replaced by a redirect target
redirectedNames = set() # Names of the nodes whose IRI was replaced
iterNum = 0
# Iterates over a snapshot, since nodeToIri's values are reassigned inside the loop
for iterNum, (name, iri) in enumerate(list(nodeToIri.items()), 1):
    if iterNum % 10000 == 0:
        print("At iteration {}".format(iterNum))
    #
    row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
    if row is not None:
        nodeToIri[name] = row[0]
        redirectingIriSet.add(iri)
        redirectedNames.add(name)
# Find descriptions, and add to db
# Nodes whose IRI has no abstract get no row in descs
print("Adding node description data")
dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)")
iterNum = 0
for iterNum, (name, iri) in enumerate(nodeToIri.items(), 1):
    if iterNum % 10000 == 0:
        print("At iteration {}".format(iterNum))
    #
    row = dbpCur.execute("SELECT abstract FROM abstracts where iri = ?", (iri,)).fetchone()
    if row is not None:
        # The flag is looked up by node name: redirectingIriSet holds IRIs, so testing
        # the name against it would practically always give 0
        redirected = 1 if name in redirectedNames else 0
        dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], redirected))
# Close dbs
# Commit then close each connection, the tree-of-life db first
for con in (dbCon, dbpCon):
    con.commit()
    con.close()
|