aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genDbpConflicts.py
blob: c0d37049743b9e1a0528c5776c5b13a7269fe679 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/python3

import sys, re
import sqlite3

# Usage text. The script takes no arguments, so supplying any argument
# prints this text to stderr and exits with status 1.
usageInfo =  f"usage: {sys.argv[0]}\n"
usageInfo += "Reads DBpedia data from dbpedia/dbpData.db, along with tree-of-life\n"
usageInfo += "node name data from a sqlite database, and looks for potential\n"
usageInfo += "conflicts in associating node names with DBpedia-node labels. For\n"
usageInfo += "example, a node named 'homo sapiens' might have conflicting labels\n"
usageInfo += "'Homo sapiens', 'homo sapiens (novel)', and 'homo sapiens (song)'.\n"
usageInfo += "\n"
usageInfo += "Writes conflict information to file. For each conflict, a line is printed,\n"
usageInfo += "holding comma-separated DBpedia labels. If the labels include no-parentheses elements,\n"
usageInfo += "additional tab-indented lines are printed, holding short-abstracts for those labels.\n"
if len(sys.argv) > 1:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Input databases and output file (relative paths)
dbpDb = "dbpedia/dbpData.db"
dbFile = "data.db"
outFile = "conflicts.txt"

# Connect to the node database and to the DBpedia database
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbpCon = sqlite3.connect(dbpDb)
dbpCur = dbpCon.cursor()
# Collect every node name into a set, for fast membership tests
print("Reading node names")
nodeNames = {nodeName for (nodeName,) in dbCur.execute("SELECT name from nodes")}
# Get disambiguation page labels
	# Selects the label text rather than the IRI: this set is tested against
	# values read from labels.label, which an IRI would never equal
print("Reading disambiguation-page labels")
query = "SELECT labels.label from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
disambigLabels = {label for (label,) in dbpCur.execute(query)}
# Find labels with conflicts
	# Builds a map from node name to candidate DBpedia labels. A label is a candidate
	# if it equals the node name when lowercased (stored with its original case), or
	# if it has the form 'name (qualifier)' when lowercased (stored lowercased).
print("Finding conflicting labels")
nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)")
nameToVariants = {}
for (iterNum, (label,)) in enumerate(dbpCur.execute("SELECT label from labels"), 1):
	if iterNum % 100000 == 0:
		print(f"Processing line {iterNum}")
	# Skip labels recorded as disambiguation pages
	if label in disambigLabels:
		continue
	name = label.lower()
	if name in nodeNames:
		targetName, variant = name, label
	else:
		match = nameVariantRegex.fullmatch(name)
		if match is None:
			continue
		if match.group(1) not in nodeNames or match.group(2) == "disambiguation":
			continue
		targetName, variant = match.group(1), name # Intentionally ignoring case here
	knownVariants = nameToVariants.setdefault(targetName, [])
	if variant not in knownVariants:
		knownVariants.append(variant)
# A name with exactly one candidate label is not a conflict
nameToVariants = {name: variants for (name, variants) in nameToVariants.items() if len(variants) != 1}
print(f"Number of conflicts: {len(nameToVariants)}")
# Try conflict resolution via taxon-type information
	# A conflict is dropped when a label typed as a taxon either equals the node name
	# (lowercased) or has the form 'name (qualifier)' for that node name
print("Resolving conflicts using instance-type data")
taxonTypes = { # Obtained from the DBpedia ontology
	"http://dbpedia.org/ontology/Species",
	"http://dbpedia.org/ontology/Archaea",
	"http://dbpedia.org/ontology/Bacteria",
	"http://dbpedia.org/ontology/Eukaryote",
	"http://dbpedia.org/ontology/Plant",
	"http://dbpedia.org/ontology/ClubMoss",
	"http://dbpedia.org/ontology/Conifer",
	"http://dbpedia.org/ontology/CultivatedVariety",
	"http://dbpedia.org/ontology/Cycad",
	"http://dbpedia.org/ontology/Fern",
	"http://dbpedia.org/ontology/FloweringPlant",
	"http://dbpedia.org/ontology/Grape",
	"http://dbpedia.org/ontology/Ginkgo",
	"http://dbpedia.org/ontology/Gnetophytes",
	"http://dbpedia.org/ontology/GreenAlga",
	"http://dbpedia.org/ontology/Moss",
	"http://dbpedia.org/ontology/Fungus",
	"http://dbpedia.org/ontology/Animal",
	"http://dbpedia.org/ontology/Fish",
	"http://dbpedia.org/ontology/Crustacean",
	"http://dbpedia.org/ontology/Mollusca",
	"http://dbpedia.org/ontology/Insect",
	"http://dbpedia.org/ontology/Arachnid",
	"http://dbpedia.org/ontology/Amphibian",
	"http://dbpedia.org/ontology/Reptile",
	"http://dbpedia.org/ontology/Bird",
	"http://dbpedia.org/ontology/Mammal",
	"http://dbpedia.org/ontology/Cat",
	"http://dbpedia.org/ontology/Dog",
	"http://dbpedia.org/ontology/Horse",
}
iterNum = 0
for (label, typeIri) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
	iterNum += 1
	if iterNum % 100000 == 0:
		print(f"Processing line {iterNum}")
	# Only taxon-typed labels can resolve a conflict
	if typeIri not in taxonTypes:
		continue
	name = label.lower()
	if name in nameToVariants:
		del nameToVariants[name]
		continue
	# Otherwise, try the part of the label before a trailing '(qualifier)'
	match = nameVariantRegex.fullmatch(name)
	if match is not None and match.group(1) in nameToVariants:
		del nameToVariants[match.group(1)]
print(f"Number of conflicts: {len(nameToVariants)}")
# Try conflict resolution via category-list
	# A conflict is dropped when any of its labels carries a parenthesised qualifier that
	# names a biological category, e.g. 'pan (genus)'. Qualifiers are lowercased before
	# the comparison, as parenthesised variants are stored lowercased, so every category
	# entry must be lowercase. The general/specific grouping is kept for readability;
	# a name is dropped if a qualifier is in either set, so the check order has no effect.
print("Resolving conflicts using category-list")
generalCategories = {
	"species", "genus",
	"plant", "fungus", "animal",
	"annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
	"fish", "amphibian", "reptile", "bird", "mammal",
}
specificCategories = {
	"protist", "alveolate", "dinoflagellates",
	"orchid", "poaceae", "fern", "moss", "alga",
	"bryozoan", "hydrozoan",
	"sponge", "cnidarian", "coral", "polychaete", "echinoderm",
	"bivalve", "gastropod", "chiton",
	"shrimp", "decapod", "crab", "barnacle", "copepod",
	"arachnid", "spider", "harvestman", "mite",
	"dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
		"beetle", "fly", "butterfly", "moth", "wasp",
	"catfish",
	"frog",
	"lizard",
	"horse", "sheep", "cattle", "mouse",
}
taxonCategories = generalCategories | specificCategories
namesToRemove = set()
for (name, variants) in nameToVariants.items():
	for label in variants:
		match = nameVariantRegex.match(label)
		if match is not None and match.group(2).lower() in taxonCategories:
			namesToRemove.add(name)
			break
# Deleted in a second step, to avoid mutating the dict while iterating over it
for name in namesToRemove:
	del nameToVariants[name]
print(f"Number of conflicts: {len(nameToVariants)}")
# Find descriptions for labels that equal a conflicting node name when lowercased
	# Two passes: the label's own abstract, then the abstract of the label's redirect target.
	# A description found in the second pass replaces one found in the first.
labelToDesc = {}
descPasses = [
	(
		"Finding descriptions for plain-named labels",
		"SELECT label, abstract from labels INNER JOIN abstracts ON labels.iri = abstracts.iri",
	),
	(
		"Finding descriptions for redirect-resolved labels",
		"SELECT label, abstract from labels"
		" INNER JOIN redirects ON labels.iri = redirects.iri INNER JOIN abstracts ON redirects.target = abstracts.iri",
	),
]
for (progressMsg, query) in descPasses:
	print(progressMsg)
	iterNum = 0
	for (label, desc) in dbpCur.execute(query):
		iterNum += 1
		if iterNum % 100000 == 0:
			print(f"Processing line {iterNum}")
		if label.lower() in nameToVariants:
			labelToDesc[label] = desc
# Write one line per conflict, listing its candidate labels, followed by tab-indented
# 'label: abstract' lines for those labels that have a description
print("Writing conflict data to file")
# Explicit UTF-8: DBpedia labels and abstracts may contain non-ASCII text, and the
# default text encoding used by open() depends on the platform/locale
with open(outFile, "w", encoding="utf-8") as file:
	for variants in nameToVariants.values():
		# Each label is followed by ', ' (including the last one)
		file.write("".join(n + ", " for n in variants) + "\n")
		for n in variants:
			if n in labelToDesc:
				file.write(f"\t{n}: {labelToDesc[n]}\n")
# Close dbs
dbCon.close()
dbpCon.close()