aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/gen_name_data.py
blob: 2e92c20835e9864daf304849d8394250f5a1b40a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python3

"""
Maps nodes to vernacular names, using data from EOL, enwiki, and a
picked-names file, and stores results in the database.
"""

import re, os
import html, csv, sqlite3

EOL_NAMES_FILE = os.path.join('eol', 'vernacularNames.csv')
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
PICKED_NAMES_FILE = 'picked_names.txt'
DB_FILE = 'data.db'

def genData(eolNamesFile: str, enwikiDb: str, pickedNamesFile: str, dbFile: str) -> None:
	""" Reads the files and adds to db """
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()
	#
	print('Creating table')
	dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
	dbCur.execute('CREATE INDEX names_idx ON names(name)')
	dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
	dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
	#
	print('Getting node mappings')
	nodeToTips: dict[str, int] = {}
	for name, tips in dbCur.execute('SELECT name, tips from nodes'):
		nodeToTips[name] = tips
	#
	addEolNames(eolNamesFile, nodeToTips, dbCur)
	addEnwikiNames(enwikiDb, nodeToTips, dbCur)
	addPickedNames(pickedNamesFile, nodeToTips, dbCur)
	#
	print('Closing database')
	dbCon.commit()
	dbCon.close()
def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
	""" Reads EOL names, associates them with otol nodes, and writes to db """
	# The CSV file has a header line, then lines with these fields:
		# page_id, canonical_form (canonical name, not always unique to page ID),
		# vernacular_string (vernacular name), language_code,
		# resource_name, is_preferred_by_resource, is_preferred_by_eol
	print('Getting EOL mappings')
	eolIdToNode: dict[int, str] = {} # Maps eol ID to node name (if there are multiple, choose one with most tips)
	for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
		if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
			eolIdToNode[eolId] = name
	print('Adding names from EOL')
	namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
	with open(eolNamesFile, newline='') as file:
		for lineNum, fields in enumerate(csv.reader(file), 1):
			if lineNum % 1e5 == 0:
				print(f'At line {lineNum}') # Reached about 2.8e6
			# Skip header line
			if lineNum == 1:
				continue
			# Parse line
			eolId = int(fields[0])
			name = html.unescape(fields[2]).lower()
			lang = fields[3]
			isPreferred = 1 if fields[6] == 'preferred' else 0
			# Add to db
			if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
				and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
				cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')'
					# The 'OR IGNORE' accounts for duplicate lines
				dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
	""" Reads enwiki names, associates them with otol nodes, and writes to db """
	print('Getting enwiki mappings')
	wikiIdToNode: dict[int, str] = {}
	for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
		if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
			wikiIdToNode[wikiId] = name
	print('Adding names from enwiki')
	altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
	enwikiCon = sqlite3.connect(enwikiDb)
	enwikiCur = enwikiCon.cursor()
	iterNum = 0
	for wikiId, nodeName in wikiIdToNode.items():
		iterNum += 1
		if iterNum % 1e4 == 0:
			print(f'At iteration {iterNum}') # Reached about 3.6e5
		#
		query = 'SELECT p1.title FROM pages p1' \
			' INNER JOIN redirects r1 ON p1.id = r1.id' \
			' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
		for (name,) in enwikiCur.execute(query, (wikiId,)):
			name = name.lower()
			if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips:
				dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
	# File format:
		# nodename1|altName1|isPreferred1 -> Add an alt-name
		# nodename1|altName1|             -> Remove an alt-name
		# nodename1|nodeName1|            -> Remove any preferred-alt status
	if os.path.exists(pickedNamesFile):
		print('Getting picked names')
		with open(pickedNamesFile) as file:
			for line in file:
				nodeName, altName, isPreferredStr = line.lower().rstrip().split('|')
				if nodeName not in nodeToTips:
					print(f'Skipping "{nodeName}", as no such node exists')
					continue
				if isPreferredStr:
					isPreferred = 1 if isPreferredStr == '1' else 0
					if isPreferred == 1:
						# Remove any existing preferred-alt status
						cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? AND pref_alt = 1'
						dbCur.execute(cmd, (nodeName, altName))
					# Remove any existing record
					dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
					# Add record
					dbCur.execute('INSERT INTO names VALUES (?, ?, ?, "picked")', (nodeName, altName, isPreferred))
				elif nodeName != altName: # Remove any matching record
					dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
				else: # Remove any preferred-alt status
					cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND pref_alt = 1'
					dbCur.execute(cmd, (nodeName,))

if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	args = parser.parse_args()
	#
	genData(EOL_NAMES_FILE, ENWIKI_DB, PICKED_NAMES_FILE, DB_FILE)