aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/gen_imgs.py
blob: 2479742bb556399ae0555ef4bea083f171322192 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/python3

"""
Reads node IDs and image paths from a file, and possibly from a directory,
and generates cropped/resized versions of those images into a directory,
with names of the form 'nodeId1.jpg'. Also adds image metadata to the
database.

SIGINT can be used to stop, and the program can be re-run to continue
processing. It uses already-existing database entries to decide what
to skip.
"""

import argparse
import os
import subprocess
import sqlite3
import urllib.parse
import signal

# Paths passed to genImgs() when run as a script (all relative)
IMG_LIST_FILE = 'img_list.txt' # Text file whose lines hold a node ID, optionally followed by a space and an image path
EOL_IMG_DIR = os.path.join('eol', 'imgs') # Used to decide which IMG_LIST_FILE lines denote chosen EOL images
OUT_DIR = 'img' # Directory that generated images get written into
EOL_IMG_DB = os.path.join('eol', 'images_list.db') # Sqlite db queried for EOL image info (source url, license, copyright owner)
ENWIKI_IMG_DB = os.path.join('enwiki', 'img_data.db') # Sqlite db queried for enwiki image info (name, license, artist, credit)
PICKED_IMGS_DIR = 'picked_imgs' # Directory holding picked images, along with PICKED_IMGS_FILE
PICKED_IMGS_FILE = 'img_data.txt' # File with lines of the form 'filename|url|license|artist|credit'
DB_FILE = 'data.db' # Sqlite db with a 'nodes' table, which gets 'node_imgs' and 'images' tables added and filled

IMG_OUT_SZ = 200 # Width and height requested from smartcrop for each generated image

ImgId = tuple[int, str] # Holds an int ID and a source string (eg: 'eol')

class PickedImg:
	"""
	Holds info about one picked-image from pickedImgsDir

	The attributes mirror the constructor arguments: the associated node's name,
	an integer image ID, the image's filename, and its url, license, artist, and credit strings.
	"""
	def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str):
		# Associated node, and the image's ID
		self.nodeName, self.id = nodeName, id
		# Name of the image file
		self.filename = filename
		# Attribution info
		self.url, self.license, self.artist, self.credit = url, license, artist, credit

def genImgs(
		imgListFile: str, eolImgDir: str, outDir: str, eolImgDb: str, enwikiImgDb: str,
		pickedImgsDir: str, pickedImgsFile: str, dbFile: str) -> None:
	"""
	Reads the image-list file, generates images, and updates db

	Picked-images get processed first, and EOL/enwiki images are only processed if that succeeds.
	Nodes and images that already have entries in the db are skipped.
	Pending db changes get committed even if processing stops early or raises an exception,
	so that the db keeps matching the images generated so far, and a re-run can continue from there.
	"""
	if not os.path.exists(outDir):
		os.mkdir(outDir)
	dbCon = sqlite3.connect(dbFile)
	try:
		dbCur = dbCon.cursor()

		print('Checking for image tables')
		nodesDone: set[str] = set() # Holds IDs of nodes that already have an image
		imgsDone: set[ImgId] = set() # Holds (id, src) pairs of images already in the db
		# Note: SQL string literals use single quotes (double quotes denote identifiers in standard SQL)
		query = "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'node_imgs'"
		if dbCur.execute(query).fetchone() is None:
			# Add image tables if not present
			dbCur.execute('CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)')
			dbCur.execute('CREATE TABLE images (' \
				'id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))')
		else:
			# Get existing image-associated nodes
			query = 'SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name'
			for (otolId,) in dbCur.execute(query):
				nodesDone.add(otolId)
			# Get existing node-associated images
			for imgId, imgSrc in dbCur.execute('SELECT id, src from images'):
				imgsDone.add((imgId, imgSrc))
			print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')

		print('Processing picked-images')
		success = processPickedImgs(pickedImgsDir, pickedImgsFile, nodesDone, imgsDone, outDir, dbCur)
		if success:
			print('Processing images from eol and enwiki')
			processImgs(imgListFile, eolImgDir, eolImgDb, enwikiImgDb, nodesDone, imgsDone, outDir, dbCur)
	finally:
		# Closing without committing would discard the entries for images that were already generated
		try:
			dbCon.commit()
		finally:
			dbCon.close()

def processPickedImgs(
		pickedImgsDir: str, pickedImgsFile: str, nodesDone: set[str], imgsDone: set[ImgId],
		outDir: str, dbCur: sqlite3.Cursor) -> bool:
	"""
	Converts picked-images and updates db, returning False upon interruption or failure

	Each line of the picked-images file is expected to have the form 'filename|url|license|artist|credit',
	where the filename, minus its extension, is a node name. A line's number is used as the image's ID.
	Blank lines are skipped. A malformed line, or one naming an unknown node, is reported and counts as a failure.
	If the file doesn't exist, there is nothing to do, and True is returned.
	Processed nodes and images get added to nodesDone and imgsDone.
	"""
	# Read picked-image data
	nodeToPickedImg: dict[str, PickedImg] = {} # Maps node IDs to picked-image data
	pickedImgsPath = os.path.join(pickedImgsDir, pickedImgsFile)
	if os.path.exists(pickedImgsPath):
		with open(pickedImgsPath) as file:
			for lineNum, line in enumerate(file, 1):
				line = line.rstrip()
				if not line:
					continue # Blank lines still count towards line numbers, so image IDs stay stable
				fields = line.split('|')
				if len(fields) != 5:
					print(f'ERROR: Expected 5 fields on line {lineNum} of {pickedImgsPath}, but found {len(fields)}')
					return False
				filename, url, license, artist, credit = fields
				nodeName = os.path.splitext(filename)[0] # Remove extension
				row = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone()
				if row is None:
					print(f'ERROR: No node found with name "{nodeName}" (line {lineNum} of {pickedImgsPath})')
					return False
				(otolId,) = row
				nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit)

	# Set SIGINT handler
	interrupted = False
	def onSigint(sig, frame):
		# Only record the event, so the loop below can stop between images
		nonlocal interrupted
		interrupted = True
	signal.signal(signal.SIGINT, onSigint)

	# Convert images
	for otolId, imgData in nodeToPickedImg.items():
		# Check for SIGINT event
		if interrupted:
			print('Exiting')
			return False

		# Skip if already processed
		if otolId in nodesDone:
			continue

		# Convert image
		success = convertImage(os.path.join(pickedImgsDir, imgData.filename), os.path.join(outDir, otolId + '.jpg'))
		if not success:
			return False

		# Add entry to db
		if (imgData.id, 'picked') not in imgsDone:
			dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
				(imgData.id, 'picked', imgData.url, imgData.license, imgData.artist, imgData.credit))
			imgsDone.add((imgData.id, 'picked'))
		dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked'))
		nodesDone.add(otolId)
	return True

def processImgs(
		imgListFile: str, eolImgDir: str, eolImgDb: str, enwikiImgDb: str,
		nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool:
	"""
	Converts EOL and enwiki images, and updates db, returning False upon interruption or failure

	Each image-list line is expected to hold a node ID, optionally followed by a space and an image path.
	Paths starting with eolImgDir denote EOL images, with names like 'eolId contentId.ext'.
	Other paths denote enwiki images, with names like 'pageId.ext'.
	Node and image records are looked up before converting, so that a missing record
	doesn't leave behind an output image without db entries (which would block a re-run).
	Processed nodes and images get added to nodesDone and imgsDone.
	"""
	eolCon = enwikiCon = None
	try:
		eolCon = sqlite3.connect(eolImgDb)
		eolCur = eolCon.cursor()
		enwikiCon = sqlite3.connect(enwikiImgDb)
		enwikiCur = enwikiCon.cursor()

		# Set SIGINT handler
		interrupted = False
		def onSigint(sig, frame):
			# Only record the event, so the loop below can stop between images
			nonlocal interrupted
			interrupted = True
		signal.signal(signal.SIGINT, onSigint)

		# Convert images
		with open(imgListFile) as file:
			for line in file:
				# Check for SIGINT event
				if interrupted:
					print('Exiting')
					return False

				# Get node ID and image path, skipping lines without an image path
				otolId, _, imgPath = line.rstrip().partition(' ')
				if not imgPath:
					continue

				# Skip if already processed
				if otolId in nodesDone:
					continue

				# Get node name
				row = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
				if row is None:
					print(f'ERROR: No node found with ID {otolId}')
					return False
				(nodeName,) = row

				# Get image ID and source, and look up image info if the image isn't in the db yet
				imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
				imgName = os.path.splitext(imgName)[0] # Remove extension
				imgRow = None # Holds a row to add to the images table, if one is needed
				if imgPath.startswith(eolImgDir):
					eolIdStr, _, contentIdStr = imgName.partition(' ')
					imgId, contentId = int(eolIdStr), int(contentIdStr)
					imgSrc = 'eol'
					if (imgId, imgSrc) not in imgsDone:
						query = 'SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?'
						row = eolCur.execute(query, (contentId,)).fetchone()
						if row is None:
							print(f'ERROR: No image record for EOL ID {imgId}, content ID {contentId}')
							return False
						url, license, owner = row
						imgRow = (imgId, imgSrc, url, license, owner, '')
				else:
					imgId, imgSrc = int(imgName), 'enwiki'
					if (imgId, imgSrc) not in imgsDone:
						query = 'SELECT name, license, artist, credit FROM' \
							' page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name' \
							' WHERE page_imgs.page_id = ?'
						row = enwikiCur.execute(query, (imgId,)).fetchone()
						if row is None:
							print(f'ERROR: No image record for enwiki ID {imgId}')
							return False
						name, license, artist, credit = row
						url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
						imgRow = (imgId, imgSrc, url, license, artist, credit)

				# Convert image
				if not convertImage(imgPath, os.path.join(outDir, otolId + '.jpg')):
					return False

				# Add entries to db
				if imgRow is not None:
					dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', imgRow)
					imgsDone.add((imgId, imgSrc))
				dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, imgId, imgSrc))
				nodesDone.add(otolId)
		return True
	finally:
		# Close the image-info databases on every exit path, including exceptions
		for con in (eolCon, enwikiCon):
			if con is not None:
				con.close()

def convertImage(imgPath: str, outPath: str) -> bool:
	"""
	Uses smartcrop-cli (via npx) to write a cropped/resized version of an image, returning True on success

	An existing output image is never overwritten, and is treated as a failure.
	If the conversion itself fails, any output file it may have created gets removed,
	so that a leftover file doesn't cause every later run to stop with an 'already exists' error.
	"""
	print(f'Converting {imgPath} to {outPath}')
	if os.path.exists(outPath):
		print('ERROR: Output image already exists')
		return False

	success = False
	try:
		completedProcess = subprocess.run(
			['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
			stdout=subprocess.DEVNULL
		)
	except Exception as e:
		print(f'ERROR: Exception while attempting to run smartcrop: {e}')
	else:
		if completedProcess.returncode != 0:
			print(f'ERROR: smartcrop had exit status {completedProcess.returncode}')
		else:
			success = True

	if not success and os.path.exists(outPath):
		# The output didn't exist before the attempt, so anything there now came from the failed run
		try:
			os.remove(outPath)
		except OSError as e:
			print(f'WARNING: Unable to remove incomplete output image: {e}')
	return success

if __name__ == '__main__':
	# No options are defined, so parsing only serves to provide --help and reject unexpected arguments
	parser = argparse.ArgumentParser(
		formatter_class=argparse.RawDescriptionHelpFormatter,
		description=__doc__,
	)
	parser.parse_args()

	# Run using the default paths
	genImgs(
		IMG_LIST_FILE, EOL_IMG_DIR, OUT_DIR, EOL_IMG_DB, ENWIKI_IMG_DB,
		PICKED_IMGS_DIR, PICKED_IMGS_FILE, DB_FILE)