aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/gen_imgs.py
blob: 817de03b87599e96b0044dfa7f511fef460d6286 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python3

"""
Looks at images described by a database, and generates resized/cropped versions
into an output directory, with names of the form '<image ID>.jpg'.
Adds the image associations and metadata to the history database.

SIGINT can be used to stop, and the program can be re-run to continue
processing. It uses already-existing database entries to decide what
to skip.
"""

import os, math, subprocess
import sqlite3, urllib.parse
import signal
from PIL import Image

# Directory holding the input images, whose filenames (minus extension) are image IDs
IMG_DIR = os.path.join('enwiki', 'imgs')
# SQLite database describing the input images (queried for its 'imgs' and 'page_imgs' tables)
IMG_DB = os.path.join('enwiki', 'img_data.db')
# Directory that converted images get written to
OUT_DIR = 'img'
# History database, which holds the 'events' table, and gets the 'event_imgs' and 'images' tables
DB_FILE = 'data.db'
#
# Upper bound on an output image's smaller dimension
MAX_MINOR_DIM = 200
# Upper bound on the ratio of an output image's larger dimension to its smaller one
MAX_DIM_RATIO = 3/2

def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str) -> None:
	"""
	Converts images and updates db, checking for entries to skip.

	imgDir holds the input images, imgDb is the path of the database describing them,
	outDir is where converted images get written (created if missing), and dbFile is
	the path of the history database that gets the 'event_imgs' and 'images' tables.
	Events and images already present in those tables are skipped.
	"""
	# Also creates missing parent directories, and tolerates the directory already existing
	os.makedirs(outDir, exist_ok=True)
	dbCon = sqlite3.connect(dbFile)
	try:
		dbCur = dbCon.cursor()
		#
		print('Checking for image tables')
		eventsDone: set[int] = set()
		imgsDone: set[int] = set()
		# Uses single-quoted SQL string literals (double-quoted ones are a deprecated SQLite quirk)
		tableQuery = "SELECT name FROM sqlite_master WHERE type='table' AND name='event_imgs'"
		if dbCur.execute(tableQuery).fetchone() is None:
			# Add image tables
			dbCur.execute('CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)')
			dbCur.execute('CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)')
		else:
			# Get existing image-associated events
			for (eventId,) in dbCur.execute('SELECT id FROM event_imgs'):
				eventsDone.add(eventId)
			# Get existing event-associated images
			for (imgId,) in dbCur.execute('SELECT id from images'):
				imgsDone.add(imgId)
			print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip')
		#
		print('Processing images from eol and enwiki')
		try:
			processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone)
		finally:
			# Commit even upon an unexpected exception, as converted images have already been
			# written, and a re-run relies on the db entries to know what to skip
			dbCon.commit()
	finally:
		dbCon.close()
def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
		eventsDone: set[int], imgsDone: set[int]) -> bool:
	"""
	Converts images and updates db, returning False upon interruption or failure.

	For each file in imgDir, the filename (minus extension) is used as an image ID. The
	image database at imgDb is queried for the page titles associated with that image,
	which are mapped to event IDs using the 'events' table reachable via dbCur. If any of
	those events are not in eventsDone, the image is converted into outDir (unless its ID
	is in imgsDone), and rows are added to the 'images' and 'event_imgs' tables.

	Nothing is committed here; that is left to the owner of dbCur. eventsDone gets
	updated with the events that were given an image.
	"""
	imgDbCon = sqlite3.connect(imgDb)
	imgDbCur = imgDbCon.cursor()
	# Set SIGINT handler, so an interrupt stops the loop between images
	interrupted = False
	def onSigint(sig, frame):
		nonlocal interrupted
		interrupted = True
	prevHandler = signal.signal(signal.SIGINT, onSigint)
	# Convert images
	flag = False # Set to True upon interruption or failure
	titleQuery = 'SELECT title FROM page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name WHERE imgs.id = ?'
	try:
		for imgFile in os.listdir(imgDir):
			# Check for SIGINT event
			if interrupted:
				print('Exiting')
				flag = True
				break
			# Get image ID
			imgIdStr, _ = os.path.splitext(imgFile)
			try:
				imgId = int(imgIdStr)
			except ValueError:
				# A stray file in the directory shouldn't abort the whole run
				print(f'WARNING: Skipping {imgFile}, as its name is not an image ID')
				continue
			# Get associated events
			eventIds: set[int] = set()
			for (title,) in imgDbCur.execute(titleQuery, (imgId,)):
				row = dbCur.execute('SELECT id FROM events WHERE title = ?', (title,)).fetchone()
				if row is None:
					print(f'ERROR: No event ID found for title {title} associated with image {imgFile}')
					continue
				eventIds.add(row[0])
			eventIds = eventIds.difference(eventsDone)
			if not eventIds:
				continue
			if imgId not in imgsDone:
				# Convert image
				success = convertImage(os.path.join(imgDir, imgFile), os.path.join(outDir, str(imgId) + '.jpg'))
				if not success:
					flag = True
					break
				# Add entry to db
				row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone()
				if row is None:
					print(f'ERROR: No image record for ID {imgId}')
					flag = True
					break
				name, license, artist, credit = row
				url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
				dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit))
			for eventId in eventIds:
				dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId))
			# Record the new associations, so a later image for the same event gets skipped,
			# instead of violating the primary key of event_imgs
			eventsDone.update(eventIds)
	finally:
		# Restore the previous SIGINT handling, and release the image db
		if prevHandler is not None: # None means the old handler wasn't installed from Python
			signal.signal(signal.SIGINT, prevHandler)
		imgDbCon.close()
	return not flag
def convertImage(imgPath: str, outPath: str) -> bool:
	"""
	Writes a resized/cropped version of the image at imgPath to outPath, using smartcrop-cli.

	The output's smaller dimension is limited to MAX_MINOR_DIM (scaling the other dimension
	proportionally), and the larger dimension is then limited to MAX_DIM_RATIO times the
	smaller one. Returns True on success. Prints an error and returns False if outPath
	already exists, the image can't be opened, or smartcrop can't be run or exits non-zero.
	"""
	print(f'Converting {imgPath} to {outPath}')
	if os.path.exists(outPath):
		print('ERROR: Output image already exists')
		return False
	# Get image dims
	width: int
	height: int
	try:
		with Image.open(imgPath) as image:
			width, height = image.size
	except Exception as e: # Being more specific runs the risk of ending the program without committing to db
		print(f'ERROR: Unable to open {imgPath}: {e}')
		return False
	# Limit output dims
	if width > height:
		if height > MAX_MINOR_DIM:
			# Scale by MAX_MINOR_DIM / height, so the aspect ratio is preserved
			width = math.ceil(width * MAX_MINOR_DIM / height)
			height = MAX_MINOR_DIM
		if width / height > MAX_DIM_RATIO:
			width = math.ceil(height * MAX_DIM_RATIO)
	else:
		if width > MAX_MINOR_DIM:
			# Scale by MAX_MINOR_DIM / width, so the aspect ratio is preserved
			height = math.ceil(height * MAX_MINOR_DIM / width)
			width = MAX_MINOR_DIM
		if height / width > MAX_DIM_RATIO:
			height = math.ceil(width * MAX_DIM_RATIO)
	# Convert image
	try:
		completedProcess = subprocess.run(
			['npx', 'smartcrop-cli', '--width', str(width), '--height', str(height), imgPath, outPath],
			stdout=subprocess.DEVNULL
		)
	except Exception as e:
		print(f'ERROR: Exception while attempting to run smartcrop: {e}')
		return False
	if completedProcess.returncode != 0:
		print(f'ERROR: smartcrop had exit status {completedProcess.returncode}')
		return False
	return True

if __name__ == '__main__':
	# No options are defined, so parsing only serves to provide -h/--help (showing the module docstring)
	from argparse import ArgumentParser, RawDescriptionHelpFormatter
	ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter).parse_args()
	#
	genImgs(IMG_DIR, IMG_DB, OUT_DIR, DB_FILE)