aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genImgsForWeb.py
blob: 1db543f2feba06adfc2f9e6ccd9aecfacedddb09 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/python3

import sys, os, subprocess
import sqlite3, urllib.parse
import signal

# Usage text; shown on stderr whenever any command-line argument is given
# (the script takes no arguments)
usageInfo = "".join([
	f"usage: {sys.argv[0]}\n",
	"Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n",
	"Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n",
	"Also adds image metadata to an sqlite database.\n",
	"\n",
	"SIGINT can be used to stop conversion, and the program can be re-run to\n",
	"continue processing. It uses existing output files to decide where to continue from.\n",
])
if sys.argv[1:]:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Input paths
imgListFile = "mergedImgList.txt" # List of images to process
eolImgDb = "eol/imagesList.db" # Sqlite db holding EOL image metadata
enwikiImgDb = "enwiki/enwikiImgs.db" # Sqlite db holding enwiki image metadata
# Output paths
outDir = "img/" # Directory that receives the generated images
dbFile = "data.db" # Sqlite db with node data, which also receives the image metadata
# Options
IMG_OUT_SZ = 200 # Width and height passed to smartcrop
genImgFiles = True # If False, image conversion is skipped and only the db is updated

# Create output directory if not present
# (exist_ok avoids the race between a separate existence check and the creation)
os.makedirs(outDir, exist_ok=True)
# Open dbs
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
# Create image tables if not present
nodesDone = set() # IDs of nodes that already have an associated image
imgsDone = set() # (image ID, source) pairs already present in the 'images' table
if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() is None:
	dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
	dbCur.execute("CREATE TABLE images" \
		" (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
else:
	# Tables exist, so this is a re-run: load what was already processed
	# Get existing node-associations
	nodesDone.update(otolId for (otolId,) in
		dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"))
	# And images (each row is already an (id, src) tuple)
	imgsDone.update(dbCur.execute("SELECT id, src from images"))
	print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
# Detect SIGINT signals
# The handler only records the event in a flag; the code that checks the flag decides when to stop
def onSigint(sig, frame):
	""" Signal handler that sets the module-level 'interrupted' flag """
	global interrupted
	interrupted = True
interrupted = False # Becomes True once SIGINT has been received
signal.signal(signal.SIGINT, onSigint)
# Iterate through images to process
# Each line of the list file is parsed as '<otolId> <imgPath>'; lines without a space have
# no image path and are skipped. Processing stops at the first error, or when SIGINT was received.
iterNum = 0
with open(imgListFile) as file:
	for line in file:
		iterNum += 1
		# Check for SIGINT event
		if interrupted:
			print("Exiting")
			break
		# Skip lines without an image path
		if " " not in line:
			continue
		# Get filenames
		(otolId, _, imgPath) = line.rstrip().partition(" ")
		# Skip if already processed
		if otolId in nodesDone:
			continue
		# Look up the node's name up front, so that a node missing from the db gets reported
		# before an output image is written for it (instead of crashing on a None row afterwards)
		row = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
		if row is None:
			print(f"ERROR: No node record for ID {otolId}", file=sys.stderr)
			break
		(nodeName,) = row
		# Convert image
		if genImgFiles:
			print(f"Processing {otolId}: converting {imgPath}")
			outPath = outDir + otolId + ".jpg"
			if os.path.exists(outPath):
				# An output file without a db entry means an earlier run stopped part-way
				print("ERROR: Output image already exists")
				break
			try:
				completedProcess = subprocess.run(
					['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
					stdout=subprocess.DEVNULL)
			except Exception as e:
				print(f"ERROR: Exception while attempting to run smartcrop: {e}")
				break
			if completedProcess.returncode != 0:
				print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
				break
		else:
			if iterNum % 10000 == 0:
				print(f"At iteration {iterNum}")
		# Add entry to db
		fromEol = imgPath.startswith("eol/")
		imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
		imgName = os.path.splitext(imgName)[0] # Remove extension
		if fromEol:
			# EOL image names are parsed as '<eolId> <contentId>'
			(eolId, _, contentId) = imgName.partition(" ")
			(eolId, contentId) = (int(eolId), int(contentId))
			if (eolId, "eol") not in imgsDone:
				query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
				row = eolCur.execute(query, (contentId,)).fetchone()
				if row is None:
					print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
					break
				(url, imgLicense, owner) = row
				dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
					(eolId, "eol", url, imgLicense, owner, ""))
				imgsDone.add((eolId, "eol"))
			dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, eolId, "eol"))
		else:
			# Other image names are parsed as an integer, which is used as an enwiki page ID
			enwikiId = int(imgName)
			if (enwikiId, "enwiki") not in imgsDone:
				query = "SELECT name, license, artist, credit FROM" \
					" page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
					" WHERE page_imgs.page_id = ?"
				row = enwikiCur.execute(query, (enwikiId,)).fetchone()
				if row is None:
					print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
					break
				(name, imgLicense, artist, credit) = row
				url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
				dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
					(enwikiId, "enwiki", url, imgLicense, artist, credit))
				imgsDone.add((enwikiId, "enwiki"))
			dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki"))
# Save the added image data, then close all db connections
dbCon.commit()
for con in (dbCon, eolCon, enwikiCon):
	con.close()