aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genImgsForWeb.py
blob: 8bf1435c804d7d7e8d61291ee77e602319b04892 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/python3

import sys, os, subprocess
import sqlite3, urllib.parse
import signal

# Help text; printed when the script is invoked with any command-line argument
usageInfo = "".join([
	f"usage: {sys.argv[0]}\n",
	"Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n",
	"Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n",
	"Also adds image metadata to an sqlite database.\n",
	"\n",
	"SIGINT can be used to stop conversion, and the program can be re-run to\n",
	"continue processing. It uses existing output files to decide where to continue from.\n",
])
# The script takes no arguments, so any argument is answered with the usage text and a failure status
if sys.argv[1:]:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Inputs: list of images to process, and the databases holding their source metadata
imgListFile = "mergedImgList.txt"
eolImgDb = "eol/imagesList.db"
enwikiImgDb = "enwiki/enwikiImgs.db"
# Outputs: directory for generated images, and the database that receives image metadata
outDir = "img/"
dbFile = "data.db"
# Width and height passed to smartcrop for each generated image
IMG_OUT_SZ = 200

# Create output directory if not present
# (exist_ok avoids the check-then-create race of a separate os.path.exists() test)
os.makedirs(outDir, exist_ok=True)
# Open dbs
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
# Create image tables if not present, otherwise load what earlier runs already recorded
nodesDone = set() # IDs of nodes that already have an associated image
imgsDone = set() # (image ID, source) pairs already present in the 'images' table
nodeImgsTable = dbCur.execute(
	"SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone()
if nodeImgsTable is None:
	dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
	dbCur.execute("CREATE TABLE images" \
		" (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
else:
	# Get existing node-associations
	nodesDone.update(otolId for (otolId,) in
		dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"))
	# And images (each row is already an (id, src) tuple)
	imgsDone.update(dbCur.execute("SELECT id, src from images"))
	print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
# SIGINT handling: the handler only records that an interrupt was requested.
# The flag is polled at the start of each iteration of the image-processing loop.
interrupted = False
def onSigint(sig, frame):
	"""Signal handler: set the module-level 'interrupted' flag (both arguments are ignored)."""
	global interrupted
	interrupted = True
signal.signal(signal.SIGINT, onSigint)
# Iterate through images to process
# Each useful line of the list file has the form "<otolId> <imgPath>".
# For every node not yet processed: the image is cropped into outDir, its source/licensing
# info is copied into the 'images' table (once per image), and the node is linked to the
# image in 'node_imgs'. Any error stops the loop.
with open(imgListFile) as file:
	for line in file:
		# Check for SIGINT event
		if interrupted:
			print("Exiting")
			break
		# Skip lines without an image path
		if " " not in line:
			continue
		# Get filenames
		(otolId, _, imgPath) = line.rstrip().partition(" ")
		# Skip if already processed
		if otolId in nodesDone:
			continue
		outPath = outDir + otolId + ".jpg"
		# Look up the node's name before converting, so an unknown ID doesn't leave a stray output image
		nodeRow = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
		if nodeRow is None:
			print(f"ERROR: No node found with ID {otolId}", file=sys.stderr)
			break
		(nodeName,) = nodeRow
		# Convert image
		print(f"{otolId}: converting {imgPath}")
		if os.path.exists(outPath):
			print("ERROR: Output image already exists")
			break
		try:
			completedProcess = subprocess.run(
				['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
				stdout=subprocess.DEVNULL)
		except Exception as e:
			print(f"ERROR: Exception while attempting to run smartcrop: {e}")
			break
		if completedProcess.returncode != 0:
			print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
			break
		# Add entry to db
		fromEol = imgPath.startswith("eol/")
		imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
		imgName = os.path.splitext(imgName)[0] # Remove extension
		if fromEol:
			# EOL image filenames are parsed as "<eolId> <contentId>"
			(eolId, _, contentId) = imgName.partition(" ")
			(eolId, contentId) = (int(eolId), int(contentId))
			if (eolId, "eol") not in imgsDone:
				query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
				row = eolCur.execute(query, (contentId,)).fetchone()
				if row is None:
					print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
					break
				(url, license, owner) = row
				dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
					(eolId, "eol", url, license, owner, ""))
				imgsDone.add((eolId, "eol"))
			dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, eolId, "eol"))
		else:
			# Non-EOL image filenames are parsed as a single integer, used as the enwiki page ID
			enwikiId = int(imgName)
			if (enwikiId, "enwiki") not in imgsDone:
				query = "SELECT name, license, artist, credit FROM" \
					" page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
					" WHERE page_imgs.page_id = ?"
				row = enwikiCur.execute(query, (enwikiId,)).fetchone()
				if row is None:
					print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
					break
				(name, license, artist, credit) = row
				url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
				dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
					(enwikiId, "enwiki", url, license, artist, credit))
				imgsDone.add((enwikiId, "enwiki"))
			dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki"))
# Save the changes made to the output database, then release every connection
dbCon.commit()
for con in (dbCon, eolCon, enwikiCon):
	con.close()