1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
#!/usr/bin/python3
import sys, os, subprocess
import sqlite3, urllib.parse
import signal
# Usage text. The script takes no arguments, so it is shown whenever any are given.
usageInfo = "".join([
    f"usage: {sys.argv[0]}\n",
    "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n",
    "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n",
    "Also adds image metadata to an sqlite database.\n",
    "\n",
    "SIGINT can be used to stop conversion, and the program can be re-run to\n",
    "continue processing. It uses existing output files to decide where to continue from.\n",
])
# Any command-line argument is answered with the usage text and a failure status
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)
# Inputs
imgListFile = "mergedImgList.txt"  # One 'otolId imgPath' entry per line
pickedImgsDir = "pickedImgs/"  # Directory holding the picked images and their metadata file
pickedImgsFile = "metadata.txt"  # Inside pickedImgsDir; 'filename|url|license|artist|credit' per line
eolImgDb = "eol/imagesList.db"  # Sqlite db queried for eol image url/license/owner
enwikiImgDb = "enwiki/enwikiImgs.db"  # Sqlite db queried for enwiki image name/license/artist/credit
# Outputs
outDir = "img/"  # Converted images are written here as '<otolId>.jpg'
dbFile = "data.db"  # Sqlite db with the 'nodes' table; the image tables are added to it
# Options
IMG_OUT_SZ = 200  # Passed to smartcrop as both the output width and height
genImgFiles = True  # When False, no images are converted and only the db is updated
# Create output directory if not present.
# makedirs(exist_ok=True) avoids the race between an existence check and the
# creation, and raises if the path exists but is not a directory.
os.makedirs(outDir, exist_ok=True)
# Open dbs
# Main db: holds the 'nodes' table, and receives the image tables
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Eol image metadata (only read from)
eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
# Enwiki image metadata (only read from)
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
# Get 'picked images' info
# Maps a node's otol ID to the metadata of its picked image. Each line of the
# metadata file has the form 'filename|url|license|artist|credit', where the
# filename without its extension is the node's name.
nodeToPickedImg = {}
pickedImgsPath = pickedImgsDir + pickedImgsFile
if os.path.exists(pickedImgsPath):
    with open(pickedImgsPath) as file:
        # The 1-based line number doubles as the picked image's ID, so blank
        # lines are still counted to keep the IDs of other lines stable
        for lineNum, line in enumerate(file, start=1):
            line = line.rstrip()
            if not line:
                continue # Tolerate blank lines
            fields = line.split("|")
            if len(fields) != 5:
                sys.exit(f"ERROR: {pickedImgsPath}:{lineNum}: expected 5 '|'-separated fields, found {len(fields)}")
            (filename, url, imgLicense, artist, credit) = fields
            nodeName = os.path.splitext(filename)[0] # Remove extension
            row = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (nodeName,)).fetchone()
            if row is None:
                # fetchone() returns None when no node has this name
                sys.exit(f"ERROR: {pickedImgsPath}:{lineNum}: no node named '{nodeName}'")
            (otolId,) = row
            nodeToPickedImg[otolId] = {
                "nodeName": nodeName, "id": lineNum,
                "filename": filename, "url": url, "license": imgLicense, "artist": artist, "credit": credit,
            }
# Create image tables if not present
nodesDone = set() # Otol IDs of nodes that already have an entry in node_imgs
imgsDone = set() # (id, src) pairs that already have an entry in images
tableCheck = "SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'"
if dbCur.execute(tableCheck).fetchone() is None:
    dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
    dbCur.execute("CREATE TABLE images" \
        " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
else:
    # The tables exist already, so load what they hold so that it can be skipped
    # Get existing node-associations
    nodesQuery = "SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"
    nodesDone.update(otolId for (otolId,) in dbCur.execute(nodesQuery))
    # And images (each row is already an (id, src) tuple)
    imgsDone.update(dbCur.execute("SELECT id, src from images"))
    print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
# Detect SIGINT signals
# Flag polled by the processing loops, so they can stop between images
interrupted = False

def onSigint(sig, frame):
    """Signal handler that records that a SIGINT was received. Both arguments are ignored."""
    global interrupted
    interrupted = True

# Replace the current SIGINT handler with the flag-setting one
signal.signal(signal.SIGINT, onSigint)
# Iterate through images to process
def quit():
    """Commit pending changes to the main db, close all db connections, and exit with status 0."""
    dbCon.commit()
    for con in (dbCon, eolCon, enwikiCon):
        con.close()
    sys.exit(0)
def convertImage(imgPath, outPath):
    """Run smartcrop-cli on imgPath, requesting an IMG_OUT_SZ x IMG_OUT_SZ result at outPath.

    Returns True if smartcrop exited with status 0. Otherwise prints an error
    and returns False: when outPath already exists (it is never overwritten),
    when running smartcrop raised an exception, or when its exit status was non-zero.
    """
    print(f"Converting {imgPath} to {outPath}")
    if os.path.exists(outPath):
        print("ERROR: Output image already exists")
        return False
    try:
        size = str(IMG_OUT_SZ)
        cmd = ['npx', 'smartcrop-cli', '--width', size, '--height', size, imgPath, outPath]
        # Smartcrop's stdout is discarded; its stderr is left attached
        exitStatus = subprocess.run(cmd, stdout=subprocess.DEVNULL).returncode
    except Exception as e:
        print(f"ERROR: Exception while attempting to run smartcrop: {e}")
        return False
    if exitStatus != 0:
        print(f"ERROR: smartcrop had exit status {exitStatus}")
        return False
    return True
print("Processing picked images")
for otolId, picked in nodeToPickedImg.items():
    # On SIGINT, commit what has been done so far and exit
    if interrupted:
        print("Exiting")
        quit()
    # Nodes that already have an image association need no further work
    if otolId in nodesDone:
        continue
    nodeName = picked["nodeName"]
    # Generate the output image, or just report the node when generation is disabled
    if genImgFiles:
        converted = convertImage(pickedImgsDir + picked["filename"], outDir + otolId + ".jpg")
        if not converted:
            quit()
    else:
        print(f"Processing {nodeName}: {otolId}.jpg")
    # Record the image (once per image), then associate it with the node
    imgKey = (picked["id"], "picked")
    if imgKey not in imgsDone:
        imgRow = (*imgKey, picked["url"], picked["license"], picked["artist"], picked["credit"])
        dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", imgRow)
        imgsDone.add(imgKey)
    dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, *imgKey))
    nodesDone.add(otolId)
print("Processing images from eol and enwiki")
with open(imgListFile) as file:
    for iterNum, line in enumerate(file, start=1):
        # Check for SIGINT event
        if interrupted:
            print("Exiting")
            break
        # Each line has the form 'otolId imgPath'. Skip lines without an image path.
        (otolId, _, imgPath) = line.rstrip().partition(" ")
        if not imgPath:
            continue
        # Skip if already processed
        if otolId in nodesDone:
            continue
        # All metadata is resolved BEFORE the image is converted. A failed lookup
        # then leaves no output file behind without a db entry, which would make
        # the next run stop with 'Output image already exists'. Every failure
        # uses 'break', so that quit() still commits the work done so far.
        nodeRow = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
        if nodeRow is None:
            print(f"ERROR: No node found with ID {otolId}", file=sys.stderr)
            break
        (nodeName,) = nodeRow
        # Get image ID(s) from the filename
        fromEol = imgPath.startswith("eol/")
        imgSrc = "eol" if fromEol else "enwiki"
        imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
        imgName = os.path.splitext(imgName)[0] # Remove extension
        try:
            if fromEol:
                # Eol image names have the form 'eolId contentId'
                (eolIdStr, _, contentIdStr) = imgName.partition(" ")
                (imgId, contentId) = (int(eolIdStr), int(contentIdStr))
            else:
                # Enwiki image names are the ID used to look up page_imgs.page_id
                imgId = int(imgName)
        except ValueError:
            print(f"ERROR: Unexpected image filename in path {imgPath}", file=sys.stderr)
            break
        # Get the image's metadata, unless the image was already recorded
        imgRecord = None
        if (imgId, imgSrc) not in imgsDone:
            if fromEol:
                query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
                row = eolCur.execute(query, (contentId,)).fetchone()
                if row is None:
                    print(f"ERROR: No image record for EOL ID {imgId}, content ID {contentId}", file=sys.stderr)
                    break
                (url, imgLicense, owner) = row
                imgRecord = (imgId, "eol", url, imgLicense, owner, "")
            else:
                query = "SELECT name, license, artist, credit FROM" \
                    " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
                    " WHERE page_imgs.page_id = ?"
                row = enwikiCur.execute(query, (imgId,)).fetchone()
                if row is None:
                    print(f"ERROR: No image record for enwiki ID {imgId}", file=sys.stderr)
                    break
                (name, imgLicense, artist, credit) = row
                url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
                imgRecord = (imgId, "enwiki", url, imgLicense, artist, credit)
        # Convert image
        if genImgFiles:
            if not convertImage(imgPath, outDir + otolId + ".jpg"):
                break
        elif iterNum % 10000 == 0:
            print(f"At iteration {iterNum}")
        # Add entries to db
        if imgRecord is not None:
            dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", imgRecord)
            imgsDone.add((imgId, imgSrc))
        dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, imgId, imgSrc))
        # Record the node, so a repeated line for it is skipped instead of failing
        nodesDone.add(otolId)
# Commit changes, close dbs, and exit
quit()
|