aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genImgs.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/genImgs.py')
-rwxr-xr-xbackend/data/genImgs.py179
1 files changed, 179 insertions, 0 deletions
diff --git a/backend/data/genImgs.py b/backend/data/genImgs.py
new file mode 100755
index 0000000..097959f
--- /dev/null
+++ b/backend/data/genImgs.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python3
+
+import sys, os, subprocess
+import sqlite3, urllib.parse
+import signal
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n"
+usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n"
+usageInfo += "Also adds image metadata to an sqlite database.\n"
+usageInfo += "\n"
+usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n"
+usageInfo += "continue processing. It uses existing output files to decide where to continue from.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+imgListFile = "imgList.txt"
+outDir = "img/"
+eolImgDb = "eol/imagesList.db"
+enwikiImgDb = "enwiki/imgData.db"
+pickedImgsDir = "pickedImgs/"
+pickedImgsFilename = "imgData.txt"
+dbFile = "data.db"
+IMG_OUT_SZ = 200
+genImgFiles = True
+
+# Create output directory if not present
+if not os.path.exists(outDir):
+ os.mkdir(outDir)
+# Open dbs
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+eolCon = sqlite3.connect(eolImgDb)
+eolCur = eolCon.cursor()
+enwikiCon = sqlite3.connect(enwikiImgDb)
+enwikiCur = enwikiCon.cursor()
+# Get 'picked images' info
+nodeToPickedImg = {}
+if os.path.exists(pickedImgsDir + pickedImgsFilename):
+ lineNum = 0
+ with open(pickedImgsDir + pickedImgsFilename) as file:
+ for line in file:
+ lineNum += 1
+ (filename, url, license, artist, credit) = line.rstrip().split("|")
+ nodeName = os.path.splitext(filename)[0] # Remove extension
+ (otolId,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (nodeName,)).fetchone()
+ nodeToPickedImg[otolId] = {
+ "nodeName": nodeName, "id": lineNum,
+ "filename": filename, "url": url, "license": license, "artist": artist, "credit": credit,
+ }
+# Create image tables if not present
+nodesDone = set()
+imgsDone = set()
+if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None:
+ dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
+ dbCur.execute("CREATE TABLE images" \
+ " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
+else:
+ # Get existing node-associations
+ for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"):
+ nodesDone.add(otolId)
+ # And images
+ for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
+ imgsDone.add((imgId, imgSrc))
+ print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
+# Detect SIGINT signals
+interrupted = False
+def onSigint(sig, frame):
+ global interrupted
+ interrupted = True
+signal.signal(signal.SIGINT, onSigint)
+# Iterate though images to process
+def quit():
+ dbCon.commit()
+ dbCon.close()
+ eolCon.close()
+ enwikiCon.close()
+ sys.exit(0)
+def convertImage(imgPath, outPath):
+ print(f"Converting {imgPath} to {outPath}")
+ if os.path.exists(outPath):
+ print(f"ERROR: Output image already exists")
+ return False
+ try:
+ completedProcess = subprocess.run(
+ ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
+ stdout=subprocess.DEVNULL
+ )
+ except Exception as e:
+ print(f"ERROR: Exception while attempting to run smartcrop: {e}")
+ return False
+ if completedProcess.returncode != 0:
+ print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
+ return False
+ return True
+print("Processing picked images")
+for (otolId, imgData) in nodeToPickedImg.items():
+ # Check for SIGINT event
+ if interrupted:
+ print("Exiting")
+ quit()
+ # Skip if already processed
+ if otolId in nodesDone:
+ continue
+ # Convert image
+ if genImgFiles:
+ if not convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg"):
+ quit()
+ else:
+ print(f"Processing {imgData['nodeName']}: {otolId}.jpg")
+ # Add entry to db
+ if (imgData["id"], "picked") not in imgsDone:
+ dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ (imgData["id"], "picked", imgData["url"], imgData["license"], imgData["artist"], imgData["credit"]))
+ imgsDone.add((imgData["id"], "picked"))
+ dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (imgData["nodeName"], imgData["id"], "picked"))
+ nodesDone.add(otolId)
+print("Processing images from eol and enwiki")
+iterNum = 0
+with open(imgListFile) as file:
+ for line in file:
+ iterNum += 1
+ # Check for SIGINT event
+ if interrupted:
+ print("Exiting")
+ break
+ # Skip lines without an image path
+ if line.find(" ") == -1:
+ continue
+ # Get filenames
+ (otolId, _, imgPath) = line.rstrip().partition(" ")
+ # Skip if already processed
+ if otolId in nodesDone:
+ continue
+ # Convert image
+ if genImgFiles:
+ if not convertImage(imgPath, outDir + otolId + ".jpg"):
+ break
+ else:
+ if iterNum % 1e4 == 0:
+ print(f"At iteration {iterNum}")
+ # Add entry to db
+ (nodeName,) = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
+ fromEol = imgPath.startswith("eol/")
+ imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
+ imgName = os.path.splitext(imgName)[0] # Remove extension
+ if fromEol:
+ (eolId, _, contentId) = imgName.partition(" ")
+ (eolId, contentId) = (int(eolId), int(contentId))
+ if (eolId, "eol") not in imgsDone:
+ query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
+ row = eolCur.execute(query, (contentId,)).fetchone()
+ if row == None:
+ print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
+ break
+ (url, license, owner) = row
+ dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ (eolId, "eol", url, license, owner, ""))
+ imgsDone.add((eolId, "eol"))
+ dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, eolId, "eol"))
+ else:
+ enwikiId = int(imgName)
+ if (enwikiId, "enwiki") not in imgsDone:
+ query = "SELECT name, license, artist, credit FROM" \
+ " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
+ " WHERE page_imgs.page_id = ?"
+ row = enwikiCur.execute(query, (enwikiId,)).fetchone()
+ if row == None:
+ print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
+ break
+ (name, license, artist, credit) = row
+ url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
+ dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ (enwikiId, "enwiki", url, license, artist, credit))
+ imgsDone.add((enwikiId, "enwiki"))
+ dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki"))
+# Close dbs
+quit()