diff options
Diffstat (limited to 'backend/data/genImgs.py')
| -rwxr-xr-x | backend/data/genImgs.py | 179 |
1 files changed, 179 insertions, 0 deletions
#!/usr/bin/python3
"""Generate web-usable node images and record their metadata.

Converts 'picked' images and eol/enwiki images into square thumbnails
using smartcrop, and records image metadata in an sqlite database.
Progress is read back from the database's node_imgs/images tables, so an
interrupted run can be continued by running the script again.
"""

import contextlib
import os
import signal
import sqlite3
import subprocess
import sys
import urllib.parse

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n"
usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n"
usageInfo += "Also adds image metadata to an sqlite database.\n"
usageInfo += "\n"
usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n"
usageInfo += "continue processing. It uses existing output files to decide where to continue from.\n"

imgListFile = "imgList.txt"
outDir = "img/"
eolImgDb = "eol/imagesList.db"
enwikiImgDb = "enwiki/imgData.db"
pickedImgsDir = "pickedImgs/"
pickedImgsFilename = "imgData.txt"
dbFile = "data.db"
IMG_OUT_SZ = 200
genImgFiles = True

# Set by the SIGINT handler, and polled between images, so the image being
# processed gets its database rows written before the program stops
interrupted = False


def onSigint(sig, frame):
    """Signal handler that requests a stop before the next image."""
    global interrupted
    interrupted = True


def parsePickedLine(line):
    """Parse a 'filename|url|license|artist|credit' line of the picked-images file.

    Returns a dict with keys nodeName (the filename without its extension),
    filename, url, license, artist, and credit.
    Raises ValueError if the line does not have exactly five fields.
    """
    fields = line.rstrip().split("|")
    if len(fields) != 5:
        raise ValueError(f"Expected 5 '|'-separated fields, but found {len(fields)}: {line!r}")
    (filename, url, imgLicense, artist, credit) = fields
    return {
        "nodeName": os.path.splitext(filename)[0],  # Remove extension
        "filename": filename, "url": url, "license": imgLicense, "artist": artist, "credit": credit,
    }


def parseImgListLine(line):
    """Split an image-list line into (otolId, imgPath).

    Returns None for lines without an image path, including lines where
    only whitespace follows the ID.
    """
    (otolId, _, imgPath) = line.rstrip().partition(" ")
    if not imgPath:
        return None
    return (otolId, imgPath)


def parseImgPath(imgPath):
    """Derive (imgSrc, imgId, contentId) from an image path.

    Paths starting with 'eol/' have a basename of the form
    '<eolId> <contentId>.<ext>', giving ("eol", eolId, contentId).
    Any other path is treated as an enwiki image with a basename of the form
    '<pageId>.<ext>', giving ("enwiki", pageId, None).
    Raises ValueError if an ID is not an integer.
    """
    imgName = os.path.basename(os.path.normpath(imgPath))  # Get last path component
    imgName = os.path.splitext(imgName)[0]  # Remove extension
    if imgPath.startswith("eol/"):
        (eolId, _, contentId) = imgName.partition(" ")
        return ("eol", int(eolId), int(contentId))
    return ("enwiki", int(imgName), None)


def convertImage(imgPath, outPath):
    """Run smartcrop on imgPath, writing an IMG_OUT_SZ x IMG_OUT_SZ image to outPath.

    Returns True on success. Prints an error and returns False if outPath
    already exists, smartcrop could not be started, or it exited with a
    non-zero status.
    """
    print(f"Converting {imgPath} to {outPath}")
    if os.path.exists(outPath):
        print("ERROR: Output image already exists")
        return False
    try:
        completedProcess = subprocess.run(
            ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
            stdout=subprocess.DEVNULL
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"ERROR: Exception while attempting to run smartcrop: {e}")
        return False
    if completedProcess.returncode != 0:
        print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
        # outPath did not exist before this call, so anything there now would be
        # leftover output of the failed run, which would make later runs stop
        # with 'Output image already exists'
        try:
            os.remove(outPath)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"WARNING: Unable to remove {outPath}: {e}", file=sys.stderr)
        return False
    return True


def loadPickedImgs(dbCur):
    """Read the picked-images file, if present, into a {otolId: imgData} dict.

    Each imgData is the dict from parsePickedLine() plus an "id" key holding
    the 1-based line number.
    Raises LookupError if a picked image's node name is not in the nodes table.
    """
    nodeToPickedImg = {}
    pickedImgsFile = pickedImgsDir + pickedImgsFilename
    if not os.path.exists(pickedImgsFile):
        return nodeToPickedImg
    with open(pickedImgsFile) as file:
        for (lineNum, line) in enumerate(file, 1):
            imgData = parsePickedLine(line)
            imgData["id"] = lineNum
            row = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (imgData["nodeName"],)).fetchone()
            if row is None:
                raise LookupError(
                    f"No node named '{imgData['nodeName']}' for picked image on line {lineNum} of {pickedImgsFile}")
            nodeToPickedImg[row[0]] = imgData
    return nodeToPickedImg


def loadProgress(dbCur):
    """Create the image tables if not present, else read what was already processed.

    Returns (nodesDone, imgsDone): a set of node IDs that have an image, and
    a set of (imgId, imgSrc) pairs that have an images row.
    """
    nodesDone = set()
    imgsDone = set()
    tableQuery = "SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'"
    if dbCur.execute(tableQuery).fetchone() is None:
        dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
        dbCur.execute("CREATE TABLE images" \
            " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
    else:
        # Get existing node-associations
        for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"):
            nodesDone.add(otolId)
        # And images
        for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
            imgsDone.add((imgId, imgSrc))
        print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
    return (nodesDone, imgsDone)


def fetchImgRecord(imgSrc, imgId, contentId, eolCur, enwikiCur):
    """Look up (url, license, artist, credit) for an eol or enwiki image.

    Prints an error and returns None if the source database has no record.
    """
    if imgSrc == "eol":
        query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
        row = eolCur.execute(query, (contentId,)).fetchone()
        if row is None:
            print(f"ERROR: No image record for EOL ID {imgId}, content ID {contentId}", file=sys.stderr)
            return None
        (url, imgLicense, owner) = row
        return (url, imgLicense, owner, "")
    query = "SELECT name, license, artist, credit FROM" \
        " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
        " WHERE page_imgs.page_id = ?"
    row = enwikiCur.execute(query, (imgId,)).fetchone()
    if row is None:
        print(f"ERROR: No image record for enwiki ID {imgId}", file=sys.stderr)
        return None
    (name, imgLicense, artist, credit) = row
    url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
    return (url, imgLicense, artist, credit)


def processPickedImgs(dbCur, nodeToPickedImg, nodesDone, imgsDone):
    """Convert and record the picked images, updating nodesDone and imgsDone.

    Returns False if processing should stop (SIGINT or a failed conversion).
    """
    for (otolId, imgData) in nodeToPickedImg.items():
        # Check for SIGINT event
        if interrupted:
            print("Exiting")
            return False
        # Skip if already processed
        if otolId in nodesDone:
            continue
        # Convert image
        if genImgFiles:
            if not convertImage(pickedImgsDir + imgData["filename"], f"{outDir}{otolId}.jpg"):
                return False
        else:
            print(f"Processing {imgData['nodeName']}: {otolId}.jpg")
        # Add entry to db
        imgKey = (imgData["id"], "picked")
        if imgKey not in imgsDone:
            dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
                (*imgKey, imgData["url"], imgData["license"], imgData["artist"], imgData["credit"]))
            imgsDone.add(imgKey)
        dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (imgData["nodeName"], *imgKey))
        nodesDone.add(otolId)
    return True


def processListedImgs(dbCur, eolCur, enwikiCur, nodesDone, imgsDone):
    """Convert and record the eol/enwiki images listed in imgListFile.

    Stops at the first SIGINT, failed conversion, or missing database record.
    """
    with open(imgListFile) as file:
        for (iterNum, line) in enumerate(file, 1):
            # Check for SIGINT event
            if interrupted:
                print("Exiting")
                break
            # Skip lines without an image path
            parsedLine = parseImgListLine(line)
            if parsedLine is None:
                continue
            (otolId, imgPath) = parsedLine
            # Skip if already processed
            if otolId in nodesDone:
                continue
            # Look up all metadata before converting, so a missing record does not
            # leave behind an output file that has no database entry
            nodeRow = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
            if nodeRow is None:
                print(f"ERROR: No node record for ID {otolId}", file=sys.stderr)
                break
            (nodeName,) = nodeRow
            (imgSrc, imgId, contentId) = parseImgPath(imgPath)
            imgRecord = None
            if (imgId, imgSrc) not in imgsDone:
                imgRecord = fetchImgRecord(imgSrc, imgId, contentId, eolCur, enwikiCur)
                if imgRecord is None:
                    break
            # Convert image
            if genImgFiles:
                if not convertImage(imgPath, f"{outDir}{otolId}.jpg"):
                    break
            elif iterNum % 10000 == 0:
                print(f"At iteration {iterNum}")
            # Add entry to db
            if imgRecord is not None:
                dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", (imgId, imgSrc, *imgRecord))
                imgsDone.add((imgId, imgSrc))
            dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, imgId, imgSrc))
            # Mark as done, so a repeated ID later in the list is skipped
            nodesDone.add(otolId)


def main():
    """Check arguments, open the databases, and process all images."""
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    # Create output directory if not present
    os.makedirs(outDir, exist_ok=True)
    # Open dbs
    with contextlib.closing(sqlite3.connect(dbFile)) as dbCon, \
            contextlib.closing(sqlite3.connect(eolImgDb)) as eolCon, \
            contextlib.closing(sqlite3.connect(enwikiImgDb)) as enwikiCon:
        dbCur = dbCon.cursor()
        eolCur = eolCon.cursor()
        enwikiCur = enwikiCon.cursor()
        try:
            # Get 'picked images' info
            nodeToPickedImg = loadPickedImgs(dbCur)
            # Create image tables if not present
            (nodesDone, imgsDone) = loadProgress(dbCur)
            # Detect SIGINT signals
            signal.signal(signal.SIGINT, onSigint)
            # Iterate through images to process
            print("Processing picked images")
            if processPickedImgs(dbCur, nodeToPickedImg, nodesDone, imgsDone):
                print("Processing images from eol and enwiki")
                processListedImgs(dbCur, eolCur, enwikiCur, nodesDone, imgsDone)
        finally:
            # Commit even after an error, so rows for images that were already
            # converted are kept, and a re-run can skip them
            dbCon.commit()
    # NOTE: As before, handled errors still result in an exit status of 0


if __name__ == "__main__":
    main()
