aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genImgsForWeb.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-06-22 01:42:41 +1000
committerTerry Truong <terry06890@gmail.com>2022-06-22 09:39:44 +1000
commite78c4df403e5f98afa08f7a0841ff233d5f6d05b (patch)
treef13dbf91228550075644be9766b4546eb20f1e1f /backend/data/genImgsForWeb.py
parentae1467d2ab35a03eb2d7bf3e5ca1cf4634b23443 (diff)
Update backend READMEs, rename some files for consistency
Diffstat (limited to 'backend/data/genImgsForWeb.py')
-rwxr-xr-xbackend/data/genImgsForWeb.py179
1 files changed, 0 insertions, 179 deletions
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
deleted file mode 100755
index 3c299bb..0000000
--- a/backend/data/genImgsForWeb.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os, subprocess
-import sqlite3, urllib.parse
-import signal
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n"
-usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n"
-usageInfo += "Also adds image metadata to an sqlite database.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n"
-usageInfo += "continue processing. It uses existing output files to decide where to continue from.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-# List of images to process, read line by line; each used line holds a node ID, a space, then an image path
-imgListFile = "mergedImgList.txt"
-# Directory that receives the converted images, named by node ID with a '.jpg' extension
-outDir = "img/"
-# sqlite databases queried for the metadata of eol and enwiki images
-eolImgDb = "eol/imagesList.db"
-enwikiImgDb = "enwiki/enwikiImgs.db"
-# Directory of manually picked images, and the '|'-separated metadata file inside it
-pickedImgsDir = "pickedImgs/"
-pickedImgsFile = "metadata.txt"
-# Database that holds the 'nodes' table, and to which the image tables are added
-dbFile = "data.db"
-# Width and height passed to smartcrop for each output image
-IMG_OUT_SZ = 200
-# If False, no image files are converted, and only database entries are added
-genImgFiles = True
-
-# Create output directory if not present
-if not os.path.exists(outDir):
- os.mkdir(outDir)
-# Open dbs
-# dbCon is the database that gets written to; the eol and enwiki databases are only queried
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-eolCon = sqlite3.connect(eolImgDb)
-eolCur = eolCon.cursor()
-enwikiCon = sqlite3.connect(enwikiImgDb)
-enwikiCur = enwikiCon.cursor()
-# Get 'picked images' info
-nodeToPickedImg = {}
-if os.path.exists(pickedImgsDir + pickedImgsFile):
- lineNum = 0
- with open(pickedImgsDir + pickedImgsFile) as file:
- for line in file:
- lineNum += 1
- (filename, url, license, artist, credit) = line.rstrip().split("|")
- nodeName = os.path.splitext(filename)[0] # Remove extension
- (otolId,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (nodeName,)).fetchone()
- nodeToPickedImg[otolId] = {
- "nodeName": nodeName, "id": lineNum,
- "filename": filename, "url": url, "license": license, "artist": artist, "credit": credit,
- }
-# Create image tables if not present
-nodesDone = set()
-imgsDone = set()
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None:
- dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
- dbCur.execute("CREATE TABLE images" \
- " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
-else:
- # Get existing node-associations
- for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"):
- nodesDone.add(otolId)
- # And images
- for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
- imgsDone.add((imgId, imgSrc))
- print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
-# Detect SIGINT signals
-# The handler only sets a flag, which the processing loops check at the start of each iteration
-interrupted = False
-def onSigint(sig, frame):
- global interrupted
- interrupted = True
-signal.signal(signal.SIGINT, onSigint)
-# Iterate through images to process
-# Commits pending changes to the main database, closes all three database connections,
-# then exits the program with status 0
-# NOTE(review): this shadows the builtin quit(), and the exit status is 0 even when
-# this is called after a failed image conversion
-def quit():
- dbCon.commit()
- dbCon.close()
- eolCon.close()
- enwikiCon.close()
- sys.exit(0)
-def convertImage(imgPath, outPath):
- print(f"Converting {imgPath} to {outPath}")
- if os.path.exists(outPath):
- print(f"ERROR: Output image already exists")
- return False
- try:
- completedProcess = subprocess.run(
- ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
- stdout=subprocess.DEVNULL
- )
- except Exception as e:
- print(f"ERROR: Exception while attempting to run smartcrop: {e}")
- return False
- if completedProcess.returncode != 0:
- print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
- return False
- return True
-print("Processing picked images")
-for (otolId, imgData) in nodeToPickedImg.items():
- # Check for SIGINT event
- if interrupted:
- print("Exiting")
- quit()
- # Skip if already processed
- if otolId in nodesDone:
- continue
- # Convert image
- if genImgFiles:
- if not convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg"):
- quit()
- else:
- print(f"Processing {imgData['nodeName']}: {otolId}.jpg")
- # Add entry to db
- if (imgData["id"], "picked") not in imgsDone:
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
- (imgData["id"], "picked", imgData["url"], imgData["license"], imgData["artist"], imgData["credit"]))
- imgsDone.add((imgData["id"], "picked"))
- dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (imgData["nodeName"], imgData["id"], "picked"))
- nodesDone.add(otolId)
-print("Processing images from eol and enwiki")
-iterNum = 0
-with open(imgListFile) as file:
- for line in file:
- iterNum += 1
- # Check for SIGINT event
- if interrupted:
- print("Exiting")
- break
- # Skip lines without an image path
- if line.find(" ") == -1:
- continue
- # Get filenames
- (otolId, _, imgPath) = line.rstrip().partition(" ")
- # Skip if already processed
- if otolId in nodesDone:
- continue
- # Convert image
- if genImgFiles:
- if not convertImage(imgPath, outDir + otolId + ".jpg"):
- break
- else:
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- # Add entry to db
- (nodeName,) = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
- fromEol = imgPath.startswith("eol/")
- imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
- imgName = os.path.splitext(imgName)[0] # Remove extension
- if fromEol:
- (eolId, _, contentId) = imgName.partition(" ")
- (eolId, contentId) = (int(eolId), int(contentId))
- if (eolId, "eol") not in imgsDone:
- query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
- row = eolCur.execute(query, (contentId,)).fetchone()
- if row == None:
- print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
- break
- (url, license, owner) = row
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
- (eolId, "eol", url, license, owner, ""))
- imgsDone.add((eolId, "eol"))
- dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, eolId, "eol"))
- else:
- enwikiId = int(imgName)
- if (enwikiId, "enwiki") not in imgsDone:
- query = "SELECT name, license, artist, credit FROM" \
- " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
- " WHERE page_imgs.page_id = ?"
- row = enwikiCur.execute(query, (enwikiId,)).fetchone()
- if row == None:
- print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
- break
- (name, license, artist, credit) = row
- url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
- (enwikiId, "enwiki", url, license, artist, credit))
- imgsDone.add((enwikiId, "enwiki"))
- dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki"))
-# Close dbs
-quit()