From 0a9b2c2e5eca8a04e37fbdd423379882863237c2 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 21 Jan 2023 12:21:03 +1100 Subject: Adjust backend coding style Increase line spacing, add section comments, etc --- backend/hist_data/gen_imgs.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'backend/hist_data/gen_imgs.py') diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py index 46cf6ee..44c0020 100755 --- a/backend/hist_data/gen_imgs.py +++ b/backend/hist_data/gen_imgs.py @@ -10,17 +10,20 @@ processing. It uses already-existing database entries to decide what to skip. """ -# Took about 10 hours to process about 60k images +# Note: Took about 10 hours to process about 60k images import argparse -import os, subprocess, signal -import sqlite3, urllib.parse +import os +import subprocess +import signal +import sqlite3 +import urllib.parse IMG_DIR = os.path.join('enwiki', 'imgs') IMG_DB = os.path.join('enwiki', 'img_data.db') OUT_DIR = 'img' DB_FILE = 'data.db' -# + IMG_OUT_SZ = 200 def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): @@ -29,7 +32,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): os.mkdir(outDir) dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Checking for image tables') eventsDone: set[int] = set() imgsDone: set[int] = set() @@ -45,23 +48,26 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): for (imgId,) in dbCur.execute('SELECT id from images'): imgsDone.add(imgId) print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip') - # + print('Processing images') processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone) - # + dbCon.commit() dbCon.close() + def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, eventsDone: set[int], imgsDone: set[int]) -> bool: """ Converts images and updates db, returning False upon interruption or failure """ imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + # Set SIGINT handler interrupted = False def onSigint(sig, frame): nonlocal interrupted interrupted = True signal.signal(signal.SIGINT, onSigint) + # Convert images flag = False # Set to True upon interruption or failure for imgFile in os.listdir(imgDir): @@ -70,9 +76,11 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, print('Exiting') flag = True break + # Get image ID imgIdStr, _ = os.path.splitext(imgFile) imgId = int(imgIdStr) + # Get associated events eventIds: set[int] = set() query = 'SELECT title FROM page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name WHERE imgs.id = ?' @@ -85,12 +93,14 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, eventIds = eventIds.difference(eventsDone) if not eventIds: continue + # Convert image if imgId not in imgsDone: success = convertImage(os.path.join(imgDir, imgFile), os.path.join(outDir, str(imgId) + '.jpg')) if not success: flag = True break + # Add image to db row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone() if row is None: @@ -100,16 +110,21 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, name, license, artist, credit = row url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name) dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit)) + # Add event association to db for eventId in eventIds: dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId)) + imgDbCon.close() return not flag + def convertImage(imgPath: str, outPath: str): + """ Converts an image using smartcrop """ print(f'Converting {imgPath} to {outPath}') if os.path.exists(outPath): print('ERROR: Output image already exists') return False + try: completedProcess = subprocess.run( ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath], @@ -126,5 +141,5 @@ def convertImage(imgPath: str, outPath: str): if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genImgs(IMG_DIR, IMG_DB, OUT_DIR, DB_FILE) -- cgit v1.2.3