diff options
| -rw-r--r-- | backend/hist_data/README.md | 11 | ||||
| -rwxr-xr-x | backend/hist_data/gen_imgs.py | 155 | ||||
| -rw-r--r-- | backend/tests/test_gen_imgs.py | 79 | ||||
| -rw-r--r-- | backend/tests/test_img.png | bin | 0 -> 5067 bytes | |||
| -rw-r--r-- | package.json | 13 |
5 files changed, 256 insertions, 2 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 5b64462..c5cf66f 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -21,6 +21,12 @@ This directory holds files used to generate the history database data.db. - `pop`: <br> Format: `id INT PRIMARY KEY, pop INT` <br> Associates each event with a popularity measure (currently an average monthly viewcount) +- `images`: <br> + Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br> + Holds metadata for available images +- `event_imgs`: <br> + Format: `id INT PRIMARY KEY, img_id INT` <br> + Associates events with images # Generating the Database @@ -46,10 +52,11 @@ Some of the scripts use third-party packages: 1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found images, and adds them to the image database. 1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. -1. Run +1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/. + Adds the `images` and `event_imgs` tables. ## Generate Description Data 1. Obtain an enwiki dump in enwiki/, as specified in the README. 1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. 1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. -1. Run +1. Run diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py new file mode 100755 index 0000000..526da1b --- /dev/null +++ b/backend/hist_data/gen_imgs.py @@ -0,0 +1,155 @@ +#!/usr/bin/python3 + +""" + +Looks at images described by a database, and generates resized/cropped versions +into an output directory, with names of the form 'eventId1.jpg'. +Adds the image associations and metadata to the history database. + +SIGINT can be used to stop, and the program can be re-run to continue +processing. It uses already-existing database entries to decide what +to skip. 
+""" + +import os, math, subprocess +import sqlite3, urllib.parse +import signal +from PIL import Image, UnidentifiedImageError + +IMG_DIR = os.path.join('enwiki', 'imgs') +IMG_DB = os.path.join('enwiki', 'img_data.db') +OUT_DIR = 'img' +DB_FILE = 'data.db' +# +MAX_MINOR_DIM = 200 +MAX_DIM_RATIO = 2 + +def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): + """ Converts images and updates db, checking for entries to skip """ + if not os.path.exists(outDir): + os.mkdir(outDir) + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Checking for image tables') + eventsDone: set[int] = set() + imgsDone: set[int] = set() + if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="event_imgs"').fetchone() is None: + # Add image tables + dbCur.execute('CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)') + dbCur.execute('CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)') + else: + # Get existing image-associated events + for (eventId,) in dbCur.execute('SELECT id FROM event_imgs'): + eventsDone.add(eventId) + # Get existing event-associated images + for (imgId,) in dbCur.execute('SELECT id from images'): + imgsDone.add(imgId) + print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip') + # + print('Processing images from eol and enwiki') + processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone) + # + dbCon.commit() + dbCon.close() +def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, + eventsDone: set[int], imgsDone: set[int]) -> bool: + """ Converts images and updates db, returning False upon interruption or failure """ + imgDbCon = sqlite3.connect(imgDb) + imgDbCur = imgDbCon.cursor() + # Set SIGINT handler + interrupted = False + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, onSigint) + # Convert images + flag = False # Set to True upon interruption or failure + for imgFile 
in os.listdir(imgDir): + # Check for SIGINT event + if interrupted: + print('Exiting') + flag = True + break + # Get image ID + imgIdStr, _ = os.path.splitext(imgFile) + imgId = int(imgIdStr) + # Get associated events + eventIds: set[int] = set() + query = 'SELECT title FROM page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name WHERE imgs.id = ?' + for (title,) in imgDbCur.execute(query, (imgId,)): + row = dbCur.execute('SELECT id FROM events WHERE title = ?', (title,)).fetchone() + if row is None: + print('ERROR: No event ID found for title {title} associated with image {imgFile}') + continue + eventIds.add(row[0]) + eventIds = eventIds.difference(eventsDone) + if not eventIds: + continue + # Convert image + if imgId not in imgsDone: + success = convertImage(os.path.join(imgDir, imgFile), os.path.join(outDir, str(imgId) + '.jpg')) + if not success: + flag = True + break + # Add entry to db + if imgId not in imgsDone: + row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone() + if row is None: + print(f'ERROR: No image record for ID {imgId}') + flag = True + break + name, license, artist, credit = row + url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name) + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit)) + for eventId in eventIds: + dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId)) + imgDbCon.close() + return not flag +def convertImage(imgPath: str, outPath: str): + print(f'Converting {imgPath} to {outPath}') + if os.path.exists(outPath): + print('ERROR: Output image already exists') + return False + # Get image dims + width: int + height: int + try: + with Image.open(imgPath) as image: + width, height = image.size + except UnidentifiedImageError as e: + print(f'ERROR: Unable to open/identify {imgPath}: {e}') + return False + # Limit output dims + if width > height: + if height > MAX_MINOR_DIM: + width = math.ceil(width * 
height / MAX_MINOR_DIM) + height = MAX_MINOR_DIM + if width / height > MAX_DIM_RATIO: + width = math.ceil(height * MAX_DIM_RATIO) + else: + if width > MAX_MINOR_DIM: + height = math.ceil(height * width / MAX_MINOR_DIM) + width = MAX_MINOR_DIM + if height / width > MAX_DIM_RATIO: + height = math.ceil(width * MAX_DIM_RATIO) + # Convert image + try: + completedProcess = subprocess.run( + ['npx', 'smartcrop-cli', '--width', str(width), '--height', str(height), imgPath, outPath], + stdout=subprocess.DEVNULL + ) + except Exception as e: + print(f'ERROR: Exception while attempting to run smartcrop: {e}') + return False + if completedProcess.returncode != 0: + print(f'ERROR: smartcrop had exit status {completedProcess.returncode}') + return False + return True + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genImgs(IMG_DIR, IMG_DB, OUT_DIR, DB_FILE) diff --git a/backend/tests/test_gen_imgs.py b/backend/tests/test_gen_imgs.py new file mode 100644 index 0000000..2541c1d --- /dev/null +++ b/backend/tests/test_gen_imgs.py @@ -0,0 +1,79 @@ +import unittest +from unittest.mock import patch +import tempfile, os, shutil + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.gen_imgs import genImgs + +TEST_IMG = os.path.join(os.path.dirname(__file__), 'test_img.png') + +class TestGenImgs(unittest.TestCase): + @patch('hist_data.gen_imgs.convertImage', autospec=True) + def test_gen(self, convertImageMock): + with tempfile.TemporaryDirectory() as tempDir: + convertImageMock.side_effect = \ + lambda imgPath, outPath: shutil.copy(imgPath, outPath) + # Create temp images + imgDir = os.path.join(tempDir, 'enwiki_imgs') + os.mkdir(imgDir) + shutil.copy(TEST_IMG, os.path.join(imgDir, '100.jpg')) + shutil.copy(TEST_IMG, os.path.join(imgDir, '200.jpeg')) + shutil.copy(TEST_IMG, os.path.join(imgDir, '400.png')) + # Create temp image 
db + imgDb = os.path.join(tempDir, 'img_data.db') + createTestDbTable( + imgDb, + 'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)', + 'INSERT INTO page_imgs VALUES (?, ?, ?)', + { + (1, 'first', 'one.jpg'), + (2, 'second', 'two.jpeg'), + (3, 'third', 'two.jpeg'), + } + ) + createTestDbTable( + imgDb, + 'CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \ + 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)', + 'INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)', + { + (100, 'one.jpg', 'CC BY-SA 3.0', 'author1', 'credits1', '', 'https://upload.wikimedia.org/one.jpg'), + (200, 'two.jpeg', 'cc-by', 'author2', 'credits2', '', 'https://upload.wikimedia.org/two.jpeg'), + } + ) + # Create temp history db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (10, 'first', 100, 1000, None, None, 0, 'event'), + (20, 'second', 10, 20, None, None, 0, 'event'), + (30, 'third', 1, 20, 30, 40, 2, 'event'), + } + ) + # Run + outDir = os.path.join(tempDir, 'imgs') + genImgs(imgDir, imgDb, outDir, dbFile) + # Check + self.assertEqual(set(os.listdir(outDir)), { + '100.jpg', + '200.jpg', + }) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, img_id from event_imgs'), + { + (10, 100), + (20, 200), + (30, 200), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, url, license, artist, credit from images'), + { + (100, 'https://en.wikipedia.org/wiki/File:one.jpg', 'CC BY-SA 3.0', 'author1', 'credits1'), + (200, 'https://en.wikipedia.org/wiki/File:two.jpeg', 'cc-by', 'author2', 'credits2'), + } + ) diff --git a/backend/tests/test_img.png b/backend/tests/test_img.png Binary files differnew file mode 100644 index 0000000..d4f15c9 --- /dev/null +++ b/backend/tests/test_img.png diff --git 
a/package.json b/package.json new file mode 100644 index 0000000..882895f --- /dev/null +++ b/package.json @@ -0,0 +1,13 @@ +{ + "name": "histplorer", + "version": "0.1.0", + "description": "An interactive historical timeline", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "Terry Truong", + "license": "MIT", + "devDependencies": { + "smartcrop-cli": "^2.0.3" + } +} |
