aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-10-02 21:15:53 +1100
committerTerry Truong <terry06890@gmail.com>2022-10-02 21:15:53 +1100
commit3e256d2fd048997370b2c043ea293ea9a3e2430c (patch)
tree14ea5dc358720ce9adedaaae3240e0b3d8f18793 /backend/hist_data
parent149dc178c491d8e447a05ff3705fdc6ceddf129e (diff)
Add gen_imgs.py
Add package.json, for using npm package smartcrop-cli Add unit test
Diffstat (limited to 'backend/hist_data')
-rw-r--r--backend/hist_data/README.md11
-rwxr-xr-xbackend/hist_data/gen_imgs.py155
2 files changed, 164 insertions, 2 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 5b64462..c5cf66f 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -21,6 +21,12 @@ This directory holds files used to generate the history database data.db.
- `pop`: <br>
Format: `id INT PRIMARY KEY, pop INT` <br>
Associates each event with a popularity measure (currently an average monthly viewcount)
+- `images`: <br>
+ Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br>
+ Holds metadata for available images
+- `event_imgs`: <br>
+ Format: `id INT PRIMARY KEY, img_id INT` <br>
+ Associates events with images
# Generating the Database
@@ -46,10 +52,11 @@ Some of the scripts use third-party packages:
1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found
images, and adds them to the image database.
1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/.
-1. Run
+1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/.
+ Adds the `images` and `event_imgs` tables.
## Generate Description Data
1. Obtain an enwiki dump in enwiki/, as specified in the README.
1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
-1. Run
+1. Run
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
new file mode 100755
index 0000000..526da1b
--- /dev/null
+++ b/backend/hist_data/gen_imgs.py
@@ -0,0 +1,155 @@
+#!/usr/bin/python3
+
+"""
+
+Looks at images described by a database, and generates resized/cropped versions
+into an output directory, with names of the form 'eventId1.jpg'.
+Adds the image associations and metadata to the history database.
+
+SIGINT can be used to stop, and the program can be re-run to continue
+processing. It uses already-existing database entries to decide what
+to skip.
+"""
+
+import os, math, subprocess
+import sqlite3, urllib.parse
+import signal
+from PIL import Image, UnidentifiedImageError
+
# Inputs (relative paths): the directory of source images, whose filenames
# are image IDs, and the sqlite database describing those images
_ENWIKI_DIR = 'enwiki'
IMG_DIR = os.path.join(_ENWIKI_DIR, 'imgs')
IMG_DB = os.path.join(_ENWIKI_DIR, 'img_data.db')
# Outputs: the directory for converted images, and the history database
OUT_DIR = 'img'
DB_FILE = 'data.db'
# Limits on output image dimensions
MAX_MINOR_DIM = 200 # Upper bound, in pixels, for the smaller output dimension
MAX_DIM_RATIO = 2 # Upper bound for (larger dimension / smaller dimension)
+
def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str) -> None:
    """Convert images and record them in the history database.

    Creates the output directory and the 'event_imgs' and 'images' tables if
    they are missing. Otherwise the existing table rows are read, so that
    events and images handled by an earlier run are skipped.

    Args:
        imgDir: Directory holding the source images.
        imgDb: Path of the sqlite database describing the source images.
        outDir: Directory that receives the converted images.
        dbFile: Path of the history database to update.
    """
    # makedirs also copes with nested output paths and an already-existing directory
    os.makedirs(outDir, exist_ok=True)
    dbCon = sqlite3.connect(dbFile)
    try:
        dbCur = dbCon.cursor()
        #
        print('Checking for image tables')
        eventsDone: set[int] = set()
        imgsDone: set[int] = set()
        # Single-quoted literals: SQLite only accepts double-quoted strings as a legacy quirk
        tableQuery = "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'event_imgs'"
        if dbCur.execute(tableQuery).fetchone() is None:
            # Add image tables
            dbCur.execute('CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)')
            dbCur.execute(
                'CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)')
        else:
            # Get existing image-associated events
            eventsDone = {eventId for (eventId,) in dbCur.execute('SELECT id FROM event_imgs')}
            # Get existing event-associated images
            imgsDone = {imgId for (imgId,) in dbCur.execute('SELECT id FROM images')}
            print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip')
        #
        print('Processing images')
        try:
            processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone)
        finally:
            # Keep the rows for images that were already converted, even if processing
            # raised: an output file without a matching row would make the next run
            # stop with 'Output image already exists'
            dbCon.commit()
    finally:
        dbCon.close()
def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
        eventsDone: set[int], imgsDone: set[int]) -> bool:
    """Convert the images in a directory and add them to the history database.

    Each file in imgDir is expected to be named by its image ID. For each one,
    the events whose titles are associated with the image are looked up. The
    image is converted, and rows are added to 'images' and 'event_imgs', unless
    that work was already done.

    While this runs, SIGINT only sets a flag that is checked between images.
    The previous SIGINT handler is restored before returning.

    Args:
        imgDir: Directory holding the source images.
        imgDb: Path of the sqlite database describing the source images.
        outDir: Directory that receives the converted images.
        dbCur: Cursor for the history database. Nothing is committed here.
        eventsDone: IDs of events that already have an image. Updated in place.
        imgsDone: IDs of images that are already converted. Updated in place.

    Returns:
        False upon interruption or failure, True otherwise.
    """
    interrupted = False
    def onSigint(sig, frame):
        nonlocal interrupted
        interrupted = True
    titleQuery = 'SELECT title FROM page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name WHERE imgs.id = ?'
    imgDbCon = sqlite3.connect(imgDb)
    prevHandler = None
    try:
        imgDbCur = imgDbCon.cursor()
        # Set SIGINT handler
        prevHandler = signal.signal(signal.SIGINT, onSigint)
        # Convert images
        for imgFile in os.listdir(imgDir):
            # Check for SIGINT event
            if interrupted:
                print('Exiting')
                return False
            # Get image ID
            imgIdStr, _ = os.path.splitext(imgFile)
            try:
                imgId = int(imgIdStr)
            except ValueError:
                # A stray file should not abort the whole run
                print(f'WARNING: Skipping {imgFile}, as its name is not an image ID')
                continue
            # Get associated events
            eventIds: set[int] = set()
            for (title,) in imgDbCur.execute(titleQuery, (imgId,)):
                row = dbCur.execute('SELECT id FROM events WHERE title = ?', (title,)).fetchone()
                if row is None:
                    print(f'ERROR: No event ID found for title {title} associated with image {imgFile}')
                    continue
                eventIds.add(row[0])
            eventIds -= eventsDone
            if not eventIds:
                continue
            if imgId not in imgsDone:
                # Look up the image record before converting, so a missing record
                # does not leave behind an output file without a database row
                row = imgDbCur.execute(
                    'SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone()
                if row is None:
                    print(f'ERROR: No image record for ID {imgId}')
                    return False
                # Convert image
                if not convertImage(os.path.join(imgDir, imgFile), os.path.join(outDir, str(imgId) + '.jpg')):
                    return False
                # Add entry to db
                name, licenseName, artist, credit = row
                url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
                dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, licenseName, artist, credit))
                imgsDone.add(imgId)
            for eventId in eventIds:
                dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId))
            # event_imgs.id is a primary key, so an event must not get a second row from a later file
            eventsDone.update(eventIds)
        return True
    finally:
        # signal.signal() returns None if the old handler was not installed from Python,
        # in which case it cannot be passed back in
        if prevHandler is not None:
            signal.signal(signal.SIGINT, prevHandler)
        imgDbCon.close()
def _limitDims(width: int, height: int, maxMinorDim: int, maxDimRatio: float) -> tuple[int, int]:
    """Return output dimensions for an image of the given size.

    The image is scaled down, keeping its aspect ratio, until the smaller
    dimension is at most maxMinorDim. The larger dimension is then cut back
    so that it is at most maxDimRatio times the smaller one.
    """
    if width > height:
        if height > maxMinorDim:
            # Scale by (maxMinorDim / height), so the height becomes maxMinorDim
            width = math.ceil(width * maxMinorDim / height)
            height = maxMinorDim
        if width / height > maxDimRatio:
            width = math.ceil(height * maxDimRatio)
    else:
        if width > maxMinorDim:
            # Scale by (maxMinorDim / width), so the width becomes maxMinorDim
            height = math.ceil(height * maxMinorDim / width)
            width = maxMinorDim
        if height / width > maxDimRatio:
            height = math.ceil(width * maxDimRatio)
    return width, height

def convertImage(imgPath: str, outPath: str) -> bool:
    """Write a resized/cropped version of an image, using smartcrop-cli via npx.

    Args:
        imgPath: Path of the image to convert.
        outPath: Path to write the converted image to. Must not exist yet.

    Returns:
        True on success. False if outPath already exists, the image cannot be
        opened, or smartcrop cannot be run or exits with a non-zero status.
    """
    print(f'Converting {imgPath} to {outPath}')
    if os.path.exists(outPath):
        print('ERROR: Output image already exists')
        return False
    # Get image dims
    width: int
    height: int
    try:
        with Image.open(imgPath) as image:
            width, height = image.size
    except (UnidentifiedImageError, OSError) as e:
        # OSError also covers a missing or unreadable file
        print(f'ERROR: Unable to open/identify {imgPath}: {e}')
        return False
    # Limit output dims
    width, height = _limitDims(width, height, MAX_MINOR_DIM, MAX_DIM_RATIO)
    # Convert image
    try:
        completedProcess = subprocess.run(
            ['npx', 'smartcrop-cli', '--width', str(width), '--height', str(height), imgPath, outPath],
            stdout=subprocess.DEVNULL
        )
    except Exception as e:
        print(f'ERROR: Exception while attempting to run smartcrop: {e}')
        return False
    if completedProcess.returncode != 0:
        print(f'ERROR: smartcrop had exit status {completedProcess.returncode}')
        return False
    return True
+
if __name__ == '__main__':
    import argparse
    # No options are defined; parsing the arguments still provides -h/--help,
    # which shows the module docstring with its line breaks kept
    argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    ).parse_args()
    # Run the conversion with the default paths
    genImgs(IMG_DIR, IMG_DB, OUT_DIR, DB_FILE)