From 5c0afa3e9a80fdcfa77bfc01b6373121f6ff3c6d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Mon, 3 Oct 2022 23:41:18 +1100 Subject: Tweak gen_imgs.py output --- backend/hist_data/README.md | 7 ++++++- backend/hist_data/gen_imgs.py | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'backend/hist_data') diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 32836e2..7653f09 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -39,6 +39,7 @@ Some of the scripts use third-party packages: - `indexed_bzip2`: For parallelised bzip2 processing - `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps - `requests`: For downloading data +- `Pillow`: For image processing ## Generate Event Data 1. Obtain a Wikidata JSON dump in wikidata/, as specified in it's README. @@ -56,7 +57,11 @@ Some of the scripts use third-party packages: images, and adds them to the image database. 1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. 1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/. - Adds the `imgs` and `event_imgs` tables. + Adds the `imgs` and `event_imgs` tables.
+ The outputs will likely need additional manual changes: - An input image might have no output produced, possibly due to data incompatibilities, memory limits, etc. - An input x.gif might produce x-1.jpg, x-2.jpg, etc., instead of x.jpg. ## Generate Description Data 1. Obtain an enwiki dump in enwiki/, as specified in the README. diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py index 0b2f480..817de03 100755 --- a/backend/hist_data/gen_imgs.py +++ b/backend/hist_data/gen_imgs.py @@ -13,7 +13,7 @@ to skip. import os, math, subprocess import sqlite3, urllib.parse import signal -from PIL import Image, UnidentifiedImageError +from PIL import Image IMG_DIR = os.path.join('enwiki', 'imgs') IMG_DB = os.path.join('enwiki', 'img_data.db') @@ -21,7 +21,7 @@ OUT_DIR = 'img' DB_FILE = 'data.db' # MAX_MINOR_DIM = 200 -MAX_DIM_RATIO = 2 +MAX_DIM_RATIO = 3/2 def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): """ Converts images and updates db, checking for entries to skip """ @@ -116,8 +116,8 @@ def convertImage(imgPath: str, outPath: str): try: with Image.open(imgPath) as image: width, height = image.size - except UnidentifiedImageError as e: - print(f'ERROR: Unable to open/identify {imgPath}: {e}') + except Exception as e: # Being more specific runs the risk of ending the program without committing to db + print(f'ERROR: Unable to open {imgPath}: {e}') return False # Limit output dims if width > height: -- cgit v1.2.3