about | summary | refs | log | tree | commit | diff
path: root/backend
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com> 2022-10-03 23:41:18 +1100
committer: Terry Truong <terry06890@gmail.com> 2022-10-03 23:41:31 +1100
commit: 5c0afa3e9a80fdcfa77bfc01b6373121f6ff3c6d (patch)
tree: 95bada97a649b7459d1ff8e523aac7f979d674eb /backend
parent: 0c6c79084bd0ba331c469b4203627f18eb3b8275 (diff)
Tweak gen_imgs.py output
Diffstat (limited to 'backend')
-rw-r--r-- backend/hist_data/README.md 7
-rwxr-xr-x backend/hist_data/gen_imgs.py 8
2 files changed, 10 insertions, 5 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 32836e2..7653f09 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -39,6 +39,7 @@ Some of the scripts use third-party packages:
- `indexed_bzip2`: For parallelised bzip2 processing
- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps
- `requests`: For downloading data
+- `Pillow`: For image processing
## Generate Event Data
1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README.
@@ -56,7 +57,11 @@ Some of the scripts use third-party packages:
images, and adds them to the image database.
1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/.
1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/.
- Adds the `imgs` and `event_imgs` tables.
+ Adds the `imgs` and `event_imgs` tables. <br>
+ The outputs will likely need additional manual changes:
+ - An input image might have no output produced, possibly due to
+ data incompatibilities, memory limits, etc.
+ - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
## Generate Description Data
1. Obtain an enwiki dump in enwiki/, as specified in the README.
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 0b2f480..817de03 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -13,7 +13,7 @@ to skip.
import os, math, subprocess
import sqlite3, urllib.parse
import signal
-from PIL import Image, UnidentifiedImageError
+from PIL import Image
IMG_DIR = os.path.join('enwiki', 'imgs')
IMG_DB = os.path.join('enwiki', 'img_data.db')
@@ -21,7 +21,7 @@ OUT_DIR = 'img'
DB_FILE = 'data.db'
#
MAX_MINOR_DIM = 200
-MAX_DIM_RATIO = 2
+MAX_DIM_RATIO = 3/2
def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
""" Converts images and updates db, checking for entries to skip """
@@ -116,8 +116,8 @@ def convertImage(imgPath: str, outPath: str):
try:
with Image.open(imgPath) as image:
width, height = image.size
- except UnidentifiedImageError as e:
- print(f'ERROR: Unable to open/identify {imgPath}: {e}')
+ except Exception as e: # Being more specific runs the risk of ending the program without committing to db
+ print(f'ERROR: Unable to open {imgPath}: {e}')
return False
# Limit output dims
if width > height: