aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-01-13 18:56:04 +1100
committerTerry Truong <terry06890@gmail.com>2023-01-13 18:56:04 +1100
commit3029a2f866b240856518cfa944b9e00ef37455db (patch)
tree0c9dcfaa065d346060de69779523aef66d62d6d4 /backend/hist_data
parentd335df85ea815d10df62a126067482e178e3670a (diff)
Re-enable images
Diffstat (limited to 'backend/hist_data')
-rw-r--r--backend/hist_data/README.md5
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py2
-rwxr-xr-xbackend/hist_data/gen_imgs.py8
3 files changed, 11 insertions, 4 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 4cc7103..2a9475a 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -68,10 +68,13 @@ Some of the scripts use third-party packages:
USER_AGENT variable applies here as well.
1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/.
Adds the `imgs` and `event_imgs` tables. <br>
- The output images may need additional manual changes:
+ The output images might need additional manual changes:
- An input image might have no output produced, possibly due to
data incompatibilities, memory limits, etc.
- An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
+ - An input image might produce output with unexpected dimensions.
+ This seems to happen when the image is very large, and triggers a
+ decompression bomb warning.
## Generate Description Data
1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index 7dd0771..378de7f 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,6 +9,8 @@ The program can be re-run to continue downloading, and looks
in the output directory to decide what to skip.
"""
+# Took about a week to download about 60k images
+
import argparse
import re, os, time, signal
import sqlite3
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 6d57180..46cf6ee 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -10,6 +10,8 @@ processing. It uses already-existing database entries to decide what
to skip.
"""
+# Took about 10 hours to process about 60k images
+
import argparse
import os, subprocess, signal
import sqlite3, urllib.parse
@@ -44,7 +46,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
imgsDone.add(imgId)
print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip')
#
- print('Processing images from eol and enwiki')
+ print('Processing images')
processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone)
#
dbCon.commit()
@@ -89,8 +91,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
if not success:
flag = True
break
- # Add entry to db
- if imgId not in imgsDone:
+ # Add image to db
row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone()
if row is None:
print(f'ERROR: No image record for ID {imgId}')
@@ -99,6 +100,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
name, license, artist, credit = row
url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit))
+ # Add event association to db
for eventId in eventIds:
dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId))
imgDbCon.close()