aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-01-13 18:56:04 +1100
committerTerry Truong <terry06890@gmail.com>2023-01-13 18:56:04 +1100
commit3029a2f866b240856518cfa944b9e00ef37455db (patch)
tree0c9dcfaa065d346060de69779523aef66d62d6d4 /backend/hist_data
parentd335df85ea815d10df62a126067482e178e3670a (diff)
Re-enable images
Diffstat (limited to 'backend/hist_data')
-rw-r--r--backend/hist_data/README.md5
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py2
-rwxr-xr-xbackend/hist_data/gen_imgs.py8
3 files changed, 11 insertions, 4 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 4cc7103..2a9475a 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -68,10 +68,13 @@ Some of the scripts use third-party packages:
USER_AGENT variable applies here as well.
1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/.
Adds the `imgs` and `event_imgs` tables. <br>
- The output images may need additional manual changes:
+ The output images might need additional manual changes:
- An input image might have no output produced, possibly due to
data incompatibilities, memory limits, etc.
- An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
+ - An input image might produce output with unexpected dimensions.
+ This seems to happen when the image is very large, and triggers a
+ decompression bomb warning.
## Generate Description Data
1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index 7dd0771..378de7f 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,6 +9,8 @@ The program can be re-run to continue downloading, and looks
in the output directory to decide what to skip.
"""
+# Took about a week to download about 60k images
+
import argparse
import re, os, time, signal
import sqlite3
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 6d57180..46cf6ee 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -10,6 +10,8 @@ processing. It uses already-existing database entries to decide what
to skip.
"""
+# Took about 10 hours to process about 60k images
+
import argparse
import os, subprocess, signal
import sqlite3, urllib.parse
@@ -44,7 +46,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
imgsDone.add(imgId)
print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip')
#
- print('Processing images from eol and enwiki')
+ print('Processing images')
processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone)
#
dbCon.commit()
@@ -89,8 +91,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
if not success:
flag = True
break
- # Add entry to db
- if imgId not in imgsDone:
+ # Add image to db
row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone()
if row is None:
print(f'ERROR: No image record for ID {imgId}')
@@ -99,6 +100,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
name, license, artist, credit = row
url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit))
+ # Add event association to db
for eventId in eventIds:
dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId))
imgDbCon.close()