diff options
Diffstat (limited to 'backend/hist_data')
| -rw-r--r-- | backend/hist_data/README.md | 5 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_imgs.py | 2 | ||||
| -rwxr-xr-x | backend/hist_data/gen_imgs.py | 8 |
3 files changed, 11 insertions, 4 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 4cc7103..2a9475a 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -68,10 +68,13 @@ Some of the scripts use third-party packages: USER_AGENT variable applies here as well. 1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/. Adds the `imgs` and `event_imgs` tables. <br> - The output images may need additional manual changes: + The output images might need additional manual changes: - An input image might have no output produced, possibly due to data incompatibilities, memory limits, etc. - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. + - An input image might produce output with unexpected dimensions. + This seems to happen when the image is very large, and triggers a + decompression bomb warning. ## Generate Description Data 1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index 7dd0771..378de7f 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,6 +9,8 @@ The program can be re-run to continue downloading, and looks in the output directory to decide what to skip. """ +# Took about a week to download about 60k images + import argparse import re, os, time, signal import sqlite3 diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py index 6d57180..46cf6ee 100755 --- a/backend/hist_data/gen_imgs.py +++ b/backend/hist_data/gen_imgs.py @@ -10,6 +10,8 @@ processing. It uses already-existing database entries to decide what to skip. 
""" +# Took about 10 hours to process about 60k images + import argparse import os, subprocess, signal import sqlite3, urllib.parse @@ -44,7 +46,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): imgsDone.add(imgId) print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip') # - print('Processing images from eol and enwiki') + print('Processing images') processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone) # dbCon.commit() @@ -89,8 +91,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, if not success: flag = True break - # Add entry to db - if imgId not in imgsDone: + # Add image to db row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone() if row is None: print(f'ERROR: No image record for ID {imgId}') @@ -99,6 +100,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, name, license, artist, credit = row url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name) dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit)) + # Add event association to db for eventId in eventIds: dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId)) imgDbCon.close() |
