about summary refs log tree commit diff
path: root/backend/hist_data/gen_imgs.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data/gen_imgs.py')
-rwxr-xr-x backend/hist_data/gen_imgs.py 8
1 files changed, 5 insertions, 3 deletions
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 6d57180..46cf6ee 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -10,6 +10,8 @@ processing. It uses already-existing database entries to decide what
to skip.
"""
+# Took about 10 hours to process about 60k images
+
import argparse
import os, subprocess, signal
import sqlite3, urllib.parse
@@ -44,7 +46,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
imgsDone.add(imgId)
print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip')
#
- print('Processing images from eol and enwiki')
+ print('Processing images')
processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone)
#
dbCon.commit()
@@ -89,8 +91,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
if not success:
flag = True
break
- # Add entry to db
- if imgId not in imgsDone:
+ # Add image to db
row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone()
if row is None:
print(f'ERROR: No image record for ID {imgId}')
@@ -99,6 +100,7 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
name, license, artist, credit = row
url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit))
+ # Add event association to db
for eventId in eventIds:
dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId))
imgDbCon.close()