aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/enwiki/download_imgs.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data/enwiki/download_imgs.py')
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py27
1 files changed, 18 insertions, 9 deletions
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index df40bae..e484b33 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,33 +9,38 @@ The program can be re-run to continue downloading, and looks
in the output directory to decide what to skip.
"""
-# Took about a week to downloaded about 60k images
+# Note: Took about a week to download about 60k images
import argparse
-import re, os, time, signal
+import re
+import os
+import time
+import signal
import sqlite3
-import urllib.parse, requests
+import urllib.parse
+import requests
IMG_DB = 'img_data.db' # About 130k image names
OUT_DIR = 'imgs'
-#
+
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
TIMEOUT = 1
- # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
- # It's unclear how to properly check for cache misses, so we just aim for 1 per sec
+ # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'.
+ # It's unclear how to properly check for cache misses, so we just aim for 1 per sec.
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit)
def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if not os.path.exists(outDir):
os.mkdir(outDir)
+
print('Checking for already-downloaded images')
fileList = os.listdir(outDir)
imgIdsDone: set[int] = set()
for filename in fileList:
imgIdsDone.add(int(os.path.splitext(filename)[0]))
print(f'Found {len(imgIdsDone)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
+
print('Starting downloads')
iterNum = 0
query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs'
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if interrupted:
print('Exiting loop')
break
+
# Check for problematic attributes
if license is None or LICENSE_REGEX.fullmatch(license) is None:
continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
continue
if restrictions is not None and restrictions != '':
continue
+
# Download image
iterNum += 1
print(f'Iteration {iterNum}: Downloading for image ID {imgId}')
@@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
timeout *= 2
print(f'New timeout: {timeout}')
continue
+
print('Closing database')
dbCon.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)