From 0a9b2c2e5eca8a04e37fbdd423379882863237c2 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 21 Jan 2023 12:21:03 +1100 Subject: Adjust backend coding style Increase line spacing, add section comments, etc --- backend/hist_data/enwiki/download_imgs.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'backend/hist_data/enwiki/download_imgs.py') diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index df40bae..e484b33 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,33 +9,38 @@ The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. """ -# Took about a week to downloaded about 60k images +# Note: Took about a week to download about 60k images import argparse -import re, os, time, signal +import re +import os +import time +import signal import sqlite3 -import urllib.parse, requests +import urllib.parse +import requests IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' - # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'. + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec. 
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): os.mkdir(outDir) + print('Checking for already-downloaded images') fileList = os.listdir(outDir) imgIdsDone: set[int] = set() for filename in fileList: imgIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(imgIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Starting downloads') iterNum = 0 query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs' @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for image ID {imgId}') @@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: timeout *= 2 print(f'New timeout: {timeout}') continue + print('Closing database') dbCon.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) -- cgit v1.2.3