From 0c6c79084bd0ba331c469b4203627f18eb3b8275 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Mon, 3 Oct 2022 19:54:06 +1100 Subject: Add exponential-backoff download option --- backend/hist_data/enwiki/download_imgs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'backend/hist_data/enwiki') diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index 664dd28..bbd2cda 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -22,6 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' # It's unclear how to properly check for cache misses, so we just aim for 1 per sec +BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): @@ -83,7 +84,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: time.sleep(timeout) except Exception as e: print(f'Error while downloading to {outFile}: {e}') - return + if not BACKOFF: + return + else: + timeout *= 2 + print(f'New timeout: {timeout}') + continue print('Closing database') dbCon.close() -- cgit v1.2.3