From 0a9b2c2e5eca8a04e37fbdd423379882863237c2 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 21 Jan 2023 12:21:03 +1100 Subject: Adjust backend coding style Increase line spacing, add section comments, etc --- backend/hist_data/enwiki/download_imgs.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'backend/hist_data/enwiki/download_imgs.py') diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index df40bae..e484b33 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,33 +9,38 @@ The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. """ -# Took about a week to downloaded about 60k images +# Note: Took about a week to download about 60k images import argparse -import re, os, time, signal +import re +import os +import time +import signal import sqlite3 -import urllib.parse, requests +import urllib.parse +import requests IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' - # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'. + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec. 
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): os.mkdir(outDir) + print('Checking for already-downloaded images') fileList = os.listdir(outDir) imgIdsDone: set[int] = set() for filename in fileList: imgIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(imgIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Starting downloads') iterNum = 0 query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs' @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for image ID {imgId}') @@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: timeout *= 2 print(f'New timeout: {timeout}') continue + print('Closing database') dbCon.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) -- cgit v1.2.3