From daccbbd9c73a5292ea9d6746560d7009e5aa666d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 7 Sep 2022 11:37:37 +1000 Subject: Add python type annotations Also use consistent quote symbols Also use 'is None' instead of '== None' Also use 'if list1' instead of 'if len(list1) > 0' --- backend/tolData/enwiki/downloadImgs.py | 50 +++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'backend/tolData/enwiki/downloadImgs.py') diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py index 520677f..def4714 100755 --- a/backend/tolData/enwiki/downloadImgs.py +++ b/backend/tolData/enwiki/downloadImgs.py @@ -16,20 +16,20 @@ in the output directory do decide what to skip. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imgDb = "imgData.db" # About 130k image names -outDir = "imgs" -licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) +imgDb = 'imgData.db' # About 130k image names +outDir = 'imgs' +licenseRegex = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) # In testing, this downloaded about 100k images, over several days if not os.path.exists(outDir): os.mkdir(outDir) -print("Checking for already-downloaded images") +print('Checking for already-downloaded images') fileList = os.listdir(outDir) -pageIdsDone = set() +pageIdsDone: set[int] = set() for filename in fileList: - (basename, extension) = os.path.splitext(filename) + basename, extension = os.path.splitext(filename) pageIdsDone.add(int(basename)) -print(f"Found {len(pageIdsDone)}") +print(f'Found {len(pageIdsDone)}') # Set SIGINT handler interrupted = False @@ -40,49 +40,49 @@ def onSigint(sig, frame): signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) -print("Opening database") +print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() -print("Starting downloads") +print('Starting downloads') iterNum = 0 -query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \ - " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name" -for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query): +query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \ + ' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name' +for pageId, license, artist, credit, restrictions, url in dbCur.execute(query): if pageId in pageIdsDone: continue if interrupted: - print(f"Exiting loop") + print('Exiting loop') break # Check for problematic attributes - if license == None or licenseRegex.fullmatch(license) == None: + if license is None or licenseRegex.fullmatch(license) is None: continue - if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None: + if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None: continue - if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None: + if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None: continue - if restrictions != None and restrictions != "": + if restrictions is not None and restrictions != '': continue # Download image iterNum += 1 - print(f"Iteration {iterNum}: Downloading for page-id {pageId}") + print(f'Iteration {iterNum}: Downloading for page-id {pageId}') urlParts = urllib.parse.urlparse(url) extension = os.path.splitext(urlParts.path)[1] if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}") + print(f'WARNING: No filename extension found in URL {url}') sys.exit(1) - outFile = f"{outDir}/{pageId}{extension}" + outFile = f'{outDir}/{pageId}{extension}' headers = { - "user-agent": "terryt.dev (terry06890@gmail.com)", - "accept-encoding": "gzip", + 'user-agent': 'terryt.dev (terry06890@gmail.com)', + 'accept-encoding': 'gzip', } try: response = requests.get(url, headers=headers) with open(outFile, 'wb') as file: file.write(response.content) time.sleep(1) - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec" + # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle self to 1 cache miss per sec' # It's unclear how to properly check for cache misses, so this just aims for 1 per sec except Exception as e: - print(f"Error while downloading to {outFile}: {e}") -print("Closing database") + print(f'Error while downloading to {outFile}: {e}') +print('Closing database') dbCon.close() -- cgit v1.2.3