From 0a9b2c2e5eca8a04e37fbdd423379882863237c2 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 21 Jan 2023 12:21:03 +1100 Subject: Adjust backend coding style Increase line spacing, add section comments, etc --- .../hist_data/enwiki/download_img_license_info.py | 29 +++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'backend/hist_data/enwiki/download_img_license_info.py') diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 43f2c43..6fd710c 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -10,12 +10,16 @@ at already-processed names to decide what to skip. """ import argparse -import re, time, signal -import sqlite3, urllib.parse, html +import re +import time +import signal +import sqlite3 +import urllib.parse +import html import requests IMG_DB = 'img_data.db' -# + API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' BATCH_SZ = 50 # Max 50 @@ -26,17 +30,18 @@ def downloadInfo(imgDb: str) -> None: print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Checking for table') if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \ 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - # + print('Reading image names') imgNames: set[str] = set() for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) print(f'Found {len(imgNames)}') - # + print('Checking for already-processed images') nextImgId = 1 oldSz = len(imgNames) @@ -45,7 +50,7 @@ def downloadInfo(imgDb: str) -> None: if imgId >= nextImgId: nextImgId = imgId + 1 print(f'Found {oldSz - len(imgNames)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -54,7 +59,7 @@ def downloadInfo(imgDb: str) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Iterating through image names') imgNameList = list(imgNames) iterNum = 0 @@ -65,9 +70,11 @@ def downloadInfo(imgDb: str) -> None: if interrupted: print(f'Exiting loop at iteration {iterNum}') break + # Get batch imgBatch = imgNameList[i:i+BATCH_SZ] imgBatch = ['File:' + x for x in imgBatch] + # Make request headers = { 'user-agent': USER_AGENT, @@ -90,6 +97,7 @@ def downloadInfo(imgDb: str) -> None: print(f'ERROR: Exception while downloading info: {e}') print('\tImage batch: ' + '|'.join(imgBatch)) continue + # Parse response-object if 'query' not in responseObj or 'pages' not in responseObj['query']: print('WARNING: Response object doesn\'t have page data') @@ -120,6 +128,7 @@ def downloadInfo(imgDb: str) -> None: if title not in imgNames: print(f'WARNING: Got title "{title}" not in image-name list') continue + if 'imageinfo' not in page: print(f'WARNING: No imageinfo section for page "{title}"') continue @@ -129,6 +138,7 @@ def downloadInfo(imgDb: str) -> None: artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup if artist is not None: artist = TAG_REGEX.sub(' ', artist).strip() @@ -140,11 +150,12 @@ def downloadInfo(imgDb: str) -> None: credit = WHITESPACE_REGEX.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) + # Add to db dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)', (nextImgId, title, license, artist, credit, restrictions, url)) nextImgId += 1 - # + print('Closing database') dbCon.commit() dbCon.close() @@ -152,5 +163,5 @@ def downloadInfo(imgDb: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadInfo(IMG_DB) -- cgit v1.2.3