From daccbbd9c73a5292ea9d6746560d7009e5aa666d Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 7 Sep 2022 11:37:37 +1000 Subject: Add python type annotations Also use consistent quote symbols Also use 'is None' instead of '== None' Also use 'if list1' instead of 'if len(list1) > 0' --- backend/tolData/enwiki/downloadImgLicenseInfo.py | 136 +++++++++++------------ 1 file changed, 68 insertions(+), 68 deletions(-) (limited to 'backend/tolData/enwiki/downloadImgLicenseInfo.py') diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py index dd39d54..ba6317e 100755 --- a/backend/tolData/enwiki/downloadImgLicenseInfo.py +++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, re +import re import sqlite3, urllib.parse, html import requests import time, signal @@ -16,33 +16,33 @@ at already-processed names to decide what to skip. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imgDb = "imgData.db" -apiUrl = "https://en.wikipedia.org/w/api.php" -userAgent = "terryt.dev (terry06890@gmail.com)" +imgDb = 'imgData.db' +apiUrl = 'https://en.wikipedia.org/w/api.php' +userAgent = 'terryt.dev (terry06890@gmail.com)' batchSz = 50 # Max 50 -tagRegex = re.compile(r"<[^<]+>") -whitespaceRegex = re.compile(r"\s+") +tagRegex = re.compile(r'<[^<]+>') +whitespaceRegex = re.compile(r'\s+') -print("Opening database") +print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() dbCur2 = dbCon.cursor() -print("Checking for table") -if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None: - dbCur.execute("CREATE TABLE imgs(" \ - "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)") +print('Checking for table') +if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: + dbCur.execute('CREATE TABLE imgs(' \ + 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') -print("Reading image names") -imgNames = set() -for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"): +print('Reading image names') +imgNames: set[str] = set() +for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) -print(f"Found {len(imgNames)}") +print(f'Found {len(imgNames)}') -print("Checking for already-processed images") +print('Checking for already-processed images') oldSz = len(imgNames) -for (imgName,) in dbCur.execute("SELECT name FROM imgs"): +for (imgName,) in dbCur.execute('SELECT name FROM imgs'): imgNames.discard(imgName) -print(f"Found {oldSz - len(imgNames)}") +print(f'Found {oldSz - len(imgNames)}') # Set SIGINT handler interrupted = False @@ -53,95 +53,95 @@ def onSigint(sig, frame): signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) -print("Iterating through image names") -imgNames = list(imgNames) +print('Iterating through image names') +imgNameList = list(imgNames) iterNum = 0 -for i in range(0, len(imgNames), batchSz): +for i in range(0, len(imgNameList), batchSz): iterNum += 1 if iterNum % 1 == 0: - print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)") + print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)') if interrupted: - print(f"Exiting loop at iteration {iterNum}") + print(f'Exiting loop at iteration {iterNum}') break # Get batch - imgBatch = imgNames[i:i+batchSz] - imgBatch = ["File:" + x for x in imgBatch] + imgBatch = imgNameList[i:i+batchSz] + imgBatch = ['File:' + x for x in imgBatch] # Make request headers = { - "user-agent": userAgent, - "accept-encoding": "gzip", + 'user-agent': userAgent, + 'accept-encoding': 'gzip', } params = { - "action": "query", - "format": "json", - "prop": "imageinfo", - "iiprop": "extmetadata|url", - "maxlag": "5", - "titles": "|".join(imgBatch), - "iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions", + 'action': 'query', + 'format': 'json', + 'prop': 'imageinfo', + 'iiprop': 'extmetadata|url', + 'maxlag': '5', + 'titles': '|'.join(imgBatch), + 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions', } responseObj = None try: response = requests.get(apiUrl, params=params, headers=headers) responseObj = response.json() except Exception as e: - print(f"ERROR: Exception while downloading info: {e}") - print(f"\tImage batch: " + "|".join(imgBatch)) + print(f'ERROR: Exception while downloading info: {e}') + print('\tImage batch: ' + '|'.join(imgBatch)) continue # Parse response-object - if "query" not in responseObj or "pages" not in responseObj["query"]: - print("WARNING: Response object for doesn't have page data") - print("\tImage batch: " + "|".join(imgBatch)) - if "error" in responseObj: - errorCode = responseObj["error"]["code"] - print(f"\tError code: {errorCode}") - if errorCode == "maxlag": + if 'query' not in responseObj or 'pages' not in responseObj['query']: + print('WARNING: Response object for doesn\'t have page data') + print('\tImage batch: ' + '|'.join(imgBatch)) + if 'error' in responseObj: + errorCode = responseObj['error']['code'] + print(f'\tError code: {errorCode}') + if errorCode == 'maxlag': time.sleep(5) continue - pages = responseObj["query"]["pages"] - normalisedToInput = {} - if "normalized" in responseObj["query"]: - for entry in responseObj["query"]["normalized"]: - normalisedToInput[entry["to"]] = entry["from"] - for (_, page) in pages.items(): + pages = responseObj['query']['pages'] + normalisedToInput: dict[str, str] = {} + if 'normalized' in responseObj['query']: + for entry in responseObj['query']['normalized']: + normalisedToInput[entry['to']] = entry['from'] + for _, page in pages.items(): # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data # LicenseShortName: short human-readable license name, apparently more reliable than 'License', # Artist: author name (might contain complex html, multiple authors, etc) # Credit: 'source' # For image-map-like images, can be quite large/complex html, creditng each sub-image - # May be text2, where the text2 might be non-indicative + # May be text2, where the text2 might be non-indicative # Restrictions: specifies non-copyright legal restrictions - title = page["title"] + title: str = page['title'] if title in normalisedToInput: title = normalisedToInput[title] title = title[5:] # Remove 'File:' if title not in imgNames: - print(f"WARNING: Got title \"{title}\" not in image-name list") + print(f'WARNING: Got title "{title}" not in image-name list') continue - if "imageinfo" not in page: - print(f"WARNING: No imageinfo section for page \"{title}\"") + if 'imageinfo' not in page: + print(f'WARNING: No imageinfo section for page "{title}"') continue - metadata = page["imageinfo"][0]["extmetadata"] - url = page["imageinfo"][0]["url"] - license = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None - artist = metadata['Artist']['value'] if 'Artist' in metadata else None - credit = metadata['Credit']['value'] if 'Credit' in metadata else None - restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + metadata = page['imageinfo'][0]['extmetadata'] + url: str = page['imageinfo'][0]['url'] + license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None + artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None + credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None + restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None # Remove markup - if artist != None: - artist = tagRegex.sub(" ", artist) - artist = whitespaceRegex.sub(" ", artist) + if artist is not None: + artist = tagRegex.sub(' ', artist) + artist = whitespaceRegex.sub(' ', artist) artist = html.unescape(artist) artist = urllib.parse.unquote(artist) - if credit != None: - credit = tagRegex.sub(" ", credit) - credit = whitespaceRegex.sub(" ", credit) + if credit is not None: + credit = tagRegex.sub(' ', credit) + credit = whitespaceRegex.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) # Add to db - dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", + dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', (title, license, artist, credit, restrictions, url)) -print("Closing database") +print('Closing database') dbCon.commit() dbCon.close() -- cgit v1.2.3