From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 11 Sep 2022 14:55:42 +1000 Subject: Add backend unit tests - Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing --- backend/tolData/enwiki/downloadImgLicenseInfo.py | 147 ----------------------- 1 file changed, 147 deletions(-) delete mode 100755 backend/tolData/enwiki/downloadImgLicenseInfo.py (limited to 'backend/tolData/enwiki/downloadImgLicenseInfo.py') diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py deleted file mode 100755 index ba6317e..0000000 --- a/backend/tolData/enwiki/downloadImgLicenseInfo.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/python3 - -import re -import sqlite3, urllib.parse, html -import requests -import time, signal - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads image names from a database, and uses enwiki's online API to obtain -licensing information for them, adding the info to the database. - -SIGINT causes the program to finish an ongoing download and exit. -The program can be re-run to continue downloading, and looks -at already-processed names to decide what to skip. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imgDb = 'imgData.db' -apiUrl = 'https://en.wikipedia.org/w/api.php' -userAgent = 'terryt.dev (terry06890@gmail.com)' -batchSz = 50 # Max 50 -tagRegex = re.compile(r'<[^<]+>') -whitespaceRegex = re.compile(r'\s+') - -print('Opening database') -dbCon = sqlite3.connect(imgDb) -dbCur = dbCon.cursor() -dbCur2 = dbCon.cursor() -print('Checking for table') -if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: - dbCur.execute('CREATE TABLE imgs(' \ - 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - -print('Reading image names') -imgNames: set[str] = set() -for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): - imgNames.add(imgName) -print(f'Found {len(imgNames)}') - -print('Checking for already-processed images') -oldSz = len(imgNames) -for (imgName,) in dbCur.execute('SELECT name FROM imgs'): - imgNames.discard(imgName) -print(f'Found {oldSz - len(imgNames)}') - -# Set SIGINT handler -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) - -print('Iterating through image names') -imgNameList = list(imgNames) -iterNum = 0 -for i in range(0, len(imgNameList), batchSz): - iterNum += 1 - if iterNum % 1 == 0: - print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)') - if interrupted: - print(f'Exiting loop at iteration {iterNum}') - break - # Get batch - imgBatch = imgNameList[i:i+batchSz] - imgBatch = ['File:' + x for x in imgBatch] - # Make request - headers = { - 'user-agent': userAgent, - 'accept-encoding': 'gzip', - } - params = { - 'action': 'query', - 'format': 'json', - 'prop': 'imageinfo', - 'iiprop': 'extmetadata|url', - 'maxlag': '5', - 'titles': '|'.join(imgBatch), - 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions', - } - responseObj = None - try: - response = requests.get(apiUrl, params=params, headers=headers) - responseObj = response.json() - except Exception as e: - print(f'ERROR: Exception while downloading info: {e}') - print('\tImage batch: ' + '|'.join(imgBatch)) - continue - # Parse response-object - if 'query' not in responseObj or 'pages' not in responseObj['query']: - print('WARNING: Response object for doesn\'t have page data') - print('\tImage batch: ' + '|'.join(imgBatch)) - if 'error' in responseObj: - errorCode = responseObj['error']['code'] - print(f'\tError code: {errorCode}') - if errorCode == 'maxlag': - time.sleep(5) - continue - pages = responseObj['query']['pages'] - normalisedToInput: dict[str, str] = {} - if 'normalized' in responseObj['query']: - for entry in responseObj['query']['normalized']: - normalisedToInput[entry['to']] = entry['from'] - for _, page in pages.items(): - # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data - # LicenseShortName: short human-readable license name, apparently more reliable than 'License', - # Artist: author name (might contain complex html, multiple authors, etc) - # Credit: 'source' - # For image-map-like images, can be quite large/complex html, creditng each sub-image - # May be text2, where the text2 might be non-indicative - # Restrictions: specifies non-copyright legal restrictions - title: str = page['title'] - if title in normalisedToInput: - title = normalisedToInput[title] - title = title[5:] # Remove 'File:' - if title not in imgNames: - print(f'WARNING: Got title "{title}" not in image-name list') - continue - if 'imageinfo' not in page: - print(f'WARNING: No imageinfo section for page "{title}"') - continue - metadata = page['imageinfo'][0]['extmetadata'] - url: str = page['imageinfo'][0]['url'] - license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None - artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None - credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None - restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None - # Remove markup - if artist is not None: - artist = tagRegex.sub(' ', artist) - artist = whitespaceRegex.sub(' ', artist) - artist = html.unescape(artist) - artist = urllib.parse.unquote(artist) - if credit is not None: - credit = tagRegex.sub(' ', credit) - credit = whitespaceRegex.sub(' ', credit) - credit = html.unescape(credit) - credit = urllib.parse.unquote(credit) - # Add to db - dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', - (title, license, artist, credit, restrictions, url)) - -print('Closing database') -dbCon.commit() -dbCon.close() -- cgit v1.2.3