aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki/downloadImgLicenseInfo.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/enwiki/downloadImgLicenseInfo.py
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/enwiki/downloadImgLicenseInfo.py')
-rwxr-xr-xbackend/tolData/enwiki/downloadImgLicenseInfo.py147
1 files changed, 0 insertions, 147 deletions
diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
deleted file mode 100755
index ba6317e..0000000
--- a/backend/tolData/enwiki/downloadImgLicenseInfo.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import sqlite3, urllib.parse, html
-import requests
-import time, signal
-
-import argparse
# Command-line interface: the script takes no positional/optional arguments;
# argparse is used only so that -h/--help prints the description below.
parser = argparse.ArgumentParser(description="""
Reads image names from a database, and uses enwiki's online API to obtain
licensing information for them, adding the info to the database.

SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()

imgDb = 'imgData.db'  # SQLite db holding page_imgs (input) and imgs (output, created below)
apiUrl = 'https://en.wikipedia.org/w/api.php'  # MediaWiki API endpoint for enwiki
userAgent = 'terryt.dev (terry06890@gmail.com)'  # sent with each request to identify this client
batchSz = 50 # Max 50 (the API caps the number of titles per request)
tagRegex = re.compile(r'<[^<]+>')  # matches HTML tags, for stripping markup from metadata fields
whitespaceRegex = re.compile(r'\s+')  # collapses whitespace runs left behind by tag removal
-
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()   # cursor used for reads
dbCur2 = dbCon.cursor()  # separate cursor for inserts, so writes don't disturb an in-progress read
print('Checking for table')
# Create the output table on first run; 'name' is the primary key, so each image appears once
if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
	dbCur.execute('CREATE TABLE imgs(' \
		'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
-
print('Reading image names')
# Gather every distinct non-null image name referenced by page_imgs
imgNames: set[str] = {
	row[0] for row in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL')
}
print(f'Found {len(imgNames)}')

print('Checking for already-processed images')
# Remove names already recorded in imgs, so a re-run resumes where it left off
numBefore = len(imgNames)
imgNames -= {row[0] for row in dbCur.execute('SELECT name FROM imgs')}
print(f'Found {numBefore - len(imgNames)}')
-
# Arrange for SIGINT to request a graceful stop: the handler just raises a
# flag (checked by the main loop between batches) and then re-installs the
# previous handler, so a second Ctrl-C interrupts immediately.
interrupted = False
oldHandler = None

def onSigint(sig, frame):
	"""Record the interrupt request and hand SIGINT back to the prior handler."""
	global interrupted
	interrupted = True
	signal.signal(signal.SIGINT, oldHandler)

oldHandler = signal.signal(signal.SIGINT, onSigint)
-
print('Iterating through image names')
imgNameList = list(imgNames)  # fix an ordering so slicing into batches is stable for this run
iterNum = 0
# Query the API in batches of batchSz names, inserting each usable result into 'imgs'
for i in range(0, len(imgNameList), batchSz):
	iterNum += 1
	if iterNum % 1 == 0:  # NOTE(review): always true — looks like a leftover progress-throttle divisor, so this prints every iteration; confirm intent
		print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)')
	if interrupted:  # set by the SIGINT handler; break (not exit) so the final commit still runs
		print(f'Exiting loop at iteration {iterNum}')
		break
	# Get batch
	imgBatch = imgNameList[i:i+batchSz]
	imgBatch = ['File:' + x for x in imgBatch]  # image titles live in the 'File:' namespace
	# Make request
	headers = {
		'user-agent': userAgent,
		'accept-encoding': 'gzip',
	}
	params = {
		'action': 'query',
		'format': 'json',
		'prop': 'imageinfo',
		'iiprop': 'extmetadata|url',
		'maxlag': '5',  # ask the server to reject the request when replication lag exceeds 5s
		'titles': '|'.join(imgBatch),
		'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions',
	}
	responseObj = None
	try:
		response = requests.get(apiUrl, params=params, headers=headers)
		responseObj = response.json()
	except Exception as e:
		# Nothing was inserted for this batch, so a later re-run will retry these names
		print(f'ERROR: Exception while downloading info: {e}')
		print('\tImage batch: ' + '|'.join(imgBatch))
		continue
	# Parse response-object
	if 'query' not in responseObj or 'pages' not in responseObj['query']:
		print('WARNING: Response object for doesn\'t have page data')
		print('\tImage batch: ' + '|'.join(imgBatch))
		if 'error' in responseObj:
			errorCode = responseObj['error']['code']
			print(f'\tError code: {errorCode}')
			if errorCode == 'maxlag':
				time.sleep(5)
		# NOTE(review): the batch is skipped for this run (even after the maxlag sleep),
		# not retried; it gets picked up again on a later re-run
		continue
	pages = responseObj['query']['pages']
	# The API 'normalizes' submitted titles (e.g. underscores to spaces); build a map
	# back to the input names so results can be matched against imgNames
	normalisedToInput: dict[str, str] = {}
	if 'normalized' in responseObj['query']:
		for entry in responseObj['query']['normalized']:
			normalisedToInput[entry['to']] = entry['from']
	for _, page in pages.items():
		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
		# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
		# Artist: author name (might contain complex html, multiple authors, etc)
		# Credit: 'source'
		# For image-map-like images, can be quite large/complex html, crediting each sub-image
		# May be <a href='text1'>text2</a>, where the text2 might be non-indicative
		# Restrictions: specifies non-copyright legal restrictions
		title: str = page['title']
		if title in normalisedToInput:
			title = normalisedToInput[title]
		title = title[5:] # Remove 'File:'
		if title not in imgNames:
			print(f'WARNING: Got title "{title}" not in image-name list')
			continue
		if 'imageinfo' not in page:
			print(f'WARNING: No imageinfo section for page "{title}"')
			continue
		# Pull out the filtered metadata fields; each may be absent, stored as NULL
		metadata = page['imageinfo'][0]['extmetadata']
		url: str = page['imageinfo'][0]['url']
		license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
		artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
		credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
		restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
		# Remove markup (strip tags, collapse whitespace, then undo html/url escaping)
		if artist is not None:
			artist = tagRegex.sub(' ', artist)
			artist = whitespaceRegex.sub(' ', artist)
			artist = html.unescape(artist)
			artist = urllib.parse.unquote(artist)
		if credit is not None:
			credit = tagRegex.sub(' ', credit)
			credit = whitespaceRegex.sub(' ', credit)
			credit = html.unescape(credit)
			credit = urllib.parse.unquote(credit)
		# Add to db
		dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
			(title, license, artist, credit, restrictions, url))
-
-print('Closing database')
-dbCon.commit()
-dbCon.close()