From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Sun, 11 Sep 2022 14:55:42 +1000
Subject: Add backend unit tests

- Add unit testing code in backend/tests/
- Change to snake_case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences because it did not always use a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
---
 backend/tolData/enwiki/downloadImgLicenseInfo.py | 147 -----------------------
 1 file changed, 147 deletions(-)
 delete mode 100755 backend/tolData/enwiki/downloadImgLicenseInfo.py

(limited to 'backend/tolData/enwiki/downloadImgLicenseInfo.py')

diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
deleted file mode 100755
index ba6317e..0000000
--- a/backend/tolData/enwiki/downloadImgLicenseInfo.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import sqlite3, urllib.parse, html
-import requests
-import time, signal
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads image names from a database, and uses enwiki's online API to obtain
-licensing information for them, adding the info to the database.
-
-SIGINT causes the program to finish an ongoing download and exit.
-The program can be re-run to continue downloading, and looks
-at already-processed names to decide what to skip.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-imgDb = 'imgData.db'
-apiUrl = 'https://en.wikipedia.org/w/api.php'
-userAgent = 'terryt.dev (terry06890@gmail.com)'
-batchSz = 50 # Max 50
-tagRegex = re.compile(r'<[^<]+>')
-whitespaceRegex = re.compile(r'\s+')
-
-print('Opening database')
-dbCon = sqlite3.connect(imgDb)
-dbCur = dbCon.cursor()
-dbCur2 = dbCon.cursor()
-print('Checking for table')
-if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
-	dbCur.execute('CREATE TABLE imgs(' \
-		'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
-
-print('Reading image names')
-imgNames: set[str] = set()
-for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
-	imgNames.add(imgName)
-print(f'Found {len(imgNames)}')
-
-print('Checking for already-processed images')
-oldSz = len(imgNames)
-for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
-	imgNames.discard(imgName)
-print(f'Found {oldSz - len(imgNames)}')
-
-# Set SIGINT handler
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-
-print('Iterating through image names')
-imgNameList = list(imgNames)
-iterNum = 0
-for i in range(0, len(imgNameList), batchSz):
-	iterNum += 1
-	if iterNum % 1 == 0:
-		print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)')
-	if interrupted:
-		print(f'Exiting loop at iteration {iterNum}')
-		break
-	# Get batch
-	imgBatch = imgNameList[i:i+batchSz]
-	imgBatch = ['File:' + x for x in imgBatch]
-	# Make request
-	headers = {
-		'user-agent': userAgent,
-		'accept-encoding': 'gzip',
-	}
-	params = {
-		'action': 'query',
-		'format': 'json',
-		'prop': 'imageinfo',
-		'iiprop': 'extmetadata|url',
-		'maxlag': '5',
-		'titles': '|'.join(imgBatch),
-		'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions',
-	}
-	responseObj = None
-	try:
-		response = requests.get(apiUrl, params=params, headers=headers)
-		responseObj = response.json()
-	except Exception as e:
-		print(f'ERROR: Exception while downloading info: {e}')
-		print('\tImage batch: ' + '|'.join(imgBatch))
-		continue
-	# Parse response-object
-	if 'query' not in responseObj or 'pages' not in responseObj['query']:
-		print('WARNING: Response object for doesn\'t have page data')
-		print('\tImage batch: ' + '|'.join(imgBatch))
-		if 'error' in responseObj:
-			errorCode = responseObj['error']['code']
-			print(f'\tError code: {errorCode}')
-			if errorCode == 'maxlag':
-				time.sleep(5)
-		continue
-	pages = responseObj['query']['pages']
-	normalisedToInput: dict[str, str] = {}
-	if 'normalized' in responseObj['query']:
-		for entry in responseObj['query']['normalized']:
-			normalisedToInput[entry['to']] = entry['from']
-	for _, page in pages.items():
-		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
-			# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
-			# Artist: author name (might contain complex html, multiple authors, etc)
-			# Credit: 'source'
-				# For image-map-like images, can be quite large/complex html, creditng each sub-image
-				# May be <a href='text1'>text2</a>, where the text2 might be non-indicative
-			# Restrictions: specifies non-copyright legal restrictions
-		title: str = page['title']
-		if title in normalisedToInput:
-			title = normalisedToInput[title]
-		title = title[5:] # Remove 'File:'
-		if title not in imgNames:
-			print(f'WARNING: Got title "{title}" not in image-name list')
-			continue
-		if 'imageinfo' not in page:
-			print(f'WARNING: No imageinfo section for page "{title}"')
-			continue
-		metadata = page['imageinfo'][0]['extmetadata']
-		url: str = page['imageinfo'][0]['url']
-		license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
-		artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
-		credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
-		restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
-		# Remove markup
-		if artist is not None:
-			artist = tagRegex.sub(' ', artist)
-			artist = whitespaceRegex.sub(' ', artist)
-			artist = html.unescape(artist)
-			artist = urllib.parse.unquote(artist)
-		if credit is not None:
-			credit = tagRegex.sub(' ', credit)
-			credit = whitespaceRegex.sub(' ', credit)
-			credit = html.unescape(credit)
-			credit = urllib.parse.unquote(credit)
-		# Add to db
-		dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
-			(title, license, artist, credit, restrictions, url))
-
-print('Closing database')
-dbCon.commit()
-dbCon.close()
-- 
cgit v1.2.3