diff options
Diffstat (limited to 'backend/data/enwiki/downloadImgLicenseInfo.py')
| -rwxr-xr-x | backend/data/enwiki/downloadImgLicenseInfo.py | 140 |
1 file changed, 140 insertions, 0 deletions
#!/usr/bin/python3

import sys, re
import sqlite3, urllib.parse, html
import requests
import time, signal

# Usage text is a single parenthesized literal rather than repeated '+=' concatenation
usageInfo = (
	f"usage: {sys.argv[0]}\n"
	"Reads image names from a file, and uses enwiki's API to obtain\n"
	"licensing information for them, adding the info to a sqlite db.\n"
	"\n"
	"SIGINT causes the program to finish an ongoing download and exit.\n"
	"The program can be re-run to continue downloading, and looks\n"
	"at names added to the db to decide what to skip.\n"
)
if len(sys.argv) > 1:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

imgDb = "enwikiImgs.db" # About 130k image names
apiUrl = "https://en.wikipedia.org/w/api.php"
batchSz = 50 # Max 50 (enwiki API limit for unprivileged clients)
tagRegex = re.compile(r"<[^<]+>")
whitespaceRegex = re.compile(r"\s+")

# Open db
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
dbCur2 = dbCon.cursor() # Second cursor for inserts while dbCur may be mid-iteration
# Create table if it doesn't exist
# ('IF NOT EXISTS' replaces the manual sqlite_master lookup, which compared
# fetchone() to None with '==' instead of 'is')
dbCur.execute(
	"CREATE TABLE IF NOT EXISTS imgs("
	"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
# Get image names still needing license info
print("Reading image names")
imgNames = {name for (name,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs")}
print(f"Found {len(imgNames)} images")
oldSz = len(imgNames)
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
	imgNames.discard(imgName) # Already done on a previous run
print(f"Skipping {oldSz - len(imgNames)} already-done images")
# Set SIGINT handler: first Ctrl-C sets a flag checked by the main loop
# (so the current batch finishes), then restores the old handler so a
# second Ctrl-C interrupts immediately
interrupted = False
oldHandler = None
def onSigint(sig, frame):
	global interrupted
	interrupted = True
	signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)

def cleanHtmlField(val):
	"""Strip tags, collapse whitespace, and decode entities/percent-escapes
	in a metadata field value. Returns None unchanged."""
	if val is None:
		return None
	val = tagRegex.sub(" ", val)
	val = whitespaceRegex.sub(" ", val)
	val = html.unescape(val)
	return urllib.parse.unquote(val)

# Request headers are loop-invariant, so build them once
headers = {
	"user-agent": "terryt.dev (terry06890@gmail.com)",
	"accept-encoding": "gzip",
}
# Iterate through image names, making API requests in batches
imgNames = list(imgNames)
imgNameSet = set(imgNames) # For O(1) membership checks below (list scan was O(n) per title)
iterNum = 0
for i in range(0, len(imgNames), batchSz):
	iterNum += 1
	# (Original guarded this print with a no-op 'iterNum % 1 == 0')
	print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
	if interrupted:
		print(f"Exiting loop at iteration {iterNum}")
		break
	# Get batch, prefixing the 'File:' namespace the API expects
	imgBatch = ["File:" + x for x in imgNames[i:i+batchSz]]
	# Make request
	params = {
		"action": "query",
		"format": "json",
		"prop": "imageinfo",
		"iiprop": "extmetadata|url",
		"maxlag": "5", # Ask the server to reject requests when replication lag is high
		"titles": "|".join(imgBatch),
		"iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
	}
	responseObj = None
	try:
		# timeout prevents one hung connection from stalling the whole run
		response = requests.get(apiUrl, params=params, headers=headers, timeout=60)
		responseObj = response.json()
	except Exception as e:
		print(f"Error while downloading info: {e}", file=sys.stderr)
		print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
		continue
	# Parse response-object
	if "query" not in responseObj or "pages" not in responseObj["query"]:
		print("WARNING: Response object doesn't have page data", file=sys.stderr)
		print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
		if "error" in responseObj:
			errorCode = responseObj["error"]["code"]
			print(f"\tError code: {errorCode}", file=sys.stderr)
			if errorCode == "maxlag":
				# Server is lagged: back off; this batch is skipped for now
				# and will be retried on a later run (it isn't in the db yet)
				time.sleep(5)
		continue
	pages = responseObj["query"]["pages"]
	# Map normalised titles (e.g. underscores converted to spaces) back to input names
	normalisedToInput = {}
	for entry in responseObj["query"].get("normalized", []):
		normalisedToInput[entry["to"]] = entry["from"]
	for page in pages.values():
		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
		# LicenseShortName: short human-readable license name, apparently more reliable than 'License'
		# Artist: author name (might contain complex html, multiple authors, etc)
		# Credit: 'source'
		#   For image-map-like images, can be quite large/complex html, crediting each sub-image
		#   May be <a href="text1">text2</a>, where the text2 might be non-indicative
		# Restrictions: specifies non-copyright legal restrictions
		title = normalisedToInput.get(page["title"], page["title"])
		title = title[5:] # Remove 'File:'
		if title not in imgNameSet:
			print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr)
			continue
		if "imageinfo" not in page:
			print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr)
			continue
		imageinfo = page["imageinfo"][0]
		metadata = imageinfo["extmetadata"]
		url = imageinfo["url"]
		# Each present metadata field is a dict with a 'value' key; absent fields become None
		licenseName = metadata.get("LicenseShortName", {}).get("value")
		artist = cleanHtmlField(metadata.get("Artist", {}).get("value"))
		credit = cleanHtmlField(metadata.get("Credit", {}).get("value"))
		restrictions = metadata.get("Restrictions", {}).get("value")
		# Add to db
		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
			(title, licenseName, artist, credit, restrictions, url))
# Close db
dbCon.commit()
dbCon.close()
