#!/usr/bin/python3
"""Download licensing information for enwiki images into a sqlite db.

Image names are read from the ``page_imgs`` table of the database, looked up
in batches through the enwiki API (``prop=imageinfo``), and the resulting
license/artist/credit/restrictions/url values are stored in the ``imgs``
table. Names already present in ``imgs`` are skipped, so the script can be
re-run to continue an interrupted download.
"""

import html
import re
import signal
import sqlite3
import sys
import time
import urllib.parse

import requests

usageInfo = (
    f"usage: {sys.argv[0]}\n"
    "Reads image names from a file, and uses enwiki's API to obtain\n"
    "licensing information for them, adding the info to a sqlite db.\n"
    "\n"
    "SIGINT causes the program to finish an ongoing download and exit.\n"
    "The program can be re-run to continue downloading, and looks\n"
    "at names added to the db to decide what to skip.\n"
)
if len(sys.argv) > 1:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

imgDb = "enwikiImgs.db"  # About 130k image names
apiUrl = "https://en.wikipedia.org/w/api.php"
batchSz = 50  # Max 50
requestTimeout = 60  # Seconds; without a timeout a stalled connection hangs forever
tagRegex = re.compile(r"<[^<]+>")
whitespaceRegex = re.compile(r"\s+")


def cleanMarkup(text):
    """Return 'text' with html tags, whitespace runs, entities and %-escapes normalised.

    Tags are replaced by a space, runs of whitespace (including newlines) are
    collapsed to one space, html entities are unescaped, and percent-encoded
    sequences are decoded, in that order. None is passed through unchanged.
    """
    if text is None:
        return None
    text = tagRegex.sub(" ", text)
    text = whitespaceRegex.sub(" ", text)
    text = html.unescape(text)
    return urllib.parse.unquote(text)


def metadataValue(metadata, key):
    """Return metadata[key]['value'], or None if 'key' is absent."""
    return metadata[key]["value"] if key in metadata else None


# Open db
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
dbCur2 = dbCon.cursor()
# Create table if it doesn't exist
dbCur.execute(
    "CREATE TABLE IF NOT EXISTS imgs("
    "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)"
)
# Get image names
print("Reading image names")
imgNames = set()
for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs"):
    imgNames.add(imgName)
print(f"Found {len(imgNames)} images")
oldSz = len(imgNames)
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
    imgNames.discard(imgName)
print(f"Skipping {oldSz - len(imgNames)} already-done images")

# Set SIGINT handler
interrupted = False
oldHandler = None


def onSigint(sig, frame):
    """Request a clean stop after the current batch.

    The previous handler is restored, so a second SIGINT is handled by
    whatever was installed before this script took over.
    """
    global interrupted
    interrupted = True
    signal.signal(signal.SIGINT, oldHandler)


oldHandler = signal.signal(signal.SIGINT, onSigint)

# Iterate through image names, making API requests
pendingNames = imgNames  # Keep the set: O(1) membership tests on returned titles
imgNames = list(pendingNames)
headers = {
    "user-agent": "terryt.dev (terry06890@gmail.com)",
    "accept-encoding": "gzip",
}
baseParams = {
    "action": "query",
    "format": "json",
    "prop": "imageinfo",
    "iiprop": "extmetadata|url",
    "maxlag": "5",
    "iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
}
try:
    for iterNum, i in enumerate(range(0, len(imgNames), batchSz), start=1):
        print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
        if interrupted:
            print(f"Exiting loop at iteration {iterNum}")
            break
        # Get batch
        imgBatch = ["File:" + x for x in imgNames[i:i + batchSz]]
        # Make request
        params = dict(baseParams, titles="|".join(imgBatch))
        responseObj = None
        try:
            response = requests.get(apiUrl, params=params, headers=headers, timeout=requestTimeout)
            responseObj = response.json()
        except Exception as e:
            # Deliberately broad: any failed batch is skipped and picked up on a re-run
            print(f"Error while downloading info: {e}", file=sys.stderr)
            print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
            continue
        # Parse response-object
        if "query" not in responseObj or "pages" not in responseObj["query"]:
            print("WARNING: Response object for doesn't have page data", file=sys.stderr)
            print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
            if "error" in responseObj:
                errorCode = responseObj["error"]["code"]
                print(f"\tError code: {errorCode}", file=sys.stderr)
                if errorCode == "maxlag":
                    # Back off before the next batch; this batch is left for a re-run
                    time.sleep(5)
            continue
        pages = responseObj["query"]["pages"]
        # The API may normalise titles; map them back to the names we asked for
        normalisedToInput = {}
        if "normalized" in responseObj["query"]:
            for entry in responseObj["query"]["normalized"]:
                normalisedToInput[entry["to"]] = entry["from"]
        for page in pages.values():
            # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
                # LicenseShortName: short human-readable license name, apparently more reliable than 'License',
                # Artist: author name (might contain complex html, multiple authors, etc)
                # Credit: 'source'
                    # For image-map-like images, can be quite large/complex html, crediting each sub-image
                    # May be an html link whose visible text is not indicative of the source
                # Restrictions: specifies non-copyright legal restrictions
            title = page["title"]
            if title in normalisedToInput:
                title = normalisedToInput[title]
            title = title[5:]  # Remove 'File:'
            if title not in pendingNames:
                print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr)
                continue
            if not page.get("imageinfo"):
                print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr)
                continue
            info = page["imageinfo"][0]
            if "url" not in info:
                print(f"WARNING: No url in imageinfo for page \"{title}\"", file=sys.stderr)
                continue
            url = info["url"]
            metadata = info.get("extmetadata") or {}
            license = metadataValue(metadata, "LicenseShortName")
            artist = cleanMarkup(metadataValue(metadata, "Artist"))
            credit = cleanMarkup(metadataValue(metadata, "Credit"))
            restrictions = metadataValue(metadata, "Restrictions")
            # Add to db
            dbCur2.execute(
                "INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
                (title, license, artist, credit, restrictions, url),
            )
        # Persist each batch so an abnormal exit loses at most the batch in flight
        dbCon.commit()
finally:
    # Close db
    dbCon.commit()
    dbCon.close()