From abb936f5d76f7fe5cec1e8948d287da86643d504 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 23:16:42 +1000 Subject: Refactor backend scripts --- backend/data/enwiki/downloadImgLicenseInfo.py | 60 ++++++++++++++++----------- 1 file changed, 35 insertions(+), 25 deletions(-) (limited to 'backend/data/enwiki/downloadImgLicenseInfo.py') diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 097304b..399922e 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -5,41 +5,48 @@ import sqlite3, urllib.parse, html import requests import time, signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads image names from a file, and uses enwiki's API to obtain\n" -usageInfo += "licensing information for them, adding the info to a sqlite db.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and looks\n" -usageInfo += "at names added to the db to decide what to skip.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads image names from a database, and uses enwiki's online API to obtain +licensing information for them, adding the info to the database. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +at already-processed names to decide what to skip. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "imgData.db" # About 130k image names +imgDb = "imgData.db" apiUrl = "https://en.wikipedia.org/w/api.php" +userAgent = "terryt.dev (terry06890@gmail.com)" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") whitespaceRegex = re.compile(r"\s+") -# Open db +print("Opening database") dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() dbCur2 = dbCon.cursor() -# Create table if it doesn't exist +print("Checking for table") if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None: dbCur.execute("CREATE TABLE imgs(" \ "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)") -# Get image names + print("Reading image names") imgNames = set() for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"): imgNames.add(imgName) -print(f"Found {len(imgNames)} images") +print(f"Found {len(imgNames)}") + +print("Checking for already-processed images") oldSz = len(imgNames) for (imgName,) in dbCur.execute("SELECT name FROM imgs"): imgNames.discard(imgName) -print(f"Skipping {oldSz - len(imgNames)} already-done images") +print(f"Found {oldSz - len(imgNames)}") + # Set SIGINT handler interrupted = False oldHandler = None @@ -48,7 +55,8 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) -# Iterate through image names, making API requests + +print("Iterating through image names") imgNames = list(imgNames) iterNum = 0 for i in range(0, len(imgNames), batchSz): @@ -63,7 +71,7 @@ for i in range(0, len(imgNames), batchSz): imgBatch = ["File:" + x for x in imgBatch] # Make request headers = { - "user-agent": "terryt.dev (terry06890@gmail.com)", + "user-agent": userAgent, "accept-encoding": "gzip", } params = { @@ -80,16 +88,16 @@ for i in range(0, len(imgNames), batchSz): response = requests.get(apiUrl, params=params, headers=headers) responseObj = response.json() except Exception as e: - print(f"Error while downloading info: {e}", file=sys.stderr) - print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr) + print(f"ERROR: Exception while downloading info: {e}") + print(f"\tImage batch: " + "|".join(imgBatch)) continue # Parse response-object if "query" not in responseObj or "pages" not in responseObj["query"]: - print("WARNING: Response object for doesn't have page data", file=sys.stderr) - print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr) + print("WARNING: Response object for doesn't have page data") + print("\tImage batch: " + "|".join(imgBatch)) if "error" in responseObj: errorCode = responseObj["error"]["code"] - print(f"\tError code: {errorCode}", file=sys.stderr) + print(f"\tError code: {errorCode}") if errorCode == "maxlag": time.sleep(5) continue @@ -111,10 +119,10 @@ for i in range(0, len(imgNames), batchSz): title = normalisedToInput[title] title = title[5:] # Remove 'File:' if title not in imgNames: - print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr) + print(f"WARNING: Got title \"{title}\" not in image-name list") continue if "imageinfo" not in page: - print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr) + print(f"WARNING: No imageinfo section for page \"{title}\"") continue metadata = page["imageinfo"][0]["extmetadata"] url = page["imageinfo"][0]["url"] @@ -122,7 +130,7 @@ for i in range(0, len(imgNames), batchSz): artist = metadata['Artist']['value'] if 'Artist' in metadata else None credit = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None - # Remove newlines + # Remove markup if artist != None: artist = tagRegex.sub(" ", artist) artist = whitespaceRegex.sub(" ", artist) @@ -134,7 +142,9 @@ for i in range(0, len(imgNames), batchSz): credit = html.unescape(credit) credit = urllib.parse.unquote(credit) # Add to db - dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", (title, license, artist, credit, restrictions, url)) -# Close db + dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", + (title, license, artist, credit, restrictions, url)) + +print("Closing database") dbCon.commit() dbCon.close() -- cgit v1.2.3