From 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Mon, 11 Jul 2022 01:54:08 +1000
Subject: Make backend dev server script serve the image files

Previously, image files in backend/data/img were moved to, or
symlinked from, public/. That arrangement needed to be changed before
each build; otherwise vite would end up copying gigabytes of images.
---
 backend/data/enwiki/downloadImgLicenseInfo.py | 150 --------------------------
 1 file changed, 150 deletions(-)
 delete mode 100755 backend/data/enwiki/downloadImgLicenseInfo.py

(limited to 'backend/data/enwiki/downloadImgLicenseInfo.py')

diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py
deleted file mode 100755
index 399922e..0000000
--- a/backend/data/enwiki/downloadImgLicenseInfo.py
+++ /dev/null
@@ -1,150 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re
-import sqlite3, urllib.parse, html
-import requests
-import time, signal
-
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Reads image names from a database, and uses enwiki's online API to obtain
-licensing information for them, adding the info to the database.
-
-SIGINT causes the program to finish an ongoing download and exit.
-The program can be re-run to continue downloading, and looks
-at already-processed names to decide what to skip.
-"""
-if len(sys.argv) > 1:
-	print(usageInfo, file=sys.stderr)
-	sys.exit(1)
-
-imgDb = "imgData.db"
-apiUrl = "https://en.wikipedia.org/w/api.php"
-userAgent = "terryt.dev (terry06890@gmail.com)"
-batchSz = 50 # Max 50
-tagRegex = re.compile(r"<[^<]+>")
-whitespaceRegex = re.compile(r"\s+")
-
-print("Opening database")
-dbCon = sqlite3.connect(imgDb)
-dbCur = dbCon.cursor()
-dbCur2 = dbCon.cursor()
-print("Checking for table")
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
-	dbCur.execute("CREATE TABLE imgs(" \
-		"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
-
-print("Reading image names")
-imgNames = set()
-for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
-	imgNames.add(imgName)
-print(f"Found {len(imgNames)}")
-
-print("Checking for already-processed images")
-oldSz = len(imgNames)
-for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
-	imgNames.discard(imgName)
-print(f"Found {oldSz - len(imgNames)}")
-
-# Set SIGINT handler
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-
-print("Iterating through image names")
-imgNames = list(imgNames)
-iterNum = 0
-for i in range(0, len(imgNames), batchSz):
-	iterNum += 1
-	if iterNum % 1 == 0:
-		print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
-	if interrupted:
-		print(f"Exiting loop at iteration {iterNum}")
-		break
-	# Get batch
-	imgBatch = imgNames[i:i+batchSz]
-	imgBatch = ["File:" + x for x in imgBatch]
-	# Make request
-	headers = {
-		"user-agent": userAgent,
-		"accept-encoding": "gzip",
-	}
-	params = {
-		"action": "query",
-		"format": "json",
-		"prop": "imageinfo",
-		"iiprop": "extmetadata|url",
-		"maxlag": "5",
-		"titles": "|".join(imgBatch),
-		"iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
-	}
-	responseObj = None
-	try:
-		response = requests.get(apiUrl, params=params, headers=headers)
-		responseObj = response.json()
-	except Exception as e:
-		print(f"ERROR: Exception while downloading info: {e}")
-		print(f"\tImage batch: " + "|".join(imgBatch))
-		continue
-	# Parse response-object
-	if "query" not in responseObj or "pages" not in responseObj["query"]:
-		print("WARNING: Response object for doesn't have page data")
-		print("\tImage batch: " + "|".join(imgBatch))
-		if "error" in responseObj:
-			errorCode = responseObj["error"]["code"]
-			print(f"\tError code: {errorCode}")
-			if errorCode == "maxlag":
-				time.sleep(5)
-		continue
-	pages = responseObj["query"]["pages"]
-	normalisedToInput = {}
-	if "normalized" in responseObj["query"]:
-		for entry in responseObj["query"]["normalized"]:
-			normalisedToInput[entry["to"]] = entry["from"]
-	for (_, page) in pages.items():
-		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
-			# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
-			# Artist: author name (might contain complex html, multiple authors, etc)
-			# Credit: 'source'
-				# For image-map-like images, can be quite large/complex html, creditng each sub-image
-				# May be <a href="text1">text2</a>, where the text2 might be non-indicative
-			# Restrictions: specifies non-copyright legal restrictions
-		title = page["title"]
-		if title in normalisedToInput:
-			title = normalisedToInput[title]
-		title = title[5:] # Remove 'File:'
-		if title not in imgNames:
-			print(f"WARNING: Got title \"{title}\" not in image-name list")
-			continue
-		if "imageinfo" not in page:
-			print(f"WARNING: No imageinfo section for page \"{title}\"")
-			continue
-		metadata = page["imageinfo"][0]["extmetadata"]
-		url = page["imageinfo"][0]["url"]
-		license = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
-		artist = metadata['Artist']['value'] if 'Artist' in metadata else None
-		credit = metadata['Credit']['value'] if 'Credit' in metadata else None
-		restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
-		# Remove markup
-		if artist != None:
-			artist = tagRegex.sub(" ", artist)
-			artist = whitespaceRegex.sub(" ", artist)
-			artist = html.unescape(artist)
-			artist = urllib.parse.unquote(artist)
-		if credit != None:
-			credit = tagRegex.sub(" ", credit)
-			credit = whitespaceRegex.sub(" ", credit)
-			credit = html.unescape(credit)
-			credit = urllib.parse.unquote(credit)
-		# Add to db
-		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
-			(title, license, artist, credit, restrictions, url))
-
-print("Closing database")
-dbCon.commit()
-dbCon.close()
-- 
cgit v1.2.3