1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
#!/usr/bin/python3
import sys, re
import sqlite3, urllib.parse, html
import requests
import time, signal
# Usage text, printed (with exit code 1) when any command-line argument is given
usageLines = [
	"Reads image names from a file, and uses enwiki's API to obtain",
	"licensing information for them, adding the info to a sqlite db.",
	"",
	"SIGINT causes the program to finish an ongoing download and exit.",
	"The program can be re-run to continue downloading, and looks",
	"at names added to the db to decide what to skip.",
]
usageInfo = f"usage: {sys.argv[0]}\n" + "".join(line + "\n" for line in usageLines)
if sys.argv[1:]:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)
imgDb = "enwikiImgs.db"  # sqlite db: read from its page_imgs table, write to imgs (about 130k image names)
apiUrl = "https://en.wikipedia.org/w/api.php"
batchSz = 50  # titles per API request (API maximum is 50)
tagRegex = re.compile(r"<[^<]+>")  # matches html tags, for stripping
whitespaceRegex = re.compile(r"\s+")  # matches whitespace runs, for collapsing
# Open db
# Two cursors: dbCur streams SELECT results while dbCur2 performs inserts
# inside the download loop (interleaving both on one cursor would clobber
# the in-progress SELECT).
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
dbCur2 = dbCon.cursor()
# Create table if it doesn't exist
# ('IF NOT EXISTS' is atomic, replacing the previous manual sqlite_master
# lookup, which also used '== None' where 'is None' is correct)
dbCur.execute(
	"CREATE TABLE IF NOT EXISTS imgs("
	"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
# Get image names, skipping any already present in the output table
print("Reading image names")
imgNames = {imgName for (imgName,) in
	dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL")}
print(f"Found {len(imgNames)} images")
oldSz = len(imgNames)
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
	imgNames.discard(imgName)
print(f"Skipping {oldSz - len(imgNames)} already-done images")
# Set SIGINT handler
# The first Ctrl-C only raises a flag, letting the main loop finish its
# current batch; the handler then re-installs the previous handler, so any
# later SIGINT behaves as it did before this script changed it.
interrupted = False

def onSigint(signum, stackFrame):
	"""Request a graceful stop, and hand SIGINT back to the prior handler."""
	global interrupted
	interrupted = True
	signal.signal(signal.SIGINT, oldHandler)

oldHandler = signal.signal(signal.SIGINT, onSigint)
# Iterate through image names, making API requests
# Keep a set alongside the batching list: membership tests against a
# ~130k-entry list would be O(n) per returned page.
imgNameSet = set(imgNames)
imgNames = list(imgNames)

def _metaField(metadata, field):
	"""Return extmetadata[field]['value'], or None if 'field' is absent."""
	entry = metadata.get(field)
	return None if entry is None else entry.get("value")

def _cleanText(text):
	"""Strip html tags, collapse whitespace, and decode html/url escapes."""
	text = tagRegex.sub(" ", text)
	text = whitespaceRegex.sub(" ", text)
	text = html.unescape(text)
	return urllib.parse.unquote(text)

iterNum = 0
for i in range(0, len(imgNames), batchSz):
	iterNum += 1
	# (the original guarded this with 'iterNum % 1 == 0', which is always true)
	print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
	if interrupted:
		print(f"Exiting loop at iteration {iterNum}")
		break
	# Get batch, prefixing each name with the enwiki 'File:' namespace
	imgBatch = ["File:" + x for x in imgNames[i:i+batchSz]]
	# Make request
	headers = {
		"user-agent": "terryt.dev (terry06890@gmail.com)",
		"accept-encoding": "gzip",
	}
	params = {
		"action": "query",
		"format": "json",
		"prop": "imageinfo",
		"iiprop": "extmetadata|url",
		"maxlag": "5",
		"titles": "|".join(imgBatch),
		"iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
	}
	responseObj = None
	try:
		# timeout prevents one hung connection from stalling the whole run
		response = requests.get(apiUrl, params=params, headers=headers, timeout=60)
		responseObj = response.json()
	except Exception as e:
		print(f"Error while downloading info: {e}", file=sys.stderr)
		print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
		continue
	# Parse response-object
	if "query" not in responseObj or "pages" not in responseObj["query"]:
		print("WARNING: Response object doesn't have page data", file=sys.stderr)
		print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
		if "error" in responseObj:
			errorCode = responseObj["error"]["code"]
			print(f"\tError code: {errorCode}", file=sys.stderr)
			if errorCode == "maxlag":
				# Server is lagged: back off. This batch is skipped for now;
				# a re-run retries it (its names won't be in the db yet).
				time.sleep(5)
		continue
	pages = responseObj["query"]["pages"]
	# Map normalised titles (e.g. underscore/space changes) back to input names
	normalisedToInput = {}
	for entry in responseObj["query"].get("normalized", []):
		normalisedToInput[entry["to"]] = entry["from"]
	for page in pages.values():
		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
		# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
		# Artist: author name (might contain complex html, multiple authors, etc)
		# Credit: 'source'
		# For image-map-like images, can be quite large/complex html, creditng each sub-image
		# May be <a href="text1">text2</a>, where the text2 might be non-indicative
		# Restrictions: specifies non-copyright legal restrictions
		title = page["title"]
		title = normalisedToInput.get(title, title)
		title = title[5:]  # Remove 'File:'
		if title not in imgNameSet:
			print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr)
			continue
		if "imageinfo" not in page:
			print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr)
			continue
		imageinfo = page["imageinfo"][0]
		metadata = imageinfo["extmetadata"]
		url = imageinfo["url"]
		licenseName = _metaField(metadata, "LicenseShortName")  # 'license' would shadow the builtin
		artist = _metaField(metadata, "Artist")
		credit = _metaField(metadata, "Credit")
		restrictions = _metaField(metadata, "Restrictions")
		# Strip markup/newlines from the fields that may contain html
		if artist is not None:
			artist = _cleanText(artist)
		if credit is not None:
			credit = _cleanText(credit)
		# Add to db
		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
			(title, licenseName, artist, credit, restrictions, url))
# Close db
dbCon.commit()
dbCon.close()
|