#!/usr/bin/python3
import sys, re
import sqlite3, urllib.parse, html
import requests
import time, signal
# Usage text, printed verbatim to stderr when any argument is supplied.
usageInfo = f"""
Usage: {sys.argv[0]}
Reads image names from a database, and uses enwiki's online API to obtain
licensing information for them, adding the info to the database.
SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""
# This script takes no arguments; anything extra is treated as a usage request.
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

# Configuration
imgDb = "imgData.db"  # sqlite database with the page_imgs input table and imgs output table
apiUrl = "https://en.wikipedia.org/w/api.php"
userAgent = "terryt.dev (terry06890@gmail.com)"  # identifies this client to the API
batchSz = 50 # Max 50

# Patterns for stripping markup out of metadata fields
tagRegex = re.compile(r"<[^<]+>")     # html-ish tags
whitespaceRegex = re.compile(r"\s+")  # runs of whitespace
print("Opening database")
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()   # cursor for reads
dbCur2 = dbCon.cursor()  # separate cursor for inserts while dbCur may still be iterating
print("Checking for table")
# CREATE TABLE IF NOT EXISTS replaces the manual sqlite_master probe:
# same outcome, one statement, no equality-with-None check.
dbCur.execute("CREATE TABLE IF NOT EXISTS imgs("
	"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
print("Reading image names")
imgNames = {imgName for (imgName,) in
	dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL")}
print(f"Found {len(imgNames)}")
print("Checking for already-processed images")
oldSz = len(imgNames)
# Drop names already present in imgs, so re-runs resume where they left off
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
	imgNames.discard(imgName)
print(f"Found {oldSz - len(imgNames)}")
# Install a SIGINT handler: the first Ctrl-C only requests a graceful stop
# (checked between batches), and hands SIGINT back to the previous handler
# so a second Ctrl-C terminates the process immediately.
interrupted = False
oldHandler = None
def onSigint(signum, stackFrame):
	"""Record the stop request, then restore the previous SIGINT handler."""
	global interrupted
	interrupted = True
	signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
print("Iterating through image names")
imgNames = list(imgNames)    # fix an iteration order so we can slice into batches
imgNameSet = set(imgNames)   # O(1) membership tests (the list would cost O(n) per response title)

def _stripMarkup(text):
	# Flatten html-ish metadata into plain text: drop tags, collapse whitespace,
	# then decode html entities and percent-escapes.
	text = tagRegex.sub(" ", text)
	text = whitespaceRegex.sub(" ", text)
	text = html.unescape(text)
	return urllib.parse.unquote(text)

iterNum = 0
for i in range(0, len(imgNames), batchSz):
	iterNum += 1
	# (the original guarded this print with 'iterNum % 1 == 0', which is always true)
	print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
	if interrupted:  # set by the SIGINT handler; stop cleanly between requests
		print(f"Exiting loop at iteration {iterNum}")
		break
	# Get batch, prefixed with the namespace the API expects
	imgBatch = ["File:" + x for x in imgNames[i:i+batchSz]]
	# Make request
	headers = {
		"user-agent": userAgent,
		"accept-encoding": "gzip",
	}
	params = {
		"action": "query",
		"format": "json",
		"prop": "imageinfo",
		"iiprop": "extmetadata|url",
		"maxlag": "5",
		"titles": "|".join(imgBatch),
		"iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
	}
	responseObj = None
	try:
		# timeout added: without it, a stalled connection hangs the script forever
		response = requests.get(apiUrl, params=params, headers=headers, timeout=60)
		responseObj = response.json()
	except Exception as e:
		print(f"ERROR: Exception while downloading info: {e}")
		print("\tImage batch: " + "|".join(imgBatch))
		continue
	# Parse response-object
	if "query" not in responseObj or "pages" not in responseObj["query"]:
		print("WARNING: Response object doesn't have page data")  # fixed garbled message text
		print("\tImage batch: " + "|".join(imgBatch))
		if "error" in responseObj:
			errorCode = responseObj["error"]["code"]
			print(f"\tError code: {errorCode}")
			if errorCode == "maxlag":
				time.sleep(5)  # back off as the maxlag protocol requests
		continue
	pages = responseObj["query"]["pages"]
	# The API may normalise titles (e.g. underscores to spaces); map them back to our input names
	normalisedToInput = {}
	if "normalized" in responseObj["query"]:
		for entry in responseObj["query"]["normalized"]:
			normalisedToInput[entry["to"]] = entry["from"]
	for page in pages.values():
		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
		# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
		# Artist: author name (might contain complex html, multiple authors, etc)
		# Credit: 'source'
		#   For image-map-like images, can be quite large/complex html, crediting each sub-image
		#   May be <a href="text1">text2</a>, where the text2 might be non-indicative
		# Restrictions: specifies non-copyright legal restrictions
		title = page["title"]
		if title in normalisedToInput:
			title = normalisedToInput[title]
		title = title[5:] # Remove 'File:'
		if title not in imgNameSet:
			print(f"WARNING: Got title \"{title}\" not in image-name list")
			continue
		if "imageinfo" not in page:
			print(f"WARNING: No imageinfo section for page \"{title}\"")
			continue
		imageinfo = page["imageinfo"][0]
		metadata = imageinfo["extmetadata"]
		url = imageinfo["url"]
		license = metadata["LicenseShortName"]["value"] if "LicenseShortName" in metadata else None
		artist = metadata["Artist"]["value"] if "Artist" in metadata else None
		credit = metadata["Credit"]["value"] if "Credit" in metadata else None
		restrictions = metadata["Restrictions"]["value"] if "Restrictions" in metadata else None
		# Remove markup from the free-form html fields
		if artist is not None:
			artist = _stripMarkup(artist)
		if credit is not None:
			credit = _stripMarkup(credit)
		# Add to db
		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
			(title, license, artist, credit, restrictions, url))
	dbCon.commit()  # commit per batch, so an interrupted or crashed run keeps completed work
print("Closing database")
# Persist any inserted rows, then release the connection
dbCon.commit()
dbCon.close()