#!/usr/bin/python3
import sys, re
import sqlite3, urllib.parse, html
import requests
import time, signal
# Usage text, printed verbatim to stderr when any argument is supplied.
usageInfo = f"""
Usage: {sys.argv[0]}
Reads image names from a database, and uses enwiki's online API to obtain
licensing information for them, adding the info to the database.
SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""
# This script takes no arguments; anything extra is treated as a usage request.
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

# Configuration
imgDb = "imgData.db"  # sqlite database with the page_imgs input table and imgs output table
apiUrl = "https://en.wikipedia.org/w/api.php"
userAgent = "terryt.dev (terry06890@gmail.com)"  # identifies this client to the API
batchSz = 50 # Max 50

# Patterns for stripping markup out of metadata fields
tagRegex = re.compile(r"<[^<]+>")     # html-ish tags
whitespaceRegex = re.compile(r"\s+")  # runs of whitespace
print("Opening database")
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()   # cursor for reads
dbCur2 = dbCon.cursor()  # separate cursor for inserts while dbCur may still be iterating
print("Checking for table")
# CREATE TABLE IF NOT EXISTS replaces the manual sqlite_master probe:
# same outcome, one statement, no equality-with-None check.
dbCur.execute("CREATE TABLE IF NOT EXISTS imgs("
	"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
print("Reading image names")
imgNames = {imgName for (imgName,) in
	dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL")}
print(f"Found {len(imgNames)}")
print("Checking for already-processed images")
oldSz = len(imgNames)
# Drop names already present in imgs, so re-runs resume where they left off
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
	imgNames.discard(imgName)
print(f"Found {oldSz - len(imgNames)}")
# Install a SIGINT handler: the first Ctrl-C only requests a graceful stop
# (checked between batches), and hands SIGINT back to the previous handler
# so a second Ctrl-C terminates the process immediately.
interrupted = False
oldHandler = None
def onSigint(signum, stackFrame):
	"""Record the stop request, then restore the previous SIGINT handler."""
	global interrupted
	interrupted = True
	signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
print("Iterating through image names")
imgNames = list(imgNames)    # fix an iteration order so we can slice into batches
imgNameSet = set(imgNames)   # O(1) membership tests (the list would cost O(n) per response title)

def _stripMarkup(text):
	# Flatten html-ish metadata into plain text: drop tags, collapse whitespace,
	# then decode html entities and percent-escapes.
	text = tagRegex.sub(" ", text)
	text = whitespaceRegex.sub(" ", text)
	text = html.unescape(text)
	return urllib.parse.unquote(text)

iterNum = 0
for i in range(0, len(imgNames), batchSz):
	iterNum += 1
	# (the original guarded this print with 'iterNum % 1 == 0', which is always true)
	print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
	if interrupted:  # set by the SIGINT handler; stop cleanly between requests
		print(f"Exiting loop at iteration {iterNum}")
		break
	# Get batch, prefixed with the namespace the API expects
	imgBatch = ["File:" + x for x in imgNames[i:i+batchSz]]
	# Make request
	headers = {
		"user-agent": userAgent,
		"accept-encoding": "gzip",
	}
	params = {
		"action": "query",
		"format": "json",
		"prop": "imageinfo",
		"iiprop": "extmetadata|url",
		"maxlag": "5",
		"titles": "|".join(imgBatch),
		"iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
	}
	responseObj = None
	try:
		# timeout added: without it, a stalled connection hangs the script forever
		response = requests.get(apiUrl, params=params, headers=headers, timeout=60)
		responseObj = response.json()
	except Exception as e:
		print(f"ERROR: Exception while downloading info: {e}")
		print("\tImage batch: " + "|".join(imgBatch))
		continue
	# Parse response-object
	if "query" not in responseObj or "pages" not in responseObj["query"]:
		print("WARNING: Response object doesn't have page data")  # fixed garbled message text
		print("\tImage batch: " + "|".join(imgBatch))
		if "error" in responseObj:
			errorCode = responseObj["error"]["code"]
			print(f"\tError code: {errorCode}")
			if errorCode == "maxlag":
				time.sleep(5)  # back off as the maxlag protocol requests
		continue
	pages = responseObj["query"]["pages"]
	# The API may normalise titles (e.g. underscores to spaces); map them back to our input names
	normalisedToInput = {}
	if "normalized" in responseObj["query"]:
		for entry in responseObj["query"]["normalized"]:
			normalisedToInput[entry["to"]] = entry["from"]
	for page in pages.values():
		# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
		# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
		# Artist: author name (might contain complex html, multiple authors, etc)
		# Credit: 'source'
		#   For image-map-like images, can be quite large/complex html, crediting each sub-image
		#   May be <a href="text1">text2</a>, where the text2 might be non-indicative
		# Restrictions: specifies non-copyright legal restrictions
		title = page["title"]
		if title in normalisedToInput:
			title = normalisedToInput[title]
		title = title[5:] # Remove 'File:'
		if title not in imgNameSet:
			print(f"WARNING: Got title \"{title}\" not in image-name list")
			continue
		if "imageinfo" not in page:
			print(f"WARNING: No imageinfo section for page \"{title}\"")
			continue
		imageinfo = page["imageinfo"][0]
		metadata = imageinfo["extmetadata"]
		url = imageinfo["url"]
		license = metadata["LicenseShortName"]["value"] if "LicenseShortName" in metadata else None
		artist = metadata["Artist"]["value"] if "Artist" in metadata else None
		credit = metadata["Credit"]["value"] if "Credit" in metadata else None
		restrictions = metadata["Restrictions"]["value"] if "Restrictions" in metadata else None
		# Remove markup from the free-form html fields
		if artist is not None:
			artist = _stripMarkup(artist)
		if credit is not None:
			credit = _stripMarkup(credit)
		# Add to db
		dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
			(title, license, artist, credit, restrictions, url))
	dbCon.commit()  # commit per batch, so an interrupted or crashed run keeps completed work
print("Closing database")
# Persist any inserted rows, then release the connection
dbCon.commit()
dbCon.close()