aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki/downloadImgLicenseInfo.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/enwiki/downloadImgLicenseInfo.py')
-rwxr-xr-xbackend/tolData/enwiki/downloadImgLicenseInfo.py136
1 files changed, 68 insertions, 68 deletions
diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
index dd39d54..ba6317e 100755
--- a/backend/tolData/enwiki/downloadImgLicenseInfo.py
+++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, re
+import re
import sqlite3, urllib.parse, html
import requests
import time, signal
@@ -16,33 +16,33 @@ at already-processed names to decide what to skip.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-imgDb = "imgData.db"
-apiUrl = "https://en.wikipedia.org/w/api.php"
-userAgent = "terryt.dev (terry06890@gmail.com)"
+imgDb = 'imgData.db'
+apiUrl = 'https://en.wikipedia.org/w/api.php'
+userAgent = 'terryt.dev (terry06890@gmail.com)'
batchSz = 50 # Max 50
-tagRegex = re.compile(r"<[^<]+>")
-whitespaceRegex = re.compile(r"\s+")
+tagRegex = re.compile(r'<[^<]+>')
+whitespaceRegex = re.compile(r'\s+')
-print("Opening database")
+print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
dbCur2 = dbCon.cursor()
-print("Checking for table")
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
- dbCur.execute("CREATE TABLE imgs(" \
- "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
+print('Checking for table')
+if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
+ dbCur.execute('CREATE TABLE imgs(' \
+ 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
-print("Reading image names")
-imgNames = set()
-for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
+print('Reading image names')
+imgNames: set[str] = set()
+for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
imgNames.add(imgName)
-print(f"Found {len(imgNames)}")
+print(f'Found {len(imgNames)}')
-print("Checking for already-processed images")
+print('Checking for already-processed images')
oldSz = len(imgNames)
-for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
+for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
imgNames.discard(imgName)
-print(f"Found {oldSz - len(imgNames)}")
+print(f'Found {oldSz - len(imgNames)}')
# Set SIGINT handler
interrupted = False
@@ -53,95 +53,95 @@ def onSigint(sig, frame):
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
-print("Iterating through image names")
-imgNames = list(imgNames)
+print('Iterating through image names')
+imgNameList = list(imgNames)
iterNum = 0
-for i in range(0, len(imgNames), batchSz):
+for i in range(0, len(imgNameList), batchSz):
iterNum += 1
if iterNum % 1 == 0:
- print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
+ print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)')
if interrupted:
- print(f"Exiting loop at iteration {iterNum}")
+ print(f'Exiting loop at iteration {iterNum}')
break
# Get batch
- imgBatch = imgNames[i:i+batchSz]
- imgBatch = ["File:" + x for x in imgBatch]
+ imgBatch = imgNameList[i:i+batchSz]
+ imgBatch = ['File:' + x for x in imgBatch]
# Make request
headers = {
- "user-agent": userAgent,
- "accept-encoding": "gzip",
+ 'user-agent': userAgent,
+ 'accept-encoding': 'gzip',
}
params = {
- "action": "query",
- "format": "json",
- "prop": "imageinfo",
- "iiprop": "extmetadata|url",
- "maxlag": "5",
- "titles": "|".join(imgBatch),
- "iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
+ 'action': 'query',
+ 'format': 'json',
+ 'prop': 'imageinfo',
+ 'iiprop': 'extmetadata|url',
+ 'maxlag': '5',
+ 'titles': '|'.join(imgBatch),
+ 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions',
}
responseObj = None
try:
response = requests.get(apiUrl, params=params, headers=headers)
responseObj = response.json()
except Exception as e:
- print(f"ERROR: Exception while downloading info: {e}")
- print(f"\tImage batch: " + "|".join(imgBatch))
+ print(f'ERROR: Exception while downloading info: {e}')
+ print('\tImage batch: ' + '|'.join(imgBatch))
continue
# Parse response-object
- if "query" not in responseObj or "pages" not in responseObj["query"]:
- print("WARNING: Response object for doesn't have page data")
- print("\tImage batch: " + "|".join(imgBatch))
- if "error" in responseObj:
- errorCode = responseObj["error"]["code"]
- print(f"\tError code: {errorCode}")
- if errorCode == "maxlag":
+ if 'query' not in responseObj or 'pages' not in responseObj['query']:
+ print('WARNING: Response object for doesn\'t have page data')
+ print('\tImage batch: ' + '|'.join(imgBatch))
+ if 'error' in responseObj:
+ errorCode = responseObj['error']['code']
+ print(f'\tError code: {errorCode}')
+ if errorCode == 'maxlag':
time.sleep(5)
continue
- pages = responseObj["query"]["pages"]
- normalisedToInput = {}
- if "normalized" in responseObj["query"]:
- for entry in responseObj["query"]["normalized"]:
- normalisedToInput[entry["to"]] = entry["from"]
- for (_, page) in pages.items():
+ pages = responseObj['query']['pages']
+ normalisedToInput: dict[str, str] = {}
+ if 'normalized' in responseObj['query']:
+ for entry in responseObj['query']['normalized']:
+ normalisedToInput[entry['to']] = entry['from']
+ for _, page in pages.items():
# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
# Artist: author name (might contain complex html, multiple authors, etc)
# Credit: 'source'
# For image-map-like images, can be quite large/complex html, creditng each sub-image
- # May be <a href="text1">text2</a>, where the text2 might be non-indicative
+ # May be <a href='text1'>text2</a>, where the text2 might be non-indicative
# Restrictions: specifies non-copyright legal restrictions
- title = page["title"]
+ title: str = page['title']
if title in normalisedToInput:
title = normalisedToInput[title]
title = title[5:] # Remove 'File:'
if title not in imgNames:
- print(f"WARNING: Got title \"{title}\" not in image-name list")
+ print(f'WARNING: Got title "{title}" not in image-name list')
continue
- if "imageinfo" not in page:
- print(f"WARNING: No imageinfo section for page \"{title}\"")
+ if 'imageinfo' not in page:
+ print(f'WARNING: No imageinfo section for page "{title}"')
continue
- metadata = page["imageinfo"][0]["extmetadata"]
- url = page["imageinfo"][0]["url"]
- license = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
- artist = metadata['Artist']['value'] if 'Artist' in metadata else None
- credit = metadata['Credit']['value'] if 'Credit' in metadata else None
- restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+ metadata = page['imageinfo'][0]['extmetadata']
+ url: str = page['imageinfo'][0]['url']
+ license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
+ artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
+ credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
+ restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
# Remove markup
- if artist != None:
- artist = tagRegex.sub(" ", artist)
- artist = whitespaceRegex.sub(" ", artist)
+ if artist is not None:
+ artist = tagRegex.sub(' ', artist)
+ artist = whitespaceRegex.sub(' ', artist)
artist = html.unescape(artist)
artist = urllib.parse.unquote(artist)
- if credit != None:
- credit = tagRegex.sub(" ", credit)
- credit = whitespaceRegex.sub(" ", credit)
+ if credit is not None:
+ credit = tagRegex.sub(' ', credit)
+ credit = whitespaceRegex.sub(' ', credit)
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
# Add to db
- dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+ dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
(title, license, artist, credit, restrictions, url))
-print("Closing database")
+print('Closing database')
dbCon.commit()
dbCon.close()