about summary refs log tree commit diff
path: root/backend/data/enwiki/downloadImgLicenseInfo.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/enwiki/downloadImgLicenseInfo.py')
-rwxr-xr-x backend/data/enwiki/downloadImgLicenseInfo.py 150
1 files changed, 0 insertions, 150 deletions
diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py
deleted file mode 100755
index 399922e..0000000
--- a/backend/data/enwiki/downloadImgLicenseInfo.py
+++ /dev/null
@@ -1,150 +0,0 @@
#!/usr/bin/python3
"""Download licensing information for enwiki images into an sqlite database."""

import html
import re
import signal
import sqlite3
import sys
import time
import urllib.parse

usageInfo = f"""
Usage: {sys.argv[0]}

Reads image names from a database, and uses enwiki's online API to obtain
licensing information for them, adding the info to the database.

SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""

imgDb = "imgData.db"
apiUrl = "https://en.wikipedia.org/w/api.php"
userAgent = "terryt.dev (terry06890@gmail.com)"
batchSz = 50 # Max 50
requestTimeout = 60 # Seconds to wait on the API before giving up on a request
maxLagRetries = 5 # Extra attempts for a batch rejected with a 'maxlag' error
maxLagDelay = 5 # Seconds to wait before each such retry
filePrefix = "File:"
tagRegex = re.compile(r"<[^<]+>")
whitespaceRegex = re.compile(r"\s+")

# SIGINT state, shared between onSigint() and the download loop
interrupted = False
oldHandler = None


def onSigint(sig, frame):
    """Note that an interrupt was requested, and restore the previous handler.

    Restoring the old handler means a second SIGINT is handled normally
    instead of being deferred again.
    """
    global interrupted
    interrupted = True
    signal.signal(signal.SIGINT, oldHandler)


def stripMarkup(text):
    """Return `text` with html tags, entities and percent-escapes removed.

    Tags become single spaces and whitespace runs are collapsed, so the result
    may keep a leading or trailing space. None is passed through unchanged.
    """
    if text is None:
        return None
    text = tagRegex.sub(" ", text)
    text = whitespaceRegex.sub(" ", text)
    text = html.unescape(text)
    return urllib.parse.unquote(text)


def extractRows(responseObj, batchNames):
    """Convert one API response into rows for the 'imgs' table.

    responseObj: decoded response that has a "query" section containing "pages".
    batchNames: set of the image names (without 'File:') that were requested.

    Returns (rows, warnings). Each row is
    (name, license, artist, credit, restrictions, url), keyed by the name that
    was requested rather than the API's canonical title. Warnings are messages
    for pages that could not be matched to a requested name or had no imageinfo.
    """
    query = responseObj["query"]
    # The API reports canonical titles, so map each back to every requested
    # title that was normalised into it (several inputs may share one page)
    inputsByTitle = {}
    for entry in query.get("normalized", []):
        inputsByTitle.setdefault(entry["to"], []).append(entry["from"])
    rows = []
    warnings = []
    for page in query["pages"].values():
        # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
            # LicenseShortName: short human-readable license name, apparently more reliable than 'License',
            # Artist: author name (might contain complex html, multiple authors, etc)
            # Credit: 'source'
                # For image-map-like images, can be quite large/complex html, crediting each sub-image
                # May be <a href="text1">text2</a>, where the text2 might be non-indicative
            # Restrictions: specifies non-copyright legal restrictions
        title = page["title"]
        # The canonical title may itself have been requested, alongside variants of it
        names = []
        for candidate in inputsByTitle.get(title, []) + [title]:
            name = candidate[len(filePrefix):] if candidate.startswith(filePrefix) else candidate
            if name in batchNames and name not in names:
                names.append(name)
        if not names:
            shown = title[len(filePrefix):] if title.startswith(filePrefix) else title
            warnings.append(f"WARNING: Got title \"{shown}\" not in image-name list")
            continue
        if "imageinfo" not in page:
            warnings.extend(f"WARNING: No imageinfo section for page \"{name}\"" for name in names)
            continue
        info = page["imageinfo"][0]
        metadata = info.get("extmetadata", {})
        url = info.get("url")
        licenseName = metadata["LicenseShortName"]["value"] if "LicenseShortName" in metadata else None
        artist = metadata["Artist"]["value"] if "Artist" in metadata else None
        credit = metadata["Credit"]["value"] if "Credit" in metadata else None
        restrictions = metadata["Restrictions"]["value"] if "Restrictions" in metadata else None
        # Remove markup
        artist = stripMarkup(artist)
        credit = stripMarkup(credit)
        for name in names:
            rows.append((name, licenseName, artist, credit, restrictions, url))
    return rows, warnings


def fetchBatch(session, titles, retries=maxLagRetries, delay=maxLagDelay):
    """Request image info for a batch of 'File:'-prefixed titles.

    session: object with a requests-style get(url, params=..., timeout=...).
    retries/delay: how many times, and after how many seconds, to retry a
        request that the API rejected with a 'maxlag' error.

    Returns the decoded response if it has page data, else None (after
    printing why). A request exception or a non-maxlag error is not retried.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "imageinfo",
        "iiprop": "extmetadata|url",
        "maxlag": "5",
        "titles": "|".join(titles),
        "iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
    }
    for attempt in range(retries + 1):
        try:
            response = session.get(apiUrl, params=params, timeout=requestTimeout)
            responseObj = response.json()
        except Exception as e:
            # Best-effort: report the failed batch and let the caller move on
            print(f"ERROR: Exception while downloading info: {e}")
            print("\tImage batch: " + "|".join(titles))
            return None
        if "query" in responseObj and "pages" in responseObj["query"]:
            return responseObj
        print("WARNING: Response object doesn't have page data")
        print("\tImage batch: " + "|".join(titles))
        errorCode = None
        if "error" in responseObj:
            errorCode = responseObj["error"]["code"]
            print(f"\tError code: {errorCode}")
        # Only a lagging server is worth waiting for
        if errorCode != "maxlag" or interrupted or attempt == retries:
            return None
        time.sleep(delay)
    return None


def main():
    """Download info for every not-yet-processed image name, committing after each batch."""
    global oldHandler
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    # Imported here so the helpers above can be imported without this third-party package
    import requests

    print("Opening database")
    dbCon = sqlite3.connect(imgDb)
    try:
        dbCur = dbCon.cursor()
        print("Checking for table")
        dbCur.execute("CREATE TABLE IF NOT EXISTS imgs(" \
            "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")

        print("Reading image names")
        imgNames = {imgName for (imgName,) in
            dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL")}
        print(f"Found {len(imgNames)}")

        print("Checking for already-processed images")
        oldSz = len(imgNames)
        imgNames.difference_update(imgName for (imgName,) in dbCur.execute("SELECT name FROM imgs"))
        print(f"Found {oldSz - len(imgNames)}")

        # Set SIGINT handler
        oldHandler = signal.signal(signal.SIGINT, onSigint)

        session = requests.Session()
        session.headers.update({
            "user-agent": userAgent,
            "accept-encoding": "gzip",
        })

        print("Iterating through image names")
        pending = sorted(imgNames)
        for iterNum, start in enumerate(range(0, len(pending), batchSz), 1):
            print(f"At iteration {iterNum} (after {start} images)")
            if interrupted:
                print(f"Exiting loop at iteration {iterNum}")
                break
            batch = pending[start:start + batchSz]
            responseObj = fetchBatch(session, [filePrefix + name for name in batch])
            if responseObj is None:
                continue
            rows, warnings = extractRows(responseObj, set(batch))
            for warning in warnings:
                print(warning)
            # Add to db, saving each batch so a later crash doesn't discard it
            dbCur.executemany("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", rows)
            dbCon.commit()
    finally:
        print("Closing database")
        dbCon.commit()
        dbCon.close()


if __name__ == "__main__":
    main()