aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/enwiki/download_img_license_info.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-01-21 12:21:03 +1100
committerTerry Truong <terry06890@gmail.com>2023-01-21 12:32:01 +1100
commit0a9b2c2e5eca8a04e37fbdd423379882863237c2 (patch)
tree1812bdb6bb13e4f76fdd7ef04075b291f775c213 /backend/hist_data/enwiki/download_img_license_info.py
parent8321e2f92dbc073b8f1de87895d6620a2021b22e (diff)
Adjust backend coding style
Increase line spacing, add section comments, etc
Diffstat (limited to 'backend/hist_data/enwiki/download_img_license_info.py')
-rwxr-xr-xbackend/hist_data/enwiki/download_img_license_info.py29
1 files changed, 20 insertions, 9 deletions
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 43f2c43..6fd710c 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -10,12 +10,16 @@ at already-processed names to decide what to skip.
"""
import argparse
-import re, time, signal
-import sqlite3, urllib.parse, html
+import re
+import time
+import signal
+import sqlite3
+import urllib.parse
+import html
import requests
IMG_DB = 'img_data.db'
-#
+
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
BATCH_SZ = 50 # Max 50
@@ -26,17 +30,18 @@ def downloadInfo(imgDb: str) -> None:
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
+
print('Checking for table')
if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \
'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
- #
+
print('Reading image names')
imgNames: set[str] = set()
for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
imgNames.add(imgName)
print(f'Found {len(imgNames)}')
- #
+
print('Checking for already-processed images')
nextImgId = 1
oldSz = len(imgNames)
@@ -45,7 +50,7 @@ def downloadInfo(imgDb: str) -> None:
if imgId >= nextImgId:
nextImgId = imgId + 1
print(f'Found {oldSz - len(imgNames)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -54,7 +59,7 @@ def downloadInfo(imgDb: str) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Iterating through image names')
imgNameList = list(imgNames)
iterNum = 0
@@ -65,9 +70,11 @@ def downloadInfo(imgDb: str) -> None:
if interrupted:
print(f'Exiting loop at iteration {iterNum}')
break
+
# Get batch
imgBatch = imgNameList[i:i+BATCH_SZ]
imgBatch = ['File:' + x for x in imgBatch]
+
# Make request
headers = {
'user-agent': USER_AGENT,
@@ -90,6 +97,7 @@ def downloadInfo(imgDb: str) -> None:
print(f'ERROR: Exception while downloading info: {e}')
print('\tImage batch: ' + '|'.join(imgBatch))
continue
+
# Parse response-object
if 'query' not in responseObj or 'pages' not in responseObj['query']:
print('WARNING: Response object doesn\'t have page data')
@@ -120,6 +128,7 @@ def downloadInfo(imgDb: str) -> None:
if title not in imgNames:
print(f'WARNING: Got title "{title}" not in image-name list')
continue
+
if 'imageinfo' not in page:
print(f'WARNING: No imageinfo section for page "{title}"')
continue
@@ -129,6 +138,7 @@ def downloadInfo(imgDb: str) -> None:
artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
# Remove markup
if artist is not None:
artist = TAG_REGEX.sub(' ', artist).strip()
@@ -140,11 +150,12 @@ def downloadInfo(imgDb: str) -> None:
credit = WHITESPACE_REGEX.sub(' ', credit)
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
+
# Add to db
dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)',
(nextImgId, title, license, artist, credit, restrictions, url))
nextImgId += 1
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
@@ -152,5 +163,5 @@ def downloadInfo(imgDb: str) -> None:
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadInfo(IMG_DB)