aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/enwiki/download_imgs.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tol_data/enwiki/download_imgs.py
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tol_data/enwiki/download_imgs.py')
-rwxr-xr-xbackend/tol_data/enwiki/download_imgs.py99
1 files changed, 99 insertions, 0 deletions
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
new file mode 100755
index 0000000..ba874e1
--- /dev/null
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python3
+
+"""
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
+
+# In testing, this downloaded about 100k images, over several days
+
+import re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+IMG_DB = 'img_data.db' # About 130k image names
+OUT_DIR = 'imgs'
+#
+LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
+USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
+TIMEOUT = 1
+ # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
+ # It's unclear how to properly check for cache misses, so we just aim for 1 per sec
+
def downloadImgs(imgDb: str, outDir: str, timeout: int, requestTimeout: int = 30) -> None:
	"""Download images referenced in an image database into an output directory.

	imgDb: path to an sqlite database with 'imgs' and 'page_imgs' tables
	outDir: directory to write images into, named '{pageId}{ext}' (created if absent)
	timeout: seconds to sleep after each download (crude rate limiting)
	requestTimeout: per-request HTTP timeout in seconds, so a stalled
		connection cannot hang the run forever

	Images whose page-id already has a file in outDir are skipped, so the
	script can be re-run to resume. SIGINT lets an in-progress download
	finish, then exits the loop (a second SIGINT interrupts immediately,
	since the old handler is restored on the first one).
	"""
	if not os.path.exists(outDir):
		os.mkdir(outDir)
	print('Checking for already-downloaded images')
	fileList = os.listdir(outDir)
	pageIdsDone: set[int] = set()
	for filename in fileList:
		pageIdsDone.add(int(os.path.splitext(filename)[0]))
	print(f'Found {len(pageIdsDone)}')
	#
	# Set SIGINT handler
	interrupted = False
	oldHandler = None
	def onSigint(sig, frame):
		nonlocal interrupted
		interrupted = True
		signal.signal(signal.SIGINT, oldHandler)
	oldHandler = signal.signal(signal.SIGINT, onSigint)
	#
	print('Opening database')
	dbCon = sqlite3.connect(imgDb)
	try: # Ensure the db is closed even on the early-return error path below
		dbCur = dbCon.cursor()
		print('Starting downloads')
		iterNum = 0
		query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \
			' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name'
		for pageId, license, artist, credit, restrictions, url in dbCur.execute(query):
			if pageId in pageIdsDone:
				continue
			if interrupted:
				print('Exiting loop')
				break
			# Check for problematic attributes
			if license is None or LICENSE_REGEX.fullmatch(license) is None:
				continue
			if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None:
				continue
			if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None:
				continue
			if restrictions is not None and restrictions != '':
				continue
			# Download image
			iterNum += 1
			print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
			urlParts = urllib.parse.urlparse(url)
			extension = os.path.splitext(urlParts.path)[1]
			if len(extension) <= 1:
				print(f'WARNING: No filename extension found in URL {url}')
				continue
			outFile = os.path.join(outDir, f'{pageId}{extension}')
			print(outFile)
			headers = {
				'user-agent': USER_AGENT,
				'accept-encoding': 'gzip',
			}
			try:
				response = requests.get(url, headers=headers, timeout=requestTimeout)
				# Without this, an HTML error page (404/503/etc) would be
				# saved to disk as if it were an image
				response.raise_for_status()
				with open(outFile, 'wb') as file:
					file.write(response.content)
				time.sleep(timeout)
			except Exception as e:
				# Best-effort stop: report and exit so the run can be resumed later
				print(f'Error while downloading to {outFile}: {e}')
				return
	finally:
		print('Closing database')
		dbCon.close()
+
if __name__ == '__main__':
	import argparse

	# No arguments are accepted; argparse is used only so that --help
	# prints the module docstring verbatim
	cliParser = argparse.ArgumentParser(
		description=__doc__,
		formatter_class=argparse.RawDescriptionHelpFormatter,
	)
	cliParser.parse_args()

	downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)