From abb936f5d76f7fe5cec1e8948d287da86643d504 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 23:16:42 +1000 Subject: Refactor backend scripts --- backend/data/enwiki/downloadEnwikiImgs.py | 88 ------------------------------- 1 file changed, 88 deletions(-) delete mode 100755 backend/data/enwiki/downloadEnwikiImgs.py (limited to 'backend/data/enwiki/downloadEnwikiImgs.py') diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py deleted file mode 100755 index 2929a0d..0000000 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os -import sqlite3 -import urllib.parse, requests -import time, signal - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an sqlite db,\n" -usageInfo += "into a specified directory.'\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and looks\n" -usageInfo += "in the output directory do decide what to skip.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -imgDb = "imgData.db" # About 130k image names -outDir = "imgs" -licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) - -# Create output directory if not present -if not os.path.exists(outDir): - os.mkdir(outDir) -# Get existing image names -print("Gettings already-downloaded images") -fileList = os.listdir(outDir) -pageIdsDone = set() -for filename in fileList: - (basename, extension) = os.path.splitext(filename) - pageIdsDone.add(int(basename)) -print(f"Found {len(pageIdsDone)} already-downloaded images") -# Set SIGINT handler -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) -# Open db -dbCon = sqlite3.connect(imgDb) -dbCur = dbCon.cursor() -# Start downloads -print("Starting downloads") -iterNum = 0 -query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \ - " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name" -for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query): - if pageId in pageIdsDone: - continue - if interrupted: - print(f"Exiting loop") - break - # Check for problematic attributes - if license == None or licenseRegex.fullmatch(license) == None: - continue - if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None: - continue - if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None: - continue - if restrictions != None and restrictions != "": - continue - # Download image - iterNum += 1 - print(f"Iteration {iterNum}: Downloading for page-id {pageId}") - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) - sys.exit(1) - outFile = f"{outDir}/{pageId}{extension}" - headers = { - "user-agent": "terryt.dev (terry06890@gmail.com)", - "accept-encoding": "gzip", - } - try: - response = requests.get(url, headers=headers) - with open(outFile, 'wb') as file: - file.write(response.content) - time.sleep(1) - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec" - # It's unclear how to properly check for cache misses, so just do about <=1 per sec - except Exception as e: - print(f"Error while downloading to {outFile}: {e}", file=sys.stderr) -# Close db -dbCon.close() -- cgit v1.2.3