diff options
Diffstat (limited to 'backend')
| -rwxr-xr-x | backend/data/enwiki/downloadEnwikiImgs.py | 88 |
1 file changed, 88 insertions(+), 0 deletions(-)
#!/usr/bin/python3

import sys, re, os
import sqlite3
import urllib.parse, requests
import time, signal

# Usage message, printed when any argument is given (the script takes none).
usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Downloads images from URLs specified in an sqlite db,\n"
usageInfo += "into a specified directory.\n"
usageInfo += "\n"
usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
usageInfo += "The program can be re-run to continue downloading, and looks\n"
usageInfo += "in the output directory to decide what to skip.\n"
if len(sys.argv) > 1:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

imgDb = "enwikiImgs.db"  # About 130k image names
outDir = "imgs"  # Output files are named "<page_id><extension>"
# Matches acceptable licenses: CC0, or CC[-BY][-SA][-version][ country-code]
licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)

# Create output directory if not present
if not os.path.exists(outDir):
    os.mkdir(outDir)

# Get page-ids of already-downloaded images (output filenames are "<page_id>.<ext>")
print("Getting already-downloaded images")
pageIdsDone = set()
for filename in os.listdir(outDir):
    basename = os.path.splitext(filename)[0]
    # Skip stray non-image files instead of crashing on int() conversion
    if basename.isdigit():
        pageIdsDone.add(int(basename))
print(f"Found {len(pageIdsDone)} already-downloaded images")

# Set SIGINT handler: the first Ctrl-C requests a graceful exit after the
# current download; the handler restores the previous handler so a second
# Ctrl-C terminates immediately.
interrupted = False
oldHandler = None
def onSigint(sig, frame):
    """Request a graceful exit, and restore the previous SIGINT handler."""
    global interrupted
    interrupted = True
    signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)

# Open db
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()

# Start downloads
print("Starting downloads")
iterNum = 0
query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
    " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
for (pageId, licenseStr, artist, credit, restrictions, url) in dbCur.execute(query):
    if pageId in pageIdsDone:
        continue
    if interrupted:
        print("Exiting loop")
        break
    # Check for problematic attributes (missing/unrecognized license,
    # missing/over-long/placeholder attribution, or any usage restrictions)
    if licenseStr is None or licenseRegex.fullmatch(licenseStr) is None:
        continue
    if artist is None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) is not None:
        continue
    if credit is None or len(credit) > 300 or re.match(r"File:", credit) is not None:
        continue
    if restrictions is not None and restrictions != "":
        continue
    # Download image
    iterNum += 1
    print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
    urlParts = urllib.parse.urlparse(url)
    extension = os.path.splitext(urlParts.path)[1]
    if len(extension) <= 1:
        print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
        sys.exit(1)
    outFile = f"{outDir}/{pageId}{extension}"
    headers = {
        "user-agent": "terryt.dev (terry06890@gmail.com)",
        "accept-encoding": "gzip",
    }
    try:
        # Timeout prevents one stalled connection from hanging the whole run
        response = requests.get(url, headers=headers, timeout=60)
        # Raise on HTTP errors so 404/5xx pages aren't saved as image files
        response.raise_for_status()
        with open(outFile, 'wb') as file:
            file.write(response.content)
        time.sleep(1)
        # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
        # It's unclear how to properly check for cache misses, so just do about <=1 per sec
    except Exception as e:
        print(f"Error while downloading to {outFile}: {e}", file=sys.stderr)

# Close db
dbCon.close()
