aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rwxr-xr-xbackend/data/enwiki/downloadEnwikiImgs.py88
1 files changed, 88 insertions, 0 deletions
diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py
new file mode 100755
index 0000000..de9b862
--- /dev/null
+++ b/backend/data/enwiki/downloadEnwikiImgs.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+
+import sys, re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Downloads images from URLs specified in an sqlite db,\n"
+usageInfo += "into a specified directory.'\n"
+usageInfo += "\n"
+usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
+usageInfo += "The program can be re-run to continue downloading, and looks\n"
+usageInfo += "in the output directory do decide what to skip.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+imgDb = "enwikiImgs.db" # About 130k image names
+outDir = "imgs"
+licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+
+# Create output directory if not present
+if not os.path.exists(outDir):
+ os.mkdir(outDir)
+# Get existing image names
+print("Gettings already-downloaded images")
+fileList = os.listdir(outDir)
+pageIdsDone = set()
+for filename in fileList:
+ (basename, extension) = os.path.splitext(filename)
+ pageIdsDone.add(int(basename))
+print(f"Found {len(pageIdsDone)} already-downloaded images")
+# Set SIGINT handler
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+ global interrupted
+ interrupted = True
+ signal.signal(signal.SIGINT, oldHandler)
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+# Open db
+dbCon = sqlite3.connect(imgDb)
+dbCur = dbCon.cursor()
+# Start downloads
+print("Starting downloads")
+iterNum = 0
+query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
+ " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
+for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
+ if pageId in pageIdsDone:
+ continue
+ if interrupted:
+ print(f"Exiting loop")
+ break
+ # Check for problematic attributes
+ if license == None or licenseRegex.fullmatch(license) == None:
+ continue
+ if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
+ continue
+ if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
+ continue
+ if restrictions != None and restrictions != "":
+ continue
+ # Download image
+ iterNum += 1
+ print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
+ urlParts = urllib.parse.urlparse(url)
+ extension = os.path.splitext(urlParts.path)[1]
+ if len(extension) <= 1:
+ print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
+ sys.exit(1)
+ outFile = f"{outDir}/{pageId}{extension}"
+ headers = {
+ "user-agent": "terryt.dev (terry06890@gmail.com)",
+ "accept-encoding": "gzip",
+ }
+ try:
+ response = requests.get(url, headers=headers)
+ with open(outFile, 'wb') as file:
+ file.write(response.content)
+ time.sleep(1)
+ # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
+ # It's unclear how to properly check for cache misses, so just do about <=1 per sec
+ except Exception as e:
+ print(f"Error while downloading to {outFile}: {e}", file=sys.stderr)
+# Close db
+dbCon.close()