From abb936f5d76f7fe5cec1e8948d287da86643d504 Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Wed, 22 Jun 2022 23:16:42 +1000
Subject: Refactor backend scripts

---
 backend/data/enwiki/downloadEnwikiImgs.py | 88 -------------------------------
 1 file changed, 88 deletions(-)
 delete mode 100755 backend/data/enwiki/downloadEnwikiImgs.py

(limited to 'backend/data/enwiki/downloadEnwikiImgs.py')

diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py
deleted file mode 100755
index 2929a0d..0000000
--- a/backend/data/enwiki/downloadEnwikiImgs.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import sqlite3
-import urllib.parse, requests
-import time, signal
-
-usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an sqlite db,\n"
-usageInfo += "into a specified directory.'\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "in the output directory do decide what to skip.\n"
-if len(sys.argv) > 1:
-	print(usageInfo, file=sys.stderr)
-	sys.exit(1)
-
-imgDb = "imgData.db" # About 130k image names
-outDir = "imgs"
-licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
-
-# Create output directory if not present
-if not os.path.exists(outDir):
-	os.mkdir(outDir)
-# Get existing image names
-print("Gettings already-downloaded images")
-fileList = os.listdir(outDir)
-pageIdsDone = set()
-for filename in fileList:
-	(basename, extension) = os.path.splitext(filename)
-	pageIdsDone.add(int(basename))
-print(f"Found {len(pageIdsDone)} already-downloaded images")
-# Set SIGINT handler
-interrupted = False
-oldHandler = None
-def onSigint(sig, frame):
-	global interrupted
-	interrupted = True
-	signal.signal(signal.SIGINT, oldHandler)
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Open db
-dbCon = sqlite3.connect(imgDb)
-dbCur = dbCon.cursor()
-# Start downloads
-print("Starting downloads")
-iterNum = 0
-query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
-	" imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
-for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
-	if pageId in pageIdsDone:
-		continue
-	if interrupted:
-		print(f"Exiting loop")
-		break
-	# Check for problematic attributes
-	if license == None or licenseRegex.fullmatch(license) == None:
-		continue
-	if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
-		continue
-	if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
-		continue
-	if restrictions != None and restrictions != "":
-		continue
-	# Download image
-	iterNum += 1
-	print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
-	urlParts = urllib.parse.urlparse(url)
-	extension = os.path.splitext(urlParts.path)[1]
-	if len(extension) <= 1:
-		print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
-		sys.exit(1)
-	outFile = f"{outDir}/{pageId}{extension}"
-	headers = {
-		"user-agent": "terryt.dev (terry06890@gmail.com)",
-		"accept-encoding": "gzip",
-	}
-	try:
-		response = requests.get(url, headers=headers)
-		with open(outFile, 'wb') as file:
-			file.write(response.content)
-		time.sleep(1)
-			# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
-			# It's unclear how to properly check for cache misses, so just do about <=1 per sec
-	except Exception as e:
-		print(f"Error while downloading to {outFile}: {e}", file=sys.stderr)
-# Close db
-dbCon.close()
-- 
cgit v1.2.3