#!/usr/bin/python3

import bisect
import os
import random
import re
import signal
import sqlite3
import sys
import time
import urllib.parse
from threading import Thread

import requests

# Help text, assembled from adjacent string literals
usageInfo = (
	f"usage: {sys.argv[0]}\n"
	"Downloads images from URLs specified in an image-list database,\n"
	"for a specified set of EOL IDs. Downloaded images get names of\n"
	"the form 'eolId1 contentId1.ext1'.\n"
	"\n"
	"SIGINT causes the program to finish ongoing downloads and exit.\n"
	"The program can be re-run to continue downloading. It looks for\n"
	"existing downloaded files, and continues after the one with\n"
	"highest EOL ID.\n"
)
# The script accepts no arguments: any argument prints the usage text and exits with status 1
if sys.argv[1:]:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Path of the SQLite database listing candidate images (queried below via its `images` table)
imagesListDb = "imagesList.db"
def getInputEolIds():
	"""Return the set of values in the `id` column of the `eol_ids` table in ../data.db.

	The database path is relative to the current working directory.
	Raises sqlite3.Error if the database or table cannot be read; the
	connection is closed in either case.
	"""
	dbCon = sqlite3.connect("../data.db")
	try:
		dbCur = dbCon.cursor()
		# The comprehension consumes every row before the connection is closed below
		return {eolId for (eolId,) in dbCur.execute("SELECT id FROM eol_ids")}
	finally:
		dbCon.close()
# Directory that downloaded images are written to (created further down if missing)
outDir = "imgsForReview/"
# Pattern that an image's license field must fully match for the image to be downloaded
LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3 # Upper bound in seconds of that randomized post-download pause

# Collect the EOL IDs listed in the data db
print("Getting input EOL IDs")
eolIds = getInputEolIds()
# Collect the EOL IDs (page_id values) that have entries in the images-list db.
# The connection and cursor stay open because the download loop reuses them.
print("Getting images-list-db EOL IDs")
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
imgListIds = {pageId for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images")}
# Keep only the IDs present in both sources, as an ascending list
eolIds = sorted(eolIds & imgListIds)
print(f"Resulted in {len(eolIds)} EOL IDs")

# Stop collecting images for an EOL ID once this many distinct copyright owners are represented
MAX_IMGS_PER_ID = 3
# Upper limit on the number of download threads running at once
MAX_THREADS = 5
# Count of download threads currently running (raised before a thread starts, lowered when it ends)
numThreads = 0
threadException = None # Used for ending main thread after a non-main thread exception
def downloadImg(url, outFile, timeout=60):
	"""Download `url` into `outFile`, then pause briefly. Intended to run on a worker thread.

	url: address to fetch with an HTTP GET.
	outFile: path the response body is written to, in binary mode.
	timeout: seconds to wait for the server before giving up, so that a
		stalled connection cannot block this thread (and anything waiting
		on the thread count) forever.

	Nothing is returned or raised. Any failure is printed to stderr and
	stored in the global `threadException`. The global `numThreads` is
	decremented when the function ends, whether or not the download worked.
	"""
	global numThreads, threadException
	try:
		response = requests.get(url, timeout=timeout)
		# Treat HTTP error statuses as failures instead of saving an error page as an image
		response.raise_for_status()
		with open(outFile, 'wb') as file:
			file.write(response.content)
		# Randomized pause between POST_DL_DELAY_MIN and POST_DL_DELAY_MAX seconds
		time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
	except Exception as e:
		print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
		threadException = e
	finally:
		# Always release this thread's slot, so loops polling numThreads can make progress
		numThreads -= 1
# Create output directory if not present
os.makedirs(outDir, exist_ok=True)
# Find next eol ID to download for
# NOTE: downloading resumes after the highest EOL ID that already has a file,
# so images of that ID which were not yet downloaded are not retried.
print("Finding next ID to download for")
nextIdx = 0
fileList = os.listdir(outDir)
ids = []
for filename in fileList:
	# Downloaded files are named '<eolId> <contentId><ext>'; ignore anything else (e.g. hidden files)
	try:
		ids.append(int(filename.split(" ")[0]))
	except ValueError:
		continue
if ids:
	# eolIds is sorted, so bisect_right yields the position just past the highest
	# downloaded ID, even when that ID is no longer present in the list
	nextIdx = bisect.bisect_right(eolIds, max(ids))
if nextIdx >= len(eolIds):
	print("No IDs left. Exiting...")
	sys.exit(0)
# Detect SIGINT signals
# The first SIGINT only raises a flag; the previous handler is put back at that
# point, so a second SIGINT is handled the way it was before this script ran.
interrupted = False
oldHandler = signal.getsignal(signal.SIGINT)
def onSigint(sig, frame):
	"""Record that SIGINT was received, then reinstate the handler that was active before."""
	global interrupted
	interrupted = True
	signal.signal(signal.SIGINT, oldHandler)
signal.signal(signal.SIGINT, onSigint)
# Manage downloading
# For each remaining EOL ID: select up to MAX_IMGS_PER_ID acceptably-licensed images
# (one per copyright owner), derive their output filenames, and download every file
# that does not exist yet on its own thread, with at most MAX_THREADS running at once.
for idx in range(nextIdx, len(eolIds)):
	eolId = eolIds[idx]
	# Get image urls
	imgDataList = []
	ownerSet = set() # Used to get images from different owners, for variety
	for row in imgCur.execute(
		"SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)):
		imgLicense = row[3]
		copyrightOwner = row[4]
		if re.fullmatch(LICENSE_REGEX, imgLicense) is None:
			continue
		if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic
			continue
		if copyrightOwner not in ownerSet:
			ownerSet.add(copyrightOwner)
			imgDataList.append(row)
			if len(ownerSet) == MAX_IMGS_PER_ID:
				break
	if not imgDataList:
		continue
	# Determine output filenames
	outFiles = []
	urls = []
	for row in imgDataList:
		contentId = row[0]
		url = row[2]
		# URLs beginning with 'data/' are relative to the EOL content server
		if url.startswith("data/"):
			url = "https://content.eol.org/" + url
		urlParts = urllib.parse.urlparse(url)
		extension = os.path.splitext(urlParts.path)[1]
		if len(extension) <= 1:
			print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
			continue
		outFiles.append(str(eolId) + " " + str(contentId) + extension)
		urls.append(url)
	# Start downloads
	exitLoop = False
	for url, outFile in zip(urls, outFiles):
		outPath = outDir + outFile
		if os.path.exists(outPath):
			continue
		# Enforce thread limit
		while numThreads >= MAX_THREADS:
			time.sleep(1)
		# Wait for threads after an interrupt or thread-exception
		if interrupted or threadException is not None:
			print("Waiting for existing threads to end")
			while numThreads > 0:
				time.sleep(1)
			exitLoop = True
			break
		print(f"Downloading image to {outPath}")
		# Perform download
		numThreads += 1
		thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
		thread.start()
	if exitLoop:
		break
# Daemon threads are killed when the main thread exits, so let any downloads that
# are still in flight complete before finishing (a no-op after an interrupt/exception,
# where the threads were already waited for above)
if numThreads > 0:
	print("Waiting for existing threads to end")
	while numThreads > 0:
		time.sleep(1)
# Close images-list db
print("Finished downloading")
imgDbCon.close()