aboutsummaryrefslogtreecommitdiff
path: root/backend/data/eol/downloadImgs.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/eol/downloadImgs.py')
-rwxr-xr-xbackend/data/eol/downloadImgs.py147
1 files changed, 0 insertions, 147 deletions
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py
deleted file mode 100755
index 96bc085..0000000
--- a/backend/data/eol/downloadImgs.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/python3
-
# Standard library
import bisect
import os
import random
import re
import signal
import sqlite3
import sys
import time
import urllib.parse
from threading import Thread

# Third-party
import requests
-
# Command-line help text, shown whenever the script is invoked with arguments.
usageInfo = f"""
Usage: {sys.argv[0]}

For some set of EOL IDs, downloads associated images from URLs in
an image-list database. Uses multiple downloading threads.

May obtain multiple images per ID. The images will get names
with the form 'eolId1 contentId1.ext1'.

SIGINT causes the program to finish ongoing downloads and exit.
The program can be re-run to continue downloading. It looks for
already-downloaded files, and continues after the one with
highest EOL ID.
"""
# The script takes no arguments; any extra argv entries mean the user
# wanted help (or made a mistake), so print usage and bail out.
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)
-# In testing, this downloaded about 70k images, over a few days
-
imagesListDb = "imagesList.db"  # sqlite db mapping EOL page IDs to candidate image URLs
def getInputEolIds(dbPath="../data.db"):
    """Return the set of EOL IDs stored in the eol_ids table of dbPath.

    dbPath: path to an sqlite database with an eol_ids(id) table
        (defaults to the project's ../data.db, matching the original
        hard-coded behavior).
    Returns: set of int EOL IDs.
    Raises: sqlite3.Error if the database or table is missing/unreadable.
    """
    dbCon = sqlite3.connect(dbPath)
    try:
        # try/finally guarantees the connection is closed even if the
        # query raises (the original leaked the connection in that case)
        dbCur = dbCon.cursor()
        return {row[0] for row in dbCur.execute("SELECT id FROM eol_ids")}
    finally:
        dbCon.close()
outDir = "imgsForReview/"  # Directory downloaded images are written to (created if absent)
MAX_IMGS_PER_ID = 3  # Download at most this many images per EOL ID (one per distinct copyright owner)
MAX_THREADS = 5  # Maximum number of concurrent download threads
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3  # Maximum of that post-download delay, in seconds
# Licenses acceptable for download: CC-BY variants (optionally -nc/-sa, versions 2/3/4), CC0, public domain
LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
-
print("Getting input EOL IDs")
eolIds = getInputEolIds()
print("Getting EOL IDs to download for")
# Get IDs from images-list db
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
imgListIds = set()
for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
    imgListIds.add(pageId)
# Keep only input IDs that actually have image entries, sorted ascending
eolIds = sorted(eolIds.intersection(imgListIds))
print(f"Result: {len(eolIds)} EOL IDs")

print("Checking output directory")
os.makedirs(outDir, exist_ok=True)
print("Finding next ID to download for")
# Downloaded files are named 'eolId contentId.ext'; resume after the highest
# eolId already present. Skip stray files (e.g. OS/editor metadata) whose
# names don't start with a numeric ID — int() on them would crash the run.
nextIdx = 0
downloadedIds = [
    int(name.split(" ")[0])
    for name in os.listdir(outDir)
    if name.split(" ")[0].isdigit()
]
if downloadedIds:
    # bisect_right also handles the case where the last-downloaded ID is no
    # longer in eolIds (input DB changed between runs); eolIds.index() would
    # raise ValueError there. For IDs still present it equals index + 1.
    nextIdx = bisect.bisect_right(eolIds, max(downloadedIds))
if nextIdx == len(eolIds):
    print("No IDs left. Exiting...")
    sys.exit(0)
-
print("Starting download threads")
numThreads = 0  # Count of live download threads; incremented by the main loop, decremented by workers
threadException = None # Used for ending main thread after a non-main thread exception
# Handle SIGINT signals
interrupted = False  # Set by the SIGINT handler; the main loop checks it to stop launching downloads
oldHandler = None
def onSigint(sig, frame):
    """Request a graceful shutdown on the first SIGINT.

    Sets the 'interrupted' flag so the main loop finishes in-flight
    downloads and exits, then restores the previous handler so a second
    SIGINT terminates the program immediately.
    """
    global interrupted
    interrupted = True
    signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
# Function for threads to execute
def downloadImg(url, outFile):
    """Download url to outFile, then pause briefly; runs in a worker thread.

    On any error, records the exception in the global threadException so
    the main loop stops launching new downloads. Always decrements the
    global numThreads counter so the main loop's thread-limit wait can
    proceed.
    """
    global numThreads, threadException
    try:
        # Timeout keeps a hung server from stalling this worker slot
        # forever (the original call could block indefinitely).
        data = requests.get(url, timeout=60)
        with open(outFile, 'wb') as file:
            file.write(data.content)
        # Randomized politeness delay before this thread frees its slot
        time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
    except Exception as e:
        print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
        threadException = e
    finally:
        # Release the counter even if the except clause itself fails
        numThreads -= 1
# Manage downloading: walk the sorted EOL IDs from the resume point, pick up
# to MAX_IMGS_PER_ID images per ID (from distinct owners), and hand each
# download to a worker thread, throttled to MAX_THREADS.
for idx in range(nextIdx, len(eolIds)):
    eolId = eolIds[idx]
    # Get image urls
    imgDataList = []  # NOTE(review): never used after initialization — candidate for removal
    ownerSet = set() # Used to get images from different owners, for variety
    exitLoop = False  # Set when an interrupt/exception means the outer loop should also stop
    query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
    for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)):
        # Relative 'data/' URLs are shorthand for EOL's content server
        if url.startswith("data/"):
            url = "https://content.eol.org/" + url
        # The file extension comes from the URL path; without one we can't
        # name the output file meaningfully, so skip
        urlParts = urllib.parse.urlparse(url)
        extension = os.path.splitext(urlParts.path)[1]
        if len(extension) <= 1:
            print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
            continue
        # Check image-quantity limit (ownerSet size == images accepted so far)
        if len(ownerSet) == MAX_IMGS_PER_ID:
            break
        # Check for skip conditions: unacceptable license, suspect owner
        # field, or an owner we already took an image from
        if re.fullmatch(LICENSE_REGEX, license) == None:
            continue
        if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
            continue
        if copyrightOwner in ownerSet:
            continue
        ownerSet.add(copyrightOwner)
        # Determine output filename ('eolId contentId.ext' — the resume logic depends on this format)
        outPath = f"{outDir}{eolId} {contentId}{extension}"
        if os.path.exists(outPath):
            print(f"WARNING: {outPath} already exists. Skipping download.")
            continue
        # Check thread limit (busy-wait; workers decrement numThreads when done)
        while numThreads == MAX_THREADS:
            time.sleep(1)
        # Wait for threads after an interrupt or thread-exception, so every
        # in-flight download completes before we exit
        if interrupted or threadException != None:
            print("Waiting for existing threads to end")
            while numThreads > 0:
                time.sleep(1)
            exitLoop = True
            break
        # Perform download (daemon thread: won't block interpreter exit)
        print(f"Downloading image to {outPath}")
        numThreads += 1
        thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
        thread.start()
    if exitLoop:
        break
# Close images-list db (all worker threads have finished or been waited on by now)
print("Finished downloading")
imgDbCon.close()