about | summary | refs | log | tree | commit | diff
path: root/backend/tolData/eol/downloadImgs.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/eol/downloadImgs.py')
-rwxr-xr-x backend/tolData/eol/downloadImgs.py 142
1 files changed, 0 insertions, 142 deletions
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
deleted file mode 100755
index 5213aaf..0000000
--- a/backend/tolData/eol/downloadImgs.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os, random
-import sqlite3
-import urllib.parse, requests
-import time
-from threading import Thread
-import signal
-
-import argparse
-# Command-line interface. The script defines no options of its own, so parsing
-# the arguments only provides -h/--help and rejects anything unexpected.
-parser = argparse.ArgumentParser(
-    # Raw formatter: print the description with its line breaks intact
-    formatter_class=argparse.RawDescriptionHelpFormatter,
-    description="""
-For some set of EOL IDs, downloads associated images from URLs in
-an image-list database. Uses multiple downloading threads.
-
-May obtain multiple images per ID. The images will get names
-with the form 'eolId1 contentId1.ext1'.
-
-SIGINT causes the program to finish ongoing downloads and exit.
-The program can be re-run to continue downloading. It looks for
-already-downloaded files, and continues after the one with
-highest EOL ID.
-""",
-)
-parser.parse_args()
-
-imagesListDb = 'imagesList.db'
-def getInputEolIds() -> set[int]:
-    """Return the set of values in the 'id' column of the 'eol_ids' table.
-
-    The table is read from the SQLite database at '../data.db' (relative to
-    the current working directory). The connection is closed even when the
-    query fails, e.g. because the table does not exist.
-    """
-    dbCon = sqlite3.connect('../data.db')
-    try:
-        dbCur = dbCon.cursor()
-        # Each row is a 1-tuple; unpack it straight into the resulting set
-        return {eolId for (eolId,) in dbCur.execute('SELECT id FROM eol_ids')}
-    finally:
-        dbCon.close()
-outDir = 'imgsForReview/' # Directory that downloaded images are written to (created if it does not exist)
-MAX_IMGS_PER_ID = 3 # For one EOL ID, stop considering further images once this many distinct copyright owners were accepted
-MAX_THREADS = 5 # Upper bound on the number of download threads running at the same time
-POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
-POST_DL_DELAY_MAX = 3 # Upper bound in seconds of that same per-thread pause; the actual pause is picked at random between the two
-# Licenses that are accepted for download. The pattern must match the entire
-# 'license' field of an image row (it is applied with re.fullmatch).
-LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
-
-print('Getting input EOL IDs')
-eolIds = getInputEolIds()
-print('Getting EOL IDs to download for')
-# Collect every page ID that the images-list db has at least one image for.
-# The connection and cursor stay open; the download loop reuses them.
-imgDbCon = sqlite3.connect(imagesListDb)
-imgCur = imgDbCon.cursor()
-imgListIds: set[int] = {
-    pageId for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images')
-}
-# Keep only the input IDs that have images, in ascending order
-eolIds = eolIds & imgListIds
-eolIdList = list(eolIds)
-eolIdList.sort()
-print(f'Result: {len(eolIdList)} EOL IDs')
-
-print('Checking output directory')
-if not os.path.exists(outDir):
-    os.mkdir(outDir)
-print('Finding next ID to download for')
-# Downloaded files are named '<eolId> <contentId><ext>'. The highest EOL ID
-# found among them marks how far a previous run got.
-nextIdx = 0
-fileList = os.listdir(outDir)
-ids: list[int] = []
-for filename in fileList:
-    try:
-        ids.append(int(filename.split(' ')[0]))
-    except ValueError:
-        # Not one of our image files (e.g. a hidden or manually added file); ignore it
-        print(f'WARNING: Ignoring unexpected file {filename} in {outDir}', file=sys.stderr)
-if ids:
-    lastId = max(ids)
-    # Resume at the first ID above the highest downloaded one. This also works
-    # when that ID is no longer in eolIdList, where list.index() would raise.
-    nextIdx = next(
-        (pos for pos, candidateId in enumerate(eolIdList) if candidateId > lastId),
-        len(eolIdList))
-if nextIdx == len(eolIdList):
-    print('No IDs left. Exiting...')
-    sys.exit(0)
-
-print('Starting download threads')
-numThreads = 0 # Shared counter for download threads; downloadImg() decrements it when a thread ends
-threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
-# Handle SIGINT signals
-interrupted = False # Set to True by onSigint(); checked by the download loop before it starts another download
-oldHandler = None # Placeholder so the name exists; receives the previously installed SIGINT handler below
-def onSigint(sig, frame):
- """Record that SIGINT was received and re-install the previous handler.
-
- Only the first SIGINT is handled here. Because the previous handler is
- restored right away, a second SIGINT is handled by whatever was installed
- before this function was registered.
- """
- global interrupted
- interrupted = True
- signal.signal(signal.SIGINT, oldHandler)
-# signal.signal() returns the handler it replaces, which onSigint() restores later
-oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Function for threads to execute
-def downloadImg(url, outFile, timeout=30):
-    """Download 'url' into the file 'outFile', then pause briefly.
-
-    Intended to run in its own thread.
-
-    url: Address to fetch with an HTTP GET.
-    outFile: Path of the file that receives the response body.
-    timeout: Seconds to wait for the server before giving up, so that a
-        stalled connection cannot keep the thread alive forever.
-
-    A response with an HTTP error status is reported and skipped instead of
-    being written out as if it were an image. Any exception is reported on
-    stderr and stored in the module-level 'threadException'. The module-level
-    'numThreads' counter is decremented in every case when the function ends.
-    """
-    global numThreads, threadException
-    try:
-        response = requests.get(url, timeout=timeout)
-        if response.ok:
-            with open(outFile, 'wb') as file:
-                file.write(response.content)
-        else:
-            print(f'WARNING: HTTP status {response.status_code} for {url}. Skipping download.', file=sys.stderr)
-        # Pause for a random time within the configured bounds before this thread ends
-        time.sleep(random.uniform(POST_DL_DELAY_MIN, POST_DL_DELAY_MAX))
-    except Exception as e:
-        print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
-        threadException = e
-    finally:
-        # Runs however the thread ends, so the counter cannot be left too high
-        numThreads -= 1
-# Manage downloading
-# Download threads started so far that may still be running. Throttling and
-# shutdown poll Thread.is_alive() on these instead of relying on numThreads,
-# because unsynchronised += / -= from several threads can lose updates.
-# numThreads is still incremented so that it stays balanced with the decrement
-# done by downloadImg().
-activeThreads: list[Thread] = []
-for eolId in eolIdList[nextIdx:]:
-    # Get image urls
-    ownerSet: set[str] = set() # Used to get images from different owners, for variety
-    exitLoop = False
-    query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
-    for contentId, url, licenseName, copyrightOwner in imgCur.execute(query, (eolId,)):
-        # NULL columns arrive as None. A row without URL or license cannot be
-        # downloaded or checked; a missing owner is treated as an empty name.
-        # NOTE(review): whether the table actually contains NULLs is not visible here
-        if url is None or licenseName is None:
-            continue
-        if copyrightOwner is None:
-            copyrightOwner = ''
-        if url.startswith('data/'):
-            url = 'https://content.eol.org/' + url
-        urlParts = urllib.parse.urlparse(url)
-        extension = os.path.splitext(urlParts.path)[1]
-        if len(extension) <= 1:
-            print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
-            continue
-        # Check image-quantity limit
-        if len(ownerSet) == MAX_IMGS_PER_ID:
-            break
-        # Check for skip conditions
-        if re.fullmatch(LICENSE_REGEX, licenseName) is None:
-            continue
-        if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
-            continue
-        if copyrightOwner in ownerSet:
-            continue
-        ownerSet.add(copyrightOwner)
-        # Determine output filename
-        outPath = f'{outDir}{eolId} {contentId}{extension}'
-        if os.path.exists(outPath):
-            print(f'WARNING: {outPath} already exists. Skipping download.')
-            continue
-        # Check thread limit
-        activeThreads = [t for t in activeThreads if t.is_alive()]
-        while len(activeThreads) >= MAX_THREADS:
-            time.sleep(1)
-            activeThreads = [t for t in activeThreads if t.is_alive()]
-        # Wait for threads after an interrupt or thread-exception
-        if interrupted or threadException is not None:
-            print('Waiting for existing threads to end')
-            # Poll rather than join() so a second SIGINT is still handled promptly
-            while any(t.is_alive() for t in activeThreads):
-                time.sleep(1)
-            exitLoop = True
-            break
-        # Perform download
-        print(f'Downloading image to {outPath}')
-        numThreads += 1
-        thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
-        thread.start()
-        activeThreads.append(thread)
-    if exitLoop:
-        break
-# Daemon threads are killed when the interpreter exits, so downloads that are
-# still in flight must be allowed to finish first or their files are truncated.
-if any(t.is_alive() for t in activeThreads):
-    print('Waiting for remaining downloads to finish')
-    while any(t.is_alive() for t in activeThreads):
-        time.sleep(1)
-# Close images-list db
-print('Finished downloading')
-imgDbCon.close()