aboutsummaryrefslogtreecommitdiff
path: root/backend/data/eol
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/eol')
-rw-r--r--backend/data/eol/README.md5
-rwxr-xr-xbackend/data/eol/downloadImgs.py152
-rwxr-xr-xbackend/data/eol/genImagesListDb.sh2
-rwxr-xr-xbackend/data/eol/reviewImgs.py48
4 files changed, 107 insertions, 100 deletions
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md
index fbb008d..8c527a8 100644
--- a/backend/data/eol/README.md
+++ b/backend/data/eol/README.md
@@ -11,9 +11,10 @@ This directory holds files obtained from/using the [Encyclopedia of Life](https:
Contains metadata for images from EOL.
- imagesList/ <br>
Extracted from imagesList.tgz.
+- genImagesListDb.sh <br>
+ Creates a database, and imports imagesList/*.csv files into it.
- imagesList.db <br>
- Contains data from imagesList/.
- Created by running genImagesListDb.sh, which simply imports csv files into a database. <br>
+ Created by running genImagesListDb.sh <br>
Tables: <br>
- `images`:
`content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT`
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py
index ac72ea1..96bc085 100755
--- a/backend/data/eol/downloadImgs.py
+++ b/backend/data/eol/downloadImgs.py
@@ -7,18 +7,24 @@ import time
from threading import Thread
import signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an image-list database,\n"
-usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n"
-usageInfo += "the form 'eolId1 contentId1.ext1'.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n"
-usageInfo += "The program can be re-run to continue downloading. It looks for\n"
-usageInfo += "existing downloaded files, and continues after the one with\n"
-usageInfo += "highest EOL ID.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of EOL IDs, downloads associated images from URLs in
+an image-list database. Uses multiple downloading threads.
+
+May obtain multiple images per ID. The images will get names
+with the form 'eolId1 contentId1.ext1'.
+
+SIGINT causes the program to finish ongoing downloads and exit.
+The program can be re-run to continue downloading. It looks for
+already-downloaded files, and continues after the one with
+highest EOL ID.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
+# In testing, this downloaded about 70k images, over a few days
imagesListDb = "imagesList.db"
def getInputEolIds():
@@ -30,44 +36,29 @@ def getInputEolIds():
dbCon.close()
return eolIds
outDir = "imgsForReview/"
-LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
+MAX_IMGS_PER_ID = 3
+MAX_THREADS = 5
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3
+LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
-# Get eol-ids from data db
print("Getting input EOL IDs")
eolIds = getInputEolIds()
-# Get eol-ids from images db
-print("Getting images-list-db EOL IDs")
+print("Getting EOL IDs to download for")
+# Get IDs from images-list db
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
imgListIds = set()
-for row in imgCur.execute("SELECT DISTINCT page_id FROM images"):
- imgListIds.add(row[0])
-# Get eol-id intersection, and sort into list
+for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
+ imgListIds.add(pageId)
+# Get set intersection, and sort into list
eolIds = eolIds.intersection(imgListIds)
eolIds = sorted(eolIds)
-print(f"Resulted in {len(eolIds)} EOL IDs")
+print(f"Result: {len(eolIds)} EOL IDs")
-MAX_IMGS_PER_ID = 3
-MAX_THREADS = 5
-numThreads = 0
-threadException = None # Used for ending main thread after a non-main thread exception
-def downloadImg(url, outFile):
- global numThreads, threadException
- try:
- data = requests.get(url)
- with open(outFile, 'wb') as file:
- file.write(data.content)
- time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
- except Exception as e:
- print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
- threadException = e
- numThreads -= 1
-# Create output directory if not present
+print("Checking output directory")
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Find next eol ID to download for
print("Finding next ID to download for")
nextIdx = 0
fileList = os.listdir(outDir)
@@ -78,7 +69,11 @@ if len(ids) > 0:
if nextIdx == len(eolIds):
print("No IDs left. Exiting...")
sys.exit(0)
-# Detect SIGINT signals
+
+print("Starting download threads")
+numThreads = 0
+threadException = None # Used for ending main thread after a non-main thread exception
+# Handle SIGINT signals
interrupted = False
oldHandler = None
def onSigint(sig, frame):
@@ -86,33 +81,27 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
+# Function for threads to execute
+def downloadImg(url, outFile):
+ global numThreads, threadException
+ try:
+ data = requests.get(url)
+ with open(outFile, 'wb') as file:
+ file.write(data.content)
+ time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
+ except Exception as e:
+ print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
+ threadException = e
+ numThreads -= 1
# Manage downloading
for idx in range(nextIdx, len(eolIds)):
eolId = eolIds[idx]
# Get image urls
imgDataList = []
ownerSet = set() # Used to get images from different owners, for variety
- for row in imgCur.execute(
- "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)):
- license = row[3]
- copyrightOwner = row[4]
- if re.fullmatch(LICENSE_REGEX, license) == None:
- continue
- if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic
- continue
- if copyrightOwner not in ownerSet:
- ownerSet.add(copyrightOwner)
- imgDataList.append(row)
- if len(ownerSet) == MAX_IMGS_PER_ID:
- break
- if len(imgDataList) == 0:
- continue
- # Determine output filenames
- outFiles = []
- urls = []
- for row in imgDataList:
- contentId = row[0]
- url = row[2]
+ exitLoop = False
+ query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
+ for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)):
if url.startswith("data/"):
url = "https://content.eol.org/" + url
urlParts = urllib.parse.urlparse(url)
@@ -120,28 +109,37 @@ for idx in range(nextIdx, len(eolIds)):
if len(extension) <= 1:
print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
continue
- outFiles.append(str(eolId) + " " + str(contentId) + extension)
- urls.append(url)
- # Start downloads
- exitLoop = False
- for i in range(len(outFiles)):
- outPath = outDir + outFiles[i]
- if not os.path.exists(outPath):
- # Enforce thread limit
- while numThreads == MAX_THREADS:
+ # Check image-quantity limit
+ if len(ownerSet) == MAX_IMGS_PER_ID:
+ break
+ # Check for skip conditions
+ if re.fullmatch(LICENSE_REGEX, license) == None:
+ continue
+ if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
+ continue
+ if copyrightOwner in ownerSet:
+ continue
+ ownerSet.add(copyrightOwner)
+ # Determine output filename
+ outPath = f"{outDir}{eolId} {contentId}{extension}"
+ if os.path.exists(outPath):
+ print(f"WARNING: {outPath} already exists. Skipping download.")
+ continue
+ # Check thread limit
+ while numThreads == MAX_THREADS:
+ time.sleep(1)
+ # Wait for threads after an interrupt or thread-exception
+ if interrupted or threadException != None:
+ print("Waiting for existing threads to end")
+ while numThreads > 0:
time.sleep(1)
- # Wait for threads after an interrupt or thread-exception
- if interrupted or threadException != None:
- print("Waiting for existing threads to end")
- while numThreads > 0:
- time.sleep(1)
- exitLoop = True
- break
- print(f"Downloading image to {outPath}")
- # Perform download
- numThreads += 1
- thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True)
- thread.start()
+ exitLoop = True
+ break
+ # Perform download
+ print(f"Downloading image to {outPath}")
+ numThreads += 1
+ thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
+ thread.start()
if exitLoop:
break
# Close images-list db
diff --git a/backend/data/eol/genImagesListDb.sh b/backend/data/eol/genImagesListDb.sh
index 3a8ced7..87dd840 100755
--- a/backend/data/eol/genImagesListDb.sh
+++ b/backend/data/eol/genImagesListDb.sh
@@ -1,7 +1,9 @@
#!/bin/bash
set -e
+# Combine CSV files into one, dropping only the first line (the first file's header)
cat imagesList/media_*_{1..58}.csv | tail -n +2 > imagesList.csv
+# Create database, and import the CSV file
sqlite3 imagesList.db <<END
CREATE TABLE images (
content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT);
diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py
index 5290f9e..ecdf7ab 100755
--- a/backend/data/eol/reviewImgs.py
+++ b/backend/data/eol/reviewImgs.py
@@ -7,11 +7,14 @@ from tkinter import ttk
import PIL
from PIL import ImageTk, Image, ImageOps
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Provides a GUI for reviewing images. Looks in a for-review directory for\n"
-usageInfo += "images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to\n"
-usageInfo += "choose an image to keep, or reject all. Also provides image rotation.\n"
-usageInfo += "Chosen images are placed in another directory, and rejected ones are deleted.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Provides a GUI for reviewing images. Looks in a for-review directory for
+images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to
+choose an image to keep, or reject all. Also provides image rotation.
+Chosen images are placed in another directory, and rejected ones are deleted.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -21,6 +24,7 @@ outDir = "imgs/"
extraInfoDbCon = sqlite3.connect("../data.db")
extraInfoDbCur = extraInfoDbCon.cursor()
def getExtraInfo(eolId):
+ global extraInfoDbCur
query = "SELECT names.alt_name FROM" \
" names INNER JOIN eol_ids ON eol_ids.name = names.name" \
" WHERE id = ? and pref_alt = 1"
@@ -31,21 +35,21 @@ def getExtraInfo(eolId):
return f"Reviewing EOL ID {eolId}"
IMG_DISPLAY_SZ = 400
MAX_IMGS_PER_ID = 3
-PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
+IMG_BG_COLOR = (88, 28, 135)
+PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
-# Create output directory if not present
+print("Checking output directory")
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Get images for review
-print("Reading input image list")
+print("Getting input image list")
imgList = os.listdir(imgDir)
imgList.sort(key=lambda s: int(s.split(" ")[0]))
if len(imgList) == 0:
- print("No input images found", file=sys.stderr)
- sys.exit(1)
+ print("No input images found")
+ sys.exit(0)
class EolImgReviewer:
- """ Provides the GUI for reviewing images """
+ " Provides the GUI for reviewing images "
def __init__(self, root, imgList):
self.root = root
root.title("EOL Image Reviewer")
@@ -68,7 +72,7 @@ class EolImgReviewer:
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
- # Add bindings
+ # Add keyboard bindings
root.bind("<q>", self.quit)
root.bind("<Key-j>", lambda evt: self.accept(0))
root.bind("<Key-k>", lambda evt: self.accept(1))
@@ -87,11 +91,11 @@ class EolImgReviewer:
self.nextImgNames = []
self.rotations = []
self.getNextImgs()
- # For more info
+ # For displaying extra info
self.numReviewed = 0
self.startTime = time.time()
def getNextImgs(self):
- """ Updates display with new images to review, or ends program """
+ " Updates display with new images to review, or ends program "
# Gather names of next images to review
for i in range(MAX_IMGS_PER_ID):
if self.imgListIdx == len(self.imgList):
@@ -123,7 +127,7 @@ class EolImgReviewer:
del self.nextImgNames[idx]
del self.rotations[idx]
continue
- self.imgs[idx] = self.resizeForDisplay(img)
+ self.imgs[idx] = self.resizeImgForDisplay(img)
else:
self.imgs[idx] = PLACEHOLDER_IMG
self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
@@ -140,7 +144,7 @@ class EolImgReviewer:
title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})"
self.root.title(title)
def accept(self, imgIdx):
- """ React to a user selecting an image """
+ " React to a user selecting an image "
if imgIdx >= len(self.nextImgNames):
print("Invalid selection")
return
@@ -159,19 +163,20 @@ class EolImgReviewer:
self.numReviewed += 1
self.getNextImgs()
def reject(self):
- """ React to a user rejecting all images of a set """
+ " React to a user rejecting all images of a set "
for i in range(len(self.nextImgNames)):
os.remove(imgDir + self.nextImgNames[i])
self.numReviewed += 1
self.getNextImgs()
def rotate(self, imgIdx, anticlockwise = False):
- """ Respond to a user rotating an image """
+ " Respond to a user rotating an image "
deg = -90 if not anticlockwise else 90
self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
def quit(self, e = None):
+ global extraInfoDbCon
print(f"Number reviewed: {self.numReviewed}")
timeElapsed = time.time() - self.startTime
print(f"Time elapsed: {timeElapsed:.2f} seconds")
@@ -179,8 +184,8 @@ class EolImgReviewer:
print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
extraInfoDbCon.close()
self.root.destroy()
- def resizeForDisplay(self, img):
- """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """
+ def resizeImgForDisplay(self, img):
+ " Returns a copy of an image, shrunk to fit in its frame (keeps aspect ratio), and with a background "
if max(img.width, img.height) > IMG_DISPLAY_SZ:
if (img.width > img.height):
newHeight = int(img.height * IMG_DISPLAY_SZ/img.width)
@@ -194,6 +199,7 @@ class EolImgReviewer:
int((IMG_DISPLAY_SZ - img.height) / 2)))
return bgImg
# Create GUI and defer control
+print("Starting GUI")
root = tki.Tk()
EolImgReviewer(root, imgList)
root.mainloop()