aboutsummaryrefslogtreecommitdiff
path: root/backend/data/eol
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/eol')
-rwxr-xr-xbackend/data/eol/downloadImgs.py149
-rwxr-xr-xbackend/data/eol/reviewImgs.py199
2 files changed, 348 insertions, 0 deletions
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py
new file mode 100755
index 0000000..ac72ea1
--- /dev/null
+++ b/backend/data/eol/downloadImgs.py
@@ -0,0 +1,149 @@
+#!/usr/bin/python3
+
+import sys, re, os, random
+import sqlite3
+import urllib.parse, requests
+import time
+from threading import Thread
+import signal
+
# Help text shown when the script is invoked incorrectly
usageInfo = "".join([
    f"usage: {sys.argv[0]}\n",
    "Downloads images from URLs specified in an image-list database,\n",
    "for a specified set of EOL IDs. Downloaded images get names of\n",
    "the form 'eolId1 contentId1.ext1'.\n",
    "\n",
    "SIGINT causes the program to finish ongoing downloads and exit.\n",
    "The program can be re-run to continue downloading. It looks for\n",
    "existing downloaded files, and continues after the one with\n",
    "highest EOL ID.\n",
])
# The script takes no arguments, so any argument gets the usage text and a failure exit
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

# Path of the sqlite database that lists the candidate images
imagesListDb = "imagesList.db"
def getInputEolIds():
    """ Returns the set of values in the 'id' column of the 'eol_ids' table of ../data.db """
    dbCon = sqlite3.connect("../data.db")
    try:
        # A comprehension with its own variable name avoids shadowing the builtin 'id'
        return {eolId for (eolId,) in dbCon.execute("SELECT id FROM eol_ids")}
    finally:
        # Close the connection even if the query raises
        dbCon.close()
# Directory that receives the downloaded images
outDir = "imgsForReview/"
# Licenses regarded as acceptable
LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
# Bounds, in seconds, of the pause made after a download before starting another (for each thread)
POST_DL_DELAY_MIN = 2
POST_DL_DELAY_MAX = 3

# Collect the EOL IDs held by the data db
print("Getting input EOL IDs")
eolIds = getInputEolIds()
# Collect the EOL IDs that have entries in the images db
print("Getting images-list-db EOL IDs")
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
imgListIds = {pageId for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images")}
# Keep only the IDs found in both places, as an ascending list
eolIds = sorted(eolIds & imgListIds)
print(f"Resulted in {len(eolIds)} EOL IDs")

MAX_IMGS_PER_ID = 3 # Most images to download for a single EOL ID
MAX_THREADS = 5 # Most download threads allowed to run at once
numThreads = 0 # Number of download threads currently running
threadException = None # Used for ending main thread after a non-main thread exception
def downloadImg(url, outFile, timeout=60):
    """ Downloads the content at 'url' into 'outFile', then pauses for a random
    time between POST_DL_DELAY_MIN and POST_DL_DELAY_MAX seconds.

    Intended to run in a non-main thread: any failure is printed to stderr and
    stored in the global 'threadException', and the global 'numThreads' is
    decremented when the function ends, whether or not it succeeded.

    'timeout' is passed to requests.get(). Without one, a server that stops
    responding would block this thread (and its slot in the thread limit)
    indefinitely. """
    global numThreads, threadException
    try:
        data = requests.get(url, timeout=timeout)
        with open(outFile, 'wb') as file:
            file.write(data.content)
        time.sleep(random.uniform(POST_DL_DELAY_MIN, POST_DL_DELAY_MAX))
    except Exception as e:
        print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
        threadException = e
    finally:
        # NOTE(review): this update of a shared global is not synchronised with
        # the increment done by the main thread - consider guarding both with a lock
        numThreads -= 1
# Main part of the script: works out where to resume, then starts a download
# thread for each image that is wanted and not already present

# Create output directory if not present
os.makedirs(outDir, exist_ok=True)
# Find next eol ID to download for
print("Finding next ID to download for")
nextIdx = 0
# Only look at files named like 'eolId contentId.ext', so that a stray file
# in the directory (eg: a hidden file) doesn't abort the program
downloadedIds = []
for filename in os.listdir(outDir):
    nameMatch = re.match(r"(\d+) ", filename)
    if nameMatch is not None:
        downloadedIds.append(int(nameMatch.group(1)))
if downloadedIds:
    maxDownloadedId = max(downloadedIds)
    # eolIds is sorted, so the number of IDs <= the highest downloaded ID is the
    # index to resume from. Unlike list.index(), this also works when that ID
    # is no longer among the input IDs.
    nextIdx = sum(1 for eolId in eolIds if eolId <= maxDownloadedId)
if nextIdx == len(eolIds):
    print("No IDs left. Exiting...")
    sys.exit(0)
# Detect SIGINT signals
interrupted = False
oldHandler = None
def onSigint(sig, frame):
    """ Records the interrupt, and restores the old handler so a second SIGINT acts as usual """
    global interrupted
    interrupted = True
    signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
# Manage downloading
IMGS_QUERY = "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
for eolId in eolIds[nextIdx:]:
    # Get image urls
    imgDataList = []
    ownerSet = set() # Used to get images from different owners, for variety
    for row in imgCur.execute(IMGS_QUERY, (eolId,)):
        licenseName = row[3]
        copyrightOwner = row[4]
        # Skip images without an acceptable license (a missing license can't be checked)
        if licenseName is None or re.fullmatch(LICENSE_REGEX, licenseName) is None:
            continue
        # Ignore certain copyrightOwner fields that seem long and problematic
        if copyrightOwner is not None and len(copyrightOwner) > 100:
            continue
        if copyrightOwner not in ownerSet:
            ownerSet.add(copyrightOwner)
            imgDataList.append(row)
            if len(ownerSet) == MAX_IMGS_PER_ID:
                break
    if not imgDataList:
        continue
    # Determine output filenames
    downloads = [] # Holds (url, output filename) pairs
    for row in imgDataList:
        contentId = row[0]
        url = row[2] # NOTE(review): assumes copy_url is never NULL - confirm against the db
        if url.startswith("data/"):
            url = "https://content.eol.org/" + url
        urlParts = urllib.parse.urlparse(url)
        extension = os.path.splitext(urlParts.path)[1]
        if len(extension) <= 1:
            print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
            continue
        downloads.append((url, str(eolId) + " " + str(contentId) + extension))
    # Start downloads
    exitLoop = False
    for (url, outFile) in downloads:
        outPath = outDir + outFile
        if os.path.exists(outPath):
            continue
        # Enforce thread limit
        while numThreads >= MAX_THREADS:
            time.sleep(1)
        # Wait for threads after an interrupt or thread-exception
        if interrupted or threadException is not None:
            print("Waiting for existing threads to end")
            while numThreads > 0:
                time.sleep(1)
            exitLoop = True
            break
        print(f"Downloading image to {outPath}")
        # Perform download
        numThreads += 1
        thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
        thread.start()
    if exitLoop:
        break
# Wait for downloads still in progress. The threads are daemon threads, so
# they would otherwise be killed mid-download when the program exits.
if numThreads > 0:
    print("Waiting for existing threads to end")
    while numThreads > 0:
        time.sleep(1)
# Close images-list db
print("Finished downloading")
imgDbCon.close()
diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py
new file mode 100755
index 0000000..ba313d9
--- /dev/null
+++ b/backend/data/eol/reviewImgs.py
@@ -0,0 +1,199 @@
+#!/usr/bin/python3
+
+import sys, re, os, time
+import sqlite3
+import tkinter as tki
+from tkinter import ttk
+import PIL
+from PIL import ImageTk, Image, ImageOps
+
# Help text shown when the script is invoked incorrectly
usageInfo = (
    f"usage: {sys.argv[0]}\n"
    "Provides a GUI for reviewing images. Looks in a for-review directory for\n"
    "images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to\n"
    "choose an image to keep, or reject all. Also provides image rotation.\n"
    "Chosen images are placed in another directory, and rejected ones are deleted.\n"
)
# The script takes no arguments, so any argument gets the usage text and a failure exit
if sys.argv[1:]:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

imgDir = "imgsForReview/" # Directory holding the images to review
outDir = "imgsReviewed/" # Directory that receives the accepted images
# Connection and cursor used for looking up names to show alongside EOL IDs
extraInfoDbCon = sqlite3.connect("../data.db")
extraInfoDbCur = extraInfoDbCon.cursor()
def getExtraInfo(eolId):
    """ Returns a descriptive string for an EOL ID, which includes the alt_name
    of a 'names' row having pref_alt = 1, if the data db has one for that ID """
    query = "SELECT names.alt_name FROM" \
        " names INNER JOIN eol_ids ON eol_ids.name = names.name" \
        " WHERE id = ? and pref_alt = 1"
    row = extraInfoDbCur.execute(query, (eolId,)).fetchone()
    if row is None:
        return f"Reviewing EOL ID {eolId}"
    # Interpolate the name itself (not the literal text 'row[0]')
    return f'Reviewing EOL ID {eolId}, aka "{row[0]}"'
IMG_DISPLAY_SZ = 400 # Width and height, in pixels, of each displayed image
MAX_IMGS_PER_ID = 3 # Number of images displayed at once
# Plain-coloured square used as a background, and displayed where there is no image
PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))

# Create output directory if not present
os.makedirs(outDir, exist_ok=True)
# Get images for review
print("Reading input image list")
# Only keep files named like 'eolId contentId.ext'. Any other file (eg: a hidden
# file) would make the sort key below, and later filename parsing, raise an exception.
imgList = [name for name in os.listdir(imgDir) if re.match(r"\d+ \d+", name) is not None]
# Sort by EOL ID, so images for the same ID are adjacent
imgList.sort(key=lambda s: int(s.split(" ")[0]))
if len(imgList) == 0:
    print("No input images found", file=sys.stderr)
    sys.exit(1)
+
class EolImgReviewer:
    """ Provides the GUI for reviewing images

    Images are shown in groups that share an EOL ID, with at most MAX_IMGS_PER_ID
    images per group. Key bindings:
        j, k, l: accept the 1st, 2nd, or 3rd image (the others get deleted)
        i: reject (delete) all images of the group
        a, s, d: rotate the 1st, 2nd, or 3rd image clockwise
        A, S, D: rotate the 1st, 2nd, or 3rd image anticlockwise
        q: quit """
    def __init__(self, root, imgList):
        """ Builds the widgets within 'root', and displays the first images of 'imgList'
        (a list of filenames within imgDir, with names for the same EOL ID adjacent) """
        self.root = root
        root.title("EOL Image Reviewer")
        # Setup main frame
        mainFrame = ttk.Frame(root, padding="5 5 5 5")
        mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
        root.columnconfigure(0, weight=1)
        root.rowconfigure(0, weight=1)
        # Set up images-to-be-reviewed frames
        self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
        self.photoImgs = [ImageTk.PhotoImage(img) for img in self.imgs] # Image objects usable by tkinter
            # These need a persistent reference for some reason (doesn't display otherwise)
        self.labels = []
        for i in range(MAX_IMGS_PER_ID):
            frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ)
            frame.grid(column=i, row=0)
            label = ttk.Label(frame, image=self.photoImgs[i])
            label.grid(column=0, row=0)
            self.labels.append(label)
        # Add padding
        for child in mainFrame.winfo_children():
            child.grid_configure(padx=5, pady=5)
        # Add bindings
        root.bind("<q>", self.quit)
        root.bind("<Key-j>", lambda evt: self.accept(0))
        root.bind("<Key-k>", lambda evt: self.accept(1))
        root.bind("<Key-l>", lambda evt: self.accept(2))
        root.bind("<Key-i>", lambda evt: self.reject())
        root.bind("<Key-a>", lambda evt: self.rotate(0))
        root.bind("<Key-s>", lambda evt: self.rotate(1))
        root.bind("<Key-d>", lambda evt: self.rotate(2))
        root.bind("<Key-A>", lambda evt: self.rotate(0, True))
        root.bind("<Key-S>", lambda evt: self.rotate(1, True))
        root.bind("<Key-D>", lambda evt: self.rotate(2, True))
        # For more info
        # (These are set before the first getNextImgs() call, as that may call quit(), which reads them)
        self.numReviewed = 0
        self.startTime = time.time()
        # Initialise images to review
        self.imgList = imgList
        self.imgListIdx = 0 # Index of the next filename in imgList to look at
        self.nextEolId = 0 # EOL ID of the images being displayed
        self.nextImgNames = [] # Filenames of the images being displayed
        self.rotations = [] # Rotation to apply to each displayed image, in degrees anticlockwise
        self.getNextImgs()
    def getNextImgs(self):
        """ Updates display with new images to review, or ends program """
        # Loop (instead of recursing) until a group with a recognisable image is
        # found, so a long run of bad files can't exceed the recursion limit
        while True:
            if not self._gatherNextGroup():
                self.quit()
                return
            self._displayGroup()
            if len(self.nextImgNames) > 0:
                break
        # Update title
        firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
        lastImgIdx = self.imgListIdx
        title = getExtraInfo(self.nextEolId)
        title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})"
        self.root.title(title)
    def _gatherNextGroup(self):
        """ Gathers names of the next images to review: a run of up to MAX_IMGS_PER_ID
        filenames with the same EOL ID. Returns False if no filenames are left. """
        if self.imgListIdx == len(self.imgList):
            return False
        self.nextImgNames = []
        self.rotations = []
        while self.imgListIdx < len(self.imgList) and len(self.nextImgNames) < MAX_IMGS_PER_ID:
            imgName = self.imgList[self.imgListIdx]
            eolId = int(re.match(r"(\d+) (\d+)", imgName).group(1))
            if len(self.nextImgNames) == 0:
                self.nextEolId = eolId
            elif eolId != self.nextEolId:
                break
            self.nextImgNames.append(imgName)
            self.rotations.append(0)
            self.imgListIdx += 1
        return True
    def _displayGroup(self):
        """ Displays the gathered images, using the placeholder for unused slots.
        Files not recognised as images are deleted, and removed from the group. """
        idx = 0
        while idx < MAX_IMGS_PER_ID:
            if idx < len(self.nextImgNames):
                imgPath = imgDir + self.nextImgNames[idx]
                try:
                    # exif_transpose() returns a separate image, so the file can be closed here
                    with Image.open(imgPath) as rawImg:
                        img = ImageOps.exif_transpose(rawImg)
                except PIL.UnidentifiedImageError:
                    os.remove(imgPath)
                    del self.nextImgNames[idx]
                    del self.rotations[idx]
                    continue # Retry this slot with the image that moved into it
                self.imgs[idx] = self.resizeForDisplay(img)
            else:
                self.imgs[idx] = PLACEHOLDER_IMG
            self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
            self.labels[idx].config(image=self.photoImgs[idx])
            idx += 1
    def accept(self, imgIdx):
        """ React to a user selecting an image """
        if imgIdx >= len(self.nextImgNames):
            print("Invalid selection")
            return
        for (i, imgName) in enumerate(self.nextImgNames):
            inFile = imgDir + imgName
            if i == imgIdx: # Save accepted image to the output directory, rotating if needed
                # The input file is closed before being removed below
                with Image.open(inFile) as rawImg:
                    img = ImageOps.exif_transpose(rawImg)
                if self.rotations[i] != 0:
                    img = img.rotate(self.rotations[i], expand=True)
                img.save(outDir + imgName)
            # Both the accepted and the non-accepted images leave the for-review directory
            os.remove(inFile)
        self.numReviewed += 1
        self.getNextImgs()
    def reject(self):
        """ React to a user rejecting all images of a set """
        for imgName in self.nextImgNames:
            os.remove(imgDir + imgName)
        self.numReviewed += 1
        self.getNextImgs()
    def rotate(self, imgIdx, anticlockwise = False):
        """ Respond to a user rotating an image """
        # Slots past the end of the group only hold the placeholder, and have no rotation entry
        if imgIdx >= len(self.nextImgNames):
            print("Invalid selection")
            return
        deg = 90 if anticlockwise else -90 # Image.rotate() takes degrees anticlockwise
        self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
        self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
        self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
        self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
    def quit(self, e = None):
        """ Prints review statistics, closes the extra-info db, and destroys the window
        ('e' is the event object passed when this is invoked via a key binding) """
        print(f"Number reviewed: {self.numReviewed}")
        timeElapsed = time.time() - self.startTime
        print(f"Time elapsed: {timeElapsed:.2f} seconds")
        if self.numReviewed > 0:
            print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
        extraInfoDbCon.close()
        self.root.destroy()
    def resizeForDisplay(self, img):
        """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """
        if max(img.width, img.height) > IMG_DISPLAY_SZ:
            # The shorter side is kept at 1 or more, as resizing to a zero size is an error
            if img.width > img.height:
                newHeight = max(1, int(img.height * IMG_DISPLAY_SZ / img.width))
                img = img.resize((IMG_DISPLAY_SZ, newHeight))
            else:
                newWidth = max(1, int(img.width * IMG_DISPLAY_SZ / img.height))
                img = img.resize((newWidth, IMG_DISPLAY_SZ))
        # Centre the image on a copy of the placeholder
        bgImg = PLACEHOLDER_IMG.copy()
        bgImg.paste(img, box=(
            (IMG_DISPLAY_SZ - img.width) // 2,
            (IMG_DISPLAY_SZ - img.height) // 2))
        return bgImg
# Build the main window, set up the reviewer within it, and run the event loop
root = tki.Tk()
reviewer = EolImgReviewer(root, imgList)
root.mainloop()