aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/eol
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com>, 2022-09-07 11:37:37 +1000
committer: Terry Truong <terry06890@gmail.com>, 2022-09-07 11:37:37 +1000
commit: daccbbd9c73a5292ea9d6746560d7009e5aa666d (patch)
tree: 9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/eol
parent: 1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff)
Add python type annotations
Also use consistent quote symbols. Also use 'is None' instead of '== None'. Also use 'if list1' instead of 'if len(list1) > 0'.
Diffstat (limited to 'backend/tolData/eol')
-rwxr-xr-x backend/tolData/eol/downloadImgs.py 76
-rwxr-xr-x backend/tolData/eol/genImagesListDb.py 28
-rwxr-xr-x backend/tolData/eol/reviewImgs.py 92
3 files changed, 98 insertions, 98 deletions
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
index 4d658e7..5213aaf 100755
--- a/backend/tolData/eol/downloadImgs.py
+++ b/backend/tolData/eol/downloadImgs.py
@@ -22,53 +22,53 @@ highest EOL ID.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-imagesListDb = "imagesList.db"
-def getInputEolIds():
- eolIds = set()
- dbCon = sqlite3.connect("../data.db")
+imagesListDb = 'imagesList.db'
+def getInputEolIds() -> set[int]:
+ eolIds: set[int] = set()
+ dbCon = sqlite3.connect('../data.db')
dbCur = dbCon.cursor()
- for (id,) in dbCur.execute("SELECT id FROM eol_ids"):
+ for (id,) in dbCur.execute('SELECT id FROM eol_ids'):
eolIds.add(id)
dbCon.close()
return eolIds
-outDir = "imgsForReview/"
+outDir = 'imgsForReview/'
MAX_IMGS_PER_ID = 3
MAX_THREADS = 5
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3
-LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
+LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain'
-print("Getting input EOL IDs")
+print('Getting input EOL IDs')
eolIds = getInputEolIds()
-print("Getting EOL IDs to download for")
+print('Getting EOL IDs to download for')
# Get IDs from images-list db
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
-imgListIds = set()
-for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
+imgListIds: set[int] = set()
+for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'):
imgListIds.add(pageId)
# Get set intersection, and sort into list
eolIds = eolIds.intersection(imgListIds)
-eolIds = sorted(eolIds)
-print(f"Result: {len(eolIds)} EOL IDs")
+eolIdList = sorted(eolIds)
+print(f'Result: {len(eolIdList)} EOL IDs')
-print("Checking output directory")
+print('Checking output directory')
if not os.path.exists(outDir):
os.mkdir(outDir)
-print("Finding next ID to download for")
+print('Finding next ID to download for')
nextIdx = 0
fileList = os.listdir(outDir)
-ids = [int(filename.split(" ")[0]) for filename in fileList]
-if len(ids) > 0:
+ids = [int(filename.split(' ')[0]) for filename in fileList]
+if ids:
ids.sort()
- nextIdx = eolIds.index(ids[-1]) + 1
-if nextIdx == len(eolIds):
- print("No IDs left. Exiting...")
+ nextIdx = eolIdList.index(ids[-1]) + 1
+if nextIdx == len(eolIdList):
+ print('No IDs left. Exiting...')
sys.exit(0)
-print("Starting download threads")
+print('Starting download threads')
numThreads = 0
-threadException = None # Used for ending main thread after a non-main thread exception
+threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
# Handle SIGINT signals
interrupted = False
oldHandler = None
@@ -86,29 +86,29 @@ def downloadImg(url, outFile):
file.write(data.content)
time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
except Exception as e:
- print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
+ print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
threadException = e
numThreads -= 1
# Manage downloading
-for idx in range(nextIdx, len(eolIds)):
- eolId = eolIds[idx]
+for idx in range(nextIdx, len(eolIdList)):
+ eolId = eolIdList[idx]
# Get image urls
- ownerSet = set() # Used to get images from different owners, for variety
+ ownerSet: set[str] = set() # Used to get images from different owners, for variety
exitLoop = False
- query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
- for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)):
- if url.startswith("data/"):
- url = "https://content.eol.org/" + url
+ query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
+ for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
+ if url.startswith('data/'):
+ url = 'https://content.eol.org/' + url
urlParts = urllib.parse.urlparse(url)
extension = os.path.splitext(urlParts.path)[1]
if len(extension) <= 1:
- print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
+ print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
continue
# Check image-quantity limit
if len(ownerSet) == MAX_IMGS_PER_ID:
break
# Check for skip conditions
- if re.fullmatch(LICENSE_REGEX, license) == None:
+ if re.fullmatch(LICENSE_REGEX, license) is None:
continue
if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
continue
@@ -116,27 +116,27 @@ for idx in range(nextIdx, len(eolIds)):
continue
ownerSet.add(copyrightOwner)
# Determine output filename
- outPath = f"{outDir}{eolId} {contentId}{extension}"
+ outPath = f'{outDir}{eolId} {contentId}{extension}'
if os.path.exists(outPath):
- print(f"WARNING: {outPath} already exists. Skipping download.")
+ print(f'WARNING: {outPath} already exists. Skipping download.')
continue
# Check thread limit
while numThreads == MAX_THREADS:
time.sleep(1)
# Wait for threads after an interrupt or thread-exception
- if interrupted or threadException != None:
- print("Waiting for existing threads to end")
+ if interrupted or threadException is not None:
+ print('Waiting for existing threads to end')
while numThreads > 0:
time.sleep(1)
exitLoop = True
break
# Perform download
- print(f"Downloading image to {outPath}")
+ print(f'Downloading image to {outPath}')
numThreads += 1
thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
thread.start()
if exitLoop:
break
# Close images-list db
-print("Finished downloading")
+print('Finished downloading')
imgDbCon.close()
diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py
index 4dcb6d9..808292d 100755
--- a/backend/tolData/eol/genImagesListDb.py
+++ b/backend/tolData/eol/genImagesListDb.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, os, re
+import os, re
import csv
import sqlite3
@@ -10,25 +10,25 @@ Generates a sqlite db from a directory of CSV files holding EOL image data
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-imagesListDir = "imagesList/"
-dbFile = "imagesList.db"
+imagesListDir = 'imagesList/'
+dbFile = 'imagesList.db'
-print("Creating database")
+print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE images" \
- " (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)")
-dbCur.execute("CREATE INDEX images_pid_idx ON images(page_id)")
-print("Reading CSV files")
+dbCur.execute('CREATE TABLE images' \
+ ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)')
+dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
+print('Reading CSV files')
csvFilenames = os.listdir(imagesListDir)
for filename in csvFilenames:
- print(f"Processing {imagesListDir}{filename}")
- with open(imagesListDir + filename, newline="") as file:
- for (contentId, pageId, sourceUrl, copyUrl, license, owner) in csv.reader(file):
- if re.match(r"^[a-zA-Z]", contentId): # Skip header line
+ print(f'Processing {imagesListDir}{filename}')
+ with open(imagesListDir + filename, newline='') as file:
+ for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file):
+ if re.match(r'^[a-zA-Z]', contentId): # Skip header line
continue
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
(int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
-print("Closing database")
+print('Closing database')
dbCon.commit()
dbCon.close()
diff --git a/backend/tolData/eol/reviewImgs.py b/backend/tolData/eol/reviewImgs.py
index 979ed0e..e44fb3d 100755
--- a/backend/tolData/eol/reviewImgs.py
+++ b/backend/tolData/eol/reviewImgs.py
@@ -16,42 +16,42 @@ Chosen images are placed in another directory, and rejected ones are deleted.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-imgDir = "imgsForReview/"
-outDir = "imgs/"
-extraInfoDbCon = sqlite3.connect("../data.db")
+imgDir = 'imgsForReview/'
+outDir = 'imgs/'
+extraInfoDbCon = sqlite3.connect('../data.db')
extraInfoDbCur = extraInfoDbCon.cursor()
-def getExtraInfo(eolId):
+def getExtraInfo(eolId: int) -> str:
global extraInfoDbCur
- query = "SELECT names.alt_name FROM" \
- " names INNER JOIN eol_ids ON eol_ids.name = names.name" \
- " WHERE id = ? and pref_alt = 1"
+ query = 'SELECT names.alt_name FROM' \
+ ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \
+ ' WHERE id = ? and pref_alt = 1'
row = extraInfoDbCur.execute(query, (eolId,)).fetchone()
- if row != None:
- return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\""
+ if row is not None:
+ return f'Reviewing EOL ID {eolId}, aka "{row[0]}"'
else:
- return f"Reviewing EOL ID {eolId}"
+ return f'Reviewing EOL ID {eolId}'
IMG_DISPLAY_SZ = 400
MAX_IMGS_PER_ID = 3
IMG_BG_COLOR = (88, 28, 135)
-PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
+PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
-print("Checking output directory")
+print('Checking output directory')
if not os.path.exists(outDir):
os.mkdir(outDir)
-print("Getting input image list")
+print('Getting input image list')
imgList = os.listdir(imgDir)
-imgList.sort(key=lambda s: int(s.split(" ")[0]))
-if len(imgList) == 0:
- print("No input images found")
+imgList.sort(key=lambda s: int(s.split(' ')[0]))
+if not imgList:
+ print('No input images found')
sys.exit(0)
class EolImgReviewer:
- " Provides the GUI for reviewing images "
+ """ Provides the GUI for reviewing images """
def __init__(self, root, imgList):
self.root = root
- root.title("EOL Image Reviewer")
+ root.title('EOL Image Reviewer')
# Setup main frame
- mainFrame = ttk.Frame(root, padding="5 5 5 5")
+ mainFrame = ttk.Frame(root, padding='5 5 5 5')
mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
@@ -59,7 +59,7 @@ class EolImgReviewer:
self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter
# These need a persistent reference for some reason (doesn't display otherwise)
- self.labels = []
+ self.labels: list[ttk.Label] = []
for i in range(MAX_IMGS_PER_ID):
frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ)
frame.grid(column=i, row=0)
@@ -70,29 +70,29 @@ class EolImgReviewer:
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
# Add keyboard bindings
- root.bind("<q>", self.quit)
- root.bind("<Key-j>", lambda evt: self.accept(0))
- root.bind("<Key-k>", lambda evt: self.accept(1))
- root.bind("<Key-l>", lambda evt: self.accept(2))
- root.bind("<Key-i>", lambda evt: self.reject())
- root.bind("<Key-a>", lambda evt: self.rotate(0))
- root.bind("<Key-s>", lambda evt: self.rotate(1))
- root.bind("<Key-d>", lambda evt: self.rotate(2))
- root.bind("<Key-A>", lambda evt: self.rotate(0, True))
- root.bind("<Key-S>", lambda evt: self.rotate(1, True))
- root.bind("<Key-D>", lambda evt: self.rotate(2, True))
+ root.bind('<q>', self.quit)
+ root.bind('<Key-j>', lambda evt: self.accept(0))
+ root.bind('<Key-k>', lambda evt: self.accept(1))
+ root.bind('<Key-l>', lambda evt: self.accept(2))
+ root.bind('<Key-i>', lambda evt: self.reject())
+ root.bind('<Key-a>', lambda evt: self.rotate(0))
+ root.bind('<Key-s>', lambda evt: self.rotate(1))
+ root.bind('<Key-d>', lambda evt: self.rotate(2))
+ root.bind('<Key-A>', lambda evt: self.rotate(0, True))
+ root.bind('<Key-S>', lambda evt: self.rotate(1, True))
+ root.bind('<Key-D>', lambda evt: self.rotate(2, True))
# Initialise images to review
self.imgList = imgList
self.imgListIdx = 0
self.nextEolId = 0
- self.nextImgNames = []
- self.rotations = []
+ self.nextImgNames: list[str] = []
+ self.rotations: list[int] = []
self.getNextImgs()
# For displaying extra info
self.numReviewed = 0
self.startTime = time.time()
def getNextImgs(self):
- " Updates display with new images to review, or ends program "
+ """ Updates display with new images to review, or ends program """
# Gather names of next images to review
for i in range(MAX_IMGS_PER_ID):
if self.imgListIdx == len(self.imgList):
@@ -101,7 +101,7 @@ class EolImgReviewer:
return
break
imgName = self.imgList[self.imgListIdx]
- eolId = int(re.match(r"(\d+) (\d+)", imgName).group(1))
+ eolId = int(re.match(r'(\d+) (\d+)', imgName).group(1))
if i == 0:
self.nextEolId = eolId
self.nextImgNames = [imgName]
@@ -131,19 +131,19 @@ class EolImgReviewer:
self.labels[idx].config(image=self.photoImgs[idx])
idx += 1
# Restart if all image files non-recognisable
- if len(self.nextImgNames) == 0:
+ if not self.nextImgNames:
self.getNextImgs()
return
# Update title
firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
lastImgIdx = self.imgListIdx
title = getExtraInfo(self.nextEolId)
- title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})"
+ title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})'
self.root.title(title)
def accept(self, imgIdx):
- " React to a user selecting an image "
+ """ React to a user selecting an image """
if imgIdx >= len(self.nextImgNames):
- print("Invalid selection")
+ print('Invalid selection')
return
for i in range(len(self.nextImgNames)):
inFile = imgDir + self.nextImgNames[i]
@@ -160,13 +160,13 @@ class EolImgReviewer:
self.numReviewed += 1
self.getNextImgs()
def reject(self):
- " React to a user rejecting all images of a set "
+ """ React to a user rejecting all images of a set """
for i in range(len(self.nextImgNames)):
os.remove(imgDir + self.nextImgNames[i])
self.numReviewed += 1
self.getNextImgs()
def rotate(self, imgIdx, anticlockwise = False):
- " Respond to a user rotating an image "
+ """ Respond to a user rotating an image """
deg = -90 if not anticlockwise else 90
self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
@@ -174,15 +174,15 @@ class EolImgReviewer:
self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
def quit(self, e = None):
global extraInfoDbCon
- print(f"Number reviewed: {self.numReviewed}")
+ print(f'Number reviewed: {self.numReviewed}')
timeElapsed = time.time() - self.startTime
- print(f"Time elapsed: {timeElapsed:.2f} seconds")
+ print(f'Time elapsed: {timeElapsed:.2f} seconds')
if self.numReviewed > 0:
- print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
+ print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
extraInfoDbCon.close()
self.root.destroy()
def resizeImgForDisplay(self, img):
- " Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background "
+ """ Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """
if max(img.width, img.height) > IMG_DISPLAY_SZ:
if (img.width > img.height):
newHeight = int(img.height * IMG_DISPLAY_SZ/img.width)
@@ -196,7 +196,7 @@ class EolImgReviewer:
int((IMG_DISPLAY_SZ - img.height) / 2)))
return bgImg
# Create GUI and defer control
-print("Starting GUI")
+print('Starting GUI')
root = tki.Tk()
EolImgReviewer(root, imgList)
root.mainloop()