diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-09-07 11:37:37 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-09-07 11:37:37 +1000 |
| commit | daccbbd9c73a5292ea9d6746560d7009e5aa666d (patch) | |
| tree | 9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/eol | |
| parent | 1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff) | |
Add python type annotations
Also use consistent quote symbols
Also use 'is None' instead of '== None'
Also use 'if list1' instead of 'if len(list1) > 0'
Diffstat (limited to 'backend/tolData/eol')
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | backend/tolData/eol/downloadImgs.py | 76 |
| -rwxr-xr-x | backend/tolData/eol/genImagesListDb.py | 28 |
| -rwxr-xr-x | backend/tolData/eol/reviewImgs.py | 92 |
3 files changed, 98 insertions, 98 deletions
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py index 4d658e7..5213aaf 100755 --- a/backend/tolData/eol/downloadImgs.py +++ b/backend/tolData/eol/downloadImgs.py @@ -22,53 +22,53 @@ highest EOL ID. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imagesListDb = "imagesList.db" -def getInputEolIds(): - eolIds = set() - dbCon = sqlite3.connect("../data.db") +imagesListDb = 'imagesList.db' +def getInputEolIds() -> set[int]: + eolIds: set[int] = set() + dbCon = sqlite3.connect('../data.db') dbCur = dbCon.cursor() - for (id,) in dbCur.execute("SELECT id FROM eol_ids"): + for (id,) in dbCur.execute('SELECT id FROM eol_ids'): eolIds.add(id) dbCon.close() return eolIds -outDir = "imgsForReview/" +outDir = 'imgsForReview/' MAX_IMGS_PER_ID = 3 MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain' -print("Getting input EOL IDs") +print('Getting input EOL IDs') eolIds = getInputEolIds() -print("Getting EOL IDs to download for") +print('Getting EOL IDs to download for') # Get IDs from images-list db imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() -imgListIds = set() -for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): +imgListIds: set[int] = set() +for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'): imgListIds.add(pageId) # Get set intersection, and sort into list eolIds = eolIds.intersection(imgListIds) -eolIds = sorted(eolIds) -print(f"Result: {len(eolIds)} EOL IDs") +eolIdList = sorted(eolIds) +print(f'Result: {len(eolIdList)} EOL IDs') -print("Checking output directory") +print('Checking output directory') if not os.path.exists(outDir): os.mkdir(outDir) 
-print("Finding next ID to download for") +print('Finding next ID to download for') nextIdx = 0 fileList = os.listdir(outDir) -ids = [int(filename.split(" ")[0]) for filename in fileList] -if len(ids) > 0: +ids = [int(filename.split(' ')[0]) for filename in fileList] +if ids: ids.sort() - nextIdx = eolIds.index(ids[-1]) + 1 -if nextIdx == len(eolIds): - print("No IDs left. Exiting...") + nextIdx = eolIdList.index(ids[-1]) + 1 +if nextIdx == len(eolIdList): + print('No IDs left. Exiting...') sys.exit(0) -print("Starting download threads") +print('Starting download threads') numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception +threadException: Exception | None = None # Used for ending main thread after a non-main thread exception # Handle SIGINT signals interrupted = False oldHandler = None @@ -86,29 +86,29 @@ def downloadImg(url, outFile): file.write(data.content) time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) threadException = e numThreads -= 1 # Manage downloading -for idx in range(nextIdx, len(eolIds)): - eolId = eolIds[idx] +for idx in range(nextIdx, len(eolIdList)): + eolId = eolIdList[idx] # Get image urls - ownerSet = set() # Used to get images from different owners, for variety + ownerSet: set[str] = set() # Used to get images from different owners, for variety exitLoop = False - query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" - for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): - if url.startswith("data/"): - url = "https://content.eol.org/" + url + query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?' 
+ for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)): + if url.startswith('data/'): + url = 'https://content.eol.org/' + url urlParts = urllib.parse.urlparse(url) extension = os.path.splitext(urlParts.path)[1] if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) + print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) continue # Check image-quantity limit if len(ownerSet) == MAX_IMGS_PER_ID: break # Check for skip conditions - if re.fullmatch(LICENSE_REGEX, license) == None: + if re.fullmatch(LICENSE_REGEX, license) is None: continue if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic continue @@ -116,27 +116,27 @@ for idx in range(nextIdx, len(eolIds)): continue ownerSet.add(copyrightOwner) # Determine output filename - outPath = f"{outDir}{eolId} {contentId}{extension}" + outPath = f'{outDir}{eolId} {contentId}{extension}' if os.path.exists(outPath): - print(f"WARNING: {outPath} already exists. Skipping download.") + print(f'WARNING: {outPath} already exists. 
Skipping download.') continue # Check thread limit while numThreads == MAX_THREADS: time.sleep(1) # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") + if interrupted or threadException is not None: + print('Waiting for existing threads to end') while numThreads > 0: time.sleep(1) exitLoop = True break # Perform download - print(f"Downloading image to {outPath}") + print(f'Downloading image to {outPath}') numThreads += 1 thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) thread.start() if exitLoop: break # Close images-list db -print("Finished downloading") +print('Finished downloading') imgDbCon.close() diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py index 4dcb6d9..808292d 100755 --- a/backend/tolData/eol/genImagesListDb.py +++ b/backend/tolData/eol/genImagesListDb.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, os, re +import os, re import csv import sqlite3 @@ -10,25 +10,25 @@ Generates a sqlite db from a directory of CSV files holding EOL image data """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imagesListDir = "imagesList/" -dbFile = "imagesList.db" +imagesListDir = 'imagesList/' +dbFile = 'imagesList.db' -print("Creating database") +print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE images" \ - " (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)") -dbCur.execute("CREATE INDEX images_pid_idx ON images(page_id)") -print("Reading CSV files") +dbCur.execute('CREATE TABLE images' \ + ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)') +dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)') +print('Reading CSV files') csvFilenames = os.listdir(imagesListDir) for filename in 
csvFilenames: - print(f"Processing {imagesListDir}{filename}") - with open(imagesListDir + filename, newline="") as file: - for (contentId, pageId, sourceUrl, copyUrl, license, owner) in csv.reader(file): - if re.match(r"^[a-zA-Z]", contentId): # Skip header line + print(f'Processing {imagesListDir}{filename}') + with open(imagesListDir + filename, newline='') as file: + for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file): + if re.match(r'^[a-zA-Z]', contentId): # Skip header line continue - dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner)) -print("Closing database") +print('Closing database') dbCon.commit() dbCon.close() diff --git a/backend/tolData/eol/reviewImgs.py b/backend/tolData/eol/reviewImgs.py index 979ed0e..e44fb3d 100755 --- a/backend/tolData/eol/reviewImgs.py +++ b/backend/tolData/eol/reviewImgs.py @@ -16,42 +16,42 @@ Chosen images are placed in another directory, and rejected ones are deleted. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imgDir = "imgsForReview/" -outDir = "imgs/" -extraInfoDbCon = sqlite3.connect("../data.db") +imgDir = 'imgsForReview/' +outDir = 'imgs/' +extraInfoDbCon = sqlite3.connect('../data.db') extraInfoDbCur = extraInfoDbCon.cursor() -def getExtraInfo(eolId): +def getExtraInfo(eolId: int) -> str: global extraInfoDbCur - query = "SELECT names.alt_name FROM" \ - " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ - " WHERE id = ? and pref_alt = 1" + query = 'SELECT names.alt_name FROM' \ + ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \ + ' WHERE id = ? 
and pref_alt = 1' row = extraInfoDbCur.execute(query, (eolId,)).fetchone() - if row != None: - return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\"" + if row is not None: + return f'Reviewing EOL ID {eolId}, aka "{row[0]}"' else: - return f"Reviewing EOL ID {eolId}" + return f'Reviewing EOL ID {eolId}' IMG_DISPLAY_SZ = 400 MAX_IMGS_PER_ID = 3 IMG_BG_COLOR = (88, 28, 135) -PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) +PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) -print("Checking output directory") +print('Checking output directory') if not os.path.exists(outDir): os.mkdir(outDir) -print("Getting input image list") +print('Getting input image list') imgList = os.listdir(imgDir) -imgList.sort(key=lambda s: int(s.split(" ")[0])) -if len(imgList) == 0: - print("No input images found") +imgList.sort(key=lambda s: int(s.split(' ')[0])) +if not imgList: + print('No input images found') sys.exit(0) class EolImgReviewer: - " Provides the GUI for reviewing images " + """ Provides the GUI for reviewing images """ def __init__(self, root, imgList): self.root = root - root.title("EOL Image Reviewer") + root.title('EOL Image Reviewer') # Setup main frame - mainFrame = ttk.Frame(root, padding="5 5 5 5") + mainFrame = ttk.Frame(root, padding='5 5 5 5') mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S)) root.columnconfigure(0, weight=1) root.rowconfigure(0, weight=1) @@ -59,7 +59,7 @@ class EolImgReviewer: self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter # These need a persistent reference for some reason (doesn't display otherwise) - self.labels = [] + self.labels: list[ttk.Label] = [] for i in range(MAX_IMGS_PER_ID): frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ) frame.grid(column=i, row=0) @@ -70,29 +70,29 @@ class 
EolImgReviewer: for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) # Add keyboard bindings - root.bind("<q>", self.quit) - root.bind("<Key-j>", lambda evt: self.accept(0)) - root.bind("<Key-k>", lambda evt: self.accept(1)) - root.bind("<Key-l>", lambda evt: self.accept(2)) - root.bind("<Key-i>", lambda evt: self.reject()) - root.bind("<Key-a>", lambda evt: self.rotate(0)) - root.bind("<Key-s>", lambda evt: self.rotate(1)) - root.bind("<Key-d>", lambda evt: self.rotate(2)) - root.bind("<Key-A>", lambda evt: self.rotate(0, True)) - root.bind("<Key-S>", lambda evt: self.rotate(1, True)) - root.bind("<Key-D>", lambda evt: self.rotate(2, True)) + root.bind('<q>', self.quit) + root.bind('<Key-j>', lambda evt: self.accept(0)) + root.bind('<Key-k>', lambda evt: self.accept(1)) + root.bind('<Key-l>', lambda evt: self.accept(2)) + root.bind('<Key-i>', lambda evt: self.reject()) + root.bind('<Key-a>', lambda evt: self.rotate(0)) + root.bind('<Key-s>', lambda evt: self.rotate(1)) + root.bind('<Key-d>', lambda evt: self.rotate(2)) + root.bind('<Key-A>', lambda evt: self.rotate(0, True)) + root.bind('<Key-S>', lambda evt: self.rotate(1, True)) + root.bind('<Key-D>', lambda evt: self.rotate(2, True)) # Initialise images to review self.imgList = imgList self.imgListIdx = 0 self.nextEolId = 0 - self.nextImgNames = [] - self.rotations = [] + self.nextImgNames: list[str] = [] + self.rotations: list[int] = [] self.getNextImgs() # For displaying extra info self.numReviewed = 0 self.startTime = time.time() def getNextImgs(self): - " Updates display with new images to review, or ends program " + """ Updates display with new images to review, or ends program """ # Gather names of next images to review for i in range(MAX_IMGS_PER_ID): if self.imgListIdx == len(self.imgList): @@ -101,7 +101,7 @@ class EolImgReviewer: return break imgName = self.imgList[self.imgListIdx] - eolId = int(re.match(r"(\d+) (\d+)", imgName).group(1)) + eolId = int(re.match(r'(\d+) (\d+)', 
imgName).group(1)) if i == 0: self.nextEolId = eolId self.nextImgNames = [imgName] @@ -131,19 +131,19 @@ class EolImgReviewer: self.labels[idx].config(image=self.photoImgs[idx]) idx += 1 # Restart if all image files non-recognisable - if len(self.nextImgNames) == 0: + if not self.nextImgNames: self.getNextImgs() return # Update title firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 lastImgIdx = self.imgListIdx title = getExtraInfo(self.nextEolId) - title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})" + title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})' self.root.title(title) def accept(self, imgIdx): - " React to a user selecting an image " + """ React to a user selecting an image """ if imgIdx >= len(self.nextImgNames): - print("Invalid selection") + print('Invalid selection') return for i in range(len(self.nextImgNames)): inFile = imgDir + self.nextImgNames[i] @@ -160,13 +160,13 @@ class EolImgReviewer: self.numReviewed += 1 self.getNextImgs() def reject(self): - " React to a user rejecting all images of a set " + """ React to a user rejecting all images of a set """ for i in range(len(self.nextImgNames)): os.remove(imgDir + self.nextImgNames[i]) self.numReviewed += 1 self.getNextImgs() def rotate(self, imgIdx, anticlockwise = False): - " Respond to a user rotating an image " + """ Respond to a user rotating an image """ deg = -90 if not anticlockwise else 90 self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg) self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) @@ -174,15 +174,15 @@ class EolImgReviewer: self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 def quit(self, e = None): global extraInfoDbCon - print(f"Number reviewed: {self.numReviewed}") + print(f'Number reviewed: {self.numReviewed}') timeElapsed = time.time() - self.startTime - print(f"Time elapsed: {timeElapsed:.2f} seconds") + print(f'Time elapsed: {timeElapsed:.2f} seconds') if self.numReviewed > 0: - print(f"Avg time 
per review: {timeElapsed/self.numReviewed:.2f} seconds") + print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') extraInfoDbCon.close() self.root.destroy() def resizeImgForDisplay(self, img): - " Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background " + """ Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """ if max(img.width, img.height) > IMG_DISPLAY_SZ: if (img.width > img.height): newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) @@ -196,7 +196,7 @@ class EolImgReviewer: int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg # Create GUI and defer control -print("Starting GUI") +print('Starting GUI') root = tki.Tk() EolImgReviewer(root, imgList) root.mainloop() |
