diff options
Diffstat (limited to 'backend')
| -rw-r--r-- | backend/data/README.md | 10 | ||||
| -rwxr-xr-x | backend/data/eol/downloadImgs.py (renamed from backend/data/downloadEolImgs.py) | 37 | ||||
| -rwxr-xr-x | backend/data/eol/reviewImgs.py (renamed from backend/data/reviewEolImgs.py) | 39 | ||||
| -rwxr-xr-x | backend/data/genImgsForWeb.py | 4 |
4 files changed, 46 insertions, 44 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index ece8efb..17484f4 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -10,14 +10,14 @@ File Generation Process 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, using data in eol/vernacularNames.csv and the 'nodes' table. 3 Image Data - 1 Run downloadEolImgs.py to download EOL images into eolImgsForReview/. + 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/. It uses data in eol/imagesList.db, and the 'eol\_ids' table. - 2 Run reviewEolImgs.py to filter images in eolImgsForReview/ into EOL-id-unique - images in eolImgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). + 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique + images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). 3 Run genImgsForWeb.py to create cropped/resized images in img/, using - images in eolImgsReviewed/, and also to add an 'images' table to data.db. + images in eol/imgsReviewed/, and also to add an 'images' table to data.db. 4 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, - which uses 'nodes', 'edges', 'eol_ids', and 'images', to associate + which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate nodes without images to child images. 4 Node Description Data 1 Obtain data in dbpedia/, as specified in it's README. diff --git a/backend/data/downloadEolImgs.py b/backend/data/eol/downloadImgs.py index 8cf2ba2..ac72ea1 100755 --- a/backend/data/downloadEolImgs.py +++ b/backend/data/eol/downloadImgs.py @@ -8,33 +8,37 @@ from threading import Thread import signal usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an image-list database, using\n" -usageInfo += "EOL IDs obtained from another database. 
Downloaded images get names of\n" -usageInfo += "the form 'eolId1 contentId1.ext1'\n" +usageInfo += "Downloads images from URLs specified in an image-list database,\n" +usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n" +usageInfo += "the form 'eolId1 contentId1.ext1'.\n" usageInfo += "\n" usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and uses\n" -usageInfo += "existing downloaded files to decide where to continue from.\n" +usageInfo += "The program can be re-run to continue downloading. It looks for\n" +usageInfo += "existing downloaded files, and continues after the one with\n" +usageInfo += "highest EOL ID.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imagesListDb = "eol/imagesList.db" -dbFile = "data.db" -outDir = "eolImgsForReview/" +imagesListDb = "imagesList.db" +def getInputEolIds(): + eolIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (id,) in dbCur.execute("SELECT id FROM eol_ids"): + eolIds.add(id) + dbCon.close() + return eolIds +outDir = "imgsForReview/" LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 # Get eol-ids from data db -eolIds = set() -print("Reading in EOL IDs") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -for row in dbCur.execute("SELECT id FROM eol_ids"): - eolIds.add(row[0]) -dbCon.close() +print("Getting input EOL IDs") +eolIds = getInputEolIds() # Get eol-ids from images db +print("Getting images-list-db EOL IDs") imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() imgListIds = set() @@ -43,6 +47,7 @@ for row in imgCur.execute("SELECT DISTINCT page_id FROM images"): # Get eol-id intersection, and sort into list eolIds = 
eolIds.intersection(imgListIds) eolIds = sorted(eolIds) +print(f"Resulted in {len(eolIds)} EOL IDs") MAX_IMGS_PER_ID = 3 MAX_THREADS = 5 @@ -132,7 +137,7 @@ for idx in range(nextIdx, len(eolIds)): time.sleep(1) exitLoop = True break - print("Downloading image to {outPath}") + print(f"Downloading image to {outPath}") # Perform download numThreads += 1 thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True) diff --git a/backend/data/reviewEolImgs.py b/backend/data/eol/reviewImgs.py index 08b8478..ba313d9 100755 --- a/backend/data/reviewEolImgs.py +++ b/backend/data/eol/reviewImgs.py @@ -16,9 +16,19 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDir = "eolImgsForReview/" -outDir = "eolImgsReviewed/" -dbFile = "data.db" +imgDir = "imgsForReview/" +outDir = "imgsReviewed/" +extraInfoDbCon = sqlite3.connect("../data.db") +extraInfoDbCur = extraInfoDbCon.cursor() +def getExtraInfo(eolId): + query = "SELECT names.alt_name FROM" \ + " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ + " WHERE id = ? 
and pref_alt = 1" + row = extraInfoDbCur.execute(query, (eolId,)).fetchone() + if row != None: + return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\"" + else: + return f"Reviewing EOL ID {eolId}" IMG_DISPLAY_SZ = 400 MAX_IMGS_PER_ID = 3 PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) @@ -33,9 +43,6 @@ imgList.sort(key=lambda s: int(s.split(" ")[0])) if len(imgList) == 0: print("No input images found", file=sys.stderr) sys.exit(1) -# Open db -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() class EolImgReviewer: """ Provides the GUI for reviewing images """ @@ -59,7 +66,7 @@ class EolImgReviewer: label.grid(column=0, row=0) self.labels.append(label) # Add padding - for child in mainFrame.winfo_children(): + for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) # Add bindings root.bind("<q>", self.quit) @@ -129,18 +136,9 @@ class EolImgReviewer: # Update title firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 lastImgIdx = self.imgListIdx - query = "SELECT eol_ids.id, names.alt_name, names.pref_alt FROM" \ - " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ - " WHERE id = ? 
and pref_alt = 1" - row = dbCur.execute(query, (self.nextEolId,)).fetchone() - if row != None: - commonName = row[1] - self.root.title( - f"Reviewing EOL ID {self.nextEolId}, aka \"{commonName}\"" \ - f"(imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})") - else: - self.root.title( - f"Reviewing EOL ID {self.nextEolId} (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})") + title = getExtraInfo(self.nextEolId) + title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})" + self.root.title(title) def accept(self, imgIdx): """ React to a user selecting an image """ if imgIdx >= len(self.nextImgNames): @@ -179,7 +177,7 @@ class EolImgReviewer: print(f"Time elapsed: {timeElapsed:.2f} seconds") if self.numReviewed > 0: print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds") - dbCon.close() + extraInfoDbCon.close() self.root.destroy() def resizeForDisplay(self, img): """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """ @@ -199,4 +197,3 @@ class EolImgReviewer: root = tki.Tk() EolImgReviewer(root, imgList) root.mainloop() - diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py index d1eef1b..68089b7 100755 --- a/backend/data/genImgsForWeb.py +++ b/backend/data/genImgsForWeb.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDir = "eolImgsReviewed/" +imgDir = "eol/imgsReviewed/" outDir = "img/" imagesListDb = "eol/imagesList.db" dbFile = "data.db" @@ -78,7 +78,7 @@ for i in range(inputImgIdx, len(inputImgList)): contentId = int(otherStr.split(".")[0]) print(f"Converting {imgName}") subprocess.run( - ['npx', 'smartcrop-cli', + ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgDir + imgName, |
