From 88376b3bb7f9072d5fcc83d63eaf425e9b21b77e Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Tue, 14 Jun 2022 16:48:12 +1000 Subject: Fix bugs in web-img generation Also adjust image-reviewing script to allow skipping some user-choices --- backend/data/README.md | 18 ++++++++++-- backend/data/enwiki/README.md | 1 + backend/data/genImgsForWeb.py | 58 +++++++++++++++++++++++---------------- backend/data/reviewImgsToMerge.py | 48 ++++++++++++++++++-------------- 4 files changed, 79 insertions(+), 46 deletions(-) (limited to 'backend') diff --git a/backend/data/README.md b/backend/data/README.md index 7c03d9e..19005e5 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -36,26 +36,38 @@ File Generation Process and outputs choice information into mergedImgList.txt. 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db. + Smartcrop's outputs might need to be manually created/adjusted:
+ - An input image might have no output produced, possibly due to + data incompatibilities, memory limits, etc. A few input image files + might actually be HTML files, containing a 'file not found' page. + - An input x.gif might produce x-1.jpg, x-2.jpg, etc., instead of x.jpg. + - An input image might produce output with unexpected dimensions. + This seems to happen when the image is very large, and triggers a + decompression bomb warning. + The result might have as many as 150k images, with about 2/3 of them + being from Wikipedia. 8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, which uses 'nodes', 'edges', 'eol\_ids', and 'node_imgs', to associate nodes without images to child images. + 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. 6 Other 1 Can run genEnwikiNameData.py, which adds more entries to the 'names' table, using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. 
+ 2 //node-trimming data.db Tables ============== - nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT - edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) -- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) - eol\_ids: id INT PRIMARY KEY, name TEXT +- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) +- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT - images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) - node\_imgs: id TEXT PRIMARY KEY, img\_id INT, src TEXT -- linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT, eol\_id2 INT -- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT +- linked\_imgs: name TEXT PRIMARY KEY, otol\_id INT, otol\_id2 INT - r\_nodes: name TEXT PRIMARY KEY, tips INT - r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index ea97c9a..22af5ba 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -35,3 +35,4 @@ Generated Files Tables:
- page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT + (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info) diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py index aa485bb..2c4f58d 100755 --- a/backend/data/genImgsForWeb.py +++ b/backend/data/genImgsForWeb.py @@ -34,15 +34,19 @@ enwikiCon = sqlite3.connect(enwikiImgDb) enwikiCur = enwikiCon.cursor() # Create image tables if not present nodesDone = set() -if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None: +imgsDone = set() +if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None: + dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)") dbCur.execute("CREATE TABLE images" \ " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))") - dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)") else: # Get existing node-associations - for (otolId,) in dbCur.execute("SELECT DISTINCT id from node_imgs"): + for (otolId,) in dbCur.execute("SELECT id from node_imgs"): nodesDone.add(otolId) - print(f"Found {len(nodesDone)} nodes already processed") + # And images + for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"): + imgsDone.add((imgId, imgSrc)) + print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing") # Detect SIGINT signals interrupted = False def onSigint(sig, frame): @@ -65,23 +69,21 @@ with open(imgListFile) as file: if otolId in nodesDone: continue outPath = outDir + otolId + ".jpg" - # Convert image if needed - convertedImage = False - if not os.path.exists(outPath): - print(f"{otolId}: converting {imgPath}") + # Convert image + print(f"{otolId}: converting {imgPath}") + if os.path.exists(outPath): + print(f"ERROR: 
Output image already exists") + break + try: completedProcess = subprocess.run( ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath], - stdout=subprocess.DEVNULL - ) - # Prevent adding a db entry after an interrupted conversion - # Needed because the subprocess above exits on a SIGINT (not prevented by onSigint() above) - if completedProcess.returncode < 0: - print("Exiting due to interrupted subprocess") - break - elif completedProcess.returncode > 0: - print(f"Exiting due to subprocess exit status {completedProcess.returncode}") - break - convertedImage = True + stdout=subprocess.DEVNULL) + except Exception as e: + print(f"ERROR: Exception while attempting to run smartcrop: {e}") + break + if completedProcess.returncode != 0: + print(f"ERROR: smartcrop had exit status {completedProcess.returncode}") + break # Add entry to db fromEol = imgPath.startswith("eol/") imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component @@ -89,22 +91,32 @@ with open(imgListFile) as file: if fromEol: (eolId, _, contentId) = imgName.partition(" ") (eolId, contentId) = (int(eolId), int(contentId)) - if convertedImage: + if (eolId, "eol") not in imgsDone: query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?" 
- (url, license, owner) = eolCur.execute(query, (contentId,)).fetchone() + row = eolCur.execute(query, (contentId,)).fetchone() + if row == None: + print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr) + break + (url, license, owner) = row dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", (eolId, "eol", url, license, owner, "")) + imgsDone.add((eolId, "eol")) dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, eolId, "eol")) else: enwikiId = int(imgName) - if convertedImage: + if (enwikiId, "enwiki") not in imgsDone: query = "SELECT name, license, artist, credit FROM" \ " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \ " WHERE page_imgs.page_id = ?" - (name, license, artist, credit) = enwikiCur.execute(query, (enwikiId,)).fetchone() + row = enwikiCur.execute(query, (enwikiId,)).fetchone() + if row == None: + print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr) + break + (name, license, artist, credit) = row url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name) dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", (enwikiId, "enwiki", url, license, artist, credit)) + imgsDone.add((enwikiId, "enwiki")) dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, enwikiId, "enwiki")) # Close dbs dbCon.commit() diff --git a/backend/data/reviewImgsToMerge.py b/backend/data/reviewImgsToMerge.py index 15490f7..4120b14 100755 --- a/backend/data/reviewImgsToMerge.py +++ b/backend/data/reviewImgsToMerge.py @@ -26,6 +26,7 @@ dbFile = "data.db" outFile = "mergedImgList.txt" IMG_DISPLAY_SZ = 400 PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) +onlyReviewPairs = False # Open db dbCon = sqlite3.connect(dbFile) @@ -37,28 +38,28 @@ if os.path.exists(eolImgDir): for filename in os.listdir(eolImgDir): (eolId, _, _) = filename.partition(" ") query = "SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name 
WHERE eol_ids.id = ?" - row = dbCur.execute(query, (int(eolId),)).fetchone() - if row == None: + found = False + for (otolId,) in dbCur.execute(query, (int(eolId),)): + if otolId not in nodeToImgs: + nodeToImgs[otolId] = [] + nodeToImgs[otolId].append(eolImgDir + filename) + found = True + if not found: print(f"No node found for {eolImgDir}{filename}", file=sys.stderr) - continue - otolId = row[0] - if otolId not in nodeToImgs: - nodeToImgs[otolId] = [] - nodeToImgs[otolId].append(eolImgDir + filename) print(f"Result has {len(nodeToImgs)} node entries") print("Looking through enwiki images") if os.path.exists(enwikiImgDir): for filename in os.listdir(enwikiImgDir): (wikiId, _, _) = filename.partition(".") query = "SELECT nodes.id FROM nodes INNER JOIN descs ON nodes.name = descs.name WHERE descs.wiki_id = ?" - row = dbCur.execute(query, (int(wikiId),)).fetchone() - if row == None: + found = False + for (otolId,) in dbCur.execute(query, (int(wikiId),)): + if otolId not in nodeToImgs: + nodeToImgs[otolId] = [] + nodeToImgs[otolId].append(enwikiImgDir + filename) + found = True + if not found: print(f"No node found for {enwikiImgDir}{filename}", file=sys.stderr) - continue - otolId = row[0] - if otolId not in nodeToImgs: - nodeToImgs[otolId] = [] - nodeToImgs[otolId].append(enwikiImgDir + filename) print(f"Result has {len(nodeToImgs)} node entries") # Check for already-made choices print("Filtering out already-chosen IDs") @@ -113,12 +114,19 @@ class ImgReviewer: def getNextImgs(self): """ Updates display with new images to review, or ends program """ # Get next image paths - self.listIdx += 1 - if self.listIdx == len(self.nodeImgsList): - print("No more images to review. Exiting program.") - self.quit() - return - (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx] + while True: + self.listIdx += 1 + if self.listIdx == len(self.nodeImgsList): + print("No more images to review. 
Exiting program.") + self.quit() + return + (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx] + # Potentially skip user choice + if onlyReviewPairs and len(imgPaths) == 1: + with open(outFile, 'a') as file: + file.write(f"{self.otolId} {imgPaths[0]}\n") + continue + break # Update displayed images self.eolImgPath = self.enwikiImgPath = None imageOpenError = False -- cgit v1.2.3