Fix bugs in web-img generation

Also adjust image-reviewing script to allow skipping some user choices
author: Terry Truong <terry06890@gmail.com> 2022-06-14 16:48:12 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-06-14 16:48:12 +1000
commit: 88376b3bb7f9072d5fcc83d63eaf425e9b21b77e (patch)
tree: c279b91b127fca164b881a4415c703b988382f2c /backend/data
parent: d94953e72fad8e3259e69761744f60a325e63984 (diff)
4 files changed, 79 insertions, 46 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 7c03d9e..19005e5 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -36,26 +36,38 @@ File Generation Process
         and outputs choice information into mergedImgList.txt.
     7   Run genImgsForWeb.py, which creates cropped/resized images in img/, using
         mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db.
+        Smartcrop's outputs might need to be manually created/adjusted: <br>
+        -   An input image might have no output produced, possibly due to
+            data incompatibilities, memory limits, etc. A few input image files
+            might actually be html files, containing a 'file not found' page.
+        -   An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
+        -   An input image might produce output with unexpected dimensions.
+            This seems to happen when the image is very large, and triggers a
+            decompression bomb warning.
+        The result might have as many as 150k images, with about 2/3 of them
+        being from wikipedia.
     8   Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
         which uses 'nodes', 'edges', 'eol\_ids', and 'node_imgs', to associate
         nodes without images to child images.
+
 5   Reduced Tree Structure Data
     1   Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
         data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
 6   Other
     1   Can run genEnwikiNameData.py, which adds more entries to the 'names' table,
         using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+    2   //node-trimming
 
 data.db Tables
 ==============
 -   nodes:        name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT
 -   edges:        node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
--   names:        name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
 -   eol\_ids:     id INT PRIMARY KEY, name TEXT
+-   names:        name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
+-   descs:        name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
 -   images:       id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)
 -   node\_imgs:   id TEXT PRIMARY KEY, img\_id INT, src TEXT
--   linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT, eol\_id2 INT
--   descs:        name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
+-   linked\_imgs: name TEXT PRIMARY KEY, otol\_id INT, otol\_id2 INT
 -   r\_nodes:     name TEXT PRIMARY KEY, tips INT
 -   r\_edges:     node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
 
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index ea97c9a..22af5ba 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -35,3 +35,4 @@ Generated Files
     Tables: <br>
     -   page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT
     -   imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT
+        (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info)
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
index aa485bb..2c4f58d 100755
--- a/backend/data/genImgsForWeb.py
+++ b/backend/data/genImgsForWeb.py
@@ -34,15 +34,19 @@ enwikiCon = sqlite3.connect(enwikiImgDb)
 enwikiCur = enwikiCon.cursor()
 # Create image tables if not present
 nodesDone = set()
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None:
+imgsDone = set()
+if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None:
+	dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)")
 	dbCur.execute("CREATE TABLE images" \
 		" (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
-	dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)")
 else:
 	# Get existing node-associations
-	for (otolId,) in dbCur.execute("SELECT DISTINCT id from node_imgs"):
+	for (otolId,) in dbCur.execute("SELECT id from node_imgs"):
 		nodesDone.add(otolId)
-	print(f"Found {len(nodesDone)} nodes already processed")
+	# And images
+	for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
+		imgsDone.add((imgId, imgSrc))
+	print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
 # Detect SIGINT signals
 interrupted = False
 def onSigint(sig, frame):
@@ -65,23 +69,21 @@ with open(imgListFile) as file:
 		if otolId in nodesDone:
 			continue
 		outPath = outDir + otolId + ".jpg"
-		# Convert image if needed
-		convertedImage = False
-		if not os.path.exists(outPath):
-			print(f"{otolId}: converting {imgPath}")
+		# Convert image
+		print(f"{otolId}: converting {imgPath}")
+		if os.path.exists(outPath):
+			print(f"ERROR: Output image already exists")
+			break
+		try:
 			completedProcess = subprocess.run(
 				['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
-				stdout=subprocess.DEVNULL
-			)
-			# Prevent adding a db entry after an interrupted conversion
-				# Needed because the subprocess above exits on a SIGINT (not prevented by onSigint() above)
-			if completedProcess.returncode < 0:
-				print("Exiting due to interrupted subprocess")
-				break
-			elif completedProcess.returncode > 0:
-				print(f"Exiting due to subprocess exit status {completedProcess.returncode}")
-				break
-			convertedImage = True
+				stdout=subprocess.DEVNULL)
+		except Exception as e:
+			print(f"ERROR: Exception while attempting to run smartcrop: {e}")
+			break
+		if completedProcess.returncode != 0:
+			print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
+			break
 		# Add entry to db
 		fromEol = imgPath.startswith("eol/")
 		imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
@@ -89,22 +91,32 @@ with open(imgListFile) as file:
 		if fromEol:
 			(eolId, _, contentId) = imgName.partition(" ")
 			(eolId, contentId) = (int(eolId), int(contentId))
-			if convertedImage:
+			if (eolId, "eol") not in imgsDone:
 				query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
-				(url, license, owner) = eolCur.execute(query, (contentId,)).fetchone()
+				row = eolCur.execute(query, (contentId,)).fetchone()
+				if row == None:
+					print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
+					break
+				(url, license, owner) = row
 				dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
 					(eolId, "eol", url, license, owner, ""))
+				imgsDone.add((eolId, "eol"))
 			dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, eolId, "eol"))
 		else:
 			enwikiId = int(imgName)
-			if convertedImage:
+			if (enwikiId, "enwiki") not in imgsDone:
 				query = "SELECT name, license, artist, credit FROM" \
 					" page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
 					" WHERE page_imgs.page_id = ?"
-				(name, license, artist, credit) = enwikiCur.execute(query, (enwikiId,)).fetchone()
+				row = enwikiCur.execute(query, (enwikiId,)).fetchone()
+				if row == None:
+					print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
+					break
+				(name, license, artist, credit) = row
 				url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
 				dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
 					(enwikiId, "enwiki", url, license, artist, credit))
+				imgsDone.add((enwikiId, "enwiki"))
 			dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, enwikiId, "enwiki"))
 # Close dbs
 dbCon.commit()
diff --git a/backend/data/reviewImgsToMerge.py b/backend/data/reviewImgsToMerge.py
index 15490f7..4120b14 100755
--- a/backend/data/reviewImgsToMerge.py
+++ b/backend/data/reviewImgsToMerge.py
@@ -26,6 +26,7 @@ dbFile = "data.db"
 outFile = "mergedImgList.txt"
 IMG_DISPLAY_SZ = 400
 PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
+onlyReviewPairs = False
 
 # Open db
 dbCon = sqlite3.connect(dbFile)
@@ -37,28 +38,28 @@ if os.path.exists(eolImgDir):
 	for filename in os.listdir(eolImgDir):
 		(eolId, _, _) = filename.partition(" ")
 		query = "SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name WHERE eol_ids.id = ?"
-		row = dbCur.execute(query, (int(eolId),)).fetchone()
-		if row == None:
+		found = False
+		for (otolId,) in dbCur.execute(query, (int(eolId),)):
+			if otolId not in nodeToImgs:
+				nodeToImgs[otolId] = []
+			nodeToImgs[otolId].append(eolImgDir + filename)
+			found = True
+		if not found:
 			print(f"No node found for {eolImgDir}{filename}", file=sys.stderr)
-			continue
-		otolId = row[0]
-		if otolId not in nodeToImgs:
-			nodeToImgs[otolId] = []
-		nodeToImgs[otolId].append(eolImgDir + filename)
 print(f"Result has {len(nodeToImgs)} node entries")
 print("Looking through enwiki images")
 if os.path.exists(enwikiImgDir):
 	for filename in os.listdir(enwikiImgDir):
 		(wikiId, _, _) = filename.partition(".")
 		query = "SELECT nodes.id FROM nodes INNER JOIN descs ON nodes.name = descs.name WHERE descs.wiki_id = ?"
-		row = dbCur.execute(query, (int(wikiId),)).fetchone()
-		if row == None:
+		found = False
+		for (otolId,) in dbCur.execute(query, (int(wikiId),)):
+			if otolId not in nodeToImgs:
+				nodeToImgs[otolId] = []
+			nodeToImgs[otolId].append(enwikiImgDir + filename)
+			found = True
+		if not found:
 			print(f"No node found for {enwikiImgDir}{filename}", file=sys.stderr)
-			continue
-		otolId = row[0]
-		if otolId not in nodeToImgs:
-			nodeToImgs[otolId] = []
-		nodeToImgs[otolId].append(enwikiImgDir + filename)
 print(f"Result has {len(nodeToImgs)} node entries")
 # Check for already-made choices
 print("Filtering out already-chosen IDs")
@@ -113,12 +114,19 @@ class ImgReviewer:
 	def getNextImgs(self):
 		""" Updates display with new images to review, or ends program """
 		# Get next image paths
-		self.listIdx += 1
-		if self.listIdx == len(self.nodeImgsList):
-			print("No more images to review. Exiting program.")
-			self.quit()
-			return
-		(self.otolId, imgPaths) = self.nodeImgsList[self.listIdx]
+		while True:
+			self.listIdx += 1
+			if self.listIdx == len(self.nodeImgsList):
+				print("No more images to review. Exiting program.")
+				self.quit()
+				return
+			(self.otolId, imgPaths) = self.nodeImgsList[self.listIdx]
+			# Potentially skip user choice
+			if onlyReviewPairs and len(imgPaths) == 1:
+				with open(outFile, 'a') as file:
+					file.write(f"{self.otolId} {imgPaths[0]}\n")
+				continue
+			break
 		# Update displayed images
 		self.eolImgPath = self.enwikiImgPath = None
 		imageOpenError = False
author	Terry Truong <terry06890@gmail.com>	2022-06-14 16:48:12 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-06-14 16:48:12 +1000
commit	88376b3bb7f9072d5fcc83d63eaf425e9b21b77e (patch)
tree	c279b91b127fca164b881a4415c703b988382f2c /backend/data
parent	d94953e72fad8e3259e69761744f60a325e63984 (diff)