aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-06-14 16:48:12 +1000
committerTerry Truong <terry06890@gmail.com>2022-06-14 16:48:12 +1000
commit88376b3bb7f9072d5fcc83d63eaf425e9b21b77e (patch)
treec279b91b127fca164b881a4415c703b988382f2c /backend/data
parentd94953e72fad8e3259e69761744f60a325e63984 (diff)
Fix bugs in web-img generation
Also adjust image-reviewing script to allow skipping some user-choices
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md18
-rw-r--r--backend/data/enwiki/README.md1
-rwxr-xr-xbackend/data/genImgsForWeb.py58
-rwxr-xr-xbackend/data/reviewImgsToMerge.py48
4 files changed, 79 insertions, 46 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 7c03d9e..19005e5 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -36,26 +36,38 @@ File Generation Process
and outputs choice information into mergedImgList.txt.
7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using
mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db.
+ Smartcrop's outputs might need to be manually created/adjusted: <br>
+ - An input image might have no output produced, possibly due to
+ data incompatibilities, memory limits, etc. A few input image files
+ might actually be html files, containing a 'file not found' page.
+ - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
+ - An input image might produce output with unexpected dimensions.
+ This seems to happen when the image is very large, and triggers a
+ decompression bomb warning.
+ The result might have as many as 150k images, with about 2/3 of them
+ being from wikipedia.
8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
which uses 'nodes', 'edges', 'eol\_ids', and 'node_imgs', to associate
nodes without images to child images.
+
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
6 Other
1 Can run genEnwikiNameData.py, which adds more entries to the 'names' table,
using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+ 2 //node-trimming
data.db Tables
==============
- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT
- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
-- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
- eol\_ids: id INT PRIMARY KEY, name TEXT
+- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
+- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)
- node\_imgs: id TEXT PRIMARY KEY, img\_id INT, src TEXT
-- linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT, eol\_id2 INT
-- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
+- linked\_imgs: name TEXT PRIMARY KEY, otol\_id INT, otol\_id2 INT
- r\_nodes: name TEXT PRIMARY KEY, tips INT
- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index ea97c9a..22af5ba 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -35,3 +35,4 @@ Generated Files
Tables: <br>
- page\_imgs: page\_id INT PRIMARY KEY, img\_name TEXT
- imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT
+ (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info)
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
index aa485bb..2c4f58d 100755
--- a/backend/data/genImgsForWeb.py
+++ b/backend/data/genImgsForWeb.py
@@ -34,15 +34,19 @@ enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
# Create image tables if not present
nodesDone = set()
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None:
+imgsDone = set()
+if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None:
+ dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)")
dbCur.execute("CREATE TABLE images" \
" (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
- dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)")
else:
# Get existing node-associations
- for (otolId,) in dbCur.execute("SELECT DISTINCT id from node_imgs"):
+ for (otolId,) in dbCur.execute("SELECT id from node_imgs"):
nodesDone.add(otolId)
- print(f"Found {len(nodesDone)} nodes already processed")
+ # And images
+ for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
+ imgsDone.add((imgId, imgSrc))
+ print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
# Detect SIGINT signals
interrupted = False
def onSigint(sig, frame):
@@ -65,23 +69,21 @@ with open(imgListFile) as file:
if otolId in nodesDone:
continue
outPath = outDir + otolId + ".jpg"
- # Convert image if needed
- convertedImage = False
- if not os.path.exists(outPath):
- print(f"{otolId}: converting {imgPath}")
+ # Convert image
+ print(f"{otolId}: converting {imgPath}")
+ if os.path.exists(outPath):
+ print(f"ERROR: Output image already exists")
+ break
+ try:
completedProcess = subprocess.run(
['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
- stdout=subprocess.DEVNULL
- )
- # Prevent adding a db entry after an interrupted conversion
- # Needed because the subprocess above exits on a SIGINT (not prevented by onSigint() above)
- if completedProcess.returncode < 0:
- print("Exiting due to interrupted subprocess")
- break
- elif completedProcess.returncode > 0:
- print(f"Exiting due to subprocess exit status {completedProcess.returncode}")
- break
- convertedImage = True
+ stdout=subprocess.DEVNULL)
+ except Exception as e:
+ print(f"ERROR: Exception while attempting to run smartcrop: {e}")
+ break
+ if completedProcess.returncode != 0:
+ print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
+ break
# Add entry to db
fromEol = imgPath.startswith("eol/")
imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
@@ -89,22 +91,32 @@ with open(imgListFile) as file:
if fromEol:
(eolId, _, contentId) = imgName.partition(" ")
(eolId, contentId) = (int(eolId), int(contentId))
- if convertedImage:
+ if (eolId, "eol") not in imgsDone:
query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
- (url, license, owner) = eolCur.execute(query, (contentId,)).fetchone()
+ row = eolCur.execute(query, (contentId,)).fetchone()
+ if row == None:
+ print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
+ break
+ (url, license, owner) = row
dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
(eolId, "eol", url, license, owner, ""))
+ imgsDone.add((eolId, "eol"))
dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, eolId, "eol"))
else:
enwikiId = int(imgName)
- if convertedImage:
+ if (enwikiId, "enwiki") not in imgsDone:
query = "SELECT name, license, artist, credit FROM" \
" page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
" WHERE page_imgs.page_id = ?"
- (name, license, artist, credit) = enwikiCur.execute(query, (enwikiId,)).fetchone()
+ row = enwikiCur.execute(query, (enwikiId,)).fetchone()
+ if row == None:
+ print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
+ break
+ (name, license, artist, credit) = row
url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
(enwikiId, "enwiki", url, license, artist, credit))
+ imgsDone.add((enwikiId, "enwiki"))
dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, enwikiId, "enwiki"))
# Close dbs
dbCon.commit()
diff --git a/backend/data/reviewImgsToMerge.py b/backend/data/reviewImgsToMerge.py
index 15490f7..4120b14 100755
--- a/backend/data/reviewImgsToMerge.py
+++ b/backend/data/reviewImgsToMerge.py
@@ -26,6 +26,7 @@ dbFile = "data.db"
outFile = "mergedImgList.txt"
IMG_DISPLAY_SZ = 400
PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
+onlyReviewPairs = False
# Open db
dbCon = sqlite3.connect(dbFile)
@@ -37,28 +38,28 @@ if os.path.exists(eolImgDir):
for filename in os.listdir(eolImgDir):
(eolId, _, _) = filename.partition(" ")
query = "SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name WHERE eol_ids.id = ?"
- row = dbCur.execute(query, (int(eolId),)).fetchone()
- if row == None:
+ found = False
+ for (otolId,) in dbCur.execute(query, (int(eolId),)):
+ if otolId not in nodeToImgs:
+ nodeToImgs[otolId] = []
+ nodeToImgs[otolId].append(eolImgDir + filename)
+ found = True
+ if not found:
print(f"No node found for {eolImgDir}{filename}", file=sys.stderr)
- continue
- otolId = row[0]
- if otolId not in nodeToImgs:
- nodeToImgs[otolId] = []
- nodeToImgs[otolId].append(eolImgDir + filename)
print(f"Result has {len(nodeToImgs)} node entries")
print("Looking through enwiki images")
if os.path.exists(enwikiImgDir):
for filename in os.listdir(enwikiImgDir):
(wikiId, _, _) = filename.partition(".")
query = "SELECT nodes.id FROM nodes INNER JOIN descs ON nodes.name = descs.name WHERE descs.wiki_id = ?"
- row = dbCur.execute(query, (int(wikiId),)).fetchone()
- if row == None:
+ found = False
+ for (otolId,) in dbCur.execute(query, (int(wikiId),)):
+ if otolId not in nodeToImgs:
+ nodeToImgs[otolId] = []
+ nodeToImgs[otolId].append(enwikiImgDir + filename)
+ found = True
+ if not found:
print(f"No node found for {enwikiImgDir}{filename}", file=sys.stderr)
- continue
- otolId = row[0]
- if otolId not in nodeToImgs:
- nodeToImgs[otolId] = []
- nodeToImgs[otolId].append(enwikiImgDir + filename)
print(f"Result has {len(nodeToImgs)} node entries")
# Check for already-made choices
print("Filtering out already-chosen IDs")
@@ -113,12 +114,19 @@ class ImgReviewer:
def getNextImgs(self):
""" Updates display with new images to review, or ends program """
# Get next image paths
- self.listIdx += 1
- if self.listIdx == len(self.nodeImgsList):
- print("No more images to review. Exiting program.")
- self.quit()
- return
- (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx]
+ while True:
+ self.listIdx += 1
+ if self.listIdx == len(self.nodeImgsList):
+ print("No more images to review. Exiting program.")
+ self.quit()
+ return
+ (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx]
+ # Potentially skip user choice
+ if onlyReviewPairs and len(imgPaths) == 1:
+ with open(outFile, 'a') as file:
+ file.write(f"{self.otolId} {imgPaths[0]}\n")
+ continue
+ break
# Update displayed images
self.eolImgPath = self.enwikiImgPath = None
imageOpenError = False