aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md7
-rwxr-xr-xbackend/data/genImgsForWeb.py86
-rw-r--r--backend/data/pickedImgs/README.md12
3 files changed, 82 insertions, 23 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 007a090..18e5da3 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -19,7 +19,7 @@ File Generation Process
using data in enwiki/enwikiData.db, and the 'nodes' table.
Also uses genDescNamesToSkip.txt and genEnwikiDescTitlesToUse.txt for
skipping/resolving some name-page associations.
-3 Image Data
+4 Image Data
1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/.
It uses data in eol/imagesList.db, and the 'eol_ids' table.
2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique
@@ -36,8 +36,9 @@ File Generation Process
and enables choosing, for each tol-node, which image should be used, if any,
and outputs choice information into mergedImgList.txt. Uses the 'nodes',
'eol_ids', and 'wiki_ids' tables (as well as 'names' for info-display).
- 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using
- mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db.
+ 7 Run genImgsForWeb.py, which creates cropped/resized images in img/,
+ using mergedImgList.txt and, if present, pickedImgs/, and adds 'images' and
+ 'node_imgs' tables to data.db. <br>
Smartcrop's outputs might need to be manually created/adjusted: <br>
- An input image might have no output produced, possibly due to
data incompatibilities, memory limits, etc. A few input image files
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
index 1db543f..3c299bb 100755
--- a/backend/data/genImgsForWeb.py
+++ b/backend/data/genImgsForWeb.py
@@ -19,6 +19,8 @@ imgListFile = "mergedImgList.txt"
outDir = "img/"
eolImgDb = "eol/imagesList.db"
enwikiImgDb = "enwiki/enwikiImgs.db"
+pickedImgsDir = "pickedImgs/"
+pickedImgsFile = "metadata.txt"
dbFile = "data.db"
IMG_OUT_SZ = 200
genImgFiles = True
@@ -33,6 +35,20 @@ eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
+# Get 'picked images' info
+nodeToPickedImg = {}
+if os.path.exists(pickedImgsDir + pickedImgsFile):
+ lineNum = 0
+ with open(pickedImgsDir + pickedImgsFile) as file:
+ for line in file:
+ lineNum += 1
+ (filename, url, license, artist, credit) = line.rstrip().split("|")
+ nodeName = os.path.splitext(filename)[0] # Remove extension
+ (otolId,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (nodeName,)).fetchone()
+ nodeToPickedImg[otolId] = {
+ "nodeName": nodeName, "id": lineNum,
+ "filename": filename, "url": url, "license": license, "artist": artist, "credit": credit,
+ }
# Create image tables if not present
nodesDone = set()
imgsDone = set()
@@ -55,6 +71,52 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, onSigint)
# Iterate though images to process
+def quit():
+ dbCon.commit()
+ dbCon.close()
+ eolCon.close()
+ enwikiCon.close()
+ sys.exit(0)
+def convertImage(imgPath, outPath):
+ print(f"Converting {imgPath} to {outPath}")
+ if os.path.exists(outPath):
+ print(f"ERROR: Output image already exists")
+ return False
+ try:
+ completedProcess = subprocess.run(
+ ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
+ stdout=subprocess.DEVNULL
+ )
+ except Exception as e:
+ print(f"ERROR: Exception while attempting to run smartcrop: {e}")
+ return False
+ if completedProcess.returncode != 0:
+ print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
+ return False
+ return True
+print("Processing picked images")
+for (otolId, imgData) in nodeToPickedImg.items():
+ # Check for SIGINT event
+ if interrupted:
+ print("Exiting")
+ quit()
+ # Skip if already processed
+ if otolId in nodesDone:
+ continue
+ # Convert image
+ if genImgFiles:
+ if not convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg"):
+ quit()
+ else:
+ print(f"Processing {imgData['nodeName']}: {otolId}.jpg")
+ # Add entry to db
+ if (imgData["id"], "picked") not in imgsDone:
+ dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ (imgData["id"], "picked", imgData["url"], imgData["license"], imgData["artist"], imgData["credit"]))
+ imgsDone.add((imgData["id"], "picked"))
+ dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (imgData["nodeName"], imgData["id"], "picked"))
+ nodesDone.add(otolId)
+print("Processing images from eol and enwiki")
iterNum = 0
with open(imgListFile) as file:
for line in file:
@@ -73,20 +135,7 @@ with open(imgListFile) as file:
continue
# Convert image
if genImgFiles:
- print(f"Processing {otolId}: converting {imgPath}")
- outPath = outDir + otolId + ".jpg"
- if os.path.exists(outPath):
- print(f"ERROR: Output image already exists")
- break
- try:
- completedProcess = subprocess.run(
- ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
- stdout=subprocess.DEVNULL)
- except Exception as e:
- print(f"ERROR: Exception while attempting to run smartcrop: {e}")
- break
- if completedProcess.returncode != 0:
- print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
+ if not convertImage(imgPath, outDir + otolId + ".jpg"):
break
else:
if iterNum % 1e4 == 0:
@@ -103,7 +152,7 @@ with open(imgListFile) as file:
query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
row = eolCur.execute(query, (contentId,)).fetchone()
if row == None:
- print("ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
+ print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
break
(url, license, owner) = row
dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
@@ -118,7 +167,7 @@ with open(imgListFile) as file:
" WHERE page_imgs.page_id = ?"
row = enwikiCur.execute(query, (enwikiId,)).fetchone()
if row == None:
- print("ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
+ print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
break
(name, license, artist, credit) = row
url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
@@ -127,7 +176,4 @@ with open(imgListFile) as file:
imgsDone.add((enwikiId, "enwiki"))
dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki"))
# Close dbs
-dbCon.commit()
-dbCon.close()
-eolCon.close()
-enwikiCon.close()
+quit()
diff --git a/backend/data/pickedImgs/README.md b/backend/data/pickedImgs/README.md
new file mode 100644
index 0000000..52fc608
--- /dev/null
+++ b/backend/data/pickedImgs/README.md
@@ -0,0 +1,12 @@
+This directory is used for adding additional, manually-picked images
+to the server's dataset, overriding any from eol and enwiki. If used,
+it is expected to contain image files and a metadata.txt file that
+holds each image's source URL, license, artist, and credit.
+
+Possible Files
+==============
+- Image files
+- metadata.txt <br>
+ Contains one line per image, in the format `filename|url|license|artist|credit`.
+ The filename should be a tree-of-life node name, with an image
+ extension. Other fields correspond to those in the 'images' table.