about summary refs log tree commit diff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r-- backend/data/README.md 10
-rwxr-xr-x backend/data/eol/downloadImgs.py (renamed from backend/data/downloadEolImgs.py) 37
-rwxr-xr-x backend/data/eol/reviewImgs.py (renamed from backend/data/reviewEolImgs.py) 39
-rwxr-xr-x backend/data/genImgsForWeb.py 4
4 files changed, 46 insertions, 44 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index ece8efb..17484f4 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -10,14 +10,14 @@ File Generation Process
2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
using data in eol/vernacularNames.csv and the 'nodes' table.
3 Image Data
- 1 Run downloadEolImgs.py to download EOL images into eolImgsForReview/.
+ 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/.
It uses data in eol/imagesList.db, and the 'eol\_ids' table.
- 2 Run reviewEolImgs.py to filter images in eolImgsForReview/ into EOL-id-unique
- images in eolImgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
+ 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique
+ images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
3 Run genImgsForWeb.py to create cropped/resized images in img/, using
- images in eolImgsReviewed/, and also to add an 'images' table to data.db.
+ images in eol/imgsReviewed/, and also to add an 'images' table to data.db.
4 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
- which uses 'nodes', 'edges', 'eol_ids', and 'images', to associate
+ which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate
nodes without images to child images.
4 Node Description Data
1 Obtain data in dbpedia/, as specified in its README.
diff --git a/backend/data/downloadEolImgs.py b/backend/data/eol/downloadImgs.py
index 8cf2ba2..ac72ea1 100755
--- a/backend/data/downloadEolImgs.py
+++ b/backend/data/eol/downloadImgs.py
@@ -8,33 +8,37 @@ from threading import Thread
import signal
usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an image-list database, using\n"
-usageInfo += "EOL IDs obtained from another database. Downloaded images get names of\n"
-usageInfo += "the form 'eolId1 contentId1.ext1'\n"
+usageInfo += "Downloads images from URLs specified in an image-list database,\n"
+usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n"
+usageInfo += "the form 'eolId1 contentId1.ext1'.\n"
usageInfo += "\n"
usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and uses\n"
-usageInfo += "existing downloaded files to decide where to continue from.\n"
+usageInfo += "The program can be re-run to continue downloading. It looks for\n"
+usageInfo += "existing downloaded files, and continues after the one with\n"
+usageInfo += "highest EOL ID.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-imagesListDb = "eol/imagesList.db"
-dbFile = "data.db"
-outDir = "eolImgsForReview/"
+imagesListDb = "imagesList.db"
+def getInputEolIds():
+    """ Returns the set of EOL IDs found in the 'eol_ids' table of ../data.db """
+    con = sqlite3.connect("../data.db")
+    # Each result row is a 1-tuple holding the ID
+    rows = con.cursor().execute("SELECT id FROM eol_ids")
+    inputIds = {eolId for (eolId,) in rows}
+    con.close()
+    return inputIds
+outDir = "imgsForReview/"
LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3
# Get eol-ids from data db
-eolIds = set()
-print("Reading in EOL IDs")
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-for row in dbCur.execute("SELECT id FROM eol_ids"):
- eolIds.add(row[0])
-dbCon.close()
+print("Getting input EOL IDs")
+eolIds = getInputEolIds()
# Get eol-ids from images db
+print("Getting images-list-db EOL IDs")
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
imgListIds = set()
@@ -43,6 +47,7 @@ for row in imgCur.execute("SELECT DISTINCT page_id FROM images"):
# Get eol-id intersection, and sort into list
eolIds = eolIds.intersection(imgListIds)
eolIds = sorted(eolIds)
+print(f"Resulted in {len(eolIds)} EOL IDs")
MAX_IMGS_PER_ID = 3
MAX_THREADS = 5
@@ -132,7 +137,7 @@ for idx in range(nextIdx, len(eolIds)):
time.sleep(1)
exitLoop = True
break
- print("Downloading image to {outPath}")
+ print(f"Downloading image to {outPath}")
# Perform download
numThreads += 1
thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True)
diff --git a/backend/data/reviewEolImgs.py b/backend/data/eol/reviewImgs.py
index 08b8478..ba313d9 100755
--- a/backend/data/reviewEolImgs.py
+++ b/backend/data/eol/reviewImgs.py
@@ -16,9 +16,19 @@ if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-imgDir = "eolImgsForReview/"
-outDir = "eolImgsReviewed/"
-dbFile = "data.db"
+imgDir = "imgsForReview/"
+outDir = "imgsReviewed/"
+extraInfoDbCon = sqlite3.connect("../data.db")
+extraInfoDbCur = extraInfoDbCon.cursor()
+def getExtraInfo(eolId):
+    """ Returns title text for an EOL ID, including its preferred alt-name if the db has one """
+    query = "SELECT names.alt_name FROM" \
+        " names INNER JOIN eol_ids ON eol_ids.name = names.name" \
+        " WHERE id = ? and pref_alt = 1"
+    row = extraInfoDbCur.execute(query, (eolId,)).fetchone()
+    if row is not None:
+        # Interpolate the name itself (without braces, the literal text 'row[0]' was shown)
+        return f"Reviewing EOL ID {eolId}, aka \"{row[0]}\""
+    else:
+        return f"Reviewing EOL ID {eolId}"
IMG_DISPLAY_SZ = 400
MAX_IMGS_PER_ID = 3
PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
@@ -33,9 +43,6 @@ imgList.sort(key=lambda s: int(s.split(" ")[0]))
if len(imgList) == 0:
print("No input images found", file=sys.stderr)
sys.exit(1)
-# Open db
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
class EolImgReviewer:
""" Provides the GUI for reviewing images """
@@ -59,7 +66,7 @@ class EolImgReviewer:
label.grid(column=0, row=0)
self.labels.append(label)
# Add padding
- for child in mainFrame.winfo_children():
+ for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
# Add bindings
root.bind("<q>", self.quit)
@@ -129,18 +136,9 @@ class EolImgReviewer:
# Update title
firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
lastImgIdx = self.imgListIdx
- query = "SELECT eol_ids.id, names.alt_name, names.pref_alt FROM" \
- " names INNER JOIN eol_ids ON eol_ids.name = names.name" \
- " WHERE id = ? and pref_alt = 1"
- row = dbCur.execute(query, (self.nextEolId,)).fetchone()
- if row != None:
- commonName = row[1]
- self.root.title(
- f"Reviewing EOL ID {self.nextEolId}, aka \"{commonName}\"" \
- f"(imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})")
- else:
- self.root.title(
- f"Reviewing EOL ID {self.nextEolId} (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})")
+ title = getExtraInfo(self.nextEolId)
+ title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})"
+ self.root.title(title)
def accept(self, imgIdx):
""" React to a user selecting an image """
if imgIdx >= len(self.nextImgNames):
@@ -179,7 +177,7 @@ class EolImgReviewer:
print(f"Time elapsed: {timeElapsed:.2f} seconds")
if self.numReviewed > 0:
print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
- dbCon.close()
+ extraInfoDbCon.close()
self.root.destroy()
def resizeForDisplay(self, img):
""" Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """
@@ -199,4 +197,3 @@ class EolImgReviewer:
root = tki.Tk()
EolImgReviewer(root, imgList)
root.mainloop()
-
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
index d1eef1b..68089b7 100755
--- a/backend/data/genImgsForWeb.py
+++ b/backend/data/genImgsForWeb.py
@@ -16,7 +16,7 @@ if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-imgDir = "eolImgsReviewed/"
+imgDir = "eol/imgsReviewed/"
outDir = "img/"
imagesListDb = "eol/imagesList.db"
dbFile = "data.db"
@@ -78,7 +78,7 @@ for i in range(inputImgIdx, len(inputImgList)):
contentId = int(otherStr.split(".")[0])
print(f"Converting {imgName}")
subprocess.run(
- ['npx', 'smartcrop-cli',
+ ['npx', 'smartcrop-cli',
'--width', str(IMG_OUT_SZ),
'--height', str(IMG_OUT_SZ),
imgDir + imgName,