aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--backend/data/README.md5
-rwxr-xr-xbackend/data/dbpedia/genData.py3
-rwxr-xr-xbackend/data/reviewImgsToMerge.py209
4 files changed, 216 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index 2493f5e..acc3905 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@
/backend/data/dbpPickedLabels.txt
/backend/data/genEnwikiDescNamesToSkip.txt
/backend/data/genEnwikiDescTitlesToUse.txt
+/backend/data/mergedImgList.txt
diff --git a/backend/data/README.md b/backend/data/README.md
index 18daa99..2a6344c 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -40,7 +40,10 @@ File Generation Process
it in that db.
6 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
images in listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/.
- 7 // ADD
+ 7 // UPDATE
+ Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/,
+ enables choosing, for each tol-node, which image should be used, if any,
+ and outputs choice information into mergedImgList.txt.
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py
index 7b48ac4..41c48a8 100755
--- a/backend/data/dbpedia/genData.py
+++ b/backend/data/dbpedia/genData.py
@@ -24,7 +24,8 @@ dbCur = dbCon.cursor()
# Read/store labels
print("Reading/storing label data")
dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)")
-dbCur.execute("CREATE INDEX labels_idx ON labels(label COLLATE NOCASE)")
+dbCur.execute("CREATE INDEX labels_idx ON labels(label)")
+dbCur.execute("CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)")
labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n')
lineNum = 0
with bz2.open(labelsFile, mode='rt') as file:
diff --git a/backend/data/reviewImgsToMerge.py b/backend/data/reviewImgsToMerge.py
new file mode 100755
index 0000000..15490f7
--- /dev/null
+++ b/backend/data/reviewImgsToMerge.py
@@ -0,0 +1,209 @@
+#!/usr/bin/python3
+
+import sys, re, os, time
+import sqlite3
+import tkinter as tki
+from tkinter import ttk
+import PIL
+from PIL import ImageTk, Image, ImageOps
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Provides a GUI that displays, for each tol-node, an associated image from\n"
+usageInfo += "eol/* and enwiki/*, and enables the user to choose which to use. Writes\n"
+usageInfo += "choice data to a text file with lines of the form 'otolId1 imgPath1', or\n"
+usageInfo += "'otolId1', where no path indicates a choice of no image.\n"
+usageInfo += "\n"
+usageInfo += "The program can be closed, and run again to continue from the last choice.\n"
+usageInfo += "The program looks for an existing output file to determine what choices\n"
+usageInfo += "have already been made.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+eolImgDir = "eol/imgsReviewed/"
+enwikiImgDir = "enwiki/imgs/"
+dbFile = "data.db"
+outFile = "mergedImgList.txt"
+IMG_DISPLAY_SZ = 400
+PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
+
+# Open db
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Associate nodes with images
+nodeToImgs = {} # Maps otol-ids to img-path arrays
+print("Looking through EOL images")
+if os.path.exists(eolImgDir):
+ for filename in os.listdir(eolImgDir):
+ (eolId, _, _) = filename.partition(" ")
+ query = "SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name WHERE eol_ids.id = ?"
+ row = dbCur.execute(query, (int(eolId),)).fetchone()
+ if row == None:
+ print(f"No node found for {eolImgDir}{filename}", file=sys.stderr)
+ continue
+ otolId = row[0]
+ if otolId not in nodeToImgs:
+ nodeToImgs[otolId] = []
+ nodeToImgs[otolId].append(eolImgDir + filename)
+print(f"Result has {len(nodeToImgs)} node entries")
+print("Looking through enwiki images")
+if os.path.exists(enwikiImgDir):
+ for filename in os.listdir(enwikiImgDir):
+ (wikiId, _, _) = filename.partition(".")
+ query = "SELECT nodes.id FROM nodes INNER JOIN descs ON nodes.name = descs.name WHERE descs.wiki_id = ?"
+ row = dbCur.execute(query, (int(wikiId),)).fetchone()
+ if row == None:
+ print(f"No node found for {enwikiImgDir}{filename}", file=sys.stderr)
+ continue
+ otolId = row[0]
+ if otolId not in nodeToImgs:
+ nodeToImgs[otolId] = []
+ nodeToImgs[otolId].append(enwikiImgDir + filename)
+print(f"Result has {len(nodeToImgs)} node entries")
+# Check for already-made choices
+print("Filtering out already-chosen IDs")
+oldSz = len(nodeToImgs)
+if os.path.exists(outFile):
+ with open(outFile) as file:
+ for line in file:
+ line = line.rstrip()
+ if " " in line:
+ line = line[:line.find(" ")]
+ del nodeToImgs[line]
+print(f"Filtered out {oldSz - len(nodeToImgs)} entries")
+
+class ImgReviewer:
+ """ Provides the GUI for reviewing images """
+ def __init__(self, root, nodeToImgs):
+ self.root = root
+ root.title("Image Reviewer")
+ # Setup main frame
+ mainFrame = ttk.Frame(root, padding="5 5 5 5")
+ mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
+ root.columnconfigure(0, weight=1)
+ root.rowconfigure(0, weight=1)
+ # Set up images-to-be-reviewed frames
+ self.eolImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
+ self.enwikiImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
+ self.labels = []
+ for i in (0, 1):
+ frame = ttk.Frame(mainFrame, width=IMG_DISPLAY_SZ, height=IMG_DISPLAY_SZ)
+ frame.grid(column=i, row=0)
+ label = ttk.Label(frame, image=self.eolImg if i == 0 else self.enwikiImg)
+ label.grid(column=0, row=0)
+ self.labels.append(label)
+ # Add padding
+ for child in mainFrame.winfo_children():
+ child.grid_configure(padx=5, pady=5)
+ # Add bindings
+ root.bind("<q>", self.quit)
+ root.bind("<Key-j>", lambda evt: self.accept(0))
+ root.bind("<Key-k>", lambda evt: self.accept(1))
+ root.bind("<Key-l>", lambda evt: self.reject())
+ # Set fields
+ self.nodeImgsList = list(nodeToImgs.items())
+ self.listIdx = -1
+ self.otolId = None
+ self.eolImgPath = None
+ self.enwikiImgPath = None
+ self.numReviewed = 0
+ self.startTime = time.time()
+ # Initialise images to review
+ self.getNextImgs()
+ def getNextImgs(self):
+ """ Updates display with new images to review, or ends program """
+ # Get next image paths
+ self.listIdx += 1
+ if self.listIdx == len(self.nodeImgsList):
+ print("No more images to review. Exiting program.")
+ self.quit()
+ return
+ (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx]
+ # Update displayed images
+ self.eolImgPath = self.enwikiImgPath = None
+ imageOpenError = False
+ for imgPath in imgPaths:
+ img = None
+ try:
+ img = Image.open(imgPath)
+ img = ImageOps.exif_transpose(img)
+ except PIL.UnidentifiedImageError:
+ print(f"UnidentifiedImageError for {imgPath}")
+ imageOpenError = True
+ continue
+ if imgPath.startswith("eol/"):
+ self.eolImgPath = imgPath
+ self.eolImg = ImageTk.PhotoImage(self.resizeForDisplay(img))
+ elif imgPath.startswith("enwiki/"):
+ self.enwikiImgPath = imgPath
+ self.enwikiImg = ImageTk.PhotoImage(self.resizeForDisplay(img))
+ else:
+ print(f"Unexpected image path {imgPath}", file=sys.stderr)
+ self.quit()
+ return
+ # Re-iterate if all image paths invalid
+ if self.eolImgPath == None and self.enwikiImgPath == None:
+ if imageOpenError:
+ self.reject()
+ self.getNextImgs()
+ return
+ # Add placeholder images
+ if self.eolImgPath == None:
+ self.eolImg = ImageTk.PhotoImage(self.resizeForDisplay(PLACEHOLDER_IMG))
+ elif self.enwikiImgPath == None:
+ self.enwikiImg = ImageTk.PhotoImage(self.resizeForDisplay(PLACEHOLDER_IMG))
+ # Update image-frames
+ self.labels[0].config(image=self.eolImg)
+ self.labels[1].config(image=self.enwikiImg)
+ # Update title
+ title = f"Imgs for otol ID {self.otolId}"
+ query = "SELECT names.alt_name FROM" \
+ " nodes INNER JOIN names ON nodes.name = names.name" \
+ " WHERE nodes.id = ? and pref_alt = 1"
+ row = dbCur.execute(query, (self.otolId,)).fetchone()
+ if row != None:
+ title += f", aka {row[0]}"
+ title += f" ({self.listIdx + 1} out of {len(self.nodeImgsList)})"
+ self.root.title(title)
+ def accept(self, imgIdx):
+ """ React to a user selecting an image """
+ imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath
+ if imgPath == None:
+ print("Invalid selection")
+ return
+ with open(outFile, 'a') as file:
+ file.write(f"{self.otolId} {imgPath}\n")
+ self.numReviewed += 1
+ self.getNextImgs()
+ def reject(self):
+ """ React to a user rejecting all images of a set """
+ with open(outFile, 'a') as file:
+ file.write(f"{self.otolId}\n")
+ self.numReviewed += 1
+ self.getNextImgs()
+ def quit(self, e = None):
+ print(f"Number reviewed: {self.numReviewed}")
+ timeElapsed = time.time() - self.startTime
+ print(f"Time elapsed: {timeElapsed:.2f} seconds")
+ if self.numReviewed > 0:
+ print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
+ dbCon.close()
+ self.root.destroy()
+ def resizeForDisplay(self, img):
+ """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """
+ if max(img.width, img.height) > IMG_DISPLAY_SZ:
+ if (img.width > img.height):
+ newHeight = int(img.height * IMG_DISPLAY_SZ/img.width)
+ img = img.resize((IMG_DISPLAY_SZ, newHeight))
+ else:
+ newWidth = int(img.width * IMG_DISPLAY_SZ / img.height)
+ img = img.resize((newWidth, IMG_DISPLAY_SZ))
+ bgImg = PLACEHOLDER_IMG.copy()
+ bgImg.paste(img, box=(
+ int((IMG_DISPLAY_SZ - img.width) / 2),
+ int((IMG_DISPLAY_SZ - img.height) / 2)))
+ return bgImg
+# Create GUI and defer control
+root = tki.Tk()
+ImgReviewer(root, nodeToImgs)
+root.mainloop()