aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/genImgs.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/genImgs.py')
-rwxr-xr-xbackend/tolData/genImgs.py154
1 files changed, 81 insertions, 73 deletions
diff --git a/backend/tolData/genImgs.py b/backend/tolData/genImgs.py
index 930990b..6f72b49 100755
--- a/backend/tolData/genImgs.py
+++ b/backend/tolData/genImgs.py
@@ -17,57 +17,65 @@ to skip.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-imgListFile = "imgList.txt"
-outDir = "img/"
-eolImgDb = "eol/imagesList.db"
-enwikiImgDb = "enwiki/imgData.db"
-pickedImgsDir = "pickedImgs/"
-pickedImgsFilename = "imgData.txt"
-dbFile = "data.db"
+imgListFile = 'imgList.txt'
+outDir = 'img/'
+eolImgDb = 'eol/imagesList.db'
+enwikiImgDb = 'enwiki/imgData.db'
+pickedImgsDir = 'pickedImgs/'
+pickedImgsFilename = 'imgData.txt'
+dbFile = 'data.db'
IMG_OUT_SZ = 200
genImgFiles = True # Usable for debugging
+class PickedImg:
+ """ Represents a picked-image from pickedImgsDir """
+ def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str):
+ self.nodeName = nodeName
+ self.id = id
+ self.filename = filename
+ self.url = url
+ self.license = license
+ self.artist = artist
+ self.credit = credit
+
if not os.path.exists(outDir):
os.mkdir(outDir)
-print("Opening databases")
+print('Opening databases')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
-print("Checking for picked-images")
-nodeToPickedImg = {}
+print('Checking for picked-images')
+nodeToPickedImg: dict[str, PickedImg] = {}
if os.path.exists(pickedImgsDir + pickedImgsFilename):
lineNum = 0
with open(pickedImgsDir + pickedImgsFilename) as file:
for line in file:
lineNum += 1
- (filename, url, license, artist, credit) = line.rstrip().split("|")
+ filename, url, license, artist, credit = line.rstrip().split('|')
nodeName = os.path.splitext(filename)[0] # Remove extension
- (otolId,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (nodeName,)).fetchone()
- nodeToPickedImg[otolId] = {
- "nodeName": nodeName, "id": lineNum,
- "filename": filename, "url": url, "license": license, "artist": artist, "credit": credit,
- }
+ (otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone()
+ nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit)
-print("Checking for image tables")
-nodesDone = set()
-imgsDone = set()
-if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None:
+print('Checking for image tables')
+nodesDone: set[str] = set()
+imgsDone: set[tuple[int, str]] = set()
+if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="node_imgs"').fetchone() is None:
# Add image tables if not present
- dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
- dbCur.execute("CREATE TABLE images" \
- " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
+ dbCur.execute('CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)')
+ dbCur.execute('CREATE TABLE images' \
+ ' (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))')
else:
# Get existing image-associated nodes
- for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"):
+ for (otolId,) in dbCur.execute('SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name'):
nodesDone.add(otolId)
# Get existing node-associated images
- for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
+ for imgId, imgSrc in dbCur.execute('SELECT id, src from images'):
imgsDone.add((imgId, imgSrc))
- print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip")
+ print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')
# Set SIGINT handler
interrupted = False
@@ -76,18 +84,18 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, onSigint)
-print("Iterating through input images")
+print('Iterating through input images')
def quit():
- print("Closing databases")
+ print('Closing databases')
dbCon.commit()
dbCon.close()
eolCon.close()
enwikiCon.close()
sys.exit(0)
def convertImage(imgPath, outPath):
- print(f"Converting {imgPath} to {outPath}")
+ print(f'Converting {imgPath} to {outPath}')
if os.path.exists(outPath):
- print(f"ERROR: Output image already exists")
+ print('ERROR: Output image already exists')
return False
try:
completedProcess = subprocess.run(
@@ -95,94 +103,94 @@ def convertImage(imgPath, outPath):
stdout=subprocess.DEVNULL
)
except Exception as e:
- print(f"ERROR: Exception while attempting to run smartcrop: {e}")
+ print(f'ERROR: Exception while attempting to run smartcrop: {e}')
return False
if completedProcess.returncode != 0:
- print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
+ print(f'ERROR: smartcrop had exit status {completedProcess.returncode}')
return False
return True
-print("Processing picked-images")
-for (otolId, imgData) in nodeToPickedImg.items():
+print('Processing picked-images')
+for otolId, imgData in nodeToPickedImg.items():
# Check for SIGINT event
if interrupted:
- print("Exiting")
+ print('Exiting')
quit()
# Skip if already processed
if otolId in nodesDone:
continue
# Convert image
if genImgFiles:
- success = convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg")
+ success = convertImage(pickedImgsDir + imgData.filename, outDir + otolId + '.jpg')
if not success:
quit()
else:
- print(f"Processing {imgData['nodeName']}: {otolId}.jpg")
+ print(f'Processing {imgData.nodeName}: {otolId}.jpg')
# Add entry to db
- if (imgData["id"], "picked") not in imgsDone:
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
- (imgData["id"], "picked", imgData["url"], imgData["license"], imgData["artist"], imgData["credit"]))
- imgsDone.add((imgData["id"], "picked"))
- dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (imgData["nodeName"], imgData["id"], "picked"))
+ if (imgData.id, 'picked') not in imgsDone:
+ dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
+ (imgData.id, 'picked', imgData.url, imgData.license, imgData.artist, imgData.credit))
+ imgsDone.add((imgData.id, 'picked'))
+ dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked'))
nodesDone.add(otolId)
-print("Processing images from eol and enwiki")
+print('Processing images from eol and enwiki')
iterNum = 0
with open(imgListFile) as file:
for line in file:
iterNum += 1
# Check for SIGINT event
if interrupted:
- print("Exiting")
+ print('Exiting')
break
# Skip lines without an image path
- if line.find(" ") == -1:
+ if line.find(' ') == -1:
continue
# Get filenames
- (otolId, _, imgPath) = line.rstrip().partition(" ")
+ otolId, _, imgPath = line.rstrip().partition(' ')
# Skip if already processed
if otolId in nodesDone:
continue
# Convert image
if genImgFiles:
- success = convertImage(imgPath, outDir + otolId + ".jpg")
+ success = convertImage(imgPath, outDir + otolId + '.jpg')
if not success:
break
else:
if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
+ print(f'At iteration {iterNum}')
# Add entry to db
- (nodeName,) = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone()
- fromEol = imgPath.startswith("eol/")
+ (nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
+ fromEol = imgPath.startswith('eol/')
imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
imgName = os.path.splitext(imgName)[0] # Remove extension
if fromEol:
- eolId, _, contentId = imgName.partition(" ")
- eolId, contentId = (int(eolId), int(contentId))
- if (eolId, "eol") not in imgsDone:
- query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
+ eolIdStr, _, contentIdStr = imgName.partition(' ')
+ eolId, contentId = (int(eolIdStr), int(contentIdStr))
+ if (eolId, 'eol') not in imgsDone:
+ query = 'SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?'
row = eolCur.execute(query, (contentId,)).fetchone()
- if row == None:
- print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}")
+ if row is None:
+ print(f'ERROR: No image record for EOL ID {eolId}, content ID {contentId}')
break
- (url, license, owner) = row
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
- (eolId, "eol", url, license, owner, ""))
- imgsDone.add((eolId, "eol"))
- dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, eolId, "eol"))
+ url, license, owner = row
+ dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
+ (eolId, 'eol', url, license, owner, ''))
+ imgsDone.add((eolId, 'eol'))
+ dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, eolId, 'eol'))
else:
enwikiId = int(imgName)
- if (enwikiId, "enwiki") not in imgsDone:
- query = "SELECT name, license, artist, credit FROM" \
- " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
- " WHERE page_imgs.page_id = ?"
+ if (enwikiId, 'enwiki') not in imgsDone:
+ query = 'SELECT name, license, artist, credit FROM' \
+ ' page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name' \
+ ' WHERE page_imgs.page_id = ?'
row = enwikiCur.execute(query, (enwikiId,)).fetchone()
- if row == None:
- print(f"ERROR: No image record for enwiki ID {enwikiId}")
+ if row is None:
+ print(f'ERROR: No image record for enwiki ID {enwikiId}')
break
- (name, license, artist, credit) = row
- url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
- (enwikiId, "enwiki", url, license, artist, credit))
- imgsDone.add((enwikiId, "enwiki"))
- dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki"))
+ name, license, artist, credit = row
+ url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
+ dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
+ (enwikiId, 'enwiki', url, license, artist, credit))
+ imgsDone.add((enwikiId, 'enwiki'))
+ dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki'))
# Close dbs
quit()