diff options
Diffstat (limited to 'backend/tolData/genImgs.py')
| -rwxr-xr-x | backend/tolData/genImgs.py | 154 |
1 files changed, 81 insertions, 73 deletions
diff --git a/backend/tolData/genImgs.py b/backend/tolData/genImgs.py index 930990b..6f72b49 100755 --- a/backend/tolData/genImgs.py +++ b/backend/tolData/genImgs.py @@ -17,57 +17,65 @@ to skip. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -imgListFile = "imgList.txt" -outDir = "img/" -eolImgDb = "eol/imagesList.db" -enwikiImgDb = "enwiki/imgData.db" -pickedImgsDir = "pickedImgs/" -pickedImgsFilename = "imgData.txt" -dbFile = "data.db" +imgListFile = 'imgList.txt' +outDir = 'img/' +eolImgDb = 'eol/imagesList.db' +enwikiImgDb = 'enwiki/imgData.db' +pickedImgsDir = 'pickedImgs/' +pickedImgsFilename = 'imgData.txt' +dbFile = 'data.db' IMG_OUT_SZ = 200 genImgFiles = True # Usable for debugging +class PickedImg: + """ Represents a picked-image from pickedImgsDir """ + def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str): + self.nodeName = nodeName + self.id = id + self.filename = filename + self.url = url + self.license = license + self.artist = artist + self.credit = credit + if not os.path.exists(outDir): os.mkdir(outDir) -print("Opening databases") +print('Opening databases') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() eolCon = sqlite3.connect(eolImgDb) eolCur = eolCon.cursor() enwikiCon = sqlite3.connect(enwikiImgDb) enwikiCur = enwikiCon.cursor() -print("Checking for picked-images") -nodeToPickedImg = {} +print('Checking for picked-images') +nodeToPickedImg: dict[str, PickedImg] = {} if os.path.exists(pickedImgsDir + pickedImgsFilename): lineNum = 0 with open(pickedImgsDir + pickedImgsFilename) as file: for line in file: lineNum += 1 - (filename, url, license, artist, credit) = line.rstrip().split("|") + filename, url, license, artist, credit = line.rstrip().split('|') nodeName = os.path.splitext(filename)[0] # Remove extension - (otolId,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (nodeName,)).fetchone() - nodeToPickedImg[otolId] = { - "nodeName": nodeName, "id": lineNum, - "filename": filename, "url": url, "license": license, "artist": artist, "credit": credit, - } + (otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone() + nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit) -print("Checking for image tables") -nodesDone = set() -imgsDone = set() -if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None: +print('Checking for image tables') +nodesDone: set[str] = set() +imgsDone: set[tuple[int, str]] = set() +if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="node_imgs"').fetchone() is None: # Add image tables if not present - dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)") - dbCur.execute("CREATE TABLE images" \ - " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))") + dbCur.execute('CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)') + dbCur.execute('CREATE TABLE images' \ + ' (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))') else: # Get existing image-associated nodes - for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"): + for (otolId,) in dbCur.execute('SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name'): nodesDone.add(otolId) # Get existing node-associated images - for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"): + for imgId, imgSrc in dbCur.execute('SELECT id, src from images'): imgsDone.add((imgId, imgSrc)) - print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip") + print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip') # Set SIGINT handler interrupted = False @@ -76,18 +84,18 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, onSigint) -print("Iterating through input images") +print('Iterating through input images') def quit(): - print("Closing databases") + print('Closing databases') dbCon.commit() dbCon.close() eolCon.close() enwikiCon.close() sys.exit(0) def convertImage(imgPath, outPath): - print(f"Converting {imgPath} to {outPath}") + print(f'Converting {imgPath} to {outPath}') if os.path.exists(outPath): - print(f"ERROR: Output image already exists") + print('ERROR: Output image already exists') return False try: completedProcess = subprocess.run( @@ -95,94 +103,94 @@ def convertImage(imgPath, outPath): stdout=subprocess.DEVNULL ) except Exception as e: - print(f"ERROR: Exception while attempting to run smartcrop: {e}") + print(f'ERROR: Exception while attempting to run smartcrop: {e}') return False if completedProcess.returncode != 0: - print(f"ERROR: smartcrop had exit status {completedProcess.returncode}") + print(f'ERROR: smartcrop had exit status {completedProcess.returncode}') return False return True -print("Processing picked-images") -for (otolId, imgData) in nodeToPickedImg.items(): +print('Processing picked-images') +for otolId, imgData in nodeToPickedImg.items(): # Check for SIGINT event if interrupted: - print("Exiting") + print('Exiting') quit() # Skip if already processed if otolId in nodesDone: continue # Convert image if genImgFiles: - success = convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg") + success = convertImage(pickedImgsDir + imgData.filename, outDir + otolId + '.jpg') if not success: quit() else: - print(f"Processing {imgData['nodeName']}: {otolId}.jpg") + print(f'Processing {imgData.nodeName}: {otolId}.jpg') # Add entry to db - if (imgData["id"], "picked") not in imgsDone: - dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", - (imgData["id"], "picked", imgData["url"], imgData["license"], imgData["artist"], imgData["credit"])) - imgsDone.add((imgData["id"], "picked")) - dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (imgData["nodeName"], imgData["id"], "picked")) + if (imgData.id, 'picked') not in imgsDone: + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', + (imgData.id, 'picked', imgData.url, imgData.license, imgData.artist, imgData.credit)) + imgsDone.add((imgData.id, 'picked')) + dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked')) nodesDone.add(otolId) -print("Processing images from eol and enwiki") +print('Processing images from eol and enwiki') iterNum = 0 with open(imgListFile) as file: for line in file: iterNum += 1 # Check for SIGINT event if interrupted: - print("Exiting") + print('Exiting') break # Skip lines without an image path - if line.find(" ") == -1: + if line.find(' ') == -1: continue # Get filenames - (otolId, _, imgPath) = line.rstrip().partition(" ") + otolId, _, imgPath = line.rstrip().partition(' ') # Skip if already processed if otolId in nodesDone: continue # Convert image if genImgFiles: - success = convertImage(imgPath, outDir + otolId + ".jpg") + success = convertImage(imgPath, outDir + otolId + '.jpg') if not success: break else: if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") + print(f'At iteration {iterNum}') # Add entry to db - (nodeName,) = dbCur.execute("SELECT name FROM nodes WHERE id = ?", (otolId,)).fetchone() - fromEol = imgPath.startswith("eol/") + (nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone() + fromEol = imgPath.startswith('eol/') imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component imgName = os.path.splitext(imgName)[0] # Remove extension if fromEol: - eolId, _, contentId = imgName.partition(" ") - eolId, contentId = (int(eolId), int(contentId)) - if (eolId, "eol") not in imgsDone: - query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?" + eolIdStr, _, contentIdStr = imgName.partition(' ') + eolId, contentId = (int(eolIdStr), int(contentIdStr)) + if (eolId, 'eol') not in imgsDone: + query = 'SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?' row = eolCur.execute(query, (contentId,)).fetchone() - if row == None: - print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}") + if row is None: + print(f'ERROR: No image record for EOL ID {eolId}, content ID {contentId}') break - (url, license, owner) = row - dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", - (eolId, "eol", url, license, owner, "")) - imgsDone.add((eolId, "eol")) - dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, eolId, "eol")) + url, license, owner = row + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', + (eolId, 'eol', url, license, owner, '')) + imgsDone.add((eolId, 'eol')) + dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, eolId, 'eol')) else: enwikiId = int(imgName) - if (enwikiId, "enwiki") not in imgsDone: - query = "SELECT name, license, artist, credit FROM" \ - " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \ - " WHERE page_imgs.page_id = ?" + if (enwikiId, 'enwiki') not in imgsDone: + query = 'SELECT name, license, artist, credit FROM' \ + ' page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name' \ + ' WHERE page_imgs.page_id = ?' row = enwikiCur.execute(query, (enwikiId,)).fetchone() - if row == None: - print(f"ERROR: No image record for enwiki ID {enwikiId}") + if row is None: + print(f'ERROR: No image record for enwiki ID {enwikiId}') break - (name, license, artist, credit) = row - url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name) - dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", - (enwikiId, "enwiki", url, license, artist, credit)) - imgsDone.add((enwikiId, "enwiki")) - dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (nodeName, enwikiId, "enwiki")) + name, license, artist, credit = row + url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name) + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', + (enwikiId, 'enwiki', url, license, artist, credit)) + imgsDone.add((enwikiId, 'enwiki')) + dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki')) # Close dbs quit() |
