aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/genImgs.py
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com>, 2022-09-11 14:55:42 +1000
committer: Terry Truong <terry06890@gmail.com>, 2022-09-11 15:04:14 +1000
commit: 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree: 2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/genImgs.py
parent: daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/genImgs.py')
-rwxr-xr-x  backend/tolData/genImgs.py  196
1 files changed, 0 insertions, 196 deletions
diff --git a/backend/tolData/genImgs.py b/backend/tolData/genImgs.py
deleted file mode 100755
index 6f72b49..0000000
--- a/backend/tolData/genImgs.py
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os, subprocess
-import sqlite3, urllib.parse
-import signal
-
-import argparse
# Command-line interface. No options are declared, so parsing the arguments
# only provides -h/--help and rejects anything unexpected.
helpText = """
Reads node IDs and image paths from a file, and possibly from a directory,
and generates cropped/resized versions of those images into a directory,
with names of the form 'nodeId1.jpg'. Also adds image metadata to the
database.

SIGINT can be used to stop, and the program can be re-run to continue
processing. It uses already-existing database entries to decide what
to skip.
"""
parser = argparse.ArgumentParser(
    description=helpText,
    # Keep the line breaks of the description as written above
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.parse_args()
-
# --- Inputs ---
# Text file read line by line; each line holds a node ID, optionally followed
# by a space and the path of the image to use for that node
imgListFile = 'imgList.txt'
# SQLite database queried for the source URL, license and copyright owner
# of images whose path starts with 'eol/'
eolImgDb = 'eol/imagesList.db'
# SQLite database queried for the name, license, artist and credit of
# the remaining (enwiki) images
enwikiImgDb = 'enwiki/imgData.db'
# Directory of picked images, and the name of the '|'-separated metadata
# file inside it
pickedImgsDir = 'pickedImgs/'
pickedImgsFilename = 'imgData.txt'

# --- Outputs ---
# Directory that receives the converted '<nodeId>.jpg' files
outDir = 'img/'
# SQLite database holding the 'nodes' table; the 'node_imgs' and 'images'
# tables are added to it
dbFile = 'data.db'

# --- Settings ---
# Width and height passed to smartcrop for each output image
IMG_OUT_SZ = 200
# When False, no image files are generated and only database entries are
# added (usable for debugging)
genImgFiles = True
-
class PickedImg:
    """Metadata record for one picked image found in pickedImgsDir.

    Attributes:
        nodeName: Name of the node the image belongs to (the image's
            filename without its extension).
        id: Numeric image ID; the loader uses the line number of the
            image's entry in the metadata file.
        filename: Image filename, relative to pickedImgsDir.
        url: Source URL of the image.
        license: License string of the image.
        artist: Artist string of the image.
        credit: Credit string of the image.
    """

    def __init__(
            self,
            nodeName: str,
            id: int,
            filename: str,
            url: str,
            license: str,
            artist: str,
            credit: str):
        # Which node the image is for, and how it is identified
        self.nodeName = nodeName
        self.id = id
        self.filename = filename
        # Attribution data, stored verbatim into the 'images' table
        self.url = url
        self.license = license
        self.artist = artist
        self.credit = credit
-
# Make sure the output directory exists before any image is written.
# exist_ok=True replaces the earlier "check, then mkdir" sequence, which could
# fail if the directory appeared between the check and the creation.
os.makedirs(outDir, exist_ok=True)
-
print('Opening databases')
# Main database: holds the 'nodes' table and receives the image tables
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# Read-only sources of attribution data for EOL and enwiki images
eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()

print('Checking for picked-images')
# Maps a node's ID (from nodes.id) to the picked image that should be used for it
nodeToPickedImg: dict[str, PickedImg] = {}
pickedImgsFile = os.path.join(pickedImgsDir, pickedImgsFilename)
if os.path.exists(pickedImgsFile):
    with open(pickedImgsFile) as file:
        # The 1-based line number doubles as the picked image's ID, so blank
        # lines are skipped without renumbering the lines that follow
        for lineNum, line in enumerate(file, start=1):
            line = line.rstrip()
            if not line:
                continue
            filename, url, imgLicense, artist, credit = line.split('|')
            nodeName = os.path.splitext(filename)[0] # Remove extension
            row = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone()
            if row is None:
                # Previously this surfaced as an opaque tuple-unpacking TypeError
                sys.exit(f'ERROR: Picked image {filename!r} (line {lineNum} of {pickedImgsFile})'
                    f' has no node named {nodeName!r}')
            (otolId,) = row
            nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, imgLicense, artist, credit)
-
print('Checking for image tables')
# IDs of nodes that already have an image association (skipped on this run)
nodesDone: set[str] = set()
# (image ID, source) pairs that already have a row in the 'images' table
imgsDone: set[tuple[int, str]] = set()
# String literals use single quotes: SQLite treats double-quoted text as an
# identifier and only falls back to a string literal as a legacy quirk
tableQuery = "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'node_imgs'"
if dbCur.execute(tableQuery).fetchone() is None:
    # Add image tables if not present
    dbCur.execute('CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)')
    dbCur.execute(
        'CREATE TABLE images'
        ' (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))')
else:
    # Get existing image-associated nodes
    doneQuery = 'SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name'
    nodesDone.update(otolId for (otolId,) in dbCur.execute(doneQuery))
    # Get existing node-associated images; each row is an (id, src) tuple
    imgsDone.update(dbCur.execute('SELECT id, src from images'))
    print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')
-
# Cooperative shutdown: the handler only raises a flag, and the processing
# loops check that flag at the start of each iteration.
interrupted = False


def onSigint(sig, frame):
    """Signal handler that records that SIGINT was received.

    Both arguments are supplied by the signal module and are not used.
    """
    global interrupted
    interrupted = True


signal.signal(signal.SIGINT, onSigint)
-
print('Iterating through input images')


def quit():
    """Commit pending changes to the main database, close all three
    database connections, and exit the program with status 0.

    Note that this replaces the interactive `quit` builtin within this script.
    """
    print('Closing databases')
    dbCon.commit()
    for connection in (dbCon, eolCon, enwikiCon):
        connection.close()
    sys.exit(0)
def _removeFailedOutput(outPath):
    """Delete whatever a failed conversion may have left at outPath, if anything."""
    try:
        os.remove(outPath)
    except FileNotFoundError:
        pass # Nothing was written
    except OSError as e:
        print(f'WARNING: Unable to remove incomplete output {outPath}: {e}')


def convertImage(imgPath, outPath):
    """Crop/resize the image at imgPath into outPath using smartcrop-cli.

    The output is IMG_OUT_SZ pixels wide and high. Progress and error
    messages are printed rather than raised.

    Args:
        imgPath: Path of the source image.
        outPath: Path to write the converted image to. It must not exist yet;
            an existing file is never overwritten or removed.

    Returns:
        True if smartcrop ran and exited with status 0, otherwise False.
    """
    print(f'Converting {imgPath} to {outPath}')
    if os.path.exists(outPath):
        print('ERROR: Output image already exists')
        return False
    # Built outside the try block so that a mistake in this script is not
    # reported as a smartcrop failure
    command = [
        'npx', 'smartcrop-cli',
        '--width', str(IMG_OUT_SZ),
        '--height', str(IMG_OUT_SZ),
        imgPath, outPath,
    ]
    try:
        completedProcess = subprocess.run(command, stdout=subprocess.DEVNULL)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f'ERROR: Exception while attempting to run smartcrop: {e}')
        _removeFailedOutput(outPath)
        return False
    if completedProcess.returncode != 0:
        print(f'ERROR: smartcrop had exit status {completedProcess.returncode}')
        # outPath did not exist before this call, so anything there now was
        # presumably left by the failed run. Leaving it would make every
        # re-run stop at the "already exists" check above.
        _removeFailedOutput(outPath)
        return False
    return True
print('Processing picked-images')
for otolId, picked in nodeToPickedImg.items():
    # Stop (committing what has been done so far) if SIGINT was received
    if interrupted:
        print('Exiting')
        quit()
    # Nodes that already have an image association are left alone
    if otolId in nodesDone:
        continue
    # Produce the output image, or just report the node when generation is off
    if not genImgFiles:
        print(f'Processing {picked.nodeName}: {otolId}.jpg')
    elif not convertImage(pickedImgsDir + picked.filename, outDir + otolId + '.jpg'):
        quit()
    # Record the image itself (once), then its association with the node
    pickedKey = (picked.id, 'picked')
    if pickedKey not in imgsDone:
        dbCur.execute(
            'INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
            (picked.id, 'picked', picked.url, picked.license, picked.artist, picked.credit))
        imgsDone.add(pickedKey)
    dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (picked.nodeName, picked.id, 'picked'))
    nodesDone.add(otolId)
print('Processing images from eol and enwiki')
# Each line of imgListFile is '<nodeId>' or '<nodeId> <imgPath>'. For every
# line with a path, the image's attribution data is looked up first, then the
# image is converted, and only then are the database rows added. Looking up
# before converting means a missing record no longer leaves behind an output
# file that would make the next run stop with "Output image already exists".
with open(imgListFile) as file:
    for iterNum, line in enumerate(file, start=1):
        # Check for SIGINT event
        if interrupted:
            print('Exiting')
            break
        # Skip lines without an image path
        otolId, separator, imgPath = line.rstrip().partition(' ')
        if not separator:
            continue
        # Skip if already processed
        if otolId in nodesDone:
            continue
        # Find the node's name (key of the node_imgs table)
        nodeRow = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
        if nodeRow is None:
            print(f'ERROR: No node found with ID {otolId}')
            break
        (nodeName,) = nodeRow
        # Derive the image's ID and source from its path
        imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
        imgName = os.path.splitext(imgName)[0] # Remove extension
        newImgRow = None # Row to add to 'images', or None if one already exists
        if imgPath.startswith('eol/'):
            # EOL image names have the form '<eolId> <contentId>'
            eolIdStr, _, contentIdStr = imgName.partition(' ')
            imgId, contentId = (int(eolIdStr), int(contentIdStr))
            imgSrc = 'eol'
            if (imgId, imgSrc) not in imgsDone:
                query = 'SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?'
                row = eolCur.execute(query, (contentId,)).fetchone()
                if row is None:
                    print(f'ERROR: No image record for EOL ID {imgId}, content ID {contentId}')
                    break
                url, imgLicense, owner = row
                newImgRow = (imgId, imgSrc, url, imgLicense, owner, '')
        else:
            # Other image names are an enwiki page ID
            imgId = int(imgName)
            imgSrc = 'enwiki'
            if (imgId, imgSrc) not in imgsDone:
                query = 'SELECT name, license, artist, credit FROM' \
                    ' page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name' \
                    ' WHERE page_imgs.page_id = ?'
                row = enwikiCur.execute(query, (imgId,)).fetchone()
                if row is None:
                    print(f'ERROR: No image record for enwiki ID {imgId}')
                    break
                name, imgLicense, artist, credit = row
                url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
                newImgRow = (imgId, imgSrc, url, imgLicense, artist, credit)
        # Convert image
        if genImgFiles:
            if not convertImage(imgPath, outDir + otolId + '.jpg'):
                break
        elif iterNum % 10000 == 0:
            print(f'At iteration {iterNum}')
        # Add entries to db
        if newImgRow is not None:
            dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', newImgRow)
            imgsDone.add((imgId, imgSrc))
        dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, imgId, imgSrc))
        # Guard against the same node ID being listed twice in the input
        nodesDone.add(otolId)
# Commit, close dbs, and exit
quit()