diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 47 | ||||
| -rw-r--r-- | backend/data/eol/README.md | 5 | ||||
| -rwxr-xr-x | backend/data/genImgsForWeb.py | 148 |
3 files changed, 109 insertions, 91 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 0845450..174c262 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -10,44 +10,42 @@ File Generation Process 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, using data in eol/vernacularNames.csv and the 'nodes' table, and possibly genEolNameDataPickedIds.txt. -3 Image Data - 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/. - It uses data in eol/imagesList.db, and the 'eol\_ids' table. - 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique - images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). - 3 // UPDATE - Run genImgsForWeb.py to create cropped/resized images in img/, using - images in eol/imgsReviewed/, and also to add an 'images' table to data.db. - 4 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, - which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate - nodes without images to child images. -4 Node Description Data - 1 Obtain data in dbpedia/, as specified in it's README. +3 Node Description Data + 1 Obtain data in dbpedia/ and enwiki/, as specified in their README files. 2 Run genDbpData.py, which adds a 'descs' table to data.db, using data in dbpedia/dbpData.db, the 'nodes' table, and possibly genDescNamesToSkip.txt and dbpPickedLabels.txt. -5 Supplementary Name/Description/Image Data - 1 Obtain data in enwiki/, as specified in it's README. - 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in + 3 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in enwiki/enwikiData.db, and the 'nodes' table. Also uses genDescNamesToSkip.txt and genEnwikiDescTitlesToUse.txt for skipping/resolving some name-page associations. - 3 Optionally run genEnwikiNameData.py, which adds to the 'names' table, - using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. 
- 4 In enwiki/, run getEnwikiImgData.py, which generates a list of +4 Image Data + 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/. + It uses data in eol/imagesList.db, and the 'eol\_ids' table. + 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique + images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). + 3 In enwiki/, run getEnwikiImgData.py, which generates a list of tol-node images, and creates enwiki/enwikiImgs.db to store it. Uses the 'descs' table to get tol-node wiki-ids. - 5 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing + 4 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for images listed in enwiki/enwikiImgs.db, and stores it in that db. - 6 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' + 5 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' images listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/. - 7 // UPDATE - Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/, + 6 Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/, enables choosing, for each tol-node, which image should be used, if any, and outputs choice information into mergedImgList.txt. + 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using + mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db. + + 8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate nodes without images to child images. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. 
+6 Other + 1 Can run genEnwikiNameData.py, which adds more entries to the 'names' table, + using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. data.db Tables ============== @@ -55,7 +53,8 @@ data.db Tables - edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) - names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) - eol\_ids: id INT PRIMARY KEY, name TEXT -- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT +- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) +- node\_imgs: id TEXT PRIMARY KEY, img\_id INT, src TEXT - linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT, eol\_id2 INT - descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT - r\_nodes: name TEXT PRIMARY KEY, tips INT diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md index 6f1f6c6..8338be0 100644 --- a/backend/data/eol/README.md +++ b/backend/data/eol/README.md @@ -12,4 +12,7 @@ Generated Files - imagesList/ <br> Obtained by extracting imagesList.tgz. - imagesList.db <br> - Represents data from eol/imagesList/*, and is created by genImagesListDb.sh. + Represents data from eol/imagesList/*, and is created by genImagesListDb.sh. 
<br> + Tables: <br> + - images: + content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py index 68089b7..de8ce1b 100755 --- a/backend/data/genImgsForWeb.py +++ b/backend/data/genImgsForWeb.py @@ -1,94 +1,110 @@ #!/usr/bin/python3 import sys, os, subprocess -import sqlite3 +import sqlite3, urllib.parse import signal usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Creates web-usable copies of reviewed images.\n" -usageInfo += "Looks in a reviewed-images directory for images named 'eolId1 contentId1.ext1', \n" -usageInfo += "and places copied/resized versions in another directory, with name 'eolId1.jpg'.\n" -usageInfo += "Also adds image metadata to a database, making use of an images-list database.\n" +usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n" +usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n" +usageInfo += "Also adds image metadata to an sqlite database.\n" usageInfo += "\n" usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n" -usageInfo += "continue processing. It uses existing output files to decide where from.\n" +usageInfo += "continue processing. 
It uses existing output files to decide where to continue from.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDir = "eol/imgsReviewed/" +imgListFile = "mergedImgList.txt" outDir = "img/" -imagesListDb = "eol/imagesList.db" +eolImgDb = "eol/imagesList.db" +enwikiImgDb = "enwiki/enwikiImgs.db" dbFile = "data.db" IMG_OUT_SZ = 200 # Create output directory if not present if not os.path.exists(outDir): os.mkdir(outDir) -# Open images-list db -imagesListDbCon = sqlite3.connect(imagesListDb) -imagesListCur = imagesListDbCon.cursor() -# Open data db +# Open dbs dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() +eolCon = sqlite3.connect(eolImgDb) +eolCur = eolCon.cursor() +enwikiCon = sqlite3.connect(enwikiImgDb) +enwikiCur = enwikiCon.cursor() +# Create image tables if not present +nodesDone = set() if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None: - dbCur.execute("CREATE TABLE images (eol_id INT PRIMARY KEY, source_url TEXT, license TEXT, copyright_owner TEXT)") -def closeDb(): - dbCon.commit() - dbCon.close() -# Get list of input images -print("Reading input image list") -inputImgList = os.listdir(imgDir) -inputImgList.sort(key=lambda s: int(s.split(" ")[0])) -if len(inputImgList) == 0: - print("No input images found") - closeDb() - sys.exit(0) -# Get next image to convert -inputImgIdx = 0 -print("Checking for existing output files") -outputImgList = os.listdir(outDir) -if len(outputImgList) > 0: - latestOutputId = 0 - for filename in outputImgList: - latestOutputId = max(latestOutputId, int(filename.split(".")[0])) - while int(inputImgList[inputImgIdx].split(" ")[0]) <= latestOutputId: - inputImgIdx += 1 - if inputImgIdx == len(inputImgList): - print("No unprocessed input images found") - closeDb() - sys.exit(0) + dbCur.execute("CREATE TABLE images" \ + " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))") + dbCur.execute("CREATE TABLE 
node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)") +else: + # Get existing node-associations + for (otolId,) in dbCur.execute("SELECT DISTINCT id from node_imgs"): + nodesDone.add(otolId) + print(f"Found {len(nodesDone)} nodes already processed") # Detect SIGINT signals interrupted = False def onSigint(sig, frame): global interrupted interrupted = True signal.signal(signal.SIGINT, onSigint) -# Convert input images - # There are two interrupt checks because the subprocess exits on a SIGINT (not prevented by the handler above). - # The second check prevents adding a db entry for a non-created file. - # The first check prevents starting a new subprocess after a sigint occurs while adding to db -print("Converting images") -for i in range(inputImgIdx, len(inputImgList)): - if interrupted: - print("Exiting") - break - imgName = inputImgList[i] - [eolIdStr, otherStr] = imgName.split(" ") - contentId = int(otherStr.split(".")[0]) - print(f"Converting {imgName}") - subprocess.run( - ['npx', 'smartcrop-cli', - '--width', str(IMG_OUT_SZ), - '--height', str(IMG_OUT_SZ), - imgDir + imgName, - outDir + eolIdStr + ".jpg"], - stdout=subprocess.DEVNULL) - if interrupted: - print("Exiting") - break - # Add entry to db - imagesListQuery = "SELECT content_id, source_url, license, copyright_owner FROM images WHERE content_id = ?" 
- row = imagesListCur.execute(imagesListQuery, (contentId,)).fetchone() - dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?)", (int(eolIdStr), row[1], row[2], row[3])) -closeDb() +# Iterate through images to process +with open(imgListFile) as file: + for line in file: + # Check for SIGINT event + if interrupted: + print("Exiting") + break + # Skip lines without an image path + if line.find(" ") == -1: + continue + # Get filenames + (otolId, _, imgPath) = line.rstrip().partition(" ") + # Skip if already processed + if otolId in nodesDone: + continue + outPath = outDir + otolId + ".jpg" + # Convert image if needed + convertedImage = False + if not os.path.exists(outPath): + print(f"{otolId}: converting {imgPath}") + completedProcess = subprocess.run( + ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath], + stdout=subprocess.DEVNULL + ) + # Prevent adding a db entry after an interrupted conversion + # Needed because the subprocess above exits on a SIGINT (not prevented by onSigint() above) + if completedProcess.returncode < 0: + print("Exiting due to interrupted subprocess") + break + convertedImage = True + # Add entry to db + fromEol = imgPath.startswith("eol/") + imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component + imgName = os.path.splitext(imgName)[0] # Remove extension + if fromEol: + (eolId, _, contentId) = imgName.partition(" ") + (eolId, contentId) = (int(eolId), int(contentId)) + if convertedImage: + query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?" 
+ (url, license, owner) = eolCur.execute(query, (contentId,)).fetchone() + dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", + (eolId, "eol", url, license, owner, "")) + dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, eolId, "eol")) + else: + enwikiId = int(imgName) + if convertedImage: + query = "SELECT name, license, artist, credit FROM" \ + " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \ + " WHERE page_imgs.page_id = ?" + (name, license, artist, credit) = enwikiCur.execute(query, (enwikiId,)).fetchone() + url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name) + dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", + (enwikiId, "enwiki", url, license, artist, credit)) + dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, enwikiId, "enwiki")) +# Close dbs +dbCon.commit() +dbCon.close() +eolCon.close() +enwikiCon.close() |
