aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md47
-rw-r--r--backend/data/eol/README.md5
-rwxr-xr-xbackend/data/genImgsForWeb.py148
3 files changed, 109 insertions, 91 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 0845450..174c262 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -10,44 +10,42 @@ File Generation Process
2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
using data in eol/vernacularNames.csv and the 'nodes' table, and possibly
genEolNameDataPickedIds.txt.
-3 Image Data
- 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/.
- It uses data in eol/imagesList.db, and the 'eol\_ids' table.
- 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique
- images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
- 3 // UPDATE
- Run genImgsForWeb.py to create cropped/resized images in img/, using
- images in eol/imgsReviewed/, and also to add an 'images' table to data.db.
- 4 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
- which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate
- nodes without images to child images.
-4 Node Description Data
- 1 Obtain data in dbpedia/, as specified in it's README.
+3 Node Description Data
+ 1 Obtain data in dbpedia/ and enwiki/, as specified in their README files.
2 Run genDbpData.py, which adds a 'descs' table to data.db, using
data in dbpedia/dbpData.db, the 'nodes' table, and possibly
genDescNamesToSkip.txt and dbpPickedLabels.txt.
-5 Supplementary Name/Description/Image Data
- 1 Obtain data in enwiki/, as specified in it's README.
- 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
+ 3 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
enwiki/enwikiData.db, and the 'nodes' table. Also uses genDescNamesToSkip.txt and
genEnwikiDescTitlesToUse.txt for skipping/resolving some name-page associations.
- 3 Optionally run genEnwikiNameData.py, which adds to the 'names' table,
- using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
- 4 In enwiki/, run getEnwikiImgData.py, which generates a list of
+4 Image Data
+ 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/.
+ It uses data in eol/imagesList.db, and the 'eol\_ids' table.
+ 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique
+ images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
+ 3 In enwiki/, run getEnwikiImgData.py, which generates a list of
tol-node images, and creates enwiki/enwikiImgs.db to store it.
Uses the 'descs' table to get tol-node wiki-ids.
- 5 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing
+ 4 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing
information for images listed in enwiki/enwikiImgs.db, and stores
it in that db.
- 6 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
+ 5 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
images in listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/.
- 7 // UPDATE
- Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/,
+ 6 Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/,
enables choosing, for each tol-node, which image should be used, if any,
and outputs choice information into mergedImgList.txt.
+ 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using
+ mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db.
+
+ 8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
+ which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate
+ nodes without images to child images.
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
+6 Other
+ 1 Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table,
+ using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
data.db Tables
==============
@@ -55,7 +53,8 @@ data.db Tables
- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
- eol\_ids: id INT PRIMARY KEY, name TEXT
-- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
+- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)
+- node\_imgs: id TEXT PRIMARY KEY, img\_id INT, src TEXT
- linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT, eol\_id2 INT
- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
- r\_nodes: name TEXT PRIMARY KEY, tips INT
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md
index 6f1f6c6..8338be0 100644
--- a/backend/data/eol/README.md
+++ b/backend/data/eol/README.md
@@ -12,4 +12,7 @@ Generated Files
- imagesList/ <br>
Obtained by extracting imagesList.tgz.
- imagesList.db <br>
- Represents data from eol/imagesList/*, and is created by genImagesListDb.sh.
+ Represents data from eol/imagesList/*, and is created by genImagesListDb.sh. <br>
+ Tables: <br>
+ - images:
+ content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
index 68089b7..de8ce1b 100755
--- a/backend/data/genImgsForWeb.py
+++ b/backend/data/genImgsForWeb.py
@@ -1,94 +1,110 @@
#!/usr/bin/python3
import sys, os, subprocess
-import sqlite3
+import sqlite3, urllib.parse
import signal
usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Creates web-usable copies of reviewed images.\n"
-usageInfo += "Looks in a reviewed-images directory for images named 'eolId1 contentId1.ext1', \n"
-usageInfo += "and places copied/resized versions in another directory, with name 'eolId1.jpg'.\n"
-usageInfo += "Also adds image metadata to a database, making use of an images-list database.\n"
+usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n"
+usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n"
+usageInfo += "Also adds image metadata to an sqlite database.\n"
usageInfo += "\n"
usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n"
-usageInfo += "continue processing. It uses existing output files to decide where from.\n"
+usageInfo += "continue processing. It uses existing output files to decide where to continue from.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-imgDir = "eol/imgsReviewed/"
+imgListFile = "mergedImgList.txt"
outDir = "img/"
-imagesListDb = "eol/imagesList.db"
+eolImgDb = "eol/imagesList.db"
+enwikiImgDb = "enwiki/enwikiImgs.db"
dbFile = "data.db"
IMG_OUT_SZ = 200
# Create output directory if not present
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Open images-list db
-imagesListDbCon = sqlite3.connect(imagesListDb)
-imagesListCur = imagesListDbCon.cursor()
-# Open data db
+# Open dbs
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+eolCon = sqlite3.connect(eolImgDb)
+eolCur = eolCon.cursor()
+enwikiCon = sqlite3.connect(enwikiImgDb)
+enwikiCur = enwikiCon.cursor()
+# Create image tables if not present
+nodesDone = set()
if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None:
- dbCur.execute("CREATE TABLE images (eol_id INT PRIMARY KEY, source_url TEXT, license TEXT, copyright_owner TEXT)")
-def closeDb():
- dbCon.commit()
- dbCon.close()
-# Get list of input images
-print("Reading input image list")
-inputImgList = os.listdir(imgDir)
-inputImgList.sort(key=lambda s: int(s.split(" ")[0]))
-if len(inputImgList) == 0:
- print("No input images found")
- closeDb()
- sys.exit(0)
-# Get next image to convert
-inputImgIdx = 0
-print("Checking for existing output files")
-outputImgList = os.listdir(outDir)
-if len(outputImgList) > 0:
- latestOutputId = 0
- for filename in outputImgList:
- latestOutputId = max(latestOutputId, int(filename.split(".")[0]))
- while int(inputImgList[inputImgIdx].split(" ")[0]) <= latestOutputId:
- inputImgIdx += 1
- if inputImgIdx == len(inputImgList):
- print("No unprocessed input images found")
- closeDb()
- sys.exit(0)
+ dbCur.execute("CREATE TABLE images" \
+ " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
+ dbCur.execute("CREATE TABLE node_imgs (id TEXT PRIMARY KEY, img_id INT, src TEXT)")
+else:
+ # Get existing node-associations
+ for (otolId,) in dbCur.execute("SELECT DISTINCT id from node_imgs"):
+ nodesDone.add(otolId)
+ print(f"Found {len(nodesDone)} nodes already processed")
# Detect SIGINT signals
interrupted = False
def onSigint(sig, frame):
global interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
-# Convert input images
- # There are two interrupt checks because the subprocess exits on a SIGINT (not prevented by the handler above).
- # The second check prevents adding a db entry for a non-created file.
- # The first check prevents starting a new subprocess after a sigint occurs while adding to db
-print("Converting images")
-for i in range(inputImgIdx, len(inputImgList)):
- if interrupted:
- print("Exiting")
- break
- imgName = inputImgList[i]
- [eolIdStr, otherStr] = imgName.split(" ")
- contentId = int(otherStr.split(".")[0])
- print(f"Converting {imgName}")
- subprocess.run(
- ['npx', 'smartcrop-cli',
- '--width', str(IMG_OUT_SZ),
- '--height', str(IMG_OUT_SZ),
- imgDir + imgName,
- outDir + eolIdStr + ".jpg"],
- stdout=subprocess.DEVNULL)
- if interrupted:
- print("Exiting")
- break
- # Add entry to db
- imagesListQuery = "SELECT content_id, source_url, license, copyright_owner FROM images WHERE content_id = ?"
- row = imagesListCur.execute(imagesListQuery, (contentId,)).fetchone()
- dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?)", (int(eolIdStr), row[1], row[2], row[3]))
-closeDb()
+# Iterate through images to process
+with open(imgListFile) as file:
+ for line in file:
+ # Check for SIGINT event
+ if interrupted:
+ print("Exiting")
+ break
+ # Skip lines without an image path
+ if line.find(" ") == -1:
+ continue
+ # Get filenames
+ (otolId, _, imgPath) = line.rstrip().partition(" ")
+ # Skip if already processed
+ if otolId in nodesDone:
+ continue
+ outPath = outDir + otolId + ".jpg"
+ # Convert image if needed
+ convertedImage = False
+ if not os.path.exists(outPath):
+ print(f"{otolId}: converting {imgPath}")
+ completedProcess = subprocess.run(
+ ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
+ stdout=subprocess.DEVNULL
+ )
+ # Prevent adding a db entry after an interrupted conversion
+ # Needed because the subprocess above exits on a SIGINT (not prevented by onSigint() above)
+ if completedProcess.returncode < 0:
+ print("Exiting due to interrupted subprocess")
+ break
+ convertedImage = True
+ # Add entry to db
+ fromEol = imgPath.startswith("eol/")
+ imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
+ imgName = os.path.splitext(imgName)[0] # Remove extension
+ if fromEol:
+ (eolId, _, contentId) = imgName.partition(" ")
+ (eolId, contentId) = (int(eolId), int(contentId))
+ if convertedImage:
+ query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
+ (url, license, owner) = eolCur.execute(query, (contentId,)).fetchone()
+ dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ (eolId, "eol", url, license, owner, ""))
+ dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, eolId, "eol"))
+ else:
+ enwikiId = int(imgName)
+ if convertedImage:
+ query = "SELECT name, license, artist, credit FROM" \
+ " page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name" \
+ " WHERE page_imgs.page_id = ?"
+ (name, license, artist, credit) = enwikiCur.execute(query, (enwikiId,)).fetchone()
+ url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
+ dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
+ (enwikiId, "enwiki", url, license, artist, credit))
+ dbCur.execute("INSERT INTO node_imgs VALUES (?, ?, ?)", (otolId, enwikiId, "enwiki"))
+# Close dbs
+dbCon.commit()
+dbCon.close()
+eolCon.close()
+enwikiCon.close()