aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-05-22 20:20:38 +1000
committerTerry Truong <terry06890@gmail.com>2022-05-22 20:20:38 +1000
commit703750ea5de00354c6bb807f95e8bd9685b3dddd (patch)
tree2947ae4040bd1a8339f4fa5c1b6903a1fdc81f34 /backend/data
parent8ce802ef223ac082975da9d04f10e5dc78529410 (diff)
Up-propagate images for use in parents without images
Also adjust vite config to avoid apparently redundant processing when images are being downloaded.
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md26
-rwxr-xr-xbackend/data/genImgsForWeb.py2
-rwxr-xr-xbackend/data/genLinkedImgs.py78
3 files changed, 94 insertions, 12 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 44219da..438188c 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -10,12 +10,15 @@ File Generation Process
2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
using data in eol/vernacularNames.csv and the 'nodes' table.
3 Image Data
- 1 Use downloadImgsForReview.py to download EOL images into imgsForReview/.
+ 1 Run downloadImgsForReview.py to download EOL images into imgsForReview/.
It uses data in eol/imagesList.db, and the 'eol\_ids' table.
- 2 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique
+ 2 Run reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique
images in imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
- 3 Use genImgsForWeb.py to create cropped/resized images in img/, using
+ 3 Run genImgsForWeb.py to create cropped/resized images in img/, using
images in imgsReviewed, and also to add an 'images' table to data.db.
+ 4 Run genLinkedImgs.py to add a 'linked\_imgs' table to data.db.
+ It uses 'nodes', 'edges', 'eol\_ids', and 'images' to associate
+ nodes without images with child images.
4 Node Description Data
- Using DBpedia
1 Obtain data in dbpedia/, as specified in its README.
@@ -31,14 +34,15 @@ File Generation Process
data.db Tables
==============
-- nodes: name TEXT PRIMARY KEY, tips INT
-- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
-- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
-- eol\_ids: id INT PRIMARY KEY, name TEXT
-- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
-- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
-- r\_nodes: name TEXT PRIMARY KEY, tips INT
-- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
+- nodes: name TEXT PRIMARY KEY, tips INT
+- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
+- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
+- eol\_ids: id INT PRIMARY KEY, name TEXT
+- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
+- linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT
+- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
+- r\_nodes: name TEXT PRIMARY KEY, tips INT
+- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
Other Files
===========
diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py
index 14583d6..91a1cde 100755
--- a/backend/data/genImgsForWeb.py
+++ b/backend/data/genImgsForWeb.py
@@ -28,7 +28,7 @@ if not os.path.exists(outDir):
# Open images-list db
imagesListDbCon = sqlite3.connect(imagesListDb)
imagesListCur = imagesListDbCon.cursor()
-# Create/open data db
+# Open data db
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None:
diff --git a/backend/data/genLinkedImgs.py b/backend/data/genLinkedImgs.py
new file mode 100755
index 0000000..5f49ffc
--- /dev/null
+++ b/backend/data/genLinkedImgs.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python3
+
+import sys
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Adds a table to data.db, associating nodes without images to\n"
+usageInfo += "usable child images.\n"
+if len(sys.argv) > 1: # The script takes no arguments; any argument prints usage and exits
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dbFile = "data.db" # Relative path, so it is resolved against the current working directory
+
+# Open db and create the output table (no IF NOT EXISTS, so this raises if linked_imgs already exists)
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, eol_id INT)")
+# Get nodes with images (node name matches an eol_ids name whose id has a row in images)
+print("Getting nodes with images")
+resolvedNodes = {} # Will map node names to eol IDs with a usable image; entries are popped as they are processed
+query = "SELECT nodes.name, eol_ids.id FROM" \
+ " nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name" \
+ " INNER JOIN images ON eol_ids.id = images.eol_id"
+for (name, eolId) in dbCur.execute(query):
+ resolvedNodes[name] = eolId
+print("Got {} nodes".format(len(resolvedNodes)))
+# Iterate through resolved nodes, resolving ancestors where able (results are written to linked_imgs)
+print("Resolving ancestor nodes")
+nodesToResolve = {} # Maps a pending parent name to its child objects ({name, tips, eolId}), sorted by tips descending
+processedNodes = set() # Names already popped from resolvedNodes or already inserted into linked_imgs
+iterNum = 0
+while len(resolvedNodes) > 0:
+ iterNum += 1
+ if iterNum % 1e3 == 0:
+ print("At iteration {}".format(iterNum))
+ # Get next node (popitem removes it from resolvedNodes)
+ (nodeName, eolId) = resolvedNodes.popitem()
+ processedNodes.add(nodeName)
+ # Traverse upwards, resolving ancestors if able
+ while True:
+ # Get parent (only the first matching edge row is used); stop if there is none, or it is already processed or in resolvedNodes
+ row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (nodeName,)).fetchone()
+ if row == None or row[0] in processedNodes or row[0] in resolvedNodes:
+ break
+ parent = row[0]
+ # Get parent data (children with their tips; built on first visit, then reused from nodesToResolve)
+ if parent not in nodesToResolve:
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (parent,))]
+ query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(childNames)))
+ childObjs = [{"name": row[0], "tips": row[1], "eolId": None} for row in dbCur.execute(query, childNames)]
+ childObjs.sort(key=lambda x: x["tips"], reverse=True)
+ nodesToResolve[parent] = childObjs
+ else:
+ childObjs = nodesToResolve[parent]
+ # Check if highest-tips child (first entry after the descending sort by tips)
+ if (childObjs[0]["name"] == nodeName):
+ # Resolve parent with this child's EOL ID, and continue from it
+ dbCur.execute("INSERT INTO linked_imgs VALUES (?, ?)", (parent, eolId))
+ del nodesToResolve[parent]
+ processedNodes.add(parent)
+ nodeName = parent
+ continue
+ else:
+ # Add potential EOL ID to parent (used later if the parent is still pending when resolvedNodes runs out)
+ childObj = next(c for c in childObjs if c["name"] == nodeName)
+ childObj["eolId"] = eolId
+ break
+ # When out of resolved nodes, resolve any nodesToResolve nodes from their first child (in tips order) that has an EOL ID, and put them back into resolvedNodes
+ if len(resolvedNodes) == 0:
+ for (name, childObjs) in nodesToResolve.items():
+ childObj = next(c for c in childObjs if c["eolId"] != None)
+ resolvedNodes[name] = childObj["eolId"]
+ dbCur.execute("INSERT INTO linked_imgs VALUES (?, ?)", (name, childObj["eolId"]))
+ nodesToResolve.clear()
+# Commit the inserted rows and close db
+dbCon.commit()
+dbCon.close()