diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 26 | ||||
| -rwxr-xr-x | backend/data/genImgsForWeb.py | 2 | ||||
| -rwxr-xr-x | backend/data/genLinkedImgs.py | 78 |
3 files changed, 94 insertions, 12 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 44219da..438188c 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -10,12 +10,15 @@ File Generation Process 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, using data in eol/vernacularNames.csv and the 'nodes' table. 3 Image Data - 1 Use downloadImgsForReview.py to download EOL images into imgsForReview/. + 1 Run downloadImgsForReview.py to download EOL images into imgsForReview/. It uses data in eol/imagesList.db, and the 'eol\_ids' table. - 2 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique + 2 Run reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique images in imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). - 3 Use genImgsForWeb.py to create cropped/resized images in img/, using + 3 Run genImgsForWeb.py to create cropped/resized images in img/, using images in imgsReviewed, and also to add an 'images' table to data.db. + 4 Run genLinkedImgs.py to add a 'linked\_imgs' table to data.db, + which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate + nodes without images to child images. 4 Node Description Data - Using DBpedia 1 Obtain data in dbpedia/, as specified in its README. 
@@ -31,14 +34,15 @@ File Generation Process data.db Tables ============== -- nodes: name TEXT PRIMARY KEY, tips INT -- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) -- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) -- eol\_ids: id INT PRIMARY KEY, name TEXT -- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT -- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT -- r\_nodes: name TEXT PRIMARY KEY, tips INT -- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) +- nodes: name TEXT PRIMARY KEY, tips INT +- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) +- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) +- eol\_ids: id INT PRIMARY KEY, name TEXT +- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT +- linked\_imgs: name TEXT PRIMARY KEY, eol\_id INT +- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT +- r\_nodes: name TEXT PRIMARY KEY, tips INT +- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) Other Files =========== diff --git a/backend/data/genImgsForWeb.py b/backend/data/genImgsForWeb.py index 14583d6..91a1cde 100755 --- a/backend/data/genImgsForWeb.py +++ b/backend/data/genImgsForWeb.py @@ -28,7 +28,7 @@ if not os.path.exists(outDir): # Open images-list db imagesListDbCon = sqlite3.connect(imagesListDb) imagesListCur = imagesListDbCon.cursor() -# Create/open data db +# Open data db dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='images'").fetchone() == None: diff --git a/backend/data/genLinkedImgs.py b/backend/data/genLinkedImgs.py new file mode 100755 index 0000000..5f49ffc --- /dev/null +++ b/backend/data/genLinkedImgs.py @@ -0,0 +1,78 @@ +#!/usr/bin/python3 + +import sys 
+import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Adds a table to data.db, associating nodes without images to\n" +usageInfo += "usable child images.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbFile = "data.db" + +# Open db +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, eol_id INT)") +# Get nodes with images +print("Getting nodes with images") +resolvedNodes = {} # Will map node names to eol IDs with a usable image +query = "SELECT nodes.name, eol_ids.id FROM" \ + " nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name" \ + " INNER JOIN images ON eol_ids.id = images.eol_id" +for (name, eolId) in dbCur.execute(query): + resolvedNodes[name] = eolId +print("Got {} nodes".format(len(resolvedNodes))) +# Iterate through resolved nodes, resolving ancestors where able +print("Resolving ancestor nodes") +nodesToResolve = {} +processedNodes = set() +iterNum = 0 +while len(resolvedNodes) > 0: + iterNum += 1 + if iterNum % 1e3 == 0: + print("At iteration {}".format(iterNum)) + # Get next node + (nodeName, eolId) = resolvedNodes.popitem() + processedNodes.add(nodeName) + # Traverse upwards, resolving ancestors if able + while True: + # Get parent + row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (nodeName,)).fetchone() + if row == None or row[0] in processedNodes or row[0] in resolvedNodes: + break + parent = row[0] + # Get parent data + if parent not in nodesToResolve: + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (parent,))] + query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(childNames))) + childObjs = [{"name": row[0], "tips": row[1], "eolId": None} for row in dbCur.execute(query, childNames)] + childObjs.sort(key=lambda x: x["tips"], reverse=True) + nodesToResolve[parent] = childObjs + else: + childObjs = nodesToResolve[parent] + # Check if 
highest-tips child + if (childObjs[0]["name"] == nodeName): + # Resolve parent, and continue from it + dbCur.execute("INSERT INTO linked_imgs VALUES (?, ?)", (parent, eolId)) + del nodesToResolve[parent] + processedNodes.add(parent) + nodeName = parent + continue + else: + # Add potential EOL ID to parent + childObj = next(c for c in childObjs if c["name"] == nodeName) + childObj["eolId"] = eolId + break + # When out of resolved nodes, resolve any nodesToResolve nodes + if len(resolvedNodes) == 0: + for (name, childObjs) in nodesToResolve.items(): + childObj = next(c for c in childObjs if c["eolId"] != None) + resolvedNodes[name] = childObj["eolId"] + dbCur.execute("INSERT INTO linked_imgs VALUES (?, ?)", (name, childObj["eolId"])) + nodesToResolve.clear() +# Close db +dbCon.commit() +dbCon.close() |
