diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-15 20:48:31 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-15 20:48:31 +1000 |
| commit | eabec97d80e5f43e4710dd4c8327d0bc8519ab8c (patch) | |
| tree | 0fd6d93f306b6dc7db2b9233ce63b65ebdc994bc /backend | |
| parent | 3402f3d6e906afb500b95448d7d0b136b6b5ee86 (diff) | |
Separate node-wiki_id association from 'descs' table
Diffstat (limited to 'backend')
| -rw-r--r-- | backend/data/README.md | 27 | ||||
| -rwxr-xr-x | backend/data/enwiki/getEnwikiImgData.py | 2 | ||||
| -rwxr-xr-x | backend/data/genDbpData.py | 9 | ||||
| -rwxr-xr-x | backend/data/genEnwikiDescData.py | 6 | ||||
| -rwxr-xr-x | backend/data/genEnwikiNameData.py | 2 | ||||
| -rwxr-xr-x | backend/data/reviewImgsToMerge.py | 2 | ||||
| -rwxr-xr-x | backend/data/trimTree.py | 2 | ||||
| -rwxr-xr-x | backend/server.py | 12 |
8 files changed, 34 insertions, 28 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index fb3ca16..e622832 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -12,12 +12,13 @@ File Generation Process genEolNameDataPickedIds.txt. 3 Node Description Data 1 Obtain data in dbpedia/ and enwiki/, as specified in their README files. - 2 Run genDbpData.py, which adds a 'descs' table to data.db, using - data in dbpedia/dbpData.db, the 'nodes' table, and possibly + 2 Run genDbpData.py, which adds 'wiki\_ids' and 'descs' tables to data.db, + using data in dbpedia/dbpData.db, the 'nodes' table, and possibly genDescNamesToSkip.txt and dbpPickedLabels.txt. - 3 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, and the 'nodes' table. Also uses genDescNamesToSkip.txt and - genEnwikiDescTitlesToUse.txt for skipping/resolving some name-page associations. + 3 Run genEnwikiDescData.py, which adds to the 'wiki\_ids' and 'descs' tables, + using data in enwiki/enwikiData.db, and the 'nodes' table. + Also uses genDescNamesToSkip.txt and genEnwikiDescTitlesToUse.txt for + skipping/resolving some name-page associations. 3 Image Data 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/. It uses data in eol/imagesList.db, and the 'eol\_ids' table. @@ -25,15 +26,16 @@ File Generation Process images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). 3 In enwiki/, run getEnwikiImgData.py, which generates a list of tol-node images, and creates enwiki/enwikiImgs.db to store it. - Uses the 'descs' table to get tol-node wiki-ids. + Uses the 'wiki\_ids' table to get tol-node wiki-ids. 4 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for images listed in enwiki/enwikiImgs.db, and stores it in that db. 5 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' images in listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/. 
6 Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/, - enables choosing, for each tol-node, which image should be used, if any, - and outputs choice information into mergedImgList.txt. + and enables choosing, for each tol-node, which image should be used, if any, + and outputs choice information into mergedImgList.txt. Uses the 'nodes', + 'eol\_ids', and 'wiki\_ids' tables (as well as 'names' for info-display). 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db. Smartcrop's outputs might need to be manually created/adjusted: <br> @@ -47,14 +49,14 @@ File Generation Process The result might have as many as 150k images, with about 2/3 of them being from wikipedia. 8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, - which uses 'nodes', 'edges', 'eol\_ids', and 'node_imgs', to associate + which uses 'nodes', 'edges', 'eol\_ids', and 'node\_imgs', to associate nodes without images to child images. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. 6 Other - Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table, - using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. + using data in enwiki/enwikiData.db, and the 'names' and 'wiki\_ids' tables. - Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, for the sake of performance and result-relevance. Without this, jumping to certain nodes within the fungi and moths can take over a minute to render. 
@@ -63,9 +65,10 @@ data.db Tables ============== - nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT - edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) -- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name) - eol\_ids: id INT PRIMARY KEY, name TEXT -- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT +- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name) +- wiki\_ids: name TEXT PRIMARY KEY, id INT, redirected INT +- descs: wiki\_id INT PRIMARY KEY, desc TEXT, from\_dbp INT - node\_imgs: name TEXT PRIMARY KEY, img\_id INT, src TEXT - images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) - linked\_imgs: name TEXT PRIMARY KEY, otol\_id INT, otol\_id2 INT diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py index 1eca0bd..f9680ff 100755 --- a/backend/data/enwiki/getEnwikiImgData.py +++ b/backend/data/enwiki/getEnwikiImgData.py @@ -15,7 +15,7 @@ def getInputPageIds(): pageIds = set() dbCon = sqlite3.connect("../data.db") dbCur = dbCon.cursor() - for (pageId,) in dbCur.execute("SELECT wiki_id from descs"): + for (pageId,) in dbCur.execute("SELECT id from wiki_ids"): pageIds.add(pageId) dbCon.close() return pageIds diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py index 3755145..e921b6c 100755 --- a/backend/data/genDbpData.py +++ b/backend/data/genDbpData.py @@ -221,8 +221,8 @@ for (name, iri) in nodeToIri.items(): redirectingIriSet.add(name) # Find descriptions, and add to db print("Adding node description data") -dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)") -dbCur.execute("CREATE INDEX descs_id_idx ON descs(wiki_id)") # wiki_id intentionally left non-unique +dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)") +dbCur.execute("CREATE 
TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)") iterNum = 0 for (name, iri) in nodeToIri.items(): iterNum += 1 @@ -232,8 +232,9 @@ for (name, iri) in nodeToIri.items(): query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?" row = dbpCur.execute(query, (iri,)).fetchone() if row != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)", - (name, row[0], 1 if name in redirectingIriSet else 0, row[1], 1)) + (desc, wikiId) = row + dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0)) + dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1)) # Close dbs dbCon.commit() dbCon.close() diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py index 4445c3f..debd3fd 100755 --- a/backend/data/genEnwikiDescData.py +++ b/backend/data/genEnwikiDescData.py @@ -40,7 +40,7 @@ print(f"Read in {len(nameToPickedTitle)} titles to use for certain names") # Get node names without descriptions print("Getting node names") nodeNames = set() -query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL" +query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL" for row in dbCur.execute(query): nodeNames.add(row[0]) print(f"Found {len(nodeNames)} names") @@ -90,8 +90,8 @@ for (name, pageId) in nodeToPageId.items(): # row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() if row != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)", - (name, row[0], 1 if name in redirectingNames else 0, pageId, 0)) + dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0)) + dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0)) # Close dbs dbCon.commit() dbCon.close() diff --git 
a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py index 1beb522..71960a5 100755 --- a/backend/data/genEnwikiNameData.py +++ b/backend/data/genEnwikiNameData.py @@ -23,7 +23,7 @@ dbCur = dbCon.cursor() # Get nodes with wiki-ids print("Getting nodes with wiki IDs") nodeToWikiId = {} -for row in dbCur.execute("SELECT name, wiki_id from descs"): +for row in dbCur.execute("SELECT name, id from wiki_ids"): nodeToWikiId[row[0]] = row[1] print(f"Found {len(nodeToWikiId)} nodes") # Find wiki-ids that redirect to each node diff --git a/backend/data/reviewImgsToMerge.py b/backend/data/reviewImgsToMerge.py index 4120b14..d177a5e 100755 --- a/backend/data/reviewImgsToMerge.py +++ b/backend/data/reviewImgsToMerge.py @@ -51,7 +51,7 @@ print("Looking through enwiki images") if os.path.exists(enwikiImgDir): for filename in os.listdir(enwikiImgDir): (wikiId, _, _) = filename.partition(".") - query = "SELECT nodes.id FROM nodes INNER JOIN descs ON nodes.name = descs.name WHERE descs.wiki_id = ?" + query = "SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id = ?" 
found = False for (otolId,) in dbCur.execute(query, (int(wikiId),)): if otolId not in nodeToImgs: diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py index 3c98ae8..f580e7e 100755 --- a/backend/data/trimTree.py +++ b/backend/data/trimTree.py @@ -21,7 +21,7 @@ dbCur = dbCon.cursor() print("Finding nodes to keep") nodesToKeep = set() print("\tFinding nodes with descs") -for (name,) in dbCur.execute("SELECT name FROM descs"): +for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc nodesToKeep.add(name) print("\tFinding nodes with images") for (name,) in dbCur.execute("SELECT name FROM node_imgs"): diff --git a/backend/server.py b/backend/server.py index e252d5e..7d11bc4 100755 --- a/backend/server.py +++ b/backend/server.py @@ -117,19 +117,21 @@ def lookupNodeInfo(name, useReducedTree): descData = None match = re.fullmatch(r"\[(.+) \+ (.+)]", name) if match == None: - query = "SELECT desc, redirected, wiki_id, from_dbp from descs WHERE descs.name = ?" + query = "SELECT wiki_id, redirected, desc, from_dbp FROM" \ + " wiki_ids INNER JOIN descs ON wiki_ids.id = descs.wiki_id WHERE wiki_ids.name = ?" 
row = cur.execute(query, (name,)).fetchone() if row != None: - descData = {"text": row[0], "fromRedirect": row[1] == 1, "wikiId": row[2], "fromDbp": row[3] == 1} + descData = {"wikiId": row[0], "fromRedirect": row[1] == 1, "text": row[2], "fromDbp": row[3] == 1} else: # Get descs for compound-node element descData = [None, None] - query = "SELECT name, desc, redirected, wiki_id, from_dbp from descs WHERE descs.name IN (?, ?)" + query = "SELECT name, wiki_id, redirected, desc, from_dbp FROM" \ + " wiki_ids INNER JOIN descs ON wiki_ids.id = descs.wiki_id WHERE wiki_ids.name IN (?, ?)" for row in cur.execute(query, match.group(1,2)): if row[0] == match.group(1): - descData[0] = {"text": row[1], "fromRedirect": row[2] == 1, "wikiId": row[3], "fromDbp": row[4] == 1} + descData[0] = {"wikiId": row[1], "fromRedirect": row[2] == 1, "text": row[3], "fromDbp": row[4] == 1} else: - descData[1] = {"text": row[1], "fromRedirect": row[2] == 1, "wikiId": row[3], "fromDbp": row[4] == 1} + descData[1] = {"wikiId": row[1], "fromRedirect": row[2] == 1, "text": row[3], "fromDbp": row[4] == 1} # Get img info imgData = None if nodeObj != None: |
