aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--backend/data/README.md27
-rwxr-xr-xbackend/data/enwiki/getEnwikiImgData.py2
-rwxr-xr-xbackend/data/genDbpData.py9
-rwxr-xr-xbackend/data/genEnwikiDescData.py6
-rwxr-xr-xbackend/data/genEnwikiNameData.py2
-rwxr-xr-xbackend/data/reviewImgsToMerge.py2
-rwxr-xr-xbackend/data/trimTree.py2
-rwxr-xr-xbackend/server.py12
8 files changed, 34 insertions, 28 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index fb3ca16..e622832 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -12,12 +12,13 @@ File Generation Process
genEolNameDataPickedIds.txt.
3 Node Description Data
1 Obtain data in dbpedia/ and enwiki/, as specified in their README files.
- 2 Run genDbpData.py, which adds a 'descs' table to data.db, using
- data in dbpedia/dbpData.db, the 'nodes' table, and possibly
+ 2 Run genDbpData.py, which adds 'wiki\_ids' and 'descs' tables to data.db,
+ using data in dbpedia/dbpData.db, the 'nodes' table, and possibly
genDescNamesToSkip.txt and dbpPickedLabels.txt.
- 3 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
- enwiki/enwikiData.db, and the 'nodes' table. Also uses genDescNamesToSkip.txt and
- genEnwikiDescTitlesToUse.txt for skipping/resolving some name-page associations.
+ 3 Run genEnwikiDescData.py, which adds to the 'wiki\_ids' and 'descs' tables,
+ using data in enwiki/enwikiData.db, and the 'nodes' table.
+ Also uses genDescNamesToSkip.txt and genEnwikiDescTitlesToUse.txt for
+ skipping/resolving some name-page associations.
3 Image Data
1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/.
It uses data in eol/imagesList.db, and the 'eol\_ids' table.
@@ -25,15 +26,16 @@ File Generation Process
images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
3 In enwiki/, run getEnwikiImgData.py, which generates a list of
tol-node images, and creates enwiki/enwikiImgs.db to store it.
- Uses the 'descs' table to get tol-node wiki-ids.
+ Uses the 'wiki\_ids' table to get tol-node wiki-ids.
4 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing
information for images listed in enwiki/enwikiImgs.db, and stores
it in that db.
5 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
		images listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/.
6 Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/,
- enables choosing, for each tol-node, which image should be used, if any,
- and outputs choice information into mergedImgList.txt.
+ and enables choosing, for each tol-node, which image should be used, if any,
+ and outputs choice information into mergedImgList.txt. Uses the 'nodes',
+ 'eol\_ids', and 'wiki\_ids' tables (as well as 'names' for info-display).
7 Run genImgsForWeb.py, which creates cropped/resized images in img/, using
mergedImgList.txt, and adds 'images' and 'node_imgs' tables to data.db.
Smartcrop's outputs might need to be manually created/adjusted: <br>
@@ -47,14 +49,14 @@ File Generation Process
The result might have as many as 150k images, with about 2/3 of them
being from wikipedia.
8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
- which uses 'nodes', 'edges', 'eol\_ids', and 'node_imgs', to associate
+ which uses 'nodes', 'edges', 'eol\_ids', and 'node\_imgs', to associate
nodes without images to child images.
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
6 Other
- Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table,
- using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+ using data in enwiki/enwikiData.db, and the 'names' and 'wiki\_ids' tables.
- Optionally run trimTree.py, which tries to remove some 'low-significance' nodes,
for the sake of performance and result-relevance. Without this, jumping to certain
nodes within the fungi and moths can take over a minute to render.
@@ -63,9 +65,10 @@ data.db Tables
==============
- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT
- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
-- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name)
- eol\_ids: id INT PRIMARY KEY, name TEXT
-- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
+- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name)
+- wiki\_ids: name TEXT PRIMARY KEY, id INT, redirected INT
+- descs: wiki\_id INT PRIMARY KEY, desc TEXT, from\_dbp INT
- node\_imgs: name TEXT PRIMARY KEY, img\_id INT, src TEXT
- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)
- linked\_imgs: name TEXT PRIMARY KEY, otol\_id INT, otol\_id2 INT
diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py
index 1eca0bd..f9680ff 100755
--- a/backend/data/enwiki/getEnwikiImgData.py
+++ b/backend/data/enwiki/getEnwikiImgData.py
@@ -15,7 +15,7 @@ def getInputPageIds():
pageIds = set()
dbCon = sqlite3.connect("../data.db")
dbCur = dbCon.cursor()
- for (pageId,) in dbCur.execute("SELECT wiki_id from descs"):
+ for (pageId,) in dbCur.execute("SELECT id from wiki_ids"):
pageIds.add(pageId)
dbCon.close()
return pageIds
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 3755145..e921b6c 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -221,8 +221,8 @@ for (name, iri) in nodeToIri.items():
redirectingIriSet.add(name)
# Find descriptions, and add to db
print("Adding node description data")
-dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)")
-dbCur.execute("CREATE INDEX descs_id_idx ON descs(wiki_id)") # wiki_id intentionally left non-unique
+dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)")
+dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)")
iterNum = 0
for (name, iri) in nodeToIri.items():
iterNum += 1
@@ -232,8 +232,9 @@ for (name, iri) in nodeToIri.items():
query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
row = dbpCur.execute(query, (iri,)).fetchone()
if row != None:
- dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
- (name, row[0], 1 if name in redirectingIriSet else 0, row[1], 1))
+ (desc, wikiId) = row
+ dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0))
+ dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))
# Close dbs
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index 4445c3f..debd3fd 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -40,7 +40,7 @@ print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
# Get node names without descriptions
print("Getting node names")
nodeNames = set()
-query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL"
+query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL"
for row in dbCur.execute(query):
nodeNames.add(row[0])
print(f"Found {len(nodeNames)} names")
@@ -90,8 +90,8 @@ for (name, pageId) in nodeToPageId.items():
#
row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
if row != None:
- dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
- (name, row[0], 1 if name in redirectingNames else 0, pageId, 0))
+ dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0))
+ dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0))
# Close dbs
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py
index 1beb522..71960a5 100755
--- a/backend/data/genEnwikiNameData.py
+++ b/backend/data/genEnwikiNameData.py
@@ -23,7 +23,7 @@ dbCur = dbCon.cursor()
# Get nodes with wiki-ids
print("Getting nodes with wiki IDs")
nodeToWikiId = {}
-for row in dbCur.execute("SELECT name, wiki_id from descs"):
+for row in dbCur.execute("SELECT name, id from wiki_ids"):
nodeToWikiId[row[0]] = row[1]
print(f"Found {len(nodeToWikiId)} nodes")
# Find wiki-ids that redirect to each node
diff --git a/backend/data/reviewImgsToMerge.py b/backend/data/reviewImgsToMerge.py
index 4120b14..d177a5e 100755
--- a/backend/data/reviewImgsToMerge.py
+++ b/backend/data/reviewImgsToMerge.py
@@ -51,7 +51,7 @@ print("Looking through enwiki images")
if os.path.exists(enwikiImgDir):
for filename in os.listdir(enwikiImgDir):
(wikiId, _, _) = filename.partition(".")
- query = "SELECT nodes.id FROM nodes INNER JOIN descs ON nodes.name = descs.name WHERE descs.wiki_id = ?"
+	query = "SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id = ?"
found = False
for (otolId,) in dbCur.execute(query, (int(wikiId),)):
if otolId not in nodeToImgs:
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py
index 3c98ae8..f580e7e 100755
--- a/backend/data/trimTree.py
+++ b/backend/data/trimTree.py
@@ -21,7 +21,7 @@ dbCur = dbCon.cursor()
print("Finding nodes to keep")
nodesToKeep = set()
print("\tFinding nodes with descs")
-for (name,) in dbCur.execute("SELECT name FROM descs"):
+for (name,) in dbCur.execute("SELECT name FROM wiki_ids"): # Can assume the wiki_id has a desc
nodesToKeep.add(name)
print("\tFinding nodes with images")
for (name,) in dbCur.execute("SELECT name FROM node_imgs"):
diff --git a/backend/server.py b/backend/server.py
index e252d5e..7d11bc4 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -117,19 +117,21 @@ def lookupNodeInfo(name, useReducedTree):
descData = None
match = re.fullmatch(r"\[(.+) \+ (.+)]", name)
if match == None:
- query = "SELECT desc, redirected, wiki_id, from_dbp from descs WHERE descs.name = ?"
+ query = "SELECT wiki_id, redirected, desc, from_dbp FROM" \
+ " wiki_ids INNER JOIN descs ON wiki_ids.id = descs.wiki_id WHERE wiki_ids.name = ?"
row = cur.execute(query, (name,)).fetchone()
if row != None:
- descData = {"text": row[0], "fromRedirect": row[1] == 1, "wikiId": row[2], "fromDbp": row[3] == 1}
+ descData = {"wikiId": row[0], "fromRedirect": row[1] == 1, "text": row[2], "fromDbp": row[3] == 1}
else:
# Get descs for compound-node element
descData = [None, None]
- query = "SELECT name, desc, redirected, wiki_id, from_dbp from descs WHERE descs.name IN (?, ?)"
+ query = "SELECT name, wiki_id, redirected, desc, from_dbp FROM" \
+ " wiki_ids INNER JOIN descs ON wiki_ids.id = descs.wiki_id WHERE wiki_ids.name IN (?, ?)"
for row in cur.execute(query, match.group(1,2)):
if row[0] == match.group(1):
- descData[0] = {"text": row[1], "fromRedirect": row[2] == 1, "wikiId": row[3], "fromDbp": row[4] == 1}
+ descData[0] = {"wikiId": row[1], "fromRedirect": row[2] == 1, "text": row[3], "fromDbp": row[4] == 1}
else:
- descData[1] = {"text": row[1], "fromRedirect": row[2] == 1, "wikiId": row[3], "fromDbp": row[4] == 1}
+ descData[1] = {"wikiId": row[1], "fromRedirect": row[2] == 1, "text": row[3], "fromDbp": row[4] == 1}
# Get img info
imgData = None
if nodeObj != None: