aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md32
-rw-r--r--backend/data/enwiki/README.md7
-rwxr-xr-xbackend/data/enwiki/getEnwikiImgData.py33
3 files changed, 42 insertions, 30 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 17484f4..18daa99 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -14,7 +14,8 @@ File Generation Process
It uses data in eol/imagesList.db, and the 'eol\_ids' table.
2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique
images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
- 3 Run genImgsForWeb.py to create cropped/resized images in img/, using
+ 3 // UPDATE
+ Run genImgsForWeb.py to create cropped/resized images in img/, using
images in eol/imgsReviewed/, and also to add an 'images' table to data.db.
4 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate
@@ -22,21 +23,31 @@ File Generation Process
4 Node Description Data
1 Obtain data in dbpedia/, as specified in it's README.
2 Run genDbpData.py, which adds a 'descs' table to data.db, using
- data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table.
-5 Supplementary Name/Description Data
+ data in dbpedia/dbpData.db, the 'nodes' table, and possibly
+ dbpNamesToSkip.txt and dbpPickedLabels.txt.
+5 Supplementary Name/Description/Image Data
1 Obtain data in enwiki/, as specified in it's README.
2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
enwiki/enwikiData.db, and the 'nodes' table. Also uses genEnwikiDesc*.txt
files for skipping/resolving some name-page associations.
- 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in
- enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+ 3 Optionally run genEnwikiNameData.py, which adds to the 'names' table,
+ using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+ 4 In enwiki/, run getEnwikiImgData.py, which generates a list of
+ tol-node images, and creates enwiki/enwikiImgs.db to store it.
+ Uses the 'descs' table to get tol-node wiki-ids.
+ 5 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing
+ information for images listed in enwiki/enwikiImgs.db, and stores
+ it in that db.
+ 6 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
      images listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/.
+ 7 // ADD
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
data.db Tables
==============
-- nodes: name TEXT PRIMARY KEY, tips INT
+- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT
- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
- eol\_ids: id INT PRIMARY KEY, name TEXT
@@ -51,14 +62,7 @@ Other Files
- dbpPickedLabels.txt <br>
Contains DBpedia labels, one per line. Used by genDbpData.py to help
resolve conflicts when associating tree-of-life node names with
- DBpedia node labels. Was generated by manually editing the output
- of genDbpConflicts.py.
-- genDbpConflicts.py <br>
- Reads data from dbpedia/dbpData.db, and the 'nodes' table of data.db,
- and looks for potential conflicts that would arise when genDbpData.db
- tries to associate tree-of-life node names wth DBpedia node labels. It
- writes data about them to conflicts.txt, which can be manually edited
- to resolve them.
+ DBpedia node labels.
- genOtolNamesToKeep.txt <br>
Contains names to avoid trimming off the tree data generated by
genOtolData.py. Usage is optional, but, without it, a large amount
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index c9615ef..ea97c9a 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -28,3 +28,10 @@ Generated Files
- pages: id INT PRIMARY KEY, title TEXT UNIQUE
- redirects: id INT PRIMARY KEY, target TEXT
- descs: id INT PRIMARY KEY, desc TEXT
+- enwikiImgs.db <br>
+ Holds infobox-images obtained for some set of wiki page-ids.
+ Generated by running getEnwikiImgData.py, which uses the enwiki dump
+ file and dumpIndex.db. <br>
+ Tables: <br>
+	- page\_imgs: page\_id INT PRIMARY KEY, img\_name TEXT
+ - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT
diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py
index 1992279..1eca0bd 100755
--- a/backend/data/enwiki/getEnwikiImgData.py
+++ b/backend/data/enwiki/getEnwikiImgData.py
@@ -5,16 +5,23 @@ import bz2, html, urllib.parse
import sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Gets nodes with enwiki page-ids, and looks up their content in enwiki/,\n"
-usageInfo += "trying to get infobox image filenames, and prints lines like 'pageId1 filename1'\n"
+usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
+usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-dbFile = "../data.db"
-indexDb = "dumpIndex.db"
+def getInputPageIds():
+ pageIds = set()
+ dbCon = sqlite3.connect("../data.db")
+ dbCur = dbCon.cursor()
+ for (pageId,) in dbCur.execute("SELECT wiki_id from descs"):
+ pageIds.add(pageId)
+ dbCon.close()
+ return pageIds
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
-imgDb = "enwikiImgs.db"
+indexDb = "dumpIndex.db"
+imgDb = "enwikiImgs.db" # Output db
idLineRegex = re.compile(r"<id>(.*)</id>")
imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
@@ -22,19 +29,16 @@ imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNOREC
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
# Open dbs
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
# Create image-db table
-imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMAY KEY, img_name TEXT)")
-# Get nodes with enwiki page-ids
-print("Getting nodes with wiki-ids", file=sys.stderr)
-pageIds = set()
-for (pageId,) in dbCur.execute("SELECT wiki_id from descs"):
- pageIds.add(pageId)
+imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)")
+imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+# Get input pageIds
+print("Getting input page-ids", file=sys.stderr)
+pageIds = getInputPageIds()
# Get page-id dump-file offsets
print("Getting dump-file offsets", file=sys.stderr)
offsetToPageids = {}
@@ -106,8 +110,6 @@ with open(dumpFile, mode='rb') as file:
iterNum += 1
if iterNum % 100 == 0:
print(f"At iteration {iterNum}", file=sys.stderr)
- if iterNum == 300:
- break
#
pageIds = offsetToPageids[pageOffset]
# Jump to chunk
@@ -163,7 +165,6 @@ with open(dumpFile, mode='rb') as file:
if not foundText:
print(f"Did not find <text> for page id {pageId}", file=sys.stderr)
# Close dbs
-dbCon.close()
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()