3 files changed, 42 insertions, 30 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 17484f4..18daa99 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -14,7 +14,8 @@ File Generation Process
         It uses data in eol/imagesList.db, and the 'eol\_ids' table.
     2   In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique
         images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info).
-    3   Run genImgsForWeb.py to create cropped/resized images in img/, using
+    3   // UPDATE
+        Run genImgsForWeb.py to create cropped/resized images in img/, using
         images in eol/imgsReviewed/, and also to add an 'images' table to data.db.
     4   Run genLinkedImgs.py to add a 'linked_imgs' table to data.db,
         which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate
@@ -22,21 +23,31 @@ File Generation Process
 4   Node Description Data
     1   Obtain data in dbpedia/, as specified in it's README.
     2   Run genDbpData.py, which adds a 'descs' table to data.db, using
-        data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table.
-5   Supplementary Name/Description Data
+        data in dbpedia/dbpData.db, the 'nodes' table, and possibly
+        dbpNamesToSkip.txt and dbpPickedLabels.txt.
+5   Supplementary Name/Description/Image Data
     1   Obtain data in enwiki/, as specified in it's README.
     2   Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
         enwiki/enwikiData.db, and the 'nodes' table. Also uses genEnwikiDesc*.txt
         files for skipping/resolving some name-page associations.
-    3   Run genEnwikiNameData.py, which adds to the 'names' table, using data in
-        enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+    3   Optionally run genEnwikiNameData.py, which adds to the 'names' table,
+        using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables.
+    4   In enwiki/, run getEnwikiImgData.py, which generates a list of
+        tol-node images, and creates enwiki/enwikiImgs.db to store it.
+        Uses the 'descs' table to get tol-node wiki-ids.
+    5   In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing
+        information for images listed in enwiki/enwikiImgs.db, and stores
+        it in that db.
+    6   In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
+        images listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/.
+    7   // ADD
 5   Reduced Tree Structure Data
     1   Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
         data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
 
 data.db Tables
 ==============
--   nodes:        name TEXT PRIMARY KEY, tips INT
+-   nodes:        name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT
 -   edges:        node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
 -   names:        name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
 -   eol\_ids:     id INT PRIMARY KEY, name TEXT
@@ -51,14 +62,7 @@ Other Files
 -   dbpPickedLabels.txt <br>
     Contains DBpedia labels, one per line. Used by genDbpData.py to help
     resolve conflicts when associating tree-of-life node names with
-    DBpedia node labels. Was generated by manually editing the output
-    of genDbpConflicts.py.
--   genDbpConflicts.py <br>
-    Reads data from dbpedia/dbpData.db, and the 'nodes' table of data.db,
-    and looks for potential conflicts that would arise when genDbpData.db
-    tries to associate tree-of-life node names wth DBpedia node labels. It
-    writes data about them to conflicts.txt, which can be manually edited
-    to resolve them.
+    DBpedia node labels.
 -   genOtolNamesToKeep.txt <br>
     Contains names to avoid trimming off the tree data generated by
     genOtolData.py.  Usage is optional, but, without it, a large amount
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index c9615ef..ea97c9a 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -28,3 +28,10 @@ Generated Files
     -   pages:     id INT PRIMARY KEY, title TEXT UNIQUE
     -   redirects: id INT PRIMARY KEY, target TEXT
     -   descs:     id INT PRIMARY KEY, desc TEXT
+-   enwikiImgs.db <br>
+    Holds infobox-images obtained for some set of wiki page-ids.
+    Generated by running getEnwikiImgData.py, which uses the enwiki dump
+    file and dumpIndex.db. <br>
+    Tables: <br>
+    -   page\_imgs: page\_id INT PRIMARY KEY, img\_name TEXT
+    -   imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT
diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py
index 1992279..1eca0bd 100755
--- a/backend/data/enwiki/getEnwikiImgData.py
+++ b/backend/data/enwiki/getEnwikiImgData.py
@@ -5,16 +5,23 @@ import bz2, html, urllib.parse
 import sqlite3
 
 usageInfo =  f"usage: {sys.argv[0]}\n"
-usageInfo += "Gets nodes with enwiki page-ids, and looks up their content in enwiki/,\n"
-usageInfo += "trying to get infobox image filenames, and prints lines like 'pageId1 filename1'\n"
+usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
+usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
 if len(sys.argv) > 1:
 	print(usageInfo, file=sys.stderr)
 	sys.exit(1)
 
-dbFile = "../data.db"
-indexDb = "dumpIndex.db"
+def getInputPageIds():
+	pageIds = set()
+	dbCon = sqlite3.connect("../data.db")
+	dbCur = dbCon.cursor()
+	for (pageId,) in dbCur.execute("SELECT wiki_id from descs"):
+		pageIds.add(pageId)
+	dbCon.close()
+	return pageIds
 dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
-imgDb = "enwikiImgs.db"
+indexDb = "dumpIndex.db"
+imgDb = "enwikiImgs.db" # Output db
 idLineRegex = re.compile(r"<id>(.*)</id>")
 imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
 bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
@@ -22,19 +29,16 @@ imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNOREC
 cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
 
 # Open dbs
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
 indexDbCon = sqlite3.connect(indexDb)
 indexDbCur = indexDbCon.cursor()
 imgDbCon = sqlite3.connect(imgDb)
 imgDbCur = imgDbCon.cursor()
 # Create image-db table
-imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMAY KEY, img_name TEXT)")
-# Get nodes with enwiki page-ids
-print("Getting nodes with wiki-ids", file=sys.stderr)
-pageIds = set()
-for (pageId,) in dbCur.execute("SELECT wiki_id from descs"):
-	pageIds.add(pageId)
+imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)")
+imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+# Get input pageIds
+print("Getting input page-ids", file=sys.stderr)
+pageIds = getInputPageIds()
 # Get page-id dump-file offsets
 print("Getting dump-file offsets", file=sys.stderr)
 offsetToPageids = {}
@@ -106,8 +110,6 @@ with open(dumpFile, mode='rb') as file:
 		iterNum += 1
 		if iterNum % 100 == 0:
 			print(f"At iteration {iterNum}", file=sys.stderr)
-		if iterNum == 300:
-			break
 		#
 		pageIds = offsetToPageids[pageOffset]
 		# Jump to chunk
@@ -163,7 +165,6 @@ with open(dumpFile, mode='rb') as file:
 			if not foundText:
 				print(f"Did not find <text> for page id {pageId}", file=sys.stderr)
 # Close dbs
-dbCon.close()
 indexDbCon.close()
 imgDbCon.commit()
 imgDbCon.close()