Add scripts for generating eol/enwiki mappings

- New data sources: OTOL taxonomy, EOL provider-ids, Wikidata dump
- Add 'node_iucn' table
- Remove 'redirected' field from 'wiki_ids' table
- Make 'eol_ids' table have 'name' as the primary key
- Combine name-generation scripts into genNameData.py
- Combine description-generation scripts into genDescData.py
author: Terry Truong <terry06890@gmail.com> 2022-08-30 12:27:42 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-08-30 12:27:42 +1000
commit: e8e58a3bb9dc233dacf573973457c5b48d369503 (patch)
tree: 242500ca304c5afbb7e6506e61da4c4dfff0b175 /backend/tolData/eol
parent: 930c12d33e1093f874a4beb4d6376621e464e8c0 (diff)
3 files changed, 7 insertions, 4 deletions
diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md
index 1a9dbdf..c07b48e 100644
--- a/backend/tolData/eol/README.md
+++ b/backend/tolData/eol/README.md
@@ -1,4 +1,9 @@
-This directory holds files obtained from/using the [Encyclopedia of Life](https://eol.org/).
+This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/).
+
+# Mapping Files
+-   `provider_ids.csv.gz` <br>
+    Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/22 (says last updated 27/07/22).
+    Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorium.
 
 # Name Data Files
 -   vernacularNames.csv <br>
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
index 7ca4e79..4d658e7 100755
--- a/backend/tolData/eol/downloadImgs.py
+++ b/backend/tolData/eol/downloadImgs.py
@@ -22,8 +22,6 @@ highest EOL ID.
 """, formatter_class=argparse.RawDescriptionHelpFormatter)
 parser.parse_args()
 
-# In testing, this script downloaded about 70k images, over a few days
-
 imagesListDb = "imagesList.db"
 def getInputEolIds():
 	eolIds = set()
@@ -95,7 +93,6 @@ def downloadImg(url, outFile):
 for idx in range(nextIdx, len(eolIds)):
 	eolId = eolIds[idx]
 	# Get image urls
-	imgDataList = []
 	ownerSet = set() # Used to get images from different owners, for variety
 	exitLoop = False
 	query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py
index 0c45887..4dcb6d9 100755
--- a/backend/tolData/eol/genImagesListDb.py
+++ b/backend/tolData/eol/genImagesListDb.py
@@ -18,6 +18,7 @@ dbCon = sqlite3.connect(dbFile)
 dbCur = dbCon.cursor()
 dbCur.execute("CREATE TABLE images" \
 	" (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)")
+dbCur.execute("CREATE INDEX images_pid_idx ON images(page_id)")
 print("Reading CSV files")
 csvFilenames = os.listdir(imagesListDir)
 for filename in csvFilenames:
author	Terry Truong <terry06890@gmail.com>	2022-08-30 12:27:42 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-08-30 12:27:42 +1000
commit	e8e58a3bb9dc233dacf573973457c5b48d369503 (patch)
tree	242500ca304c5afbb7e6506e61da4c4dfff0b175 /backend/tolData/eol
parent	930c12d33e1093f874a4beb4d6376621e464e8c0 (diff)