diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
| commit | e8e58a3bb9dc233dacf573973457c5b48d369503 (patch) | |
| tree | 242500ca304c5afbb7e6506e61da4c4dfff0b175 /backend/tolData/eol | |
| parent | 930c12d33e1093f874a4beb4d6376621e464e8c0 (diff) | |
Add scripts for generating eol/enwiki mappings
- New data sources: OTOL taxonomy, EOL provider-ids, Wikidata dump
- Add 'node_iucn' table
- Remove 'redirected' field from 'wiki_ids' table
- Make 'eol_ids' table have 'name' as the primary key
- Combine name-generation scripts into genNameData.py
- Combine description-generation scripts into genDescData.py
Diffstat (limited to 'backend/tolData/eol')
| -rw-r--r-- | backend/tolData/eol/README.md | 7 | ||||
| -rwxr-xr-x | backend/tolData/eol/downloadImgs.py | 3 | ||||
| -rwxr-xr-x | backend/tolData/eol/genImagesListDb.py | 1 |
3 files changed, 7 insertions, 4 deletions
diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md index 1a9dbdf..c07b48e 100644 --- a/backend/tolData/eol/README.md +++ b/backend/tolData/eol/README.md @@ -1,4 +1,9 @@ -This directory holds files obtained from/using the [Encyclopedia of Life](https://eol.org/). +This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/). + +# Mapping Files +- `provider_ids.csv.gz` <br> + Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/22 (says last updated 27/07/22). + Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorum. # Name Data Files - vernacularNames.csv <br> diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py index 7ca4e79..4d658e7 100755 --- a/backend/tolData/eol/downloadImgs.py +++ b/backend/tolData/eol/downloadImgs.py @@ -22,8 +22,6 @@ highest EOL ID. """, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -# In testing, this script downloaded about 70k images, over a few days - imagesListDb = "imagesList.db" def getInputEolIds(): eolIds = set() @@ -95,7 +93,6 @@ def downloadImg(url, outFile): for idx in range(nextIdx, len(eolIds)): eolId = eolIds[idx] # Get image urls - imgDataList = [] ownerSet = set() # Used to get images from different owners, for variety exitLoop = False query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" 
diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py index 0c45887..4dcb6d9 100755 --- a/backend/tolData/eol/genImagesListDb.py +++ b/backend/tolData/eol/genImagesListDb.py @@ -18,6 +18,7 @@ dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE images" \ " (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)") +dbCur.execute("CREATE INDEX images_pid_idx ON images(page_id)") print("Reading CSV files") csvFilenames = os.listdir(imagesListDir) for filename in csvFilenames: |
