diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
| commit | e8e58a3bb9dc233dacf573973457c5b48d369503 (patch) | |
| tree | 242500ca304c5afbb7e6506e61da4c4dfff0b175 /backend/tolData | |
| parent | 930c12d33e1093f874a4beb4d6376621e464e8c0 (diff) | |
Add scripts for generating eol/enwiki mappings
- New data sources: OTOL taxonomy, EOL provider-ids, Wikidata dump
- Add 'node_iucn' table
- Remove 'redirected' field from 'wiki_ids' table
- Make 'eol_ids' table have 'name' as the primary key
- Combine name-generation scripts into genNameData.py
- Combine description-generation scripts into genDescData.py
Diffstat (limited to 'backend/tolData')
| -rw-r--r-- | backend/tolData/README.md | 110 | ||||
| -rwxr-xr-x | backend/tolData/addPickedNames.py | 54 | ||||
| -rwxr-xr-x | backend/tolData/dbpedia/genDescData.py | 1 | ||||
| -rw-r--r-- | backend/tolData/enwiki/README.md | 2 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/genImgData.py | 1 | ||||
| -rw-r--r-- | backend/tolData/eol/README.md | 7 | ||||
| -rwxr-xr-x | backend/tolData/eol/downloadImgs.py | 3 | ||||
| -rwxr-xr-x | backend/tolData/eol/genImagesListDb.py | 1 | ||||
| -rwxr-xr-x | backend/tolData/genDbpData.py | 245 | ||||
| -rwxr-xr-x | backend/tolData/genDescData.py | 91 | ||||
| -rwxr-xr-x | backend/tolData/genEnwikiDescData.py | 99 | ||||
| -rwxr-xr-x | backend/tolData/genEnwikiNameData.py | 73 | ||||
| -rwxr-xr-x | backend/tolData/genEolNameData.py | 181 | ||||
| -rwxr-xr-x | backend/tolData/genMappingData.py | 229 | ||||
| -rwxr-xr-x | backend/tolData/genNameData.py | 113 | ||||
| -rw-r--r-- | backend/tolData/otol/README.md | 19 | ||||
| -rwxr-xr-x | backend/tolData/reviewImgsToGen.py | 1 | ||||
| -rw-r--r-- | backend/tolData/wikidata/README.md | 18 | ||||
| -rwxr-xr-x | backend/tolData/wikidata/genTaxonSrcData.py | 230 |
19 files changed, 756 insertions, 722 deletions
diff --git a/backend/tolData/README.md b/backend/tolData/README.md index 21c02ab..1248098 100644 --- a/backend/tolData/README.md +++ b/backend/tolData/README.md @@ -4,24 +4,24 @@ This directory holds files used to generate the tree-of-life database data.db. ## Tree Structure - `nodes` <br> Format : `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` <br> - Represents a tree-of-life node. `tips` holds the number of no-child descendants. + Represents a tree-of-life node. `tips` holds the number of no-child descendants - `edges` <br> Format: `parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child)` <br> `p_support` is 1 if the edge has 'phylogenetic support', and 0 otherwise -## Node Names +## Node Mappings - `eol_ids` <br> - Format: `id INT PRIMARY KEY, name TEXT` <br> - Associates an EOL ID with a node's name. + Format: `name TEXT PRIMARY KEY, id INT` <br> + Associates nodes with EOL IDs +- `wiki_ids` <br> + Format: `name TEXT PRIMARY KEY, id INT` <br> + Associates nodes with wikipedia page IDs +## Node Vernacular Names - `names` <br> Format: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` <br> Associates a node with alternative names. `pref_alt` is 1 if the alt-name is the most 'preferred' one. `src` indicates the dataset the alt-name was obtained from (can be 'eol', 'enwiki', or 'picked'). ## Node Descriptions -- `wiki_ids` <br> - Format: `name TEXT PRIMARY KEY, id INT, redirected INT` <br> - Associates a node with a wikipedia page ID. - `redirected` is 1 if the node was associated with a different page that redirected to this one. - `descs` <br> Format: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` <br> Associates a wikipedia page ID with a short-description. @@ -42,61 +42,62 @@ This directory holds files used to generate the tree-of-life database data.db. These are like `nodes`, but describe nodes of reduced trees. - `edges_t`, `edges_i`, `edges_p` <br> Like `edges` but for reduced trees. 
+## Other +- `node_iucn` <br> + Format: `name TEXT PRIMARY KEY, iucn TEXT` <br> + Associates nodes with IUCN conservation status strings (eg: 'endangered') # Generating the Database -For the most part, these steps should be done in order. - -As a warning, the whole process takes a lot of time and file space. The tree will probably -have about 2.5 billion nodes. Downloading the images takes several days, and occupies over -200 GB. And if you want good data, you'll likely need to make additional corrections, -which can take several weeks. +As a warning, the whole process takes a lot of time and file space. The +tree will probably have about 2.6 million nodes. Downloading the images +takes several days, and occupies over 200 GB. ## Environment Some of the scripts use third-party packages: -- jsonpickle: For encoding class objects as JSON. -- requests: For downloading data. -- PIL: For image processing. -- tkinter: For providing a basic GUI to review images. -- mwxml, mwparserfromhell: For parsing Wikipedia dumps. +- `indexed_bzip2`: For parallelised bzip2 processing. +- `jsonpickle`: For encoding class objects as JSON. +- `requests`: For downloading data. +- `PIL`: For image processing. +- `tkinter`: For providing a basic GUI to review images. +- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps. ## Generate Tree Structure Data -1. Obtain files in otol/, as specified in it's README. +1. Obtain 'tree data files' in otol/, as specified in its README. 2. Run genOtolData.py, which creates data.db, and adds the `nodes` and `edges` tables, using data in otol/. It also uses these files, if they exist: - - pickedOtolNames.txt: Has lines of the form `name1|otolId1`. When nodes in the - tree have the same name (eg: Pholidota can refer to pangolins or orchids), - they get the names 'name1', 'name1 [2]', 'name1 [3], etc. This file is used to - forcibly specify which node should be named 'name1'. + - pickedOtolNames.txt: Has lines of the form `name1|otolId1`. 
+ Can be used to override numeric suffixes added to same-name nodes. + +## Generate Dataset Mappings +1. Obtain 'taxonomy data files' in otol/, 'mapping files' in eol/, + files in wikidata/, and 'dump-index files' in enwiki/, as specified + in their READMEs. +2. Run genMappingData.py, which adds the `eol_ids` and `wiki_ids` tables, + using the files obtained above, and the `nodes` table. It also uses + 'picked mappings' files, if they exist. + - pickedEolIds.txt contains lines like `3785967|405349`, specifying + an otol ID and an eol ID to map it to. The eol ID can be empty, + in which case the otol ID won't be mapped. + - pickedWikiIds.txt and pickedWikiIdsRough.txt contain lines like + `5341349|Human`, specifying an otol ID and an enwiki title, + which may contain spaces. The title can be empty. -## Generate Node Names Data -1. Obtain 'name data files' in eol/, as specified in it's README. -2. Run genEolNameData.py, which adds the `names` and `eol_ids` tables, using data in - eol/ and the `nodes` table. It also uses these files, if they exist: - - pickedEolIds.txt: Has lines of the form `nodeName1|eolId1` or `nodeName1|`. - Specifies node names that should have a particular EOL ID, or no ID. - Quite a few taxons have ambiguous names, and may need manual correction. - For example, Viola may resolve to a taxon of butterflies or of plants. - - pickedEolAltsToSkip.txt: Has lines of the form `nodeName1|altName1`. - Specifies that a node's alt-name set should exclude altName1. +## Generate Node Name Data +1. Obtain 'name data files' in eol/, and 'description database files' in enwiki/, + as specified in their READMEs. +2. Run genNameData.py, which adds the `names` table, using data in eol/ and enwiki/, + along with the `nodes`, `eol_ids`, and `wiki_ids` tables. <br> + It also uses pickedNames.txt, if it exists. 
This file can hold lines like + `embryophyta|land plant|1`, specifying a node name, an alt-name to add for it, + and a 1 or 0 indicating whether it is a 'preferred' alt-name. The last field + can be empty, which indicates that the alt-name should be removed, or, if the + alt-name is the same as the node name, that no alt-name should be preferred. ## Generate Node Description Data -### Get Data from DBpedia 1. Obtain files in dbpedia/, as specified in its README. -2. Run genDbpData.py, which adds the `wiki_ids` and `descs` tables, using data in - dbpedia/ and the `nodes` table. It also uses these files, if they exist: - - pickedEnwikiNamesToSkip.txt: Each line holds the name of a node for which - no description should be obtained. Many node names have a same-name - wikipedia page that describes something different (eg: Osiris). - - pickedDbpLabels.txt: Has lines of the form `nodeName1|label1`. - Specifies node names that should have a particular associated page label. -### Get Data from Wikipedia -1. Obtain 'description database files' in enwiki/, as specified in it's README. -2. Run genEnwikiDescData.py, which adds to the `wiki_ids` and `descs` tables, - using data in enwiki/ and the `nodes` table. - It also uses these files, if they exist: - - pickedEnwikiNamesToSkip.txt: Same as with genDbpData.py. - - pickedEnwikiLabels.txt: Similar to pickedDbpLabels.txt. +2. Run genDescData.py, which adds the `descs` table, using data in dbpedia/ and + enwiki/, and the `nodes` table. ## Generate Node Images Data ### Get images from EOL @@ -129,21 +130,12 @@ Some of the scripts use third-party packages: - An input image might produce output with unexpected dimensions. This seems to happen when the image is very large, and triggers a decompression bomb warning. - In testing, this resulted in about 150k images, with about 2/3 of them - being from Wikipedia. ### Add more Image Associations 1. 
Run genLinkedImgs.py, which tries to associate nodes without images to images of its children. Adds the `linked_imgs` table, and uses the `nodes`, `edges`, and `node_imgs` tables. ## Do some Post-Processing -1. Run genEnwikiNameData.py, which adds more entries to the `names` table, - using data in enwiki/, and the `names` and `wiki_ids` tables. -2. Optionally run addPickedNames.py, which allows adding manually-selected name data to - the `names` table, as specified in pickedNames.txt. - - pickedNames.txt: Has lines of the form `nodeName1|altName1|prefAlt1`. - These correspond to entries in the `names` table. `prefAlt` should be 1 or 0. - A line like `name1|name1|1` causes a node to have no preferred alt-name. -3. Run genReducedTrees.py, which generates multiple reduced versions of the tree, +1. Run genReducedTrees.py, which generates multiple reduced versions of the tree, adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from pickedNodes.txt, which lists names of nodes that must be included (1 per line). 
diff --git a/backend/tolData/addPickedNames.py b/backend/tolData/addPickedNames.py deleted file mode 100755 index 9b56422..0000000 --- a/backend/tolData/addPickedNames.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/python3 - -import sys -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads alt-name data from a file, and adds it to the database's 'names' table -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -dbFile = "data.db" -pickedNamesFile = "pickedNames.txt" - -print("Opening database") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Iterating through picked-names file") -with open(pickedNamesFile) as file: - for line in file: - # Get record data - nodeName, altName, prefAlt = line.lower().rstrip().split("|") - prefAlt = int(prefAlt) - # Check whether there exists a node with the name - row = dbCur.execute("SELECT name from nodes where name = ?", (nodeName,)).fetchone() - if row == None: - print(f"ERROR: No node with name \"{nodeName}\" exists") - break - # Remove any existing preferred-alt status - if prefAlt == 1: - query = "SELECT name, alt_name FROM names WHERE name = ? AND pref_alt = 1" - row = dbCur.execute(query, (nodeName,)).fetchone() - if row != None and row[1] != altName: - print(f"Removing pref-alt status from alt-name {row[1]} for {nodeName}") - dbCur.execute("UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ?", row) - # Check for an existing record - if nodeName == altName: - continue - query = "SELECT name, alt_name, pref_alt FROM names WHERE name = ? AND alt_name = ?" 
- row = dbCur.execute(query, (nodeName, altName)).fetchone() - if row == None: - print(f"Adding record for alt-name {altName} for {nodeName}") - dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'picked')", (nodeName, altName, prefAlt)) - else: - # Update existing record - if row[2] != prefAlt: - print(f"Updating record for alt-name {altName} for {nodeName}") - dbCur.execute("UPDATE names SET pref_alt = ?, src = 'picked' WHERE name = ? AND alt_name = ?", - (prefAlt, nodeName, altName)) - -print("Closing database") -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/dbpedia/genDescData.py b/backend/tolData/dbpedia/genDescData.py index a23199d..8756a40 100755 --- a/backend/tolData/dbpedia/genDescData.py +++ b/backend/tolData/dbpedia/genDescData.py @@ -41,6 +41,7 @@ with bz2.open(labelsFile, mode='rt') as file: print("Reading/storing wiki page ids") dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") +dbCur.execute("CREATE INDEX ids_idx ON ids(id)") idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') lineNum = 0 with bz2.open(idsFile, mode='rt') as file: diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md index dfced94..7df21c9 100644 --- a/backend/tolData/enwiki/README.md +++ b/backend/tolData/enwiki/README.md @@ -11,7 +11,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it. -# Generated Dump-Index Files +# Dump-Index Files - genDumpIndexDb.py <br> Creates an sqlite-database version of the enwiki-dump index file. 
- dumpIndex.db <br> diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py index 97e696f..b5d546d 100755 --- a/backend/tolData/enwiki/genImgData.py +++ b/backend/tolData/enwiki/genImgData.py @@ -30,7 +30,6 @@ imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) -# In testing, got about 360k image names print("Getting input page-ids") pageIds = getInputPageIds() diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md index 1a9dbdf..c07b48e 100644 --- a/backend/tolData/eol/README.md +++ b/backend/tolData/eol/README.md @@ -1,4 +1,9 @@ -This directory holds files obtained from/using the [Encyclopedia of Life](https://eol.org/). +This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/). + +# Mapping Files +- `provider_ids.csv.gz` <br> + Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/22 (says last updated 27/07/22). + Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorum. # Name Data Files - vernacularNames.csv <br> diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py index 7ca4e79..4d658e7 100755 --- a/backend/tolData/eol/downloadImgs.py +++ b/backend/tolData/eol/downloadImgs.py @@ -22,8 +22,6 @@ highest EOL ID. 
""", formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -# In testing, this script downloaded about 70k images, over a few days - imagesListDb = "imagesList.db" def getInputEolIds(): eolIds = set() @@ -95,7 +93,6 @@ def downloadImg(url, outFile): for idx in range(nextIdx, len(eolIds)): eolId = eolIds[idx] # Get image urls - imgDataList = [] ownerSet = set() # Used to get images from different owners, for variety exitLoop = False query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py index 0c45887..4dcb6d9 100755 --- a/backend/tolData/eol/genImagesListDb.py +++ b/backend/tolData/eol/genImagesListDb.py @@ -18,6 +18,7 @@ dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE images" \ " (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)") +dbCur.execute("CREATE INDEX images_pid_idx ON images(page_id)") print("Reading CSV files") csvFilenames = os.listdir(imagesListDir) for filename in csvFilenames: diff --git a/backend/tolData/genDbpData.py b/backend/tolData/genDbpData.py deleted file mode 100755 index 9d52e1d..0000000 --- a/backend/tolData/genDbpData.py +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads a database containing data from DBpedia, and tries to associate -DBpedia IRIs with nodes in the tree-of-life database, adding -short-descriptions for them. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -dbpediaDb = "dbpedia/descData.db" -namesToSkipFile = "pickedEnwikiNamesToSkip.txt" -pickedLabelsFile = "pickedDbpLabels.txt" -dbFile = "data.db" -rootNodeName = "cellular organisms" -rootLabel = "Organism" # Will be associated with root node -# Got about 400k descriptions when testing - -print("Opening databases") -dbpCon = sqlite3.connect(dbpediaDb) -dbpCur = dbpCon.cursor() -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Getting node names") -nodeNames = set() -for (name,) in dbCur.execute("SELECT name from nodes"): - nodeNames.add(name) - -print("Checking for names to skip") -oldSz = len(nodeNames) -if os.path.exists(namesToSkipFile): - with open(namesToSkipFile) as file: - for line in file: - nodeNames.remove(line.rstrip()) -print(f"Skipping {oldSz - len(nodeNames)} nodes") - -print("Reading disambiguation-page labels") -disambigLabels = set() -query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri" -for (label,) in dbpCur.execute(query): - disambigLabels.add(label) - -print("Trying to associate nodes with DBpedia labels") -nodeToLabel = {} -nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)' -nameToVariants = {} # Maps node names to lists of matching labels -iterNum = 0 -for (label,) in dbpCur.execute("SELECT label from labels"): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"At iteration {iterNum}") - # - if label in disambigLabels: - continue - name = label.lower() - if name in nodeNames: - if name not in nameToVariants: - nameToVariants[name] = [label] - elif label not in nameToVariants[name]: - nameToVariants[name].append(label) - else: - match = nameVariantRegex.fullmatch(name) - if match != None: - subName = match.group(1) - if subName in nodeNames and match.group(2) != "disambiguation": - if subName not in nameToVariants: - nameToVariants[subName] = [label] 
- elif name not in nameToVariants[subName]: - nameToVariants[subName].append(label) -# Associate labels without conflicts -for (name, variants) in nameToVariants.items(): - if len(variants) == 1: - nodeToLabel[name] = variants[0] -for name in nodeToLabel: - del nameToVariants[name] -# Special case for root node -nodeToLabel[rootNodeName] = rootLabel -if rootNodeName in nameToVariants: - del nameToVariants["cellular organisms"] - -print(f"Trying to resolve {len(nameToVariants)} conflicts") -def resolveWithPickedLabels(): - " Attempts to resolve conflicts using a picked-names file " - with open(pickedLabelsFile) as file: - for line in file: - (name, _, label) = line.rstrip().partition("|") - if name not in nameToVariants: - print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr) - continue - if label == "": - del nameToVariants[name] - else: - if label not in nameToVariants[name]: - print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr) - nodeToLabel[name] = label - del nameToVariants[name] -def resolveWithCategoryList(): - """ - Attempts to resolve conflicts by looking for labels like 'name1 (category1)', - and choosing those with a category1 that seems 'biological'. - Does two passes, using more generic categories first. This helps avoid stuff like - Pan being classified as a horse instead of an ape. 
- """ - generalCategories = { - "species", "genus", - "plant", "fungus", "animal", - "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug", - "fish", "amphibian", "reptile", "bird", "mammal", - } - specificCategories = { - "protist", "alveolate", "dinoflagellates", - "orchid", "poaceae", "fern", "moss", "alga", - "bryozoan", "hydrozoan", - "sponge", "cnidarian", "coral", "polychaete", "echinoderm", - "bivalve", "gastropod", "chiton", - "shrimp", "decapod", "crab", "barnacle", "copepod", - "arachnid", "spider", "harvestman", "mite", - "dragonfly", "mantis", "cicada", "grasshopper", "planthopper", - "beetle", "fly", "butterfly", "moth", "wasp", - "catfish", - "frog", - "lizard", - "horse", "sheep", "cattle", "mouse", - } - namesToRemove = set() - for (name, variants) in nameToVariants.items(): - found = False - for label in variants: - match = nameVariantRegex.match(label) - if match != None and match.group(2).lower() in generalCategories: - nodeToLabel[name] = label - namesToRemove.add(name) - found = True - break - if not found: - for label in variants: - match = nameVariantRegex.match(label) - if match != None and match.group(2).lower() in specificCategories: - nodeToLabel[name] = label - namesToRemove.add(name) - break - for name in namesToRemove: - del nameToVariants[name] -def resolveWithTypeData(): - " Attempts to resolve conflicts using DBpedia's type data " - taxonTypes = { # Obtained from the DBpedia ontology - "http://dbpedia.org/ontology/Species", - "http://dbpedia.org/ontology/Archaea", - "http://dbpedia.org/ontology/Bacteria", - "http://dbpedia.org/ontology/Eukaryote", - "http://dbpedia.org/ontology/Plant", - "http://dbpedia.org/ontology/ClubMoss", - "http://dbpedia.org/ontology/Conifer", - "http://dbpedia.org/ontology/CultivatedVariety", - "http://dbpedia.org/ontology/Cycad", - "http://dbpedia.org/ontology/Fern", - "http://dbpedia.org/ontology/FloweringPlant", - "http://dbpedia.org/ontology/Grape", - "http://dbpedia.org/ontology/Ginkgo", - 
"http://dbpedia.org/ontology/Gnetophytes", - "http://dbpedia.org/ontology/GreenAlga", - "http://dbpedia.org/ontology/Moss", - "http://dbpedia.org/ontology/Fungus", - "http://dbpedia.org/ontology/Animal", - "http://dbpedia.org/ontology/Fish", - "http://dbpedia.org/ontology/Crustacean", - "http://dbpedia.org/ontology/Mollusca", - "http://dbpedia.org/ontology/Insect", - "http://dbpedia.org/ontology/Arachnid", - "http://dbpedia.org/ontology/Amphibian", - "http://dbpedia.org/ontology/Reptile", - "http://dbpedia.org/ontology/Bird", - "http://dbpedia.org/ontology/Mammal", - "http://dbpedia.org/ontology/Cat", - "http://dbpedia.org/ontology/Dog", - "http://dbpedia.org/ontology/Horse", - } - iterNum = 0 - for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f"At iteration {iterNum}") - # - if type in taxonTypes: - name = label.lower() - if name in nameToVariants: - nodeToLabel[name] = label - del nameToVariants[name] - else: - match = nameVariantRegex.fullmatch(name) - if match != None: - name = match.group(1).lower() - if name in nameToVariants: - nodeToLabel[name] = label - del nameToVariants[name] -#resolveWithTypeData() -#resolveWithCategoryList() -resolveWithPickedLabels() -print(f"Remaining number of conflicts: {len(nameToVariants)}") - -print("Getting node IRIs") -nodeToIri = {} -for (name, label) in nodeToLabel.items(): - (iri,) = dbpCur.execute("SELECT iri FROM labels where label = ?", (label,)).fetchone() - nodeToIri[name] = iri - -print("Resolving redirects") -redirectingIriSet = set() -iterNum = 0 -for (name, iri) in nodeToIri.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone() - if row != None: - nodeToIri[name] = row[0] - redirectingIriSet.add(name) - -print("Adding description tables") -dbCur.execute("CREATE TABLE wiki_ids (name 
TEXT PRIMARY KEY, id INT, redirected INT)") -dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)") -dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)") -iterNum = 0 -for (name, iri) in nodeToIri.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?" - row = dbpCur.execute(query, (iri,)).fetchone() - if row != None: - desc, wikiId = row - dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0)) - dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1)) - -print("Closing databases") -dbCon.commit() -dbCon.close() -dbpCon.commit() -dbpCon.close() diff --git a/backend/tolData/genDescData.py b/backend/tolData/genDescData.py new file mode 100755 index 0000000..28971f4 --- /dev/null +++ b/backend/tolData/genDescData.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 + +import sys, os, re +import sqlite3 + +import argparse +parser = argparse.ArgumentParser(description=''' +Maps nodes to short descriptions, using data from DBpedia and +Wikipedia, and stores results in the database. 
+''', formatter_class=argparse.RawDescriptionHelpFormatter) +args = parser.parse_args() + +dbpediaDb = 'dbpedia/descData.db' +enwikiDb = 'enwiki/descData.db' +dbFile = 'data.db' + +print('Creating table') +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)') + +print('Getting node mappings') +nodeToWikiId = {} +for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'): + nodeToWikiId[name] = wikiId + +print('Reading data from DBpedia') +dbpCon = sqlite3.connect(dbpediaDb) +dbpCur = dbpCon.cursor() +print('Getting node IRIs') +nodeToIri = {} +iterNum = 0 +for name, wikiId in nodeToWikiId.items(): + iterNum += 1 + if iterNum % 1e5 == 0: + print(f'At iteration {iterNum}') + # + row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone() + if row != None: + nodeToIri[name] = row[0] +print('Resolving redirects') +iterNum = 0 +for name, iri in nodeToIri.items(): + iterNum += 1 + if iterNum % 1e5 == 0: + print(f'At iteration {iterNum}') + # + row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone() + if row != None: + nodeToIri[name] = row[0] +print('Adding descriptions') +iterNum = 0 +for name, iri in nodeToIri.items(): + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') + # + row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone() + if row != None: + dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1)) + del nodeToWikiId[name] +dbpCon.close() + +print('Reading data from Wikipedia') +enwikiCon = sqlite3.connect(enwikiDb) +enwikiCur = enwikiCon.cursor() +print('Resolving redirects') +iterNum = 0 +for name, wikiId in nodeToWikiId.items(): + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') + # + query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?' 
+ row = enwikiCur.execute(query, (wikiId,)).fetchone() + if row != None: + nodeToWikiId[name] = row[0] +print('Adding descriptions') +iterNum = 0 +for name, wikiId in nodeToWikiId.items(): + iterNum += 1 + if iterNum % 1e3 == 0: + print(f'At iteration {iterNum}') + # + row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiId,)).fetchone() + if row != None: + dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0)) + +print('Closing databases') +dbCon.commit() +dbCon.close() diff --git a/backend/tolData/genEnwikiDescData.py b/backend/tolData/genEnwikiDescData.py deleted file mode 100755 index e8a69ba..0000000 --- a/backend/tolData/genEnwikiDescData.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads a database containing data from Wikipedia, and tries to associate -wiki pages with nodes in the tree-of-life database, and add descriptions for -nodes that don't have them. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -enwikiDb = "enwiki/descData.db" -dbFile = "data.db" -namesToSkipFile = "pickedEnwikiNamesToSkip.txt" -pickedLabelsFile = "pickedEnwikiLabels.txt" -# Got about 25k descriptions when testing - -print("Opening databases") -enwikiCon = sqlite3.connect(enwikiDb) -enwikiCur = enwikiCon.cursor() -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Checking for names to skip") -namesToSkip = set() -if os.path.exists(namesToSkipFile): - with open(namesToSkipFile) as file: - for line in file: - namesToSkip.add(line.rstrip()) - print(f"Found {len(namesToSkip)}") -print("Checking for picked-titles") -nameToPickedTitle = {} -if os.path.exists(pickedLabelsFile): - with open(pickedLabelsFile) as file: - for line in file: - (name, _, title) = line.rstrip().partition("|") - nameToPickedTitle[name.lower()] = title -print(f"Found {len(nameToPickedTitle)}") - -print("Getting names of nodes without descriptions") -nodeNames = set() -query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL" -for (name,) in dbCur.execute(query): - nodeNames.add(name) -print(f"Found {len(nodeNames)}") -nodeNames.difference_update(namesToSkip) - -print("Associating nodes with page IDs") -nodeToPageId = {} -iterNum = 0 -for name in nodeNames: - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - if name not in nameToPickedTitle: - row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? 
COLLATE NOCASE", (name,)).fetchone() - if row != None: - nodeToPageId[name] = row[0] - else: - title = nameToPickedTitle[name] - row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone() - if row != None: - nodeToPageId[name] = row[0] - else: - print("WARNING: Picked title {title} not found", file=sys.stderr) - -print("Resolving redirects") -redirectingNames = set() -iterNum = 0 -for (name, pageId) in nodeToPageId.items(): - iterNum += 1 - if iterNum % 1e3 == 0: - print(f"At iteration {iterNum}") - # - query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?" - row = enwikiCur.execute(query, (pageId,)).fetchone() - if row != None: - nodeToPageId[name] = row[0] - redirectingNames.add(name) - -print("Adding description data") -iterNum = 0 -for (name, pageId) in nodeToPageId.items(): - iterNum += 1 - if iterNum % 1e3 == 0: - print(f"At iteration {iterNum}") - # - row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() - if row != None: - dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0)) - dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0)) - -print("Closing databases") -dbCon.commit() -dbCon.close() -enwikiCon.close() diff --git a/backend/tolData/genEnwikiNameData.py b/backend/tolData/genEnwikiNameData.py deleted file mode 100755 index ec76cca..0000000 --- a/backend/tolData/genEnwikiNameData.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/python3 - -import sys, re -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads from a database containing data from Wikipdia, along with -node and wiki-id information from the database, and use wikipedia -page-redirect information to add additional alt-name data. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -enwikiDb = "enwiki/descData.db" -dbFile = "data.db" -altNameRegex = re.compile(r"[a-zA-Z]+") - # Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)', - -print("Opening databases") -enwikiCon = sqlite3.connect(enwikiDb) -enwikiCur = enwikiCon.cursor() -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print("Getting nodes with wiki IDs") -nodeToWikiId = {} -for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"): - nodeToWikiId[nodeName] = wikiId -print(f"Found {len(nodeToWikiId)}") - -print("Iterating through nodes, finding names that redirect to them") -nodeToAltNames = {} -numAltNames = 0 -iterNum = 0 -for (nodeName, wikiId) in nodeToWikiId.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - nodeToAltNames[nodeName] = set() - query = "SELECT p1.title FROM pages p1" \ - " INNER JOIN redirects r1 ON p1.id = r1.id" \ - " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?" 
- for (name,) in enwikiCur.execute(query, (wikiId,)): - if altNameRegex.fullmatch(name) != None and name.lower() != nodeName: - nodeToAltNames[nodeName].add(name.lower()) - numAltNames += 1 -print(f"Found {numAltNames} alt-names") - -print("Excluding existing alt-names from the set") -query = "SELECT alt_name FROM names WHERE alt_name IN ({})" -iterNum = 0 -for (nodeName, altNames) in nodeToAltNames.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}") - # - existingNames = set() - for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)): - existingNames.add(name) - numAltNames -= len(existingNames) - altNames.difference_update(existingNames) -print(f"Left with {numAltNames} alt-names") - -print("Adding alt-names to database") -for (nodeName, altNames) in nodeToAltNames.items(): - for altName in altNames: - dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0)) - -print("Closing databases") -dbCon.commit() -dbCon.close() -enwikiCon.close() diff --git a/backend/tolData/genEolNameData.py b/backend/tolData/genEolNameData.py deleted file mode 100755 index 2c5414b..0000000 --- a/backend/tolData/genEolNameData.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os -import html, csv, sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads files describing name data from the 'Encyclopedia of Life' site, -tries to associate names with nodes in the tree-of-life database, -and adds tables to represent associated names. - -Reads a vernacularNames.csv file: - Starts with a header line containing: - page_id, canonical_form, vernacular_string, language_code, - resource_name, is_preferred_by_resource, is_preferred_by_eol - The canonical_form and vernacular_string fields contain names - associated with the page ID. Names are not always unique to - particular page IDs. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries -dbFile = "data.db" -namesToSkip = {"unknown", "unknown species", "unidentified species"} -pickedIdsFile = "pickedEolIds.txt" -altsToSkipFile = "pickedEolAltsToSkip.txt" - -print("Reading in vernacular-names data") -nameToPids = {} # 'pid' means 'Page ID' -canonicalNameToPids = {} -pidToNames = {} -pidToPreferred = {} # Maps pids to 'preferred' names -def updateMaps(name, pid, canonical, preferredAlt): - global namesToSkip, nameToPids, canonicalNameToPids, pidToNames, pidToPreferred - if name in namesToSkip: - return - if name not in nameToPids: - nameToPids[name] = {pid} - else: - nameToPids[name].add(pid) - if canonical: - if name not in canonicalNameToPids: - canonicalNameToPids[name] = {pid} - else: - canonicalNameToPids[name].add(pid) - if pid not in pidToNames: - pidToNames[pid] = {name} - else: - pidToNames[pid].add(name) - if preferredAlt: - pidToPreferred[pid] = name -with open(vnamesFile, newline="") as csvfile: - reader = csv.reader(csvfile) - lineNum = 0 - for row in reader: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"At line {lineNum}") - # Skip header line - if lineNum == 1: - continue - # Parse line - pid = int(row[0]) - name1 = re.sub(r"<[^>]+>", "", row[1].lower()) # Remove tags - name2 = html.unescape(row[2]).lower() - lang = row[3] - preferred = row[6] == "preferred" - # Add to maps - updateMaps(name1, pid, True, False) - if lang == "eng" and name2 != "": - updateMaps(name2, pid, False, preferred) - -print("Checking for manually-picked pids") -nameToPickedPid = {} -if os.path.exists(pickedIdsFile): - with open(pickedIdsFile) as file: - for line in file: - (name, _, eolId) = line.rstrip().partition("|") - nameToPickedPid[name] = None if eolId == "" else int(eolId) -print(f"Found {len(nameToPickedPid)}") - -print("Checking for alt-names to skip") -nameToAltsToSkip = {} -numToSkip = 0 -if 
os.path.exists(altsToSkipFile): - with open(altsToSkipFile) as file: - for line in file: - (name, _, altName) = line.rstrip().partition("|") - if name not in nameToAltsToSkip: - nameToAltsToSkip[name] = [altName] - else: - nameToAltsToSkip[name].append(altName) - numToSkip += 1 -print(f"Found {numToSkip} alt-names to skip") - -print("Creating database tables") -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))") -dbCur.execute("CREATE INDEX names_idx ON names(name)") -dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)") -dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)") -dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)") - -print("Associating nodes with names") -usedPids = set() -unresolvedNodeNames = set() -dbCur2 = dbCon.cursor() -def addToDb(nodeName, pidToUse): - " Adds page-ID-associated name data to a node in the database " - global dbCur, pidToPreferred - dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName)) - # Get alt-names - altNames = set() - for n in pidToNames[pidToUse]: - # Avoid alt-names with >3 words - if len(n.split(" ")) > 3: - continue - # Avoid alt-names that already name a node in the database - if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None: - continue - # Check for picked alt-name-to-skip - if nodeName in nameToAltsToSkip and n in nameToAltsToSkip[nodeName]: - print(f"Excluding alt-name {n} for node {nodeName}") - continue - # - altNames.add(n) - # Add alt-names to db - preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None - for n in altNames: - isPreferred = 1 if (n == preferredName) else 0 - dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred)) -print("Adding picked IDs") -for (name, pid) 
#!/usr/bin/python3

import sys, re, os
from collections import defaultdict
import gzip, bz2, csv, sqlite3

import argparse
parser = argparse.ArgumentParser(description='''
Maps otol IDs to EOL and enwiki titles, using IDs from various
other sources (like NCBI).

Reads otol taxonomy data to get source IDs for otol IDs,
then looks up those IDs in an EOL provider_ids file,
and in a wikidata dump, and stores results in the database.

Based on code from https://github.com/OneZoom/OZtree, located in
OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
''', formatter_class=argparse.RawDescriptionHelpFormatter)

taxonomyFile = 'otol/taxonomy.tsv'
eolIdsFile = 'eol/provider_ids.csv.gz'
wikidataDb = 'wikidata/taxonSrcs.db'
enwikiDumpIndexDb = 'enwiki/dumpIndex.db'
pickedMappings = {
    'eol': ['pickedEolIds.txt'],
    'enwiki': ['pickedWikiIds.txt', 'pickedWikiIdsRough.txt']
}
dbFile = 'data.db'

OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps ints to external-source names

def main(argv=None):
    """ Builds the eol_ids, wiki_ids, and node_iucn tables of the database.

    'argv' is the list of command-line arguments to parse (defaults to sys.argv[1:]).
    All input/output paths are the module-level constants, relative to the working directory.
    """
    parser.parse_args(argv)

    print('Reading taxonomy file')
    # The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
        # uid (otol-id, eg: 93302), parent_uid, name, rank,
        # sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
    nodeToSrcIds = defaultdict(dict) # Maps otol ID to {src1: id1, src2: id2, ...}
    usedSrcIds = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
    with open(taxonomyFile) as file: # Had about 4.5e6 lines
        for lineNum, line in enumerate(file, 1):
            if lineNum % 1e5 == 0:
                print(f'At line {lineNum}')
            # Skip header line
            if lineNum == 1:
                continue
            # Parse line
            fields = line.split('\t|\t')
            try:
                otolId = int(fields[0])
            except ValueError:
                print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
                continue
            srcInfo = fields[4]
            # Add source IDs
            for srcPair in srcInfo.split(','):
                src, sep, srcId = srcPair.partition(':')
                if not sep: # Empty sourceinfo field, or a specifier without a colon
                    continue
                if srcId.isdecimal() and src in OTOL_SRCS and src not in nodeToSrcIds[otolId]:
                    srcId = int(srcId)
                    nodeToSrcIds[otolId][src] = srcId
                    usedSrcIds.add((src, srcId))
    print(f'- Result has {sum(len(v) for v in nodeToSrcIds.values()):,} entries') # Was about 6.7e6

    print('Reading EOL provider_ids file')
    # The CSV file has a header line, then lines that hold these fields:
        # node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
        # page_id (eol ID), preferred_canonical_for_page
    srcToEolId = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
    with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
        for lineNum, row in enumerate(csv.reader(file), 1):
            if lineNum % 1e6 == 0:
                print(f'At line {lineNum}')
            # Skip header line
            if lineNum == 1:
                continue
            # Parse line
            eolId = int(row[3])
            srcVal = int(row[2])
            srcId = row[1]
            if srcId.isdecimal() and srcVal in EOL_SRCS:
                srcId = int(srcId)
                src = EOL_SRCS[srcVal]
                if (src, srcId) not in usedSrcIds:
                    continue
                if srcId in srcToEolId[src]:
                    print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
                    continue
                srcToEolId[src][srcId] = eolId
    print(f'- Result has {sum(len(v) for v in srcToEolId.values()):,} entries')
        # Was about 3.5e6 (4.2e6 without usedSrcIds)

    print('Resolving candidate EOL IDs')
    # For each otol ID, find eol IDs with matching sources, and choose the 'best' one
    nodeToEolId = {} # Maps otol ID to eol ID
    for otolId, srcInfo in nodeToSrcIds.items():
        eolIdToCount = defaultdict(int)
        for src, srcId in srcInfo.items():
            if src in srcToEolId and srcId in srcToEolId[src]:
                eolId = srcToEolId[src][srcId]
                eolIdToCount[eolId] += 1
        if len(eolIdToCount) == 1:
            nodeToEolId[otolId] = list(eolIdToCount)[0]
        elif len(eolIdToCount) > 1:
            # For multiple candidates, prefer those with most sources, and break ties by picking the lowest
            maxCount = max(eolIdToCount.values())
            eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
            nodeToEolId[otolId] = min(eolIds)
    print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6

    print('Reading from Wikidata db')
    srcToWikiTitle = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
    wikiTitles = set()
    titleToIucnStatus = {}
    dbCon = sqlite3.connect(wikidataDb)
    dbCur = dbCon.cursor()
    for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
        if (src, srcId) not in usedSrcIds and src != 'eol': # Keep EOL IDs for later use
            continue
        srcToWikiTitle[src][srcId] = title
        wikiTitles.add(title)
    for title, status in dbCur.execute('SELECT title, status from title_iucn'):
        if title in wikiTitles:
            titleToIucnStatus[title] = status
    print(f'- Source-to-title map has {sum(len(v) for v in srcToWikiTitle.values()):,} entries')
        # Was about 1.1e6 (1.2e6 without usedSrcIds)
    print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
    dbCon.close()

    print('Resolving candidate Wikidata items')
    # For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
    nodeToWikiTitle = {}
    for otolId, srcInfo in nodeToSrcIds.items():
        titleToSrcs = defaultdict(list) # Maps candidate titles to [src1, ...]
        for src, srcId in srcInfo.items():
            if src in srcToWikiTitle and srcId in srcToWikiTitle[src]:
                title = srcToWikiTitle[src][srcId]
                titleToSrcs[title].append(src)
        # Choose title to use
        if len(titleToSrcs) == 1:
            nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
        elif len(titleToSrcs) > 1: # Test example: otol ID 621052
            # Get titles with most sources
            maxSrcCnt = max(len(srcs) for srcs in titleToSrcs.values())
            titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
            if len(titleToSrcs) == 1:
                nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
            else: # Test example: otol ID 4235272
                # Get a title with a source with highest priority
                srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]}
                for src in OTOL_SRCS:
                    if src in srcToTitle:
                        nodeToWikiTitle[otolId] = srcToTitle[src]
                        break
    print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5

    print('Adding extra EOL mappings from Wikidata')
    wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
    addedEntries = {}
    for eolId, title in srcToWikiTitle['eol'].items():
        if title in wikiTitleToNode:
            otolId = wikiTitleToNode[title]
            if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
                nodeToEolId[otolId] = eolId
                addedEntries[otolId] = eolId
    print(f'- Added {len(addedEntries):,} entries') # Was about 3e3

    print('Reading picked mappings')
    # Each line has the form 'otolId|value'. An empty value removes any existing mapping.
    for src in pickedMappings:
        for filename in pickedMappings[src]:
            if not os.path.exists(filename):
                continue
            with open(filename) as file:
                for line in file:
                    if not line.strip(): # Tolerate blank lines
                        continue
                    otolId, mappedVal = line.rstrip().split('|')
                    otolId = int(otolId)
                    if src == 'eol':
                        if mappedVal:
                            nodeToEolId[otolId] = int(mappedVal)
                        else:
                            nodeToEolId.pop(otolId, None)
                    else: # src == 'enwiki'
                        if mappedVal:
                            nodeToWikiTitle[otolId] = mappedVal
                        else:
                            nodeToWikiTitle.pop(otolId, None)

    print('Getting enwiki page IDs')
    titleToPageId = {}
    numNotFound = 0
    dbCon = sqlite3.connect(enwikiDumpIndexDb)
    dbCur = dbCon.cursor()
    for title in nodeToWikiTitle.values():
        row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
        if row is not None:
            titleToPageId[title] = row[0]
        else:
            numNotFound += 1
    dbCon.close()
    print(f'Unable to find IDs for {numNotFound} titles') # Was 2913

    print('Writing to db')
    dbCon = sqlite3.connect(dbFile)
    dbCur = dbCon.cursor()
    # Get otol id-to-name map
    otolIdToName = {}
    for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
        if nodeId.startswith('ott'):
            otolIdToName[int(nodeId[3:])] = nodeName
    # Add eol mappings
    dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
    dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
    for otolId, eolId in nodeToEolId.items():
        if otolId in otolIdToName:
            dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
    # Add enwiki mappings
    dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
    dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
    dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
    for otolId, title in nodeToWikiTitle.items():
        if otolId in otolIdToName and title in titleToPageId:
            dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
            if title in titleToIucnStatus:
                dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
    dbCon.commit()
    dbCon.close()

if __name__ == '__main__': # Guard keeps the module importable without side effects
    main()
#!/usr/bin/python3

import sys, re, os
import html, csv, sqlite3

import argparse
parser = argparse.ArgumentParser(description='''
Maps nodes to vernacular names, using data from EOL, enwiki, and a
picked-names file, and stores results in the database.
''', formatter_class=argparse.RawDescriptionHelpFormatter)

eolNamesFile = 'eol/vernacularNames.csv'
enwikiDb = 'enwiki/descData.db'
pickedNamesFile = 'pickedNames.txt'
dbFile = 'data.db'

def main(argv=None):
    """ Creates and fills the 'names' table of the database.

    'argv' is the list of command-line arguments to parse (defaults to sys.argv[1:]).
    Reads the nodes, eol_ids, and wiki_ids tables of the database, the EOL vernacular-names CSV,
    the enwiki description database, and (if present) the picked-names file.
    """
    parser.parse_args(argv)

    dbCon = sqlite3.connect(dbFile)
    dbCur = dbCon.cursor()

    print('Getting node mappings')
    nodeToTips = {}
    eolIdToNode = {} # Maps eol ID to node name (if there are multiple, choose one with most tips)
    wikiIdToNode = {}
    for name, tips in dbCur.execute('SELECT name, tips from nodes'):
        nodeToTips[name] = tips
    for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
        if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
            eolIdToNode[eolId] = name
    for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
        if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
            wikiIdToNode[wikiId] = name

    print('Creating table')
    dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
    dbCur.execute('CREATE INDEX names_idx ON names(name)')
    dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
    dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')

    print('Getting names from EOL')
    # The CSV file has a header line, then lines with these fields:
        # page_id, canonical_form (canonical name, not always unique to page ID),
        # vernacular_string (vernacular name), language_code,
        # resource_name, is_preferred_by_resource, is_preferred_by_eol
    namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
    with open(eolNamesFile, newline='') as file:
        for lineNum, fields in enumerate(csv.reader(file), 1):
            if lineNum % 1e5 == 0:
                print(f'At line {lineNum}') # Reached about 2.8e6
            # Skip header line
            if lineNum == 1:
                continue
            # Parse line
            eolId = int(fields[0])
            name = html.unescape(fields[2]).lower()
            lang = fields[3]
            isPreferred = 1 if fields[6] == 'preferred' else 0
            # Add to db
            if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
                    and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
                cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')'
                    # The 'OR IGNORE' accounts for duplicate lines
                dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))

    print('Getting names from Wikipedia')
    altNameRegex = re.compile(r'[a-z]+')
        # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
    enwikiCon = sqlite3.connect(enwikiDb)
    enwikiCur = enwikiCon.cursor()
    # Finds titles of pages that redirect to the page with a given ID
    query = 'SELECT p1.title FROM pages p1' \
        ' INNER JOIN redirects r1 ON p1.id = r1.id' \
        ' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
    for iterNum, (wikiId, nodeName) in enumerate(wikiIdToNode.items(), 1):
        if iterNum % 1e4 == 0:
            print(f'At iteration {iterNum}') # Reached about 3.6e5
        #
        for (name,) in enwikiCur.execute(query, (wikiId,)):
            name = name.lower()
            if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips:
                dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
    enwikiCon.close() # Was previously left open

    print('Getting picked names')
    # File format:
        # nodename1|altName1|isPreferred1 -> Add an alt-name
        # nodename1|altName1| -> Remove an alt-name
        # nodename1|nodeName1| -> Remove any preferred-alt status
    clearPrefCmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND pref_alt = 1'
        # Clears the flag on every alt-name of the node. Restricting this to one alt_name (as was done
        # before) only touched the row about to be replaced, so a node could end up with two preferred names.
    if os.path.exists(pickedNamesFile):
        with open(pickedNamesFile) as file:
            for line in file:
                if not line.strip(): # Tolerate blank lines
                    continue
                nodeName, altName, isPreferred = line.lower().rstrip().split('|')
                if nodeName not in nodeToTips:
                    print(f"Skipping \"{nodeName}\", as no such node exists")
                    continue
                if isPreferred:
                    isPreferred = 1 if isPreferred == '1' else 0
                    if isPreferred == 1:
                        # Remove any existing preferred-alt status
                        dbCur.execute(clearPrefCmd, (nodeName,))
                    # Remove any existing record
                    dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
                    # Add record
                    dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'picked')", (nodeName, altName, isPreferred))
                elif nodeName != altName: # Remove any matching record
                    dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
                else: # Remove any preferred-alt status
                    dbCur.execute(clearPrefCmd, (nodeName,))

    print('Closing database')
    dbCon.commit()
    dbCon.close()

if __name__ == '__main__': # Guard keeps the module importable without side effects
    main()
diff --git a/backend/tolData/reviewImgsToGen.py b/backend/tolData/reviewImgsToGen.py index 88822c5..dcf18bc 100755 --- a/backend/tolData/reviewImgsToGen.py +++ b/backend/tolData/reviewImgsToGen.py @@ -66,6 +66,7 @@ if os.path.exists(enwikiImgDir): print(f"WARNING: No node found for {enwikiImgDir}{filename}") print(f"Result: {len(nodeToImgs)} nodes with images") print("Filtering out already-made image choices") + oldSz = len(nodeToImgs) if os.path.exists(outFile): with open(outFile) as file: diff --git a/backend/tolData/wikidata/README.md b/backend/tolData/wikidata/README.md new file mode 100644 index 0000000..db45b3c --- /dev/null +++ b/backend/tolData/wikidata/README.md @@ -0,0 +1,18 @@ +This directory holds files obtained via [Wikidata](https://www.wikidata.org/). + +# Downloaded Files +- `latest-all.json.bz2` <br> + Obtained from <https://dumps.wikimedia.org/wikidatawiki/entities/> (on 23/08/22). + Format info can be found at <https://doc.wikimedia.org/Wikibase/master/php/md_docs_topics_json.html>. + +# Other Files +- genTaxonSrcData.py <br> + Used to generate a database holding taxon information from the dump. +- offsets.dat <br> + Holds bzip2 block offsets for the dump. Generated and used by + genTaxonSrcData.py for parallel processing of the dump. +- taxonSrcs.db <br> + Generated by genTaxonSrcData.py. 
<br> + Tables: <br> + - `src_id_to_title`: `src TEXT, id INT, title TEXT, PRIMARY KEY(src, id)` + - `title_iucn`: `title TEXT PRIMARY KEY, status TEXT` diff --git a/backend/tolData/wikidata/genTaxonSrcData.py b/backend/tolData/wikidata/genTaxonSrcData.py new file mode 100755 index 0000000..bd86172 --- /dev/null +++ b/backend/tolData/wikidata/genTaxonSrcData.py @@ -0,0 +1,230 @@ +#!/usr/bin/python3 + +import sys, os, re, math, io +from collections import defaultdict +import bz2, json, sqlite3 +import multiprocessing, indexed_bzip2, pickle, tempfile + +import argparse +parser = argparse.ArgumentParser(description=''' +Reads a wikidata JSON dump, looking for enwiki taxon items, and associated +IDs from sources like GBIF/etc, and IUCN conservation status. Writes results +into a database. + +The JSON dump contains an array of objects, each of which describes a +Wikidata item item1, and takes up it's own line. +- Getting item1's Wikidata ID: item1['id'] (eg: "Q144") +- Checking if item1 is a taxon: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['numeric-id'] == id1 + 'idx1' indexes an array of statements + 'id1' is a Wikidata ID denoting a taxon item type (eg: 310890 means 'monotypic taxon') +- Checking if item1 is a taxon-alt: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['numeric-id'] == id1 + 'id1' denotes a common-name-alternative item type (eg: 55983715 means 'organisms known by a particular common name') + Getting the ID of the item that item1 is an alternative for: + item1['claims']['P31'][idx1]['qualifiers']['P642'][idx2]['datavalue']['value']['numeric-id'] +- Checking for an EOL/NCBI/etc ID: item['claims'][prop1][idx1]['mainsnak']['datavalue']['value'] (eg: "328672") + 'prop1' denotes a 'has ID from source *' property (eg: 'P830' means 'has EOL ID') +- Checking for an IUCN status: item['claims']['P141'][idx1]['mainsnak']['datavalue']['value']['id'] (eg: "Q219127") + +Based on code from https://github.com/OneZoom/OZtree, located 
in +OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022). +''', formatter_class=argparse.RawDescriptionHelpFormatter) +args = parser.parse_args() + +# On Linux, running on the full dataset caused the processes to hang after processing. This was resolved by: +# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. +# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. +# Related: https://bugs.python.org/issue6721 +# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). +# Possibly related: https://github.com/python/cpython/issues/72882 + +WD_FILE = 'latest-all.json.bz2' +OFFSETS_FILE = 'offsets.dat' +DB_FILE = 'taxonSrcs.db' +N_PROCS = 6 # Took about 3 hours (probably would've taken 6-12 with N_PROCS=1) + +# Wikidata entity IDs +TAXON_IDS = ['Q16521', 'Q310890', 'Q23038290', 'Q713623'] # 'taxon', 'monotypic taxon', 'fossil taxon', 'clade' +TAXON_ALT_IDS = ['Q55983715', 'Q502895'] # 'organisms known by a particular common name', 'common name' +SRC_PROP_IDS = {'P830': 'eol', 'P685': 'ncbi', 'P1391': 'if', 'P850': 'worms', 'P5055': 'irmng', 'P846': 'gbif'} +IUCN_STATUS_IDS = { + 'Q211005': 'least concern', 'Q719675': 'near threatened', 'Q278113': 'vulnerable', + 'Q11394': 'endangered', 'Q219127': 'critically endangered', 'Q239509': 'extinct in the wild', + 'Q237350': 'extinct species', 'Q3245245': 'data deficient' +} +# For filtering lines before parsing JSON +LINE_REGEX = re.compile(('"numeric-id":(?:' + '|'.join([s[1:] for s in TAXON_IDS + TAXON_ALT_IDS]) + ')\D').encode()) + +def main(): + # Maps to populate + srcIdToId = defaultdict(dict) # Maps 'eol'/etc to {srcId1: wikidataId1, ...} (IDs are ints) + idToTitle = {} # Maps wikidata ID to enwiki title + idToAltId = {} # Maps taxon-item wikidata ID to taxon-alt wikidata ID (eg: 'canis lupus familiaris' and 'dog') + idToIucnStatus = {} 
# Maps wikidata ID to iucn-status string ('least concern', etc) + # Check db + if os.path.exists(DB_FILE): + print('ERROR: Database already exists') + sys.exit(1) + # Read dump + if N_PROCS == 1: + with bz2.open(WD_FILE, mode='rb') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e4 == 0: + print(f'At line {lineNum}') + readDumpLine(line, srcIdToId, idToTitle, idToAltId, idToIucnStatus) + else: + + if not os.path.exists(OFFSETS_FILE): + print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) + with indexed_bzip2.open(WD_FILE) as file: + with open(OFFSETS_FILE, 'wb') as file2: + pickle.dump(file.block_offsets(), file2) + print('Allocating file into chunks') + fileSz = None # About 1.4 TB + with indexed_bzip2.open(WD_FILE) as file: + with open(OFFSETS_FILE, 'rb') as file2: + file.set_block_offsets(pickle.load(file2)) + fileSz = file.seek(0, io.SEEK_END) + chunkSz = math.floor(fileSz / N_PROCS) + chunkIdxs = [None] + [chunkSz * i for i in range(1, N_PROCS)] + [fileSz-1] + # Each adjacent pair specifies a start+end byte for readDumpChunk() + print(f'- Chunk size: {chunkSz:,}') + print('Starting processes to read dump') + with tempfile.TemporaryDirectory() as tempDirName: + # Using maxtasksperchild=1 to free resources on task completion + with multiprocessing.Pool(processes=N_PROCS, maxtasksperchild=1) as pool: + for outFilename in pool.map(readDumpChunkOneParam, + [(i, chunkIdxs[i], chunkIdxs[i+1], f'{tempDirName}/{i}.pickle') for i in range(N_PROCS)]): + # Get map data from subprocess output file + with open(outFilename, 'rb') as file: + maps = pickle.load(file) + # Add to maps + for src, idMap in maps[0].items(): + srcIdToId[src].update(idMap) + idToTitle.update(maps[1]) + idToAltId.update(maps[2]) + idToIucnStatus.update(maps[3]) + # + print('Writing to db') + dbCon = sqlite3.connect(DB_FILE) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE src_id_to_title (src TEXT, id INT, title TEXT, PRIMARY 
def readDumpLine(line, srcIdToId, idToTitle, idToAltId, idToIucnStatus):
    """Process one line of the wikidata dump, recording taxon data in the given maps.

    line: A raw (bytes) dump line. It is decoded as UTF-8, and trailing
        whitespace and a trailing comma are stripped before JSON parsing.
    srcIdToId: Maps a source name to a dict from that source's ID to a wikidata ID.
    idToTitle: Maps a wikidata ID to an enwiki title.
    idToAltId: Maps a taxon's wikidata ID to the ID of a taxon-alt item that refers to it.
    idToIucnStatus: Maps a wikidata ID to an IUCN-status string.

    Returns None. Lines that LINE_REGEX does not match, that fail to parse as JSON,
    or that are neither taxon nor taxon-alt items leave the maps unchanged.
    """
    # Check if taxon item
    if LINE_REGEX.search(line) is None:
        return
    try:
        jsonItem = json.loads(line.decode('utf-8').rstrip().rstrip(','))
    except json.JSONDecodeError:
        # The line number is not known in this function (referencing it here used to
        # raise a NameError), so report the start of the offending line instead
        print(f'Unable to parse line as JSON: {line[:80]!r}')
        return
    isTaxon = False
    altTaxa = []  # For a taxon-alt item, holds associated taxon-item IDs
    claims = None
    try:
        claims = jsonItem['claims']
        for statement in claims['P31']:  # Check for 'instance of' statements
            typeId = statement['mainsnak']['datavalue']['value']['id']
            if typeId in TAXON_IDS:
                isTaxon = True
                break
            elif typeId in TAXON_ALT_IDS:
                snaks = statement['qualifiers']['P642']  # Check for 'of' qualifiers
                altTaxa.extend([int(s['datavalue']['value']['numeric-id']) for s in snaks])
                break
    except (KeyError, ValueError):
        # Any missing field while inspecting the statements causes the item to be skipped
        return
    if not isTaxon and not altTaxa:
        return
    # Get wikidata ID and enwiki title
    itemId = None
    try:
        itemId = int(jsonItem['id'][1:])  # Skips initial 'Q'
        itemTitle = jsonItem['sitelinks']['enwiki']['title']
    except KeyError:
        # Allow taxon-items without titles (they might get one via a taxon-alt)
        if itemId is None or not isTaxon:
            return
        itemTitle = None
    # Update maps
    if itemTitle is not None:
        idToTitle[itemId] = itemTitle
    for altId in altTaxa:
        idToAltId[altId] = itemId
    # Check for source IDs
    for srcPropId, src in SRC_PROP_IDS.items():
        if srcPropId in claims:
            try:
                # Only the first statement for the property is used
                srcId = int(claims[srcPropId][0]['mainsnak']['datavalue']['value'])
                srcIdToId[src][srcId] = itemId
            except (KeyError, ValueError):
                continue
    # Check for IUCN status
    if 'P141' in claims:  # Check for 'iucn conservation status' statement
        try:
            iucnStatusId = claims['P141'][0]['mainsnak']['datavalue']['value']['id']
            idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId]
        except KeyError:
            # Missing value, or a status ID that IUCN_STATUS_IDS doesn't list
            pass


def readDumpChunkOneParam(params):
    """Forward a (procId, startByte, endByte, outFilename) tuple to readDumpChunk().

    Exists because pool.map() passes a single argument to the function it calls.
    """
    return readDumpChunk(*params)


def readDumpChunk(procId, startByte, endByte, outFilename):
    """Read the dump lines of one chunk, and pickle the resulting maps into a file.

    Reads lines in the dump that begin after a start-byte, and not after an end-byte.
    If startByte is None, reading starts at the first line.

    procId: Identifier used only in progress messages.
    startByte, endByte: Positions in the decompressed dump that bound the chunk.
    outFilename: Path of the file that the list of maps gets pickled into.

    Returns outFilename.
    """
    # Maps to populate (same order as the map parameters of readDumpLine())
    maps = [defaultdict(dict), {}, {}, {}]
    # Read dump
    with indexed_bzip2.open(WD_FILE) as file:
        # Load offsets file
        with open(OFFSETS_FILE, 'rb') as file2:
            offsets = pickle.load(file2)
            file.set_block_offsets(offsets)
        # Seek to chunk
        if startByte is not None:
            file.seek(startByte)
            file.readline()  # Skip to the start of the next line
        else:
            startByte = 0  # Used for progress calculation
        # Read lines
        count = 0
        while file.tell() <= endByte:
            count += 1
            if count % 1e4 == 0:
                perc = (file.tell() - startByte) / (endByte - startByte) * 100
                print(f'Thread {procId}: {perc:.2f}%')
            readDumpLine(file.readline(), *maps)
    # Output results into file
    with open(outFilename, 'wb') as file:
        pickle.dump(maps, file)
    return outFilename


if __name__ == '__main__':  # Guard needed for multiprocessing
    multiprocessing.set_start_method('spawn')
    main()
