aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore16
-rwxr-xr-xbackend/tilo.py9
-rw-r--r--backend/tolData/README.md110
-rwxr-xr-xbackend/tolData/addPickedNames.py54
-rwxr-xr-xbackend/tolData/dbpedia/genDescData.py1
-rw-r--r--backend/tolData/enwiki/README.md2
-rwxr-xr-xbackend/tolData/enwiki/genImgData.py1
-rw-r--r--backend/tolData/eol/README.md7
-rwxr-xr-xbackend/tolData/eol/downloadImgs.py3
-rwxr-xr-xbackend/tolData/eol/genImagesListDb.py1
-rwxr-xr-xbackend/tolData/genDbpData.py245
-rwxr-xr-xbackend/tolData/genDescData.py91
-rwxr-xr-xbackend/tolData/genEnwikiDescData.py99
-rwxr-xr-xbackend/tolData/genEnwikiNameData.py73
-rwxr-xr-xbackend/tolData/genEolNameData.py181
-rwxr-xr-xbackend/tolData/genMappingData.py229
-rwxr-xr-xbackend/tolData/genNameData.py113
-rw-r--r--backend/tolData/otol/README.md19
-rwxr-xr-xbackend/tolData/reviewImgsToGen.py1
-rw-r--r--backend/tolData/wikidata/README.md18
-rwxr-xr-xbackend/tolData/wikidata/genTaxonSrcData.py230
21 files changed, 770 insertions, 733 deletions
diff --git a/.gitignore b/.gitignore
index 7d5fe2d..2d08c56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,31 +2,35 @@
/node_modules/
/dist/
/backend/__pycache__
+.venv
# Database files
/backend/tolData/data.db
/backend/tolData/otol/*.tgz
+/backend/tolData/otol/taxonomy.tsv
/backend/tolData/otol/*.json
/backend/tolData/otol/*.tre
/backend/tolData/eol/*.tgz
/backend/tolData/eol/*.csv
/backend/tolData/eol/imagesList/
+/backend/tolData/eol/*.gz
/backend/tolData/eol/*.db
/backend/tolData/eol/imgsForReview/
/backend/tolData/eol/imgs/
-/backend/tolData/dbpedia/*.bz2
-/backend/tolData/dbpedia/*.db
/backend/tolData/enwiki/*.bz2
+/backend/tolData/enwiki/*.gz
/backend/tolData/enwiki/*.db
/backend/tolData/enwiki/imgs/
+/backend/tolData/dbpedia/*.bz2
+/backend/tolData/dbpedia/*.db
+/backend/tolData/wikidata/*.bz2
+/backend/tolData/wikidata/*.dat
+/backend/tolData/wikidata/*.db
/backend/tolData/imgList.txt
/backend/tolData/pickedImgs/
/backend/tolData/img/
/backend/tolData/pickedOtolNames.txt
/backend/tolData/pickedEolIds.txt
-/backend/tolData/pickedEolAltsToSkip.txt
-/backend/tolData/pickedEnwikiNamesToSkip.txt
-/backend/tolData/pickedDbpLabels.txt
-/backend/tolData/pickedEnwikiLabels.txt
+/backend/tolData/pickedWikiIds*.txt
/backend/tolData/pickedNodes.txt
/backend/tolData/pickedNames.txt
diff --git a/backend/tilo.py b/backend/tilo.py
index d86e94c..bb7af16 100755
--- a/backend/tilo.py
+++ b/backend/tilo.py
@@ -56,10 +56,9 @@ class SearchSuggResponse:
self.hasMore = hasMore # boolean
class DescInfo:
" Represents a node's associated description "
- def __init__(self, text, wikiId, fromRedirect, fromDbp):
+ def __init__(self, text, wikiId, fromDbp):
self.text = text # string
self.wikiId = wikiId # number
- self.fromRedirect = fromRedirect # boolean
self.fromDbp = fromDbp # boolean
class ImgInfo:
" Represents a node's associated image "
@@ -199,11 +198,11 @@ def lookupInfo(name, tree, dbCur):
namesToLookup = [name] if len(subNames) == 0 else [n for n in subNames if n != None]
# Get desc info
nameToDescInfo = {}
- query = "SELECT name, desc, wiki_id, redirected, from_dbp FROM" \
+ query = "SELECT name, desc, wiki_id, from_dbp FROM" \
" wiki_ids INNER JOIN descs ON wiki_ids.id = descs.wiki_id" \
" WHERE wiki_ids.name IN ({})".format(",".join(["?"] * len(namesToLookup)))
- for (nodeName, desc, wikiId, redirected, fromDbp) in dbCur.execute(query, namesToLookup):
- nameToDescInfo[nodeName] = DescInfo(desc, wikiId, redirected == 1, fromDbp == 1)
+ for (nodeName, desc, wikiId, fromDbp) in dbCur.execute(query, namesToLookup):
+ nameToDescInfo[nodeName] = DescInfo(desc, wikiId, fromDbp == 1)
# Get image info
nameToImgInfo = {}
idsToNames = {nameToNodes[n].imgName[:-4]: n for n in namesToLookup if nameToNodes[n].imgName != None}
diff --git a/backend/tolData/README.md b/backend/tolData/README.md
index 21c02ab..1248098 100644
--- a/backend/tolData/README.md
+++ b/backend/tolData/README.md
@@ -4,24 +4,24 @@ This directory holds files used to generate the tree-of-life database data.db.
## Tree Structure
- `nodes` <br>
Format : `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` <br>
- Represents a tree-of-life node. `tips` holds the number of no-child descendants.
+ Represents a tree-of-life node. `tips` holds the number of no-child descendants
- `edges` <br>
Format: `parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child)` <br>
`p_support` is 1 if the edge has 'phylogenetic support', and 0 otherwise
-## Node Names
+## Node Mappings
- `eol_ids` <br>
- Format: `id INT PRIMARY KEY, name TEXT` <br>
- Associates an EOL ID with a node's name.
+ Format: `name TEXT PRIMARY KEY, id INT` <br>
+ Associates nodes with EOL IDs
+- `wiki_ids` <br>
+ Format: `name TEXT PRIMARY KEY, id INT` <br>
+ Associates nodes with wikipedia page IDs
+## Node Vernacular Names
- `names` <br>
Format: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` <br>
Associates a node with alternative names.
`pref_alt` is 1 if the alt-name is the most 'preferred' one.
`src` indicates the dataset the alt-name was obtained from (can be 'eol', 'enwiki', or 'picked').
## Node Descriptions
-- `wiki_ids` <br>
- Format: `name TEXT PRIMARY KEY, id INT, redirected INT` <br>
- Associates a node with a wikipedia page ID.
- `redirected` is 1 if the node was associated with a different page that redirected to this one.
- `descs` <br>
Format: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` <br>
Associates a wikipedia page ID with a short-description.
@@ -42,61 +42,62 @@ This directory holds files used to generate the tree-of-life database data.db.
These are like `nodes`, but describe nodes of reduced trees.
- `edges_t`, `edges_i`, `edges_p` <br>
Like `edges` but for reduced trees.
+## Other
+- `node_iucn` <br>
+ Format: `name TEXT PRIMARY KEY, iucn TEXT` <br>
+ Associates nodes with IUCN conservation status strings (eg: 'endangered')
# Generating the Database
-For the most part, these steps should be done in order.
-
-As a warning, the whole process takes a lot of time and file space. The tree will probably
-have about 2.5 billion nodes. Downloading the images takes several days, and occupies over
-200 GB. And if you want good data, you'll likely need to make additional corrections,
-which can take several weeks.
+As a warning, the whole process takes a lot of time and file space. The
+tree will probably have about 2.6 million nodes. Downloading the images
+takes several days, and occupies over 200 GB.
## Environment
Some of the scripts use third-party packages:
-- jsonpickle: For encoding class objects as JSON.
-- requests: For downloading data.
-- PIL: For image processing.
-- tkinter: For providing a basic GUI to review images.
-- mwxml, mwparserfromhell: For parsing Wikipedia dumps.
+- `indexed_bzip2`: For parallelised bzip2 processing.
+- `jsonpickle`: For encoding class objects as JSON.
+- `requests`: For downloading data.
+- `PIL`: For image processing.
+- `tkinter`: For providing a basic GUI to review images.
+- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps.
## Generate Tree Structure Data
-1. Obtain files in otol/, as specified in it's README.
+1. Obtain 'tree data files' in otol/, as specified in its README.
2. Run genOtolData.py, which creates data.db, and adds the `nodes` and `edges` tables,
using data in otol/. It also uses these files, if they exist:
- - pickedOtolNames.txt: Has lines of the form `name1|otolId1`. When nodes in the
- tree have the same name (eg: Pholidota can refer to pangolins or orchids),
- they get the names 'name1', 'name1 [2]', 'name1 [3], etc. This file is used to
- forcibly specify which node should be named 'name1'.
+ - pickedOtolNames.txt: Has lines of the form `name1|otolId1`.
+ Can be used to override numeric suffixes added to same-name nodes.
+
+## Generate Dataset Mappings
+1. Obtain 'taxonomy data files' in otol/, 'mapping files' in eol/,
+ files in wikidata/, and 'dump-index files' in enwiki/, as specified
+ in their READMEs.
+2. Run genMappingData.py, which adds the `eol_ids` and `wiki_ids` tables,
+ using the files obtained above, and the `nodes` table. It also uses
+ 'picked mappings' files, if they exist.
+ - pickedEolIds.txt contains lines like `3785967|405349`, specifying
+ an otol ID and an eol ID to map it to. The eol ID can be empty,
+ in which case the otol ID won't be mapped.
+ - pickedWikiIds.txt and pickedWikiIdsRough.txt contain lines like
+ `5341349|Human`, specifying an otol ID and an enwiki title,
+ which may contain spaces. The title can be empty.
-## Generate Node Names Data
-1. Obtain 'name data files' in eol/, as specified in it's README.
-2. Run genEolNameData.py, which adds the `names` and `eol_ids` tables, using data in
- eol/ and the `nodes` table. It also uses these files, if they exist:
- - pickedEolIds.txt: Has lines of the form `nodeName1|eolId1` or `nodeName1|`.
- Specifies node names that should have a particular EOL ID, or no ID.
- Quite a few taxons have ambiguous names, and may need manual correction.
- For example, Viola may resolve to a taxon of butterflies or of plants.
- - pickedEolAltsToSkip.txt: Has lines of the form `nodeName1|altName1`.
- Specifies that a node's alt-name set should exclude altName1.
+## Generate Node Name Data
+1. Obtain 'name data files' in eol/, and 'description database files' in enwiki/,
+ as specified in their READMEs.
+2. Run genNameData.py, which adds the `names` table, using data in eol/ and enwiki/,
+ along with the `nodes`, `eol_ids`, and `wiki_ids` tables. <br>
+ It also uses pickedNames.txt, if it exists. This file can hold lines like
+ `embryophyta|land plant|1`, specifying a node name, an alt-name to add for it,
+ and a 1 or 0 indicating whether it is a 'preferred' alt-name. The last field
+ can be empty, which indicates that the alt-name should be removed, or, if the
+ alt-name is the same as the node name, that no alt-name should be preferred.
## Generate Node Description Data
-### Get Data from DBpedia
1. Obtain files in dbpedia/, as specified in it's README.
-2. Run genDbpData.py, which adds the `wiki_ids` and `descs` tables, using data in
- dbpedia/ and the `nodes` table. It also uses these files, if they exist:
- - pickedEnwikiNamesToSkip.txt: Each line holds the name of a node for which
- no description should be obtained. Many node names have a same-name
- wikipedia page that describes something different (eg: Osiris).
- - pickedDbpLabels.txt: Has lines of the form `nodeName1|label1`.
- Specifies node names that should have a particular associated page label.
-### Get Data from Wikipedia
-1. Obtain 'description database files' in enwiki/, as specified in it's README.
-2. Run genEnwikiDescData.py, which adds to the `wiki_ids` and `descs` tables,
- using data in enwiki/ and the `nodes` table.
- It also uses these files, if they exist:
- - pickedEnwikiNamesToSkip.txt: Same as with genDbpData.py.
- - pickedEnwikiLabels.txt: Similar to pickedDbpLabels.txt.
+2. Run genDescData.py, which adds the `descs` table, using data in dbpedia/ and
+ enwiki/, and the `wiki_ids` table.
## Generate Node Images Data
### Get images from EOL
@@ -129,21 +130,12 @@ Some of the scripts use third-party packages:
- An input image might produce output with unexpected dimensions.
This seems to happen when the image is very large, and triggers a
decompression bomb warning.
- In testing, this resulted in about 150k images, with about 2/3 of them
- being from Wikipedia.
### Add more Image Associations
1. Run genLinkedImgs.py, which tries to associate nodes without images to
images of it's children. Adds the `linked_imgs` table, and uses the
`nodes`, `edges`, and `node_imgs` tables.
## Do some Post-Processing
-1. Run genEnwikiNameData.py, which adds more entries to the `names` table,
- using data in enwiki/, and the `names` and `wiki_ids` tables.
-2. Optionally run addPickedNames.py, which allows adding manually-selected name data to
- the `names` table, as specified in pickedNames.txt.
- - pickedNames.txt: Has lines of the form `nodeName1|altName1|prefAlt1`.
- These correspond to entries in the `names` table. `prefAlt` should be 1 or 0.
- A line like `name1|name1|1` causes a node to have no preferred alt-name.
-3. Run genReducedTrees.py, which generates multiple reduced versions of the tree,
+1. Run genReducedTrees.py, which generates multiple reduced versions of the tree,
adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from
pickedNodes.txt, which lists names of nodes that must be included (1 per line).
diff --git a/backend/tolData/addPickedNames.py b/backend/tolData/addPickedNames.py
deleted file mode 100755
index 9b56422..0000000
--- a/backend/tolData/addPickedNames.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/python3
-
-import sys
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads alt-name data from a file, and adds it to the database's 'names' table
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-dbFile = "data.db"
-pickedNamesFile = "pickedNames.txt"
-
-print("Opening database")
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Iterating through picked-names file")
-with open(pickedNamesFile) as file:
- for line in file:
- # Get record data
- nodeName, altName, prefAlt = line.lower().rstrip().split("|")
- prefAlt = int(prefAlt)
- # Check whether there exists a node with the name
- row = dbCur.execute("SELECT name from nodes where name = ?", (nodeName,)).fetchone()
- if row == None:
- print(f"ERROR: No node with name \"{nodeName}\" exists")
- break
- # Remove any existing preferred-alt status
- if prefAlt == 1:
- query = "SELECT name, alt_name FROM names WHERE name = ? AND pref_alt = 1"
- row = dbCur.execute(query, (nodeName,)).fetchone()
- if row != None and row[1] != altName:
- print(f"Removing pref-alt status from alt-name {row[1]} for {nodeName}")
- dbCur.execute("UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ?", row)
- # Check for an existing record
- if nodeName == altName:
- continue
- query = "SELECT name, alt_name, pref_alt FROM names WHERE name = ? AND alt_name = ?"
- row = dbCur.execute(query, (nodeName, altName)).fetchone()
- if row == None:
- print(f"Adding record for alt-name {altName} for {nodeName}")
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'picked')", (nodeName, altName, prefAlt))
- else:
- # Update existing record
- if row[2] != prefAlt:
- print(f"Updating record for alt-name {altName} for {nodeName}")
- dbCur.execute("UPDATE names SET pref_alt = ?, src = 'picked' WHERE name = ? AND alt_name = ?",
- (prefAlt, nodeName, altName))
-
-print("Closing database")
-dbCon.commit()
-dbCon.close()
diff --git a/backend/tolData/dbpedia/genDescData.py b/backend/tolData/dbpedia/genDescData.py
index a23199d..8756a40 100755
--- a/backend/tolData/dbpedia/genDescData.py
+++ b/backend/tolData/dbpedia/genDescData.py
@@ -41,6 +41,7 @@ with bz2.open(labelsFile, mode='rt') as file:
print("Reading/storing wiki page ids")
dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)")
+dbCur.execute("CREATE INDEX ids_idx ON ids(id)")
idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n')
lineNum = 0
with bz2.open(idsFile, mode='rt') as file:
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md
index dfced94..7df21c9 100644
--- a/backend/tolData/enwiki/README.md
+++ b/backend/tolData/enwiki/README.md
@@ -11,7 +11,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
providing, for each page, an offset into the dump file of a chunk of
100 pages that includes it.
-# Generated Dump-Index Files
+# Dump-Index Files
- genDumpIndexDb.py <br>
Creates an sqlite-database version of the enwiki-dump index file.
- dumpIndex.db <br>
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
index 97e696f..b5d546d 100755
--- a/backend/tolData/enwiki/genImgData.py
+++ b/backend/tolData/enwiki/genImgData.py
@@ -30,7 +30,6 @@ imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
-# In testing, got about 360k image names
print("Getting input page-ids")
pageIds = getInputPageIds()
diff --git a/backend/tolData/eol/README.md b/backend/tolData/eol/README.md
index 1a9dbdf..c07b48e 100644
--- a/backend/tolData/eol/README.md
+++ b/backend/tolData/eol/README.md
@@ -1,4 +1,9 @@
-This directory holds files obtained from/using the [Encyclopedia of Life](https://eol.org/).
+This directory holds files obtained via the [Encyclopedia of Life](https://eol.org/).
+
+# Mapping Files
+- `provider_ids.csv.gz` <br>
+ Obtained from <https://opendata.eol.org/dataset/identifier-map> on 22/08/22 (says last updated 27/07/22).
+ Associates EOL IDs with taxon IDs from sources like NCBI and Index Fungorum.
# Name Data Files
- vernacularNames.csv <br>
diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py
index 7ca4e79..4d658e7 100755
--- a/backend/tolData/eol/downloadImgs.py
+++ b/backend/tolData/eol/downloadImgs.py
@@ -22,8 +22,6 @@ highest EOL ID.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-# In testing, this script downloaded about 70k images, over a few days
-
imagesListDb = "imagesList.db"
def getInputEolIds():
eolIds = set()
@@ -95,7 +93,6 @@ def downloadImg(url, outFile):
for idx in range(nextIdx, len(eolIds)):
eolId = eolIds[idx]
# Get image urls
- imgDataList = []
ownerSet = set() # Used to get images from different owners, for variety
exitLoop = False
query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py
index 0c45887..4dcb6d9 100755
--- a/backend/tolData/eol/genImagesListDb.py
+++ b/backend/tolData/eol/genImagesListDb.py
@@ -18,6 +18,7 @@ dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE images" \
" (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)")
+dbCur.execute("CREATE INDEX images_pid_idx ON images(page_id)")
print("Reading CSV files")
csvFilenames = os.listdir(imagesListDir)
for filename in csvFilenames:
diff --git a/backend/tolData/genDbpData.py b/backend/tolData/genDbpData.py
deleted file mode 100755
index 9d52e1d..0000000
--- a/backend/tolData/genDbpData.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os, re
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads a database containing data from DBpedia, and tries to associate
-DBpedia IRIs with nodes in the tree-of-life database, adding
-short-descriptions for them.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-dbpediaDb = "dbpedia/descData.db"
-namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
-pickedLabelsFile = "pickedDbpLabels.txt"
-dbFile = "data.db"
-rootNodeName = "cellular organisms"
-rootLabel = "Organism" # Will be associated with root node
-# Got about 400k descriptions when testing
-
-print("Opening databases")
-dbpCon = sqlite3.connect(dbpediaDb)
-dbpCur = dbpCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Getting node names")
-nodeNames = set()
-for (name,) in dbCur.execute("SELECT name from nodes"):
- nodeNames.add(name)
-
-print("Checking for names to skip")
-oldSz = len(nodeNames)
-if os.path.exists(namesToSkipFile):
- with open(namesToSkipFile) as file:
- for line in file:
- nodeNames.remove(line.rstrip())
-print(f"Skipping {oldSz - len(nodeNames)} nodes")
-
-print("Reading disambiguation-page labels")
-disambigLabels = set()
-query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
-for (label,) in dbpCur.execute(query):
- disambigLabels.add(label)
-
-print("Trying to associate nodes with DBpedia labels")
-nodeToLabel = {}
-nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)'
-nameToVariants = {} # Maps node names to lists of matching labels
-iterNum = 0
-for (label,) in dbpCur.execute("SELECT label from labels"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"At iteration {iterNum}")
- #
- if label in disambigLabels:
- continue
- name = label.lower()
- if name in nodeNames:
- if name not in nameToVariants:
- nameToVariants[name] = [label]
- elif label not in nameToVariants[name]:
- nameToVariants[name].append(label)
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- subName = match.group(1)
- if subName in nodeNames and match.group(2) != "disambiguation":
- if subName not in nameToVariants:
- nameToVariants[subName] = [label]
- elif name not in nameToVariants[subName]:
- nameToVariants[subName].append(label)
-# Associate labels without conflicts
-for (name, variants) in nameToVariants.items():
- if len(variants) == 1:
- nodeToLabel[name] = variants[0]
-for name in nodeToLabel:
- del nameToVariants[name]
-# Special case for root node
-nodeToLabel[rootNodeName] = rootLabel
-if rootNodeName in nameToVariants:
- del nameToVariants["cellular organisms"]
-
-print(f"Trying to resolve {len(nameToVariants)} conflicts")
-def resolveWithPickedLabels():
- " Attempts to resolve conflicts using a picked-names file "
- with open(pickedLabelsFile) as file:
- for line in file:
- (name, _, label) = line.rstrip().partition("|")
- if name not in nameToVariants:
- print(f"WARNING: No conflict found for name \"{name}\"", file=sys.stderr)
- continue
- if label == "":
- del nameToVariants[name]
- else:
- if label not in nameToVariants[name]:
- print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr)
- nodeToLabel[name] = label
- del nameToVariants[name]
-def resolveWithCategoryList():
- """
- Attempts to resolve conflicts by looking for labels like 'name1 (category1)',
- and choosing those with a category1 that seems 'biological'.
- Does two passes, using more generic categories first. This helps avoid stuff like
- Pan being classified as a horse instead of an ape.
- """
- generalCategories = {
- "species", "genus",
- "plant", "fungus", "animal",
- "annelid", "mollusc", "arthropod", "crustacean", "insect", "bug",
- "fish", "amphibian", "reptile", "bird", "mammal",
- }
- specificCategories = {
- "protist", "alveolate", "dinoflagellates",
- "orchid", "poaceae", "fern", "moss", "alga",
- "bryozoan", "hydrozoan",
- "sponge", "cnidarian", "coral", "polychaete", "echinoderm",
- "bivalve", "gastropod", "chiton",
- "shrimp", "decapod", "crab", "barnacle", "copepod",
- "arachnid", "spider", "harvestman", "mite",
- "dragonfly", "mantis", "cicada", "grasshopper", "planthopper",
- "beetle", "fly", "butterfly", "moth", "wasp",
- "catfish",
- "frog",
- "lizard",
- "horse", "sheep", "cattle", "mouse",
- }
- namesToRemove = set()
- for (name, variants) in nameToVariants.items():
- found = False
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2).lower() in generalCategories:
- nodeToLabel[name] = label
- namesToRemove.add(name)
- found = True
- break
- if not found:
- for label in variants:
- match = nameVariantRegex.match(label)
- if match != None and match.group(2).lower() in specificCategories:
- nodeToLabel[name] = label
- namesToRemove.add(name)
- break
- for name in namesToRemove:
- del nameToVariants[name]
-def resolveWithTypeData():
- " Attempts to resolve conflicts using DBpedia's type data "
- taxonTypes = { # Obtained from the DBpedia ontology
- "http://dbpedia.org/ontology/Species",
- "http://dbpedia.org/ontology/Archaea",
- "http://dbpedia.org/ontology/Bacteria",
- "http://dbpedia.org/ontology/Eukaryote",
- "http://dbpedia.org/ontology/Plant",
- "http://dbpedia.org/ontology/ClubMoss",
- "http://dbpedia.org/ontology/Conifer",
- "http://dbpedia.org/ontology/CultivatedVariety",
- "http://dbpedia.org/ontology/Cycad",
- "http://dbpedia.org/ontology/Fern",
- "http://dbpedia.org/ontology/FloweringPlant",
- "http://dbpedia.org/ontology/Grape",
- "http://dbpedia.org/ontology/Ginkgo",
- "http://dbpedia.org/ontology/Gnetophytes",
- "http://dbpedia.org/ontology/GreenAlga",
- "http://dbpedia.org/ontology/Moss",
- "http://dbpedia.org/ontology/Fungus",
- "http://dbpedia.org/ontology/Animal",
- "http://dbpedia.org/ontology/Fish",
- "http://dbpedia.org/ontology/Crustacean",
- "http://dbpedia.org/ontology/Mollusca",
- "http://dbpedia.org/ontology/Insect",
- "http://dbpedia.org/ontology/Arachnid",
- "http://dbpedia.org/ontology/Amphibian",
- "http://dbpedia.org/ontology/Reptile",
- "http://dbpedia.org/ontology/Bird",
- "http://dbpedia.org/ontology/Mammal",
- "http://dbpedia.org/ontology/Cat",
- "http://dbpedia.org/ontology/Dog",
- "http://dbpedia.org/ontology/Horse",
- }
- iterNum = 0
- for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"At iteration {iterNum}")
- #
- if type in taxonTypes:
- name = label.lower()
- if name in nameToVariants:
- nodeToLabel[name] = label
- del nameToVariants[name]
- else:
- match = nameVariantRegex.fullmatch(name)
- if match != None:
- name = match.group(1).lower()
- if name in nameToVariants:
- nodeToLabel[name] = label
- del nameToVariants[name]
-#resolveWithTypeData()
-#resolveWithCategoryList()
-resolveWithPickedLabels()
-print(f"Remaining number of conflicts: {len(nameToVariants)}")
-
-print("Getting node IRIs")
-nodeToIri = {}
-for (name, label) in nodeToLabel.items():
- (iri,) = dbpCur.execute("SELECT iri FROM labels where label = ?", (label,)).fetchone()
- nodeToIri[name] = iri
-
-print("Resolving redirects")
-redirectingIriSet = set()
-iterNum = 0
-for (name, iri) in nodeToIri.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- row = dbpCur.execute("SELECT target FROM redirects where iri = ?", (iri,)).fetchone()
- if row != None:
- nodeToIri[name] = row[0]
- redirectingIriSet.add(name)
-
-print("Adding description tables")
-dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)")
-dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)")
-dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)")
-iterNum = 0
-for (name, iri) in nodeToIri.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
- row = dbpCur.execute(query, (iri,)).fetchone()
- if row != None:
- desc, wikiId = row
- dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0))
- dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-dbpCon.commit()
-dbpCon.close()
diff --git a/backend/tolData/genDescData.py b/backend/tolData/genDescData.py
new file mode 100755
index 0000000..28971f4
--- /dev/null
+++ b/backend/tolData/genDescData.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import sqlite3
+
+import argparse
+parser = argparse.ArgumentParser(description='''
+Maps nodes to short descriptions, using data from DBpedia and
+Wikipedia, and stores results in the database.
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+dbpediaDb = 'dbpedia/descData.db'
+enwikiDb = 'enwiki/descData.db'
+dbFile = 'data.db'
+
+print('Creating table')
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
+
+print('Getting node mappings')
+nodeToWikiId = {}
+for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
+ nodeToWikiId[name] = wikiId
+
+print('Reading data from DBpedia')
+dbpCon = sqlite3.connect(dbpediaDb)
+dbpCur = dbpCon.cursor()
+print('Getting node IRIs')
+nodeToIri = {}
+iterNum = 0
+for name, wikiId in nodeToWikiId.items():
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
+ if row != None:
+ nodeToIri[name] = row[0]
+print('Resolving redirects')
+iterNum = 0
+for name, iri in nodeToIri.items():
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
+ if row != None:
+ nodeToIri[name] = row[0]
+print('Adding descriptions')
+iterNum = 0
+for name, iri in nodeToIri.items():
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
+ if row != None:
+ dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
+ del nodeToWikiId[name]
+dbpCon.close()
+
+print('Reading data from Wikipedia')
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+print('Resolving redirects')
+iterNum = 0
+for name, wikiId in nodeToWikiId.items():
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?'
+ row = enwikiCur.execute(query, (wikiId,)).fetchone()
+ if row != None:
+ nodeToWikiId[name] = row[0]
+print('Adding descriptions')
+iterNum = 0
+for name, wikiId in nodeToWikiId.items():
+ iterNum += 1
+ if iterNum % 1e3 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiId,)).fetchone()
+ if row != None:
+ dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
+
+print('Closing databases')
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/genEnwikiDescData.py b/backend/tolData/genEnwikiDescData.py
deleted file mode 100755
index e8a69ba..0000000
--- a/backend/tolData/genEnwikiDescData.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads a database containing data from Wikipedia, and tries to associate
-wiki pages with nodes in the tree-of-life database, and add descriptions for
-nodes that don't have them.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-enwikiDb = "enwiki/descData.db"
-dbFile = "data.db"
-namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
-pickedLabelsFile = "pickedEnwikiLabels.txt"
-# Got about 25k descriptions when testing
-
-print("Opening databases")
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Checking for names to skip")
-namesToSkip = set()
-if os.path.exists(namesToSkipFile):
- with open(namesToSkipFile) as file:
- for line in file:
- namesToSkip.add(line.rstrip())
- print(f"Found {len(namesToSkip)}")
-print("Checking for picked-titles")
-nameToPickedTitle = {}
-if os.path.exists(pickedLabelsFile):
- with open(pickedLabelsFile) as file:
- for line in file:
- (name, _, title) = line.rstrip().partition("|")
- nameToPickedTitle[name.lower()] = title
-print(f"Found {len(nameToPickedTitle)}")
-
-print("Getting names of nodes without descriptions")
-nodeNames = set()
-query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL"
-for (name,) in dbCur.execute(query):
- nodeNames.add(name)
-print(f"Found {len(nodeNames)}")
-nodeNames.difference_update(namesToSkip)
-
-print("Associating nodes with page IDs")
-nodeToPageId = {}
-iterNum = 0
-for name in nodeNames:
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- if name not in nameToPickedTitle:
- row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
- if row != None:
- nodeToPageId[name] = row[0]
- else:
- title = nameToPickedTitle[name]
- row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone()
- if row != None:
- nodeToPageId[name] = row[0]
- else:
- print("WARNING: Picked title {title} not found", file=sys.stderr)
-
-print("Resolving redirects")
-redirectingNames = set()
-iterNum = 0
-for (name, pageId) in nodeToPageId.items():
- iterNum += 1
- if iterNum % 1e3 == 0:
- print(f"At iteration {iterNum}")
- #
- query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?"
- row = enwikiCur.execute(query, (pageId,)).fetchone()
- if row != None:
- nodeToPageId[name] = row[0]
- redirectingNames.add(name)
-
-print("Adding description data")
-iterNum = 0
-for (name, pageId) in nodeToPageId.items():
- iterNum += 1
- if iterNum % 1e3 == 0:
- print(f"At iteration {iterNum}")
- #
- row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
- if row != None:
- dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0))
- dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-enwikiCon.close()
diff --git a/backend/tolData/genEnwikiNameData.py b/backend/tolData/genEnwikiNameData.py
deleted file mode 100755
index ec76cca..0000000
--- a/backend/tolData/genEnwikiNameData.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads from a database containing data from Wikipdia, along with
-node and wiki-id information from the database, and use wikipedia
-page-redirect information to add additional alt-name data.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-enwikiDb = "enwiki/descData.db"
-dbFile = "data.db"
-altNameRegex = re.compile(r"[a-zA-Z]+")
- # Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)',
-
-print("Opening databases")
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Getting nodes with wiki IDs")
-nodeToWikiId = {}
-for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"):
- nodeToWikiId[nodeName] = wikiId
-print(f"Found {len(nodeToWikiId)}")
-
-print("Iterating through nodes, finding names that redirect to them")
-nodeToAltNames = {}
-numAltNames = 0
-iterNum = 0
-for (nodeName, wikiId) in nodeToWikiId.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- nodeToAltNames[nodeName] = set()
- query = "SELECT p1.title FROM pages p1" \
- " INNER JOIN redirects r1 ON p1.id = r1.id" \
- " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
- for (name,) in enwikiCur.execute(query, (wikiId,)):
- if altNameRegex.fullmatch(name) != None and name.lower() != nodeName:
- nodeToAltNames[nodeName].add(name.lower())
- numAltNames += 1
-print(f"Found {numAltNames} alt-names")
-
-print("Excluding existing alt-names from the set")
-query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
-iterNum = 0
-for (nodeName, altNames) in nodeToAltNames.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- existingNames = set()
- for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
- existingNames.add(name)
- numAltNames -= len(existingNames)
- altNames.difference_update(existingNames)
-print(f"Left with {numAltNames} alt-names")
-
-print("Adding alt-names to database")
-for (nodeName, altNames) in nodeToAltNames.items():
- for altName in altNames:
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-enwikiCon.close()
diff --git a/backend/tolData/genEolNameData.py b/backend/tolData/genEolNameData.py
deleted file mode 100755
index 2c5414b..0000000
--- a/backend/tolData/genEolNameData.py
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import html, csv, sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads files describing name data from the 'Encyclopedia of Life' site,
-tries to associate names with nodes in the tree-of-life database,
-and adds tables to represent associated names.
-
-Reads a vernacularNames.csv file:
- Starts with a header line containing:
- page_id, canonical_form, vernacular_string, language_code,
- resource_name, is_preferred_by_resource, is_preferred_by_eol
- The canonical_form and vernacular_string fields contain names
- associated with the page ID. Names are not always unique to
- particular page IDs.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries
-dbFile = "data.db"
-namesToSkip = {"unknown", "unknown species", "unidentified species"}
-pickedIdsFile = "pickedEolIds.txt"
-altsToSkipFile = "pickedEolAltsToSkip.txt"
-
-print("Reading in vernacular-names data")
-nameToPids = {} # 'pid' means 'Page ID'
-canonicalNameToPids = {}
-pidToNames = {}
-pidToPreferred = {} # Maps pids to 'preferred' names
-def updateMaps(name, pid, canonical, preferredAlt):
- global namesToSkip, nameToPids, canonicalNameToPids, pidToNames, pidToPreferred
- if name in namesToSkip:
- return
- if name not in nameToPids:
- nameToPids[name] = {pid}
- else:
- nameToPids[name].add(pid)
- if canonical:
- if name not in canonicalNameToPids:
- canonicalNameToPids[name] = {pid}
- else:
- canonicalNameToPids[name].add(pid)
- if pid not in pidToNames:
- pidToNames[pid] = {name}
- else:
- pidToNames[pid].add(name)
- if preferredAlt:
- pidToPreferred[pid] = name
-with open(vnamesFile, newline="") as csvfile:
- reader = csv.reader(csvfile)
- lineNum = 0
- for row in reader:
- lineNum += 1
- if lineNum % 1e5 == 0:
- print(f"At line {lineNum}")
- # Skip header line
- if lineNum == 1:
- continue
- # Parse line
- pid = int(row[0])
- name1 = re.sub(r"<[^>]+>", "", row[1].lower()) # Remove tags
- name2 = html.unescape(row[2]).lower()
- lang = row[3]
- preferred = row[6] == "preferred"
- # Add to maps
- updateMaps(name1, pid, True, False)
- if lang == "eng" and name2 != "":
- updateMaps(name2, pid, False, preferred)
-
-print("Checking for manually-picked pids")
-nameToPickedPid = {}
-if os.path.exists(pickedIdsFile):
- with open(pickedIdsFile) as file:
- for line in file:
- (name, _, eolId) = line.rstrip().partition("|")
- nameToPickedPid[name] = None if eolId == "" else int(eolId)
-print(f"Found {len(nameToPickedPid)}")
-
-print("Checking for alt-names to skip")
-nameToAltsToSkip = {}
-numToSkip = 0
-if os.path.exists(altsToSkipFile):
- with open(altsToSkipFile) as file:
- for line in file:
- (name, _, altName) = line.rstrip().partition("|")
- if name not in nameToAltsToSkip:
- nameToAltsToSkip[name] = [altName]
- else:
- nameToAltsToSkip[name].append(altName)
- numToSkip += 1
-print(f"Found {numToSkip} alt-names to skip")
-
-print("Creating database tables")
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))")
-dbCur.execute("CREATE INDEX names_idx ON names(name)")
-dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)")
-dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
-dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)")
-
-print("Associating nodes with names")
-usedPids = set()
-unresolvedNodeNames = set()
-dbCur2 = dbCon.cursor()
-def addToDb(nodeName, pidToUse):
- " Adds page-ID-associated name data to a node in the database "
- global dbCur, pidToPreferred
- dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
- # Get alt-names
- altNames = set()
- for n in pidToNames[pidToUse]:
- # Avoid alt-names with >3 words
- if len(n.split(" ")) > 3:
- continue
- # Avoid alt-names that already name a node in the database
- if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None:
- continue
- # Check for picked alt-name-to-skip
- if nodeName in nameToAltsToSkip and n in nameToAltsToSkip[nodeName]:
- print(f"Excluding alt-name {n} for node {nodeName}")
- continue
- #
- altNames.add(n)
- # Add alt-names to db
- preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
- for n in altNames:
- isPreferred = 1 if (n == preferredName) else 0
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))
-print("Adding picked IDs")
-for (name, pid) in nameToPickedPid.items():
- if pid != None:
- addToDb(name, pid)
- usedPids.add(pid)
-print("Associating nodes with canonical names")
-iterNum = 0
-for (nodeName,) in dbCur2.execute("SELECT name FROM nodes"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"At iteration {iterNum}")
- if nodeName in nameToPickedPid:
- continue
- # Check for matching canonical name
- if nodeName in canonicalNameToPids:
- pidToUse = None
- # Pick an associated page ID
- for pid in canonicalNameToPids[nodeName]:
- hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred
- hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred
- if hasLowerPrio:
- continue
- if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio):
- pidToUse = pid
- if pidToUse != None:
- addToDb(nodeName, pidToUse)
- usedPids.add(pidToUse)
- elif nodeName in nameToPids:
- unresolvedNodeNames.add(nodeName)
-print("Associating leftover nodes with other names")
-iterNum = 0
-for nodeName in unresolvedNodeNames:
- iterNum += 1
- if iterNum % 100 == 0:
- print(f"At iteration {iterNum}")
- # Check for matching name
- pidToUse = None
- for pid in nameToPids[nodeName]:
- # Pick an associated page ID
- if pid not in usedPids and (pidToUse == None or pid < pidToUse):
- pidToUse = pid
- if pidToUse != None:
- addToDb(nodeName, pidToUse)
- usedPids.add(pidToUse)
-
-print("Closing database")
-dbCon.commit()
-dbCon.close()
diff --git a/backend/tolData/genMappingData.py b/backend/tolData/genMappingData.py
new file mode 100755
index 0000000..d562d7e
--- /dev/null
+++ b/backend/tolData/genMappingData.py
@@ -0,0 +1,229 @@
+#!/usr/bin/python3
+
+import sys, re, os
+from collections import defaultdict
+import gzip, bz2, csv, sqlite3
+
+import argparse
+parser = argparse.ArgumentParser(description='''
+Maps otol IDs to EOL and enwiki titles, using IDs from various
+other sources (like NCBI).
+
+Reads otol taxonomy data to get source IDs for otol IDs,
+then looks up those IDs in an EOL provider_ids file,
+and in a wikidata dump, and stores results in the database.
+
+Based on code from https://github.com/OneZoom/OZtree, located in
+OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+taxonomyFile = 'otol/taxonomy.tsv'
+eolIdsFile = 'eol/provider_ids.csv.gz'
+wikidataDb = 'wikidata/taxonSrcs.db'
+enwikiDumpIndexDb = 'enwiki/dumpIndex.db'
+pickedMappings = {
+ 'eol': ['pickedEolIds.txt'],
+ 'enwiki': ['pickedWikiIds.txt', 'pickedWikiIdsRough.txt']
+}
+dbFile = 'data.db'
+
+print('Reading taxonomy file')
+# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
+ # uid (otol-id, eg: 93302), parent_uid, name, rank,
+ # sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
+OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
+nodeToSrcIds = defaultdict(dict) # Maps otol ID to {src1: id1, src2: id2, ...}
+usedSrcIds = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
+with open(taxonomyFile) as file: # Had about 4.5e6 lines
+ lineNum = 0
+ for line in file:
+ lineNum += 1
+ if lineNum % 1e5 == 0:
+ print(f'At line {lineNum}')
+ # Skip header line
+ if lineNum == 1:
+ continue
+ # Parse line
+ fields = line.split('\t|\t')
+ try:
+ otolId = int(fields[0])
+ except ValueError:
+ print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
+ continue
+ srcInfo = fields[4]
+ # Add source IDs
+ for srcPair in srcInfo.split(','):
+ src, srcId = srcPair.split(':', 1)
+ if srcId.isdecimal() and src in OTOL_SRCS and src not in nodeToSrcIds[otolId]:
+ srcId = int(srcId)
+ nodeToSrcIds[otolId][src] = srcId
+ usedSrcIds.add((src, srcId))
+print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
+print('Reading EOL provider_ids file')
+# The CSV file has a header line, then lines that hold these fields:
+ # node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
+ # page_id (eol ID), preferred_canonical_for_page
+EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps ints to external-source names
+srcToEolId = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
+with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
+ for lineNum, row in enumerate(csv.reader(file), 1):
+ if lineNum % 1e6 == 0:
+ print(f'At line {lineNum}')
+ # Skip header line
+ if lineNum == 1:
+ continue
+ # Parse line
+ eolId = int(row[3])
+ srcVal = int(row[2])
+ srcId = row[1]
+ if srcId.isdecimal() and srcVal in EOL_SRCS:
+ srcId = int(srcId)
+ src = EOL_SRCS[srcVal]
+ if (src, srcId) not in usedSrcIds:
+ continue
+ if srcId in srcToEolId[src]:
+ print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
+ continue
+ srcToEolId[src][srcId] = eolId
+print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
+ # Was about 3.5e6 (4.2e6 without usedSrcIds)
+
+print('Resolving candidate EOL IDs')
+# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
+nodeToEolId = {} # Maps otol ID to eol ID
+for otolId, srcInfo in nodeToSrcIds.items():
+ eolIdToCount = defaultdict(int)
+ for src, srcId in srcInfo.items():
+ if src in srcToEolId and srcId in srcToEolId[src]:
+ eolId = srcToEolId[src][srcId]
+ eolIdToCount[eolId] += 1
+ if len(eolIdToCount) == 1:
+ nodeToEolId[otolId] = list(eolIdToCount)[0]
+ elif len(eolIdToCount) > 1:
+ # For multiple candidates, prefer those with most sources, and break ties by picking the lowest
+ maxCount = max(eolIdToCount.values())
+ eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
+ nodeToEolId[otolId] = min(eolIds)
+print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
+print('Reading from Wikidata db')
+srcToWikiTitle = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
+wikiTitles = set()
+titleToIucnStatus = {}
+dbCon = sqlite3.connect(wikidataDb)
+dbCur = dbCon.cursor()
+for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
+ if (src, srcId) not in usedSrcIds and src != 'eol': # Keep EOL IDs for later use
+ continue
+ srcToWikiTitle[src][srcId] = title
+ wikiTitles.add(title)
+for title, status in dbCur.execute('SELECT title, status from title_iucn'):
+ if title in wikiTitles:
+ titleToIucnStatus[title] = status
+print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries')
+ # Was about 1.1e6 (1.2e6 without usedSrcIds)
+print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
+dbCon.close()
+
+print('Resolving candidate Wikidata items')
+# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
+nodeToWikiTitle = {}
+for otolId, srcInfo in nodeToSrcIds.items():
+ titleToSrcs = defaultdict(list) # Maps candidate titles to lists of matching sources [src1, src2, ...]
+ for src, srcId in srcInfo.items():
+ if src in srcToWikiTitle and srcId in srcToWikiTitle[src]:
+ title = srcToWikiTitle[src][srcId]
+ titleToSrcs[title].append(src)
+ # Choose title to use
+ if len(titleToSrcs) == 1:
+ nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
+ elif len(titleToSrcs) > 1: # Test example: otol ID 621052
+ # Get titles with most sources
+ maxSrcCnt = max([len(srcs) for srcs in titleToSrcs.values()])
+ titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
+ if len(titleToSrcs) == 1:
+ nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
+ else: # Test example: otol ID 4235272
+ # Get a title with a source with highest priority
+ srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]}
+ for src in OTOL_SRCS:
+ if src in srcToTitle:
+ nodeToWikiTitle[otolId] = srcToTitle[src]
+ break
+print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
+
+print('Adding extra EOL mappings from Wikidata')
+eolIdToNode = {eolId: node for node, eolId in nodeToEolId.items()}
+wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
+addedEntries = {}
+for eolId, title in srcToWikiTitle['eol'].items():
+ if title in wikiTitleToNode:
+ otolId = wikiTitleToNode[title]
+ if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
+ nodeToEolId[otolId] = eolId
+ addedEntries[otolId] = eolId
+print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
+print('Reading picked mappings')
+for src in pickedMappings:
+ for filename in pickedMappings[src]:
+ if not os.path.exists(filename):
+ continue
+ with open(filename) as file:
+ for line in file:
+ otolId, mappedVal = line.rstrip().split('|')
+ otolId = int(otolId)
+ if src == 'eol':
+ if mappedVal:
+ nodeToEolId[otolId] = int(mappedVal)
+ else:
+ if otolId in nodeToEolId:
+ del nodeToEolId[otolId]
+ else: # src == 'enwiki'
+ if mappedVal:
+ nodeToWikiTitle[otolId] = mappedVal
+ else:
+ if otolId in nodeToWikiTitle:
+ del nodeToWikiTitle[otolId]
+
+print(f'Getting enwiki page IDs')
+titleToPageId = {}
+numNotFound = 0
+dbCon = sqlite3.connect(enwikiDumpIndexDb)
+dbCur = dbCon.cursor()
+for title in nodeToWikiTitle.values():
+ row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+ if row != None:
+ titleToPageId[title] = row[0]
+ else:
+ numNotFound += 1
+dbCon.close()
+print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
+
+print('Writing to db')
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get otol id-to-name map
+otolIdToName = {}
+for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
+ if nodeId.startswith('ott'):
+ otolIdToName[int(nodeId[3:])] = nodeName
+# Add eol mappings
+dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
+dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
+for otolId, eolId in nodeToEolId.items():
+ if otolId in otolIdToName:
+ dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+# Add enwiki mappings
+dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
+dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
+dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
+for otolId, title in nodeToWikiTitle.items():
+ if otolId in otolIdToName and title in titleToPageId:
+ dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
+ if title in titleToIucnStatus:
+ dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/genNameData.py b/backend/tolData/genNameData.py
new file mode 100755
index 0000000..7e6c025
--- /dev/null
+++ b/backend/tolData/genNameData.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python3
+
+import sys, re, os
+import html, csv, sqlite3
+
+import argparse
+parser = argparse.ArgumentParser(description='''
+Maps nodes to vernacular names, using data from EOL, enwiki, and a
+picked-names file, and stores results in the database.
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+eolNamesFile = 'eol/vernacularNames.csv'
+enwikiDb = 'enwiki/descData.db'
+pickedNamesFile = 'pickedNames.txt'
+dbFile = 'data.db'
+
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+
+print('Getting node mappings')
+nodeToTips = {}
+eolIdToNode = {} # Maps eol ID to node name (if there are multiple, choose one with most tips)
+wikiIdToNode = {}
+for name, tips in dbCur.execute('SELECT name, tips from nodes'):
+ nodeToTips[name] = tips
+for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
+ if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
+ eolIdToNode[eolId] = name
+for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
+ if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
+ wikiIdToNode[wikiId] = name
+
+print('Creating table')
+dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
+dbCur.execute('CREATE INDEX names_idx ON names(name)')
+dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
+dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
+
+print('Getting names from EOL')
+# The CSV file has a header line, then lines with these fields:
+ # page_id, canonical_form (canonical name, not always unique to page ID),
+ # vernacular_string (vernacular name), language_code,
+ # resource_name, is_preferred_by_resource, is_preferred_by_eol
+namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
+with open(eolNamesFile, newline='') as file:
+ for lineNum, fields in enumerate(csv.reader(file), 1):
+ if lineNum % 1e5 == 0:
+ print(f'At line {lineNum}') # Reached about 2.8e6
+ # Skip header line
+ if lineNum == 1:
+ continue
+ # Parse line
+ eolId = int(fields[0])
+ name = html.unescape(fields[2]).lower()
+ lang = fields[3]
+ isPreferred = 1 if fields[6] == 'preferred' else 0
+ # Add to db
+ if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
+ and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
+ cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')' # The 'OR IGNORE' accounts for duplicate lines
+ dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
+
+print('Getting names from Wikipedia')
+altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+iterNum = 0
+for wikiId, nodeName in wikiIdToNode.items():
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f'At iteration {iterNum}') # Reached about 3.6e5
+ #
+ query = 'SELECT p1.title FROM pages p1' \
+ ' INNER JOIN redirects r1 ON p1.id = r1.id' \
+ ' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
+ for (name,) in enwikiCur.execute(query, (wikiId,)):
+ name = name.lower()
+ if altNameRegex.fullmatch(name) != None and name != nodeName and name not in nodeToTips:
+ dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
+
+print('Getting picked names')
+# File format:
+ # nodename1|altName1|isPreferred1 -> Add an alt-name
+ # nodename1|altName1| -> Remove an alt-name
+ # nodename1|nodeName1| -> Remove any preferred-alt status
+altNamesToSkip = {} # Maps node names to alt-names to exclude
+if os.path.exists(pickedNamesFile):
+ with open(pickedNamesFile) as file:
+ for line in file:
+ nodeName, altName, isPreferred = line.lower().rstrip().split('|')
+ if nodeName not in nodeToTips:
+ print(f"Skipping \"{nodeName}\", as no such node exists")
+ continue
+ if isPreferred:
+ isPreferred = 1 if isPreferred == '1' else 0
+ if isPreferred == 1:
+ # Remove any existing preferred-alt status
+ cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? AND pref_alt = 1'
+ dbCur.execute(cmd, (nodeName, altName))
+ # Remove any existing record
+ dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
+ # Add record
+ dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'picked')", (nodeName, altName, isPreferred))
+ elif nodeName != altName: # Remove any matching record
+ dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
+ else: # Remove any preferred-alt status
+ cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? AND pref_alt = 1'
+ dbCur.execute(cmd, (nodeName, altName))
+
+print('Closing database')
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/otol/README.md b/backend/tolData/otol/README.md
index 4be2fd2..e018369 100644
--- a/backend/tolData/otol/README.md
+++ b/backend/tolData/otol/README.md
@@ -1,10 +1,19 @@
-Files
-=====
-- opentree13.4tree.tgz <br>
+This directory holds files obtained via the
+[Open Tree of Life](https://tree.opentreeoflife.org/about/open-tree-of-life).
+
+# Tree Data Files
+- `opentree13.4tree.tgz` <br>
Obtained from <https://tree.opentreeoflife.org/about/synthesis-release/v13.4>.
Contains tree data from the [Open Tree of Life](https://tree.opentreeoflife.org/about/open-tree-of-life).
-- labelled\_supertree\_ottnames.tre <br>
+- `labelled_supertree_ottnames.tre` <br>
Extracted from the .tgz file. Describes the structure of the tree.
-- annotations.json
+- `annotations.json` <br>
Extracted from the .tgz file. Contains additional attributes of tree
nodes. Used for finding out which nodes have 'phylogenetic support'.
+
+# Taxonomy Data Files
+- `ott3.3.tgz` <br>
+ Obtained from <https://tree.opentreeoflife.org/about/taxonomy-version/ott3.3>.
+ Contains taxonomy data from the Open Tree of Life.
+- `taxonomy.tsv` <br>
+ Extracted from the .tgz file. Holds taxon IDs from sources like NCBI, used to map between datasets.
diff --git a/backend/tolData/reviewImgsToGen.py b/backend/tolData/reviewImgsToGen.py
index 88822c5..dcf18bc 100755
--- a/backend/tolData/reviewImgsToGen.py
+++ b/backend/tolData/reviewImgsToGen.py
@@ -66,6 +66,7 @@ if os.path.exists(enwikiImgDir):
print(f"WARNING: No node found for {enwikiImgDir}{filename}")
print(f"Result: {len(nodeToImgs)} nodes with images")
print("Filtering out already-made image choices")
+
oldSz = len(nodeToImgs)
if os.path.exists(outFile):
with open(outFile) as file:
diff --git a/backend/tolData/wikidata/README.md b/backend/tolData/wikidata/README.md
new file mode 100644
index 0000000..db45b3c
--- /dev/null
+++ b/backend/tolData/wikidata/README.md
@@ -0,0 +1,18 @@
+This directory holds files obtained via [Wikidata](https://www.wikidata.org/).
+
+# Downloaded Files
+- `latest-all.json.bz2` <br>
+ Obtained from <https://dumps.wikimedia.org/wikidatawiki/entities/> (on 23/08/22).
+ Format info can be found at <https://doc.wikimedia.org/Wikibase/master/php/md_docs_topics_json.html>.
+
+# Other Files
+- `genTaxonSrcData.py` <br>
+ Used to generate a database holding taxon information from the dump.
+- `offsets.dat` <br>
+ Holds bzip2 block offsets for the dump. Generated and used by
+ `genTaxonSrcData.py` for parallel processing of the dump.
+- `taxonSrcs.db` <br>
+ Generated by genTaxonSrcData.py. <br>
+ Tables: <br>
+ - `src_id_to_title`: `src TEXT, id INT, title TEXT, PRIMARY KEY(src, id)`
+ - `title_iucn`: `title TEXT PRIMARY KEY, status TEXT`
diff --git a/backend/tolData/wikidata/genTaxonSrcData.py b/backend/tolData/wikidata/genTaxonSrcData.py
new file mode 100755
index 0000000..bd86172
--- /dev/null
+++ b/backend/tolData/wikidata/genTaxonSrcData.py
@@ -0,0 +1,230 @@
+#!/usr/bin/python3
+
+import sys, os, re, math, io
+from collections import defaultdict
+import bz2, json, sqlite3
+import multiprocessing, indexed_bzip2, pickle, tempfile
+
+import argparse
+# Command-line interface: the script takes no options, so this only provides --help text
+parser = argparse.ArgumentParser(description='''
+Reads a wikidata JSON dump, looking for enwiki taxon items, and associated
+IDs from sources like GBIF/etc, and IUCN conservation status. Writes results
+into a database.
+
+The JSON dump contains an array of objects, each of which describes a
+Wikidata item item1, and takes up its own line.
+- Getting item1's Wikidata ID: item1['id'] (eg: "Q144")
+- Checking if item1 is a taxon: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['numeric-id'] == id1
+    'idx1' indexes an array of statements
+    'id1' is a Wikidata ID denoting a taxon item type (eg: 310890 means 'monotypic taxon')
+- Checking if item1 is a taxon-alt: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['numeric-id'] == id1
+    'id1' denotes a common-name-alternative item type (eg: 55983715 means 'organisms known by a particular common name')
+    Getting the ID of the item that item1 is an alternative for:
+        item1['claims']['P31'][idx1]['qualifiers']['P642'][idx2]['datavalue']['value']['numeric-id']
+- Checking for an EOL/NCBI/etc ID: item1['claims'][prop1][idx1]['mainsnak']['datavalue']['value'] (eg: "328672")
+    'prop1' denotes a 'has ID from source *' property (eg: 'P830' means 'has EOL ID')
+- Checking for an IUCN status: item1['claims']['P141'][idx1]['mainsnak']['datavalue']['value']['id'] (eg: "Q219127")
+
+Based on code from https://github.com/OneZoom/OZtree, located in
+OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+# On Linux, running on the full dataset caused the processes to hang after processing. This was resolved by:
+# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock.
+# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state.
+#     Related: https://bugs.python.org/issue6721
+# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
+#     Possibly related: https://github.com/python/cpython/issues/72882
+
+WD_FILE = 'latest-all.json.bz2' # The Wikidata JSON dump to read
+OFFSETS_FILE = 'offsets.dat' # Pickled bzip2 block offsets for WD_FILE, used for indexed access
+DB_FILE = 'taxonSrcs.db' # The sqlite database to create
+N_PROCS = 6 # Took about 3 hours (probably would've taken 6-12 with N_PROCS=1)
+
+# Wikidata entity IDs
+TAXON_IDS = ['Q16521', 'Q310890', 'Q23038290', 'Q713623'] # 'taxon', 'monotypic taxon', 'fossil taxon', 'clade'
+TAXON_ALT_IDS = ['Q55983715', 'Q502895'] # 'organisms known by a particular common name', 'common name'
+SRC_PROP_IDS = {'P830': 'eol', 'P685': 'ncbi', 'P1391': 'if', 'P850': 'worms', 'P5055': 'irmng', 'P846': 'gbif'}
+IUCN_STATUS_IDS = {
+    'Q211005': 'least concern', 'Q719675': 'near threatened', 'Q278113': 'vulnerable',
+    'Q11394': 'endangered', 'Q219127': 'critically endangered', 'Q239509': 'extinct in the wild',
+    'Q237350': 'extinct species', 'Q3245245': 'data deficient'
+}
+# For filtering lines before parsing JSON
+# Matches a "numeric-id" field holding one of the taxon/taxon-alt IDs (without the 'Q'). The trailing \D stops
+# a longer ID that merely starts with the same digits from matching. Raw strings are used so that \D reaches
+# the regex engine, instead of being an invalid string escape (a SyntaxWarning from python 3.12).
+LINE_REGEX = re.compile(
+    (r'"numeric-id":(?:' + '|'.join([s[1:] for s in TAXON_IDS + TAXON_ALT_IDS]) + r')\D').encode())
+
+def main():
+    """Reads the Wikidata dump, then writes the collected taxon data into a new database at DB_FILE.
+
+    Exits with status 1, before reading anything, if DB_FILE already exists.
+    The dump is read in this process if N_PROCS is 1, and by a pool of N_PROCS processes otherwise.
+    """
+    # Maps to populate
+    srcIdToId = defaultdict(dict) # Maps 'eol'/etc to {srcId1: wikidataId1, ...} (IDs are ints)
+    idToTitle = {} # Maps wikidata ID to enwiki title
+    idToAltId = {} # Maps taxon-item wikidata ID to taxon-alt wikidata ID (eg: 'canis lupus familiaris' and 'dog')
+    idToIucnStatus = {} # Maps wikidata ID to iucn-status string ('least concern', etc)
+    # Check db
+    if os.path.exists(DB_FILE):
+        print('ERROR: Database already exists')
+        sys.exit(1)
+    # Read dump
+    if N_PROCS == 1:
+        _readDumpSerially(srcIdToId, idToTitle, idToAltId, idToIucnStatus)
+    else:
+        _readDumpInParallel(srcIdToId, idToTitle, idToAltId, idToIucnStatus)
+    #
+    print('Writing to db')
+    _writeDb(srcIdToId, idToTitle, idToAltId, idToIucnStatus)
+
+def _readDumpSerially(srcIdToId, idToTitle, idToAltId, idToIucnStatus):
+    """Reads the whole dump in this process, adding to the given maps."""
+    with bz2.open(WD_FILE, mode='rb') as file:
+        for lineNum, line in enumerate(file, 1):
+            if lineNum % 10000 == 0:
+                print(f'At line {lineNum}')
+            readDumpLine(line, srcIdToId, idToTitle, idToAltId, idToIucnStatus)
+
+def _readDumpInParallel(srcIdToId, idToTitle, idToAltId, idToIucnStatus):
+    """Splits the dump into N_PROCS byte ranges, reads each in a subprocess, and merges results into the given maps."""
+    if not os.path.exists(OFFSETS_FILE):
+        print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours)
+        with indexed_bzip2.open(WD_FILE) as file:
+            with open(OFFSETS_FILE, 'wb') as file2:
+                pickle.dump(file.block_offsets(), file2)
+    print('Allocating file into chunks')
+    with indexed_bzip2.open(WD_FILE) as file:
+        with open(OFFSETS_FILE, 'rb') as file2:
+            file.set_block_offsets(pickle.load(file2))
+        fileSz = file.seek(0, io.SEEK_END) # About 1.4 TB
+    chunkSz = math.floor(fileSz / N_PROCS)
+    chunkIdxs = [None] + [chunkSz * i for i in range(1, N_PROCS)] + [fileSz-1]
+        # Each adjacent pair specifies a start+end byte for readDumpChunk()
+    print(f'- Chunk size: {chunkSz:,}')
+    print('Starting processes to read dump')
+    with tempfile.TemporaryDirectory() as tempDirName:
+        # Using maxtasksperchild=1 to free resources on task completion
+        with multiprocessing.Pool(processes=N_PROCS, maxtasksperchild=1) as pool:
+            # Each subprocess writes its maps to a temp file, and only the filename is passed back
+            for outFilename in pool.map(readDumpChunkOneParam,
+                [(i, chunkIdxs[i], chunkIdxs[i+1], f'{tempDirName}/{i}.pickle') for i in range(N_PROCS)]):
+                # Get map data from subprocess output file
+                with open(outFilename, 'rb') as file:
+                    maps = pickle.load(file)
+                # Add to maps
+                for src, idMap in maps[0].items():
+                    srcIdToId[src].update(idMap)
+                idToTitle.update(maps[1])
+                idToAltId.update(maps[2])
+                idToIucnStatus.update(maps[3])
+
+def _writeDb(srcIdToId, idToTitle, idToAltId, idToIucnStatus):
+    """Creates DB_FILE, and fills the src_id_to_title and title_iucn tables from the given maps."""
+    dbCon = sqlite3.connect(DB_FILE)
+    try: # Ensures the connection is closed even if a statement fails
+        dbCur = dbCon.cursor()
+        dbCur.execute('CREATE TABLE src_id_to_title (src TEXT, id INT, title TEXT, PRIMARY KEY(src, id))')
+        for src, submap in srcIdToId.items():
+            for srcId, wId in submap.items():
+                if wId not in idToTitle: # Check for a title, possibly via an alt-taxon
+                    if wId in idToAltId:
+                        wId = idToAltId[wId]
+                    else:
+                        continue
+                dbCur.execute('INSERT INTO src_id_to_title VALUES (?, ?, ?)', (src, srcId, idToTitle[wId]))
+        dbCur.execute('CREATE TABLE title_iucn (title TEXT PRIMARY KEY, status TEXT)')
+        for wId, status in idToIucnStatus.items():
+            if wId not in idToTitle: # Check for a title, possibly via an alt-taxon
+                # The alt is only used if it doesn't have a status of its own
+                if wId in idToAltId and idToAltId[wId] not in idToIucnStatus:
+                    wId = idToAltId[wId]
+                else:
+                    continue
+            dbCur.execute('INSERT OR IGNORE INTO title_iucn VALUES (?, ?)', (idToTitle[wId], status))
+                # The 'OR IGNORE' allows for multiple taxons using the same alt
+        #dbCur.execute('CREATE TABLE id_to_alt_title (id TEXT PRIMARY KEY, title TEXT, alt TEXT)')
+        #for wId, altId in idToAltId.items():
+        #    dbCur.execute('INSERT INTO id_to_alt_title VALUES (?, ?, ?)',
+        #        (wId, idToTitle[wId] if wId in idToTitle else None, idToTitle[altId]))
+        dbCon.commit()
+    finally:
+        dbCon.close()
+
+def readDumpLine(line, srcIdToId, idToTitle, idToAltId, idToIucnStatus):
+    """Parses one line of the dump, and records it in the given maps if it describes a taxon or taxon-alt item.
+
+    'line' is a bytes object holding one line of the dump (a JSON object, possibly followed by a comma).
+    The other parameters are maps that get added to:
+    - srcIdToId: maps a source name ('eol'/etc) to a {sourceId: wikidataId} map
+    - idToTitle: maps a wikidata ID to an enwiki title
+    - idToAltId: maps a taxon item's wikidata ID to the ID of a taxon-alt item for it
+    - idToIucnStatus: maps a wikidata ID to an IUCN status string
+    Lines that don't match LINE_REGEX, can't be parsed, or lack the expected fields, are skipped.
+    """
+    # Check if taxon item
+    if LINE_REGEX.search(line) is None:
+        return
+    try:
+        jsonItem = json.loads(line.decode('utf-8').rstrip().rstrip(','))
+    except json.JSONDecodeError:
+        # No line number is available here, so the start of the line is shown instead
+        print(f'Unable to parse line as JSON: {line[:80]!r}')
+        return
+    isTaxon = False
+    altTaxa = [] # For a taxon-alt item, holds associated taxon-item IDs
+    try:
+        claims = jsonItem['claims']
+        for statement in claims['P31']: # Check for 'instance of' statements
+            typeId = statement['mainsnak']['datavalue']['value']['id']
+            if typeId in TAXON_IDS:
+                isTaxon = True
+                break
+            elif typeId in TAXON_ALT_IDS:
+                snaks = statement['qualifiers']['P642'] # Check for 'of' qualifiers
+                altTaxa.extend([int(s['datavalue']['value']['numeric-id']) for s in snaks])
+                break
+    except (KeyError, ValueError):
+        return
+    if not isTaxon and not altTaxa:
+        return
+    # Get wikidata ID and enwiki title
+    itemId, itemTitle = None, None
+    try:
+        itemId = int(jsonItem['id'][1:]) # Skips initial 'Q'
+        itemTitle = jsonItem['sitelinks']['enwiki']['title']
+    except KeyError:
+        # Allow taxon-items without titles (they might get one via a taxon-alt)
+        if itemId is None or not isTaxon:
+            return
+    # Update maps
+    if itemTitle is not None:
+        idToTitle[itemId] = itemTitle
+    for altId in altTaxa:
+        idToAltId[altId] = itemId
+    # Check for source IDs
+    for srcPropId, src in SRC_PROP_IDS.items():
+        if srcPropId in claims:
+            try:
+                # Only the first statement for the property is used
+                srcId = int(claims[srcPropId][0]['mainsnak']['datavalue']['value'])
+                srcIdToId[src][srcId] = itemId
+            except (KeyError, ValueError):
+                continue
+    # Check for IUCN status
+    if 'P141' in claims: # Check for 'iucn conservation status' statement
+        try:
+            iucnStatusId = claims['P141'][0]['mainsnak']['datavalue']['value']['id']
+            idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId]
+        except KeyError:
+            pass # No value, or a status that isn't in IUCN_STATUS_IDS
+
+def readDumpChunkOneParam(params):
+    """Calls readDumpChunk() with the elements of 'params' as its arguments, and returns its result.
+
+    Exists because pool.map() passes a single argument to the function it calls.
+    """
+    chunkArgs = tuple(params)
+    return readDumpChunk(*chunkArgs)
+
+def readDumpChunk(procId, startByte, endByte, outFilename):
+    """Reads lines in the dump that begin after a start-byte, and not after an end-byte.
+
+    If startByte is None, reading starts at the first line.
+    Each line is passed to readDumpLine(). The resulting maps are pickled into the file
+    named by outFilename, as a list in readDumpLine()'s parameter order.
+    'procId' is only used to label progress messages. Returns outFilename.
+    """
+    # Maps to populate
+    results = [defaultdict(dict), {}, {}, {}]
+    # Read dump
+    with indexed_bzip2.open(WD_FILE) as dumpFile:
+        # Load offsets file
+        with open(OFFSETS_FILE, 'rb') as offsetsFile:
+            blockOffsets = pickle.load(offsetsFile)
+        dumpFile.set_block_offsets(blockOffsets)
+        # Seek to chunk
+        if startByte is None:
+            startByte = 0 # Used for progress calculation
+        else:
+            dumpFile.seek(startByte)
+            dumpFile.readline() # Skips to the start of the next line
+        # Read lines
+        numLines = 0
+        while True:
+            if dumpFile.tell() > endByte:
+                break
+            numLines += 1
+            if numLines % 1e4 == 0:
+                fraction = (dumpFile.tell() - startByte) / (endByte - startByte)
+                perc = fraction * 100
+                print(f'Thread {procId}: {perc:.2f}%')
+            nextLine = dumpFile.readline()
+            readDumpLine(nextLine, *results)
+    # Output results into file
+    with open(outFilename, 'wb') as outFile:
+        pickle.dump(results, outFile)
+    return outFilename
+
+# Script entry point
+if __name__ == '__main__': # Guard needed for multiprocessing
+ # Selects the 'spawn' start method for the worker processes, before any are created by main()
+ multiprocessing.set_start_method('spawn')
+ main()