author    Terry Truong <terry06890@gmail.com>    2022-09-11 14:55:42 +1000
committer Terry Truong <terry06890@gmail.com>    2022-09-11 15:04:14 +1000
commit    5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree      2567c25c902dbb40d44419805cebb38171df47fa /backend/tol_data/gen_mapping_data.py
parent    daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tol_data/gen_mapping_data.py')
-rwxr-xr-x  backend/tol_data/gen_mapping_data.py  271
1 files changed, 271 insertions, 0 deletions
diff --git a/backend/tol_data/gen_mapping_data.py b/backend/tol_data/gen_mapping_data.py
new file mode 100755
index 0000000..95e930b
--- /dev/null
+++ b/backend/tol_data/gen_mapping_data.py
@@ -0,0 +1,271 @@
+#!/usr/bin/python3
+
+"""
+Maps otol IDs to EOL IDs and enwiki page titles, using IDs from various
+other sources (like NCBI).
+
+Reads otol taxonomy data to get source IDs for otol IDs,
+then looks up those IDs in an EOL provider_ids file,
+and in a wikidata dump, and stores results in the database.
+
+Based on code from https://github.com/OneZoom/OZtree, located in
+OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
+"""
+
+import os
+from collections import defaultdict
+import gzip, csv, sqlite3
+
+TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
+EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
+WIKIDATA_DB = os.path.join('wikidata', 'taxon_srcs.db')
+ENWIKI_DUMP_INDEX_DB = os.path.join('enwiki', 'dumpIndex.db')
+PICKED_MAPPINGS = {
+ 'eol': ['picked_eol_ids.txt'],
+ 'enwiki': ['picked_wiki_ids.txt', 'picked_wiki_ids_rough.txt']
+}
+DB_FILE = 'data.db'
+
+OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
+EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps external-source int-identifiers to names
+
+def genData(
+ taxonomyFile: str,
+ eolIdsFile: str,
+ wikidataDb: str,
+ pickedMappings: dict[str, list[str]],
+ enwikiDumpIndexDb: str,
+ dbFile: str) -> None:
+	""" Reads the input files and databases, and writes the resulting mappings to the output db """
+ nodeToSrcIds: dict[int, dict[str, int]] = {} # Maps otol ID to {src1: id1, src2: id2, ...}
+ usedSrcIds: set[tuple[str, int]] = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
+ nodeToEolId: dict[int, int] = {} # Maps otol ID to eol ID
+ nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
+ titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
+ titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
+ # Get mappings from data input
+ readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
+ readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
+ readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
+ readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
+ getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
+ #
+ print('Writing to db')
+ dbCon = sqlite3.connect(dbFile)
+ dbCur = dbCon.cursor()
+ # Get otol id-to-name map
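+	# Assumes a 'nodes' table was already created by an earlier script, with string IDs like 'ott93302'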
+ otolIdToName: dict[int, str] = {}
+ for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
+ if nodeId.startswith('ott'):
+ otolIdToName[int(nodeId[3:])] = nodeName
+ # Add eol mappings
+ dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
+ dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
+ for otolId, eolId in nodeToEolId.items():
+ if otolId in otolIdToName:
+ dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+ # Add enwiki mappings
+ dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
+ dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
+ dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
+ for otolId, title in nodeToWikiTitle.items():
+ if otolId in otolIdToName and title in titleToPageId:
+ dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
+ if title in titleToIucnStatus:
+ dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+ dbCon.commit()
+ dbCon.close()
+
+def readTaxonomyFile(
+ taxonomyFile: str,
+ nodeToSrcIds: dict[int, dict[str, int]],
+ usedSrcIds: set[tuple[str, int]]) -> None:
+	""" Reads the taxonomy file, and maps otol node IDs to external-source IDs """
+ # The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
+ # uid (otol-id, eg: 93302), parent_uid, name, rank,
+ # sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
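+	# A hypothetical example line (tab-pipe-tab separators shown as ' | '):
+	# 93302 | 93301 | Cyanobacteria | phylum | ncbi:2952,gbif:3207147 |  | 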
+ print('Reading taxonomy file')
+ with open(taxonomyFile) as file: # Had about 4.5e6 lines
+ for lineNum, line in enumerate(file, 1):
+ if lineNum % 1e5 == 0:
+ print(f'At line {lineNum}')
+ # Skip header line
+ if lineNum == 1:
+ continue
+ # Parse line
+ fields = line.split('\t|\t')
+ try:
+ otolId = int(fields[0])
+ except ValueError:
+ print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
+ continue
+ srcsField = fields[4]
+ # Add source IDs
+ for srcPair in srcsField.split(','):
+ src, srcIdStr = srcPair.split(':', 1)
+ if srcIdStr.isdecimal() and src in OTOL_SRCS:
+ if otolId not in nodeToSrcIds:
+ nodeToSrcIds[otolId] = {}
+ elif src in nodeToSrcIds[otolId]:
+ continue
+ srcId = int(srcIdStr)
+ nodeToSrcIds[otolId][src] = srcId
+ usedSrcIds.add((src, srcId))
+ print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
+def readEolIdsFile(
+ eolIdsFile: str,
+ nodeToSrcIds: dict[int, dict[str, int]],
+ usedSrcIds: set[tuple[str, int]],
+ nodeToEolId: dict[int, int]) -> None:
+	""" Reads the EOL provider-IDs file, and uses external-source IDs to map otol IDs to EOL IDs """
+ # The file is a CSV with a header line, then lines that hold these fields:
+ # node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
+ # page_id (eol ID), preferred_canonical_for_page
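+	# A hypothetical example row: 47125,2952,676,1005,Cyanobacteria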
+ print('Reading EOL provider IDs file')
+ srcToEolId: dict[str, dict[int, int]] = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
+ with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
+ for lineNum, row in enumerate(csv.reader(file), 1):
+ if lineNum % 1e6 == 0:
+ print(f'At line {lineNum}')
+ # Skip header line
+ if lineNum == 1:
+ continue
+ # Parse line
+ eolId = int(row[3])
+ srcInt = int(row[2])
+ srcIdStr = row[1]
+ if srcIdStr.isdecimal() and srcInt in EOL_SRCS:
+ srcId = int(srcIdStr)
+ src = EOL_SRCS[srcInt]
+ if (src, srcId) not in usedSrcIds:
+ continue
+ if srcId in srcToEolId[src]:
+ print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
+ continue
+ srcToEolId[src][srcId] = eolId
+ print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
+ # Was about 3.5e6 (4.2e6 without usedSrcIds)
+ #
+ print('Resolving candidate EOL IDs')
+ # For each otol ID, find eol IDs with matching sources, and choose the 'best' one
+ for otolId, srcInfo in nodeToSrcIds.items():
+ eolIdToCount: dict[int, int] = defaultdict(int)
+ for src, srcId in srcInfo.items():
+ if src in srcToEolId and srcId in srcToEolId[src]:
+ eolId = srcToEolId[src][srcId]
+ eolIdToCount[eolId] += 1
+ if len(eolIdToCount) == 1:
+ nodeToEolId[otolId] = list(eolIdToCount)[0]
+ elif len(eolIdToCount) > 1:
+ # For multiple candidates, prefer those with most sources, and break ties by picking the lowest
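+			# E.g. (hypothetical counts): {900: 2, 450: 2, 777: 1} -> maxCount 2 -> candidates [900, 450] -> pick 450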
+ maxCount = max(eolIdToCount.values())
+ eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
+ nodeToEolId[otolId] = min(eolIds)
+ print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
+def readWikidataDb(
+ wikidataDb: str,
+ nodeToSrcIds: dict[int, dict[str, int]],
+ usedSrcIds: set[tuple[str, int]],
+ nodeToWikiTitle: dict[int, str],
+ titleToIucnStatus: dict[str, str],
+ nodeToEolId: dict[int, int]) -> None:
+	""" Reads a db holding ID and IUCN mappings from wikidata, and maps otol IDs to Wikipedia titles and EOL IDs """
+ print('Reading from Wikidata db')
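+	# Assumes the db has tables src_id_to_title (src, id, title) and title_iucn (title, status), built by an earlier script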
+ srcToWikiTitle: dict[str, dict[int, str]] = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
+ wikiTitles = set()
+ dbCon = sqlite3.connect(wikidataDb)
+ dbCur = dbCon.cursor()
+ for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
+ if (src, srcId) in usedSrcIds or src == 'eol': # Keep EOL IDs for later use
+ srcToWikiTitle[src][srcId] = title
+ wikiTitles.add(title)
+ for title, status in dbCur.execute('SELECT title, status from title_iucn'):
+ if title in wikiTitles:
+ titleToIucnStatus[title] = status
+ print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries')
+ # Was about 1.1e6 (1.2e6 without usedSrcIds)
+ print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
+ dbCon.close()
+ #
+ print('Resolving candidate Wikidata items')
+ # For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
+ for otolId, srcInfo in nodeToSrcIds.items():
+ titleToSrcs: dict[str, list[str]] = defaultdict(list) # Maps candidate titles to list of sources
+ for src, srcId in srcInfo.items():
+ if src in srcToWikiTitle and srcId in srcToWikiTitle[src]:
+ title = srcToWikiTitle[src][srcId]
+ titleToSrcs[title].append(src)
+ # Choose title to use
+ if len(titleToSrcs) == 1:
+ nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
+ elif len(titleToSrcs) > 1: # Test example: otol ID 621052
+ # Get titles with most sources
+ maxSrcCnt = max([len(srcs) for srcs in titleToSrcs.values()])
+ titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
+ if len(titleToSrcs) == 1:
+ nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
+ else:
+ # Get a title with a source with highest priority
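+				# E.g. (hypothetical): if the tied titles came from 'worms' and 'gbif', the 'worms' title wins, as it appears earlier in OTOL_SRCS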
+ srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]}
+ for src in OTOL_SRCS:
+ if src in srcToTitle:
+ nodeToWikiTitle[otolId] = srcToTitle[src]
+ break
+ print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
+ #
+ print('Adding extra EOL mappings from Wikidata')
+ wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
+ addedEntries: dict[int, int] = {}
+ for eolId, title in srcToWikiTitle['eol'].items():
+ if title in wikiTitleToNode:
+ otolId = wikiTitleToNode[title]
+ if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
+ nodeToEolId[otolId] = eolId
+ addedEntries[otolId] = eolId
+ print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
+def readPickedMappings(
+ pickedMappings: dict[str, list[str]],
+ nodeToEolId: dict[int, int],
+ nodeToWikiTitle: dict[int, str]) -> None:
+	""" Reads manually-picked mappings from otol IDs to EOL IDs and Wikipedia titles """
+ print('Reading picked mappings')
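+	# Each line is expected to look like 'otolId|mappedValue'; an empty mappedValue removes any existing mapping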
+ for src in pickedMappings:
+ for filename in pickedMappings[src]:
+ if not os.path.exists(filename):
+ continue
+ with open(filename) as file:
+ for line in file:
+ otolIdStr, mappedVal = line.rstrip().split('|')
+ otolId = int(otolIdStr)
+ if src == 'eol':
+ if mappedVal:
+ nodeToEolId[otolId] = int(mappedVal)
+ else:
+ if otolId in nodeToEolId:
+ del nodeToEolId[otolId]
+ else: # src == 'enwiki'
+ if mappedVal:
+ nodeToWikiTitle[otolId] = mappedVal
+ else:
+ if otolId in nodeToWikiTitle:
+ del nodeToWikiTitle[otolId]
+
+def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
+	""" Reads a db for mappings from enwiki titles to page IDs """
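+	# Assumes the 'offsets' table (in the enwiki dump-index db, built elsewhere) maps page titles to page IDs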
+ print('Getting enwiki page IDs')
+ numNotFound = 0
+ dbCon = sqlite3.connect(enwikiDumpIndexDb)
+ dbCur = dbCon.cursor()
+ for title in nodeToWikiTitle.values():
+ record = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+		if record is not None:
+ titleToPageId[title] = record[0]
+ else:
+ numNotFound += 1
+ dbCon.close()
+ print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ args = parser.parse_args()
+ #
+ genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)