Diffstat (limited to 'backend/tolData/genMappingData.py')
-rwxr-xr-x  backend/tolData/genMappingData.py  229
1 file changed, 0 insertions, 229 deletions
diff --git a/backend/tolData/genMappingData.py b/backend/tolData/genMappingData.py
deleted file mode 100755
index 5339c4e..0000000
--- a/backend/tolData/genMappingData.py
+++ /dev/null
@@ -1,229 +0,0 @@
-#!/usr/bin/python3
-
-import os
-from collections import defaultdict
-import gzip, csv, sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Maps otol IDs to EOL IDs and enwiki titles, using IDs from
-various other sources (like NCBI).
-
-Reads otol taxonomy data to get source IDs for each otol ID,
-looks up those IDs in an EOL provider_ids file and in a
-wikidata dump, and stores the results in the database.
-
-Based on code from https://github.com/OneZoom/OZtree, located in
-OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-args = parser.parse_args()
-
-taxonomyFile = 'otol/taxonomy.tsv'
-eolIdsFile = 'eol/provider_ids.csv.gz'
-wikidataDb = 'wikidata/taxonSrcs.db'
-enwikiDumpIndexDb = 'enwiki/dumpIndex.db'
-pickedMappings = {
- 'eol': ['pickedEolIds.txt'],
- 'enwiki': ['pickedWikiIds.txt', 'pickedWikiIdsRough.txt']
-}
-dbFile = 'data.db'
-
-print('Reading taxonomy file')
-# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
- # uid (otol-id, eg: 93302), parent_uid, name, rank,
- # sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
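-	# A hypothetical example line (values illustrative only), writing '\t|\t' for the separator:
-	#   93302\t|\t93301\t|\tFelis catus\t|\tspecies\t|\tncbi:9685,gbif:2435035\t|\t...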
-OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
-nodeToSrcIds: dict[int, dict[str, int]] = defaultdict(dict) # Maps otol ID to {src1: id1, src2: id2, ...}
-usedSrcIds: set[tuple[str, int]] = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
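-# E.g. a hypothetical parse result: nodeToSrcIds[93302] == {'ncbi': 9685, 'gbif': 2435035},
-# with ('ncbi', 9685) and ('gbif', 2435035) added to usedSrcIds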
-with open(taxonomyFile) as file: # Had about 4.5e6 lines
- lineNum = 0
- for line in file:
- lineNum += 1
-		if lineNum % 100_000 == 0:
- print(f'At line {lineNum}')
- # Skip header line
- if lineNum == 1:
- continue
- # Parse line
- fields = line.split('\t|\t')
- try:
- otolId = int(fields[0])
- except ValueError:
- print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
- continue
- srcsField = fields[4]
- # Add source IDs
- for srcPair in srcsField.split(','):
- src, srcIdStr = srcPair.split(':', 1)
- if srcIdStr.isdecimal() and src in OTOL_SRCS and src not in nodeToSrcIds[otolId]:
- srcId = int(srcIdStr)
- nodeToSrcIds[otolId][src] = srcId
- usedSrcIds.add((src, srcId))
-print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
-
-print('Reading EOL provider_ids file')
-# The CSV file has a header line, then lines that hold these fields:
- # node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
- # page_id (eol ID), preferred_canonical_for_page
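-	# A hypothetical example row (values illustrative only): 12345,9685,676,1045608,t
-	# (resource_id 676 denotes NCBI, so this would map NCBI ID 9685 to EOL page 1045608)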
-EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps ints to external-source names
-srcToEolId: dict[str, dict[int, int]] = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
-with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
- for lineNum, row in enumerate(csv.reader(file), 1):
-		if lineNum % 1_000_000 == 0:
- print(f'At line {lineNum}')
- # Skip header line
- if lineNum == 1:
- continue
- # Parse line
- eolId = int(row[3])
- srcVal = int(row[2])
- srcIdStr = row[1]
- if srcIdStr.isdecimal() and srcVal in EOL_SRCS:
- srcId = int(srcIdStr)
- src = EOL_SRCS[srcVal]
- if (src, srcId) not in usedSrcIds:
- continue
- if srcId in srcToEolId[src]:
- print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
- continue
- srcToEolId[src][srcId] = eolId
-print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
- # Was about 3.5e6 (4.2e6 without usedSrcIds)
-
-print('Resolving candidate EOL IDs')
-# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
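-# For example (hypothetical values): if otol ID 10 has {'ncbi': 1, 'gbif': 2}, where ncbi:1 maps
-# to EOL ID 50 and gbif:2 maps to EOL ID 60, each candidate is backed by one source, so the
-# lower EOL ID (50) wins the tie-break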
-nodeToEolId: dict[int, int] = {} # Maps otol ID to eol ID
-for otolId, srcInfo in nodeToSrcIds.items():
- eolIdToCount: dict[int, int] = defaultdict(int)
- for src, srcId in srcInfo.items():
- if src in srcToEolId and srcId in srcToEolId[src]:
- eolId = srcToEolId[src][srcId]
- eolIdToCount[eolId] += 1
- if len(eolIdToCount) == 1:
- nodeToEolId[otolId] = list(eolIdToCount)[0]
- elif len(eolIdToCount) > 1:
- # For multiple candidates, prefer those with most sources, and break ties by picking the lowest
- maxCount = max(eolIdToCount.values())
- eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
- nodeToEolId[otolId] = min(eolIds)
-print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
-
-print('Reading from Wikidata db')
-srcToWikiTitle: dict[str, dict[int, str]] = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
-wikiTitles = set()
-titleToIucnStatus: dict[str, str] = {}
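-# The db is assumed to hold tables src_id_to_title(src, id, title) and title_iucn(title, status),
-# matching the queries below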
-dbCon = sqlite3.connect(wikidataDb)
-dbCur = dbCon.cursor()
-for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
- if (src, srcId) not in usedSrcIds and src != 'eol': # Keep EOL IDs for later use
- continue
- srcToWikiTitle[src][srcId] = title
- wikiTitles.add(title)
-for title, status in dbCur.execute('SELECT title, status from title_iucn'):
- if title in wikiTitles:
- titleToIucnStatus[title] = status
-print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries')
- # Was about 1.1e6 (1.2e6 without usedSrcIds)
-print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
-dbCon.close()
-
-print('Resolving candidate Wikidata items')
-# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
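-# For example (hypothetical values): if titles 'A' and 'B' are each backed by one source, 'A' by
-# 'ncbi' and 'B' by 'gbif', then 'A' wins, since 'ncbi' precedes 'gbif' in OTOL_SRCS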
-nodeToWikiTitle: dict[int, str] = {}
-for otolId, srcInfo in nodeToSrcIds.items():
- titleToSrcs: dict[str, list[str]] = defaultdict(list) # Maps candidate titles to list of sources
- for src, srcId in srcInfo.items():
- if src in srcToWikiTitle and srcId in srcToWikiTitle[src]:
- title = srcToWikiTitle[src][srcId]
- titleToSrcs[title].append(src)
- # Choose title to use
- if len(titleToSrcs) == 1:
- nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
- elif len(titleToSrcs) > 1: # Test example: otol ID 621052
- # Get titles with most sources
- maxSrcCnt = max([len(srcs) for srcs in titleToSrcs.values()])
- titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
- if len(titleToSrcs) == 1:
- nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
- else: # Test example: otol ID 4235272
- # Get a title with a source with highest priority
- srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]}
- for src in OTOL_SRCS:
- if src in srcToTitle:
- nodeToWikiTitle[otolId] = srcToTitle[src]
- break
-print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
-
-print('Adding extra EOL mappings from Wikidata')
-wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
-addedEntries: dict[int, int] = {}
-for eolId, title in srcToWikiTitle['eol'].items():
- if title in wikiTitleToNode:
- otolId = wikiTitleToNode[title]
- if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
- nodeToEolId[otolId] = eolId
- addedEntries[otolId] = eolId
-print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
-
-print('Reading picked mappings')
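-# Each line has the form 'otolId|value', where value is an EOL ID or an enwiki title;
-# an empty value (e.g. a hypothetical line '93302|') removes any existing mapping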
-for src, filenames in pickedMappings.items():
-	for filename in filenames:
- if not os.path.exists(filename):
- continue
- with open(filename) as file:
- for line in file:
- otolIdStr, mappedVal = line.rstrip().split('|')
- otolId = int(otolIdStr)
- if src == 'eol':
- if mappedVal:
- nodeToEolId[otolId] = int(mappedVal)
- else:
- if otolId in nodeToEolId:
- del nodeToEolId[otolId]
- else: # src == 'enwiki'
- if mappedVal:
- nodeToWikiTitle[otolId] = mappedVal
- else:
- if otolId in nodeToWikiTitle:
- del nodeToWikiTitle[otolId]
-
-print('Getting enwiki page IDs')
-titleToPageId: dict[str, int] = {}
-numNotFound = 0
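-# The dump-index db is assumed to hold a table offsets with 'title' and 'id' columns, as queried below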
-dbCon = sqlite3.connect(enwikiDumpIndexDb)
-dbCur = dbCon.cursor()
-for title in nodeToWikiTitle.values():
- record = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
-	if record is not None:
- titleToPageId[title] = record[0]
- else:
- numNotFound += 1
-dbCon.close()
-print(f'- Unable to find IDs for {numNotFound} titles') # Was 2913
-
-print('Writing to db')
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-# Get otol id-to-name map
-otolIdToName: dict[int, str] = {}
-for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
- if nodeId.startswith('ott'):
- otolIdToName[int(nodeId[3:])] = nodeName
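-# (e.g. a hypothetical row ('Felis catus', 'ott93302') yields otolIdToName[93302] == 'Felis catus')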
-# Add eol mappings
-dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
-dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
-for otolId, eolId in nodeToEolId.items():
- if otolId in otolIdToName:
- dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
-# Add enwiki mappings
-dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
-dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
-dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
-for otolId, title in nodeToWikiTitle.items():
- if otolId in otolIdToName and title in titleToPageId:
- dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
- if title in titleToIucnStatus:
- dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
-dbCon.commit()
-dbCon.close()