| field | value | date |
|---|---|---|
| author | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
| commit | 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch) | |
| tree | ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/gen_mapping_data.py | |
| parent | f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff) | |
Adjust backend coding style
Add line spacing, section comments, and import consistency
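In practical terms, "import consistency" here means one module per `import` line, with the `argparse` import moved from the `__main__` block to the top of the file; there is no behavioral change. A before/after illustration, taken from the diff below:

```python
# Before: grouped import, with argparse imported inside the __main__ block
import gzip, csv, sqlite3

# After: one module per line, argparse imported at module level
import argparse
import gzip
import csv
import sqlite3
```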
Diffstat (limited to 'backend/tol_data/gen_mapping_data.py')

| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | backend/tol_data/gen_mapping_data.py | 31 |

1 file changed, 24 insertions, 7 deletions
diff --git a/backend/tol_data/gen_mapping_data.py b/backend/tol_data/gen_mapping_data.py
index 4373d1d..1ab577b 100755
--- a/backend/tol_data/gen_mapping_data.py
+++ b/backend/tol_data/gen_mapping_data.py
@@ -12,9 +12,12 @@ Based on code from https://github.com/OneZoom/OZtree, located in
 OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
 """
 
+import argparse
 import os
 from collections import defaultdict
-import gzip, csv, sqlite3
+import gzip
+import csv
+import sqlite3
 
 TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
 EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
@@ -43,27 +46,31 @@ def genData(
 	nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
 	titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
 	titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
+
 	# Get mappings from data input
 	readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
 	readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
 	readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
 	readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
 	getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
-	#
+
 	print('Writing to db')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
+
 	# Get otol id-to-name map
 	otolIdToName: dict[int, str] = {}
 	for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
 		if nodeId.startswith('ott'):
 			otolIdToName[int(nodeId[3:])] = nodeName
+
 	# Add eol mappings
 	dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
 	dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
 	for otolId, eolId in nodeToEolId.items():
 		if otolId in otolIdToName:
 			dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+
 	# Add enwiki mappings
 	dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
 	dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
@@ -73,8 +80,10 @@ def genData(
 			dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
 			if title in titleToIucnStatus:
 				dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+
 	dbCon.commit()
 	dbCon.close()
+
 def readTaxonomyFile(
 	taxonomyFile: str,
 	nodeToSrcIds: dict[int, dict[str, int]],
@@ -88,9 +97,11 @@ def readTaxonomyFile(
 		for lineNum, line in enumerate(file, 1):
 			if lineNum % 1e5 == 0:
 				print(f'At line {lineNum}')
+
 			# Skip header line
 			if lineNum == 1:
 				continue
+
 			# Parse line
 			fields = line.split('\t|\t')
 			try:
@@ -99,6 +110,7 @@ def readTaxonomyFile(
 				print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
 				continue
 			srcsField = fields[4]
+
 			# Add source IDs
 			for srcPair in srcsField.split(','):
 				src, srcIdStr = srcPair.split(':', 1)
@@ -111,6 +123,7 @@ def readTaxonomyFile(
 					nodeToSrcIds[otolId][src] = srcId
 					usedSrcIds.add((src, srcId))
 	print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
 def readEolIdsFile(
 	eolIdsFile: str,
 	nodeToSrcIds: dict[int, dict[str, int]],
@@ -126,9 +139,11 @@ def readEolIdsFile(
 		for lineNum, row in enumerate(csv.reader(file), 1):
 			if lineNum % 1e6 == 0:
 				print(f'At line {lineNum}')
+
 			# Skip header line
 			if lineNum == 1:
 				continue
+
 			# Parse line
 			eolId = int(row[3])
 			srcInt = int(row[2])
@@ -144,7 +159,7 @@ def readEolIdsFile(
 				srcToEolId[src][srcId] = eolId
 	print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries') # Was about 3.5e6 (4.2e6 without usedSrcIds)
-	#
+
 	print('Resolving candidate EOL IDs')
 	# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
 	for otolId, srcInfo in nodeToSrcIds.items():
@@ -161,6 +176,7 @@ def readEolIdsFile(
 		eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
 		nodeToEolId[otolId] = min(eolIds)
 	print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
 def readWikidataDb(
 	wikidataDb: str,
 	nodeToSrcIds: dict[int, dict[str, int]],
@@ -185,7 +201,7 @@ def readWikidataDb(
 		# Was about 1.1e6 (1.2e6 without usedSrcIds)
 	print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
 	dbCon.close()
-	#
+
 	print('Resolving candidate Wikidata items')
 	# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
 	for otolId, srcInfo in nodeToSrcIds.items():
@@ -211,7 +227,7 @@ def readWikidataDb(
 				nodeToWikiTitle[otolId] = srcToTitle[src]
 				break
 	print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
-	#
+
 	print('Adding extra EOL mappings from Wikidata')
 	wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
 	addedEntries: dict[int, int] = {}
@@ -222,6 +238,7 @@ def readWikidataDb(
 			nodeToEolId[otolId] = eolId
 			addedEntries[otolId] = eolId
 	print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
 def readPickedMappings(
 	pickedMappings: dict[str, list[str]],
 	nodeToEolId: dict[int, int],
@@ -248,6 +265,7 @@ def readPickedMappings(
 		else:
 			if otolId in nodeToWikiTitle:
 				del nodeToWikiTitle[otolId]
+
 def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
 	""" Read a db for mappings from enwiki titles to page IDs """
 	print('Getting enwiki page IDs')
@@ -264,8 +282,7 @@ def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], ti
 	print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)
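For reference, the tables this script populates are simple name-keyed mappings, so the generated database can be inspected with a few lines of sqlite3. A minimal sketch, assuming the database has already been generated; the filename `data.db` and the node name `Canis lupus` are placeholders (the script's DB_FILE constant and real node names are not shown in this diff):

```python
import sqlite3

# Placeholder path: the actual DB_FILE constant is not visible in this diff
dbCon = sqlite3.connect('data.db')
dbCur = dbCon.cursor()

name = 'Canis lupus'  # hypothetical node name
for table in ('eol_ids', 'wiki_ids'):  # tables created by genData()
	row = dbCur.execute(f'SELECT id FROM {table} WHERE name = ?', (name,)).fetchone()
	print(f'{table}: {row[0] if row else "no mapping"}')

dbCon.close()
```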
