diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-23 18:00:43 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-23 18:01:13 +1100 |
| commit | 94a8ad9b067e5a2c442ce47ce72d1a53eb444160 (patch) | |
| tree | 2056373ee56b8b2f8269ac3e94d40f8f0e6eec0d /backend/tol_data/wikidata | |
| parent | 796c4e5660b1006575b8f2af9d99e2ce592c767a (diff) | |
Clean up some docs and naming inconsistencies
Diffstat (limited to 'backend/tol_data/wikidata')
| -rw-r--r-- | backend/tol_data/wikidata/README.md | 4 | ||||
| -rwxr-xr-x | backend/tol_data/wikidata/gen_taxon_src_data.py | 6 |
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/backend/tol_data/wikidata/README.md b/backend/tol_data/wikidata/README.md
index 7b3105e..806b315 100644
--- a/backend/tol_data/wikidata/README.md
+++ b/backend/tol_data/wikidata/README.md
@@ -1,4 +1,4 @@
-This directory holds files obtained via [Wikidata](https://www.wikidata.org/).
+This directory holds files obtained/derived from [Wikidata](https://www.wikidata.org/).
 
 # Downloaded Files
 - `latest-all.json.bz2` <br>
@@ -10,7 +10,7 @@ This directory holds files obtained/derived from [Wikidata](https://www.wikidata.org/).
   Used to generate a database holding taxon information from the dump.
 - `offsets.dat` <br>
   Holds bzip2 block offsets for the dump. Generated and used by
-  genTaxonSrcData.py for parallel processing of the dump.
+  gen_taxon_src_data.py for parallel processing of the dump.
 - `taxon_srcs.db` <br>
   Generated by `gen_taxon_src_data.py`. <br>
   Tables: <br>
diff --git a/backend/tol_data/wikidata/gen_taxon_src_data.py b/backend/tol_data/wikidata/gen_taxon_src_data.py
index 50ed917..1bddb6e 100755
--- a/backend/tol_data/wikidata/gen_taxon_src_data.py
+++ b/backend/tol_data/wikidata/gen_taxon_src_data.py
@@ -50,7 +50,7 @@ IUCN_STATUS_IDS = {
 	'Q237350': 'extinct species', 'Q3245245': 'data deficient'
 }
 # For filtering lines before parsing JSON
-LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")\D').encode())
+LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")').encode())
 
 def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
 	""" Reads the dump and writes source/iucn info to db """
@@ -92,8 +92,8 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 	with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool:
 		for outFilename in pool.map(
 				readDumpChunkOneParam,
-				((i, wikidataFile, offsetsFile, chunkIdxs[i], chunkIdxs[i+1],
-					os.path.join(tempDirName, f'{i}.pickle')) for i in range(nProcs))):
+				[(i, wikidataFile, offsetsFile, chunkIdxs[i], chunkIdxs[i+1],
+					os.path.join(tempDirName, f'{i}.pickle')) for i in range(nProcs)]):
 			# Get map data from subprocess output file
 			with open(outFilename, 'rb') as file:
 				maps = pickle.load(file)
