Clean up some docs and naming inconsistencies

author: Terry Truong <terry06890@gmail.com> 2023-01-23 18:00:43 +1100
committer: Terry Truong <terry06890@gmail.com> 2023-01-23 18:01:13 +1100
commit: 94a8ad9b067e5a2c442ce47ce72d1a53eb444160 (patch)
tree: 2056373ee56b8b2f8269ac3e94d40f8f0e6eec0d /backend/tol_data/wikidata
parent: 796c4e5660b1006575b8f2af9d99e2ce592c767a (diff)
2 files changed, 5 insertions, 5 deletions
diff --git a/backend/tol_data/wikidata/README.md b/backend/tol_data/wikidata/README.md
index 7b3105e..806b315 100644
--- a/backend/tol_data/wikidata/README.md
+++ b/backend/tol_data/wikidata/README.md
@@ -1,4 +1,4 @@
-This directory holds files obtained via [Wikidata](https://www.wikidata.org/).
+This directory holds files obtained/derived from [Wikidata](https://www.wikidata.org/).
 
 # Downloaded Files
 -   `latest-all.json.bz2` <br>
@@ -10,7 +10,7 @@ This directory holds files obtained via [Wikidata](https://www.wikidata.org/).
     Used to generate a database holding taxon information from the dump.
 -   `offsets.dat` <br>
     Holds bzip2 block offsets for the dump. Generated and used by
-    genTaxonSrcData.py for parallel processing of the dump.
+    `gen_taxon_src_data.py` for parallel processing of the dump.
 -   `taxon_srcs.db` <br>
     Generated by `gen_taxon_src_data.py`. <br>
     Tables: <br>
diff --git a/backend/tol_data/wikidata/gen_taxon_src_data.py b/backend/tol_data/wikidata/gen_taxon_src_data.py
index 50ed917..1bddb6e 100755
--- a/backend/tol_data/wikidata/gen_taxon_src_data.py
+++ b/backend/tol_data/wikidata/gen_taxon_src_data.py
@@ -50,7 +50,7 @@ IUCN_STATUS_IDS = {
 	'Q237350': 'extinct species', 'Q3245245': 'data deficient'
 }
 # For filtering lines before parsing JSON
-LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")\D').encode())
+LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")').encode())
 
 def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
 	""" Reads the dump and writes source/iucn info to db """
@@ -92,8 +92,8 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 			with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool:
 				for outFilename in pool.map(
 						readDumpChunkOneParam,
-						((i, wikidataFile, offsetsFile, chunkIdxs[i], chunkIdxs[i+1],
-							os.path.join(tempDirName, f'{i}.pickle')) for i in range(nProcs))):
+						[(i, wikidataFile, offsetsFile, chunkIdxs[i], chunkIdxs[i+1],
+							os.path.join(tempDirName, f'{i}.pickle')) for i in range(nProcs)]):
 					# Get map data from subprocess output file
 					with open(outFilename, 'rb') as file:
 						maps = pickle.load(file)
author	Terry Truong <terry06890@gmail.com>	2023-01-23 18:00:43 +1100
committer	Terry Truong <terry06890@gmail.com>	2023-01-23 18:01:13 +1100
commit	94a8ad9b067e5a2c442ce47ce72d1a53eb444160 (patch)
tree	2056373ee56b8b2f8269ac3e94d40f8f0e6eec0d /backend/tol_data/wikidata
parent	796c4e5660b1006575b8f2af9d99e2ce592c767a (diff)