Adjust backend coding style

Add line spacing, section comments, and import consistency
author: Terry Truong <terry06890@gmail.com> 2023-01-29 11:30:47 +1100
committer: Terry Truong <terry06890@gmail.com> 2023-01-29 11:30:47 +1100
commit: 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
tree: ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data
parent: f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)
21 files changed, 456 insertions, 167 deletions
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py
index 50418e0..f8a665a 100755
--- a/backend/tol_data/dbpedia/gen_desc_data.py
+++ b/backend/tol_data/dbpedia/gen_desc_data.py
@@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database
 
 # In testing, this script took a few hours to run, and generated about 10GB
 
+import argparse
 import re
-import bz2, sqlite3
+import bz2
+import sqlite3
 
 LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries
 IDS_FILE = 'page_lang=en_ids.ttl.bz2'
@@ -24,7 +26,7 @@ def genData(
 	print('Creating database')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
-	#
+
 	print('Reading/storing label data')
 	dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)')
 	dbCur.execute('CREATE INDEX labels_idx ON labels(label)')
@@ -38,7 +40,7 @@ def genData(
 			if match is None:
 				raise Exception(f'ERROR: Line {lineNum} has unexpected format')
 			dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2)))
-	#
+
 	print('Reading/storing wiki page ids')
 	dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)')
 	dbCur.execute('CREATE INDEX ids_idx ON ids(id)')
@@ -55,7 +57,7 @@ def genData(
 			except sqlite3.IntegrityError as e:
 				# Accounts for certain lines that have the same IRI
 				print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}')
-	#
+
 	print('Reading/storing redirection data')
 	dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)')
 	redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')
@@ -67,7 +69,7 @@ def genData(
 			if match is None:
 				raise Exception(f'ERROR: Line {lineNum} has unexpected format')
 			dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2)))
-	#
+
 	print('Reading/storing diambiguation-page data')
 	dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)')
 	disambigLineRegex = redirLineRegex
@@ -79,7 +81,7 @@ def genData(
 			if match is None:
 				raise Exception(f'ERROR: Line {lineNum} has unexpected format')
 			dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),))
-	#
+
 	print('Reading/storing instance-type data')
 	dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)')
 	dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)')
@@ -92,7 +94,7 @@ def genData(
 			if match is None:
 				raise Exception(f'ERROR: Line {lineNum} has unexpected format')
 			dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2)))
-	#
+
 	print('Reading/storing abstracts')
 	dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)')
 	descLineRegex = labelLineRegex
@@ -107,14 +109,13 @@ def genData(
 				raise Exception(f'ERROR: Line {lineNum} has unexpected format')
 			dbCur.execute('INSERT INTO abstracts VALUES (?, ?)',
 				(match.group(1), match.group(2).replace(r'\"', '"')))
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py
index 17e15b4..6efc7a4 100755
--- a/backend/tol_data/enwiki/download_img_license_info.py
+++ b/backend/tol_data/enwiki/download_img_license_info.py
@@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks
 at already-processed names to decide what to skip.
 """
 
+import argparse
 import re
-import sqlite3, urllib.parse, html
+import sqlite3
+
 import requests
-import time, signal
+import urllib.parse
+import html
+
+import time
+import signal
 
 IMG_DB = 'img_data.db'
-#
+
 API_URL = 'https://en.wikipedia.org/w/api.php'
 USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
 BATCH_SZ = 50 # Max 50
@@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None:
 	if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
 		dbCur.execute('CREATE TABLE imgs (' \
 			'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
-	#
+
 	print('Reading image names')
 	imgNames: set[str] = set()
 	for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
 		imgNames.add(imgName)
 	print(f'Found {len(imgNames)}')
-	#
+
 	print('Checking for already-processed images')
 	oldSz = len(imgNames)
 	for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
 		imgNames.discard(imgName)
 	print(f'Found {oldSz - len(imgNames)}')
-	#
+
 	# Set SIGINT handler
 	interrupted = False
 	oldHandler = None
@@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None:
 		interrupted = True
 		signal.signal(signal.SIGINT, oldHandler)
 	oldHandler = signal.signal(signal.SIGINT, onSigint)
-	#
+
 	print('Iterating through image names')
 	imgNameList = list(imgNames)
 	iterNum = 0
@@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None:
 		if interrupted:
 			print(f'Exiting loop at iteration {iterNum}')
 			break
+
 		# Get batch
 		imgBatch = imgNameList[i:i+BATCH_SZ]
 		imgBatch = ['File:' + x for x in imgBatch]
+
 		# Make request
 		headers = {
 			'user-agent': USER_AGENT,
@@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None:
 			print(f'ERROR: Exception while downloading info: {e}')
 			print('\tImage batch: ' + '|'.join(imgBatch))
 			continue
+
 		# Parse response-object
 		if 'query' not in responseObj or 'pages' not in responseObj['query']:
 			print('WARNING: Response object doesn\'t have page data')
@@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None:
 			artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
 			credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
 			restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
 			# Remove markup
 			if artist is not None:
 				artist = TAG_REGEX.sub(' ', artist).strip()
@@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None:
 				credit = WHITESPACE_REGEX.sub(' ', credit)
 				credit = html.unescape(credit)
 				credit = urllib.parse.unquote(credit)
+
 			# Add to db
 			dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
 				(title, license, artist, credit, restrictions, url))
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	downloadInfo(IMG_DB)
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
index c6a1c21..164289d 100755
--- a/backend/tol_data/enwiki/download_imgs.py
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -11,14 +11,20 @@ in the output directory do decide what to skip.
 
 # In testing, this downloaded about 100k images, over several days
 
-import re, os
+import argparse
+import re
+import os
 import sqlite3
-import urllib.parse, requests
-import time, signal
+
+import requests
+import urllib.parse
+
+import time
+import signal
 
 IMG_DB = 'img_data.db' # About 130k image names
 OUT_DIR = 'imgs'
-#
+
 LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
 USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
 TIMEOUT = 1
@@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 	for filename in fileList:
 		pageIdsDone.add(int(os.path.splitext(filename)[0]))
 	print(f'Found {len(pageIdsDone)}')
-	#
+
 	# Set SIGINT handler
 	interrupted = False
 	oldHandler = None
@@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 		interrupted = True
 		signal.signal(signal.SIGINT, oldHandler)
 	oldHandler = signal.signal(signal.SIGINT, onSigint)
-	#
+
 	print('Opening database')
 	dbCon = sqlite3.connect(imgDb)
 	dbCur = dbCon.cursor()
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 		if interrupted:
 			print('Exiting loop')
 			break
+
 		# Check for problematic attributes
 		if license is None or LICENSE_REGEX.fullmatch(license) is None:
 			continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 			continue
 		if restrictions is not None and restrictions != '':
 			continue
+
 		# Download image
 		iterNum += 1
 		print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
@@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 		except Exception as e:
 			print(f'Error while downloading to {outFile}: {e}')
 			return
+
 	print('Closing database')
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py
index b3fde52..44e4d6f 100755
--- a/backend/tol_data/enwiki/gen_desc_data.py
+++ b/backend/tol_data/enwiki/gen_desc_data.py
@@ -7,10 +7,16 @@ and adds them to a database
 
 # In testing, this script took over 10 hours to run, and generated about 5GB
 
-import sys, os, re
+import argparse
+import sys
+import os
+import re
 import bz2
-import html, mwxml, mwparserfromhell
 import sqlite3
+import html
+
+import mwxml
+import mwparserfromhell
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
 DB_FILE = 'desc_data.db'
@@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
 EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
 	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
 CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
 def convertTemplateReplace(match):
 	""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
 	if match.group(2) is None:
 		return f'{match.group(1)} {match.group(4)}'
 	else:
 		return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'
-PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
-LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+# ========== For data generation ==========
 
 def genData(dumpFile: str, dbFile: str) -> None:
 	print('Creating database')
@@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None:
 	dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
 	dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
 	dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
-	#
+
 	print('Iterating through dump file')
 	with bz2.open(dumpFile, mode='rt') as file:
 		for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
 			if pageNum % 1e4 == 0:
 				print(f'At page {pageNum}')
-			# Parse page
+
 			if page.namespace == 0:
 				try:
 					dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
@@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None:
 					desc = parseDesc(revision.text)
 					if desc is not None:
 						dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
+
 def parseDesc(text: str) -> str | None:
-	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
-		# and then accumulate lines until a blank one.
-	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, 
-		# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, 
+	"""
+	Looks for a description in wikitext content.
+
+	Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+	and then accumulates lines until a blank one.
+
+	Some cases not accounted for include:
+		disambiguation pages, abstracts with sentences split-across-lines,
+		nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell
+	"""
 	lines: list[str] = []
 	openBraceCount = 0
 	openBracketCount = 0
@@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None:
 	if lines:
 		return removeMarkup(' '.join(lines))
 	return None
+
 def removeMarkup(content: str) -> str:
 	content = EMBEDDED_HTML_REGEX.sub('', content)
 	content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
@@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str:
 	content = PARENS_GROUP_REGEX.sub('', content)
 	content = LEFTOVER_BRACE_REGEX.sub('', content)
 	return content
+
 def convertTitle(title: str) -> str:
 	return html.unescape(title).replace('_', ' ')
 
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(DUMP_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py
index 5778680..12a8a10 100755
--- a/backend/tol_data/enwiki/gen_dump_index_db.py
+++ b/backend/tol_data/enwiki/gen_dump_index_db.py
@@ -1,9 +1,13 @@
 #!/usr/bin/python3
 
 """
-Adds data from the wiki dump index-file into a database
+Converts data from the wiki-dump index-file into a database
 """
-import sys, os, re
+
+import argparse
+import sys
+import os
+import re
 import bz2
 import sqlite3
 
@@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None:
 	""" Reads the index file and creates the db """
 	if os.path.exists(dbFile):
 		raise Exception(f'ERROR: Existing {dbFile}')
+
 	print('Creating database')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
 	dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
 	print('Iterating through index file')
 	lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
 	lastOffset = 0
@@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None:
 			lineNum += 1
 			if lineNum % 1e5 == 0:
 				print(f'At line {lineNum}')
-			#
+
 			match = lineRegex.fullmatch(line.rstrip())
 			assert match is not None
 			offsetStr, pageId, title = match.group(1,2,3)
@@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None:
 			dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
 		except sqlite3.IntegrityError as e:
 			print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(INDEX_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
index 040f223..2c243f3 100755
--- a/backend/tol_data/enwiki/gen_img_data.py
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and
 will skip already-processed page IDs.
 """
 
+import argparse
 import re
-import os, bz2, html, urllib.parse
+import os
+import bz2
+import html
+import urllib.parse
 import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
 INDEX_DB = 'dump_index.db'
 IMG_DB = 'img_data.db' # The database to create
 DB_FILE = os.path.join('..', 'data.db')
-#
+
 ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
 IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
 BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
 IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
 CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
 
+# ========== For data generation ==========
+
 def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 	print('Opening databases')
 	indexDbCon = sqlite3.connect(indexDb)
 	indexDbCur = indexDbCon.cursor()
 	imgDbCon = sqlite3.connect(imgDb)
 	imgDbCur = imgDbCon.cursor()
+
 	print('Checking tables')
 	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
 		# Create tables if not present
-		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)')
+			# 'img_name' values are set to NULL to indicate page IDs where no image was found
 		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
 	else:
 		# Check for already-processed page IDs
@@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 			else:
 				print(f'Found already-processed page ID {pid} which was not in input set')
 		print(f'Will skip {numSkipped} already-processed page IDs')
-	#
+
 	print('Getting dump-file offsets')
 	offsetToPageids: dict[int, list[int]] = {}
 	offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
@@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 		iterNum += 1
 		if iterNum % 1e4 == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
 		row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
 		if row is None:
@@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 			offsetToPageids[chunkOffset] = []
 		offsetToPageids[chunkOffset].append(pageId)
 	print(f'Found {len(offsetToEnd)} chunks to check')
-	#
+
 	print('Iterating through chunks in dump file')
 	with open(dumpFile, mode='rb') as file:
 		iterNum = 0
@@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 			iterNum += 1
 			if iterNum % 100 == 0:
 				print(f'At iteration {iterNum}')
-			#
+
 			chunkPageIds = offsetToPageids[pageOffset]
 			# Jump to chunk
 			file.seek(pageOffset)
@@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 					break
 				if not foundText:
 					print(f'WARNING: Did not find <text> for page id {pageId}')
-	#
+
 	print('Closing databases')
 	indexDbCon.close()
 	imgDbCon.commit()
 	imgDbCon.close()
+
 def getImageName(content: list[str]) -> str | None:
 	""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
-	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+	# Note: Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
 	for line in content:
 		match = IMG_LINE_REGEX.match(line)
 		if match is not None:
@@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None:
 			return None
 	return None
 
+# ========== For getting input page IDs ==========
+
 def getInputPageIdsFromDb(dbFile: str) -> set[int]:
 	print('Getting input page-ids')
 	pageIds: set[int] = set()
@@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]:
 	for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
 		pageIds.add(pageId)
 	dbCon.close()
+
 	print(f'Found {len(pageIds)}')
 	return pageIds
+
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	pageIds = getInputPageIdsFromDb(DB_FILE)
 	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
index 8aee1cc..95b4a60 100755
--- a/backend/tol_data/enwiki/gen_pageview_data.py
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
 """
 Reads through wikimedia files containing pageview counts,
 computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+	wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+	platform (eg: mobile-web), monthly view count,
+	daily count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
 """
 
 # Took about 15min per file (each had about 180e6 lines)
 
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
 from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
 
 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
 DUMP_INDEX_DB = 'dump_index.db'
 DB_FILE = 'pageview_data.db'
 
 def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
-	# Each pageview file has lines that seem to hold these space-separated fields:
-		# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
-		# platform (eg: mobile-web), monthly view count,
-		# hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
 	if os.path.exists(dbFile):
 		print('ERROR: Database already exists')
 		sys.exit(1)
-	#
+
 	namespaceRegex = re.compile(r'[a-zA-Z]+:')
 	titleToViews: dict[str, int] = defaultdict(int)
 	linePrefix = b'en.wikipedia '
@@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 					print(f'At line {lineNum}')
 				if not line.startswith(linePrefix):
 					continue
+
 				# Get second and second-last fields
 				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
 				title = line[:line.find(b' ')].decode('utf-8')
 				viewCount = int(line[line.rfind(b' ')+1:])
 				if namespaceRegex.match(title) is not None:
 					continue
+
 				# Update map
 				title = title.replace('_', ' ')
 				titleToViews[title] += viewCount
 	print(f'Found {len(titleToViews)} titles')
-	#
+
 	print('Writing to db')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
@@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 	idbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
index f744818..c4d0932 100755
--- a/backend/tol_data/enwiki/lookup_page.py
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index
 db, and prints the corresponding <page>.
 """
 
+import argparse
 import sys
 import bz2
 import sqlite3
@@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
 	_, pageOffset, endOffset = row
 	dbCon.close()
 	print(f'Found chunk at offset {pageOffset}')
-	#
+
 	print('Reading from wiki dump')
 	content: list[str] = []
 	with open(dumpFile, mode='rb') as file:
@@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
 		file.seek(pageOffset)
 		compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
 		data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+
 		# Look in chunk for page
 		lines = data.splitlines()
 		lineIdx = 0
@@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
 						if line.lstrip() == '</page>':
 							break
 			lineIdx += 1
-	#
+
 	print('Content: ')
 	print('\n'.join(content))
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.add_argument('title', help='The title to look up')
 	args = parser.parse_args()
-	#
+
 	lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' '))
diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py
index 8454a35..5757032 100755
--- a/backend/tol_data/eol/download_imgs.py
+++ b/backend/tol_data/eol/download_imgs.py
@@ -13,9 +13,16 @@ already-downloaded files, and continues after the one with
 highest EOL ID.
 """
 
-import sys, re, os, random
+import argparse
+import sys
+import re
+import os
+import random
 import sqlite3
-import urllib.parse, requests
+
+import requests
+import urllib.parse
+
 import time
 from threading import Thread
 import signal
@@ -23,7 +30,7 @@ import signal
 IMAGES_LIST_DB = 'images_list.db'
 OUT_DIR = 'imgs_for_review'
 DB_FILE = os.path.join('..', 'data.db')
-#
+
 MAX_IMGS_PER_ID = 3
 MAX_THREADS = 5
 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
@@ -43,7 +50,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 	eolIdList = sorted(eolIds)
 	nextIdx = 0
 	print(f'Result: {len(eolIdList)} EOL IDs')
-	#
+
 	print('Checking output directory')
 	if not os.path.exists(outDir):
 		os.mkdir(outDir)
@@ -57,7 +64,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 	if nextIdx == len(eolIdList):
 		print('No IDs left. Exiting...')
 		return
-	#
+
 	print('Starting download threads')
 	numThreads = 0
 	threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
@@ -81,6 +88,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 			print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
 			threadException = e
 		numThreads -= 1
+
 	# Manage downloading
 	for idx in range(nextIdx, len(eolIdList)):
 		eolId = eolIdList[idx]
@@ -96,9 +104,11 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 			if len(extension) <= 1:
 				print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
 				continue
+
 			# Check image-quantity limit
 			if len(ownerSet) == MAX_IMGS_PER_ID:
 				break
+
 			# Check for skip conditions
 			if re.fullmatch(LICENSE_REGEX, license) is None:
 				continue
@@ -107,11 +117,13 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 			if copyrightOwner in ownerSet:
 				continue
 			ownerSet.add(copyrightOwner)
+
 			# Determine output filename
 			outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
 			if os.path.exists(outPath):
 				print(f'WARNING: {outPath} already exists. Skipping download.')
 				continue
+
 			# Check thread limit
 			while numThreads == MAX_THREADS:
 				time.sleep(1)
@@ -122,6 +134,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 					time.sleep(1)
 				exitLoop = True
 				break
+
 			# Perform download
 			print(f'Downloading image to {outPath}')
 			numThreads += 1
@@ -129,6 +142,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
 			thread.start()
 		if exitLoop:
 			break
+
 	# Close images-list db
 	while numThreads > 0:
 		time.sleep(1)
@@ -143,10 +157,10 @@ def getEolIdsFromDb(dbFile) -> set[int]:
 		eolIds.add(id)
 	dbCon.close()
 	return eolIds
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	eolIds = getEolIdsFromDb(DB_FILE)
 	downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR)
diff --git a/backend/tol_data/eol/gen_images_list_db.py b/backend/tol_data/eol/gen_images_list_db.py
index ee57ac6..3e5bea1 100755
--- a/backend/tol_data/eol/gen_images_list_db.py
+++ b/backend/tol_data/eol/gen_images_list_db.py
@@ -4,8 +4,12 @@
 Generates a sqlite db from a directory of CSV files holding EOL image data
 """
 
-import os, glob
-import csv, re, sqlite3
+import argparse
+import os
+import glob
+import csv
+import re
+import sqlite3
 
 IMAGE_LISTS_GLOB = os.path.join('imagesList', '*.csv')
 DB_FILE = 'images_list.db'
@@ -18,6 +22,7 @@ def genData(imageListsGlob: str, dbFile: str) -> None:
 		' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \
 			' copy_url TEXT, license TEXT, copyright_owner TEXT)')
 	dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
+
 	print('Reading CSV files')
 	for filename in glob.glob(imageListsGlob):
 		print(f'Processing {filename}')
@@ -27,13 +32,13 @@ def genData(imageListsGlob: str, dbFile: str) -> None:
 					continue
 				dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
 					(int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(IMAGE_LISTS_GLOB, DB_FILE)
diff --git a/backend/tol_data/eol/review_imgs.py b/backend/tol_data/eol/review_imgs.py
index 9fb462c..145f338 100755
--- a/backend/tol_data/eol/review_imgs.py
+++ b/backend/tol_data/eol/review_imgs.py
@@ -7,8 +7,13 @@ choose an image to keep, or reject all. Also provides image rotation.
 Chosen images are placed in another directory, and rejected ones are deleted.
 """
 
-import sys, re, os, time
+import argparse
+import sys
+import re
+import os
+import time
 import sqlite3
+
 import tkinter as tki
 from tkinter import ttk
 import PIL
@@ -17,7 +22,7 @@ from PIL import ImageTk, Image, ImageOps
 IMG_DIR = 'imgs_for_review'
 OUT_DIR = 'imgs'
 EXTRA_INFO_DB = os.path.join('..', 'data.db')
-#
+
 IMG_DISPLAY_SZ = 400
 MAX_IMGS_PER_ID = 3
 IMG_BG_COLOR = (88, 28, 135)
@@ -28,11 +33,13 @@ class EolImgReviewer:
 	def __init__(self, root, imgDir, imgList, extraInfoDb, outDir):
 		self.root = root
 		root.title('EOL Image Reviewer')
+
 		# Setup main frame
 		mainFrame = ttk.Frame(root, padding='5 5 5 5')
 		mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
 		root.columnconfigure(0, weight=1)
 		root.rowconfigure(0, weight=1)
+
 		# Set up images-to-be-reviewed frames
 		self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
 		self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter
@@ -44,9 +51,11 @@ class EolImgReviewer:
 			label = ttk.Label(frame, image=self.photoImgs[i])
 			label.grid(column=0, row=0)
 			self.labels.append(label)
+
 		# Add padding
 		for child in mainFrame.winfo_children():
 			child.grid_configure(padx=5, pady=5)
+
 		# Add keyboard bindings
 		root.bind('<q>', self.quit)
 		root.bind('<Key-j>', lambda evt: self.accept(0))
@@ -59,6 +68,7 @@ class EolImgReviewer:
 		root.bind('<Key-A>', lambda evt: self.rotate(0, True))
 		root.bind('<Key-S>', lambda evt: self.rotate(1, True))
 		root.bind('<Key-D>', lambda evt: self.rotate(2, True))
+
 		# Initialise fields
 		self.imgDir = imgDir
 		self.imgList = imgList
@@ -67,13 +77,15 @@ class EolImgReviewer:
 		self.nextEolId = 0
 		self.nextImgNames: list[str] = []
 		self.rotations: list[int] = []
+
 		# For displaying extra info
 		self.extraInfoDbCon = sqlite3.connect(extraInfoDb)
 		self.extraInfoDbCur = self.extraInfoDbCon.cursor()
 		self.numReviewed = 0
 		self.startTime = time.time()
-		#
+
 		self.getNextImgs()
+
 	def getNextImgs(self):
 		""" Updates display with new images to review, or ends program """
 		# Gather names of next images to review
@@ -95,6 +107,7 @@ class EolImgReviewer:
 				self.nextImgNames.append(imgName)
 				self.rotations.append(0)
 			self.imgListIdx += 1
+
 		# Update displayed images
 		idx = 0
 		while idx < MAX_IMGS_PER_ID:
@@ -113,16 +126,19 @@ class EolImgReviewer:
 			self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
 			self.labels[idx].config(image=self.photoImgs[idx])
 			idx += 1
+
 		# Restart if all image files non-recognisable
 		if not self.nextImgNames:
 			self.getNextImgs()
 			return
+
 		# Update title
 		firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
 		lastImgIdx = self.imgListIdx
 		title = self.getExtraInfo(self.nextEolId)
 		title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})'
 		self.root.title(title)
+
 	def accept(self, imgIdx):
 		""" React to a user selecting an image """
 		if imgIdx >= len(self.nextImgNames):
@@ -142,12 +158,14 @@ class EolImgReviewer:
 				os.remove(inFile)
 		self.numReviewed += 1
 		self.getNextImgs()
+
 	def reject(self):
 		""" React to a user rejecting all images of a set """
 		for i in range(len(self.nextImgNames)):
 			os.remove(os.path.join(self.imgDir, self.nextImgNames[i]))
 		self.numReviewed += 1
 		self.getNextImgs()
+
 	def rotate(self, imgIdx, anticlockwise = False):
 		""" Respond to a user rotating an image """
 		deg = -90 if not anticlockwise else 90
@@ -155,6 +173,7 @@ class EolImgReviewer:
 		self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
 		self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
 		self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
+
 	def quit(self, e = None):
 		print(f'Number reviewed: {self.numReviewed}')
 		timeElapsed = time.time() - self.startTime
@@ -163,7 +182,7 @@ class EolImgReviewer:
 			print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
 		self.extraInfoDbCon.close()
 		self.root.destroy()
-	#
+
 	def resizeImgForDisplay(self, img):
 		""" Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """
 		if max(img.width, img.height) > IMG_DISPLAY_SZ:
@@ -178,6 +197,7 @@ class EolImgReviewer:
 			int((IMG_DISPLAY_SZ - img.width) / 2),
 			int((IMG_DISPLAY_SZ - img.height) / 2)))
 		return bgImg
+
 	def getExtraInfo(self, eolId: int) -> str:
 		""" Used to display extra EOL ID info """
 		query = 'SELECT names.alt_name FROM' \
@@ -193,12 +213,14 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
 	print('Checking output directory')
 	if not os.path.exists(outDir):
 		os.mkdir(outDir)
+
 	print('Getting input image list')
 	imgList = os.listdir(imgDir)
 	imgList.sort(key=lambda s: int(s.split(' ')[0]))
 	if not imgList:
 		print('No input images found')
 		sys.exit(0)
+
 	# Create GUI and defer control
 	print('Starting GUI')
 	root = tki.Tk()
@@ -206,8 +228,7 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
 	root.mainloop()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB)
diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py
index fa08a8c..69efe79 100755
--- a/backend/tol_data/gen_desc_data.py
+++ b/backend/tol_data/gen_desc_data.py
@@ -5,7 +5,9 @@ Maps nodes to short descriptions, using data from DBpedia and
 Wikipedia, and stores results in the database.
 """
 
-import os, sqlite3
+import argparse
+import os
+import sqlite3
 
 DBPEDIA_DB = os.path.join('dbpedia', 'desc_data.db')
 ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -16,12 +18,12 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
 	dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
-	#
+
 	print('Getting node mappings')
 	nodeToWikiId: dict[str, int] = {}
 	for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
 		nodeToWikiId[name] = wikiId
-	#
+
 	print('Reading data from DBpedia')
 	dbpCon = sqlite3.connect(dbpediaDb)
 	dbpCur = dbpCon.cursor()
@@ -32,20 +34,22 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
 		iterNum += 1
 		if iterNum % 1e5 == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
 		if row is not None:
 			nodeToIri[name] = row[0]
+
 	print('Resolving redirects')
 	iterNum = 0
 	for name, iri in nodeToIri.items():
 		iterNum += 1
 		if iterNum % 1e5 == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
 		if row is not None:
 			nodeToIri[name] = row[0]
+
 	print('Adding descriptions')
 	iterNum = 0
 	for name, iri in nodeToIri.items():
@@ -57,11 +61,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
 		if row is not None:
 			dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
 			del nodeToWikiId[name]
+
 	dbpCon.close()
-	#
+
 	print('Reading data from Wikipedia')
 	enwikiCon = sqlite3.connect(enwikiDb)
 	enwikiCur = enwikiCon.cursor()
+
 	print('Adding descriptions')
 	iterNum = 0
 	for name, wikiId in nodeToWikiId.items():
@@ -79,14 +85,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
 		row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone()
 		if row is not None:
 			dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
-	#
+
 	print('Closing databases')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(DBPEDIA_DB, ENWIKI_DB, DB_FILE)
diff --git a/backend/tol_data/gen_imgs.py b/backend/tol_data/gen_imgs.py
index 0ba75ec..2479742 100755
--- a/backend/tol_data/gen_imgs.py
+++ b/backend/tol_data/gen_imgs.py
@@ -11,8 +11,11 @@ processing. It uses already-existing database entries to decide what
 to skip.
 """
 
-import os, subprocess
-import sqlite3, urllib.parse
+import argparse
+import os
+import subprocess
+import sqlite3
+import urllib.parse
 import signal
 
 IMG_LIST_FILE = 'img_list.txt'
@@ -23,10 +26,11 @@ ENWIKI_IMG_DB = os.path.join('enwiki', 'img_data.db')
 PICKED_IMGS_DIR = 'picked_imgs'
 PICKED_IMGS_FILE = 'img_data.txt'
 DB_FILE = 'data.db'
-#
+
 IMG_OUT_SZ = 200
 
 ImgId = tuple[int, str] # Holds an int ID and a source string (eg: 'eol')
+
 class PickedImg:
 	""" Represents a picked-image from pickedImgsDir """
 	def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str):
@@ -44,9 +48,9 @@ def genImgs(
 	""" Reads the image-list file, generates images, and updates db """
 	if not os.path.exists(outDir):
 		os.mkdir(outDir)
-	#
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
+
 	print('Checking for image tables')
 	nodesDone: set[str] = set()
 	imgsDone: set[ImgId] = set()
@@ -63,15 +67,16 @@ def genImgs(
 		for imgId, imgSrc in dbCur.execute('SELECT id, src from images'):
 			imgsDone.add((imgId, imgSrc))
 		print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')
-	#
+
 	print('Processing picked-images')
 	success = processPickedImgs(pickedImgsDir, pickedImgsFile, nodesDone, imgsDone, outDir, dbCur)
 	if success:
 		print('Processing images from eol and enwiki')
 		processImgs(imgListFile, eolImgDir, eolImgDb, enwikiImgDb, nodesDone, imgsDone, outDir, dbCur)
-	# Close db
+
 	dbCon.commit()
 	dbCon.close()
+
 def processPickedImgs(
 		pickedImgsDir: str, pickedImgsFile: str, nodesDone: set[str], imgsDone: set[ImgId],
 		outDir: str, dbCur: sqlite3.Cursor) -> bool:
@@ -85,25 +90,30 @@ def processPickedImgs(
 				nodeName = os.path.splitext(filename)[0] # Remove extension
 				(otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone()
 				nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit)
+
 	# Set SIGINT handler
 	interrupted = False
 	def onSigint(sig, frame):
 		nonlocal interrupted
 		interrupted = True
 	signal.signal(signal.SIGINT, onSigint)
+
 	# Convert images
 	for otolId, imgData in nodeToPickedImg.items():
 		# Check for SIGINT event
 		if interrupted:
 			print('Exiting')
 			return False
+
 		# Skip if already processed
 		if otolId in nodesDone:
 			continue
+
 		# Convert image
 		success = convertImage(os.path.join(pickedImgsDir, imgData.filename), os.path.join(outDir, otolId + '.jpg'))
 		if not success:
 			return False
+
 		# Add entry to db
 		if (imgData.id, 'picked') not in imgsDone:
 			dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
@@ -112,6 +122,7 @@ def processPickedImgs(
 		dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked'))
 		nodesDone.add(otolId)
 	return True
+
 def processImgs(
 		imgListFile: str, eolImgDir: str, eolImgDb: str, enwikiImgDb: str,
 		nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool:
@@ -120,12 +131,14 @@ def processImgs(
 	eolCur = eolCon.cursor()
 	enwikiCon = sqlite3.connect(enwikiImgDb)
 	enwikiCur = enwikiCon.cursor()
+
 	# Set SIGINT handler
 	interrupted = False
 	def onSigint(sig, frame):
 		nonlocal interrupted
 		interrupted = True
 	signal.signal(signal.SIGINT, onSigint)
+
 	# Convert images
 	flag = False # Set to True upon interruption or failure
 	with open(imgListFile) as file:
@@ -135,19 +148,24 @@ def processImgs(
 				print('Exiting')
 				flag = True
 				break
+
 			# Skip lines without an image path
 			if line.find(' ') == -1:
 				continue
+
 			# Get filenames
 			otolId, _, imgPath = line.rstrip().partition(' ')
+
 			# Skip if already processed
 			if otolId in nodesDone:
 				continue
+
 			# Convert image
 			success = convertImage(imgPath, os.path.join(outDir, otolId + '.jpg'))
 			if not success:
 				flag = True
 				break
+
 			# Add entry to db
 			(nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
 			fromEol = imgPath.startswith(eolImgDir)
@@ -185,14 +203,17 @@ def processImgs(
 						(enwikiId, 'enwiki', url, license, artist, credit))
 					imgsDone.add((enwikiId, 'enwiki'))
 				dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki'))
+
 	eolCon.close()
 	enwikiCon.close()
 	return not flag
+
 def convertImage(imgPath: str, outPath: str):
 	print(f'Converting {imgPath} to {outPath}')
 	if os.path.exists(outPath):
 		print('ERROR: Output image already exists')
 		return False
+
 	try:
 		completedProcess = subprocess.run(
 			['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
@@ -207,8 +228,7 @@ def convertImage(imgPath: str, outPath: str):
 	return True
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genImgs(IMG_LIST_FILE, EOL_IMG_DIR, OUT_DIR, EOL_IMG_DB, ENWIKI_IMG_DB, PICKED_IMGS_DIR, PICKED_IMGS_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_linked_imgs.py b/backend/tol_data/gen_linked_imgs.py
index 7002e92..c9d7aac 100755
--- a/backend/tol_data/gen_linked_imgs.py
+++ b/backend/tol_data/gen_linked_imgs.py
@@ -5,11 +5,12 @@ Look for nodes without images in the database, and tries to
 associate them with images from their children
 """
 
+import argparse
 import re
 import sqlite3
 
 DB_FILE = 'data.db'
-#
+
 COMPOUND_NAME_REGEX = re.compile(r'\[(.+) \+ (.+)]')
 UP_PROPAGATE_COMPOUND_IMGS = False
 
@@ -18,14 +19,14 @@ def genData(dbFile: str) -> None:
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
 	dbCur.execute('CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)')
-	#
+
 	print('Getting nodes with images')
 	nodeToUsedId: dict[str, str] = {} # Maps name of node to otol ID of node to use image for
 	query = 'SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name'
 	for name, otolId in dbCur.execute(query):
 		nodeToUsedId[name] = otolId
 	print(f'Found {len(nodeToUsedId)}')
-	#
+
 	print('Getting node depths')
 	nodeToDepth: dict[str, int] = {}
 	maxDepth = 0
@@ -33,6 +34,7 @@ def genData(dbFile: str) -> None:
 	for nodeName in nodeToUsedId.keys():
 		nodeChain = [nodeName]
 		lastDepth = 0
+
 		# Add ancestors
 		while True:
 			row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone()
@@ -45,11 +47,12 @@ def genData(dbFile: str) -> None:
 			if nodeName in nodeToDepth:
 				lastDepth = nodeToDepth[nodeName]
 				break
+
 		# Add depths
 		for i in range(len(nodeChain)):
 			nodeToDepth[nodeChain[-i-1]] = i + lastDepth
 		maxDepth = max(maxDepth, lastDepth + len(nodeChain) - 1)
-	#
+
 	print('Finding ancestors to give linked images')
 	depthToNodes: dict[int, list[str]] = {depth: [] for depth in range(maxDepth + 1)}
 	for nodeName, depth in nodeToDepth.items():
@@ -70,12 +73,12 @@ def genData(dbFile: str) -> None:
 				(tips,) = dbCur.execute('SELECT tips FROM nodes WHERE name == ?', (node,)).fetchone()
 				if parent not in parentToCandidate or parentToCandidate[parent][1] < tips:
 					parentToCandidate[parent] = (node, tips)
-	#
+
 	print('Replacing linked-images for compound nodes')
 	for iterNum, node in enumerate(parentToCandidate.keys(), 1):
 		if iterNum % 1e4 == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		match = COMPOUND_NAME_REGEX.fullmatch(node)
 		if match is not None:
 			# Replace associated image with subname images
@@ -85,12 +88,15 @@ def genData(dbFile: str) -> None:
 				otolIdPair[0] = nodeToUsedId[subName1]
 			if subName2 in nodeToUsedId:
 				otolIdPair[1] = nodeToUsedId[subName2]
+
 			# Use no image if both subimages not found
 			if otolIdPair[0] == '' and otolIdPair[1] == '':
 				dbCur.execute('DELETE FROM linked_imgs WHERE name = ?', (node,))
 				continue
+
 			# Add to db
 			dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', (','.join(otolIdPair), node))
+
 			# Possibly repeat operation upon parent/ancestors
 			if UP_PROPAGATE_COMPOUND_IMGS:
 				while True:
@@ -104,14 +110,13 @@ def genData(dbFile: str) -> None:
 							node = parent
 							continue
 					break
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(DB_FILE)
diff --git a/backend/tol_data/gen_mapping_data.py b/backend/tol_data/gen_mapping_data.py
index 4373d1d..1ab577b 100755
--- a/backend/tol_data/gen_mapping_data.py
+++ b/backend/tol_data/gen_mapping_data.py
@@ -12,9 +12,12 @@ Based on code from https://github.com/OneZoom/OZtree, located in
 OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
 """
 
+import argparse
 import os
 from collections import defaultdict
-import gzip, csv, sqlite3
+import gzip
+import csv
+import sqlite3
 
 TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
 EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
@@ -43,27 +46,31 @@ def genData(
 	nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
 	titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
 	titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
+
 	# Get mappings from data input
 	readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
 	readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
 	readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
 	readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
 	getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
-	#
+
 	print('Writing to db')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
+
 	# Get otol id-to-name map
 	otolIdToName: dict[int, str] = {}
 	for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
 		if nodeId.startswith('ott'):
 			otolIdToName[int(nodeId[3:])] = nodeName
+
 	# Add eol mappings
 	dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
 	dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
 	for otolId, eolId in nodeToEolId.items():
 		if otolId in otolIdToName:
 			dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+
 	# Add enwiki mappings
 	dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
 	dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
@@ -73,8 +80,10 @@ def genData(
 			dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
 			if title in titleToIucnStatus:
 				dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+
 	dbCon.commit()
 	dbCon.close()
+
 def readTaxonomyFile(
 		taxonomyFile: str,
 		nodeToSrcIds: dict[int, dict[str, int]],
@@ -88,9 +97,11 @@ def readTaxonomyFile(
 		for lineNum, line in enumerate(file, 1):
 			if lineNum % 1e5 == 0:
 				print(f'At line {lineNum}')
+
 			# Skip header line
 			if lineNum == 1:
 				continue
+
 			# Parse line
 			fields = line.split('\t|\t')
 			try:
@@ -99,6 +110,7 @@ def readTaxonomyFile(
 				print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
 				continue
 			srcsField = fields[4]
+
 			# Add source IDs
 			for srcPair in srcsField.split(','):
 				src, srcIdStr = srcPair.split(':', 1)
@@ -111,6 +123,7 @@ def readTaxonomyFile(
 					nodeToSrcIds[otolId][src] = srcId
 					usedSrcIds.add((src, srcId))
 	print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
 def readEolIdsFile(
 		eolIdsFile: str,
 		nodeToSrcIds: dict[int, dict[str, int]],
@@ -126,9 +139,11 @@ def readEolIdsFile(
 		for lineNum, row in enumerate(csv.reader(file), 1):
 			if lineNum % 1e6 == 0:
 				print(f'At line {lineNum}')
+
 			# Skip header line
 			if lineNum == 1:
 				continue
+
 			# Parse line
 			eolId = int(row[3])
 			srcInt = int(row[2])
@@ -144,7 +159,7 @@ def readEolIdsFile(
 				srcToEolId[src][srcId] = eolId
 	print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
 		# Was about 3.5e6 (4.2e6 without usedSrcIds)
-	#
+
 	print('Resolving candidate EOL IDs')
 	# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
 	for otolId, srcInfo in nodeToSrcIds.items():
@@ -161,6 +176,7 @@ def readEolIdsFile(
 			eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
 			nodeToEolId[otolId] = min(eolIds)
 	print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
 def readWikidataDb(
 		wikidataDb: str,
 		nodeToSrcIds: dict[int, dict[str, int]],
@@ -185,7 +201,7 @@ def readWikidataDb(
 		# Was about 1.1e6 (1.2e6 without usedSrcIds)
 	print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
 	dbCon.close()
-	#
+
 	print('Resolving candidate Wikidata items')
 	# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
 	for otolId, srcInfo in nodeToSrcIds.items():
@@ -211,7 +227,7 @@ def readWikidataDb(
 						nodeToWikiTitle[otolId] = srcToTitle[src]
 						break
 	print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
-	#
+
 	print('Adding extra EOL mappings from Wikidata')
 	wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
 	addedEntries: dict[int, int] = {}
@@ -222,6 +238,7 @@ def readWikidataDb(
 				nodeToEolId[otolId] = eolId
 				addedEntries[otolId] = eolId
 	print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
 def readPickedMappings(
 		pickedMappings: dict[str, list[str]],
 		nodeToEolId: dict[int, int],
@@ -248,6 +265,7 @@ def readPickedMappings(
 						else:
 							if otolId in nodeToWikiTitle:
 								del nodeToWikiTitle[otolId]
+
 def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
 	""" Read a db for mappings from enwiki titles to page IDs """
 	print('Getting enwiki page IDs')
@@ -264,8 +282,7 @@ def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], ti
 	print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/gen_name_data.py b/backend/tol_data/gen_name_data.py
index 2e92c20..5b6e963 100755
--- a/backend/tol_data/gen_name_data.py
+++ b/backend/tol_data/gen_name_data.py
@@ -5,8 +5,12 @@ Maps nodes to vernacular names, using data from EOL, enwiki, and a
 picked-names file, and stores results in the database.
 """
 
-import re, os
-import html, csv, sqlite3
+import argparse
+import re
+import os
+import html
+import csv
+import sqlite3
 
 EOL_NAMES_FILE = os.path.join('eol', 'vernacularNames.csv')
 ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -17,25 +21,26 @@ def genData(eolNamesFile: str, enwikiDb: str, pickedNamesFile: str, dbFile: str)
 	""" Reads the files and adds to db """
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
-	#
+
 	print('Creating table')
 	dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
 	dbCur.execute('CREATE INDEX names_idx ON names(name)')
 	dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
 	dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
-	#
+
 	print('Getting node mappings')
 	nodeToTips: dict[str, int] = {}
 	for name, tips in dbCur.execute('SELECT name, tips from nodes'):
 		nodeToTips[name] = tips
-	#
+
 	addEolNames(eolNamesFile, nodeToTips, dbCur)
 	addEnwikiNames(enwikiDb, nodeToTips, dbCur)
 	addPickedNames(pickedNamesFile, nodeToTips, dbCur)
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
+
 def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
 	""" Reads EOL names, associates them with otol nodes, and writes to db """
 	# The CSV file has a header line, then lines with these fields:
@@ -47,26 +52,31 @@ def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cu
 	for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
 		if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
 			eolIdToNode[eolId] = name
+
 	print('Adding names from EOL')
 	namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
 	with open(eolNamesFile, newline='') as file:
 		for lineNum, fields in enumerate(csv.reader(file), 1):
 			if lineNum % 1e5 == 0:
 				print(f'At line {lineNum}') # Reached about 2.8e6
+
 			# Skip header line
 			if lineNum == 1:
 				continue
+
 			# Parse line
 			eolId = int(fields[0])
 			name = html.unescape(fields[2]).lower()
 			lang = fields[3]
 			isPreferred = 1 if fields[6] == 'preferred' else 0
+
 			# Add to db
 			if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
 				and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
 				cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')'
 					# The 'OR IGNORE' accounts for duplicate lines
 				dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
+
 def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
 	""" Reads enwiki names, associates them with otol nodes, and writes to db """
 	print('Getting enwiki mappings')
@@ -74,6 +84,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
 	for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
 		if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
 			wikiIdToNode[wikiId] = name
+
 	print('Adding names from enwiki')
 	altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
 	enwikiCon = sqlite3.connect(enwikiDb)
@@ -83,7 +94,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
 		iterNum += 1
 		if iterNum % 1e4 == 0:
 			print(f'At iteration {iterNum}') # Reached about 3.6e5
-		#
+
 		query = 'SELECT p1.title FROM pages p1' \
 			' INNER JOIN redirects r1 ON p1.id = r1.id' \
 			' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
@@ -91,6 +102,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
 			name = name.lower()
 			if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips:
 				dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
+
 def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
 	# File format:
 		# nodename1|altName1|isPreferred1 -> Add an alt-name
@@ -121,8 +133,7 @@ def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqli
 					dbCur.execute(cmd, (nodeName,))
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(EOL_NAMES_FILE, ENWIKI_DB, PICKED_NAMES_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py
index eba8779..a67ea4b 100755
--- a/backend/tol_data/gen_otol_data.py
+++ b/backend/tol_data/gen_otol_data.py
@@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai
     These help resolve cases where multiple nodes share the same name.
 """
 
-import re, os
-import json, sqlite3
+import argparse
+import re
+import os
+import json
+import sqlite3
 
 TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes
 ANN_FILE = os.path.join('otol', 'annotations.json')
 DB_FILE = 'data.db'
 PICKED_NAMES_FILE = 'picked_otol_names.txt'
 
+# ========== Classes ==========
+
 class Node:
 	""" Represents a tree-of-life node """
 	def __init__(self, name, childIds, parentId, tips, pSupport):
@@ -37,13 +42,16 @@ class Node:
 		self.parentId = parentId
 		self.tips = tips
 		self.pSupport = pSupport
+
 class BasicStream:
 	""" Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """
 	def __init__(self, data, idx=0):
 		self.data = data
 		self.idx = idx
+
 	def hasNext(self) -> bool:
 		return self.idx < len(self.data)
+
 	def next(self) -> str:
 		if self.hasNext():
 			char = self.data[self.idx]
@@ -51,30 +59,37 @@ class BasicStream:
 			return char;
 		else:
 			return '';
+
 	def peek(self) -> str:
 		if self.hasNext():
 			return self.data[self.idx]
 		else:
 			return '';
+
 	def skipWhitespace(self) -> None:
 		while self.hasNext() and self.data[self.idx].isspace():
 			self.idx += 1
+
 	def progress(self) -> float:
 		return (self.idx / len(self.data))
 
+# ========== For data generation ==========
+
 def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None:
 	""" Reads the files and stores the tree info """
 	nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
 	nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
 	dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
-	#
+
 	print('Parsing tree file')
 	treeStream: BasicStream
 	with open(treeFile) as file:
 		treeStream = BasicStream(file.read())
+
 	# Parse content
 	parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds)
 	print('Resolving duplicate names')
+
 	# Read picked-names file
 	nameToPickedId: dict[str, str] = {}
 	if os.path.exists(pickedNamesFile):
@@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			for line in file:
 				name, _, otolId = line.strip().partition('|')
 				nameToPickedId[name] = otolId
+
 	# Resolve duplicates
 	for dupName, ids in dupNameToIds.items():
 		# Check for picked id
@@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			if id != idToUse:
 				nodeMap[id].name += f' [{counter}]'
 				counter += 1
+
 	print('Changing mrca* names')
 	for id, node in nodeMap.items():
 		if node.name.startswith('mrca'):
 			convertMrcaName(id, nodeMap)
+
 	print('Parsing annotations file')
 	# Read file
 	with open(annFile) as file:
@@ -116,6 +134,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
 			conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
 			node.pSupport = supportQty > 0 and conflictQty == 0
+
 	print('Creating nodes and edges tables')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
@@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
 			childNode = nodeMap[childId]
 			dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
 				(node.name, childNode.name, 1 if childNode.pSupport else 0))
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
+
 def parseNewick(
 		stream: BasicStream,
 		nodeMap: dict[str, Node],
@@ -140,6 +161,7 @@ def parseNewick(
 	""" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
 	if stream.idx % 1e5 == 0:
 		print(f'Progress: {stream.progress() * 100:.2f}%')
+
 	# Find node
 	stream.skipWhitespace()
 	if stream.peek() == '':
@@ -151,6 +173,7 @@ def parseNewick(
 			# Read child
 			childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds)
 			childIds.append(childId)
+
 			# Check for next child or end of node
 			stream.skipWhitespace()
 			if stream.peek() == '':
@@ -164,12 +187,15 @@ def parseNewick(
 				stream.skipWhitespace()
 				name, id = parseNewickName(stream)
 				updateNameMaps(name, id, nameToFirstId, dupNameToIds)
+
 				# Get child num-tips total
 				tips = 0
 				for childId in childIds:
 					tips += nodeMap[childId].tips
+
 				# Add node to nodeMap
 				nodeMap[id] = Node(name, childIds, None, tips, False)
+
 				# Update childrens' parent reference
 				for childId in childIds:
 					nodeMap[childId].parentId = id
@@ -179,6 +205,7 @@ def parseNewick(
 		updateNameMaps(name, id, nameToFirstId, dupNameToIds)
 		nodeMap[id] = Node(name, [], None, 1, False)
 		return id
+
 def parseNewickName(stream: BasicStream) -> tuple[str, str]:
 	""" Parses a node name from 'stream', and returns a (name, id) pair """
 	name: str
@@ -202,6 +229,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
 			nameChars.append(stream.next())
 		if stream.peek() == ';': # Ignore trailing input semicolon
 			stream.next()
+
 	# Convert to (name, id)
 	name = ''.join(nameChars).rstrip().lower()
 	if name.startswith('mrca'):
@@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
 		if match is None:
 			raise Exception(f'ERROR: invalid name \'{name}\'')
 		return (match.group(1).replace('_', ' '), match.group(2))
+
 def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None:
 	""" Update maps upon a newly parsed name """
 	if name not in nameToFirstId:
@@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI
 			dupNameToIds[name] = [nameToFirstId[name], id]
 		else:
 			dupNameToIds[name].append(id)
+
 def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	""" Update a node in a tree to be named after 2 descendants.
 		Returns the name of one such descendant, for use during recursion. """
@@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	childIds = node.childIds
 	if len(childIds) < 2:
 		raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children')
+
 	# Get 2 children with most tips
 	childTips = [nodeMap[id].tips for id in childIds]
 	maxIdx1 = childTips.index(max(childTips))
@@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	childId2 = childIds[maxIdx2]
 	childName1 = nodeMap[childId1].name
 	childName2 = nodeMap[childId2].name
+
 	# Check for mrca* child names
 	if childName1.startswith('mrca'):
 		childName1 = convertMrcaName(childId1, nodeMap)
 	if childName2.startswith('mrca'):
 		childName2 = convertMrcaName(childId2, nodeMap)
+
 	# Check for composite names
 	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
 	if match is not None:
@@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
 	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
 	if match is not None:
 		childName2 = match.group(1)
+
 	# Create composite name
 	node.name = f'[{childName1} + {childName2}]'
 	return childName1
 
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	# 
+
 	genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_pop_data.py b/backend/tol_data/gen_pop_data.py
index e6a646e..4280a12 100755
--- a/backend/tol_data/gen_pop_data.py
+++ b/backend/tol_data/gen_pop_data.py
@@ -5,7 +5,9 @@ Reads enwiki page view info from a database, and stores it
 as node popularity values in the database.
 """
 
-import os, sqlite3
+import argparse
+import os
+import sqlite3
 
 PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
 DB_FILE = 'data.db'
@@ -13,7 +15,7 @@ DB_FILE = 'data.db'
 def genData(pageviewsDb: str, dbFile: str) -> None:
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
-	#
+
 	print('Getting view counts')
 	pdbCon = sqlite3.connect(pageviewsDb)
 	pdbCur = pdbCon.cursor()
@@ -23,23 +25,22 @@ def genData(pageviewsDb: str, dbFile: str) -> None:
 		iterNum += 1
 		if iterNum % 1e4 == 0:
 			print(f'At iteration {iterNum}') # Reached 1.6e6
-		#
+
 		row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone()
 		if row is not None:
 			nodeToViews[row[0]] = views
 	pdbCon.close()
-	#
+
 	print(f'Writing {len(nodeToViews)} entries to db')
 	dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)')
 	for nodeName, views in nodeToViews.items():
 		dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views))
-	#
+
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(PAGEVIEWS_DB, DB_FILE)
diff --git a/backend/tol_data/gen_reduced_trees.py b/backend/tol_data/gen_reduced_trees.py
index 3742544..ce628f7 100755
--- a/backend/tol_data/gen_reduced_trees.py
+++ b/backend/tol_data/gen_reduced_trees.py
@@ -14,12 +14,14 @@ Creates reduced versions of the tree in the database:
     removing some more, despite any node descriptions.
 """
 
-import sys, re
+import argparse
+import sys
+import re
 import sqlite3
 
 DB_FILE = 'data.db'
 PICKED_NODES_FILE = 'picked_nodes.txt'
-#
+
 COMP_NAME_REGEX = re.compile(r'\[.+ \+ .+]') # Used to recognise composite nodes
 
 class Node:
@@ -30,16 +32,18 @@ class Node:
 		self.tips = tips
 		self.pSupport = pSupport
 
+# ========== For data generation ==========
+
 def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
 	print('Opening database')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
-	#
+
 	print('Finding root node')
 	query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1'
 	(rootName,) = dbCur.execute(query).fetchone()
 	print(f'Found \'{rootName}\'')
-	#
+
 	print('=== Getting picked-nodes ===')
 	pickedNames: set[str] = set()
 	pickedTreeExists = False
@@ -63,7 +67,7 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
 		for (name,) in dbCur.execute('SELECT name FROM nodes_p'):
 			pickedNames.add(name)
 	print(f'Found {len(pickedNames)} names')
-	#
+
 	if (tree == 'picked' or tree is None) and not pickedTreeExists:
 		print('=== Generating picked-nodes tree ===')
 		genPickedNodeTree(dbCur, pickedNames, rootName)
@@ -88,22 +92,27 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
 		if tree == 'trimmed' or tree is None:
 			print('=== Generating weakly-trimmed tree ===')
 			genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName)
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
+
 def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: str) -> None:
 	PREF_NUM_CHILDREN = 3 # Include extra children up to this limit
+
 	print('Getting ancestors')
 	nodeMap = genNodeMap(dbCur, pickedNames, 100)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Removing composite nodes')
 	removedNames = removeCompositeNodes(nodeMap)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Removing \'collapsible\' nodes')
 	temp = removeCollapsibleNodes(nodeMap, pickedNames)
 	removedNames.update(temp)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Adding some additional nearby children')
 	namesToAdd: list[str] = []
 	iterNum = 0
@@ -111,7 +120,7 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st
 		iterNum += 1
 		if iterNum % 100 == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		numChildren = len(node.children)
 		if numChildren < PREF_NUM_CHILDREN:
 			children = [row[0] for row in dbCur.execute('SELECT child FROM edges where parent = ?', (name,))]
@@ -134,33 +143,44 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st
 		parent = None if parent == '' else parent
 		nodeMap[name] = Node(id, [], parent, 0, pSupport == 1)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Updating \'tips\' values')
 	updateTips(rootName, nodeMap)
+
 	print('Creating table')
 	addTreeTables(nodeMap, dbCur, 'p')
+
 def genImagesOnlyTree(
 		dbCur: sqlite3.Cursor,
 		nodesWithImgOrPicked: set[str],
 		pickedNames: set[str],
 		rootName: str) -> None:
+
 	print('Getting ancestors')
 	nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Removing composite nodes')
 	removeCompositeNodes(nodeMap)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Removing \'collapsible\' nodes')
 	removeCollapsibleNodes(nodeMap, pickedNames)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Updating \'tips\' values') # Needed for next trimming step
 	updateTips(rootName, nodeMap)
+
 	print('Trimming from nodes with \'many\' children')
 	trimIfManyChildren(nodeMap, rootName, 300, pickedNames)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Updating \'tips\' values')
 	updateTips(rootName, nodeMap)
+
 	print('Creating table')
 	addTreeTables(nodeMap, dbCur, 'i')
+
 def genWeaklyTrimmedTree(
 		dbCur: sqlite3.Cursor,
 		nodesWithImgDescOrPicked: set[str],
@@ -169,6 +189,7 @@ def genWeaklyTrimmedTree(
 	print('Getting ancestors')
 	nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Getting nodes to \'strongly keep\'')
 	iterNum = 0
 	nodesFromImgOrPicked: set[str] = set()
@@ -184,19 +205,26 @@ def genWeaklyTrimmedTree(
 			else:
 				break
 	print(f'Node set has {len(nodesFromImgOrPicked)} nodes')
+
 	print('Removing \'collapsible\' nodes')
 	removeCollapsibleNodes(nodeMap, nodesWithImgDescOrPicked)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Updating \'tips\' values') # Needed for next trimming step
 	updateTips(rootName, nodeMap)
+
 	print('Trimming from nodes with \'many\' children')
 	trimIfManyChildren(nodeMap, rootName, 600, nodesFromImgOrPicked)
 	print(f'Result has {len(nodeMap)} nodes')
+
 	print('Updating \'tips\' values')
 	updateTips(rootName, nodeMap)
+
 	print('Creating table')
 	addTreeTables(nodeMap, dbCur, 't')
-# Helper functions
+
+# ========== Helper functions ==========
+
 def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -> dict[str, Node]:
 	""" Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map """
 	nodeMap: dict[str, Node] = {}
@@ -206,7 +234,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -
 		iterNum += 1
 		if iterNum % itersBeforePrint == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		prevName: str | None = None
 		while name is not None:
 			if name not in nodeMap:
@@ -227,6 +255,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -
 					nodeMap[name].children.append(prevName)
 				break
 	return nodeMap
+
 def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]:
 	""" Given a tree, removes composite-name nodes, and returns the removed nodes' names """
 	namesToRemove: set[str] = set()
@@ -244,10 +273,12 @@ def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]:
 	for name in namesToRemove:
 		del nodeMap[name]
 	return namesToRemove
+
 def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set()) -> set[str]:
 	""" Given a tree, removes single-child parents, then only-childs,
 		with given exceptions, and returns the set of removed nodes' names """
 	namesToRemove: set[str] = set()
+
 	# Remove single-child parents
 	for name, node in nodeMap.items():
 		if len(node.children) == 1 and node.parent is not None and name not in nodesToKeep:
@@ -262,6 +293,7 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set
 			namesToRemove.add(name)
 	for name in namesToRemove:
 		del nodeMap[name]
+
 	# Remove only-childs (not redundant because 'nodesToKeep' can cause single-child parents to be kept)
 	namesToRemove.clear()
 	for name, node in nodeMap.items():
@@ -277,8 +309,9 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set
 			namesToRemove.add(name)
 	for name in namesToRemove:
 		del nodeMap[name]
-	#
+
 	return namesToRemove
+
 def trimIfManyChildren(
 		nodeMap: dict[str, Node], rootName: str, childThreshold: int, nodesToKeep: set[str] = set()) -> None:
 	namesToRemove: set[str] = set()
@@ -299,14 +332,17 @@ def trimIfManyChildren(
 		# Recurse on children
 		for n in node.children:
 			findTrimmables(n)
+
 	def markForRemoval(nodeName: str) -> None:
 		nonlocal nodeMap, namesToRemove
 		namesToRemove.add(nodeName)
 		for child in nodeMap[nodeName].children:
 			markForRemoval(child)
+
 	findTrimmables(rootName)
 	for nodeName in namesToRemove:
 		del nodeMap[nodeName]
+
 def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int:
 	""" Updates the 'tips' values for a node and it's descendants, returning the node's new 'tips' value """
 	node = nodeMap[nodeName]
@@ -314,6 +350,7 @@ def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int:
 	tips = max(1, tips)
 	node.tips = tips
 	return tips
+
 def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str):
 	""" Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix """
 	nodesTbl = f'nodes_{suffix}'
@@ -328,10 +365,11 @@ def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str):
 			pSupport = 1 if nodeMap[childName].pSupport else 0
 			dbCur.execute(f'INSERT INTO {edgesTbl} VALUES (?, ?, ?)', (name, childName, pSupport))
 
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree')
 	args = parser.parse_args()
-	#
+
 	genData(args.tree, DB_FILE, PICKED_NODES_FILE)
diff --git a/backend/tol_data/review_imgs_to_gen.py b/backend/tol_data/review_imgs_to_gen.py
index 2283ed7..f384ddf 100755
--- a/backend/tol_data/review_imgs_to_gen.py
+++ b/backend/tol_data/review_imgs_to_gen.py
@@ -11,8 +11,11 @@ The program looks for an existing output file to determine what choices
 have already been made.
 """
 
-import os, time
+import argparse
+import os
+import time
 import sqlite3
+
 import tkinter as tki
 from tkinter import ttk
 import PIL
@@ -22,7 +25,7 @@ EOL_IMG_DIR = os.path.join('eol', 'imgs')
 ENWIKI_IMG_DIR = os.path.join('enwiki', 'imgs')
 DB_FILE = 'data.db'
 OUT_FILE = 'img_list.txt'
-#
+
 IMG_DISPLAY_SZ = 400
 PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
 REVIEW = 'only pairs' # Can be: 'all', 'only pairs', 'none'
@@ -32,11 +35,13 @@ class ImgReviewer:
 	def __init__(self, root, nodeToImgs, eolImgDir, enwikiImgDir, outFile, dbCon, review):
 		self.root = root
 		root.title('Image Reviewer')
+
 		# Setup main frame
 		mainFrame = ttk.Frame(root, padding='5 5 5 5')
 		mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
 		root.columnconfigure(0, weight=1)
 		root.rowconfigure(0, weight=1)
+
 		# Set up images-to-be-reviewed frames
 		self.eolImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
 		self.enwikiImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
@@ -47,14 +52,17 @@ class ImgReviewer:
 			label = ttk.Label(frame, image=self.eolImg if i == 0 else self.enwikiImg)
 			label.grid(column=0, row=0)
 			self.labels.append(label)
+
 		# Add padding
 		for child in mainFrame.winfo_children():
 			child.grid_configure(padx=5, pady=5)
+
 		# Add keyboard bindings
 		root.bind('<q>', self.quit)
 		root.bind('<Key-j>', lambda evt: self.accept(0))
 		root.bind('<Key-k>', lambda evt: self.accept(1))
 		root.bind('<Key-l>', lambda evt: self.reject())
+
 		# Set fields
 		self.nodeImgsList = list(nodeToImgs.items())
 		self.listIdx = -1
@@ -69,8 +77,10 @@ class ImgReviewer:
 		self.enwikiImgPath = None
 		self.numReviewed = 0
 		self.startTime = time.time()
+
 		# Initialise images to review
 		self.getNextImgs()
+
 	def getNextImgs(self):
 		""" Updates display with new images to review, or ends program """
 		# Get next image paths
@@ -81,6 +91,7 @@ class ImgReviewer:
 				self.quit()
 				return
 			self.otolId, imgPaths = self.nodeImgsList[self.listIdx]
+
 			# Potentially skip user choice
 			if len(imgPaths) == 1 and (self.review == 'only pairs' or self.review == 'none'):
 				with open(self.outFile, 'a') as file:
@@ -91,6 +102,7 @@ class ImgReviewer:
 					file.write(f'{self.otolId} {imgPaths[-1]}\n') # Prefer enwiki image
 				continue
 			break
+
 		# Update displayed images
 		self.eolImgPath = self.enwikiImgPath = None
 		imageOpenError = False
@@ -113,20 +125,24 @@ class ImgReviewer:
 				print(f'Unexpected image path {imgPath}')
 				self.quit()
 				return
+
 		# Re-iterate if all image paths invalid
 		if self.eolImgPath is None and self.enwikiImgPath is None:
 			if imageOpenError:
 				self.reject()
 			self.getNextImgs()
 			return
+
 		# Add placeholder images
 		if self.eolImgPath is None:
 			self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
 		elif self.enwikiImgPath is None:
 			self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
+
 		# Update image-frames
 		self.labels[0].config(image=self.eolImg)
 		self.labels[1].config(image=self.enwikiImg)
+
 		# Update title
 		title = f'Images for otol ID {self.otolId}'
 		query = 'SELECT names.alt_name FROM' \
@@ -137,6 +153,7 @@ class ImgReviewer:
 			title += f', aka {row[0]}'
 		title += f' ({self.listIdx + 1} out of {len(self.nodeImgsList)})'
 		self.root.title(title)
+
 	def accept(self, imgIdx):
 		""" React to a user selecting an image """
 		imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath
@@ -147,12 +164,14 @@ class ImgReviewer:
 			file.write(f'{self.otolId} {imgPath}\n')
 		self.numReviewed += 1
 		self.getNextImgs()
+
 	def reject(self):
 		""""" React to a user rejecting all images of a set """
 		with open(self.outFile, 'a') as file:
 			file.write(f'{self.otolId}\n')
 		self.numReviewed += 1
 		self.getNextImgs()
+
 	def quit(self, e = None):
 		print(f'Number reviewed: {self.numReviewed}')
 		timeElapsed = time.time() - self.startTime
@@ -161,6 +180,7 @@ class ImgReviewer:
 			print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
 		self.dbCon.close()
 		self.root.destroy()
+
 	def resizeImgForDisplay(self, img):
 		""" Returns a copy of an image, shrunk to fit it's frame (keeps aspect ratio), and with a background """
 		if max(img.width, img.height) > IMG_DISPLAY_SZ:
@@ -180,7 +200,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
 	print('Opening database')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
-	#
+
 	nodeToImgs: dict[str, list[str]] = {} # Maps otol-ids to arrays of image paths
 	print('Iterating through images from EOL')
 	if os.path.exists(eolImgDir):
@@ -198,6 +218,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
 			if not found:
 				print(f'WARNING: No node found for {os.path.join(eolImgDir, filename)}')
 	print(f'Result: {len(nodeToImgs)} nodes with images')
+
 	print('Iterating through images from Wikipedia')
 	if os.path.exists(enwikiImgDir):
 		for filename in os.listdir(enwikiImgDir):
@@ -214,7 +235,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
 			if not found:
 				print(f'WARNING: No node found for {os.path.join(enwikiImgDir, filename)}')
 	print(f'Result: {len(nodeToImgs)} nodes with images')
-	#
+
 	print('Filtering out already-made image choices')
 	oldSz = len(nodeToImgs)
 	if os.path.exists(outFile):
@@ -225,7 +246,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
 					line = line[:line.find(' ')]
 				del nodeToImgs[line]
 	print(f'Filtered out {oldSz - len(nodeToImgs)} entries')
-	#
+
 	# Create GUI and defer control
 	print('Starting GUI')
 	root = tki.Tk()
@@ -234,8 +255,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	reviewImgs(EOL_IMG_DIR, ENWIKI_IMG_DIR, DB_FILE, OUT_FILE, REVIEW)
diff --git a/backend/tol_data/wikidata/gen_taxon_src_data.py b/backend/tol_data/wikidata/gen_taxon_src_data.py
index 1bddb6e..d2a3811 100755
--- a/backend/tol_data/wikidata/gen_taxon_src_data.py
+++ b/backend/tol_data/wikidata/gen_taxon_src_data.py
@@ -30,10 +30,21 @@ OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
 # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
 #   Possibly related: https://github.com/python/cpython/issues/72882
 
-import sys, os, re, math, io
+import argparse
+import sys
+import os
+import re
+import math
+import io
 from collections import defaultdict
-import bz2, json, sqlite3
-import multiprocessing, indexed_bzip2, pickle, tempfile
+import bz2
+import json
+import sqlite3
+
+import multiprocessing
+import indexed_bzip2
+import pickle
+import tempfile
 
 WIKIDATA_FILE = 'latest-all.json.bz2'
 OFFSETS_FILE = 'offsets.dat'
@@ -49,9 +60,12 @@ IUCN_STATUS_IDS = {
 	'Q11394': 'endangered', 'Q219127': 'critically endangered', 'Q239509': 'extinct in the wild',
 	'Q237350': 'extinct species', 'Q3245245': 'data deficient'
 }
+
 # For filtering lines before parsing JSON
 LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")').encode())
 
+# ========== For data generation ==========
+
 def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
 	""" Reads the dump and writes source/iucn info to db """
 	# Maps to populate
@@ -59,10 +73,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 	idToTitle: dict[int, str] = {} # Maps wikidata ID to enwiki title
 	idToAltId: dict[int, int] = {} # Maps taxon-item wikidata ID to taxon-alt ID (eg: 'canis lupus familiaris' -> 'dog')
 	idToIucnStatus: dict[int, str] = {} # Maps wikidata ID to iucn-status string ('least concern', etc)
+
 	# Check db
 	if os.path.exists(dbFile):
 		print('ERROR: Database already exists')
 		sys.exit(1)
+
 	# Read dump
 	if nProcs == 1:
 		with bz2.open(wikidataFile, mode='rb') as file:
@@ -76,6 +92,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 			with indexed_bzip2.open(wikidataFile) as file:
 				with open(offsetsFile, 'wb') as file2:
 					pickle.dump(file.block_offsets(), file2)
+
 		print('Allocating file into chunks')
 		fileSz: int # About 1.4 TB
 		with indexed_bzip2.open(wikidataFile) as file:
@@ -86,6 +103,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 		chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1]
 			# Each adjacent pair specifies a start+end byte index for readDumpChunk()
 		print(f'- Chunk size: {chunkSz:,}')
+
 		print('Starting processes to read dump')
 		with tempfile.TemporaryDirectory() as tempDirName:
 			# Using maxtasksperchild=1 to free resources on task completion
@@ -103,7 +121,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 					idToTitle.update(maps[1])
 					idToAltId.update(maps[2])
 					idToIucnStatus.update(maps[3])
-	#
+
 	print('Writing to db')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
@@ -127,6 +145,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 			# The 'OR IGNORE' allows for multiple taxons using the same alt
 	dbCon.commit()
 	dbCon.close()
+
 def readDumpLine(
 		lineBytes: bytes,
 		srcIdToId: dict[str, dict[int, int]],
@@ -160,6 +179,7 @@ def readDumpLine(
 		return
 	if not isTaxon and not altTaxa:
 		return
+
 	# Get wikidata ID and enwiki title
 	itemId: int | None = None
 	itemTitle: str | None = None
@@ -172,11 +192,13 @@ def readDumpLine(
 			itemTitle = None
 		else:
 			return
+
 	# Update maps
 	if itemTitle is not None:
 		idToTitle[itemId] = itemTitle
 	for altId in altTaxa:
 		idToAltId[altId] = itemId
+
 	# Check for source IDs
 	for srcPropId, src in SRC_PROP_IDS.items():
 		if srcPropId in claims:
@@ -185,6 +207,7 @@ def readDumpLine(
 				srcIdToId[src][srcId] = itemId
 			except (KeyError, ValueError):
 				continue
+
 	# Check for IUCN status
 	if 'P141' in claims: # Check for 'iucn conservation status' statement
 		try:
@@ -192,9 +215,11 @@ def readDumpLine(
 			idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId]
 		except KeyError:
 			pass
+
 def readDumpChunkOneParam(params: tuple[int, str, str, int, int, str]) -> str:
 	""" Forwards to readDumpChunk(), for use with pool.map() """
 	return readDumpChunk(*params)
+
 def readDumpChunk(
 		procId: int, wikidataFile: str, offsetsFile: str, startByte: int, endByte: int, outFilename: str) -> str:
 	""" Reads lines in the dump that begin after a start-byte, and not after an end byte.
@@ -205,18 +230,21 @@ def readDumpChunk(
 		dict[int, str],
 		dict[int, int],
 		dict[int, str]] = (defaultdict(dict), {}, {}, {})
+
 	# Read dump
 	with indexed_bzip2.open(wikidataFile) as file:
 		# Load offsets file
 		with open(offsetsFile, 'rb') as file2:
 			offsets = pickle.load(file2)
 			file.set_block_offsets(offsets)
+
 		# Seek to chunk
 		if startByte != -1:
 			file.seek(startByte)
 			file.readline()
 		else:
 			startByte = 0 # Used for progress calculation
+
 		# Read lines
 		count = 0
 		while file.tell() <= endByte:
@@ -225,15 +253,17 @@ def readDumpChunk(
 				perc = (file.tell() - startByte) / (endByte - startByte) * 100
 				print(f'Thread {procId}: {perc:.2f}%')
 			readDumpLine(file.readline(), *maps)
+
 	# Output results into file
 	with open(outFilename, 'wb') as file:
 		pickle.dump(maps, file)
 	return outFilename
 
+# ========== Main block ==========
+
 if __name__ == '__main__': # Guard needed for multiprocessing
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	multiprocessing.set_start_method('spawn')
 	genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS)
author	Terry Truong <terry06890@gmail.com>	2023-01-29 11:30:47 +1100
committer	Terry Truong <terry06890@gmail.com>	2023-01-29 11:30:47 +1100
commit	8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
tree	ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data
parent	f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)