From 8781fdb2b8c530a6c1531ae9e82221eb062e34fb Mon Sep 17 00:00:00 2001
From: Terry Truong <terry06890@gmail.com>
Date: Sun, 29 Jan 2023 11:30:47 +1100
Subject: Adjust backend coding style

Add line spacing, section comments, and import consistency
---
 .../tol_data/enwiki/download_img_license_info.py   | 30 ++++++++++-----
 backend/tol_data/enwiki/download_imgs.py           | 24 ++++++++----
 backend/tol_data/enwiki/gen_desc_data.py           | 45 +++++++++++++++-------
 backend/tol_data/enwiki/gen_dump_index_db.py       | 16 +++++---
 backend/tol_data/enwiki/gen_img_data.py            | 36 +++++++++++------
 backend/tol_data/enwiki/gen_pageview_data.py       | 28 +++++++++-----
 backend/tol_data/enwiki/lookup_page.py             |  9 +++--
 7 files changed, 127 insertions(+), 61 deletions(-)

(limited to 'backend/tol_data/enwiki')

diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py
index 17e15b4..6efc7a4 100755
--- a/backend/tol_data/enwiki/download_img_license_info.py
+++ b/backend/tol_data/enwiki/download_img_license_info.py
@@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks
 at already-processed names to decide what to skip.
 """
 
+import argparse
 import re
-import sqlite3, urllib.parse, html
+import sqlite3
+
 import requests
-import time, signal
+import urllib.parse
+import html
+
+import time
+import signal
 
 IMG_DB = 'img_data.db'
-#
+
 API_URL = 'https://en.wikipedia.org/w/api.php'
 USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
 BATCH_SZ = 50 # Max 50
@@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None:
 	if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
 		dbCur.execute('CREATE TABLE imgs (' \
 			'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
-	#
+
 	print('Reading image names')
 	imgNames: set[str] = set()
 	for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
 		imgNames.add(imgName)
 	print(f'Found {len(imgNames)}')
-	#
+
 	print('Checking for already-processed images')
 	oldSz = len(imgNames)
 	for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
 		imgNames.discard(imgName)
 	print(f'Found {oldSz - len(imgNames)}')
-	#
+
 	# Set SIGINT handler
 	interrupted = False
 	oldHandler = None
@@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None:
 		interrupted = True
 		signal.signal(signal.SIGINT, oldHandler)
 	oldHandler = signal.signal(signal.SIGINT, onSigint)
-	#
+
 	print('Iterating through image names')
 	imgNameList = list(imgNames)
 	iterNum = 0
@@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None:
 		if interrupted:
 			print(f'Exiting loop at iteration {iterNum}')
 			break
+
 		# Get batch
 		imgBatch = imgNameList[i:i+BATCH_SZ]
 		imgBatch = ['File:' + x for x in imgBatch]
+
 		# Make request
 		headers = {
 			'user-agent': USER_AGENT,
@@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None:
 			print(f'ERROR: Exception while downloading info: {e}')
 			print('\tImage batch: ' + '|'.join(imgBatch))
 			continue
+
 		# Parse response-object
 		if 'query' not in responseObj or 'pages' not in responseObj['query']:
 			print('WARNING: Response object doesn\'t have page data')
@@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None:
 			artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
 			credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
 			restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
 			# Remove markup
 			if artist is not None:
 				artist = TAG_REGEX.sub(' ', artist).strip()
@@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None:
 				credit = WHITESPACE_REGEX.sub(' ', credit)
 				credit = html.unescape(credit)
 				credit = urllib.parse.unquote(credit)
+
 			# Add to db
 			dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
 				(title, license, artist, credit, restrictions, url))
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	downloadInfo(IMG_DB)
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
index c6a1c21..164289d 100755
--- a/backend/tol_data/enwiki/download_imgs.py
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -11,14 +11,20 @@ in the output directory do decide what to skip.
 
 # In testing, this downloaded about 100k images, over several days
 
-import re, os
+import argparse
+import re
+import os
 import sqlite3
-import urllib.parse, requests
-import time, signal
+
+import requests
+import urllib.parse
+
+import time
+import signal
 
 IMG_DB = 'img_data.db' # About 130k image names
 OUT_DIR = 'imgs'
-#
+
 LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
 USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
 TIMEOUT = 1
@@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 	for filename in fileList:
 		pageIdsDone.add(int(os.path.splitext(filename)[0]))
 	print(f'Found {len(pageIdsDone)}')
-	#
+
 	# Set SIGINT handler
 	interrupted = False
 	oldHandler = None
@@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 		interrupted = True
 		signal.signal(signal.SIGINT, oldHandler)
 	oldHandler = signal.signal(signal.SIGINT, onSigint)
-	#
+
 	print('Opening database')
 	dbCon = sqlite3.connect(imgDb)
 	dbCur = dbCon.cursor()
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 		if interrupted:
 			print('Exiting loop')
 			break
+
 		# Check for problematic attributes
 		if license is None or LICENSE_REGEX.fullmatch(license) is None:
 			continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 			continue
 		if restrictions is not None and restrictions != '':
 			continue
+
 		# Download image
 		iterNum += 1
 		print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
@@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 		except Exception as e:
 			print(f'Error while downloading to {outFile}: {e}')
 			return
+
 	print('Closing database')
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py
index b3fde52..44e4d6f 100755
--- a/backend/tol_data/enwiki/gen_desc_data.py
+++ b/backend/tol_data/enwiki/gen_desc_data.py
@@ -7,10 +7,16 @@ and adds them to a database
 
 # In testing, this script took over 10 hours to run, and generated about 5GB
 
-import sys, os, re
+import argparse
+import sys
+import os
+import re
 import bz2
-import html, mwxml, mwparserfromhell
 import sqlite3
+import html
+
+import mwxml
+import mwparserfromhell
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
 DB_FILE = 'desc_data.db'
@@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
 EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
 	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
 CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
 def convertTemplateReplace(match):
 	""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
 	if match.group(2) is None:
 		return f'{match.group(1)} {match.group(4)}'
 	else:
 		return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'
-PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
-LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+# ========== For data generation ==========
 
 def genData(dumpFile: str, dbFile: str) -> None:
 	print('Creating database')
@@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None:
 	dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
 	dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
 	dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
-	#
+
 	print('Iterating through dump file')
 	with bz2.open(dumpFile, mode='rt') as file:
 		for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
 			if pageNum % 1e4 == 0:
 				print(f'At page {pageNum}')
-			# Parse page
+
 			if page.namespace == 0:
 				try:
 					dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
@@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None:
 					desc = parseDesc(revision.text)
 					if desc is not None:
 						dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
-	#
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
+
 def parseDesc(text: str) -> str | None:
-	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
-		# and then accumulate lines until a blank one.
-	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, 
-		# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, 
+	"""
+	Looks for a description in wikitext content.
+
+	Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+	and then accumulates lines until a blank one.
+
+	Some cases not accounted for include:
+		disambiguation pages, abstracts with sentences split-across-lines,
+		nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell
+	"""
 	lines: list[str] = []
 	openBraceCount = 0
 	openBracketCount = 0
@@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None:
 	if lines:
 		return removeMarkup(' '.join(lines))
 	return None
+
 def removeMarkup(content: str) -> str:
 	content = EMBEDDED_HTML_REGEX.sub('', content)
 	content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
@@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str:
 	content = PARENS_GROUP_REGEX.sub('', content)
 	content = LEFTOVER_BRACE_REGEX.sub('', content)
 	return content
+
 def convertTitle(title: str) -> str:
 	return html.unescape(title).replace('_', ' ')
 
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(DUMP_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py
index 5778680..12a8a10 100755
--- a/backend/tol_data/enwiki/gen_dump_index_db.py
+++ b/backend/tol_data/enwiki/gen_dump_index_db.py
@@ -1,9 +1,13 @@
 #!/usr/bin/python3
 
 """
-Adds data from the wiki dump index-file into a database
+Converts data from the wiki-dump index-file into a database
 """
-import sys, os, re
+
+import argparse
+import sys
+import os
+import re
 import bz2
 import sqlite3
 
@@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None:
 	""" Reads the index file and creates the db """
 	if os.path.exists(dbFile):
 		raise Exception(f'ERROR: Existing {dbFile}')
+
 	print('Creating database')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
 	dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
 	print('Iterating through index file')
 	lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
 	lastOffset = 0
@@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None:
 			lineNum += 1
 			if lineNum % 1e5 == 0:
 				print(f'At line {lineNum}')
-			#
+
 			match = lineRegex.fullmatch(line.rstrip())
 			assert match is not None
 			offsetStr, pageId, title = match.group(1,2,3)
@@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None:
 			dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
 		except sqlite3.IntegrityError as e:
 			print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
 	print('Closing database')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	genData(INDEX_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
index 040f223..2c243f3 100755
--- a/backend/tol_data/enwiki/gen_img_data.py
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and
 will skip already-processed page IDs.
 """
 
+import argparse
 import re
-import os, bz2, html, urllib.parse
+import os
+import bz2
+import html
+import urllib.parse
 import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
 INDEX_DB = 'dump_index.db'
 IMG_DB = 'img_data.db' # The database to create
 DB_FILE = os.path.join('..', 'data.db')
-#
+
 ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
 IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
 BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
 IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
 CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
 
+# ========== For data generation ==========
+
 def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 	print('Opening databases')
 	indexDbCon = sqlite3.connect(indexDb)
 	indexDbCur = indexDbCon.cursor()
 	imgDbCon = sqlite3.connect(imgDb)
 	imgDbCur = imgDbCon.cursor()
+
 	print('Checking tables')
 	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
 		# Create tables if not present
-		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)')
+			# 'img_name' values are set to NULL to indicate page IDs where no image was found
 		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
 	else:
 		# Check for already-processed page IDs
@@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 			else:
 				print(f'Found already-processed page ID {pid} which was not in input set')
 		print(f'Will skip {numSkipped} already-processed page IDs')
-	#
+
 	print('Getting dump-file offsets')
 	offsetToPageids: dict[int, list[int]] = {}
 	offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
@@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 		iterNum += 1
 		if iterNum % 1e4 == 0:
 			print(f'At iteration {iterNum}')
-		#
+
 		query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
 		row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
 		if row is None:
@@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 			offsetToPageids[chunkOffset] = []
 		offsetToPageids[chunkOffset].append(pageId)
 	print(f'Found {len(offsetToEnd)} chunks to check')
-	#
+
 	print('Iterating through chunks in dump file')
 	with open(dumpFile, mode='rb') as file:
 		iterNum = 0
@@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 			iterNum += 1
 			if iterNum % 100 == 0:
 				print(f'At iteration {iterNum}')
-			#
+
 			chunkPageIds = offsetToPageids[pageOffset]
 			# Jump to chunk
 			file.seek(pageOffset)
@@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 					break
 				if not foundText:
 					print(f'WARNING: Did not find <text> for page id {pageId}')
-	#
+
 	print('Closing databases')
 	indexDbCon.close()
 	imgDbCon.commit()
 	imgDbCon.close()
+
 def getImageName(content: list[str]) -> str | None:
 	""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
-	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+	# Note: Doesn't try to find images in outside-infobox [[File:...]] and <imagemap> sections
 	for line in content:
 		match = IMG_LINE_REGEX.match(line)
 		if match is not None:
@@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None:
 			return None
 	return None
 
+# ========== For getting input page IDs ==========
+
 def getInputPageIdsFromDb(dbFile: str) -> set[int]:
 	print('Getting input page-ids')
 	pageIds: set[int] = set()
@@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]:
 	for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
 		pageIds.add(pageId)
 	dbCon.close()
+
 	print(f'Found {len(pageIds)}')
 	return pageIds
+
+# ========== Main block ==========
+
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
-	#
+
 	pageIds = getInputPageIdsFromDb(DB_FILE)
 	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
index 8aee1cc..95b4a60 100755
--- a/backend/tol_data/enwiki/gen_pageview_data.py
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
 """
 Reads through wikimedia files containing pageview counts,
 computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+	wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+	platform (eg: mobile-web), monthly view count,
+	daily count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
 """
 
 # Took about 15min per file (each had about 180e6 lines)
 
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
 from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
 
 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
 DUMP_INDEX_DB = 'dump_index.db'
 DB_FILE = 'pageview_data.db'
 
 def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
-	# Each pageview file has lines that seem to hold these space-separated fields:
-		# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
-		# platform (eg: mobile-web), monthly view count,
-		# hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
 	if os.path.exists(dbFile):
 		print('ERROR: Database already exists')
 		sys.exit(1)
-	#
+
 	namespaceRegex = re.compile(r'[a-zA-Z]+:')
 	titleToViews: dict[str, int] = defaultdict(int)
 	linePrefix = b'en.wikipedia '
@@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 					print(f'At line {lineNum}')
 				if not line.startswith(linePrefix):
 					continue
+
 				# Get second and second-last fields
 				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
 				title = line[:line.find(b' ')].decode('utf-8')
 				viewCount = int(line[line.rfind(b' ')+1:])
 				if namespaceRegex.match(title) is not None:
 					continue
+
 				# Update map
 				title = title.replace('_', ' ')
 				titleToViews[title] += viewCount
 	print(f'Found {len(titleToViews)} titles')
-	#
+
 	print('Writing to db')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
@@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 	idbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
index f744818..c4d0932 100755
--- a/backend/tol_data/enwiki/lookup_page.py
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index
 db, and prints the corresponding <page>.
 """
 
+import argparse
 import sys
 import bz2
 import sqlite3
@@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
 	_, pageOffset, endOffset = row
 	dbCon.close()
 	print(f'Found chunk at offset {pageOffset}')
-	#
+
 	print('Reading from wiki dump')
 	content: list[str] = []
 	with open(dumpFile, mode='rb') as file:
@@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
 		file.seek(pageOffset)
 		compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
 		data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+
 		# Look in chunk for page
 		lines = data.splitlines()
 		lineIdx = 0
@@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
 						if line.lstrip() == '</page>':
 							break
 			lineIdx += 1
-	#
+
 	print('Content: ')
 	print('\n'.join(content))
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.add_argument('title', help='The title to look up')
 	args = parser.parse_args()
-	#
+
 	lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' '))
-- 
cgit v1.2.3