1 files changed, 18 insertions, 10 deletions
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
index 8aee1cc..95b4a60 100755
--- a/backend/tol_data/enwiki/gen_pageview_data.py
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
 """
 Reads through wikimedia files containing pageview counts,
 computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+	wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+	platform (eg: mobile-web), monthly view count,
+	daily count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
 """
 
 # Took about 15min per file (each had about 180e6 lines)
 
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
 from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
 
 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
 DUMP_INDEX_DB = 'dump_index.db'
 DB_FILE = 'pageview_data.db'
 
 def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
-	# Each pageview file has lines that seem to hold these space-separated fields:
-		# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
-		# platform (eg: mobile-web), monthly view count,
-		# hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
 	if os.path.exists(dbFile):
 		print('ERROR: Database already exists')
 		sys.exit(1)
-	#
+
 	namespaceRegex = re.compile(r'[a-zA-Z]+:')
 	titleToViews: dict[str, int] = defaultdict(int)
 	linePrefix = b'en.wikipedia '
@@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 					print(f'At line {lineNum}')
 				if not line.startswith(linePrefix):
 					continue
+
 				# Get second and second-last fields
 				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
 				title = line[:line.find(b' ')].decode('utf-8')
 				viewCount = int(line[line.rfind(b' ')+1:])
 				if namespaceRegex.match(title) is not None:
 					continue
+
 				# Update map
 				title = title.replace('_', ' ')
 				titleToViews[title] += viewCount
 	print(f'Found {len(titleToViews)} titles')
-	#
+
 	print('Writing to db')
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
@@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 	idbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
-	#
+
 	genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)