diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:21:03 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:32:01 +1100 |
| commit | 0a9b2c2e5eca8a04e37fbdd423379882863237c2 (patch) | |
| tree | 1812bdb6bb13e4f76fdd7ef04075b291f775c213 /backend/hist_data/enwiki | |
| parent | 8321e2f92dbc073b8f1de87895d6620a2021b22e (diff) | |
Adjust backend coding style
Increase line spacing, add section comments, etc.
Diffstat (limited to 'backend/hist_data/enwiki')
| -rwxr-xr-x | backend/hist_data/enwiki/download_img_license_info.py | 29 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_imgs.py | 27 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_desc_data.py | 51 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_dump_index_db.py | 17 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 44 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 30 |
6 files changed, 135 insertions, 63 deletions
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 43f2c43..6fd710c 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -10,12 +10,16 @@ at already-processed names to decide what to skip. """ import argparse -import re, time, signal -import sqlite3, urllib.parse, html +import re +import time +import signal +import sqlite3 +import urllib.parse +import html import requests IMG_DB = 'img_data.db' -# + API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' BATCH_SZ = 50 # Max 50 @@ -26,17 +30,18 @@ def downloadInfo(imgDb: str) -> None: print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Checking for table') if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \ 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - # + print('Reading image names') imgNames: set[str] = set() for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) print(f'Found {len(imgNames)}') - # + print('Checking for already-processed images') nextImgId = 1 oldSz = len(imgNames) @@ -45,7 +50,7 @@ def downloadInfo(imgDb: str) -> None: if imgId >= nextImgId: nextImgId = imgId + 1 print(f'Found {oldSz - len(imgNames)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -54,7 +59,7 @@ def downloadInfo(imgDb: str) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Iterating through image names') imgNameList = list(imgNames) iterNum = 0 @@ -65,9 +70,11 @@ def downloadInfo(imgDb: str) -> None: if interrupted: print(f'Exiting loop at iteration {iterNum}') break + # Get 
batch imgBatch = imgNameList[i:i+BATCH_SZ] imgBatch = ['File:' + x for x in imgBatch] + # Make request headers = { 'user-agent': USER_AGENT, @@ -90,6 +97,7 @@ def downloadInfo(imgDb: str) -> None: print(f'ERROR: Exception while downloading info: {e}') print('\tImage batch: ' + '|'.join(imgBatch)) continue + # Parse response-object if 'query' not in responseObj or 'pages' not in responseObj['query']: print('WARNING: Response object doesn\'t have page data') @@ -120,6 +128,7 @@ def downloadInfo(imgDb: str) -> None: if title not in imgNames: print(f'WARNING: Got title "{title}" not in image-name list') continue + if 'imageinfo' not in page: print(f'WARNING: No imageinfo section for page "{title}"') continue @@ -129,6 +138,7 @@ def downloadInfo(imgDb: str) -> None: artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup if artist is not None: artist = TAG_REGEX.sub(' ', artist).strip() @@ -140,11 +150,12 @@ def downloadInfo(imgDb: str) -> None: credit = WHITESPACE_REGEX.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) + # Add to db dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)', (nextImgId, title, license, artist, credit, restrictions, url)) nextImgId += 1 - # + print('Closing database') dbCon.commit() dbCon.close() @@ -152,5 +163,5 @@ def downloadInfo(imgDb: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadInfo(IMG_DB) diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index df40bae..e484b33 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,33 +9,38 @@ The 
program can be re-run to continue downloading, and looks in the output directory do decide what to skip. """ -# Took about a week to downloaded about 60k images +# Note: Took about a week to downloaded about 60k images import argparse -import re, os, time, signal +import re +import os +import time +import signal import sqlite3 -import urllib.parse, requests +import urllib.parse +import requests IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' - # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'. + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec. 
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): os.mkdir(outDir) + print('Checking for already-downloaded images') fileList = os.listdir(outDir) imgIdsDone: set[int] = set() for filename in fileList: imgIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(imgIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Starting downloads') iterNum = 0 query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs' @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for image ID {imgId}') @@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: timeout *= 2 print(f'New timeout: {timeout}') continue + print('Closing database') dbCon.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index bb2b845..194afe8 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ 
-5,30 +5,40 @@ Reads through the wiki dump, attempts to parse short-descriptions, and adds them to a database """ -# In testing, this script took over 10 hours to run, and generated about 5GB +# Note: In testing, this script took over 10 hours to run, and generated about 5GB import argparse -import sys, os, re -import bz2, html, mwxml, mwparserfromhell +import sys +import os +import re import sqlite3 +import bz2 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' -# Regexps + DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: + """ Reads dump, parses descriptions, and writes to db """ print('Creating database') if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') @@ -39,13 +49,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') 
with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +70,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. 
+ + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,18 +125,24 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: + """ Tries to remove markup from wikitext content """ content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: + """ Replaces underscores in wiki item title """ return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 6be8bc5..8872171 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -1,24 +1,28 @@ #!/usr/bin/python3 """ -Adds data from the wiki-dump index-file into a database +Converts data from the wiki-dump index-file into a database """ import argparse -import sys, os, re -import bz2, sqlite3 +import sys +import os +import re +import bz2 +import sqlite3 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines DB_FILE = 'dump_index.db' def genData(indexFile: str, dbFile: str) -> None: - """ Reads the index file and creates the db """ if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') + 
print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') lastOffset = 0 @@ -29,7 +33,7 @@ def genData(indexFile: str, dbFile: str) -> None: lineNum += 1 if lineNum % 1e5 == 0: print(f'At line {lineNum}') - # + match = lineRegex.fullmatch(line.rstrip()) assert match is not None offsetStr, pageId, title = match.group(1,2,3) @@ -49,6 +53,7 @@ def genData(indexFile: str, dbFile: str) -> None: dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) except sqlite3.IntegrityError as e: print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') dbCon.commit() dbCon.close() @@ -56,5 +61,5 @@ def genData(indexFile: str, dbFile: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 9aa3863..05df63d 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -8,35 +8,42 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. 
""" -import os, re -import bz2, html, urllib.parse +import argparse +import os +import re +import bz2 +import html +import urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# Regexps + ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) +# ========== For data generation ========== + def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: + """ Looks up page IDs in dump and creates database """ print('Opening databases') indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + print('Checking tables') if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)') # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') - else: - # Check for already-processed page IDs + else: # Check for already-processed page IDs numSkipped = 0 for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'): if pid in pageIds: @@ -45,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: else: print(f'Found already-processed page ID {pid} which was not in input set') print(f'Will skip {numSkipped} already-processed page IDs') - # + print('Getting dump-file offsets') offsetToPageId: dict[int, list[int]] = {} offsetToEnd: dict[int, int] = {} 
# Maps chunk-start offsets to their chunk-end offsets @@ -55,7 +62,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') - # + query = 'SELECT offset, next_offset, title FROM offsets WHERE id = ?' row = indexDbCur.execute(query, (pageId,)).fetchone() if row is None: @@ -68,7 +75,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: offsetToPageId[chunkOffset].append(pageId) pageIdToTitle[pageId] = title print(f'Found {len(offsetToEnd)} chunks to check') - # + print('Iterating through chunks in dump file') with open(dumpFile, mode='rb') as file: iterNum = 0 @@ -76,7 +83,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 100 == 0: print(f'At iteration {iterNum}') - # + chunkPageIds = offsetToPageId[pageOffset] # Jump to chunk file.seek(pageOffset) @@ -122,21 +129,24 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: content.append(line[:line.rfind('</text>')]) # Look for image-filename imageName = getImageName(content) - imgDbCur.execute('INSERT into page_imgs VALUES (?, ?, ?)', (pageId, None if imageName is None else pageIdToTitle[pageId], imageName)) + imgDbCur.execute( + 'INSERT into page_imgs VALUES (?, ?, ?)', + (pageId, None if imageName is None else pageIdToTitle[pageId], imageName)) break if not foundTextEnd: print(f'WARNING: Did not find </text> for page id {pageId}') break if not foundText: print(f'WARNING: Did not find <text> for page id {pageId}') - # + print('Closing databases') indexDbCon.close() imgDbCon.commit() imgDbCon.close() + def getImageName(content: list[str]) -> str | None: """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ - # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections + # Note: Doesn't try and find images in outside-infobox [[File:...]] 
and <imagemap> sections for line in content: match = IMG_LINE_REGEX.match(line) if match is not None: @@ -177,6 +187,8 @@ def getImageName(content: list[str]) -> str | None: return None return None +# ========== For getting input page IDs ========== + def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: print('Getting event data') titles: set[str] = set() @@ -184,6 +196,7 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: for (title,) in dbCon.execute('SELECT title from events'): titles.add(title) dbCon.close() + print('Getting page IDs') pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) @@ -193,12 +206,15 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: if row: pageIds.add(row[0]) dbCon.close() + print(f'Result: {len(pageIds)} out of {len(titles)}') return pageIds + +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 935b303..57d6c7b 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -3,27 +3,34 @@ """ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database + +Each pageview file has lines that seem to hold these space-separated fields: + wiki code (eg: en.wikipedia), article title, page ID (may be: null), + platform (eg: mobile-web), monthly view count, + hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) """ -# Took about 10min per file (each had about 180e6 lines) +# Note: Took about 10min per file (each had about 180e6 lines) -import sys, os, glob, math, re +import argparse +import sys +import os 
+import glob +import math +import re from collections import defaultdict -import bz2, sqlite3 +import bz2 +import sqlite3 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') DUMP_INDEX_DB = 'dump_index.db' DB_FILE = 'pageview_data.db' def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: - # Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) - # + namespaceRegex = re.compile(r'[a-zA-Z]+:') titleToViews: dict[str, int] = defaultdict(int) linePrefix = b'en.wikipedia ' @@ -35,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: print(f'At line {lineNum}') if not line.startswith(linePrefix): continue + # Get second and second-last fields linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields title = linePart[:linePart.find(b' ')].decode('utf-8') @@ -45,11 +53,12 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: continue if namespaceRegex.match(title) is not None: continue + # Update map title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -66,8 +75,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: idbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) |
