1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
#!/usr/bin/python3
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
Each pageview file has lines that seem to hold these space-separated fields:
wiki code (eg: en.wikipedia), article title, page ID (may be: null),
platform (eg: mobile-web), monthly view count,
hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
"""
# Note: Took about 10min per file (each had about 180e6 lines)
import argparse
import sys
import os
import glob
import math
import re
from collections import defaultdict
import bz2
import sqlite3
# Input pageview dumps: one bz2 file per month in ./pageviews (per-user variant)
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
# Existing SQLite db; its 'offsets' table maps article title -> page id (read-only here)
DUMP_INDEX_DB = 'dump_index.db'
# Output SQLite db to be created; must not already exist
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
    """Aggregate per-title view counts from bz2 pageview dumps into a SQLite db.

    Reads every file in pageviewFiles, keeps only 'en.wikipedia' lines in the
    main article namespace, sums the monthly view counts per title, then writes
    one row (title, page id looked up in dumpIndexDb's 'offsets' table, average
    monthly views across the input files) into a new 'views' table in dbFile.

    Exits with status 1 if dbFile already exists or pageviewFiles is empty.
    """
    if os.path.exists(dbFile):
        print('ERROR: Database already exists')
        sys.exit(1)
    if not pageviewFiles:
        # Guard: the per-file average below would otherwise divide by zero
        print('ERROR: No pageview files given')
        sys.exit(1)
    # Titles like 'Talk:Foo' / 'Category:Bar' are non-article namespaces
    namespaceRegex = re.compile(r'[a-zA-Z]+:')
    titleToViews: dict[str, int] = defaultdict(int)
    linePrefix = b'en.wikipedia '
    for filename in pageviewFiles:
        print(f'Reading from {filename}')  # fixed: was a literal '(unknown)' placeholder
        with bz2.open(filename, 'rb') as file:
            for lineNum, line in enumerate(file, 1):
                if lineNum % 1_000_000 == 0:
                    print(f'At line {lineNum}')
                if not line.startswith(linePrefix):
                    continue
                # Keep the 2nd field (title) and 2nd-last field (monthly count):
                # drop the wiki-code prefix and the trailing hourly-count field.
                linePart = line[len(linePrefix):line.rfind(b' ')]
                title = linePart[:linePart.find(b' ')].decode('utf-8')
                try:
                    viewCount = int(linePart[linePart.rfind(b' ')+1:])
                except ValueError:
                    print(f'Unable to read count in line {lineNum}: {line}')
                    continue
                if namespaceRegex.match(title) is not None:
                    continue
                # Dump-index titles use spaces where pageview titles use underscores
                title = title.replace('_', ' ')
                titleToViews[title] += viewCount
    print(f'Found {len(titleToViews)} titles')
    print('Writing to db')
    dbCon = sqlite3.connect(dbFile)
    idbCon = sqlite3.connect(dumpIndexDb)
    try:
        dbCur = dbCon.cursor()
        idbCur = idbCon.cursor()
        dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
        numFiles = len(pageviewFiles)
        for title, views in titleToViews.items():
            # Only keep titles present in the dump index (known page ids)
            row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
            if row is not None:
                wikiId = int(row[0])
                # Floor-average of monthly views; // equals math.floor for non-negative ints
                dbCur.execute('INSERT INTO views VALUES (?, ?, ?)',
                              (title, wikiId, views // numFiles))
        dbCon.commit()
    finally:
        # Close both connections even if table creation or a query fails
        dbCon.close()
        idbCon.close()
if __name__ == '__main__':
    # No options beyond -h; the parser exists only to expose the module
    # docstring as --help text and to reject unexpected arguments.
    argParser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    argParser.parse_args()
    genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
|