1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
#!/usr/bin/python3
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
Each pageview file has lines that seem to hold these space-separated fields:
wiki code (eg: en.wikipedia), article title, page ID (may be: null),
platform (eg: mobile-web), monthly view count,
hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
"""
# Took about 15min per file (each had about 180e6 lines)
import argparse
import sys
import os
import glob
import math
import re
from collections import defaultdict
import bz2
import sqlite3
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
DUMP_INDEX_DB = 'dump_index.db'
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
namespaceRegex = re.compile(r'[a-zA-Z]+:')
titleToViews: dict[str, int] = defaultdict(int)
linePrefix = b'en.wikipedia '
for filename in pageviewFiles:
print(f'Reading from {filename}')
with bz2.open(filename, 'rb') as file:
for lineNum, line in enumerate(file, 1):
if lineNum % 1e6 == 0:
print(f'At line {lineNum}')
if not line.startswith(linePrefix):
continue
# Get second and second-last fields
line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
title = line[:line.find(b' ')].decode('utf-8')
viewCount = int(line[line.rfind(b' ')+1:])
if namespaceRegex.match(title) is not None:
continue
# Update map
title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
idbCon = sqlite3.connect(dumpIndexDb)
idbCur = idbCon.cursor()
dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
for title, views in titleToViews.items():
row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
if row is not None:
wikiId = int(row[0])
dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles))))
dbCon.commit()
dbCon.close()
idbCon.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
|