#!/usr/bin/python3
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
"""
# Took about 15min per file (each had about 180e6 lines)
import sys, os, glob, math, re
from collections import defaultdict
import bz2, sqlite3
# Input dumps: bz2-compressed pageview files matching the '-user' naming pattern
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
# Existing SQLite db whose 'offsets' table maps article titles to page ids
DUMP_INDEX_DB = 'dumpIndex.db'
# Output SQLite db to create (genData refuses to run if it already exists)
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
    """Aggregate per-title pageview counts into a new SQLite database.

    Reads each bz2-compressed pageview file, sums the monthly view counts of
    en.wikipedia article titles (skipping namespaced pages such as 'Talk:...'),
    averages the totals across the number of input files, and writes rows of
    (title, id, views) to dbFile. Each title's id is resolved from the
    'offsets' table of dumpIndexDb; titles absent from that table are dropped.

    Exits with status 1 if dbFile already exists.
    """
    # Each pageview file has lines that seem to hold these space-separated fields:
    # wiki code (eg: en.wikipedia), article title, page ID (may be: null),
    # platform (eg: mobile-web), monthly view count,
    # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
    if os.path.exists(dbFile):
        print('ERROR: Database already exists')
        sys.exit(1)
    #
    titleToViews = _sumViewCounts(pageviewFiles)
    print(f'Found {len(titleToViews)} titles')
    #
    print('Writing to db')
    _writeViewsDb(titleToViews, dumpIndexDb, dbFile, len(pageviewFiles))
def _sumViewCounts(pageviewFiles: list[str]) -> dict[str, int]:
    # Sum monthly view counts per en.wikipedia article title across all input files.
    namespaceRegex = re.compile(r'[a-zA-Z]+:')  # matches non-article titles like 'Talk:Foo'
    titleToViews: dict[str, int] = defaultdict(int)
    linePrefix = b'en.wikipedia '
    for filename in pageviewFiles:
        print(f'Reading from {filename}')  # fixed: original f-string had no placeholder
        with bz2.open(filename, 'rb') as file:
            for lineNum, line in enumerate(file, 1):
                if lineNum % 1_000_000 == 0:  # progress indicator (integer modulo, not 1e6 float)
                    print(f'At line {lineNum}')
                if not line.startswith(linePrefix):
                    continue
                # Strip first (wiki code) and last (hourly count string) fields,
                # then take the first remaining field (title) and the last
                # remaining field (monthly view count).
                line = line[len(linePrefix):line.rfind(b' ')]
                title = line[:line.find(b' ')].decode('utf-8')
                viewCount = int(line[line.rfind(b' ')+1:])
                if namespaceRegex.match(title) is not None:
                    continue
                titleToViews[title] += viewCount
    return titleToViews
def _writeViewsDb(titleToViews: dict[str, int], dumpIndexDb: str, dbFile: str, numFiles: int) -> None:
    # Resolve each title's id from the dump index and store the averaged count.
    dbCon = sqlite3.connect(dbFile)
    idbCon = sqlite3.connect(dumpIndexDb)
    try:
        dbCur = dbCon.cursor()
        idbCur = idbCon.cursor()
        dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
        for title, views in titleToViews.items():
            row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
            if row is not None:  # titles missing from the dump index are silently dropped
                wikiId = int(row[0])
                # Integer floor division replaces math.floor(views / numFiles);
                # identical for the non-negative counts summed above.
                dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, views // numFiles))
        dbCon.commit()
    finally:
        # Close both connections even if the index lookup or insert fails.
        dbCon.close()
        idbCon.close()
if __name__ == '__main__':
    import argparse
    # No CLI options yet; argparse still gives us --help from the module docstring.
    argParser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    argParser.parse_args()
    #
    genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)