diff options
Diffstat (limited to 'backend/hist_data/enwiki/gen_pageview_data.py')
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 30 |
1 files changed, 19 insertions, 11 deletions
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 935b303..57d6c7b 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -3,27 +3,34 @@ """ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database + +Each pageview file has lines that seem to hold these space-separated fields: + wiki code (eg: en.wikipedia), article title, page ID (may be: null), + platform (eg: mobile-web), monthly view count, + hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) """ -# Took about 10min per file (each had about 180e6 lines) +# Note: Took about 10min per file (each had about 180e6 lines) -import sys, os, glob, math, re +import argparse +import sys +import os +import glob +import math +import re from collections import defaultdict -import bz2, sqlite3 +import bz2 +import sqlite3 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') DUMP_INDEX_DB = 'dump_index.db' DB_FILE = 'pageview_data.db' def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: - # Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) - # + namespaceRegex = re.compile(r'[a-zA-Z]+:') titleToViews: dict[str, int] = defaultdict(int) linePrefix = b'en.wikipedia ' @@ -35,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: print(f'At line {lineNum}') if not line.startswith(linePrefix): continue + # Get second and second-last fields linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields title = linePart[:linePart.find(b' ')].decode('utf-8') @@ -45,11 +53,12 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: continue if namespaceRegex.match(title) is not None: continue + # Update map title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -66,8 +75,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: idbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) |
