diff options
Diffstat (limited to 'backend/hist_data/enwiki/gen_pageview_data.py')
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 68 |
1 files changed, 68 insertions, 0 deletions
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py new file mode 100755 index 0000000..b37a107 --- /dev/null +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -0,0 +1,68 @@ +#!/usr/bin/python3 + +""" +Reads through wikimedia files containing pageview counts, +computes average counts, and adds them to a database +""" + +# Took about 15min per file (each had about 180e6 lines) + +import sys, os, glob, math, re +from collections import defaultdict +import bz2, sqlite3 + +PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') +DUMP_INDEX_DB = 'dump_index.db' +DB_FILE = 'pageview_data.db' + +def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: + # Each pageview file has lines that seem to hold these space-separated fields: + # wiki code (eg: en.wikipedia), article title, page ID (may be: null), + # platform (eg: mobile-web), monthly view count, + # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) + if os.path.exists(dbFile): + print('ERROR: Database already exists') + sys.exit(1) + # + namespaceRegex = re.compile(r'[a-zA-Z]+:') + titleToViews: dict[str, int] = defaultdict(int) + linePrefix = b'en.wikipedia ' + for filename in pageviewFiles: + print(f'Reading from {filename}') + with bz2.open(filename, 'rb') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e6 == 0: + print(f'At line {lineNum}') + if not line.startswith(linePrefix): + continue + # Get second and second-last fields + line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields + title = line[:line.find(b' ')].decode('utf-8') + viewCount = int(line[line.rfind(b' ')+1:]) + if namespaceRegex.match(title) is not None: + continue + # Update map + titleToViews[title] += viewCount + print(f'Found {len(titleToViews)} titles') + # + print('Writing to db') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + idbCon = sqlite3.connect(dumpIndexDb) + idbCur = idbCon.cursor() + dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)') + for title, views in titleToViews.items(): + row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() + if row is not None: + wikiId = int(row[0]) + dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles)))) + dbCon.commit() + dbCon.close() + idbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) |
