diff options
Diffstat (limited to 'backend/tolData/enwiki')
| -rw-r--r-- | backend/tolData/enwiki/README.md | 16 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/genPageviewData.py | 62 |
2 files changed, 76 insertions, 2 deletions
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md index 7df21c9..76f9ee5 100644 --- a/backend/tolData/enwiki/README.md +++ b/backend/tolData/enwiki/README.md @@ -2,8 +2,8 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Downloaded Files - enwiki-20220501-pages-articles-multistream.xml.bz2 <br> - Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror). Contains text content and metadata for pages in enwiki. + Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror). Some file content and format information was available from <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>. - enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br> @@ -13,7 +13,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Dump-Index Files - genDumpIndexDb.py <br> - Creates an sqlite-database version of the enwiki-dump index file. + Creates a database version of the enwiki-dump index file. - dumpIndex.db <br> Generated by genDumpIndexDb.py. <br> Tables: <br> @@ -45,6 +45,18 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. - downloadImgs.py <br> Used to download image files into imgs/. +# Page View Files +- pageviews/pageviews-*-user.bz2 + Each holds wikimedia article page view data for some month. + Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>. + Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>. +- genPageviewData.py <br> + Reads pageviews/*, and creates a database holding average monthly pageview counts. +- pageviewData.db <br> + Generated using genPageviewData.py. 
#!/usr/bin/python3
"""Reads wikimedia pageview-count dump files from pageviews/, sums the
monthly per-article counts, averages them across the input files, and
stores the result in an sqlite database (one row per article title)."""

import sys, os, glob, math, re
from collections import defaultdict
import bz2, sqlite3

import argparse
parser = argparse.ArgumentParser(description='''
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
''', formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()

pageviewFiles = glob.glob('./pageviews/pageviews-*-user.bz2')
dbFile = 'pageviewData.db'
dumpIndexDb = 'dumpIndex.db'

# Took about 15min per file (each about 180e6 lines)

if os.path.exists(dbFile):
	print('ERROR: Database already exists')
	sys.exit(1)
if not pageviewFiles:
	# Bail out early: with no input files there is nothing to average,
	# and the per-title division by len(pageviewFiles) below would be
	# a division by zero if any title were found.
	print('ERROR: No pageview files found in pageviews/')
	sys.exit(1)

# Each pageview file has lines that seem to hold these space-separated fields:
	# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
	# platform (eg: mobile-web), monthly view count,
	# hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
namespaceRegex = re.compile(r'[a-zA-Z]+:') # Matches titles in a non-article namespace (eg: 'Talk:')
titleToViews = defaultdict(int) # Maps article title to summed view count
linePrefix = b'en.wikipedia '
for filename in pageviewFiles:
	# BUGFIX: was an f-string with no placeholder ('Reading from (unknown)')
	print(f'Reading from {filename}')
	with bz2.open(filename, 'rb') as file:
		for lineNum, line in enumerate(file, 1):
			if lineNum % 1_000_000 == 0: # Was float '1e6'; integer modulo is the intent
				print(f'At line {lineNum}')
			if not line.startswith(linePrefix):
				continue
			# Get second and second-last fields (title and monthly view count)
			line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
			title = line[:line.find(b' ')].decode('utf-8')
			viewCount = int(line[line.rfind(b' ')+1:])
			if namespaceRegex.match(title) is not None:
				continue
			# Accumulate across platforms (desktop/mobile-web/etc) and across files
			titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')

print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
idbCon = sqlite3.connect(dumpIndexDb)
idbCur = idbCon.cursor()
dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
for title, views in titleToViews.items():
	# Only keep titles that exist in the dump-index db, and record their page IDs
	row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
	if row is not None:
		wikiId = int(row[0])
		# Store the average monthly count across all input files, rounded down
		dbCur.execute('INSERT INTO views VALUES (?, ?, ?)',
			(title, wikiId, math.floor(views / len(pageviewFiles))))
dbCon.commit()
dbCon.close()
idbCon.close()
