diff options
| -rw-r--r-- | backend/hist_data/README.md | 25 | ||||
| -rw-r--r-- | backend/hist_data/enwiki/README.md | 20 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 3 | ||||
| -rwxr-xr-x | backend/hist_data/gen_pop_data.py | 49 | ||||
| -rw-r--r-- | backend/tests/test_gen_pop_data.py | 43 |
5 files changed, 122 insertions, 18 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index c55549e..5b64462 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -18,27 +18,38 @@ This directory holds files used to generate the history database data.db. - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. - If 0, they denote a number of years CE (if positive) or BCE (if negative). +- `pop`: <br> + Format: `id INT PRIMARY KEY, pop INT` <br> + Associates each event with a popularity measure (currently an average monthly viewcount) # Generating the Database +## Environment +Some of the scripts use third-party packages: +- `jdcal`: For date conversion +- `indexed_bzip2`: For parallelised bzip2 processing. +- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps. +- `requests`: For downloading data. + ## Generate Event Data 1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README. 1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table. -## Generate Description Data -1. Obtain an enwiki dump in enwiki/, as specified in the README. -1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. -1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. -1. Run - ## Generate Popularity Data 1. Obtain 'page view files' in enwiki/, as specified in its README. -1. Run +1. Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table. ## Generate Image Data and Popularity Data 1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`, looks for infobox image names, and stores them in an image database. + Uses popularity data in enwiki/ to find the top N events in each event category. 1. 
In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found images, and adds them to the image database. 1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. +1. Run + +## Generate Description Data +1. Obtain an enwiki dump in enwiki/, as specified in the README. +1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. +1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. 1. Run diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index dd090ca..95795f3 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -19,16 +19,6 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Tables: <br> - `offsets`: `id INT PRIMARY KEY, title TEXT UNIQUE, offset INT, next_offset INT` -# Description Files -- `gen_desc_data.py` <br> - Reads through pages in the dump file, and adds short-description info to a database. -- `desc_data.db` <br> - Generated by `gen_desc_data.py`. <br> - Tables: <br> - - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` - - `redirects`: `id INT PRIMARY KEY, target TEXT` - - `descs`: `id INT PRIMARY KEY, desc TEXT` - # Page View Files - `pageviews/pageviews-*-user.bz2` Each holds wikimedia article page view data for some month. @@ -58,3 +48,13 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. - `download_imgs.py` <br> Used to download image files into imgs/. + +# Description Files +- `gen_desc_data.py` <br> + Reads through pages in the dump file, and adds short-description info to a database. +- `desc_data.db` <br> + Generated by `gen_desc_data.py`. 
<br> + Tables: <br> + - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index b37a107..90ec925 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database """ -# Took about 15min per file (each had about 180e6 lines) +# Took about 10min per file (each had about 180e6 lines) import sys, os, glob, math, re from collections import defaultdict @@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: if namespaceRegex.match(title) is not None: continue # Update map + title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') # diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py new file mode 100755 index 0000000..46c9c68 --- /dev/null +++ b/backend/hist_data/gen_pop_data.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 + +""" +Adds Wikipedia page view info to the database as popularity values. 
+""" + +import os, sqlite3 + +PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db') +DB_FILE = 'data.db' + +def genData(pageviewsDb: str, dbFile: str) -> None: + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Getting event data') + titleToId: dict[str, int] = {} + for eventId, title in dbCur.execute('SELECT id, title FROM events'): + titleToId[title] = eventId + # + print('Getting view counts') + pdbCon = sqlite3.connect(pageviewsDb) + pdbCur = pdbCon.cursor() + titleToViews: dict[str, int] = {} + iterNum = 0 + for title, views in pdbCur.execute('SELECT title, views from views'): + iterNum += 1 + if iterNum % 1e6 == 0: + print(f'At iteration {iterNum}') + # + if title not in titleToId: + continue + titleToViews[title] = views + pdbCon.close() + # + print(f'Result: {len(titleToViews)} out of {len(titleToId)}') + dbCur.execute('CREATE TABLE pop (id INT PRIMARY KEY, pop INT)') + for title, views in titleToViews.items(): + dbCur.execute('INSERT INTO pop VALUES (?, ?)', (titleToId[title], views)) + # + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(PAGEVIEWS_DB, DB_FILE) diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py new file mode 100644 index 0000000..2f505f0 --- /dev/null +++ b/backend/tests/test_gen_pop_data.py @@ -0,0 +1,43 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.gen_pop_data import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp pageviews db + pageviewsDb = os.path.join(tempDir, 'pageview_data.db') + createTestDbTable( + pageviewsDb, + 'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)', + 'INSERT INTO views VALUES (?, ?, ?)', + { + ('one', 
1, 10), + ('two', 2, 20), + ('three', 3, 30), + } + ) + # Create temp history db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (11, 'one', 100, None, None, None, 0, 'event'), + (33, 'three', 100, None, None, None, 0, 'event'), + } + ) + # Run + genData(pageviewsDb, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, pop from pop'), + { + (11, 10), + (33, 30) + } + ) |
