aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-10-02 12:23:19 +1100
committerTerry Truong <terry06890@gmail.com>2022-10-02 12:23:19 +1100
commitd70b96295d768aa0c80bf66639ad7a56bdef92a8 (patch)
tree3f960ad83e4158fff1c0931d838033392a3391ec /backend
parent1b4fc8667714ef4ce9f326bd14f795fc2417ecb9 (diff)
Add gen_pop_data.py
Diffstat (limited to 'backend')
-rw-r--r--backend/hist_data/README.md25
-rw-r--r--backend/hist_data/enwiki/README.md20
-rwxr-xr-xbackend/hist_data/enwiki/gen_pageview_data.py3
-rwxr-xr-xbackend/hist_data/gen_pop_data.py49
-rw-r--r--backend/tests/test_gen_pop_data.py43
5 files changed, 122 insertions, 18 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index c55549e..5b64462 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -18,27 +18,38 @@ This directory holds files used to generate the history database data.db.
- If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'.
For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not.
- If 0, they denote a number of years CE (if positive) or BCE (if negative).
+- `pop`: <br>
+ Format: `id INT PRIMARY KEY, pop INT` <br>
+ Associates each event with a popularity measure (currently an average monthly viewcount)
# Generating the Database
+## Environment
+Some of the scripts use third-party packages:
+- `jdcal`: For date conversion
+- `indexed_bzip2`: For parallelised bzip2 processing.
+- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps.
+- `requests`: For downloading data.
+
## Generate Event Data
1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README.
1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table.
-## Generate Description Data
-1. Obtain an enwiki dump in enwiki/, as specified in the README.
-1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
-1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
-1. Run
-
## Generate Popularity Data
1. Obtain 'page view files' in enwiki/, as specified in its README.
-1. Run
+1. Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table.
## Generate Image Data and Popularity Data
1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`,
looks for infobox image names, and stores them in an image database.
+ Uses popularity data in enwiki/ to find the top N events in each event category.
1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found
images, and adds them to the image database.
1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/.
+1. Run
+
+## Generate Description Data
+1. Obtain an enwiki dump in enwiki/, as specified in the README.
+1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
+1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
1. Run
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index dd090ca..95795f3 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -19,16 +19,6 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Tables: <br>
- `offsets`: `id INT PRIMARY KEY, title TEXT UNIQUE, offset INT, next_offset INT`
-# Description Files
-- `gen_desc_data.py` <br>
- Reads through pages in the dump file, and adds short-description info to a database.
-- `desc_data.db` <br>
- Generated by `gen_desc_data.py`. <br>
- Tables: <br>
- - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
- - `redirects`: `id INT PRIMARY KEY, target TEXT`
- - `descs`: `id INT PRIMARY KEY, desc TEXT`
-
# Page View Files
- `pageviews/pageviews-*-user.bz2`
Each holds wikimedia article page view data for some month.
@@ -58,3 +48,13 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
- `download_imgs.py` <br>
Used to download image files into imgs/.
+
+# Description Files
+- `gen_desc_data.py` <br>
+ Reads through pages in the dump file, and adds short-description info to a database.
+- `desc_data.db` <br>
+ Generated by `gen_desc_data.py`. <br>
+ Tables: <br>
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
+ - `redirects`: `id INT PRIMARY KEY, target TEXT`
+ - `descs`: `id INT PRIMARY KEY, desc TEXT`
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index b37a107..90ec925 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
"""
-# Took about 15min per file (each had about 180e6 lines)
+# Took about 10min per file (each had about 180e6 lines)
import sys, os, glob, math, re
from collections import defaultdict
@@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if namespaceRegex.match(title) is not None:
continue
# Update map
+ title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
#
diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py
new file mode 100755
index 0000000..46c9c68
--- /dev/null
+++ b/backend/hist_data/gen_pop_data.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+
+"""
+Adds Wikipedia page view info to the database as popularity values.
+"""
+
+import os, sqlite3
+
+PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
+DB_FILE = 'data.db'
+
def genData(pageviewsDb: str, dbFile: str) -> None:
    """
    Creates the 'pop' table in dbFile, holding a popularity value for events.

    For each title in dbFile's 'events' table that also appears in pageviewsDb's
    'views' table, a row (event id, view count) is added to 'pop'.

    pageviewsDb: Path to an sqlite database with a 'views' table that has 'title' and 'views' columns.
    dbFile: Path to an sqlite database with an 'events' table that has 'id' and 'title' columns.

    Raises sqlite3.Error if a query fails (eg: the 'pop' table already exists).
    Both database connections are closed even when that happens.
    """
    dbCon = sqlite3.connect(dbFile)
    try:
        dbCur = dbCon.cursor()
        #
        print('Getting event data')
        titleToId: dict[str, int] = {
            title: eventId for eventId, title in dbCur.execute('SELECT id, title FROM events')}
        #
        print('Getting view counts')
        titleToViews: dict[str, int] = {}
        pdbCon = sqlite3.connect(pageviewsDb)
        try:
            rows = pdbCon.execute('SELECT title, views from views')
            for iterNum, (title, views) in enumerate(rows, 1):
                if iterNum % 1_000_000 == 0:
                    print(f'At iteration {iterNum}')
                # Only titles that correspond to a known event are kept
                if title in titleToId:
                    titleToViews[title] = views
        finally:
            pdbCon.close()
        #
        print(f'Result: {len(titleToViews)} out of {len(titleToId)}')
        dbCur.execute('CREATE TABLE pop (id INT PRIMARY KEY, pop INT)')
        dbCur.executemany(
            'INSERT INTO pop VALUES (?, ?)',
            ((titleToId[title], views) for title, views in titleToViews.items()))
        #
        dbCon.commit()
    finally:
        dbCon.close()
+
if __name__ == '__main__':
    # No options are defined, but the arguments are still parsed before generating data
    import argparse
    argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    ).parse_args()
    #
    genData(PAGEVIEWS_DB, DB_FILE)
diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py
new file mode 100644
index 0000000..2f505f0
--- /dev/null
+++ b/backend/tests/test_gen_pop_data.py
@@ -0,0 +1,43 @@
+import unittest
+import tempfile, os
+
+from tests.common import createTestDbTable, readTestDbTable
+from hist_data.gen_pop_data import genData
+
class TestGenData(unittest.TestCase):
    """ Checks that genData() fills the 'pop' table using titles present in both databases """
    def test_gen(self):
        # Pageview rows: (title, id, views)
        viewRows = {
            ('one', 1, 10),
            ('two', 2, 20),
            ('three', 3, 30),
        }
        # Event rows, of which only 'one' and 'three' have pageview entries
        eventRows = {
            (11, 'one', 100, None, None, None, 0, 'event'),
            (33, 'three', 100, None, None, None, 0, 'event'),
        }
        eventsSchema = (
            'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, '
            'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)'
        )
        # Expected rows: (event id, views)
        expectedPop = {
            (11, 10),
            (33, 30),
        }
        with tempfile.TemporaryDirectory() as workDir:
            # Create temp pageviews db
            viewsPath = os.path.join(workDir, 'pageview_data.db')
            createTestDbTable(
                viewsPath,
                'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)',
                'INSERT INTO views VALUES (?, ?, ?)',
                viewRows,
            )
            # Create temp history db
            histPath = os.path.join(workDir, 'data.db')
            createTestDbTable(
                histPath,
                eventsSchema,
                'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
                eventRows,
            )
            # Run
            genData(viewsPath, histPath)
            # Check
            popRows = readTestDbTable(histPath, 'SELECT id, pop from pop')
            self.assertEqual(popRows, expectedPop)