aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data')
-rw-r--r--backend/hist_data/README.md25
-rw-r--r--backend/hist_data/enwiki/README.md20
-rwxr-xr-xbackend/hist_data/enwiki/gen_pageview_data.py3
-rwxr-xr-xbackend/hist_data/gen_pop_data.py49
4 files changed, 79 insertions, 18 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index c55549e..5b64462 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -18,27 +18,38 @@ This directory holds files used to generate the history database data.db.
- If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'.
For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not.
- If 0, they denote a number of years CE (if positive) or BCE (if negative).
+- `pop`: <br>
+ Format: `id INT PRIMARY KEY, pop INT` <br>
+  Associates each event with a popularity measure (currently an average monthly view count).
# Generating the Database
+## Environment
+Some of the scripts use third-party packages:
+- `jdcal`: For date conversion.
+- `indexed_bzip2`: For parallelised bzip2 processing.
+- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps.
+- `requests`: For downloading data.
+
## Generate Event Data
1. Obtain a Wikidata JSON dump in wikidata/, as specified in it's README.
1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table.
-## Generate Description Data
-1. Obtain an enwiki dump in enwiki/, as specified in the README.
-1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
-1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
-1. Run
-
## Generate Popularity Data
1. Obtain 'page view files' in enwiki/, as specified in it's README.
-1. Run
+1. Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table.
## Generate Image Data and Popularity Data
1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`,
looks for infobox image names, and stores them in an image database.
+ Uses popularity data in enwiki/ to find the top N events in each event category.
1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found
images, and adds them to the image database.
1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/.
+1. Run
+
+## Generate Description Data
+1. Obtain an enwiki dump in enwiki/, as specified in the README.
+1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
+1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
1. Run
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index dd090ca..95795f3 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -19,16 +19,6 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Tables: <br>
- `offsets`: `id INT PRIMARY KEY, title TEXT UNIQUE, offset INT, next_offset INT`
-# Description Files
-- `gen_desc_data.py` <br>
- Reads through pages in the dump file, and adds short-description info to a database.
-- `desc_data.db` <br>
- Generated by `gen_desc_data.py`. <br>
- Tables: <br>
- - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
- - `redirects`: `id INT PRIMARY KEY, target TEXT`
- - `descs`: `id INT PRIMARY KEY, desc TEXT`
-
# Page View Files
- `pageviews/pageviews-*-user.bz2`
Each holds wikimedia article page view data for some month.
@@ -58,3 +48,13 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
- `download_imgs.py` <br>
Used to download image files into imgs/.
+
+# Description Files
+- `gen_desc_data.py` <br>
+ Reads through pages in the dump file, and adds short-description info to a database.
+- `desc_data.db` <br>
+ Generated by `gen_desc_data.py`. <br>
+ Tables: <br>
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
+ - `redirects`: `id INT PRIMARY KEY, target TEXT`
+ - `descs`: `id INT PRIMARY KEY, desc TEXT`
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index b37a107..90ec925 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
"""
-# Took about 15min per file (each had about 180e6 lines)
+# Took about 10min per file (each had about 180e6 lines)
import sys, os, glob, math, re
from collections import defaultdict
@@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if namespaceRegex.match(title) is not None:
continue
# Update map
+ title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
#
diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py
new file mode 100755
index 0000000..46c9c68
--- /dev/null
+++ b/backend/hist_data/gen_pop_data.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+
+"""
+Adds Wikipedia page view info to the database as popularity values.
+"""
+
+import os, sqlite3
+
+PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
+DB_FILE = 'data.db'
+
+def genData(pageviewsDb: str, dbFile: str) -> None:
+ dbCon = sqlite3.connect(dbFile)
+ dbCur = dbCon.cursor()
+ #
+ print('Getting event data')
+ titleToId: dict[str, int] = {}
+ for eventId, title in dbCur.execute('SELECT id, title FROM events'):
+ titleToId[title] = eventId
+ #
+ print('Getting view counts')
+ pdbCon = sqlite3.connect(pageviewsDb)
+ pdbCur = pdbCon.cursor()
+ titleToViews: dict[str, int] = {}
+ iterNum = 0
+ for title, views in pdbCur.execute('SELECT title, views from views'):
+ iterNum += 1
+ if iterNum % 1e6 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ if title not in titleToId:
+ continue
+ titleToViews[title] = views
+ pdbCon.close()
+ #
+ print(f'Result: {len(titleToViews)} out of {len(titleToId)}')
+ dbCur.execute('CREATE TABLE pop (id INT PRIMARY KEY, pop INT)')
+ for title, views in titleToViews.items():
+ dbCur.execute('INSERT INTO pop VALUES (?, ?)', (titleToId[title], views))
+ #
+ dbCon.commit()
+ dbCon.close()
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ args = parser.parse_args()
+ #
+ genData(PAGEVIEWS_DB, DB_FILE)