diff options
| -rw-r--r-- | backend/hist_data/README.md | 25 | ||||
| -rw-r--r-- | backend/hist_data/enwiki/README.md | 20 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 3 | ||||
| -rwxr-xr-x | backend/hist_data/gen_pop_data.py | 49 | ||||
| -rw-r--r-- | backend/tests/test_gen_pop_data.py | 43 |
5 files changed, 122 insertions, 18 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index c55549e..5b64462 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -18,27 +18,38 @@ This directory holds files used to generate the history database data.db. - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. - If 0, they denote a number of years CE (if positive) or BCE (if negative). +- `pop`: <br> + Format: `id INT PRIMARY KEY, pop INT` <br> + Associates each event with a popularity measure (currently an average monthly viewcount) # Generating the Database +## Environment +Some of the scripts use third-party packages: +- `jdcal`: For date conversion +- `indexed_bzip2`: For parallelised bzip2 processing. +- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps. +- `requests`: For downloading data. + ## Generate Event Data 1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README. 1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table. -## Generate Description Data -1. Obtain an enwiki dump in enwiki/, as specified in the README. -1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. -1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. -1. Run - ## Generate Popularity Data 1. Obtain 'page view files' in enwiki/, as specified in its README. -1. Run +1. Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table. ## Generate Image Data and Popularity Data 1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`, looks for infobox image names, and stores them in an image database. + Uses popularity data in enwiki/ to find the top N events in each event category. 1. 
In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found images, and adds them to the image database. 1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. +1. Run + +## Generate Description Data +1. Obtain an enwiki dump in enwiki/, as specified in the README. +1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. +1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. 1. Run diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index dd090ca..95795f3 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -19,16 +19,6 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Tables: <br> - `offsets`: `id INT PRIMARY KEY, title TEXT UNIQUE, offset INT, next_offset INT` -# Description Files -- `gen_desc_data.py` <br> - Reads through pages in the dump file, and adds short-description info to a database. -- `desc_data.db` <br> - Generated by `gen_desc_data.py`. <br> - Tables: <br> - - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` - - `redirects`: `id INT PRIMARY KEY, target TEXT` - - `descs`: `id INT PRIMARY KEY, desc TEXT` - # Page View Files - `pageviews/pageviews-*-user.bz2` Each holds wikimedia article page view data for some month. @@ -58,3 +48,13 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. - `download_imgs.py` <br> Used to download image files into imgs/. + +# Description Files +- `gen_desc_data.py` <br> + Reads through pages in the dump file, and adds short-description info to a database. +- `desc_data.db` <br> + Generated by `gen_desc_data.py`. 
<br> + Tables: <br> + - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index b37a107..90ec925 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database """ -# Took about 15min per file (each had about 180e6 lines) +# Took about 10min per file (each had about 180e6 lines) import sys, os, glob, math, re from collections import defaultdict @@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: if namespaceRegex.match(title) is not None: continue # Update map + title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') # diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py new file mode 100755 index 0000000..46c9c68 --- /dev/null +++ b/backend/hist_data/gen_pop_data.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 + +""" +Adds Wikipedia page view info to the database as popularity values. 
+""" + +import os, sqlite3 + +PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db') +DB_FILE = 'data.db' + +def genData(pageviewsDb: str, dbFile: str) -> None: + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Getting event data') + titleToId: dict[str, int] = {} + for eventId, title in dbCur.execute('SELECT id, title FROM events'): + titleToId[title] = eventId + # + print('Getting view counts') + pdbCon = sqlite3.connect(pageviewsDb) + pdbCur = pdbCon.cursor() + titleToViews: dict[str, int] = {} + iterNum = 0 + for title, views in pdbCur.execute('SELECT title, views from views'): + iterNum += 1 + if iterNum % 1e6 == 0: + print(f'At iteration {iterNum}') + # + if title not in titleToId: + continue + titleToViews[title] = views + pdbCon.close() + # + print(f'Result: {len(titleToViews)} out of {len(titleToId)}') + dbCur.execute('CREATE TABLE pop (id INT PRIMARY KEY, pop INT)') + for title, views in titleToViews.items(): + dbCur.execute('INSERT INTO pop VALUES (?, ?)', (titleToId[title], views)) + # + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(PAGEVIEWS_DB, DB_FILE) diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py new file mode 100644 index 0000000..2f505f0 --- /dev/null +++ b/backend/tests/test_gen_pop_data.py @@ -0,0 +1,43 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.gen_pop_data import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp pageviews db + pageviewsDb = os.path.join(tempDir, 'pageview_data.db') + createTestDbTable( + pageviewsDb, + 'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)', + 'INSERT INTO views VALUES (?, ?, ?)', + { + ('one', 
1, 10), + ('two', 2, 20), + ('three', 3, 30), + } + ) + # Create temp history db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (11, 'one', 100, None, None, None, 0, 'event'), + (33, 'three', 100, None, None, None, 0, 'event'), + } + ) + # Run + genData(pageviewsDb, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, pop from pop'), + { + (11, 10), + (33, 30) + } + ) |
