Add gen_pop_data.py

author: Terry Truong <terry06890@gmail.com> 2022-10-02 12:23:19 +1100
committer: Terry Truong <terry06890@gmail.com> 2022-10-02 12:23:19 +1100
commit: d70b96295d768aa0c80bf66639ad7a56bdef92a8 (patch)
tree: 3f960ad83e4158fff1c0931d838033392a3391ec /backend/hist_data/enwiki
parent: 1b4fc8667714ef4ce9f326bd14f795fc2417ecb9 (diff)
2 files changed, 12 insertions, 11 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index dd090ca..95795f3 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -19,16 +19,6 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
     Tables: <br>
     -   `offsets`: `id INT PRIMARY KEY, title TEXT UNIQUE, offset INT, next_offset INT`
 
-# Description Files
--   `gen_desc_data.py` <br>
-    Reads through pages in the dump file, and adds short-description info to a database.
--   `desc_data.db` <br>
-    Generated by `gen_desc_data.py`. <br>
-    Tables: <br>
-    -   `pages`:     `id INT PRIMARY KEY, title TEXT UNIQUE`
-    -   `redirects`: `id INT PRIMARY KEY, target TEXT`
-    -   `descs`:     `id INT PRIMARY KEY, desc TEXT`
-
 # Page View Files
 -   `pageviews/pageviews-*-user.bz2`
     Each holds wikimedia article page view data for some month.
@@ -58,3 +48,13 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
         Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
 -   `download_imgs.py` <br>
     Used to download image files into imgs/.
+
+# Description Files
+-   `gen_desc_data.py` <br>
+    Reads through pages in the dump file, and adds short-description info to a database.
+-   `desc_data.db` <br>
+    Generated by `gen_desc_data.py`. <br>
+    Tables: <br>
+    -   `pages`:     `id INT PRIMARY KEY, title TEXT UNIQUE`
+    -   `redirects`: `id INT PRIMARY KEY, target TEXT`
+    -   `descs`:     `id INT PRIMARY KEY, desc TEXT`
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index b37a107..90ec925 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts,
 computes average counts, and adds them to a database
 """
 
-# Took about 15min per file (each had about 180e6 lines)
+# Took about 10min per file (each had about 180e6 lines)
 
 import sys, os, glob, math, re
 from collections import defaultdict
@@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 				if namespaceRegex.match(title) is not None:
 					continue
 				# Update map
+				title = title.replace('_', ' ')
 				titleToViews[title] += viewCount
 	print(f'Found {len(titleToViews)} titles')
 	#
author	Terry Truong <terry06890@gmail.com>	2022-10-02 12:23:19 +1100
committer	Terry Truong <terry06890@gmail.com>	2022-10-02 12:23:19 +1100
commit	d70b96295d768aa0c80bf66639ad7a56bdef92a8 (patch)
tree	3f960ad83e4158fff1c0931d838033392a3391ec /backend/hist_data/enwiki
parent	1b4fc8667714ef4ce9f326bd14f795fc2417ecb9 (diff)