aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/enwiki
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-10-02 12:23:19 +1100
committer Terry Truong <terry06890@gmail.com> 2022-10-02 12:23:19 +1100
commit d70b96295d768aa0c80bf66639ad7a56bdef92a8 (patch)
tree 3f960ad83e4158fff1c0931d838033392a3391ec /backend/hist_data/enwiki
parent 1b4fc8667714ef4ce9f326bd14f795fc2417ecb9 (diff)
Add gen_pop_data.py
Diffstat (limited to 'backend/hist_data/enwiki')
-rw-r--r-- backend/hist_data/enwiki/README.md 20
-rwxr-xr-x backend/hist_data/enwiki/gen_pageview_data.py 3
2 files changed, 12 insertions, 11 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index dd090ca..95795f3 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -19,16 +19,6 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Tables: <br>
- `offsets`: `id INT PRIMARY KEY, title TEXT UNIQUE, offset INT, next_offset INT`
-# Description Files
-- `gen_desc_data.py` <br>
- Reads through pages in the dump file, and adds short-description info to a database.
-- `desc_data.db` <br>
- Generated by `gen_desc_data.py`. <br>
- Tables: <br>
- - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
- - `redirects`: `id INT PRIMARY KEY, target TEXT`
- - `descs`: `id INT PRIMARY KEY, desc TEXT`
-
# Page View Files
- `pageviews/pageviews-*-user.bz2`
Each holds wikimedia article page view data for some month.
@@ -58,3 +48,13 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
- `download_imgs.py` <br>
Used to download image files into imgs/.
+
+# Description Files
+- `gen_desc_data.py` <br>
+ Reads through pages in the dump file, and adds short-description info to a database.
+- `desc_data.db` <br>
+ Generated by `gen_desc_data.py`. <br>
+ Tables: <br>
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
+ - `redirects`: `id INT PRIMARY KEY, target TEXT`
+ - `descs`: `id INT PRIMARY KEY, desc TEXT`
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index b37a107..90ec925 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
"""
-# Took about 15min per file (each had about 180e6 lines)
+# Took about 10min per file (each had about 180e6 lines)
import sys, os, glob, math, re
from collections import defaultdict
@@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if namespaceRegex.match(title) is not None:
continue
# Update map
+ title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
#