about | summary | refs | log | tree | commit | diff
path: root/backend/hist_data/enwiki/gen_pageview_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data/enwiki/gen_pageview_data.py')
-rwxr-xr-x backend/hist_data/enwiki/gen_pageview_data.py | 3
1 files changed, 2 insertions, 1 deletions
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index b37a107..90ec925 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -5,7 +5,7 @@ Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
"""
-# Took about 15min per file (each had about 180e6 lines)
+# Took about 10min per file (each had about 180e6 lines)
import sys, os, glob, math, re
from collections import defaultdict
@@ -42,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if namespaceRegex.match(title) is not None:
continue
# Update map
+ title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
#