aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r-- backend/hist_data/README.md 1
-rwxr-xr-x backend/hist_data/enwiki/gen_pageview_data.py 10
-rwxr-xr-x backend/hist_data/gen_events_data.py 2
3 files changed, 9 insertions, 4 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index a3ae6c1..9ae7811 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -44,7 +44,6 @@ Some of the scripts use third-party packages:
- `indexed_bzip2`: For parallelised bzip2 processing
- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps
- `requests`: For downloading data
-- `Pillow`: For image processing
## Generate Event Data
1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README.
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 90ec925..935b303 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -36,9 +36,13 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if not line.startswith(linePrefix):
continue
# Get second and second-last fields
- line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
- title = line[:line.find(b' ')].decode('utf-8')
- viewCount = int(line[line.rfind(b' ')+1:])
+ linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
+ title = linePart[:linePart.find(b' ')].decode('utf-8')
+ try:
+ viewCount = int(linePart[linePart.rfind(b' ')+1:])
+ except ValueError:
+ print(f'Unable to read count in line {lineNum}: {line}')
+ continue
if namespaceRegex.match(title) is not None:
continue
# Update map
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py
index 51d6940..deaf794 100755
--- a/backend/hist_data/gen_events_data.py
+++ b/backend/hist_data/gen_events_data.py
@@ -59,6 +59,8 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or
# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
# Possibly related: https://github.com/python/cpython/issues/72882
+# Took about 4.5 hours to run
+
# Code used in unit testing (for resolving imports of modules within this directory)
import os, sys
parentDir = os.path.dirname(os.path.realpath(__file__))