aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/enwiki/gen_pageview_data.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2023-01-03 01:31:19 +1100
committer Terry Truong <terry06890@gmail.com> 2023-01-03 01:31:19 +1100
commit 6f34c4c3aafee39f8c7ec41a7777c194443a27fa (patch)
tree 41159dd3d0194790280df87c23c9b7c2b59599d6 /backend/hist_data/enwiki/gen_pageview_data.py
parent 9f7c8f8b55887c713bcb6af663a836476df76c7d (diff)
Extend gen_pageview_data.py to handle problematic lines
Diffstat (limited to 'backend/hist_data/enwiki/gen_pageview_data.py')
-rwxr-xr-x backend/hist_data/enwiki/gen_pageview_data.py 10
1 files changed, 7 insertions, 3 deletions
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 90ec925..935b303 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -36,9 +36,13 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if not line.startswith(linePrefix):
continue
# Get second and second-last fields
- line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
- title = line[:line.find(b' ')].decode('utf-8')
- viewCount = int(line[line.rfind(b' ')+1:])
+ linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
+ title = linePart[:linePart.find(b' ')].decode('utf-8')
+ try:
+ viewCount = int(linePart[linePart.rfind(b' ')+1:])
+ except ValueError:
+ print(f'Unable to read count in line {lineNum}: {line}')
+ continue
if namespaceRegex.match(title) is not None:
continue
# Update map