From 6f34c4c3aafee39f8c7ec41a7777c194443a27fa Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Tue, 3 Jan 2023 01:31:19 +1100 Subject: Extend gen_pageview_data.py to handle problematic lines --- backend/hist_data/README.md | 1 - backend/hist_data/enwiki/gen_pageview_data.py | 10 +++++++--- backend/hist_data/gen_events_data.py | 2 ++ 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'backend/hist_data') diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index a3ae6c1..9ae7811 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -44,7 +44,6 @@ Some of the scripts use third-party packages: - `indexed_bzip2`: For parallelised bzip2 processing - `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps - `requests`: For downloading data -- `Pillow`: For image processing ## Generate Event Data 1. Obtain a Wikidata JSON dump in wikidata/, as specified in it's README. diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 90ec925..935b303 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -36,9 +36,13 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: if not line.startswith(linePrefix): continue # Get second and second-last fields - line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields - title = line[:line.find(b' ')].decode('utf-8') - viewCount = int(line[line.rfind(b' ')+1:]) + linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields + title = linePart[:linePart.find(b' ')].decode('utf-8') + try: + viewCount = int(linePart[linePart.rfind(b' ')+1:]) + except ValueError: + print(f'Unable to read count in line {lineNum}: {line}') + continue if namespaceRegex.match(title) is not None: continue # Update map diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py index 51d6940..deaf794 100755 --- a/backend/hist_data/gen_events_data.py +++ b/backend/hist_data/gen_events_data.py @@ -59,6 +59,8 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 +# Took about 4.5 hours to run + # Code used in unit testing (for resolving imports of modules within this directory) import os, sys parentDir = os.path.dirname(os.path.realpath(__file__)) -- cgit v1.2.3