diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-02-05 13:55:00 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-02-05 13:55:00 +1100 |
| commit | ff638e269d439c853b5182b68ff10777f12597f1 (patch) | |
| tree | 2cc62068a501884821969b6ebeba3686df8f4044 | |
| parent | cc79c17fbc05bddc8b08f2734e721bc241123a4e (diff) | |
Use relative imports between data generation scripts (branch: package-imports)
Avoids the need for code that modifies sys.path, but requires
running the scripts with 'python -m', using backend/ as the cwd.
Also expects constants like DB_FILE to be resolved relative to the
script's own location (not the cwd), since scripts run from backend/.
| -rwxr-xr-x | backend/chrona.py | 4 | ||||
| -rw-r--r-- | backend/hist_data/README.md | 3 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_img_license_info.py | 4 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_imgs.py | 5 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_desc_data.py | 5 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_dump_index_db.py | 6 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 9 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 7 | ||||
| -rwxr-xr-x | backend/hist_data/gen_desc_data.py | 5 | ||||
| -rwxr-xr-x | backend/hist_data/gen_disp_data.py | 13 | ||||
| -rwxr-xr-x | backend/hist_data/gen_events_data.py | 16 | ||||
| -rwxr-xr-x | backend/hist_data/gen_imgs.py | 9 | ||||
| -rwxr-xr-x | backend/hist_data/gen_picked_data.py | 29 | ||||
| -rwxr-xr-x | backend/hist_data/gen_pop_data.py | 5 |
14 files changed, 62 insertions, 58 deletions
diff --git a/backend/chrona.py b/backend/chrona.py index e107d1c..c327693 100755 --- a/backend/chrona.py +++ b/backend/chrona.py @@ -24,6 +24,7 @@ Expected HTTP query parameters: from typing import Iterable, cast import sys +import os import re import urllib.parse import sqlite3 @@ -32,7 +33,8 @@ import jsonpickle from hist_data.cal import HistDate, dbDateToHistDate, dateToUnit -DB_FILE = 'hist_data/data.db' +DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hist_data') +DB_FILE = os.path.join(DATA_DIR, 'data.db') MAX_REQ_EVENTS = 2000 MAX_REQ_UNIT_COUNTS = MAX_REQ_EVENTS DEFAULT_REQ_EVENTS = 20 diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 73b7a36..4baf0b1 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -47,7 +47,8 @@ This directory holds files used to generate the history database data.db. ## Generate Event Data 1. Obtain a Wikidata JSON dump in wikidata/, as specified in it's README. 1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table. - You might want to set WIKIDATA_FILE in the script to the dump file's name. + It needs to be run as a module, by using `python -m hist_data.gen_events_data` in hist_data/. + Also, you might want to check that WIKIDATA_FILE in the script matches the dump file's name. ## Generate Popularity Data 1. Obtain an enwiki dump and 'page view files' in enwiki/, as specified in the README. diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 6fd710c..3ef20fb 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -10,6 +10,7 @@ at already-processed names to decide what to skip. 
""" import argparse +import os import re import time import signal @@ -18,7 +19,8 @@ import urllib.parse import html import requests -IMG_DB = 'img_data.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index e484b33..686f912 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -20,8 +20,9 @@ import sqlite3 import urllib.parse import requests -IMG_DB = 'img_data.db' # About 130k image names -OUT_DIR = 'imgs' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # About 130k image names +OUT_DIR = os.path.join(ENWIKI_DIR, 'imgs') LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index 194afe8..b866c1e 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ -18,8 +18,9 @@ import html import mwxml import mwparserfromhell -DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages -DB_FILE = 'desc_data.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') # Had about 22e6 pages +DB_FILE = os.path.join(ENWIKI_DIR, 'desc_data.db') DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 8872171..e4a0fbe 100755 --- 
a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -11,8 +11,10 @@ import re import bz2 import sqlite3 -INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines -DB_FILE = 'dump_index.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +INDEX_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream-index.txt.bz2') + # Had about 22e6 lines +DB_FILE = os.path.join(ENWIKI_DIR, 'dump_index.db') def genData(indexFile: str, dbFile: str) -> None: if os.path.exists(dbFile): diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 05df63d..044e5a0 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -16,10 +16,11 @@ import html import urllib.parse import sqlite3 -DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' -INDEX_DB = 'dump_index.db' -IMG_DB = 'img_data.db' # The database to create -DB_FILE = os.path.join('..', 'data.db') +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') +INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db') +IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # The database to create +DB_FILE = os.path.join(ENWIKI_DIR, '..', 'data.db') ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 57d6c7b..2dc6eb7 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -22,9 +22,10 @@ from collections import defaultdict import bz2 import sqlite3 -PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') -DUMP_INDEX_DB = 'dump_index.db' -DB_FILE = 'pageview_data.db' +ENWIKI_DIR = 
os.path.dirname(os.path.realpath(__file__)) +PAGEVIEW_FILES = glob.glob(os.path.join(ENWIKI_DIR, './pageviews/pageviews-*-user.bz2')) +DUMP_INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db') +DB_FILE = os.path.join(ENWIKI_DIR, 'pageview_data.db') def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: if os.path.exists(dbFile): diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py index bcd8870..09eeab6 100755 --- a/backend/hist_data/gen_desc_data.py +++ b/backend/hist_data/gen_desc_data.py @@ -8,8 +8,9 @@ import argparse import os import sqlite3 -ENWIKI_DB = os.path.join('enwiki', 'desc_data.db') -DB_FILE = 'data.db' +DATA_DIR = os.path.dirname(os.path.realpath(__file__)) +ENWIKI_DB = os.path.join(DATA_DIR, 'enwiki', 'desc_data.db') +DB_FILE = os.path.join(DATA_DIR, 'data.db') def genData(enwikiDb: str, dbFile: str) -> None: print('Creating table') diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py index 6bb84ad..ca9b172 100755 --- a/backend/hist_data/gen_disp_data.py +++ b/backend/hist_data/gen_disp_data.py @@ -5,19 +5,16 @@ Adds data about event distribution to the database, and removes events not eligible for display """ -# For unit testing, resolve imports of modules within this directory -import os -import sys -parentDir = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(parentDir) - import argparse +import os import sqlite3 -from cal import SCALES, dbDateToHistDate, dateToUnit +from .cal import SCALES, dbDateToHistDate, dateToUnit MAX_DISPLAYED_PER_UNIT = 4 -DB_FILE = 'data.db' + +DATA_DIR = os.path.dirname(os.path.realpath(__file__)) +DB_FILE = os.path.join(DATA_DIR, 'data.db') def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTables: bool) -> None: dbCon = sqlite3.connect(dbFile) diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py index 453a9ad..d433ce7 100755 --- 
a/backend/hist_data/gen_events_data.py +++ b/backend/hist_data/gen_events_data.py @@ -61,14 +61,9 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or # Note: Took about 4.5 hours to run -# For unit testing, resolve imports of modules within this directory -import os -import sys -parentDir = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(parentDir) - from typing import cast import argparse +import os import math import re import io @@ -81,13 +76,14 @@ import pickle import multiprocessing import tempfile -from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR +from .cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR # ========== Constants ========== -WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2') -OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat') -DB_FILE = 'data.db' +DATA_DIR = os.path.dirname(os.path.realpath(__file__)) +WIKIDATA_FILE = os.path.join(DATA_DIR, 'wikidata', 'latest-all.json.bz2') +OFFSETS_FILE = os.path.join(DATA_DIR, 'wikidata', 'offsets.dat') +DB_FILE = os.path.join(DATA_DIR, 'data.db') N_PROCS = 6 # Number of processes to use # For getting Wikidata entity IDs diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py index 44c0020..8860fe0 100755 --- a/backend/hist_data/gen_imgs.py +++ b/backend/hist_data/gen_imgs.py @@ -19,10 +19,11 @@ import signal import sqlite3 import urllib.parse -IMG_DIR = os.path.join('enwiki', 'imgs') -IMG_DB = os.path.join('enwiki', 'img_data.db') -OUT_DIR = 'img' -DB_FILE = 'data.db' +DATA_DIR = os.path.dirname(os.path.realpath(__file__)) +IMG_DIR = os.path.join(DATA_DIR, 'enwiki', 'imgs') +IMG_DB = os.path.join(DATA_DIR, 'enwiki', 'img_data.db') +OUT_DIR = os.path.join(DATA_DIR, 'img') +DB_FILE = os.path.join(DATA_DIR, 'data.db') IMG_OUT_SZ = 200 diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py index a6bb8f8..2deb72c 100755 --- a/backend/hist_data/gen_picked_data.py +++ 
b/backend/hist_data/gen_picked_data.py @@ -4,28 +4,25 @@ Adds additional manually-picked events to the database """ -# For unit testing, resolve imports of modules within this directory -import os -import sys -parentDir = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(parentDir) - import argparse -import json, sqlite3 +import os +import json +import sqlite3 -from gen_imgs import convertImage -from cal import SCALES, dbDateToHistDate, dateToUnit +from .gen_imgs import convertImage +from .cal import SCALES, dbDateToHistDate, dateToUnit -PICKED_DIR = 'picked' -PICKED_EVT_FILE = 'events.json' -DB_FILE = 'data.db' -IMG_OUT_DIR = 'img' +DATA_DIR = os.path.dirname(os.path.realpath(__file__)) +PICKED_DIR = os.path.join(DATA_DIR, 'picked') +PICKED_EVT_FILENAME = 'events.json' +DB_FILE = os.path.join(DATA_DIR, 'data.db') +IMG_OUT_DIR = os.path.join(DATA_DIR, 'img') -def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, scales: list[int]) -> None: +def genData(pickedDir: str, pickedEvtFileName: str, dbFile: str, imgOutDir: str, scales: list[int]) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - with open(os.path.join(pickedDir, pickedEvtFile)) as f: + with open(os.path.join(pickedDir, pickedEvtFileName)) as f: eventsToAdd = json.load(f) nextId = -1 for event in eventsToAdd: @@ -167,4 +164,4 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - genData(PICKED_DIR, PICKED_EVT_FILE, DB_FILE, IMG_OUT_DIR, SCALES) + genData(PICKED_DIR, PICKED_EVT_FILENAME, DB_FILE, IMG_OUT_DIR, SCALES) diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py index 8d50b6b..4a4d972 100755 --- a/backend/hist_data/gen_pop_data.py +++ b/backend/hist_data/gen_pop_data.py @@ -8,8 +8,9 @@ import argparse import os import sqlite3 -PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db') -DB_FILE = 'data.db' 
+DATA_DIR = os.path.dirname(os.path.realpath(__file__)) +PAGEVIEWS_DB = os.path.join(DATA_DIR, 'enwiki', 'pageview_data.db') +DB_FILE = os.path.join(DATA_DIR, 'data.db') def genData(pageviewsDb: str, dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) |
