about summary refs log tree commit diff
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-02-05 13:55:00 +1100
committerTerry Truong <terry06890@gmail.com>2023-02-05 13:55:00 +1100
commitff638e269d439c853b5182b68ff10777f12597f1 (patch)
tree2cc62068a501884821969b6ebeba3686df8f4044
parentcc79c17fbc05bddc8b08f2734e721bc241123a4e (diff)
Use relative imports between data generation scripts (branch: package-imports)
Avoids the need for code that modifies sys.path, but requires running the scripts with 'python -m' using backend/ as the cwd. Also expects constants like DB_FILE to be non-relative, due to running from backend/.
-rwxr-xr-xbackend/chrona.py4
-rw-r--r--backend/hist_data/README.md3
-rwxr-xr-xbackend/hist_data/enwiki/download_img_license_info.py4
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py5
-rwxr-xr-xbackend/hist_data/enwiki/gen_desc_data.py5
-rwxr-xr-xbackend/hist_data/enwiki/gen_dump_index_db.py6
-rwxr-xr-xbackend/hist_data/enwiki/gen_img_data.py9
-rwxr-xr-xbackend/hist_data/enwiki/gen_pageview_data.py7
-rwxr-xr-xbackend/hist_data/gen_desc_data.py5
-rwxr-xr-xbackend/hist_data/gen_disp_data.py13
-rwxr-xr-xbackend/hist_data/gen_events_data.py16
-rwxr-xr-xbackend/hist_data/gen_imgs.py9
-rwxr-xr-xbackend/hist_data/gen_picked_data.py29
-rwxr-xr-xbackend/hist_data/gen_pop_data.py5
14 files changed, 62 insertions, 58 deletions
diff --git a/backend/chrona.py b/backend/chrona.py
index e107d1c..c327693 100755
--- a/backend/chrona.py
+++ b/backend/chrona.py
@@ -24,6 +24,7 @@ Expected HTTP query parameters:
from typing import Iterable, cast
import sys
+import os
import re
import urllib.parse
import sqlite3
@@ -32,7 +33,8 @@ import jsonpickle
from hist_data.cal import HistDate, dbDateToHistDate, dateToUnit
-DB_FILE = 'hist_data/data.db'
+DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hist_data')
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
MAX_REQ_EVENTS = 2000
MAX_REQ_UNIT_COUNTS = MAX_REQ_EVENTS
DEFAULT_REQ_EVENTS = 20
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 73b7a36..4baf0b1 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -47,7 +47,8 @@ This directory holds files used to generate the history database data.db.
## Generate Event Data
1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README.
1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table.
- You might want to set WIKIDATA_FILE in the script to the dump file's name.
+    It needs to be run as a module, by using `python -m hist_data.gen_events_data` from backend/.
+ Also, you might want to check that WIKIDATA_FILE in the script matches the dump file's name.
## Generate Popularity Data
1. Obtain an enwiki dump and 'page view files' in enwiki/, as specified in the README.
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 6fd710c..3ef20fb 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -10,6 +10,7 @@ at already-processed names to decide what to skip.
"""
import argparse
+import os
import re
import time
import signal
@@ -18,7 +19,8 @@ import urllib.parse
import html
import requests
-IMG_DB = 'img_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db')
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index e484b33..686f912 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -20,8 +20,9 @@ import sqlite3
import urllib.parse
import requests
-IMG_DB = 'img_data.db' # About 130k image names
-OUT_DIR = 'imgs'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # About 130k image names
+OUT_DIR = os.path.join(ENWIKI_DIR, 'imgs')
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index 194afe8..b866c1e 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -18,8 +18,9 @@ import html
import mwxml
import mwparserfromhell
-DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
-DB_FILE = 'desc_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') # Had about 22e6 pages
+DB_FILE = os.path.join(ENWIKI_DIR, 'desc_data.db')
DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 8872171..e4a0fbe 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -11,8 +11,10 @@ import re
import bz2
import sqlite3
-INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
-DB_FILE = 'dump_index.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+INDEX_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream-index.txt.bz2')
+ # Had about 22e6 lines
+DB_FILE = os.path.join(ENWIKI_DIR, 'dump_index.db')
def genData(indexFile: str, dbFile: str) -> None:
if os.path.exists(dbFile):
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index 05df63d..044e5a0 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -16,10 +16,11 @@ import html
import urllib.parse
import sqlite3
-DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
-INDEX_DB = 'dump_index.db'
-IMG_DB = 'img_data.db' # The database to create
-DB_FILE = os.path.join('..', 'data.db')
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2')
+INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db')
+IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # The database to create
+DB_FILE = os.path.join(ENWIKI_DIR, '..', 'data.db')
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 57d6c7b..2dc6eb7 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -22,9 +22,10 @@ from collections import defaultdict
import bz2
import sqlite3
-PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
-DUMP_INDEX_DB = 'dump_index.db'
-DB_FILE = 'pageview_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+PAGEVIEW_FILES = glob.glob(os.path.join(ENWIKI_DIR, './pageviews/pageviews-*-user.bz2'))
+DUMP_INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db')
+DB_FILE = os.path.join(ENWIKI_DIR, 'pageview_data.db')
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if os.path.exists(dbFile):
diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py
index bcd8870..09eeab6 100755
--- a/backend/hist_data/gen_desc_data.py
+++ b/backend/hist_data/gen_desc_data.py
@@ -8,8 +8,9 @@ import argparse
import os
import sqlite3
-ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
-DB_FILE = 'data.db'
+DATA_DIR = os.path.dirname(os.path.realpath(__file__))
+ENWIKI_DB = os.path.join(DATA_DIR, 'enwiki', 'desc_data.db')
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
def genData(enwikiDb: str, dbFile: str) -> None:
print('Creating table')
diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py
index 6bb84ad..ca9b172 100755
--- a/backend/hist_data/gen_disp_data.py
+++ b/backend/hist_data/gen_disp_data.py
@@ -5,19 +5,16 @@ Adds data about event distribution to the database,
and removes events not eligible for display
"""
-# For unit testing, resolve imports of modules within this directory
-import os
-import sys
-parentDir = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(parentDir)
-
import argparse
+import os
import sqlite3
-from cal import SCALES, dbDateToHistDate, dateToUnit
+from .cal import SCALES, dbDateToHistDate, dateToUnit
MAX_DISPLAYED_PER_UNIT = 4
-DB_FILE = 'data.db'
+
+DATA_DIR = os.path.dirname(os.path.realpath(__file__))
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTables: bool) -> None:
dbCon = sqlite3.connect(dbFile)
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py
index 453a9ad..d433ce7 100755
--- a/backend/hist_data/gen_events_data.py
+++ b/backend/hist_data/gen_events_data.py
@@ -61,14 +61,9 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or
# Note: Took about 4.5 hours to run
-# For unit testing, resolve imports of modules within this directory
-import os
-import sys
-parentDir = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(parentDir)
-
from typing import cast
import argparse
+import os
import math
import re
import io
@@ -81,13 +76,14 @@ import pickle
import multiprocessing
import tempfile
-from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR
+from .cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR
# ========== Constants ==========
-WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2')
-OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat')
-DB_FILE = 'data.db'
+DATA_DIR = os.path.dirname(os.path.realpath(__file__))
+WIKIDATA_FILE = os.path.join(DATA_DIR, 'wikidata', 'latest-all.json.bz2')
+OFFSETS_FILE = os.path.join(DATA_DIR, 'wikidata', 'offsets.dat')
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
N_PROCS = 6 # Number of processes to use
# For getting Wikidata entity IDs
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 44c0020..8860fe0 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -19,10 +19,11 @@ import signal
import sqlite3
import urllib.parse
-IMG_DIR = os.path.join('enwiki', 'imgs')
-IMG_DB = os.path.join('enwiki', 'img_data.db')
-OUT_DIR = 'img'
-DB_FILE = 'data.db'
+DATA_DIR = os.path.dirname(os.path.realpath(__file__))
+IMG_DIR = os.path.join(DATA_DIR, 'enwiki', 'imgs')
+IMG_DB = os.path.join(DATA_DIR, 'enwiki', 'img_data.db')
+OUT_DIR = os.path.join(DATA_DIR, 'img')
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
IMG_OUT_SZ = 200
diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py
index a6bb8f8..2deb72c 100755
--- a/backend/hist_data/gen_picked_data.py
+++ b/backend/hist_data/gen_picked_data.py
@@ -4,28 +4,25 @@
Adds additional manually-picked events to the database
"""
-# For unit testing, resolve imports of modules within this directory
-import os
-import sys
-parentDir = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(parentDir)
-
import argparse
-import json, sqlite3
+import os
+import json
+import sqlite3
-from gen_imgs import convertImage
-from cal import SCALES, dbDateToHistDate, dateToUnit
+from .gen_imgs import convertImage
+from .cal import SCALES, dbDateToHistDate, dateToUnit
-PICKED_DIR = 'picked'
-PICKED_EVT_FILE = 'events.json'
-DB_FILE = 'data.db'
-IMG_OUT_DIR = 'img'
+DATA_DIR = os.path.dirname(os.path.realpath(__file__))
+PICKED_DIR = os.path.join(DATA_DIR, 'picked')
+PICKED_EVT_FILENAME = 'events.json'
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
+IMG_OUT_DIR = os.path.join(DATA_DIR, 'img')
-def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, scales: list[int]) -> None:
+def genData(pickedDir: str, pickedEvtFileName: str, dbFile: str, imgOutDir: str, scales: list[int]) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- with open(os.path.join(pickedDir, pickedEvtFile)) as f:
+ with open(os.path.join(pickedDir, pickedEvtFileName)) as f:
eventsToAdd = json.load(f)
nextId = -1
for event in eventsToAdd:
@@ -167,4 +164,4 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- genData(PICKED_DIR, PICKED_EVT_FILE, DB_FILE, IMG_OUT_DIR, SCALES)
+ genData(PICKED_DIR, PICKED_EVT_FILENAME, DB_FILE, IMG_OUT_DIR, SCALES)
diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py
index 8d50b6b..4a4d972 100755
--- a/backend/hist_data/gen_pop_data.py
+++ b/backend/hist_data/gen_pop_data.py
@@ -8,8 +8,9 @@ import argparse
import os
import sqlite3
-PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
-DB_FILE = 'data.db'
+DATA_DIR = os.path.dirname(os.path.realpath(__file__))
+PAGEVIEWS_DB = os.path.join(DATA_DIR, 'enwiki', 'pageview_data.db')
+DB_FILE = os.path.join(DATA_DIR, 'data.db')
def genData(pageviewsDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)