diff options
Diffstat (limited to 'backend/hist_data/enwiki')
| -rwxr-xr-x | backend/hist_data/enwiki/download_img_license_info.py | 4 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_imgs.py | 5 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_desc_data.py | 5 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_dump_index_db.py | 6 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 9 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 7 |
6 files changed, 22 insertions, 14 deletions
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 6fd710c..3ef20fb 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -10,6 +10,7 @@ at already-processed names to decide what to skip. """ import argparse +import os import re import time import signal @@ -18,7 +19,8 @@ import urllib.parse import html import requests -IMG_DB = 'img_data.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index e484b33..686f912 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -20,8 +20,9 @@ import sqlite3 import urllib.parse import requests -IMG_DB = 'img_data.db' # About 130k image names -OUT_DIR = 'imgs' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # About 130k image names +OUT_DIR = os.path.join(ENWIKI_DIR, 'imgs') LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index 194afe8..b866c1e 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ -18,8 +18,9 @@ import html import mwxml import mwparserfromhell -DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages -DB_FILE = 'desc_data.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') # Had about 22e6 pages +DB_FILE = os.path.join(ENWIKI_DIR, 'desc_data.db') DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 8872171..e4a0fbe 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -11,8 +11,10 @@ import re import bz2 import sqlite3 -INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines -DB_FILE = 'dump_index.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +INDEX_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream-index.txt.bz2') + # Had about 22e6 lines +DB_FILE = os.path.join(ENWIKI_DIR, 'dump_index.db') def genData(indexFile: str, dbFile: str) -> None: if os.path.exists(dbFile): diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 05df63d..044e5a0 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -16,10 +16,11 @@ import html import urllib.parse import sqlite3 -DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' -INDEX_DB = 'dump_index.db' -IMG_DB = 'img_data.db' # The database to create -DB_FILE = os.path.join('..', 'data.db') +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') +INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db') +IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # The database to create +DB_FILE = os.path.join(ENWIKI_DIR, '..', 'data.db') ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 57d6c7b..2dc6eb7 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -22,9 +22,10 @@ from collections import defaultdict import bz2 import sqlite3 -PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') -DUMP_INDEX_DB = 'dump_index.db' -DB_FILE = 'pageview_data.db' +ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__)) +PAGEVIEW_FILES = glob.glob(os.path.join(ENWIKI_DIR, './pageviews/pageviews-*-user.bz2')) +DUMP_INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db') +DB_FILE = os.path.join(ENWIKI_DIR, 'pageview_data.db') def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: if os.path.exists(dbFile): |
