about summary refs log tree commit diff
path: root/backend/hist_data/enwiki
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data/enwiki')
-rwxr-xr-xbackend/hist_data/enwiki/download_img_license_info.py4
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py5
-rwxr-xr-xbackend/hist_data/enwiki/gen_desc_data.py5
-rwxr-xr-xbackend/hist_data/enwiki/gen_dump_index_db.py6
-rwxr-xr-xbackend/hist_data/enwiki/gen_img_data.py9
-rwxr-xr-xbackend/hist_data/enwiki/gen_pageview_data.py7
6 files changed, 22 insertions, 14 deletions
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 6fd710c..3ef20fb 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -10,6 +10,7 @@ at already-processed names to decide what to skip.
"""
import argparse
+import os
import re
import time
import signal
@@ -18,7 +19,8 @@ import urllib.parse
import html
import requests
-IMG_DB = 'img_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db')
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index e484b33..686f912 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -20,8 +20,9 @@ import sqlite3
import urllib.parse
import requests
-IMG_DB = 'img_data.db' # About 130k image names
-OUT_DIR = 'imgs'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # About 130k image names
+OUT_DIR = os.path.join(ENWIKI_DIR, 'imgs')
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index 194afe8..b866c1e 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -18,8 +18,9 @@ import html
import mwxml
import mwparserfromhell
-DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
-DB_FILE = 'desc_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') # Had about 22e6 pages
+DB_FILE = os.path.join(ENWIKI_DIR, 'desc_data.db')
DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 8872171..e4a0fbe 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -11,8 +11,10 @@ import re
import bz2
import sqlite3
-INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
-DB_FILE = 'dump_index.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+INDEX_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream-index.txt.bz2')
+ # Had about 22e6 lines
+DB_FILE = os.path.join(ENWIKI_DIR, 'dump_index.db')
def genData(indexFile: str, dbFile: str) -> None:
if os.path.exists(dbFile):
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index 05df63d..044e5a0 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -16,10 +16,11 @@ import html
import urllib.parse
import sqlite3
-DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
-INDEX_DB = 'dump_index.db'
-IMG_DB = 'img_data.db' # The database to create
-DB_FILE = os.path.join('..', 'data.db')
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2')
+INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db')
+IMG_DB = os.path.join(ENWIKI_DIR, 'img_data.db') # The database to create
+DB_FILE = os.path.join(ENWIKI_DIR, '..', 'data.db')
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 57d6c7b..2dc6eb7 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -22,9 +22,10 @@ from collections import defaultdict
import bz2
import sqlite3
-PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
-DUMP_INDEX_DB = 'dump_index.db'
-DB_FILE = 'pageview_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+PAGEVIEW_FILES = glob.glob(os.path.join(ENWIKI_DIR, './pageviews/pageviews-*-user.bz2'))
+DUMP_INDEX_DB = os.path.join(ENWIKI_DIR, 'dump_index.db')
+DB_FILE = os.path.join(ENWIKI_DIR, 'pageview_data.db')
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
if os.path.exists(dbFile):