about summary refs log tree commit diff
path: root/backend/hist_data/enwiki/gen_desc_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data/enwiki/gen_desc_data.py')
-rwxr-xr-x backend/hist_data/enwiki/gen_desc_data.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index 194afe8..b866c1e 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -18,8 +18,9 @@ import html
import mwxml
import mwparserfromhell
-DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
-DB_FILE = 'desc_data.db'
+ENWIKI_DIR = os.path.dirname(os.path.realpath(__file__))
+DUMP_FILE = os.path.join(ENWIKI_DIR, 'enwiki-20220501-pages-articles-multistream.xml.bz2') # Had about 22e6 pages
+DB_FILE = os.path.join(ENWIKI_DIR, 'desc_data.db')
DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')