#!/usr/bin/python3
#
# Provenance (from the patch header this script was recovered from):
#   commit  5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d
#   author  Terry Truong
#   date    Mon, 11 Jul 2022 01:54:08 +1000
#   subject Make backend dev server script serve the image files
#   message Previously, image files in backend/data/img were moved to, or
#           symlinked from, public/. This needed to be changed before each
#           build, otherwise vite would end up copying gigabytes of images.
#   file    backend/tolData/enwiki/genDumpIndexDb.py (new file, mode 100755,
#           58 insertions)

"""Build an SQLite database from a wiki dump's bz2-compressed index file.

Each index line has the form ``offset:pageId:title``. Consecutive lines that
share an offset are grouped, and every entry is stored together with its own
offset and the offset of the following group (-1 for the last group).
"""

import bz2
import os
import re
import sqlite3
import sys

usageInfo = f"""
Usage: {sys.argv[0]}

Adds data from the wiki dump index-file into a database.
"""

indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
indexDb = "dumpIndex.db"

# offset:pageId:title -- only the first two colons delimit fields, so the
# title itself may contain colons.
lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
# A progress message is printed every this many lines
progressInterval = 100000


def _insertEntries(dbCur, entries, offset, nextOffset):
    """Insert every (title, pageId) pair of one offset group.

    A row that violates a uniqueness constraint is reported on stderr and
    skipped, so one duplicate does not abort the whole run.
    """
    for (title, pageId) in entries:
        try:
            dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, offset, nextOffset))
        except sqlite3.IntegrityError as e:
            # Accounts for certain entries in the file that have the same title
            print(f"Failed on title \"{title}\": {e}", file=sys.stderr)


def genData(indexPath, dbPath):
    """Read the bz2-compressed index at indexPath into a new database at dbPath.

    Creates the table ``offsets (title, id, offset, next_offset)``.

    Raises:
        Exception: if dbPath already exists (nothing is overwritten).
        ValueError: if a non-blank line does not look like
            ``offset:pageId:title``, or its offset is not an integer.
    """
    if os.path.exists(dbPath):
        raise Exception(f"ERROR: Existing {dbPath}")
    print("Creating database")
    dbCon = sqlite3.connect(dbPath)
    try:
        dbCur = dbCon.cursor()
        dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")

        print("Iterating through index file")
        lastOffset = 0
        entriesToAdd = []
        # The encoding is given explicitly so that decoding does not depend on
        # the locale's default encoding.
        with bz2.open(indexPath, mode='rt', encoding='utf-8') as file:
            for (lineNum, line) in enumerate(file, 1):
                if lineNum % progressInterval == 0:
                    print(f"At line {lineNum}")
                #
                line = line.rstrip()
                if not line:
                    continue # Tolerate blank lines
                match = lineRegex.fullmatch(line)
                if match is None:
                    raise ValueError(f"Unexpected format on line {lineNum}: {line!r}")
                (offset, pageId, title) = match.group(1,2,3)
                offset = int(offset)
                if offset > lastOffset:
                    # A new group starts here, so the next-offset of the
                    # pending entries is now known and they can be written.
                    _insertEntries(dbCur, entriesToAdd, lastOffset, offset)
                    entriesToAdd = []
                    lastOffset = offset
                entriesToAdd.append((title, pageId))
        # The last group has no following offset, which is recorded as -1
        _insertEntries(dbCur, entriesToAdd, lastOffset, -1)

        print("Closing database")
        dbCon.commit()
    finally:
        # Release the connection even when parsing or inserting fails
        dbCon.close()


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    genData(indexFile, indexDb)