From 0a9b2c2e5eca8a04e37fbdd423379882863237c2 Mon Sep 17 00:00:00 2001
From: Terry Truong
Date: Sat, 21 Jan 2023 12:21:03 +1100
Subject: Adjust backend coding style

Increase line spacing, add section comments, etc
---
 backend/hist_data/enwiki/gen_dump_index_db.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'backend/hist_data/enwiki/gen_dump_index_db.py')

diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 6be8bc5..8872171 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -1,24 +1,28 @@
 #!/usr/bin/python3
 
 """
-Adds data from the wiki-dump index-file into a database
+Converts data from the wiki-dump index-file into a database
 """
 
 import argparse
-import sys, os, re
-import bz2, sqlite3
+import sys
+import os
+import re
+import bz2
+import sqlite3
 
 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
 DB_FILE = 'dump_index.db'
 
 def genData(indexFile: str, dbFile: str) -> None:
-    """ Reads the index file and creates the db """
     if os.path.exists(dbFile):
         raise Exception(f'ERROR: Existing {dbFile}')
+
     print('Creating database')
     dbCon = sqlite3.connect(dbFile)
     dbCur = dbCon.cursor()
     dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
     print('Iterating through index file')
     lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
     lastOffset = 0
@@ -29,7 +33,7 @@ def genData(indexFile: str, dbFile: str) -> None:
             lineNum += 1
             if lineNum % 1e5 == 0:
                 print(f'At line {lineNum}')
-            #
+
             match = lineRegex.fullmatch(line.rstrip())
             assert match is not None
             offsetStr, pageId, title = match.group(1,2,3)
@@ -49,6 +53,7 @@ def genData(indexFile: str, dbFile: str) -> None:
             dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
         except sqlite3.IntegrityError as e:
             print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
     print('Closing database')
     dbCon.commit()
     dbCon.close()
@@ -56,5 +61,5 @@ def genData(indexFile: str, dbFile: str) -> None:
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.parse_args()
-    #
+
     genData(INDEX_FILE, DB_FILE)
-- 
cgit v1.2.3