diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-09-07 11:37:37 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-09-07 11:37:37 +1000 |
| commit | daccbbd9c73a5292ea9d6746560d7009e5aa666d (patch) | |
| tree | 9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/enwiki/genDumpIndexDb.py | |
| parent | 1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff) | |
Add Python type annotations
Also use consistent quote symbols
Also use 'is None' instead of '== None'
Also use 'if list1' instead of 'if len(list1) > 0'
Diffstat (limited to 'backend/tolData/enwiki/genDumpIndexDb.py')
| -rwxr-xr-x | backend/tolData/enwiki/genDumpIndexDb.py | 39 |
1 file changed, 20 insertions, 19 deletions
```diff
diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py
index 1bffb27..3bd129f 100755
--- a/backend/tolData/enwiki/genDumpIndexDb.py
+++ b/backend/tolData/enwiki/genDumpIndexDb.py
@@ -10,46 +10,47 @@ Adds data from the wiki dump index-file into a database
 """, formatter_class=argparse.RawDescriptionHelpFormatter)
 parser.parse_args()
 
-indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
-indexDb = "dumpIndex.db"
+indexFile = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
+indexDb = 'dumpIndex.db'
 if os.path.exists(indexDb):
-    raise Exception(f"ERROR: Existing {indexDb}")
-print("Creating database")
+    raise Exception(f'ERROR: Existing {indexDb}')
+print('Creating database')
 dbCon = sqlite3.connect(indexDb)
 dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
+dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
 
-print("Iterating through index file")
-lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
+print('Iterating through index file')
+lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
 lastOffset = 0
 lineNum = 0
-entriesToAdd = []
+entriesToAdd: list[tuple[str, str]] = []
 with bz2.open(indexFile, mode='rt') as file:
     for line in file:
         lineNum += 1
         if lineNum % 1e5 == 0:
-            print(f"At line {lineNum}")
+            print(f'At line {lineNum}')
         #
         match = lineRegex.fullmatch(line.rstrip())
-        (offset, pageId, title) = match.group(1,2,3)
-        offset = int(offset)
+        assert match is not None
+        offsetStr, pageId, title = match.group(1,2,3)
+        offset = int(offsetStr)
         if offset > lastOffset:
-            for (t, p) in entriesToAdd:
+            for t, p in entriesToAdd:
                 try:
-                    dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
+                    dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset))
                 except sqlite3.IntegrityError as e:
                     # Accounts for certain entries in the file that have the same title
-                    print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+                    print(f'Failed on title "{t}": {e}', file=sys.stderr)
             entriesToAdd = []
             lastOffset = offset
-        entriesToAdd.append([title, pageId])
-for (title, pageId) in entriesToAdd:
+        entriesToAdd.append((title, pageId))
+for title, pageId in entriesToAdd:
     try:
-        dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
+        dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
     except sqlite3.IntegrityError as e:
-        print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+        print(f'Failed on title "{t}": {e}', file=sys.stderr)
 
-print("Closing database")
+print('Closing database')
 dbCon.commit()
 dbCon.close()
```
