diff options
Diffstat (limited to 'backend/tolData/enwiki/lookupPage.py')
| -rwxr-xr-x | backend/tolData/enwiki/lookupPage.py | 34 |
1 file changed, 17 insertions, 17 deletions
diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
index e7b95f0..427aa7a 100755
--- a/backend/tolData/enwiki/lookupPage.py
+++ b/backend/tolData/enwiki/lookupPage.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-import sys, re
+import sys
 import bz2
 import sqlite3
 
@@ -12,24 +12,24 @@ db, and prints the corresponding <page>.
 parser.add_argument("title", help="The title to look up")
 args = parser.parse_args()
 
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
-indexDb = "dumpIndex.db"
-pageTitle = args.title.replace("_", " ")
+dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+indexDb = 'dumpIndex.db'
+pageTitle = args.title.replace('_', ' ')
 
-print("Looking up offset in index db")
+print('Looking up offset in index db')
 dbCon = sqlite3.connect(indexDb)
 dbCur = dbCon.cursor()
-query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
+query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
 row = dbCur.execute(query, (pageTitle,)).fetchone()
-if row == None:
-    print("Title not found")
+if row is None:
+    print('Title not found')
     sys.exit(0)
 _, pageOffset, endOffset = row
 dbCon.close()
-print(f"Found chunk at offset {pageOffset}")
+print(f'Found chunk at offset {pageOffset}')
 
-print("Reading from wiki dump")
-content = []
+print('Reading from wiki dump')
+content: list[str] = []
 with open(dumpFile, mode='rb') as file:
     # Get uncompressed chunk
     file.seek(pageOffset)
@@ -42,25 +42,25 @@ with open(dumpFile, mode='rb') as file:
     pageNum = 0
     while not found:
         line = lines[lineIdx]
-        if line.lstrip() == "<page>":
+        if line.lstrip() == '<page>':
             pageNum += 1
             if pageNum > 100:
-                print("ERROR: Did not find title after 100 pages")
+                print('ERROR: Did not find title after 100 pages')
                 break
             lineIdx += 1
             titleLine = lines[lineIdx]
             if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
                 found = True
-                print(f"Found title in chunk as page {pageNum}")
+                print(f'Found title in chunk as page {pageNum}')
                 content.append(line)
                 content.append(titleLine)
                 while True:
                     lineIdx += 1
                     line = lines[lineIdx]
                     content.append(line)
-                    if line.lstrip() == "</page>":
+                    if line.lstrip() == '</page>':
                         break
         lineIdx += 1
 
-print("Content: ")
-print("\n".join(content))
+print('Content: ')
+print('\n'.join(content))
