about | summary | refs | log | tree | commit | diff
path: root/backend/tolData/enwiki/lookupPage.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/enwiki/lookupPage.py')
-rwxr-xr-x backend/tolData/enwiki/lookupPage.py 34
1 files changed, 17 insertions, 17 deletions
diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
index e7b95f0..427aa7a 100755
--- a/backend/tolData/enwiki/lookupPage.py
+++ b/backend/tolData/enwiki/lookupPage.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, re
+import sys
import bz2
import sqlite3
@@ -12,24 +12,24 @@ db, and prints the corresponding <page>.
parser.add_argument("title", help="The title to look up")
args = parser.parse_args()
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
-indexDb = "dumpIndex.db"
-pageTitle = args.title.replace("_", " ")
+dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+indexDb = 'dumpIndex.db'
+pageTitle = args.title.replace('_', ' ')
-print("Looking up offset in index db")
+print('Looking up offset in index db')
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
-query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
+query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
row = dbCur.execute(query, (pageTitle,)).fetchone()
-if row == None:
- print("Title not found")
+if row is None:
+ print('Title not found')
sys.exit(0)
_, pageOffset, endOffset = row
dbCon.close()
-print(f"Found chunk at offset {pageOffset}")
+print(f'Found chunk at offset {pageOffset}')
-print("Reading from wiki dump")
-content = []
+print('Reading from wiki dump')
+content: list[str] = []
with open(dumpFile, mode='rb') as file:
# Get uncompressed chunk
file.seek(pageOffset)
@@ -42,25 +42,25 @@ with open(dumpFile, mode='rb') as file:
pageNum = 0
while not found:
line = lines[lineIdx]
- if line.lstrip() == "<page>":
+ if line.lstrip() == '<page>':
pageNum += 1
if pageNum > 100:
- print("ERROR: Did not find title after 100 pages")
+ print('ERROR: Did not find title after 100 pages')
break
lineIdx += 1
titleLine = lines[lineIdx]
if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
found = True
- print(f"Found title in chunk as page {pageNum}")
+ print(f'Found title in chunk as page {pageNum}')
content.append(line)
content.append(titleLine)
while True:
lineIdx += 1
line = lines[lineIdx]
content.append(line)
- if line.lstrip() == "</page>":
+ if line.lstrip() == '</page>':
break
lineIdx += 1
-print("Content: ")
-print("\n".join(content))
+print('Content: ')
+print('\n'.join(content))