From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001
From: Terry Truong
Date: Sun, 11 Sep 2022 14:55:42 +1000
Subject: Add backend unit tests

- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
---
 backend/tolData/enwiki/lookupPage.py | 66 ------------------------------------
 1 file changed, 66 deletions(-)
 delete mode 100755 backend/tolData/enwiki/lookupPage.py

(limited to 'backend/tolData/enwiki/lookupPage.py')

diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
deleted file mode 100755
index 427aa7a..0000000
--- a/backend/tolData/enwiki/lookupPage.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/python3
-
-import sys
-import bz2
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Looks up a page with title title1 in the wiki dump, using the dump-index
-db, and prints the corresponding <page>.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.add_argument("title", help="The title to look up")
-args = parser.parse_args()
-
-dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
-indexDb = 'dumpIndex.db'
-pageTitle = args.title.replace('_', ' ')
-
-print('Looking up offset in index db')
-dbCon = sqlite3.connect(indexDb)
-dbCur = dbCon.cursor()
-query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
-row = dbCur.execute(query, (pageTitle,)).fetchone()
-if row is None:
-	print('Title not found')
-	sys.exit(0)
-_, pageOffset, endOffset = row
-dbCon.close()
-print(f'Found chunk at offset {pageOffset}')
-
-print('Reading from wiki dump')
-content: list[str] = []
-with open(dumpFile, mode='rb') as file:
-	# Get uncompressed chunk
-	file.seek(pageOffset)
-	compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
-	data = bz2.BZ2Decompressor().decompress(compressedData).decode()
-	# Look in chunk for page
-	lines = data.splitlines()
-	lineIdx = 0
-	found = False
-	pageNum = 0
-	while not found:
-		line = lines[lineIdx]
-		if line.lstrip() == '<page>':
-			pageNum += 1
-			if pageNum > 100:
-				print('ERROR: Did not find title after 100 pages')
-				break
-			lineIdx += 1
-			titleLine = lines[lineIdx]
-			if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
-				found = True
-				print(f'Found title in chunk as page {pageNum}')
-				content.append(line)
-				content.append(titleLine)
-				while True:
-					lineIdx += 1
-					line = lines[lineIdx]
-					content.append(line)
-					if line.lstrip() == '</page>':
-						break
-		lineIdx += 1
-
-print('Content: ')
-print('\n'.join(content))
-- 
cgit v1.2.3