aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki/lookupPage.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/enwiki/lookupPage.py
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/enwiki/lookupPage.py')
-rwxr-xr-xbackend/tolData/enwiki/lookupPage.py66
1 file changed, 0 insertions, 66 deletions
diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
deleted file mode 100755
index 427aa7a..0000000
--- a/backend/tolData/enwiki/lookupPage.py
+++ /dev/null
@@ -1,66 +0,0 @@
#!/usr/bin/python3
"""
Looks up a page with the given title in the wiki dump, using the
dump-index db, and prints the corresponding <page> element.
"""

import sys
import bz2
import sqlite3
import argparse

# Input files (expected in the working directory)
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dumpIndex.db'
# Give up after scanning this many <page> elements within one chunk
MAX_PAGES = 100

def lookupOffsets(pageTitle: str):
	"""Look up pageTitle in the index db.

	Returns (offset, next_offset) for the bz2 chunk holding the page,
	or None if the title is not in the index. next_offset is -1 when
	the chunk runs to the end of the dump file.
	"""
	dbCon = sqlite3.connect(INDEX_DB)
	try:
		query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
		row = dbCon.execute(query, (pageTitle,)).fetchone()
	finally:
		# Close on every path (the original leaked the connection when
		# the title was not found)
		dbCon.close()
	if row is None:
		return None
	return row[1], row[2]

def readChunk(pageOffset: int, endOffset: int) -> str:
	"""Read and decompress the bz2 chunk at pageOffset from the dump.

	An endOffset of -1 means the chunk extends to the end of the file.
	"""
	with open(DUMP_FILE, mode='rb') as file:
		file.seek(pageOffset)
		# read(None) reads to EOF
		compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
	return bz2.BZ2Decompressor().decompress(compressedData).decode()

def extractPage(lines: 'list[str]', pageTitle: str):
	"""Scan chunk lines for a <page> whose <title> matches pageTitle.

	Returns the page's lines (from <page> through </page> inclusive),
	or None if the title is not found within MAX_PAGES pages or before
	the chunk ends. Bounds-checked throughout — the original raised
	IndexError when the title was absent from the chunk.
	"""
	titleTag = '<title>' + pageTitle + '</title>'
	numLines = len(lines)
	pageNum = 0
	lineIdx = 0
	while lineIdx < numLines:
		if lines[lineIdx].lstrip() == '<page>':
			pageNum += 1
			if pageNum > MAX_PAGES:
				print(f'ERROR: Did not find title after {MAX_PAGES} pages')
				return None
			lineIdx += 1  # Advance to the expected <title> line
			if lineIdx < numLines and lines[lineIdx].lstrip() == titleTag:
				print(f'Found title in chunk as page {pageNum}')
				content = [lines[lineIdx - 1], lines[lineIdx]]
				# Collect lines up to and including the closing </page>
				while True:
					lineIdx += 1
					if lineIdx >= numLines:
						print('ERROR: Page truncated at end of chunk')
						break
					content.append(lines[lineIdx])
					if lines[lineIdx].lstrip() == '</page>':
						break
				return content
		lineIdx += 1
	print('ERROR: Reached end of chunk without finding title')
	return None

def main() -> None:
	parser = argparse.ArgumentParser(description="""
Looks up a page with the given title in the wiki dump, using the
dump-index db, and prints the corresponding <page>.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.add_argument("title", help="The title to look up")
	args = parser.parse_args()
	# Titles are stored with spaces, not underscores
	pageTitle = args.title.replace('_', ' ')

	print('Looking up offset in index db')
	offsets = lookupOffsets(pageTitle)
	if offsets is None:
		print('Title not found')
		sys.exit(0)  # Exit code 0 kept to preserve original behavior
	pageOffset, endOffset = offsets
	print(f'Found chunk at offset {pageOffset}')

	print('Reading from wiki dump')
	lines = readChunk(pageOffset, endOffset).splitlines()
	content = extractPage(lines, pageTitle)

	print('Content: ')
	print('\n'.join(content if content is not None else []))

if __name__ == '__main__':
	main()