aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/enwiki/lookup_page.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tol_data/enwiki/lookup_page.py
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tol_data/enwiki/lookup_page.py')
-rwxr-xr-xbackend/tol_data/enwiki/lookup_page.py71
1 files changed, 71 insertions, 0 deletions
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
new file mode 100755
index 0000000..8ef1229
--- /dev/null
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -0,0 +1,71 @@
#!/usr/bin/python3

"""
Looks up a page with the given title in the wiki dump, using the
dump-index db, and prints the corresponding <page> element.
"""

import sys
import bz2
import sqlite3

# Multistream dump: a sequence of independently-decompressible bz2 chunks
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
# Sqlite db mapping page titles to chunk byte-offsets (table 'offsets')
INDEX_DB = 'dumpIndex.db'
+
+def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
+ print('Looking up offset in index db')
+ dbCon = sqlite3.connect(indexDb)
+ dbCur = dbCon.cursor()
+ query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
+ row = dbCur.execute(query, (pageTitle,)).fetchone()
+ if row is None:
+ print('Title not found')
+ sys.exit(0)
+ _, pageOffset, endOffset = row
+ dbCon.close()
+ print(f'Found chunk at offset {pageOffset}')
+ #
+ print('Reading from wiki dump')
+ content: list[str] = []
+ with open(dumpFile, mode='rb') as file:
+ # Get uncompressed chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for page
+ lines = data.splitlines()
+ lineIdx = 0
+ found = False
+ pageNum = 0
+ while not found:
+ line = lines[lineIdx]
+ if line.lstrip() == '<page>':
+ pageNum += 1
+ if pageNum > 100:
+ print('ERROR: Did not find title after 100 pages')
+ break
+ lineIdx += 1
+ titleLine = lines[lineIdx]
+ if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
+ found = True
+ print(f'Found title in chunk as page {pageNum}')
+ content.append(line)
+ content.append(titleLine)
+ while True:
+ lineIdx += 1
+ line = lines[lineIdx]
+ content.append(line)
+ if line.lstrip() == '</page>':
+ break
+ lineIdx += 1
+ #
+ print('Content: ')
+ print('\n'.join(content))
+
if __name__ == '__main__':
	import argparse
	# Reuse the module docstring as the CLI help text, preserving its layout
	argParser = argparse.ArgumentParser(
		description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	argParser.add_argument('title', help='The title to look up')
	parsedArgs = argParser.parse_args()
	# Index titles use spaces, so accept underscore form on the command line
	normalizedTitle = parsedArgs.title.replace('_', ' ')
	lookupPage(DUMP_FILE, INDEX_DB, normalizedTitle)