aboutsummaryrefslogtreecommitdiff
path: root/backend/tests/enwiki/test_gen_img_data.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-10-01 21:07:59 +1000
committer Terry Truong <terry06890@gmail.com> 2022-10-01 21:07:59 +1000
commit a0b1e1a8a303504dd2cc743ab72937aee7f60f4d (patch)
tree 8dfe88637c4c1f7830bb0f089ff630f8582310b2 /backend/tests/enwiki/test_gen_img_data.py
parent de9d6642ad2a57830f559fce22e36e3d68c5c70f (diff)
Add unit tests for Wikipedia extraction
Diffstat (limited to 'backend/tests/enwiki/test_gen_img_data.py')
-rw-r--r-- backend/tests/enwiki/test_gen_img_data.py 77
1 files changed, 77 insertions, 0 deletions
diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py
new file mode 100644
index 0000000..019b757
--- /dev/null
+++ b/backend/tests/enwiki/test_gen_img_data.py
@@ -0,0 +1,77 @@
+import unittest
+import tempfile, os
+
+from tests.common import createTestDbTable, readTestDbTable
+from hist_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData
+
+TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_articles.xml.bz2') # Sample dump, resolved relative to this test module's directory
+
+class TestGetInputPageIdsFromDb(unittest.TestCase):
+    """Exercise getInputPageIdsFromDb() against throwaway databases."""
+    def test_get(self):
+        """Expect the offsets-table ids of the titles listed in the events table."""
+        eventRows = {
+            (1, 'Belgium', 2389729, None, None, None, 2, 'country'),
+            (2, 'George Washington', 2353711, None, 2378478, None, 2, 'human'),
+        }
+        offsetRows = {
+            ('Belgium',10,0,-1),
+            ('George Washington',20,0,-1),
+            ('Autism',25,0,-1),
+        }
+        with tempfile.TemporaryDirectory() as workDir:
+            # Temporary events db holding the two event titles
+            eventsDbPath = os.path.join(workDir, 'data.db')
+            createTestDbTable(
+                eventsDbPath,
+                'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, '
+                'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)',
+                'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+                eventRows)
+            # Temporary dump-index db; 'Autism' has no counterpart in the events table
+            offsetsDbPath = os.path.join(workDir, 'dump_index.db')
+            createTestDbTable(
+                offsetsDbPath,
+                'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
+                'INSERT INTO offsets VALUES (?, ?, ?, ?)',
+                offsetRows)
+            # Only the ids stored for 'Belgium' and 'George Washington' are expected back
+            self.assertEqual(getInputPageIdsFromDb(eventsDbPath, offsetsDbPath), {10, 20})
+
+class TestGenData(unittest.TestCase):
+    """Exercise genData() on the sample dump file, including a repeat run."""
+    def test_gen(self):
+        """Run genData() twice against one image db.
+
+        The second run is expected to add page 13 and keep the rows for pages 10 and 25.
+        """
+        offsetRows = {
+            ('AccessibleComputing',10,0,-1),
+            ('AfghanistanHistory',13,0,-1),
+            ('Autism',25,0,-1),
+        }
+        imgQuery = 'SELECT page_id, img_name from page_imgs'
+        with tempfile.TemporaryDirectory() as workDir:
+            # Temporary dump-index db listing the three pages referenced below
+            offsetsDbPath = os.path.join(workDir, 'dump_index.db')
+            createTestDbTable(
+                offsetsDbPath,
+                'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
+                'INSERT INTO offsets VALUES (?, ?, ?, ?)',
+                offsetRows)
+            imgDbPath = os.path.join(workDir, 'imgData.db')
+            # First run: request pages 10 and 25
+            genData({10, 25}, TEST_DUMP_FILE, offsetsDbPath, imgDbPath)
+            firstExpected = {
+                (10, None),
+                (25, 'Autism-stacking-cans 2nd edit.jpg'),
+            }
+            self.assertEqual(readTestDbTable(imgDbPath, imgQuery), firstExpected)
+            # Second run on the same image db: page 13 is new, page 25 is not requested again
+            genData({13, 10}, TEST_DUMP_FILE, offsetsDbPath, imgDbPath)
+            secondExpected = {
+                (10, None),
+                (13, None),
+                (25, 'Autism-stacking-cans 2nd edit.jpg'),
+            }
+            self.assertEqual(readTestDbTable(imgDbPath, imgQuery), secondExpected)