aboutsummaryrefslogtreecommitdiff
path: root/backend/tests/enwiki/test_gen_img_data.py
blob: 019b757c4f49e49db23a123f60c892234fe76808 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import unittest
import tempfile, os

from tests.common import createTestDbTable, readTestDbTable
from hist_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData

# Path to the sample dump file, resolved relative to the directory holding this test module
TEST_DUMP_FILE = os.path.join(
	os.path.dirname(__file__),
	'sample_enwiki_pages_articles.xml.bz2',
)

class TestGetInputPageIdsFromDb(unittest.TestCase):
	""" Checks getInputPageIdsFromDb() against a temporary events db and a temporary dump-index db """
	def test_get(self):
		# Fixture data for the 'events' table
		eventsSchema = (
			'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, '
			'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)'
		)
		eventsInsert = 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
		eventRows = {
			(1, 'Belgium', 2389729, None, None, None, 2, 'country'),
			(2, 'George Washington', 2353711, None, 2378478, None, 2, 'human'),
		}
		# Fixture data for the 'offsets' table of the dump index
		offsetsSchema = 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)'
		offsetsInsert = 'INSERT INTO offsets VALUES (?, ?, ?, ?)'
		offsetRows = {
			('Belgium', 10, 0, -1),
			('George Washington', 20, 0, -1),
			('Autism', 25, 0, -1), # This title has no row in the events table
		}
		with tempfile.TemporaryDirectory() as workDir:
			# Build the temp events db
			eventsDb = os.path.join(workDir, 'data.db')
			createTestDbTable(eventsDb, eventsSchema, eventsInsert, eventRows)
			# Build the temp dump-index db
			dumpIndexDb = os.path.join(workDir, 'dump_index.db')
			createTestDbTable(dumpIndexDb, offsetsSchema, offsetsInsert, offsetRows)
			# Look up the page ids
			foundIds = getInputPageIdsFromDb(eventsDb, dumpIndexDb)
			# Only the ids whose titles appear in the events table are expected
			self.assertEqual(foundIds, {10, 20})

class TestGenData(unittest.TestCase):
	""" Checks genData() using the sample dump file and a temporary dump-index db """
	def test_gen(self):
		# Values shared by both checks below
		imgQuery = 'SELECT page_id, img_name from page_imgs'
		autismImg = 'Autism-stacking-cans 2nd edit.jpg'
		# Fixture data for the 'offsets' table of the dump index
		offsetRows = {
			('AccessibleComputing', 10, 0, -1),
			('AfghanistanHistory', 13, 0, -1),
			('Autism', 25, 0, -1),
		}
		with tempfile.TemporaryDirectory() as workDir:
			# Build the temp dump-index db
			dumpIndexDb = os.path.join(workDir, 'dump_index.db')
			createTestDbTable(
				dumpIndexDb,
				'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
				'INSERT INTO offsets VALUES (?, ?, ?, ?)',
				offsetRows,
			)
			# First run, for pages 10 and 25
			outputDb = os.path.join(workDir, 'imgData.db')
			genData({10, 25}, TEST_DUMP_FILE, dumpIndexDb, outputDb)
			firstRows = readTestDbTable(outputDb, imgQuery)
			self.assertEqual(firstRows, {(10, None), (25, autismImg)})
			# Second run on the same output db, with a changed page-id set
			genData({13, 10}, TEST_DUMP_FILE, dumpIndexDb, outputDb)
			secondRows = readTestDbTable(outputDb, imgQuery)
			# The row for page 25 from the first run is expected to remain, alongside a new row for page 13
			self.assertEqual(secondRows, {(10, None), (13, None), (25, autismImg)})