diff options
| -rw-r--r-- | backend/tests/enwiki/__init__.py | 0 | ||||
| -rw-r--r-- | backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2 | bin | 0 -> 41998 bytes | |||
| -rw-r--r-- | backend/tests/enwiki/test_download_img_license_info.py | 185 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_download_imgs.py | 54 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_gen_desc_data.py | 37 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_gen_dump_index_db.py | 39 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_gen_img_data.py | 77 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_gen_pageview_data.py | 44 |
8 files changed, 436 insertions, 0 deletions
diff --git a/backend/tests/enwiki/__init__.py b/backend/tests/enwiki/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tests/enwiki/__init__.py diff --git a/backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2 b/backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2 Binary files differ new file mode 100644 index 0000000..2abfdaa --- /dev/null +++ b/backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2 diff --git a/backend/tests/enwiki/test_download_img_license_info.py b/backend/tests/enwiki/test_download_img_license_info.py new file mode 100644 index 0000000..f285d55 --- /dev/null +++ b/backend/tests/enwiki/test_download_img_license_info.py @@ -0,0 +1,185 @@ +import unittest +from unittest.mock import Mock, patch +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.enwiki.download_img_license_info import downloadInfo + +TEST_RESPONSE1 = { + 'batchcomplete': '', + 'query': { + 'normalized': [ + { + 'from': 'File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'to': 'File:Georgia Aquarium - Giant Grouper edit.jpg' + } + ], + 'pages': { + '-1': { + 'ns': 6, + 'title': 'File:Octopus2.jpg', + 'missing': '', + 'known': '', + 'imagerepository': 'shared', + 'imageinfo': [ + { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg', + 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Octopus2.jpg', + 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=2795257', + 'extmetadata': { + 'Credit': { + 'value': '<span class=\\"int-own-work\\" lang=\\"en\\">Own work</span>', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Artist': { + 'value': 'albert kok', + 'source': 'commons-desc-page' + }, + 'LicenseShortName': { + 'value': 'CC BY-SA 3.0', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Restrictions': { + 'value': '', + 'source': 'commons-desc-page', + 'hidden': '' + } + } + } + ] + } + } + } +} +TEST_RESPONSE2 = { + 
'batchcomplete': '', + 'query': { + 'normalized': [ + { + 'from': 'File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'to': 'File:Georgia Aquarium - Giant Grouper edit.jpg' + } + ], + 'pages': { + '-1': { + 'ns': 6, + 'title': 'File:Octopus2.jpg', + 'missing': '', + 'known': '', + 'imagerepository': 'shared', + 'imageinfo': [ + { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg', + 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Octopus2.jpg', + 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=2795257', + 'extmetadata': { + 'Credit': { + 'value': '<span class=\\"int-own-work\\" lang=\\"en\\">Own work</span>', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Artist': { + 'value': 'albert kok', + 'source': 'commons-desc-page' + }, + 'LicenseShortName': { + 'value': 'CC BY-SA 3.0', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Restrictions': { + 'value': '', + 'source': 'commons-desc-page', + 'hidden': '' + } + } + } + ] + }, + '-2': { + 'ns': 6, + 'title': 'File:Georgia Aquarium - Giant Grouper edit.jpg', + 'missing': '', + 'known': '', + 'imagerepository': 'shared', + 'imageinfo': [ + { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/2/23/Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=823649', + 'extmetadata': { + 'Credit': { + "value": "<a href=\"//commons.wikimedia.org/wiki/File:Georgia_Aquarium_-_Giant_Grouper.jpg\" title=\"File:Georgia Aquarium - Giant Grouper.jpg\">File:Georgia Aquarium - Giant Grouper.jpg</a>", + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Artist': { + "value": "Taken by <a href=\"//commons.wikimedia.org/wiki/User:Diliff\" title=\"User:Diliff\">Diliff</a> Edited by <a href=\"//commons.wikimedia.org/wiki/User:Fir0002\" title=\"User:Fir0002\">Fir0002</a>", + 'source': 
'commons-desc-page' + }, + 'LicenseShortName': { + 'value': 'CC BY 2.5', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Restrictions': { + 'value': '', + 'source': 'commons-desc-page', + 'hidden': '' + } + } + } + ] + } + } + } +} + +class TestDownloadInfo(unittest.TestCase): + @patch('requests.get', autospec=True) + def test_download(self, requestsGetMock): + requestsGetMock.side_effect = [Mock(json=lambda: TEST_RESPONSE1), Mock(json=lambda: TEST_RESPONSE2)] + with tempfile.TemporaryDirectory() as tempDir: + # Create temp image-data db + imgDb = os.path.join(tempDir, 'img_data.db') + createTestDbTable( + imgDb, + 'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)', + 'INSERT into page_imgs VALUES (?, ?)', + { + (1, 'Octopus2.jpg'), + } + ) + # Run + downloadInfo(imgDb) + # Check + self.assertEqual( + readTestDbTable(imgDb, 'SELECT id, name, license, artist, credit, restrictions, url from imgs'), + { + (1, 'Octopus2.jpg', 'CC BY-SA 3.0', 'albert kok', 'Own work', '', + 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'), + } + ) + # Run with updated image-data db + createTestDbTable( + imgDb, + None, + 'INSERT into page_imgs VALUES (?, ?)', + { + (2, 'Georgia_Aquarium_-_Giant_Grouper_edit.jpg'), + } + ) + downloadInfo(imgDb) + # Check + self.assertEqual( + readTestDbTable(imgDb, 'SELECT id, name, license, artist, credit, restrictions, url from imgs'), + { + (1, 'Octopus2.jpg', 'CC BY-SA 3.0', 'albert kok', 'Own work', '', + 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'), + (2, 'Georgia_Aquarium_-_Giant_Grouper_edit.jpg', 'CC BY 2.5', 'Taken by Diliff Edited by Fir0002', + 'File:Georgia Aquarium - Giant Grouper.jpg', '', 'https://upload.wikimedia.org/' \ + 'wikipedia/commons/2/23/Georgia_Aquarium_-_Giant_Grouper_edit.jpg'), + } + ) diff --git a/backend/tests/enwiki/test_download_imgs.py b/backend/tests/enwiki/test_download_imgs.py new file mode 100644 index 0000000..823ac37 --- /dev/null +++ 
b/backend/tests/enwiki/test_download_imgs.py @@ -0,0 +1,54 @@ +import unittest +from unittest.mock import Mock, patch +import tempfile, os + +from tests.common import readTestFile, createTestDbTable +from hist_data.enwiki.download_imgs import downloadImgs + +class TestDownloadInfo(unittest.TestCase): + @patch('requests.get', autospec=True) + def test_download(self, requestsGetMock): + requestsGetMock.side_effect = lambda url, **kwargs: Mock(content=('img:' + url).encode()) + with tempfile.TemporaryDirectory() as tempDir: + # Create temp image-data db + imgDb = os.path.join(tempDir, 'img_data.db') + createTestDbTable( + imgDb, + 'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)', + 'INSERT into page_imgs VALUES (?, ?)', + { + (1, 'one'), + (2, 'two'), + (3, 'three'), + (4, 'four'), + (5, 'five'), + (6, 'six'), + (7, 'seven'), + } + ) + createTestDbTable( + imgDb, + 'CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \ + 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)', + 'INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)', + { + (11, 'one','cc-by','alice','anna','','https://upload.wikimedia.org/1.jpg'), + (12, 'two','???','bob','barbara','','https://upload.wikimedia.org/2.png'), + (13, 'three','cc-by-sa','clare','File:?','','https://upload.wikimedia.org/3.gif'), + (14, 'four','cc-by-sa 4.0','dave','dan','all','https://upload.wikimedia.org/4.jpeg'), + (15, 'five','cc0','eve','eric',None,'https://upload.wikimedia.org/5.png'), + (16, 'six','cc-by','','fred','','https://upload.wikimedia.org/6.png'), + } + ) + # Create temp output directory + with tempfile.TemporaryDirectory() as outDir: + # Run + downloadImgs(imgDb, outDir, 0) + # Check + expectedImgs = { + '11.jpg': 'img:https://upload.wikimedia.org/1.jpg', + '15.png': 'img:https://upload.wikimedia.org/5.png', + } + self.assertEqual(set(os.listdir(outDir)), set(expectedImgs.keys())) + for imgName, content in expectedImgs.items(): + 
self.assertEqual(readTestFile(os.path.join(outDir, imgName)), content) diff --git a/backend/tests/enwiki/test_gen_desc_data.py b/backend/tests/enwiki/test_gen_desc_data.py new file mode 100644 index 0000000..f6d4250 --- /dev/null +++ b/backend/tests/enwiki/test_gen_desc_data.py @@ -0,0 +1,37 @@ +import unittest +import os, tempfile + +from tests.common import readTestDbTable +from hist_data.enwiki.gen_desc_data import genData + +TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_articles.xml.bz2') + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Run + dbFile = os.path.join(tempDir, 'descData.db') + genData(TEST_DUMP_FILE, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, title FROM pages'), + { + (10, 'AccessibleComputing'), + (13, 'AfghanistanHistory'), + (25, 'Autism'), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, target FROM redirects'), + { + (10, 'Computer accessibility'), + (13, 'History of Afghanistan'), + } + ) + descsRows = readTestDbTable(dbFile, 'SELECT id, desc FROM descs') + expectedDescPrefixes = { + 25: 'Kanner autism, or classic autism, is a neurodevelopmental disorder', + } + self.assertEqual({row[0] for row in descsRows}, set(expectedDescPrefixes.keys())) + for id, desc in descsRows: + self.assertTrue(id in expectedDescPrefixes and desc.startswith(expectedDescPrefixes[id])) diff --git a/backend/tests/enwiki/test_gen_dump_index_db.py b/backend/tests/enwiki/test_gen_dump_index_db.py new file mode 100644 index 0000000..64053c4 --- /dev/null +++ b/backend/tests/enwiki/test_gen_dump_index_db.py @@ -0,0 +1,39 @@ +import unittest +import tempfile, os + +from tests.common import createTestBz2, readTestDbTable +from hist_data.enwiki.gen_dump_index_db import genData + +def runGenData(indexFileContents: str): + """ Sets up index file to be read by genData(), runs it, reads the output database, and returns offset info. 
""" + with tempfile.TemporaryDirectory() as tempDir: + # Create temp index file + indexFile = os.path.join(tempDir, 'index.txt.bz2') + createTestBz2(indexFile, indexFileContents) + # Run + dbFile = os.path.join(tempDir, 'data.db') + genData(indexFile, dbFile) + # Read db + return readTestDbTable(dbFile, 'SELECT title, id, offset, next_offset FROM offsets') + +class TestGenData(unittest.TestCase): + def setUp(self): + self.maxDiff = None # Remove output-diff size limit + def test_index_file(self): + indexFileContents = ( + '100:10:apple\n' + '100:11:ant\n' + '300:99:banana ice-cream\n' + '1000:2030:Custard!\n' + ) + offsetsMap = runGenData(indexFileContents) + self.assertEqual(offsetsMap, { + ('apple', 10, 100, 300), + ('ant', 11, 100, 300), + ('banana ice-cream', 99, 300, 1000), + ('Custard!', 2030, 1000, -1), + }) + def test_emp_index(self): + offsetsMap = runGenData('') + self.assertEqual(offsetsMap, set()) + pass diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py new file mode 100644 index 0000000..019b757 --- /dev/null +++ b/backend/tests/enwiki/test_gen_img_data.py @@ -0,0 +1,77 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData + +TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_articles.xml.bz2') + +class TestGetInputPageIdsFromDb(unittest.TestCase): + def test_get(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp tree-of-life db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (1, 'Belgium', 2389729, None, None, None, 2, 'country'), + (2, 'George Washington', 2353711, None, 2378478, None, 2, 'human'), + } + ) 
+ # Create temp dump-index db + indexDb = os.path.join(tempDir, 'dump_index.db') + createTestDbTable( + indexDb, + 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)', + 'INSERT INTO offsets VALUES (?, ?, ?, ?)', + { + ('Belgium',10,0,-1), + ('George Washington',20,0,-1), + ('Autism',25,0,-1), + } + ) + # Run + pageIds = getInputPageIdsFromDb(dbFile, indexDb) + # Check + self.assertEqual(pageIds, {10, 20}) + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp dump-index db + indexDb = os.path.join(tempDir, 'dump_index.db') + createTestDbTable( + indexDb, + 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)', + 'INSERT INTO offsets VALUES (?, ?, ?, ?)', + { + ('AccessibleComputing',10,0,-1), + ('AfghanistanHistory',13,0,-1), + ('Autism',25,0,-1), + } + ) + # Run + imgDb = os.path.join(tempDir, 'imgData.db') + genData({10, 25}, TEST_DUMP_FILE, indexDb, imgDb) + # Check + self.assertEqual( + readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'), + { + (10, None), + (25, 'Autism-stacking-cans 2nd edit.jpg'), + } + ) + # Run with updated page-ids set + genData({13, 10}, TEST_DUMP_FILE, indexDb, imgDb) + # Check + self.assertEqual( + readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'), + { + (10, None), + (13, None), + (25, 'Autism-stacking-cans 2nd edit.jpg'), + } + ) diff --git a/backend/tests/enwiki/test_gen_pageview_data.py b/backend/tests/enwiki/test_gen_pageview_data.py new file mode 100644 index 0000000..154953e --- /dev/null +++ b/backend/tests/enwiki/test_gen_pageview_data.py @@ -0,0 +1,44 @@ +import unittest +import tempfile, os + +from tests.common import createTestBz2, createTestDbTable, readTestDbTable +from hist_data.enwiki.gen_pageview_data import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create 
temp pageview files + pageviewFiles = [os.path.join(tempDir, 'pageviews1.bz2'), os.path.join(tempDir, 'pageviews2.bz2')] + createTestBz2(pageviewFiles[0], ( + 'aa.wikibooks One null desktop 1 W1\n' + 'en.wikipedia Two null mobile-web 10 A9B1\n' + 'en.wikipedia Three null desktop 4 D3\n' + )) + createTestBz2(pageviewFiles[1], ( + 'fr.wikipedia Four null desktop 12 T6U6\n' + 'en.wikipedia Three null desktop 10 E4G5Z61\n' + )) + # Create temp dump-index db + dumpIndexDb = os.path.join(tempDir, 'dump_index.db') + createTestDbTable( + dumpIndexDb, + 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)', + 'INSERT INTO offsets VALUES (?, ?, ?, ?)', + { + ('One', 1, 0, -1), + ('Two', 2, 0, -1), + ('Three', 3, 0, -1), + ('Four', 4, 0, -1), + } + ) + # Run + dbFile = os.path.join(tempDir, 'data.db') + genData(pageviewFiles, dumpIndexDb, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT title, id, views from views'), + { + ('Two', 2, 5), + ('Three', 3, 7), + } + ) |
