about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- backend/tests/enwiki/__init__.py 0
-rw-r--r-- backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2 bin 0 -> 41998 bytes
-rw-r--r-- backend/tests/enwiki/test_download_img_license_info.py 185
-rw-r--r-- backend/tests/enwiki/test_download_imgs.py 54
-rw-r--r-- backend/tests/enwiki/test_gen_desc_data.py 37
-rw-r--r-- backend/tests/enwiki/test_gen_dump_index_db.py 39
-rw-r--r-- backend/tests/enwiki/test_gen_img_data.py 77
-rw-r--r-- backend/tests/enwiki/test_gen_pageview_data.py 44
8 files changed, 436 insertions, 0 deletions
diff --git a/backend/tests/enwiki/__init__.py b/backend/tests/enwiki/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/tests/enwiki/__init__.py
diff --git a/backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2 b/backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2
new file mode 100644
index 0000000..2abfdaa
--- /dev/null
+++ b/backend/tests/enwiki/sample_enwiki_pages_articles.xml.bz2
Binary files differ
diff --git a/backend/tests/enwiki/test_download_img_license_info.py b/backend/tests/enwiki/test_download_img_license_info.py
new file mode 100644
index 0000000..f285d55
--- /dev/null
+++ b/backend/tests/enwiki/test_download_img_license_info.py
@@ -0,0 +1,185 @@
+import unittest
+from unittest.mock import Mock, patch
+import tempfile, os
+
+from tests.common import createTestDbTable, readTestDbTable
+from hist_data.enwiki.download_img_license_info import downloadInfo
+
+TEST_RESPONSE1 = { # Canned JSON payload handed out by the first mocked requests.get() call in TestDownloadInfo
+ 'batchcomplete': '',
+ 'query': {
+ 'normalized': [ # NOTE(review): names the Giant Grouper file, which has no page entry in this payload - presumably copied from TEST_RESPONSE2
+ {
+ 'from': 'File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg',
+ 'to': 'File:Georgia Aquarium - Giant Grouper edit.jpg'
+ }
+ ],
+ 'pages': {
+ '-1': { # The only page entry in this payload
+ 'ns': 6,
+ 'title': 'File:Octopus2.jpg',
+ 'missing': '',
+ 'known': '',
+ 'imagerepository': 'shared',
+ 'imageinfo': [
+ {
+ 'url': 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg', # Same value as the url expected in the imgs row
+ 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Octopus2.jpg',
+ 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=2795257',
+ 'extmetadata': { # These values match the credit/artist/license/restrictions the test expects in the imgs row
+ 'Credit': {
+ 'value': '<span class=\\"int-own-work\\" lang=\\"en\\">Own work</span>', # Markup-wrapped; the test expects the stored credit to be just 'Own work'
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ },
+ 'Artist': {
+ 'value': 'albert kok',
+ 'source': 'commons-desc-page'
+ },
+ 'LicenseShortName': {
+ 'value': 'CC BY-SA 3.0',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ },
+ 'Restrictions': {
+ 'value': '',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+}
+TEST_RESPONSE2 = { # Canned JSON payload handed out by the second mocked requests.get() call in TestDownloadInfo
+ 'batchcomplete': '',
+ 'query': {
+ 'normalized': [ # Maps the underscore form of the file name to the spaced title used by the '-2' page entry
+ {
+ 'from': 'File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg',
+ 'to': 'File:Georgia Aquarium - Giant Grouper edit.jpg'
+ }
+ ],
+ 'pages': {
+ '-1': { # Same Octopus2.jpg entry as in TEST_RESPONSE1
+ 'ns': 6,
+ 'title': 'File:Octopus2.jpg',
+ 'missing': '',
+ 'known': '',
+ 'imagerepository': 'shared',
+ 'imageinfo': [
+ {
+ 'url': 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg',
+ 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Octopus2.jpg',
+ 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=2795257',
+ 'extmetadata': {
+ 'Credit': {
+ 'value': '<span class=\\"int-own-work\\" lang=\\"en\\">Own work</span>',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ },
+ 'Artist': {
+ 'value': 'albert kok',
+ 'source': 'commons-desc-page'
+ },
+ 'LicenseShortName': {
+ 'value': 'CC BY-SA 3.0',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ },
+ 'Restrictions': {
+ 'value': '',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ }
+ }
+ }
+ ]
+ },
+ '-2': { # Entry for the image added before the second downloadInfo() run
+ 'ns': 6,
+ 'title': 'File:Georgia Aquarium - Giant Grouper edit.jpg', # Spaced title; the test expects the stored name to keep its underscores
+ 'missing': '',
+ 'known': '',
+ 'imagerepository': 'shared',
+ 'imageinfo': [
+ {
+ 'url': 'https://upload.wikimedia.org/wikipedia/commons/2/23/Georgia_Aquarium_-_Giant_Grouper_edit.jpg',
+ 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg',
+ 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=823649',
+ 'extmetadata': {
+ 'Credit': { # Anchor markup; the test expects only its link text as the stored credit
+ "value": "<a href=\"//commons.wikimedia.org/wiki/File:Georgia_Aquarium_-_Giant_Grouper.jpg\" title=\"File:Georgia Aquarium - Giant Grouper.jpg\">File:Georgia Aquarium - Giant Grouper.jpg</a>",
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ },
+ 'Artist': { # Two links plus plain text; the test expects 'Taken by Diliff Edited by Fir0002'
+ "value": "Taken by <a href=\"//commons.wikimedia.org/wiki/User:Diliff\" title=\"User:Diliff\">Diliff</a> Edited by <a href=\"//commons.wikimedia.org/wiki/User:Fir0002\" title=\"User:Fir0002\">Fir0002</a>",
+ 'source': 'commons-desc-page'
+ },
+ 'LicenseShortName': {
+ 'value': 'CC BY 2.5',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ },
+ 'Restrictions': {
+ 'value': '',
+ 'source': 'commons-desc-page',
+ 'hidden': ''
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+}
+
+class TestDownloadInfo(unittest.TestCase): # Exercises downloadInfo() with requests.get replaced by canned responses
+ @patch('requests.get', autospec=True)
+ def test_download(self, requestsGetMock): # requestsGetMock is the mock that @patch substitutes for requests.get
+ requestsGetMock.side_effect = [Mock(json=lambda: TEST_RESPONSE1), Mock(json=lambda: TEST_RESPONSE2)] # Successive requests.get() calls return these mocks in order
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp image-data db with a single page-to-image association
+ imgDb = os.path.join(tempDir, 'img_data.db')
+ createTestDbTable(
+ imgDb,
+ 'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)',
+ 'INSERT into page_imgs VALUES (?, ?)',
+ {
+ (1, 'Octopus2.jpg'),
+ }
+ )
+ # Run (presumably consumes TEST_RESPONSE1 - assumes one requests.get() call per run; confirm in downloadInfo)
+ downloadInfo(imgDb)
+ # Check that the 'imgs' table in the same db now holds the metadata for Octopus2.jpg
+ self.assertEqual(
+ readTestDbTable(imgDb, 'SELECT id, name, license, artist, credit, restrictions, url from imgs'),
+ {
+ (1, 'Octopus2.jpg', 'CC BY-SA 3.0', 'albert kok', 'Own work', '',
+ 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'),
+ }
+ )
+ # Run with updated image-data db (a second page-to-image association is inserted first)
+ createTestDbTable(
+ imgDb,
+ None, # No CREATE statement is passed - presumably the existing page_imgs table is reused; confirm in tests.common
+ 'INSERT into page_imgs VALUES (?, ?)',
+ {
+ (2, 'Georgia_Aquarium_-_Giant_Grouper_edit.jpg'),
+ }
+ )
+ downloadInfo(imgDb)
+ # Check that the earlier row is still present and a row for the new image was added
+ self.assertEqual(
+ readTestDbTable(imgDb, 'SELECT id, name, license, artist, credit, restrictions, url from imgs'),
+ {
+ (1, 'Octopus2.jpg', 'CC BY-SA 3.0', 'albert kok', 'Own work', '',
+ 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'),
+ (2, 'Georgia_Aquarium_-_Giant_Grouper_edit.jpg', 'CC BY 2.5', 'Taken by Diliff Edited by Fir0002',
+ 'File:Georgia Aquarium - Giant Grouper.jpg', '', 'https://upload.wikimedia.org/' \
+ 'wikipedia/commons/2/23/Georgia_Aquarium_-_Giant_Grouper_edit.jpg'),
+ }
+ )
diff --git a/backend/tests/enwiki/test_download_imgs.py b/backend/tests/enwiki/test_download_imgs.py
new file mode 100644
index 0000000..823ac37
--- /dev/null
+++ b/backend/tests/enwiki/test_download_imgs.py
@@ -0,0 +1,54 @@
+import unittest
+from unittest.mock import Mock, patch
+import tempfile, os
+
+from tests.common import readTestFile, createTestDbTable
+from hist_data.enwiki.download_imgs import downloadImgs
+
+class TestDownloadInfo(unittest.TestCase): # Exercises downloadImgs(); NOTE(review): the class name matches the one in test_download_img_license_info.py - presumably a copy-paste leftover
+ @patch('requests.get', autospec=True)
+ def test_download(self, requestsGetMock): # requestsGetMock is the mock that @patch substitutes for requests.get
+ requestsGetMock.side_effect = lambda url, **kwargs: Mock(content=('img:' + url).encode()) # Each request yields a mock whose .content is the bytes of 'img:' + url
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp image-data db: page_imgs lists seven image names, imgs holds metadata for six of them
+ imgDb = os.path.join(tempDir, 'img_data.db')
+ createTestDbTable(
+ imgDb,
+ 'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)',
+ 'INSERT into page_imgs VALUES (?, ?)',
+ {
+ (1, 'one'),
+ (2, 'two'),
+ (3, 'three'),
+ (4, 'four'),
+ (5, 'five'),
+ (6, 'six'),
+ (7, 'seven'), # Has no row in the imgs table created below
+ }
+ )
+ createTestDbTable(
+ imgDb,
+ 'CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \
+ 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)',
+ 'INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)',
+ {
+ (11, 'one','cc-by','alice','anna','','https://upload.wikimedia.org/1.jpg'), # Expected to be downloaded
+ (12, 'two','???','bob','barbara','','https://upload.wikimedia.org/2.png'),
+ (13, 'three','cc-by-sa','clare','File:?','','https://upload.wikimedia.org/3.gif'),
+ (14, 'four','cc-by-sa 4.0','dave','dan','all','https://upload.wikimedia.org/4.jpeg'),
+ (15, 'five','cc0','eve','eric',None,'https://upload.wikimedia.org/5.png'), # Expected to be downloaded
+ (16, 'six','cc-by','','fred','','https://upload.wikimedia.org/6.png'),
+ }
+ )
+ # Create temp output directory
+ with tempfile.TemporaryDirectory() as outDir:
+ # Run
+ downloadImgs(imgDb, outDir, 0) # NOTE(review): meaning of the third argument (0) is not visible here - presumably a delay/timeout; confirm
+ # Check: only ids 11 and 15 are expected on disk, named by imgs.id plus the URL's extension.
+ expectedImgs = { # Presumably the other rows are skipped for their license/artist/restrictions/file type - confirm in downloadImgs
+ '11.jpg': 'img:https://upload.wikimedia.org/1.jpg',
+ '15.png': 'img:https://upload.wikimedia.org/5.png',
+ }
+ self.assertEqual(set(os.listdir(outDir)), set(expectedImgs.keys())) # No files other than the expected ones
+ for imgName, content in expectedImgs.items():
+ self.assertEqual(readTestFile(os.path.join(outDir, imgName)), content) # File content must match what the mock returned for that URL
diff --git a/backend/tests/enwiki/test_gen_desc_data.py b/backend/tests/enwiki/test_gen_desc_data.py
new file mode 100644
index 0000000..f6d4250
--- /dev/null
+++ b/backend/tests/enwiki/test_gen_desc_data.py
@@ -0,0 +1,37 @@
+import unittest
+import os, tempfile
+
+from tests.common import readTestDbTable
+from hist_data.enwiki.gen_desc_data import genData
+
+TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_articles.xml.bz2') # Sample dump stored in the same directory as this test module
+
+class TestGenData(unittest.TestCase): # Runs genData() on the bundled sample dump and checks the tables it writes
+ def test_gen(self):
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Run genData() on the sample dump, writing to a db file in the temp directory
+ dbFile = os.path.join(tempDir, 'descData.db')
+ genData(TEST_DUMP_FILE, dbFile)
+ # Check the 'pages', 'redirects' and 'descs' tables in turn
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT id, title FROM pages'),
+ {
+ (10, 'AccessibleComputing'),
+ (13, 'AfghanistanHistory'),
+ (25, 'Autism'),
+ }
+ )
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT id, target FROM redirects'),
+ {
+ (10, 'Computer accessibility'),
+ (13, 'History of Afghanistan'),
+ }
+ )
+ descsRows = readTestDbTable(dbFile, 'SELECT id, desc FROM descs') # Descriptions are only compared by prefix below
+ expectedDescPrefixes = {
+ 25: 'Kanner autism, or classic autism, is a neurodevelopmental disorder',
+ }
+ self.assertEqual({row[0] for row in descsRows}, set(expectedDescPrefixes.keys())) # Exactly the expected ids have descriptions
+ for id, desc in descsRows: # NOTE(review): 'id' shadows the builtin id() inside this method
+ self.assertTrue(id in expectedDescPrefixes and desc.startswith(expectedDescPrefixes[id])) # The membership test repeats what the assertEqual above already established
diff --git a/backend/tests/enwiki/test_gen_dump_index_db.py b/backend/tests/enwiki/test_gen_dump_index_db.py
new file mode 100644
index 0000000..64053c4
--- /dev/null
+++ b/backend/tests/enwiki/test_gen_dump_index_db.py
@@ -0,0 +1,39 @@
+import unittest
+import tempfile, os
+
+from tests.common import createTestBz2, readTestDbTable
+from hist_data.enwiki.gen_dump_index_db import genData
+
+def runGenData(indexFileContents: str): # Helper shared by the tests below
+ """ Sets up index file to be read by genData(), runs it, reads the output database, and returns offset info. """
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp index file holding the given text (written via the createTestBz2 helper)
+ indexFile = os.path.join(tempDir, 'index.txt.bz2')
+ createTestBz2(indexFile, indexFileContents)
+ # Run genData(), writing to a db file in the same temp directory
+ dbFile = os.path.join(tempDir, 'data.db')
+ genData(indexFile, dbFile)
+ # Read db; the result is returned from inside the 'with', before the temp directory is removed
+ return readTestDbTable(dbFile, 'SELECT title, id, offset, next_offset FROM offsets')
+
+class TestGenData(unittest.TestCase): # Checks the 'offsets' rows genData() produces for given index-file text
+ def setUp(self):
+ self.maxDiff = None # Remove output-diff size limit
+ def test_index_file(self):
+ indexFileContents = ( # Each line reads 'offset:id:title', as the expected tuples below show
+ '100:10:apple\n'
+ '100:11:ant\n'
+ '300:99:banana ice-cream\n'
+ '1000:2030:Custard!\n'
+ )
+ offsetsMap = runGenData(indexFileContents)
+ self.assertEqual(offsetsMap, { # Rows are (title, id, offset, next_offset)
+ ('apple', 10, 100, 300), # next_offset is the next larger offset in the input ...
+ ('ant', 11, 100, 300), # ... so entries sharing offset 100 share next_offset 300
+ ('banana ice-cream', 99, 300, 1000),
+ ('Custard!', 2030, 1000, -1), # The entry with the last offset gets next_offset -1
+ })
+ def test_emp_index(self): # Empty index text is expected to produce no rows
+ offsetsMap = runGenData('')
+ self.assertEqual(offsetsMap, set())
+ pass # NOTE(review): redundant no-op after the assertion; safe to drop
diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py
new file mode 100644
index 0000000..019b757
--- /dev/null
+++ b/backend/tests/enwiki/test_gen_img_data.py
@@ -0,0 +1,77 @@
+import unittest
+import tempfile, os
+
+from tests.common import createTestDbTable, readTestDbTable
+from hist_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData
+
+TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_articles.xml.bz2') # Sample dump stored in the same directory as this test module
+
+class TestGetInputPageIdsFromDb(unittest.TestCase): # Checks which page ids getInputPageIdsFromDb() returns for an events db plus a dump-index db
+ def test_get(self):
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp events db (two titled events)
+ dbFile = os.path.join(tempDir, 'data.db')
+ createTestDbTable(
+ dbFile,
+ 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \
+ 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)',
+ 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+ {
+ (1, 'Belgium', 2389729, None, None, None, 2, 'country'),
+ (2, 'George Washington', 2353711, None, 2378478, None, 2, 'human'),
+ }
+ )
+ # Create temp dump-index db; it lists one title ('Autism') that has no event
+ indexDb = os.path.join(tempDir, 'dump_index.db')
+ createTestDbTable(
+ indexDb,
+ 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
+ 'INSERT INTO offsets VALUES (?, ?, ?, ?)',
+ {
+ ('Belgium',10,0,-1),
+ ('George Washington',20,0,-1),
+ ('Autism',25,0,-1),
+ }
+ )
+ # Run
+ pageIds = getInputPageIdsFromDb(dbFile, indexDb)
+ # Check: the offsets ids of the two event titles are expected; 25 ('Autism') is not
+ self.assertEqual(pageIds, {10, 20})
+
+class TestGenData(unittest.TestCase): # Runs genData() twice on the sample dump and checks the page_imgs rows after each run
+ def test_gen(self):
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp dump-index db covering the three page ids used below
+ indexDb = os.path.join(tempDir, 'dump_index.db')
+ createTestDbTable(
+ indexDb,
+ 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
+ 'INSERT INTO offsets VALUES (?, ?, ?, ?)',
+ {
+ ('AccessibleComputing',10,0,-1),
+ ('AfghanistanHistory',13,0,-1),
+ ('Autism',25,0,-1),
+ }
+ )
+ # Run for page ids 10 and 25
+ imgDb = os.path.join(tempDir, 'imgData.db')
+ genData({10, 25}, TEST_DUMP_FILE, indexDb, imgDb)
+ # Check: one row per requested page id, with img_name None for page 10
+ self.assertEqual(
+ readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'),
+ {
+ (10, None),
+ (25, 'Autism-stacking-cans 2nd edit.jpg'),
+ }
+ )
+ # Run with updated page-ids set (13 is new, 10 was already handled in the first run)
+ genData({13, 10}, TEST_DUMP_FILE, indexDb, imgDb)
+ # Check: the rows from the first run are still present and a row for page 13 was added
+ self.assertEqual(
+ readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'),
+ {
+ (10, None),
+ (13, None),
+ (25, 'Autism-stacking-cans 2nd edit.jpg'),
+ }
+ )
diff --git a/backend/tests/enwiki/test_gen_pageview_data.py b/backend/tests/enwiki/test_gen_pageview_data.py
new file mode 100644
index 0000000..154953e
--- /dev/null
+++ b/backend/tests/enwiki/test_gen_pageview_data.py
@@ -0,0 +1,44 @@
+import unittest
+import tempfile, os
+
+from tests.common import createTestBz2, createTestDbTable, readTestDbTable
+from hist_data.enwiki.gen_pageview_data import genData
+
+class TestGenData(unittest.TestCase): # Runs genData() on two small pageview files and checks the resulting 'views' table
+ def test_gen(self):
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp pageview files (two bz2 files of space-separated lines)
+ pageviewFiles = [os.path.join(tempDir, 'pageviews1.bz2'), os.path.join(tempDir, 'pageviews2.bz2')]
+ createTestBz2(pageviewFiles[0], (
+ 'aa.wikibooks One null desktop 1 W1\n'
+ 'en.wikipedia Two null mobile-web 10 A9B1\n'
+ 'en.wikipedia Three null desktop 4 D3\n'
+ ))
+ createTestBz2(pageviewFiles[1], (
+ 'fr.wikipedia Four null desktop 12 T6U6\n'
+ 'en.wikipedia Three null desktop 10 E4G5Z61\n'
+ ))
+ # Create temp dump-index db with ids for all four titles that appear in the pageview files
+ dumpIndexDb = os.path.join(tempDir, 'dump_index.db')
+ createTestDbTable(
+ dumpIndexDb,
+ 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
+ 'INSERT INTO offsets VALUES (?, ?, ?, ?)',
+ {
+ ('One', 1, 0, -1),
+ ('Two', 2, 0, -1),
+ ('Three', 3, 0, -1),
+ ('Four', 4, 0, -1),
+ }
+ )
+ # Run
+ dbFile = os.path.join(tempDir, 'data.db')
+ genData(pageviewFiles, dumpIndexDb, dbFile)
+ # Check: only the titles from 'en.wikipedia' lines are expected; views are presumably a per-file average - confirm in genData
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT title, id, views from views'),
+ {
+ ('Two', 2, 5), # 10 views in one of the two files
+ ('Three', 3, 7), # 4 + 10 views across the two files
+ }
+ )