diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-10-04 23:58:08 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-10-04 23:58:08 +1100 |
| commit | b1d4c709cb2793745e61d85c337514b9c6c85603 (patch) | |
| tree | 5e26d56ec90e810862d1aba8d0ce03abb0e8cc27 | |
| parent | 07b7ef49b07242014f288652980f5b15bfc087f1 (diff) | |
Add gen_picked_data.py
Add unit test
Update READMEs and .gitignore
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | backend/hist_data/README.md | 5 | ||||
| -rwxr-xr-x | backend/hist_data/gen_picked_data.py | 62 | ||||
| -rw-r--r-- | backend/hist_data/picked/README.md | 29 | ||||
| -rw-r--r-- | backend/tests/test_gen_picked_data.py | 171 |
5 files changed, 268 insertions, 0 deletions
@@ -12,3 +12,4 @@ __pycache__ /backend/hist_data/enwiki/*.bz2 /backend/hist_data/enwiki/imgs/ /backend/hist_data/img/ +/backend/hist_data/picked/ diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 517259c..3fcb8df 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -68,3 +68,8 @@ Some of the scripts use third-party packages: 1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. 1. Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/, and the `events` and `images` tables (only adds descriptions for events with images). + +## Optionally Add Extra Event Data +1. Additional events can be described in `picked/events.json`, with images for them put + in `picked` (see the README for details). +1. Can run `gen_picked_data.py` to add those described events to the database. diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py new file mode 100755 index 0000000..7d6071a --- /dev/null +++ b/backend/hist_data/gen_picked_data.py @@ -0,0 +1,62 @@ +#!/usr/bin/python3 + +""" +Adds additional manually-picked events to the database +""" + +# Enable unit testing code to, when running this script, resolve imports of modules within this directory +import os, sys +parentDir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(parentDir) + +import json, sqlite3 +from gen_imgs import convertImage + +PICKED_DIR = 'picked' +PICKED_EVT_FILE = 'events.json' +DB_FILE = 'data.db' +IMG_OUT_DIR = 'img' + +def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str) -> None: + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + with open(os.path.join(pickedDir, pickedEvtFile)) as f: + eventsToAdd = json.load(f) + nextId = -1 + for event in eventsToAdd: + row = dbCur.execute('SELECT id from events where title = ?', (event['title'],)).fetchone() + if row is not None: + print(f'WARNING: Event "{event["title"]}" already exists, and will be skipped') + continue + print(f'Adding event {event["title"]}') + print("- Updating 'events'") + dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + (nextId, event['title'], event['start'], event['start_upper'], event['end'], event['end_upper'], + event['fmt'], event['ctg'])) + print('- Converting image file') + image = event['image'] + success = convertImage(os.path.join(pickedDir, image['file']), os.path.join(imgOutDir, str(nextId) + '.jpg')) + if not success: + break + print("- Updating 'images'") + dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', + (nextId, image['url'], image['license'], image['artist'], image['credit'])) + print("- Updating 'event_imgs'") + dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (nextId, nextId)) + print("- Updating 'descs'") + dbCur.execute('INSERT INTO descs VALUES (?, ?, ?)', (nextId, nextId, event['desc'])) + print("- Updating 'pop'") + dbCur.execute('INSERT INTO pop VALUES (?, ?)', (nextId, event['pop'])) + # + nextId -= 1 + # + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(PICKED_DIR, PICKED_EVT_FILE, DB_FILE, IMG_OUT_DIR) diff --git a/backend/hist_data/picked/README.md b/backend/hist_data/picked/README.md new file mode 100644 index 0000000..becbd24 --- /dev/null +++ b/backend/hist_data/picked/README.md @@ -0,0 +1,29 @@ +This directory holds data for additional events + +Files +===== +- events.json <br> + Encodes an array of objects, each describing an event to add. + For example: + + [{ + "title": "COVID-19 Pandemic", + "start": 2458919, + "start_upper": null, + "end": null, + "end_upper": null, + "fmt": 2, + "ctg": "event", + "image": { + "file": "covid.jpg", + "url": "https://en.wikipedia.org/wiki/File:Covid-19_SP_-_UTI_V._Nova_Cachoeirinha.jpg", + "license": "cc-by-sa 4.0", + "artist": "Gustavo Basso", + "credit": "" + }, + "desc": "Global pandemic caused by the virus SARS-CoV-2", + "pop": 100 + }] + + The `image.file` field should name an image file in this directory. + Other fields correspond to those in the `events`, `images`, `descs`, and `pop` tables (see `../README.md`). diff --git a/backend/tests/test_gen_picked_data.py b/backend/tests/test_gen_picked_data.py new file mode 100644 index 0000000..d469a31 --- /dev/null +++ b/backend/tests/test_gen_picked_data.py @@ -0,0 +1,171 @@ +import unittest +from unittest.mock import patch +import tempfile, os, shutil + +from tests.common import createTestFile, createTestDbTable, readTestDbTable +from hist_data.gen_picked_data import genData + +TEST_IMG = os.path.join(os.path.dirname(__file__), 'test_img.png') + +class TestGenImgs(unittest.TestCase): + @patch('hist_data.gen_imgs.convertImage', autospec=True) + def test_gen(self, convertImageMock): + with tempfile.TemporaryDirectory() as tempDir: + convertImageMock.side_effect = lambda imgPath, outPath: shutil.copy(imgPath, outPath) + # Create picked-event file + pickedDir = os.path.join(tempDir, 'picked') + os.mkdir(pickedDir) + pickedEvtFile = os.path.join(pickedDir, 'events.json') + createTestFile(pickedEvtFile, ''' + [{ + "title": "COVID-19 Pandemic", + "start": 2458919, + "start_upper": null, + "end": null, + "end_upper": null, + "fmt": 2, + "ctg": "event", + "image": { + "file": "covid.jpg", + "url": "https://en.wikipedia.org/wiki/File:Covid-19_SP_-_UTI_V._Nova_Cachoeirinha.jpg", + "license": "cc-by-sa 4.0", + "artist": "Gustavo Basso", + "credit": "" + }, + "desc": "Global pandemic caused by the virus SARS-CoV-2", + "pop": 100 + },{ + "title": "foo", + "start": -100, + "start_upper": 2000, + "end": null, + "end_upper": null, + "fmt": 0, + "ctg": "discovery", + "image": { + "file": "foo.jpg", + "url": "https://example.com/foo_img", + "license": "cc-by", + "artist": "Fibble Wesky", + "credit": "Plosta Grimble and Hoska Ferlento" + }, + "desc": "Rhubarb, broccoli, and the fifth box under Tuesday", + "pop": 0 + },{ + "title": "event one", + "start": 100, + "start_upper": null, + "end": null, + "end_upper": null, + "fmt": 0, + "ctg": "event", + "image": { + "file": "x.jpg", + "url": "?", + "license": "cc0", + "artist": "?", + "credit": "???" + }, + "desc": "?", + "pop": 0 + }] + ''') + # Create picked images + shutil.copy(TEST_IMG, os.path.join(pickedDir, 'covid.jpg')) + shutil.copy(TEST_IMG, os.path.join(pickedDir, 'foo.jpg')) + # Create temp history db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (1, 'event one', 100, 1000, None, None, 0, 'event'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)', + 'INSERT INTO images VALUES (?, ?, ?, ?, ?)', + { + (10, 'http://example.com/img1', 'cc0', 'Spofta Klurry', ''), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)', + 'INSERT INTO event_imgs VALUES (?, ?)', + { + (1, 10), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)', + 'INSERT INTO descs VALUES (?, ?, ?)', + { + (1, 100, 'desc one'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)', + 'INSERT INTO pop VALUES (?, ?)', + { + (1, 99), + } + ) + # Create existing event images + imgOutDir = os.path.join(tempDir, 'imgs') + os.mkdir(imgOutDir) + shutil.copy(TEST_IMG, os.path.join(imgOutDir, '10.jpg')) + # Run + genData(pickedDir, pickedEvtFile, dbFile, imgOutDir) + # Check + self.assertEqual(set(os.listdir(imgOutDir)), { + '10.jpg', + '-1.jpg', + '-2.jpg', + }) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg FROM events'), + { + (1, 'event one', 100, 1000, None, None, 0, 'event'), + (-1, 'COVID-19 Pandemic', 2458919, None, None, None, 2, 'event'), + (-2, 'foo', -100, 2000, None, None, 0, 'discovery'), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, url, license, artist, credit FROM images'), + { + (10, 'http://example.com/img1', 'cc0', 'Spofta Klurry', ''), + (-1, 'https://en.wikipedia.org/wiki/File:Covid-19_SP_-_UTI_V._Nova_Cachoeirinha.jpg', + 'cc-by-sa 4.0', 'Gustavo Basso', ''), + (-2, 'https://example.com/foo_img', 'cc-by', 'Fibble Wesky', 'Plosta Grimble and Hoska Ferlento'), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, img_id FROM event_imgs'), + { + (1, 10), + (-1, -1), + (-2, -2), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, wiki_id, desc from descs'), + { + (1, 100, 'desc one'), + (-1, -1, 'Global pandemic caused by the virus SARS-CoV-2'), + (-2, -2, 'Rhubarb, broccoli, and the fifth box under Tuesday'), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, pop from pop'), + { + (1, 99), + (-1, 100), + (-2, 0), + } + ) |
