aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--backend/hist_data/README.md5
-rwxr-xr-xbackend/hist_data/gen_picked_data.py62
-rw-r--r--backend/hist_data/picked/README.md29
-rw-r--r--backend/tests/test_gen_picked_data.py171
5 files changed, 268 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 6e83679..8790efe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ __pycache__
/backend/hist_data/enwiki/*.bz2
/backend/hist_data/enwiki/imgs/
/backend/hist_data/img/
+/backend/hist_data/picked/
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 517259c..3fcb8df 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -68,3 +68,8 @@ Some of the scripts use third-party packages:
1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
1. Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/,
and the `events` and `images` tables (only adds descriptions for events with images).
+
+## Optionally Add Extra Event Data
+1. Additional events can be described in `picked/events.json`, with their image files placed
+ in `picked/` (see `picked/README.md` for details).
+1. Run `gen_picked_data.py` to add the described events to the database.
diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py
new file mode 100755
index 0000000..7d6071a
--- /dev/null
+++ b/backend/hist_data/gen_picked_data.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+
+"""
+Adds additional manually-picked events to the database
+"""
+
+# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+import os, sys
+parentDir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(parentDir)
+
+import json, sqlite3
+from gen_imgs import convertImage
+
+PICKED_DIR = 'picked'  # Directory holding the event file and the image files it names (relative path)
+PICKED_EVT_FILE = 'events.json'  # JSON file within PICKED_DIR that describes the events to add
+DB_FILE = 'data.db'  # SQLite database that the events get added to
+IMG_OUT_DIR = 'img'  # Directory that converted event images are written to, as '<id>.jpg'
+
def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str) -> None:
    """
    Adds the events described in a JSON file to the database, along with their images.

    pickedDir: Directory holding the event file and the image files it names
    pickedEvtFile: Name of the JSON event file, within pickedDir
    dbFile: Path of the SQLite database to update
    imgOutDir: Directory that converted images are written to, as '<id>.jpg'

    Events whose title is already in the 'events' table are skipped with a warning.
    Added events get negative IDs (-1, -2, ...), continuing below the lowest ID already in 'events',
    and each ID is reused for the event's 'images', 'event_imgs', 'descs', and 'pop' rows.
    If an image can't be converted, processing stops. That event is not added, but events
    added before it are kept.
    """
    # Read the event descriptions first, so a missing or malformed file fails before the database is opened
    with open(os.path.join(pickedDir, pickedEvtFile), encoding='utf-8') as f:
        eventsToAdd = json.load(f)
    #
    dbCon = sqlite3.connect(dbFile)
    try:
        dbCur = dbCon.cursor()
        # Start below any negative IDs left by earlier runs, so adding new events on a re-run can't reuse an ID
        minId = dbCur.execute('SELECT MIN(id) FROM events').fetchone()[0]
        nextId = -1 if minId is None else min(-1, minId - 1)
        for event in eventsToAdd:
            title = event['title']
            row = dbCur.execute('SELECT id from events where title = ?', (title,)).fetchone()
            if row is not None:
                print(f'WARNING: Event "{title}" already exists, and will be skipped')
                continue
            print(f'Adding event {title}')
            # Convert the image before touching the database, so a failure can't leave a half-added event
            print('- Converting image file')
            image = event['image']
            success = convertImage(
                os.path.join(pickedDir, image['file']), os.path.join(imgOutDir, str(nextId) + '.jpg'))
            if not success:
                print(f'WARNING: Unable to convert image for "{title}", so no further events will be added')
                break
            print("- Updating 'events'")
            dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
                (nextId, title, event['start'], event['start_upper'], event['end'], event['end_upper'],
                    event['fmt'], event['ctg']))
            print("- Updating 'images'")
            dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)',
                (nextId, image['url'], image['license'], image['artist'], image['credit']))
            print("- Updating 'event_imgs'")
            dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (nextId, nextId))
            print("- Updating 'descs'")
            dbCur.execute('INSERT INTO descs VALUES (?, ?, ?)', (nextId, nextId, event['desc']))
            print("- Updating 'pop'")
            dbCur.execute('INSERT INTO pop VALUES (?, ?)', (nextId, event['pop']))
            #
            nextId -= 1
        #
        dbCon.commit()
    finally:
        # Release the connection even if an event is malformed or an insert fails
        dbCon.close()
+
if __name__ == '__main__':
    from argparse import ArgumentParser, RawDescriptionHelpFormatter
    # There are no options: arguments are parsed only so that '-h/--help' shows the module docstring,
    # and so that unexpected arguments are rejected
    ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter).parse_args()
    #
    genData(PICKED_DIR, PICKED_EVT_FILE, DB_FILE, IMG_OUT_DIR)
diff --git a/backend/hist_data/picked/README.md b/backend/hist_data/picked/README.md
new file mode 100644
index 0000000..becbd24
--- /dev/null
+++ b/backend/hist_data/picked/README.md
@@ -0,0 +1,29 @@
+This directory holds data for additional, manually-picked events, which `../gen_picked_data.py` adds to the database.
+
+Files
+=====
+- events.json <br>
+ Encodes an array of objects, each describing an event to add.
+ For example:
+
+ [{
+ "title": "COVID-19 Pandemic",
+ "start": 2458919,
+ "start_upper": null,
+ "end": null,
+ "end_upper": null,
+ "fmt": 2,
+ "ctg": "event",
+ "image": {
+ "file": "covid.jpg",
+ "url": "https://en.wikipedia.org/wiki/File:Covid-19_SP_-_UTI_V._Nova_Cachoeirinha.jpg",
+ "license": "cc-by-sa 4.0",
+ "artist": "Gustavo Basso",
+ "credit": ""
+ },
+ "desc": "Global pandemic caused by the virus SARS-CoV-2",
+ "pop": 100
+ }]
+
+ The `image.file` field should name an image file in this directory.
+ Other fields correspond to those in the `events`, `images`, `descs`, and `pop` tables (see `../README.md`).
diff --git a/backend/tests/test_gen_picked_data.py b/backend/tests/test_gen_picked_data.py
new file mode 100644
index 0000000..d469a31
--- /dev/null
+++ b/backend/tests/test_gen_picked_data.py
@@ -0,0 +1,171 @@
+import unittest
+from unittest.mock import patch
+import tempfile, os, shutil
+
+from tests.common import createTestFile, createTestDbTable, readTestDbTable
+from hist_data.gen_picked_data import genData
+
+TEST_IMG = os.path.join(os.path.dirname(__file__), 'test_img.png')  # Image next to this file, copied to stand in for event images
+
class TestGenImgs(unittest.TestCase):
    """Tests genData() from gen_picked_data.py, using a temp directory and a stand-in for image conversion."""

    # Patch the name that genData() actually looks up: gen_picked_data does 'from gen_imgs import convertImage',
    # so replacing the attribute on hist_data.gen_imgs would leave genData() calling the real function
    @patch('hist_data.gen_picked_data.convertImage', autospec=True)
    def test_gen(self, convertImageMock):
        """Adds two new events, skips one whose title already exists, and checks the images and every table."""
        def copyImage(imgPath, outPath):
            # Stand-in for image conversion: copy the file as-is, and report success explicitly
            shutil.copy(imgPath, outPath)
            return True
        convertImageMock.side_effect = copyImage
        with tempfile.TemporaryDirectory() as tempDir:
            # Create picked-event file (the last event's title matches an existing event, so it should be skipped)
            pickedDir = os.path.join(tempDir, 'picked')
            os.mkdir(pickedDir)
            pickedEvtFile = os.path.join(pickedDir, 'events.json')
            createTestFile(pickedEvtFile, '''
                [{
                    "title": "COVID-19 Pandemic",
                    "start": 2458919,
                    "start_upper": null,
                    "end": null,
                    "end_upper": null,
                    "fmt": 2,
                    "ctg": "event",
                    "image": {
                        "file": "covid.jpg",
                        "url": "https://en.wikipedia.org/wiki/File:Covid-19_SP_-_UTI_V._Nova_Cachoeirinha.jpg",
                        "license": "cc-by-sa 4.0",
                        "artist": "Gustavo Basso",
                        "credit": ""
                    },
                    "desc": "Global pandemic caused by the virus SARS-CoV-2",
                    "pop": 100
                },{
                    "title": "foo",
                    "start": -100,
                    "start_upper": 2000,
                    "end": null,
                    "end_upper": null,
                    "fmt": 0,
                    "ctg": "discovery",
                    "image": {
                        "file": "foo.jpg",
                        "url": "https://example.com/foo_img",
                        "license": "cc-by",
                        "artist": "Fibble Wesky",
                        "credit": "Plosta Grimble and Hoska Ferlento"
                    },
                    "desc": "Rhubarb, broccoli, and the fifth box under Tuesday",
                    "pop": 0
                },{
                    "title": "event one",
                    "start": 100,
                    "start_upper": null,
                    "end": null,
                    "end_upper": null,
                    "fmt": 0,
                    "ctg": "event",
                    "image": {
                        "file": "x.jpg",
                        "url": "?",
                        "license": "cc0",
                        "artist": "?",
                        "credit": "???"
                    },
                    "desc": "?",
                    "pop": 0
                }]
            ''')
            # Create picked images (none for the event that should be skipped)
            shutil.copy(TEST_IMG, os.path.join(pickedDir, 'covid.jpg'))
            shutil.copy(TEST_IMG, os.path.join(pickedDir, 'foo.jpg'))
            # Create temp history db
            dbFile = os.path.join(tempDir, 'data.db')
            createTestDbTable(
                dbFile,
                'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \
                    'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)',
                'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
                {
                    (1, 'event one', 100, 1000, None, None, 0, 'event'),
                }
            )
            createTestDbTable(
                dbFile,
                'CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)',
                'INSERT INTO images VALUES (?, ?, ?, ?, ?)',
                {
                    (10, 'http://example.com/img1', 'cc0', 'Spofta Klurry', ''),
                }
            )
            createTestDbTable(
                dbFile,
                'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)',
                'INSERT INTO event_imgs VALUES (?, ?)',
                {
                    (1, 10),
                }
            )
            createTestDbTable(
                dbFile,
                'CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)',
                'INSERT INTO descs VALUES (?, ?, ?)',
                {
                    (1, 100, 'desc one'),
                }
            )
            createTestDbTable(
                dbFile,
                'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)',
                'INSERT INTO pop VALUES (?, ?)',
                {
                    (1, 99),
                }
            )
            # Create existing event images
            imgOutDir = os.path.join(tempDir, 'imgs')
            os.mkdir(imgOutDir)
            shutil.copy(TEST_IMG, os.path.join(imgOutDir, '10.jpg'))
            # Run
            genData(pickedDir, pickedEvtFile, dbFile, imgOutDir)
            # Check that the stand-in was used, once for each of the two added events
            self.assertEqual(convertImageMock.call_count, 2)
            # Check
            self.assertEqual(set(os.listdir(imgOutDir)), {
                '10.jpg',
                '-1.jpg',
                '-2.jpg',
            })
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg FROM events'),
                {
                    (1, 'event one', 100, 1000, None, None, 0, 'event'),
                    (-1, 'COVID-19 Pandemic', 2458919, None, None, None, 2, 'event'),
                    (-2, 'foo', -100, 2000, None, None, 0, 'discovery'),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT id, url, license, artist, credit FROM images'),
                {
                    (10, 'http://example.com/img1', 'cc0', 'Spofta Klurry', ''),
                    (-1, 'https://en.wikipedia.org/wiki/File:Covid-19_SP_-_UTI_V._Nova_Cachoeirinha.jpg',
                        'cc-by-sa 4.0', 'Gustavo Basso', ''),
                    (-2, 'https://example.com/foo_img', 'cc-by', 'Fibble Wesky', 'Plosta Grimble and Hoska Ferlento'),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT id, img_id FROM event_imgs'),
                {
                    (1, 10),
                    (-1, -1),
                    (-2, -2),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT id, wiki_id, desc from descs'),
                {
                    (1, 100, 'desc one'),
                    (-1, -1, 'Global pandemic caused by the virus SARS-CoV-2'),
                    (-2, -2, 'Rhubarb, broccoli, and the fifth box under Tuesday'),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT id, pop from pop'),
                {
                    (1, 99),
                    (-1, 100),
                    (-2, 0),
                }
            )