diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-12-30 23:28:09 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-12-30 23:33:37 +1100 |
| commit | 0e5e46cedaaeacf59cfd0f2e30c1ae6923466870 (patch) | |
| tree | 016b712ce1d4255895bbba11714e624df09cfc4a | |
| parent | 086b0c30afdf2c0fbff48e1005b2f9220b028094 (diff) | |
Generate event_disp data before image-generation
Make gen_disp_data.py delete non-displayable events
Make reduce_event_data.py also delete from 'dist' and 'event_disp'
Remove MAX_IMGS_PER_CTG from enwiki/gen_img_data.py
Make gen_desc_data.py include events without images
| -rw-r--r-- | backend/hist_data/README.md | 28 | ||||
| -rw-r--r-- | backend/hist_data/enwiki/README.md | 2 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 43 | ||||
| -rwxr-xr-x | backend/hist_data/gen_desc_data.py | 5 | ||||
| -rw-r--r-- | backend/hist_data/gen_disp_data.py | 45 | ||||
| -rwxr-xr-x | backend/hist_data/reduce_event_data.py | 42 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_gen_img_data.py | 21 | ||||
| -rw-r--r-- | backend/tests/test_gen_desc_data.py | 13 | ||||
| -rw-r--r-- | backend/tests/test_gen_disp_data.py | 61 | ||||
| -rw-r--r-- | backend/tests/test_reduce_event_data.py | 91 |
10 files changed, 231 insertions, 120 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index b557b14..d05016c 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -21,6 +21,12 @@ This directory holds files used to generate the history database data.db. - `pop`: <br> Format: `id INT PRIMARY KEY, pop INT` <br> Associates each event with a popularity measure (currently an average monthly viewcount) +- `dist`: <br> + Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br> + Maps scale units to counts of events in them. +- `event_disp`: <br> + Format: `id INT, scale INT, PRIMARY KEY (id, scale)` <br> + Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time). - `images`: <br> Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br> Holds metadata for available images @@ -30,12 +36,6 @@ This directory holds files used to generate the history database data.db. - `descs`: <br> Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT` <br> Associates an event's enwiki title with a short description. -- `dist`: <br> - Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br> - Maps scale units to event counts. -- `event_disp`: <br> - Format: `id INT, scale INT, PRIMARY KEY (id, scale)` <br> - Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time). # Generating the Database @@ -51,13 +51,15 @@ Some of the scripts use third-party packages: 1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table. ## Generate Popularity Data -1. Obtain 'page view files' in enwiki/, as specified in it's README. +1. Obtain an enwiki dump and 'page view files' in enwiki/, as specified in the README. 1. Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table. +## Generate Event Display Data, and Reduce Dataset +1. Run `gen_disp_data.py`, which adds the `dist` and `event_disp` tables, and removes events not in `event_disp`. + ## Generate Image Data and Popularity Data 1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`, looks for infobox image names, and stores them in an image database. - Uses popularity data in enwiki/ to find the top N events in each event category. 1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found images, and adds them to the image database. 1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. @@ -69,11 +71,8 @@ Some of the scripts use third-party packages: - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. ## Generate Description Data -1. Obtain an enwiki dump in enwiki/, as specified in the README. -1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. 1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. -1. Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/, - and the `events` and `images` tables (only adds descriptions for events with images). +1. Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/, and the `events` table. ## Optionally Add Extra Event Data 1. Additional events can be described in `picked/events.json`, with images for them put @@ -81,7 +80,4 @@ Some of the scripts use third-party packages: 1. Can run `gen_picked_data.py` to add those described events to the database. ## Remove Events Without Images/Descs -1. Run `reduce_event_data.py` to remove data for events that have no image/description. - -## Generate Distribution and Displayability Data -1. Run `gen_disp_data.py`, which add the `dist` and `event_disp` tables. +1. Run `reduce_event_data.py` to remove data for events that have no image. diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index 29fc2ff..262ebdb 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -38,7 +38,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database. - `img_data.db` <br> Used to hold metadata about infobox images for a set of page IDs. - Generated using `get_enwiki_img_data.py` and `download_img_license_info.py`. <br> + Generated using `gen_img_data.py` and `download_img_license_info.py`. <br> Tables: <br> - `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br> `img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs. diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index b4ade9f..922b893 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -14,10 +14,8 @@ import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' -PAGEVIEW_DB = 'pageview_data.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -MAX_IMGS_PER_CTG = 20000 # ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') @@ -179,49 +177,28 @@ def getImageName(content: list[str]) -> str | None: return None return None -def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]: +def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: print('Getting event data') - titleToCtg: dict[str, str] = {} + titles: set[str] = set() dbCon = sqlite3.connect(dbFile) - for title, ctg in dbCon.execute('SELECT title, ctg from events'): - titleToCtg[title] = ctg + for (title,) in dbCon.execute('SELECT title from events'): + titles.add(title) dbCon.close() - print('Getting top images for each event category') - ctgToTitles: dict[str, list[str]] = {} - dbCon = sqlite3.connect(pageviewDb) - for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'): - if title not in titleToCtg: - continue - ctg = titleToCtg[title] - if ctg not in ctgToTitles: - ctgToTitles[ctg] = [] - elif len(ctgToTitles[ctg]) == maxImgsPerCtg: - continue - ctgToTitles[ctg].append(title) - del titleToCtg[title] - dbCon.close() - for title, ctg in titleToCtg.items(): # Account for titles without view counts - if ctg not in ctgToTitles: - ctgToTitles[ctg] = [] - elif len(ctgToTitles[ctg]) == maxImgsPerCtg: - continue - ctgToTitles[ctg].append(title) print('Getting page IDs') pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() - for ctg in ctgToTitles: - for title in ctgToTitles[ctg]: - row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() - if row: - pageIds.add(row[0]) + for title in titles: + row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() + if row: + pageIds.add(row[0]) dbCon.close() - print(f'Result: {len(pageIds)} out of {len(titleToCtg)}') + print(f'Result: {len(pageIds)} out of {len(titles)}') return pageIds if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # - pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG) + pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py index 68f9e56..0d7ee88 100755 --- a/backend/hist_data/gen_desc_data.py +++ b/backend/hist_data/gen_desc_data.py @@ -16,10 +16,9 @@ def genData(enwikiDb: str, dbFile: str) -> None: dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)') # - print('Getting events with images') + print('Getting events') titleToId: dict[str, int] = {} - query = 'SELECT events.id, events.title FROM events INNER JOIN event_imgs ON events.id = event_imgs.id' - for eventId, title in dbCur.execute(query): + for eventId, title in dbCur.execute('SELECT id, title FROM events'): titleToId[title] = eventId # print('Getting Wikipedia descriptions') diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py index a81263f..e771e57 100644 --- a/backend/hist_data/gen_disp_data.py +++ b/backend/hist_data/gen_disp_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 """ -Adds data about event distribution and displayability to the database. +Adds data about event distribution to the database, and removes events not eligible for display. """ # Enable unit testing code to, when running this script, resolve imports of modules within this directory @@ -23,12 +23,12 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: scaleUnitToCounts: dict[tuple[int, int], list[int]] = {} # Maps scale and unit to two counts (num events in that unit, num events displayable for that unit) # Only includes events with popularity values - idScales: set[tuple[int, int]] = set() # Maps event ids to scales they are displayable on + idScales: dict[int, list[int]] = {} # Maps event ids to scales they are displayable on iterNum = 0 query = 'SELECT events.id, start, fmt FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC' for eventId, eventStart, fmt in dbCur.execute(query): iterNum += 1 - if iterNum % 1e3 == 0: + if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') # For each scale for scale in scales: @@ -42,16 +42,49 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: counts = [1, 0] if counts[1] < maxDisplayedPerUnit: counts[1] += 1 - idScales.add((eventId, scale)) + if eventId not in idScales: + idScales[eventId] = [] + idScales[eventId].append(scale) scaleUnitToCounts[(scale, unit)] = counts + print(f'Results: {len(idScales)} displayable events') + # + print('Looking for non-displayable events') + eventsToDel: list[int] = [] + for eventId, eventStart, fmt in dbCur.execute(query): + if eventId in idScales: + continue + eventsToDel.append(eventId) + # Remove from data to be added to 'dist' + for scale in scales: + unit = dateToUnit(dbDateToHistDate(eventStart, fmt), scale) + count = scaleUnitToCounts[(scale, unit)][0] - 1 + if count == 0: + del scaleUnitToCounts[(scale, unit)] + else: + scaleUnitToCounts[(scale, unit)][0] = count + query2 = 'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL' + for (eventId,) in dbCur.execute(query2): # Include events without scores + eventsToDel.append(eventId) + print(f'Found {len(eventsToDel)}') + # + print(f'Deleting {len(eventsToDel)} events') + iterNum = 0 + for eventId in eventsToDel: + iterNum += 1 + if iterNum % 1e5 == 0: + print(f'At iteration {iterNum}') + # + dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,)) + dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,)) # print('Writing to db') dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))') for (scale, unit), (count, _) in scaleUnitToCounts.items(): dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count)) dbCur.execute('CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))') - for eventId, scale in idScales: - dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale)) + for eventId, scales in idScales.items(): + for scale in scales: + dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale)) # print('Closing db') dbCon.commit() diff --git a/backend/hist_data/reduce_event_data.py b/backend/hist_data/reduce_event_data.py index 15c2ab5..c061f90 100755 --- a/backend/hist_data/reduce_event_data.py +++ b/backend/hist_data/reduce_event_data.py @@ -1,23 +1,43 @@ #!/usr/bin/python3 """ -Delete extraneous events from the database that have no image (and consequently no description) +Delete events from the database that have no image. """ +# Enable unit testing code to, when running this script, resolve imports of modules within this directory +import os, sys +parentDir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(parentDir) + import argparse import sqlite3 +from cal import SCALES, dbDateToHistDate, dateToUnit DB_FILE = 'data.db' -def reduceData(dbFile: str) -> None: +def reduceData(dbFile: str, scales: list[int]) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() # print('Getting events to delete') - eventsToDel = set() - query = 'SELECT events.id FROM events LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL' - for (eventId,) in dbCur.execute(query): - eventsToDel.add(eventId) + eventsToDel: list[int] = [] + scaleUnitToDelCount: dict[tuple[int, int], int] = {} # Stores counts to subtract from entries in 'dist' + query = 'SELECT events.id, events.start, events.fmt FROM events' \ + ' LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL' + iterNum = 0 + for (eventId, start, fmt) in dbCur.execute(query): + if iterNum % 1e5 == 0: + print(f'At iteration {iterNum}') + # + eventsToDel.append(eventId) + date = dbDateToHistDate(start, fmt) + for scale in scales: + unit = dateToUnit(date, scale) + if (scale, unit) not in scaleUnitToDelCount: + scaleUnitToDelCount[(scale, unit)] = 1 + else: + scaleUnitToDelCount[(scale, unit)] += 1 + print(f'Found {len(eventsToDel)}') # print('Deleting events') iterNum = 0 @@ -26,8 +46,12 @@ def reduceData(dbFile: str) -> None: if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') # - dbCur.execute('DELETE from events where id = ?', (eventId,)) - dbCur.execute('DELETE from pop where id = ?', (eventId,)) + dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,)) + dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,)) + dbCur.execute('DELETE FROM event_disp WHERE id = ?', (eventId,)) + for (scale, unit), delCount in scaleUnitToDelCount.items(): + dbCur.execute('UPDATE dist SET count = count - ? WHERE scale = ? AND unit = ?', (delCount, scale, unit)) + dbCur.execute('DELETE FROM dist WHERE count < 1') # dbCon.commit() dbCon.close() @@ -36,4 +60,4 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() # - reduceData(DB_FILE) + reduceData(DB_FILE, SCALES) diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py index 93bb196..04fdd69 100644 --- a/backend/tests/enwiki/test_gen_img_data.py +++ b/backend/tests/enwiki/test_gen_img_data.py @@ -9,7 +9,7 @@ TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_ar class TestGetInputPageIdsFromDb(unittest.TestCase): def test_get(self): with tempfile.TemporaryDirectory() as tempDir: - # Create temp tree-of-life db + # Create temp history db dbFile = os.path.join(tempDir, 'data.db') createTestDbTable( dbFile, @@ -24,19 +24,6 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): (5, 'Marie Curie', 2403277, None, 2427622, None, 2, 'human'), } ) - # Create temp pageviews db - pageviewDb = os.path.join(tempDir, 'pageview_data.db') - createTestDbTable( - pageviewDb, - 'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)', - 'INSERT INTO views VALUES (?, ?, ?)', - { - ('George Washington', 2, 8), - ('Marie Curie', 5, 10), - ('Douglas Adams', 3, 5), - ('Belgium', 1, 100), - } - ) # Create temp dump-index db indexDb = os.path.join(tempDir, 'dump_index.db') createTestDbTable( @@ -46,15 +33,15 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): { ('Belgium',10,0,-1), ('George Washington',20,0,-1), - ('Douglas Adamns',30,0,-1), + ('Douglas Adams',30,0,-1), ('Marie Curie',50,0,-1), ('Autism',25,0,-1), } ) # Run - pageIds = getInputPageIdsFromDb(dbFile, pageviewDb, indexDb, 2) + pageIds = getInputPageIdsFromDb(dbFile, indexDb) # Check - self.assertEqual(pageIds, {50, 20, 10}) + self.assertEqual(pageIds, {10, 20, 30, 50}) class TestGenData(unittest.TestCase): def test_gen(self): diff --git a/backend/tests/test_gen_desc_data.py b/backend/tests/test_gen_desc_data.py index 6f321b4..eabe644 100644 --- a/backend/tests/test_gen_desc_data.py +++ b/backend/tests/test_gen_desc_data.py @@ -18,7 +18,6 @@ class TestGenData(unittest.TestCase): (3, 'III'), (4, 'IV'), (5, 'V'), - (6, 'VI'), } ) createTestDbTable( @@ -38,7 +37,6 @@ class TestGenData(unittest.TestCase): (3, 'Three'), (4, 'Four'), (5, 'Five'), - (6, 'Six'), } ) # Create temp history db @@ -53,17 +51,6 @@ class TestGenData(unittest.TestCase): (20, 'II', 200, None, None, None, 0, 'discovery'), (30, 'III', 300, None, 350, None, 0, 'event'), (50, 'V', 5, 10, None, None, 1, 'human'), - (60, 'VI', 6000, None, None, None, None, 'event'), - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)', - 'INSERT INTO event_imgs VALUES (?, ?)', - { - (10, 100), - (30, 300), - (50, 500), } ) # Run diff --git a/backend/tests/test_gen_disp_data.py b/backend/tests/test_gen_disp_data.py index 464405a..c39c962 100644 --- a/backend/tests/test_gen_disp_data.py +++ b/backend/tests/test_gen_disp_data.py @@ -17,11 +17,16 @@ class TestGenData(unittest.TestCase): 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', { (1, 'event one', 1900, None, None, None, 0, 'event'), - (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), # 15/11/2002 to 21/06/2010 - (3, 'event three', 1900, None, 2000, None, 0, 'event'), + (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), # 15/11/2002 + (3, 'event three', 1900, None, 2000, None, 0, 'event'), # version of 1 without pop score (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), (5, 'event five', 2415307, None, None, None, 1, 'event'), # 01/10/1900 (6, 'event six', 2415030, None, None, None, 2, 'event'), # 10/01/1900 + (7, 'event seven', 1900, None, None, None, 0, 'event'), # popular version of 1 + (8, 'event eight', 1900, None, None, None, 0, 'event'), # less popular version of 1 + (9, 'event nine', 1900, None, None, None, 0, 'event'), # less popular version of 1 + (10, 'event ten', 2415307, None, None, None, 1, 'event'), # less popular version of 5 + (11, 'event eleven', 2415307, None, None, None, 1, 'event'), # slightly less popular version of 5 } ) createTestDbTable( @@ -34,26 +39,55 @@ class TestGenData(unittest.TestCase): (4, 5), (5, 50), (6, 10), + (7, 100), + (8, 1), + (9, 2), + (10, 40), + (11, 45), } ) # Run genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2) # Check self.assertEqual( + readTestDbTable(dbFile, 'SELECT * FROM events'), + { + (1, 'event one', 1900, None, None, None, 0, 'event'), + (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), + (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), + (5, 'event five', 2415307, None, None, None, 1, 'event'), + (6, 'event six', 2415030, None, None, None, 2, 'event'), + (7, 'event seven', 1900, None, None, None, 0, 'event'), + (11, 'event eleven', 2415307, None, None, None, 1, 'event'), # 01/10/1900 + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT * FROM pop'), + { + (1, 11), + (2, 21), + (4, 5), + (5, 50), + (6, 10), + (7, 100), + (11, 45), + } + ) + self.assertEqual( readTestDbTable(dbFile, 'SELECT scale, unit, count FROM dist'), { - (10, 190, 4), + (10, 190, 6), (10, 200, 1), - (1, 1900, 3), + (1, 1900, 5), (1, 1901, 1), (1, 2002, 1), - (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2), + (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 3), (MONTH_SCALE, gregorianToJdn(1901, 1, 1), 1), - (MONTH_SCALE, julianToJdn(1900, 10, 1), 1), + (MONTH_SCALE, julianToJdn(1900, 10, 1), 2), (MONTH_SCALE, julianToJdn(2002, 11, 1), 1), - (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1), + (DAY_SCALE, gregorianToJdn(1900, 1, 1), 2), (DAY_SCALE, gregorianToJdn(1900, 1, 10), 1), - (DAY_SCALE, julianToJdn(1900, 10, 1), 1), + (DAY_SCALE, julianToJdn(1900, 10, 1), 2), (DAY_SCALE, gregorianToJdn(1901, 1, 1), 1), (DAY_SCALE, julianToJdn(2002, 11, 15), 1), } @@ -62,21 +96,24 @@ class TestGenData(unittest.TestCase): readTestDbTable(dbFile, 'SELECT id, scale FROM event_disp'), { (5, 10), - (1, 10), + (7, 10), (2, 10), (5, 1), - (1, 1), + (7, 1), (4, 1), (2, 1), (1, MONTH_SCALE), - (6, MONTH_SCALE), + (7, MONTH_SCALE), (4, MONTH_SCALE), (5, MONTH_SCALE), + (11, MONTH_SCALE), (2, MONTH_SCALE), (1, DAY_SCALE), + (7, DAY_SCALE), + (6, DAY_SCALE), (4, DAY_SCALE), (5, DAY_SCALE), - (6, DAY_SCALE), + (11, DAY_SCALE), (2, DAY_SCALE), } ) diff --git a/backend/tests/test_reduce_event_data.py b/backend/tests/test_reduce_event_data.py index c879150..7f1ce73 100644 --- a/backend/tests/test_reduce_event_data.py +++ b/backend/tests/test_reduce_event_data.py @@ -3,6 +3,7 @@ import tempfile, os from tests.common import createTestDbTable, readTestDbTable from hist_data.reduce_event_data import reduceData +from hist_data.cal import gregorianToJdn, julianToJdn, MONTH_SCALE, DAY_SCALE class TestReduceData(unittest.TestCase): def test_reduce(self): @@ -16,8 +17,10 @@ class TestReduceData(unittest.TestCase): 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', { (1, 'event one', 1900, None, None, None, 0, 'event'), - (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002 to 21/06/2010 - (3, 'event three', 2448175, 2451828, None, None, 2, 'discovery'), # 10/10/1990 to 10/10/2000 + (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002 + (3, 'event three', 2448175, 2448200, None, None, 2, 'discovery'), # 10/10/1990 + (4, 'event four', 1900, None, None, None, 0, 'event'), # Copy of 1 + (5, 'event five', 2452595, None, 2455369, None, 3, 'human'), # Day after 2 } ) createTestDbTable( @@ -25,8 +28,50 @@ class TestReduceData(unittest.TestCase): 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)', 'INSERT INTO pop VALUES (?, ?)', { - (1, 11), - (2, 21), + (1, 10), + (2, 20), + (3, 30), + (4, 40), + (5, 50), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))', + 'INSERT INTO dist VALUES (?, ?, ?)', + { + (1, 1900, 2), + (1, 1990, 1), + (1, 2002, 2), + (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2), + (MONTH_SCALE, gregorianToJdn(1990, 10, 1), 1), + (MONTH_SCALE, julianToJdn(2002, 11, 1), 2), + (DAY_SCALE, gregorianToJdn(1900, 1, 1), 2), + (DAY_SCALE, gregorianToJdn(1990, 10, 10), 1), + (DAY_SCALE, 2452594, 1), + (DAY_SCALE, 2452595, 1), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))', + 'INSERT INTO event_disp VALUES (?, ?)', + { + (1, 1), + (1, MONTH_SCALE), + (1, DAY_SCALE), + (2, 1), + (2, MONTH_SCALE), + (2, DAY_SCALE), + (3, 1), + (3, MONTH_SCALE), + (3, DAY_SCALE), + (4, 1), + (4, MONTH_SCALE), + (4, DAY_SCALE), + (5, 1), + (5, MONTH_SCALE), + (5, DAY_SCALE), } ) createTestDbTable( @@ -34,7 +79,8 @@ class TestReduceData(unittest.TestCase): 'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)', 'INSERT INTO event_imgs VALUES (?, ?)', { - (1, 10), + (1, 11), + (2, 21), } ) createTestDbTable( @@ -42,7 +88,8 @@ class TestReduceData(unittest.TestCase): 'CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)', 'INSERT INTO images VALUES (?, ?, ?, ?, ?)', { - (10, 'example.com/1', 'cc0', 'artist one', 'credits one'), + (11, 'example.com/1', 'cc0', 'artist one', 'credits one'), + (21, 'example.com/1', 'cc0', 'artist two', 'credits two'), } ) createTestDbTable( @@ -54,17 +101,41 @@ class TestReduceData(unittest.TestCase): } ) # Run - reduceData(dbFile) + reduceData(dbFile, [1, MONTH_SCALE, DAY_SCALE]) # Check self.assertEqual( - readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg FROM events'), + readTestDbTable(dbFile, 'SELECT * FROM events'), { (1, 'event one', 1900, None, None, None, 0, 'event'), + (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), } ) self.assertEqual( - readTestDbTable(dbFile, 'SELECT id, pop from pop'), + readTestDbTable(dbFile, 'SELECT * from pop'), { - (1, 11), + (1, 10), + (2, 20), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT * from dist'), + { + (1, 1900, 1), + (1, 2002, 1), + (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 1), + (MONTH_SCALE, julianToJdn(2002, 11, 1), 1), + (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1), + (DAY_SCALE, 2452594, 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT * from event_disp'), + { + (1, 1), + (1, MONTH_SCALE), + (1, DAY_SCALE), + (2, 1), + (2, MONTH_SCALE), + (2, DAY_SCALE), } ) |
