From 4ad7206443660587a15a7b47384b927188155da8 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 28 Dec 2022 20:49:13 +1100 Subject: Convert 'scores' table to 'events_disp', removing 'scores' column --- backend/hist_data/README.md | 10 ++--- backend/hist_data/gen_disp_data.py | 80 +++++++++++++++++++++++++++++++++++ backend/hist_data/gen_score_data.py | 80 ----------------------------------- backend/histplorer.py | 15 ++++--- backend/tests/test_gen_disp_data.py | 82 ++++++++++++++++++++++++++++++++++++ backend/tests/test_gen_score_data.py | 82 ------------------------------------ backend/tests/test_histplorer.py | 18 ++++---- 7 files changed, 184 insertions(+), 183 deletions(-) create mode 100644 backend/hist_data/gen_disp_data.py delete mode 100644 backend/hist_data/gen_score_data.py create mode 100644 backend/tests/test_gen_disp_data.py delete mode 100644 backend/tests/test_gen_score_data.py (limited to 'backend') diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 1a832ef..b557b14 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -33,9 +33,9 @@ This directory holds files used to generate the history database data.db. - `dist`:
Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)`
Maps scale units to event counts. -- `scores`:
- Format: `id INT, scale INT, score INT, PRIMARY KEY (id, scale)`
- Maps events to score values for each scale (used to show events by popularity and uniformity across scale). +- `event_disp`:
+ Format: `id INT, scale INT, PRIMARY KEY (id, scale)`
+ Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time). # Generating the Database @@ -83,5 +83,5 @@ Some of the scripts use third-party packages: ## Remove Events Without Images/Descs 1. Run `reduce_event_data.py` to remove data for events that have no image/description. -## Generate Distribution and Score Data -1. Run `gen_score_data.py`, which add the `dist` and `scores` tables. +## Generate Distribution and Displayability Data +1. Run `gen_disp_data.py`, which add the `dist` and `event_disp` tables. diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py new file mode 100644 index 0000000..e425efc --- /dev/null +++ b/backend/hist_data/gen_disp_data.py @@ -0,0 +1,80 @@ +#!/usr/bin/python3 + +""" +Adds data about event distribution and displayability to the database. +""" + +# Enable unit testing code to, when running this script, resolve imports of modules within this directory +import os, sys +parentDir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(parentDir) + +import sqlite3 +from cal import gregorianToJdn, jdnToGregorian + +MONTH_SCALE = -1; +DAY_SCALE = -2; +SCALES: list[int] = [int(x) for x in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]]; +MAX_DISPLAYED_PER_UNIT = 4 +# +DB_FILE = 'data.db' + +def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Reading through events') + scaleUnitToCounts: dict[tuple[int, int], list[int]] = {} + # Maps scale and unit to two counts (num events in that unit, num events displayable for that unit) + # Only includes events with popularity values + idScales: set[tuple[int, int]] = set() # Maps event ids to scales they are displayable on + iterNum = 0 + query = 'SELECT events.id, start, fmt FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC' + for eventId, eventStart, fmt in dbCur.execute(query): + iterNum += 1 + if iterNum % 1e3 == 0: + print(f'At iteration {iterNum}') + # For each scale + for scale in scales: + # Get unit + unit: int + if scale >= 1: + unit = (eventStart if fmt == 0 else jdnToGregorian(eventStart)[0]) // scale + elif scale == MONTH_SCALE: + if fmt == 0: + unit = gregorianToJdn(eventStart, 1, 1) + else: + year, month, day = jdnToGregorian(eventStart) + unit = eventStart if day == 1 else gregorianToJdn(year, month, 1) + else: # scale == DAY_SCALE + unit = eventStart if fmt != 0 else gregorianToJdn(eventStart, 1, 1) + # Update maps + counts: list[int] + if (scale, unit) in scaleUnitToCounts: + counts = scaleUnitToCounts[(scale, unit)] + counts[0] += 1 + else: + counts = [1, 0] + if counts[1] < maxDisplayedPerUnit: + counts[1] += 1 + idScales.add((eventId, scale)) + scaleUnitToCounts[(scale, unit)] = counts + # + print('Writing to db') + dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))') + for (scale, unit), (count, _) in scaleUnitToCounts.items(): + dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count)) + dbCur.execute('CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))') + for eventId, scale in idScales: + dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale)) + # + print('Closing db') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(DB_FILE, SCALES, MAX_DISPLAYED_PER_UNIT) diff --git a/backend/hist_data/gen_score_data.py b/backend/hist_data/gen_score_data.py deleted file mode 100644 index 4ea66c3..0000000 --- a/backend/hist_data/gen_score_data.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/python3 - -""" -Adds data about event distribution and scores to the database. -""" - -# Enable unit testing code to, when running this script, resolve imports of modules within this directory -import os, sys -parentDir = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(parentDir) - -import sqlite3 -from cal import gregorianToJdn, jdnToGregorian - -MONTH_SCALE = -1; -DAY_SCALE = -2; -SCALES: list[int] = [int(x) for x in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]]; -MAX_DISPLAYED_PER_UNIT = 4 -# -DB_FILE = 'data.db' - -def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: - dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() - # - print('Reading through events') - scaleUnitToCounts: dict[tuple[int, int], list[int]] = {} - # Maps scale and unit to two counts (num events in that unit, num events displayable for that unit) - # Only includes events with popularity values - idAndScaleToScore: dict[tuple[int, int], int] = {} # Maps event id and scale to score - iterNum = 0 - query = 'SELECT events.id, start, fmt, pop FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC' - for eventId, eventStart, fmt, pop in dbCur.execute(query): - iterNum += 1 - if iterNum % 1e3 == 0: - print(f'At iteration {iterNum}') - # For each scale - for scale in scales: - # Get unit - unit: int - if scale >= 1: - unit = (eventStart if fmt == 0 else jdnToGregorian(eventStart)[0]) // scale - elif scale == MONTH_SCALE: - if fmt == 0: - unit = gregorianToJdn(eventStart, 1, 1) - else: - year, month, day = jdnToGregorian(eventStart) - unit = eventStart if day == 1 else gregorianToJdn(year, month, 1) - else: # scale == DAY_SCALE - unit = eventStart if fmt != 0 else gregorianToJdn(eventStart, 1, 1) - # Update maps - counts: list[int] - if (scale, unit) in scaleUnitToCounts: - counts = scaleUnitToCounts[(scale, unit)] - counts[0] += 1 - else: - counts = [1, 0] - if counts[1] < maxDisplayedPerUnit: - counts[1] += 1 - idAndScaleToScore[(eventId, scale)] = pop - scaleUnitToCounts[(scale, unit)] = counts - # - print('Writing to db') - dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))') - dbCur.execute('CREATE TABLE scores (id INT, scale INT, score INT, PRIMARY KEY (id, scale))') - for (scale, unit), (count, _) in scaleUnitToCounts.items(): - dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count)) - for (eventId, scale), score in idAndScaleToScore.items(): - dbCur.execute('INSERT INTO scores VALUES (?, ?, ?)', (eventId, scale, score)) - # - print('Closing db') - dbCon.commit() - dbCon.close() - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - args = parser.parse_args() - # - genData(DB_FILE, SCALES, MAX_DISPLAYED_PER_UNIT) diff --git a/backend/histplorer.py b/backend/histplorer.py index 09aabd8..72ef88e 100755 --- a/backend/histplorer.py +++ b/backend/histplorer.py @@ -11,7 +11,7 @@ Expected HTTP query parameters: Examples: range=1000.1910-10-09 means '1000 CE to 09/10/1910 (inclusive)' range=-13000. means '13000 BCE onwards' -- scale: With type=events, specifies a date scale (matched against 'scale' column in 'scores' table) +- scale: With type=events, specifies a date scale (matched against 'scale' column in 'event_disp' table) - incl: With type=events, specifies an event to include, as an event ID - event: With type=info, specifies the event to get info for - input: With type=sugg, specifies a search string to suggest for @@ -214,11 +214,12 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: """ Looks for events within a date range, in given scale, restricted by event category, an optional particular inclusion, and a result limit """ query = \ - 'SELECT events.id, title, start, start_upper, end, end_upper, fmt, ctg, images.id, scores.score FROM events' \ - ' INNER JOIN scores ON events.id = scores.id' \ + 'SELECT events.id, title, start, start_upper, end, end_upper, fmt, ctg, images.id, pop.pop FROM events' \ + ' INNER JOIN event_disp ON events.id = event_disp.id' \ + ' INNER JOIN pop ON events.id = pop.id' \ ' INNER JOIN event_imgs ON events.id = event_imgs.id' \ ' INNER JOIN images ON event_imgs.img_id = images.id' - constraints = ['scores.scale = ?'] + constraints = ['event_disp.scale = ?'] params: list[str | int] = [scale] # Constrain by start/end if start is not None: @@ -251,7 +252,7 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: query2 = query if constraints: query2 += ' WHERE ' + ' AND '.join(constraints) - query2 += ' ORDER BY scores.score DESC' + query2 += ' ORDER BY pop.pop DESC' query2 += f' LIMIT {resultLimit}' # Run query results: list[Event] = [] @@ -270,7 +271,7 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: return results def eventEntryToResults( row: tuple[int, str, int, int | None, int | None, int | None, int, str, int, int | None]) -> Event: - eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, score = row + eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, pop = row """ Helper for converting an 'events' db entry into an Event object """ # Convert dates dateVals: list[int | None] = [start, startUpper, end, endUpper] @@ -293,7 +294,7 @@ def eventEntryToResults( else: newDates[i] = HistDate(True, *jdnToGregorian(n)) # - return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, score) + return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, pop) # For type=info def handleInfoReq(params: dict[str, str], dbCur: sqlite3.Cursor): diff --git a/backend/tests/test_gen_disp_data.py b/backend/tests/test_gen_disp_data.py new file mode 100644 index 0000000..b806958 --- /dev/null +++ b/backend/tests/test_gen_disp_data.py @@ -0,0 +1,82 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.gen_disp_data import genData, MONTH_SCALE, DAY_SCALE +from hist_data.cal import gregorianToJdn + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp history db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (1, 'event one', 1900, None, None, None, 0, 'event'), + (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 15/11/2002 to 21/06/2010 + (3, 'event three', 1900, None, 2000, None, 0, 'event'), + (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), + (5, 'event five', 2415294, None, None, None, 1, 'event'), # 01/10/1900 + (6, 'event six', 2415030, None, None, None, 1, 'event'), # 10/01/1900 + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)', + 'INSERT INTO pop VALUES (?, ?)', + { + (1, 11), + (2, 21), + (4, 5), + (5, 50), + (6, 10), + } + ) + # Run + genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT scale, unit, count FROM dist'), + { + (10, 190, 4), + (10, 200, 1), + (1, 1900, 3), + (1, 1901, 1), + (1, 2002, 1), + (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2), + (MONTH_SCALE, gregorianToJdn(1901, 1, 1), 1), + (MONTH_SCALE, gregorianToJdn(1900, 10, 1), 1), + (MONTH_SCALE, gregorianToJdn(2002, 11, 1), 1), + (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1), + (DAY_SCALE, gregorianToJdn(1900, 1, 10), 1), + (DAY_SCALE, gregorianToJdn(1900, 10, 1), 1), + (DAY_SCALE, gregorianToJdn(1901, 1, 1), 1), + (DAY_SCALE, gregorianToJdn(2002, 11, 15), 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, scale FROM event_disp'), + { + (5, 10), + (1, 10), + (2, 10), + (5, 1), + (1, 1), + (4, 1), + (2, 1), + (1, MONTH_SCALE), + (6, MONTH_SCALE), + (4, MONTH_SCALE), + (5, MONTH_SCALE), + (2, MONTH_SCALE), + (1, DAY_SCALE), + (4, DAY_SCALE), + (5, DAY_SCALE), + (6, DAY_SCALE), + (2, DAY_SCALE), + } + ) diff --git a/backend/tests/test_gen_score_data.py b/backend/tests/test_gen_score_data.py deleted file mode 100644 index a466679..0000000 --- a/backend/tests/test_gen_score_data.py +++ /dev/null @@ -1,82 +0,0 @@ -import unittest -import tempfile, os - -from tests.common import createTestDbTable, readTestDbTable -from hist_data.gen_score_data import genData, MONTH_SCALE, DAY_SCALE -from hist_data.cal import gregorianToJdn - -class TestGenData(unittest.TestCase): - def test_gen(self): - with tempfile.TemporaryDirectory() as tempDir: - # Create temp history db - dbFile = os.path.join(tempDir, 'data.db') - createTestDbTable( - dbFile, - 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ - 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', - 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', - { - (1, 'event one', 1900, None, None, None, 0, 'event'), - (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 15/11/2002 to 21/06/2010 - (3, 'event three', 1900, None, 2000, None, 0, 'event'), - (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), - (5, 'event five', 2415294, None, None, None, 1, 'event'), # 01/10/1900 - (6, 'event six', 2415030, None, None, None, 1, 'event'), # 10/01/1900 - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)', - 'INSERT INTO pop VALUES (?, ?)', - { - (1, 11), - (2, 21), - (4, 5), - (5, 50), - (6, 10), - } - ) - # Run - genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2) - # Check - self.assertEqual( - readTestDbTable(dbFile, 'SELECT scale, unit, count FROM dist'), - { - (10, 190, 4), - (10, 200, 1), - (1, 1900, 3), - (1, 1901, 1), - (1, 2002, 1), - (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2), - (MONTH_SCALE, gregorianToJdn(1901, 1, 1), 1), - (MONTH_SCALE, gregorianToJdn(1900, 10, 1), 1), - (MONTH_SCALE, gregorianToJdn(2002, 11, 1), 1), - (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1), - (DAY_SCALE, gregorianToJdn(1900, 1, 10), 1), - (DAY_SCALE, gregorianToJdn(1900, 10, 1), 1), - (DAY_SCALE, gregorianToJdn(1901, 1, 1), 1), - (DAY_SCALE, gregorianToJdn(2002, 11, 15), 1), - } - ) - self.assertEqual( - readTestDbTable(dbFile, 'SELECT id, scale, score FROM scores'), - { - (5, 10, 50), - (1, 10, 11), - (2, 10, 21), - (5, 1, 50), - (1, 1, 11), - (4, 1, 5), - (2, 1, 21), - (1, MONTH_SCALE, 11), - (6, MONTH_SCALE, 10), - (4, MONTH_SCALE, 5), - (5, MONTH_SCALE, 50), - (2, MONTH_SCALE, 21), - (1, DAY_SCALE, 11), - (4, DAY_SCALE, 5), - (5, DAY_SCALE, 50), - (6, DAY_SCALE, 10), - (2, DAY_SCALE, 21), - } - ) diff --git a/backend/tests/test_histplorer.py b/backend/tests/test_histplorer.py index eb89f50..a2b4623 100644 --- a/backend/tests/test_histplorer.py +++ b/backend/tests/test_histplorer.py @@ -34,16 +34,16 @@ def initTestDb(dbFile: str) -> None: ) createTestDbTable( dbFile, - 'CREATE TABLE scores (id INT, scale INT, score INT, PRIMARY KEY (id, scale))', - 'INSERT INTO scores VALUES (?, ?, ?)', + 'CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))', + 'INSERT INTO event_disp VALUES (?, ?)', { - (1, 1, 11), - (1, 10, 11), - (2, 1, 21), - (3, 1, 0), - (4, 1, 1000), - (5, 1, 51), - (6, 10, 60), + (1, 1), + (1, 10), + (2, 1), + (3, 1), + (4, 1), + (5, 1), + (6, 10), } ) createTestDbTable( -- cgit v1.2.3