From db9321ca32f283f7fd59e2e8b5f8a695c66fce98 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 28 Dec 2022 19:46:34 +1100 Subject: Use scale-associated event scores for uniform spread Add 'dist' and 'scores' db tables Add 'scale' server query parameter (and remove 'excl') Disregard event-end when querying database --- backend/hist_data/README.md | 11 ++++- backend/hist_data/gen_score_data.py | 80 ++++++++++++++++++++++++++++++++ backend/histplorer.py | 82 +++++++++++++++------------------ backend/tests/test_gen_score_data.py | 82 +++++++++++++++++++++++++++++++++ backend/tests/test_histplorer.py | 24 +++++++++- backend/tests/test_reduce_event_data.py | 2 +- src/App.vue | 3 +- 7 files changed, 235 insertions(+), 49 deletions(-) create mode 100644 backend/hist_data/gen_score_data.py create mode 100644 backend/tests/test_gen_score_data.py diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index bfecc1e..1a832ef 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -27,9 +27,15 @@ This directory holds files used to generate the history database data.db. - `event_imgs`:
Format: `id INT PRIMARY KEY, img_id INT`
Assocates events with images -- `descs`
+- `descs`:
Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT`
Associates an event's enwiki title with a short description. +- `dist`:
+ Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)`
+ Maps scale units to event counts. +- `scores`:
+ Format: `id INT, scale INT, score INT, PRIMARY KEY (id, scale)`
+ Maps events to score values for each scale (used to show events by popularity and uniformity across scale). # Generating the Database @@ -76,3 +82,6 @@ Some of the scripts use third-party packages: ## Remove Events Without Images/Descs 1. Run `reduce_event_data.py` to remove data for events that have no image/description. + +## Generate Distribution and Score Data +1. Run `gen_score_data.py`, which add the `dist` and `scores` tables. diff --git a/backend/hist_data/gen_score_data.py b/backend/hist_data/gen_score_data.py new file mode 100644 index 0000000..4ea66c3 --- /dev/null +++ b/backend/hist_data/gen_score_data.py @@ -0,0 +1,80 @@ +#!/usr/bin/python3 + +""" +Adds data about event distribution and scores to the database. +""" + +# Enable unit testing code to, when running this script, resolve imports of modules within this directory +import os, sys +parentDir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(parentDir) + +import sqlite3 +from cal import gregorianToJdn, jdnToGregorian + +MONTH_SCALE = -1; +DAY_SCALE = -2; +SCALES: list[int] = [int(x) for x in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]]; +MAX_DISPLAYED_PER_UNIT = 4 +# +DB_FILE = 'data.db' + +def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Reading through events') + scaleUnitToCounts: dict[tuple[int, int], list[int]] = {} + # Maps scale and unit to two counts (num events in that unit, num events displayable for that unit) + # Only includes events with popularity values + idAndScaleToScore: dict[tuple[int, int], int] = {} # Maps event id and scale to score + iterNum = 0 + query = 'SELECT events.id, start, fmt, pop FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC' + for eventId, eventStart, fmt, pop in dbCur.execute(query): + iterNum += 1 + if iterNum % 1e3 == 0: + print(f'At iteration {iterNum}') + # For each scale + for scale in scales: + # Get unit + unit: int + if scale >= 1: + unit = (eventStart if fmt == 0 else jdnToGregorian(eventStart)[0]) // scale + elif scale == MONTH_SCALE: + if fmt == 0: + unit = gregorianToJdn(eventStart, 1, 1) + else: + year, month, day = jdnToGregorian(eventStart) + unit = eventStart if day == 1 else gregorianToJdn(year, month, 1) + else: # scale == DAY_SCALE + unit = eventStart if fmt != 0 else gregorianToJdn(eventStart, 1, 1) + # Update maps + counts: list[int] + if (scale, unit) in scaleUnitToCounts: + counts = scaleUnitToCounts[(scale, unit)] + counts[0] += 1 + else: + counts = [1, 0] + if counts[1] < maxDisplayedPerUnit: + counts[1] += 1 + idAndScaleToScore[(eventId, scale)] = pop + scaleUnitToCounts[(scale, unit)] = counts + # + print('Writing to db') + dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))') + dbCur.execute('CREATE TABLE scores (id INT, scale INT, score INT, PRIMARY KEY (id, scale))') + for (scale, unit), (count, _) in scaleUnitToCounts.items(): + dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count)) + for (eventId, scale), score in idAndScaleToScore.items(): + dbCur.execute('INSERT INTO scores VALUES (?, ?, ?)', (eventId, scale, score)) + # + print('Closing db') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(DB_FILE, SCALES, MAX_DISPLAYED_PER_UNIT) diff --git a/backend/histplorer.py b/backend/histplorer.py index 6d1298e..09aabd8 100755 --- a/backend/histplorer.py +++ b/backend/histplorer.py @@ -3,7 +3,7 @@ WSGI script that serves historical data Expected HTTP query parameters: - type: - If 'events', reply with list of event objects, within a date range + If 'events', reply with list of event objects, within a date range, for a given scale If 'info', reply with information about a given event If 'sugg', reply with search suggestions for an event search string - range: With type=events, specifies a historical-date range @@ -11,8 +11,8 @@ Expected HTTP query parameters: Examples: range=1000.1910-10-09 means '1000 CE to 09/10/1910 (inclusive)' range=-13000. means '13000 BCE onwards' +- scale: With type=events, specifies a date scale (matched against 'scale' column in 'scores' table) - incl: With type=events, specifies an event to include, as an event ID -- excl: With type=events, specifies events to exclude, as period-separated event IDs - event: With type=info, specifies the event to get info for - input: With type=sugg, specifies a search string to suggest for - limit: With type=events or type=sugg, specifies the max number of results @@ -26,9 +26,8 @@ import gzip, jsonpickle from hist_data.cal import gregorianToJdn, jdnToGregorian, jdnToJulian DB_FILE = 'hist_data/data.db' -MAX_REQ_EVENTS = 100 +MAX_REQ_EVENTS = 500 DEFAULT_REQ_EVENTS = 20 -MAX_REQ_EXCLS = 100 MAX_REQ_SUGGS = 50 DEFAULT_REQ_SUGGS = 5 MIN_CAL_YEAR = -4713 # Disallow within-year dates before this year @@ -182,17 +181,22 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor): except ValueError: print(f'INFO: Invalid date-range value {dateRange}', file=sys.stderr) return None + # Get scale + if 'scale' not in params: + print('INFO: No scale provided', file=sys.stderr) + return None + try: + scale = int(params['scale']) + except ValueError: + print('INFO: Invalid scale value', file=sys.stderr) + return None # Get event category ctg = params['ctg'] if 'ctg' in params else None - # Get incl/excl + # Get incl value try: incl = int(params['incl']) if 'incl' in params else None - excl = [int(x) for x in params['excl'].split('.')] if 'excl' in params else [] except ValueError: - print('INFO: Invalid incl/excl value', file=sys.stderr) - return None - if len(excl) > MAX_REQ_EXCLS: - print('INFO: Exceeded excl value limit', file=sys.stderr) + print('INFO: Invalid incl value', file=sys.stderr) return None # Get result set limit try: @@ -204,65 +208,57 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor): print(f'INFO: Invalid results limit {resultLimit}', file=sys.stderr) return None # - return lookupEvents(start, end, ctg, incl, excl, resultLimit, dbCur) -def lookupEvents(start: HistDate | None, end: HistDate | None, ctg: str | None, - incl: int | None, excl: list[int], resultLimit: int, dbCur: sqlite3.Cursor) -> list[Event] | None: - """ Looks for events within a date range, restricted by event category, - particular inclusions/exclusions, and a result limit """ - query = 'SELECT events.id, title, start, start_upper, end, end_upper, fmt, ctg, images.id, pop.pop from events' \ + return lookupEvents(start, end, scale, ctg, incl, resultLimit, dbCur) +def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: str | None, + incl: int | None, resultLimit: int, dbCur: sqlite3.Cursor) -> list[Event] | None: + """ Looks for events within a date range, in given scale, + restricted by event category, an optional particular inclusion, and a result limit """ + query = \ + 'SELECT events.id, title, start, start_upper, end, end_upper, fmt, ctg, images.id, scores.score FROM events' \ + ' INNER JOIN scores ON events.id = scores.id' \ ' INNER JOIN event_imgs ON events.id = event_imgs.id' \ - ' INNER JOIN images ON event_imgs.img_id = images.id LEFT JOIN pop ON events.id = pop.id' - constraints = [] - params: list[str | int] = [] + ' INNER JOIN images ON event_imgs.img_id = images.id' + constraints = ['scores.scale = ?'] + params: list[str | int] = [scale] # Constrain by start/end if start is not None: constraint = '(start >= ? AND fmt > 0 OR start >= ? AND fmt = 0)' if start.gcal is None: startJdn = gregorianToJdn(start.year, 1, 1) if start.year >= -4713 else 0 constraints.append(constraint) - params.append(startJdn) - params.append(start.year) + params.extend([startJdn, start.year]) else: startJdn = gregorianToJdn(start.year, start.month, start.day) constraints.append(constraint) - params.append(startJdn) - params.append(start.year if start.month == 1 and start.day == 1 else start.year + 1) + year = start.year if start.month == 1 and start.day == 1 else start.year + 1 + params.extend([startJdn, year]) if end is not None: - constraint = '(end IS NULL AND (start <= ? AND fmt > 0 OR start <= ? AND fmt = 0) OR ' \ - 'end IS NOT NULL AND (end <= ? AND fmt > 0 OR end <= ? AND fmt = 0))' + constraint = '(start <= ? AND fmt > 0 OR start <= ? AND fmt = 0)' if end.gcal is None: endJdn = gregorianToJdn(end.year, 1, 1) if end.year >= -4713 else -1 constraints.append(constraint) - params.extend([endJdn, end.year, endJdn, end.year]) + params.extend([endJdn, end.year]) else: endJdn = gregorianToJdn(end.year, end.month, end.day) constraints.append(constraint) year = end.year if end.month == 12 and end.day == 31 else end.year - 1 - params.extend([endJdn, year, endJdn, year]) + params.extend([endJdn, year]) # Constrain by event category if ctg is not None: constraints.append('ctg = ?') params.append(ctg) - # For exclusions, lookup extra, and remove later - tempLimit = resultLimit + len(excl) - exclusions = set(excl) # Add constraints to query query2 = query if constraints: - query2 += ' WHERE' + ' AND '.join(constraints) - query2 += ' ORDER BY pop.pop DESC' - query2 += f' LIMIT {tempLimit}' + query2 += ' WHERE ' + ' AND '.join(constraints) + query2 += ' ORDER BY scores.score DESC' + query2 += f' LIMIT {resultLimit}' # Run query results: list[Event] = [] for row in dbCur.execute(query2, params): - eventId = row[0] - if eventId in exclusions: - continue - if incl is not None and incl == eventId: - incl = None - if len(results) == resultLimit: - break results.append(eventEntryToResults(row)) + if incl is not None and incl == row[0]: + incl = None # Get any additional inclusion if incl is not None: row = dbCur.execute(query + ' WHERE events.id = ?', (incl,)).fetchone() @@ -274,7 +270,7 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, ctg: str | None, return results def eventEntryToResults( row: tuple[int, str, int, int | None, int | None, int | None, int, str, int, int | None]) -> Event: - eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, pop = row + eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, score = row """ Helper for converting an 'events' db entry into an Event object """ # Convert dates dateVals: list[int | None] = [start, startUpper, end, endUpper] @@ -296,10 +292,8 @@ def eventEntryToResults( newDates[i] = HistDate(False, *jdnToJulian(n)) else: newDates[i] = HistDate(True, *jdnToGregorian(n)) - if pop is None: - pop = 0 # - return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, pop) + return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, score) # For type=info def handleInfoReq(params: dict[str, str], dbCur: sqlite3.Cursor): diff --git a/backend/tests/test_gen_score_data.py b/backend/tests/test_gen_score_data.py new file mode 100644 index 0000000..a466679 --- /dev/null +++ b/backend/tests/test_gen_score_data.py @@ -0,0 +1,82 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from hist_data.gen_score_data import genData, MONTH_SCALE, DAY_SCALE +from hist_data.cal import gregorianToJdn + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp history db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', + 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + { + (1, 'event one', 1900, None, None, None, 0, 'event'), + (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 15/11/2002 to 21/06/2010 + (3, 'event three', 1900, None, 2000, None, 0, 'event'), + (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), + (5, 'event five', 2415294, None, None, None, 1, 'event'), # 01/10/1900 + (6, 'event six', 2415030, None, None, None, 1, 'event'), # 10/01/1900 + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)', + 'INSERT INTO pop VALUES (?, ?)', + { + (1, 11), + (2, 21), + (4, 5), + (5, 50), + (6, 10), + } + ) + # Run + genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT scale, unit, count FROM dist'), + { + (10, 190, 4), + (10, 200, 1), + (1, 1900, 3), + (1, 1901, 1), + (1, 2002, 1), + (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2), + (MONTH_SCALE, gregorianToJdn(1901, 1, 1), 1), + (MONTH_SCALE, gregorianToJdn(1900, 10, 1), 1), + (MONTH_SCALE, gregorianToJdn(2002, 11, 1), 1), + (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1), + (DAY_SCALE, gregorianToJdn(1900, 1, 10), 1), + (DAY_SCALE, gregorianToJdn(1900, 10, 1), 1), + (DAY_SCALE, gregorianToJdn(1901, 1, 1), 1), + (DAY_SCALE, gregorianToJdn(2002, 11, 15), 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, scale, score FROM scores'), + { + (5, 10, 50), + (1, 10, 11), + (2, 10, 21), + (5, 1, 50), + (1, 1, 11), + (4, 1, 5), + (2, 1, 21), + (1, MONTH_SCALE, 11), + (6, MONTH_SCALE, 10), + (4, MONTH_SCALE, 5), + (5, MONTH_SCALE, 50), + (2, MONTH_SCALE, 21), + (1, DAY_SCALE, 11), + (4, DAY_SCALE, 5), + (5, DAY_SCALE, 50), + (6, DAY_SCALE, 10), + (2, DAY_SCALE, 21), + } + ) diff --git a/backend/tests/test_histplorer.py b/backend/tests/test_histplorer.py index 6487977..eb89f50 100644 --- a/backend/tests/test_histplorer.py +++ b/backend/tests/test_histplorer.py @@ -16,6 +16,7 @@ def initTestDb(dbFile: str) -> None: (3, 'event three', 2448175, 2451828, None, None, 2, 'discovery'), # 10/10/1990 to 10/10/2000 (4, 'event four', 991206, None, 1721706, None, 1, 'event'), # 10/10/-2000 to 10/10/1 (5, 'event five', 2000, None, 2001, None, 0, 'event'), + (6, 'event six', 1500, None, 2000, None, 0, 'event'), } ) createTestDbTable( @@ -28,6 +29,21 @@ def initTestDb(dbFile: str) -> None: (3, 0), (4, 1000), (5, 51), + (6, 60), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE scores (id INT, scale INT, score INT, PRIMARY KEY (id, scale))', + 'INSERT INTO scores VALUES (?, ?, ?)', + { + (1, 1, 11), + (1, 10, 11), + (2, 1, 21), + (3, 1, 0), + (4, 1, 1000), + (5, 1, 51), + (6, 10, 60), } ) createTestDbTable( @@ -40,6 +56,7 @@ def initTestDb(dbFile: str) -> None: (3, 30), (4, 20), (5, 50), + (6, 60), } ) createTestDbTable( @@ -51,6 +68,7 @@ def initTestDb(dbFile: str) -> None: (20, 'example.com/2', 'cc-by', 'artist two', 'credits two'), (30, 'example.com/3', 'cc-by-sa 3.0', 'artist three', 'credits three'), (50, 'example.com/5', 'cc-by', 'artist five', 'credits five'), + (60, 'example.com/6', 'cc-by', 'artist six', 'credits six'), } ) createTestDbTable( @@ -63,6 +81,7 @@ def initTestDb(dbFile: str) -> None: (3, 300, 'desc three'), (4, 400, 'desc four'), (5, 500, 'desc five'), + (6, 600, 'desc six'), } ) @@ -75,17 +94,18 @@ class TestHandleReq(unittest.TestCase): def tearDown(self): self.tempDir.cleanup() def test_events_req(self): - response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=-1999.2002-11-1&incl=3&limit=2'}) + response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=-1999.2002-11-1&scale=1&incl=3&limit=2'}) self.assertEqual(response, [ Event(5, 'event five', HistDate(True, 2000, 1, 1), None, HistDate(True, 2001, 1, 1), None, 'event', 50, 51), Event(3, 'event three', HistDate(True, 1990, 10, 10), HistDate(True, 2000, 10, 10), None, None, 'discovery', 30, 0), ]) - response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=.1999-11-27&excl=1&ctg=event'}) + response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=.1999-11-27&scale=1&ctg=event'}) self.assertEqual(response, [ Event(4, 'event four', HistDate(False, -2000, 10, 10), None, HistDate(False, 1, 10, 10), None, 'event', 20, 1000), + Event(1, 'event one', HistDate(True, 1900, 1, 1), None, None, None, 'event', 10, 11), ]) def test_info_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'type=info&event=3'}) diff --git a/backend/tests/test_reduce_event_data.py b/backend/tests/test_reduce_event_data.py index cb54758..c879150 100644 --- a/backend/tests/test_reduce_event_data.py +++ b/backend/tests/test_reduce_event_data.py @@ -57,7 +57,7 @@ class TestReduceData(unittest.TestCase): reduceData(dbFile) # Check self.assertEqual( - readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg from events'), + readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg FROM events'), { (1, 'event one', 1900, None, None, None, 0, 'event'), } diff --git a/src/App.vue b/src/App.vue index eacc73f..feba10e 100644 --- a/src/App.vue +++ b/src/App.vue @@ -122,7 +122,7 @@ function reduceEvents(){ idToEvent = eventsToKeep; } // For getting events from server -const EVENT_REQ_LIMIT = 100; +const EVENT_REQ_LIMIT = 500; let queriedRanges: DateRangeTree[] = SCALES.map(() => new DateRangeTree()); // For each scale, holds date ranges for which data has already been queried fromm the server let pendingReq = false; // Used to serialise event-req handling @@ -141,6 +141,7 @@ async function onEventDisplay( let urlParams = new URLSearchParams({ type: 'events', range: `${firstDate}.${lastDate}`, + scale: String(SCALES[scaleIdx]), limit: String(EVENT_REQ_LIMIT), }); let responseObj: HistEventJson[] = await queryServer(urlParams); -- cgit v1.2.3