aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-12-28 19:46:34 +1100
committerTerry Truong <terry06890@gmail.com>2022-12-28 19:46:34 +1100
commitdb9321ca32f283f7fd59e2e8b5f8a695c66fce98 (patch)
tree656e6ae784773029c28d9dcf75ed308d41b6dd7a
parent6b8d7fcda5dc884086bfb5df914e3f13dca5ac3c (diff)
Use scale-associated event scores for uniform spread
Add 'dist' and 'scores' db tables Add 'scale' server query parameter (and remove 'excl') Disregard event-end when querying database
-rw-r--r--backend/hist_data/README.md11
-rw-r--r--backend/hist_data/gen_score_data.py80
-rwxr-xr-xbackend/histplorer.py82
-rw-r--r--backend/tests/test_gen_score_data.py82
-rw-r--r--backend/tests/test_histplorer.py24
-rw-r--r--backend/tests/test_reduce_event_data.py2
-rw-r--r--src/App.vue3
7 files changed, 235 insertions, 49 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index bfecc1e..1a832ef 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -27,9 +27,15 @@ This directory holds files used to generate the history database data.db.
- `event_imgs`: <br>
Format: `id INT PRIMARY KEY, img_id INT` <br>
Assocates events with images
-- `descs` <br>
+- `descs`: <br>
Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT` <br>
Associates an event's enwiki title with a short description.
+- `dist`: <br>
+ Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br>
+ Maps scale units to event counts.
+- `scores`: <br>
+ Format: `id INT, scale INT, score INT, PRIMARY KEY (id, scale)` <br>
+ Maps events to score values for each scale (used to show events by popularity and uniformity across scale).
# Generating the Database
@@ -76,3 +82,6 @@ Some of the scripts use third-party packages:
## Remove Events Without Images/Descs
1. Run `reduce_event_data.py` to remove data for events that have no image/description.
+
+## Generate Distribution and Score Data
+1. Run `gen_score_data.py`, which add the `dist` and `scores` tables.
diff --git a/backend/hist_data/gen_score_data.py b/backend/hist_data/gen_score_data.py
new file mode 100644
index 0000000..4ea66c3
--- /dev/null
+++ b/backend/hist_data/gen_score_data.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python3
+
+"""
+Adds data about event distribution and scores to the database.
+"""
+
+# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+import os, sys
+parentDir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(parentDir)
+
+import sqlite3
+from cal import gregorianToJdn, jdnToGregorian
+
+MONTH_SCALE = -1;
+DAY_SCALE = -2;
+SCALES: list[int] = [int(x) for x in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]];
+MAX_DISPLAYED_PER_UNIT = 4
+#
+DB_FILE = 'data.db'
+
+def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None:
+ dbCon = sqlite3.connect(dbFile)
+ dbCur = dbCon.cursor()
+ #
+ print('Reading through events')
+ scaleUnitToCounts: dict[tuple[int, int], list[int]] = {}
+ # Maps scale and unit to two counts (num events in that unit, num events displayable for that unit)
+ # Only includes events with popularity values
+ idAndScaleToScore: dict[tuple[int, int], int] = {} # Maps event id and scale to score
+ iterNum = 0
+ query = 'SELECT events.id, start, fmt, pop FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC'
+ for eventId, eventStart, fmt, pop in dbCur.execute(query):
+ iterNum += 1
+ if iterNum % 1e3 == 0:
+ print(f'At iteration {iterNum}')
+ # For each scale
+ for scale in scales:
+ # Get unit
+ unit: int
+ if scale >= 1:
+ unit = (eventStart if fmt == 0 else jdnToGregorian(eventStart)[0]) // scale
+ elif scale == MONTH_SCALE:
+ if fmt == 0:
+ unit = gregorianToJdn(eventStart, 1, 1)
+ else:
+ year, month, day = jdnToGregorian(eventStart)
+ unit = eventStart if day == 1 else gregorianToJdn(year, month, 1)
+ else: # scale == DAY_SCALE
+ unit = eventStart if fmt != 0 else gregorianToJdn(eventStart, 1, 1)
+ # Update maps
+ counts: list[int]
+ if (scale, unit) in scaleUnitToCounts:
+ counts = scaleUnitToCounts[(scale, unit)]
+ counts[0] += 1
+ else:
+ counts = [1, 0]
+ if counts[1] < maxDisplayedPerUnit:
+ counts[1] += 1
+ idAndScaleToScore[(eventId, scale)] = pop
+ scaleUnitToCounts[(scale, unit)] = counts
+ #
+ print('Writing to db')
+ dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))')
+ dbCur.execute('CREATE TABLE scores (id INT, scale INT, score INT, PRIMARY KEY (id, scale))')
+ for (scale, unit), (count, _) in scaleUnitToCounts.items():
+ dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count))
+ for (eventId, scale), score in idAndScaleToScore.items():
+ dbCur.execute('INSERT INTO scores VALUES (?, ?, ?)', (eventId, scale, score))
+ #
+ print('Closing db')
+ dbCon.commit()
+ dbCon.close()
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ args = parser.parse_args()
+ #
+ genData(DB_FILE, SCALES, MAX_DISPLAYED_PER_UNIT)
diff --git a/backend/histplorer.py b/backend/histplorer.py
index 6d1298e..09aabd8 100755
--- a/backend/histplorer.py
+++ b/backend/histplorer.py
@@ -3,7 +3,7 @@ WSGI script that serves historical data
Expected HTTP query parameters:
- type:
- If 'events', reply with list of event objects, within a date range
+ If 'events', reply with list of event objects, within a date range, for a given scale
If 'info', reply with information about a given event
If 'sugg', reply with search suggestions for an event search string
- range: With type=events, specifies a historical-date range
@@ -11,8 +11,8 @@ Expected HTTP query parameters:
Examples:
range=1000.1910-10-09 means '1000 CE to 09/10/1910 (inclusive)'
range=-13000. means '13000 BCE onwards'
+- scale: With type=events, specifies a date scale (matched against 'scale' column in 'scores' table)
- incl: With type=events, specifies an event to include, as an event ID
-- excl: With type=events, specifies events to exclude, as period-separated event IDs
- event: With type=info, specifies the event to get info for
- input: With type=sugg, specifies a search string to suggest for
- limit: With type=events or type=sugg, specifies the max number of results
@@ -26,9 +26,8 @@ import gzip, jsonpickle
from hist_data.cal import gregorianToJdn, jdnToGregorian, jdnToJulian
DB_FILE = 'hist_data/data.db'
-MAX_REQ_EVENTS = 100
+MAX_REQ_EVENTS = 500
DEFAULT_REQ_EVENTS = 20
-MAX_REQ_EXCLS = 100
MAX_REQ_SUGGS = 50
DEFAULT_REQ_SUGGS = 5
MIN_CAL_YEAR = -4713 # Disallow within-year dates before this year
@@ -182,17 +181,22 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor):
except ValueError:
print(f'INFO: Invalid date-range value {dateRange}', file=sys.stderr)
return None
+ # Get scale
+ if 'scale' not in params:
+ print('INFO: No scale provided', file=sys.stderr)
+ return None
+ try:
+ scale = int(params['scale'])
+ except ValueError:
+ print('INFO: Invalid scale value', file=sys.stderr)
+ return None
# Get event category
ctg = params['ctg'] if 'ctg' in params else None
- # Get incl/excl
+ # Get incl value
try:
incl = int(params['incl']) if 'incl' in params else None
- excl = [int(x) for x in params['excl'].split('.')] if 'excl' in params else []
except ValueError:
- print('INFO: Invalid incl/excl value', file=sys.stderr)
- return None
- if len(excl) > MAX_REQ_EXCLS:
- print('INFO: Exceeded excl value limit', file=sys.stderr)
+ print('INFO: Invalid incl value', file=sys.stderr)
return None
# Get result set limit
try:
@@ -204,65 +208,57 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor):
print(f'INFO: Invalid results limit {resultLimit}', file=sys.stderr)
return None
#
- return lookupEvents(start, end, ctg, incl, excl, resultLimit, dbCur)
-def lookupEvents(start: HistDate | None, end: HistDate | None, ctg: str | None,
- incl: int | None, excl: list[int], resultLimit: int, dbCur: sqlite3.Cursor) -> list[Event] | None:
- """ Looks for events within a date range, restricted by event category,
- particular inclusions/exclusions, and a result limit """
- query = 'SELECT events.id, title, start, start_upper, end, end_upper, fmt, ctg, images.id, pop.pop from events' \
+ return lookupEvents(start, end, scale, ctg, incl, resultLimit, dbCur)
+def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: str | None,
+ incl: int | None, resultLimit: int, dbCur: sqlite3.Cursor) -> list[Event] | None:
+ """ Looks for events within a date range, in given scale,
+ restricted by event category, an optional particular inclusion, and a result limit """
+ query = \
+ 'SELECT events.id, title, start, start_upper, end, end_upper, fmt, ctg, images.id, scores.score FROM events' \
+ ' INNER JOIN scores ON events.id = scores.id' \
' INNER JOIN event_imgs ON events.id = event_imgs.id' \
- ' INNER JOIN images ON event_imgs.img_id = images.id LEFT JOIN pop ON events.id = pop.id'
- constraints = []
- params: list[str | int] = []
+ ' INNER JOIN images ON event_imgs.img_id = images.id'
+ constraints = ['scores.scale = ?']
+ params: list[str | int] = [scale]
# Constrain by start/end
if start is not None:
constraint = '(start >= ? AND fmt > 0 OR start >= ? AND fmt = 0)'
if start.gcal is None:
startJdn = gregorianToJdn(start.year, 1, 1) if start.year >= -4713 else 0
constraints.append(constraint)
- params.append(startJdn)
- params.append(start.year)
+ params.extend([startJdn, start.year])
else:
startJdn = gregorianToJdn(start.year, start.month, start.day)
constraints.append(constraint)
- params.append(startJdn)
- params.append(start.year if start.month == 1 and start.day == 1 else start.year + 1)
+ year = start.year if start.month == 1 and start.day == 1 else start.year + 1
+ params.extend([startJdn, year])
if end is not None:
- constraint = '(end IS NULL AND (start <= ? AND fmt > 0 OR start <= ? AND fmt = 0) OR ' \
- 'end IS NOT NULL AND (end <= ? AND fmt > 0 OR end <= ? AND fmt = 0))'
+ constraint = '(start <= ? AND fmt > 0 OR start <= ? AND fmt = 0)'
if end.gcal is None:
endJdn = gregorianToJdn(end.year, 1, 1) if end.year >= -4713 else -1
constraints.append(constraint)
- params.extend([endJdn, end.year, endJdn, end.year])
+ params.extend([endJdn, end.year])
else:
endJdn = gregorianToJdn(end.year, end.month, end.day)
constraints.append(constraint)
year = end.year if end.month == 12 and end.day == 31 else end.year - 1
- params.extend([endJdn, year, endJdn, year])
+ params.extend([endJdn, year])
# Constrain by event category
if ctg is not None:
constraints.append('ctg = ?')
params.append(ctg)
- # For exclusions, lookup extra, and remove later
- tempLimit = resultLimit + len(excl)
- exclusions = set(excl)
# Add constraints to query
query2 = query
if constraints:
- query2 += ' WHERE' + ' AND '.join(constraints)
- query2 += ' ORDER BY pop.pop DESC'
- query2 += f' LIMIT {tempLimit}'
+ query2 += ' WHERE ' + ' AND '.join(constraints)
+ query2 += ' ORDER BY scores.score DESC'
+ query2 += f' LIMIT {resultLimit}'
# Run query
results: list[Event] = []
for row in dbCur.execute(query2, params):
- eventId = row[0]
- if eventId in exclusions:
- continue
- if incl is not None and incl == eventId:
- incl = None
- if len(results) == resultLimit:
- break
results.append(eventEntryToResults(row))
+ if incl is not None and incl == row[0]:
+ incl = None
# Get any additional inclusion
if incl is not None:
row = dbCur.execute(query + ' WHERE events.id = ?', (incl,)).fetchone()
@@ -274,7 +270,7 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, ctg: str | None,
return results
def eventEntryToResults(
row: tuple[int, str, int, int | None, int | None, int | None, int, str, int, int | None]) -> Event:
- eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, pop = row
+ eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, score = row
""" Helper for converting an 'events' db entry into an Event object """
# Convert dates
dateVals: list[int | None] = [start, startUpper, end, endUpper]
@@ -296,10 +292,8 @@ def eventEntryToResults(
newDates[i] = HistDate(False, *jdnToJulian(n))
else:
newDates[i] = HistDate(True, *jdnToGregorian(n))
- if pop is None:
- pop = 0
#
- return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, pop)
+ return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, score)
# For type=info
def handleInfoReq(params: dict[str, str], dbCur: sqlite3.Cursor):
diff --git a/backend/tests/test_gen_score_data.py b/backend/tests/test_gen_score_data.py
new file mode 100644
index 0000000..a466679
--- /dev/null
+++ b/backend/tests/test_gen_score_data.py
@@ -0,0 +1,82 @@
+import unittest
+import tempfile, os
+
+from tests.common import createTestDbTable, readTestDbTable
+from hist_data.gen_score_data import genData, MONTH_SCALE, DAY_SCALE
+from hist_data.cal import gregorianToJdn
+
+class TestGenData(unittest.TestCase):
+ def test_gen(self):
+ with tempfile.TemporaryDirectory() as tempDir:
+ # Create temp history db
+ dbFile = os.path.join(tempDir, 'data.db')
+ createTestDbTable(
+ dbFile,
+ 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \
+ 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)',
+ 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+ {
+ (1, 'event one', 1900, None, None, None, 0, 'event'),
+ (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 15/11/2002 to 21/06/2010
+ (3, 'event three', 1900, None, 2000, None, 0, 'event'),
+ (4, 'event four', 1901, None, 2000, 2010, 0, 'event'),
+ (5, 'event five', 2415294, None, None, None, 1, 'event'), # 01/10/1900
+ (6, 'event six', 2415030, None, None, None, 1, 'event'), # 10/01/1900
+ }
+ )
+ createTestDbTable(
+ dbFile,
+ 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)',
+ 'INSERT INTO pop VALUES (?, ?)',
+ {
+ (1, 11),
+ (2, 21),
+ (4, 5),
+ (5, 50),
+ (6, 10),
+ }
+ )
+ # Run
+ genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2)
+ # Check
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT scale, unit, count FROM dist'),
+ {
+ (10, 190, 4),
+ (10, 200, 1),
+ (1, 1900, 3),
+ (1, 1901, 1),
+ (1, 2002, 1),
+ (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2),
+ (MONTH_SCALE, gregorianToJdn(1901, 1, 1), 1),
+ (MONTH_SCALE, gregorianToJdn(1900, 10, 1), 1),
+ (MONTH_SCALE, gregorianToJdn(2002, 11, 1), 1),
+ (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1),
+ (DAY_SCALE, gregorianToJdn(1900, 1, 10), 1),
+ (DAY_SCALE, gregorianToJdn(1900, 10, 1), 1),
+ (DAY_SCALE, gregorianToJdn(1901, 1, 1), 1),
+ (DAY_SCALE, gregorianToJdn(2002, 11, 15), 1),
+ }
+ )
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT id, scale, score FROM scores'),
+ {
+ (5, 10, 50),
+ (1, 10, 11),
+ (2, 10, 21),
+ (5, 1, 50),
+ (1, 1, 11),
+ (4, 1, 5),
+ (2, 1, 21),
+ (1, MONTH_SCALE, 11),
+ (6, MONTH_SCALE, 10),
+ (4, MONTH_SCALE, 5),
+ (5, MONTH_SCALE, 50),
+ (2, MONTH_SCALE, 21),
+ (1, DAY_SCALE, 11),
+ (4, DAY_SCALE, 5),
+ (5, DAY_SCALE, 50),
+ (6, DAY_SCALE, 10),
+ (2, DAY_SCALE, 21),
+ }
+ )
diff --git a/backend/tests/test_histplorer.py b/backend/tests/test_histplorer.py
index 6487977..eb89f50 100644
--- a/backend/tests/test_histplorer.py
+++ b/backend/tests/test_histplorer.py
@@ -16,6 +16,7 @@ def initTestDb(dbFile: str) -> None:
(3, 'event three', 2448175, 2451828, None, None, 2, 'discovery'), # 10/10/1990 to 10/10/2000
(4, 'event four', 991206, None, 1721706, None, 1, 'event'), # 10/10/-2000 to 10/10/1
(5, 'event five', 2000, None, 2001, None, 0, 'event'),
+ (6, 'event six', 1500, None, 2000, None, 0, 'event'),
}
)
createTestDbTable(
@@ -28,6 +29,21 @@ def initTestDb(dbFile: str) -> None:
(3, 0),
(4, 1000),
(5, 51),
+ (6, 60),
+ }
+ )
+ createTestDbTable(
+ dbFile,
+ 'CREATE TABLE scores (id INT, scale INT, score INT, PRIMARY KEY (id, scale))',
+ 'INSERT INTO scores VALUES (?, ?, ?)',
+ {
+ (1, 1, 11),
+ (1, 10, 11),
+ (2, 1, 21),
+ (3, 1, 0),
+ (4, 1, 1000),
+ (5, 1, 51),
+ (6, 10, 60),
}
)
createTestDbTable(
@@ -40,6 +56,7 @@ def initTestDb(dbFile: str) -> None:
(3, 30),
(4, 20),
(5, 50),
+ (6, 60),
}
)
createTestDbTable(
@@ -51,6 +68,7 @@ def initTestDb(dbFile: str) -> None:
(20, 'example.com/2', 'cc-by', 'artist two', 'credits two'),
(30, 'example.com/3', 'cc-by-sa 3.0', 'artist three', 'credits three'),
(50, 'example.com/5', 'cc-by', 'artist five', 'credits five'),
+ (60, 'example.com/6', 'cc-by', 'artist six', 'credits six'),
}
)
createTestDbTable(
@@ -63,6 +81,7 @@ def initTestDb(dbFile: str) -> None:
(3, 300, 'desc three'),
(4, 400, 'desc four'),
(5, 500, 'desc five'),
+ (6, 600, 'desc six'),
}
)
@@ -75,17 +94,18 @@ class TestHandleReq(unittest.TestCase):
def tearDown(self):
self.tempDir.cleanup()
def test_events_req(self):
- response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=-1999.2002-11-1&incl=3&limit=2'})
+ response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=-1999.2002-11-1&scale=1&incl=3&limit=2'})
self.assertEqual(response, [
Event(5, 'event five', HistDate(True, 2000, 1, 1), None, HistDate(True, 2001, 1, 1), None,
'event', 50, 51),
Event(3, 'event three', HistDate(True, 1990, 10, 10), HistDate(True, 2000, 10, 10), None, None,
'discovery', 30, 0),
])
- response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=.1999-11-27&excl=1&ctg=event'})
+ response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=.1999-11-27&scale=1&ctg=event'})
self.assertEqual(response, [
Event(4, 'event four', HistDate(False, -2000, 10, 10), None, HistDate(False, 1, 10, 10), None,
'event', 20, 1000),
+ Event(1, 'event one', HistDate(True, 1900, 1, 1), None, None, None, 'event', 10, 11),
])
def test_info_req(self):
response = handleReq(self.dbFile, {'QUERY_STRING': 'type=info&event=3'})
diff --git a/backend/tests/test_reduce_event_data.py b/backend/tests/test_reduce_event_data.py
index cb54758..c879150 100644
--- a/backend/tests/test_reduce_event_data.py
+++ b/backend/tests/test_reduce_event_data.py
@@ -57,7 +57,7 @@ class TestReduceData(unittest.TestCase):
reduceData(dbFile)
# Check
self.assertEqual(
- readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg from events'),
+ readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg FROM events'),
{
(1, 'event one', 1900, None, None, None, 0, 'event'),
}
diff --git a/src/App.vue b/src/App.vue
index eacc73f..feba10e 100644
--- a/src/App.vue
+++ b/src/App.vue
@@ -122,7 +122,7 @@ function reduceEvents(){
idToEvent = eventsToKeep;
}
// For getting events from server
-const EVENT_REQ_LIMIT = 100;
+const EVENT_REQ_LIMIT = 500;
let queriedRanges: DateRangeTree[] = SCALES.map(() => new DateRangeTree());
// For each scale, holds date ranges for which data has already been queried fromm the server
let pendingReq = false; // Used to serialise event-req handling
@@ -141,6 +141,7 @@ async function onEventDisplay(
let urlParams = new URLSearchParams({
type: 'events',
range: `${firstDate}.${lastDate}`,
+ scale: String(SCALES[scaleIdx]),
limit: String(EVENT_REQ_LIMIT),
});
let responseObj: HistEventJson[] = await queryServer(urlParams);