aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r--backend/hist_data/README.md28
-rw-r--r--backend/hist_data/enwiki/README.md2
-rwxr-xr-xbackend/hist_data/enwiki/gen_img_data.py43
-rwxr-xr-xbackend/hist_data/gen_desc_data.py5
-rw-r--r--backend/hist_data/gen_disp_data.py45
-rwxr-xr-xbackend/hist_data/reduce_event_data.py42
-rw-r--r--backend/tests/enwiki/test_gen_img_data.py21
-rw-r--r--backend/tests/test_gen_desc_data.py13
-rw-r--r--backend/tests/test_gen_disp_data.py61
-rw-r--r--backend/tests/test_reduce_event_data.py91
10 files changed, 231 insertions, 120 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index b557b14..d05016c 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -21,6 +21,12 @@ This directory holds files used to generate the history database data.db.
- `pop`: <br>
Format: `id INT PRIMARY KEY, pop INT` <br>
Associates each event with a popularity measure (currently an average monthly viewcount)
+- `dist`: <br>
+ Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br>
+ Maps scale units to counts of events in them.
+- `event_disp`: <br>
+ Format: `id INT, scale INT, PRIMARY KEY (id, scale)` <br>
+ Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time).
- `images`: <br>
Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br>
Holds metadata for available images
@@ -30,12 +36,6 @@ This directory holds files used to generate the history database data.db.
- `descs`: <br>
Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT` <br>
Associates an event's enwiki title with a short description.
-- `dist`: <br>
- Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br>
- Maps scale units to event counts.
-- `event_disp`: <br>
- Format: `id INT, scale INT, PRIMARY KEY (id, scale)` <br>
- Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time).
# Generating the Database
@@ -51,13 +51,15 @@ Some of the scripts use third-party packages:
1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table.
## Generate Popularity Data
-1. Obtain 'page view files' in enwiki/, as specified in it's README.
+1. Obtain an enwiki dump and 'page view files' in enwiki/, as specified in the README.
1. Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table.
+## Generate Event Display Data and Reduce Dataset
+1. Run `gen_disp_data.py`, which adds the `dist` and `event_disp` tables, and removes events not in `event_disp`.
+
## Generate Image Data and Popularity Data
1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`,
looks for infobox image names, and stores them in an image database.
- Uses popularity data in enwiki/ to find the top N events in each event category.
1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found
images, and adds them to the image database.
1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/.
@@ -69,11 +71,8 @@ Some of the scripts use third-party packages:
- An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
## Generate Description Data
-1. Obtain an enwiki dump in enwiki/, as specified in the README.
-1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
-1. Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/,
- and the `events` and `images` tables (only adds descriptions for events with images).
+1. Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/, and the `events` table.
## Optionally Add Extra Event Data
1. Additional events can be described in `picked/events.json`, with images for them put
@@ -81,7 +80,4 @@ Some of the scripts use third-party packages:
1. Can run `gen_picked_data.py` to add those described events to the database.
## Remove Events Without Images/Descs
-1. Run `reduce_event_data.py` to remove data for events that have no image/description.
-
-## Generate Distribution and Displayability Data
-1. Run `gen_disp_data.py`, which add the `dist` and `event_disp` tables.
+1. Run `reduce_event_data.py` to remove data for events that have no image.
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index 29fc2ff..262ebdb 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -38,7 +38,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database.
- `img_data.db` <br>
Used to hold metadata about infobox images for a set of page IDs.
- Generated using `get_enwiki_img_data.py` and `download_img_license_info.py`. <br>
+ Generated using `gen_img_data.py` and `download_img_license_info.py`. <br>
Tables: <br>
- `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br>
`img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs.
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index b4ade9f..922b893 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -14,10 +14,8 @@ import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dump_index.db'
-PAGEVIEW_DB = 'pageview_data.db'
IMG_DB = 'img_data.db' # The database to create
DB_FILE = os.path.join('..', 'data.db')
-MAX_IMGS_PER_CTG = 20000
#
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
@@ -179,49 +177,28 @@ def getImageName(content: list[str]) -> str | None:
return None
return None
-def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]:
+def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
print('Getting event data')
- titleToCtg: dict[str, str] = {}
+ titles: set[str] = set()
dbCon = sqlite3.connect(dbFile)
- for title, ctg in dbCon.execute('SELECT title, ctg from events'):
- titleToCtg[title] = ctg
+ for (title,) in dbCon.execute('SELECT title from events'):
+ titles.add(title)
dbCon.close()
- print('Getting top images for each event category')
- ctgToTitles: dict[str, list[str]] = {}
- dbCon = sqlite3.connect(pageviewDb)
- for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'):
- if title not in titleToCtg:
- continue
- ctg = titleToCtg[title]
- if ctg not in ctgToTitles:
- ctgToTitles[ctg] = []
- elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
- continue
- ctgToTitles[ctg].append(title)
- del titleToCtg[title]
- dbCon.close()
- for title, ctg in titleToCtg.items(): # Account for titles without view counts
- if ctg not in ctgToTitles:
- ctgToTitles[ctg] = []
- elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
- continue
- ctgToTitles[ctg].append(title)
print('Getting page IDs')
pageIds: set[int] = set()
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
- for ctg in ctgToTitles:
- for title in ctgToTitles[ctg]:
- row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
- if row:
- pageIds.add(row[0])
+ for title in titles:
+ row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+ if row:
+ pageIds.add(row[0])
dbCon.close()
- print(f'Result: {len(pageIds)} out of {len(titleToCtg)}')
+ print(f'Result: {len(pageIds)} out of {len(titles)}')
return pageIds
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
#
- pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG)
+ pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB)
genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py
index 68f9e56..0d7ee88 100755
--- a/backend/hist_data/gen_desc_data.py
+++ b/backend/hist_data/gen_desc_data.py
@@ -16,10 +16,9 @@ def genData(enwikiDb: str, dbFile: str) -> None:
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)')
#
- print('Getting events with images')
+ print('Getting events')
titleToId: dict[str, int] = {}
- query = 'SELECT events.id, events.title FROM events INNER JOIN event_imgs ON events.id = event_imgs.id'
- for eventId, title in dbCur.execute(query):
+ for eventId, title in dbCur.execute('SELECT id, title FROM events'):
titleToId[title] = eventId
#
print('Getting Wikipedia descriptions')
diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py
index a81263f..e771e57 100644
--- a/backend/hist_data/gen_disp_data.py
+++ b/backend/hist_data/gen_disp_data.py
@@ -1,7 +1,7 @@
#!/usr/bin/python3
"""
-Adds data about event distribution and displayability to the database.
+Adds data about event distribution to the database, and removes events not eligible for display.
"""
# Enable unit testing code to, when running this script, resolve imports of modules within this directory
@@ -23,12 +23,12 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None:
scaleUnitToCounts: dict[tuple[int, int], list[int]] = {}
# Maps scale and unit to two counts (num events in that unit, num events displayable for that unit)
# Only includes events with popularity values
- idScales: set[tuple[int, int]] = set() # Maps event ids to scales they are displayable on
+ idScales: dict[int, list[int]] = {} # Maps event ids to scales they are displayable on
iterNum = 0
query = 'SELECT events.id, start, fmt FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC'
for eventId, eventStart, fmt in dbCur.execute(query):
iterNum += 1
- if iterNum % 1e3 == 0:
+ if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
# For each scale
for scale in scales:
@@ -42,16 +42,49 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None:
counts = [1, 0]
if counts[1] < maxDisplayedPerUnit:
counts[1] += 1
- idScales.add((eventId, scale))
+ if eventId not in idScales:
+ idScales[eventId] = []
+ idScales[eventId].append(scale)
scaleUnitToCounts[(scale, unit)] = counts
+ print(f'Results: {len(idScales)} displayable events')
+ #
+ print('Looking for non-displayable events')
+ eventsToDel: list[int] = []
+ for eventId, eventStart, fmt in dbCur.execute(query):
+ if eventId in idScales:
+ continue
+ eventsToDel.append(eventId)
+ # Remove from data to be added to 'dist'
+ for scale in scales:
+ unit = dateToUnit(dbDateToHistDate(eventStart, fmt), scale)
+ count = scaleUnitToCounts[(scale, unit)][0] - 1
+ if count == 0:
+ del scaleUnitToCounts[(scale, unit)]
+ else:
+ scaleUnitToCounts[(scale, unit)][0] = count
+ query2 = 'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL'
+ for (eventId,) in dbCur.execute(query2): # Include events without scores
+ eventsToDel.append(eventId)
+ print(f'Found {len(eventsToDel)}')
+ #
+ print(f'Deleting {len(eventsToDel)} events')
+ iterNum = 0
+ for eventId in eventsToDel:
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
+ dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
#
print('Writing to db')
dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))')
for (scale, unit), (count, _) in scaleUnitToCounts.items():
dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count))
dbCur.execute('CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))')
- for eventId, scale in idScales:
- dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale))
+	for eventId, eventScales in idScales.items():
+		for scale in eventScales:
+			dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale))
#
print('Closing db')
dbCon.commit()
diff --git a/backend/hist_data/reduce_event_data.py b/backend/hist_data/reduce_event_data.py
index 15c2ab5..c061f90 100755
--- a/backend/hist_data/reduce_event_data.py
+++ b/backend/hist_data/reduce_event_data.py
@@ -1,23 +1,43 @@
#!/usr/bin/python3
"""
-Delete extraneous events from the database that have no image (and consequently no description)
+Delete events from the database that have no image.
"""
+# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+import os, sys
+parentDir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(parentDir)
+
import argparse
import sqlite3
+from cal import SCALES, dbDateToHistDate, dateToUnit
DB_FILE = 'data.db'
-def reduceData(dbFile: str) -> None:
+def reduceData(dbFile: str, scales: list[int]) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
#
print('Getting events to delete')
- eventsToDel = set()
- query = 'SELECT events.id FROM events LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL'
- for (eventId,) in dbCur.execute(query):
- eventsToDel.add(eventId)
+ eventsToDel: list[int] = []
+ scaleUnitToDelCount: dict[tuple[int, int], int] = {} # Stores counts to subtract from entries in 'dist'
+ query = 'SELECT events.id, events.start, events.fmt FROM events' \
+ ' LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL'
+ iterNum = 0
+	for (eventId, start, fmt) in dbCur.execute(query):
+		iterNum += 1
+		if iterNum % 1e5 == 0:
+			print(f'At iteration {iterNum}')
+ #
+ eventsToDel.append(eventId)
+ date = dbDateToHistDate(start, fmt)
+ for scale in scales:
+ unit = dateToUnit(date, scale)
+ if (scale, unit) not in scaleUnitToDelCount:
+ scaleUnitToDelCount[(scale, unit)] = 1
+ else:
+ scaleUnitToDelCount[(scale, unit)] += 1
+ print(f'Found {len(eventsToDel)}')
#
print('Deleting events')
iterNum = 0
@@ -26,8 +46,12 @@ def reduceData(dbFile: str) -> None:
if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
#
- dbCur.execute('DELETE from events where id = ?', (eventId,))
- dbCur.execute('DELETE from pop where id = ?', (eventId,))
+ dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
+ dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
+ dbCur.execute('DELETE FROM event_disp WHERE id = ?', (eventId,))
+ for (scale, unit), delCount in scaleUnitToDelCount.items():
+ dbCur.execute('UPDATE dist SET count = count - ? WHERE scale = ? AND unit = ?', (delCount, scale, unit))
+ dbCur.execute('DELETE FROM dist WHERE count < 1')
#
dbCon.commit()
dbCon.close()
@@ -36,4 +60,4 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
#
- reduceData(DB_FILE)
+ reduceData(DB_FILE, SCALES)
diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py
index 93bb196..04fdd69 100644
--- a/backend/tests/enwiki/test_gen_img_data.py
+++ b/backend/tests/enwiki/test_gen_img_data.py
@@ -9,7 +9,7 @@ TEST_DUMP_FILE = os.path.join(os.path.dirname(__file__), 'sample_enwiki_pages_ar
class TestGetInputPageIdsFromDb(unittest.TestCase):
def test_get(self):
with tempfile.TemporaryDirectory() as tempDir:
- # Create temp tree-of-life db
+ # Create temp history db
dbFile = os.path.join(tempDir, 'data.db')
createTestDbTable(
dbFile,
@@ -24,19 +24,6 @@ class TestGetInputPageIdsFromDb(unittest.TestCase):
(5, 'Marie Curie', 2403277, None, 2427622, None, 2, 'human'),
}
)
- # Create temp pageviews db
- pageviewDb = os.path.join(tempDir, 'pageview_data.db')
- createTestDbTable(
- pageviewDb,
- 'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)',
- 'INSERT INTO views VALUES (?, ?, ?)',
- {
- ('George Washington', 2, 8),
- ('Marie Curie', 5, 10),
- ('Douglas Adams', 3, 5),
- ('Belgium', 1, 100),
- }
- )
# Create temp dump-index db
indexDb = os.path.join(tempDir, 'dump_index.db')
createTestDbTable(
@@ -46,15 +33,15 @@ class TestGetInputPageIdsFromDb(unittest.TestCase):
{
('Belgium',10,0,-1),
('George Washington',20,0,-1),
- ('Douglas Adamns',30,0,-1),
+ ('Douglas Adams',30,0,-1),
('Marie Curie',50,0,-1),
('Autism',25,0,-1),
}
)
# Run
- pageIds = getInputPageIdsFromDb(dbFile, pageviewDb, indexDb, 2)
+ pageIds = getInputPageIdsFromDb(dbFile, indexDb)
# Check
- self.assertEqual(pageIds, {50, 20, 10})
+ self.assertEqual(pageIds, {10, 20, 30, 50})
class TestGenData(unittest.TestCase):
def test_gen(self):
diff --git a/backend/tests/test_gen_desc_data.py b/backend/tests/test_gen_desc_data.py
index 6f321b4..eabe644 100644
--- a/backend/tests/test_gen_desc_data.py
+++ b/backend/tests/test_gen_desc_data.py
@@ -18,7 +18,6 @@ class TestGenData(unittest.TestCase):
(3, 'III'),
(4, 'IV'),
(5, 'V'),
- (6, 'VI'),
}
)
createTestDbTable(
@@ -38,7 +37,6 @@ class TestGenData(unittest.TestCase):
(3, 'Three'),
(4, 'Four'),
(5, 'Five'),
- (6, 'Six'),
}
)
# Create temp history db
@@ -53,17 +51,6 @@ class TestGenData(unittest.TestCase):
(20, 'II', 200, None, None, None, 0, 'discovery'),
(30, 'III', 300, None, 350, None, 0, 'event'),
(50, 'V', 5, 10, None, None, 1, 'human'),
- (60, 'VI', 6000, None, None, None, None, 'event'),
- }
- )
- createTestDbTable(
- dbFile,
- 'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)',
- 'INSERT INTO event_imgs VALUES (?, ?)',
- {
- (10, 100),
- (30, 300),
- (50, 500),
}
)
# Run
diff --git a/backend/tests/test_gen_disp_data.py b/backend/tests/test_gen_disp_data.py
index 464405a..c39c962 100644
--- a/backend/tests/test_gen_disp_data.py
+++ b/backend/tests/test_gen_disp_data.py
@@ -17,11 +17,16 @@ class TestGenData(unittest.TestCase):
'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
{
(1, 'event one', 1900, None, None, None, 0, 'event'),
- (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), # 15/11/2002 to 21/06/2010
- (3, 'event three', 1900, None, 2000, None, 0, 'event'),
+ (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), # 15/11/2002
+ (3, 'event three', 1900, None, 2000, None, 0, 'event'), # version of 1 without pop score
(4, 'event four', 1901, None, 2000, 2010, 0, 'event'),
(5, 'event five', 2415307, None, None, None, 1, 'event'), # 01/10/1900
(6, 'event six', 2415030, None, None, None, 2, 'event'), # 10/01/1900
+ (7, 'event seven', 1900, None, None, None, 0, 'event'), # popular version of 1
+ (8, 'event eight', 1900, None, None, None, 0, 'event'), # less popular version of 1
+ (9, 'event nine', 1900, None, None, None, 0, 'event'), # less popular version of 1
+ (10, 'event ten', 2415307, None, None, None, 1, 'event'), # less popular version of 5
+ (11, 'event eleven', 2415307, None, None, None, 1, 'event'), # slightly less popular version of 5
}
)
createTestDbTable(
@@ -34,26 +39,55 @@ class TestGenData(unittest.TestCase):
(4, 5),
(5, 50),
(6, 10),
+ (7, 100),
+ (8, 1),
+ (9, 2),
+ (10, 40),
+ (11, 45),
}
)
# Run
genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2)
# Check
self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT * FROM events'),
+ {
+ (1, 'event one', 1900, None, None, None, 0, 'event'),
+ (2, 'event two', 2452607, None, 2455369, None, 3, 'human'),
+ (4, 'event four', 1901, None, 2000, 2010, 0, 'event'),
+ (5, 'event five', 2415307, None, None, None, 1, 'event'),
+ (6, 'event six', 2415030, None, None, None, 2, 'event'),
+ (7, 'event seven', 1900, None, None, None, 0, 'event'),
+ (11, 'event eleven', 2415307, None, None, None, 1, 'event'), # 01/10/1900
+ }
+ )
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT * FROM pop'),
+ {
+ (1, 11),
+ (2, 21),
+ (4, 5),
+ (5, 50),
+ (6, 10),
+ (7, 100),
+ (11, 45),
+ }
+ )
+ self.assertEqual(
readTestDbTable(dbFile, 'SELECT scale, unit, count FROM dist'),
{
- (10, 190, 4),
+ (10, 190, 6),
(10, 200, 1),
- (1, 1900, 3),
+ (1, 1900, 5),
(1, 1901, 1),
(1, 2002, 1),
- (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2),
+ (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 3),
(MONTH_SCALE, gregorianToJdn(1901, 1, 1), 1),
- (MONTH_SCALE, julianToJdn(1900, 10, 1), 1),
+ (MONTH_SCALE, julianToJdn(1900, 10, 1), 2),
(MONTH_SCALE, julianToJdn(2002, 11, 1), 1),
- (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1),
+ (DAY_SCALE, gregorianToJdn(1900, 1, 1), 2),
(DAY_SCALE, gregorianToJdn(1900, 1, 10), 1),
- (DAY_SCALE, julianToJdn(1900, 10, 1), 1),
+ (DAY_SCALE, julianToJdn(1900, 10, 1), 2),
(DAY_SCALE, gregorianToJdn(1901, 1, 1), 1),
(DAY_SCALE, julianToJdn(2002, 11, 15), 1),
}
@@ -62,21 +96,24 @@ class TestGenData(unittest.TestCase):
readTestDbTable(dbFile, 'SELECT id, scale FROM event_disp'),
{
(5, 10),
- (1, 10),
+ (7, 10),
(2, 10),
(5, 1),
- (1, 1),
+ (7, 1),
(4, 1),
(2, 1),
(1, MONTH_SCALE),
- (6, MONTH_SCALE),
+ (7, MONTH_SCALE),
(4, MONTH_SCALE),
(5, MONTH_SCALE),
+ (11, MONTH_SCALE),
(2, MONTH_SCALE),
(1, DAY_SCALE),
+ (7, DAY_SCALE),
+ (6, DAY_SCALE),
(4, DAY_SCALE),
(5, DAY_SCALE),
- (6, DAY_SCALE),
+ (11, DAY_SCALE),
(2, DAY_SCALE),
}
)
diff --git a/backend/tests/test_reduce_event_data.py b/backend/tests/test_reduce_event_data.py
index c879150..7f1ce73 100644
--- a/backend/tests/test_reduce_event_data.py
+++ b/backend/tests/test_reduce_event_data.py
@@ -3,6 +3,7 @@ import tempfile, os
from tests.common import createTestDbTable, readTestDbTable
from hist_data.reduce_event_data import reduceData
+from hist_data.cal import gregorianToJdn, julianToJdn, MONTH_SCALE, DAY_SCALE
class TestReduceData(unittest.TestCase):
def test_reduce(self):
@@ -16,8 +17,10 @@ class TestReduceData(unittest.TestCase):
'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
{
(1, 'event one', 1900, None, None, None, 0, 'event'),
- (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002 to 21/06/2010
- (3, 'event three', 2448175, 2451828, None, None, 2, 'discovery'), # 10/10/1990 to 10/10/2000
+ (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002
+ (3, 'event three', 2448175, 2448200, None, None, 2, 'discovery'), # 10/10/1990
+ (4, 'event four', 1900, None, None, None, 0, 'event'), # Copy of 1
+ (5, 'event five', 2452595, None, 2455369, None, 3, 'human'), # Day after 2
}
)
createTestDbTable(
@@ -25,8 +28,50 @@ class TestReduceData(unittest.TestCase):
'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)',
'INSERT INTO pop VALUES (?, ?)',
{
- (1, 11),
- (2, 21),
+ (1, 10),
+ (2, 20),
+ (3, 30),
+ (4, 40),
+ (5, 50),
+ }
+ )
+ createTestDbTable(
+ dbFile,
+ 'CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))',
+ 'INSERT INTO dist VALUES (?, ?, ?)',
+ {
+ (1, 1900, 2),
+ (1, 1990, 1),
+ (1, 2002, 2),
+ (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2),
+ (MONTH_SCALE, gregorianToJdn(1990, 10, 1), 1),
+ (MONTH_SCALE, julianToJdn(2002, 11, 1), 2),
+ (DAY_SCALE, gregorianToJdn(1900, 1, 1), 2),
+ (DAY_SCALE, gregorianToJdn(1990, 10, 10), 1),
+ (DAY_SCALE, 2452594, 1),
+ (DAY_SCALE, 2452595, 1),
+ }
+ )
+ createTestDbTable(
+ dbFile,
+ 'CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))',
+ 'INSERT INTO event_disp VALUES (?, ?)',
+ {
+ (1, 1),
+ (1, MONTH_SCALE),
+ (1, DAY_SCALE),
+ (2, 1),
+ (2, MONTH_SCALE),
+ (2, DAY_SCALE),
+ (3, 1),
+ (3, MONTH_SCALE),
+ (3, DAY_SCALE),
+ (4, 1),
+ (4, MONTH_SCALE),
+ (4, DAY_SCALE),
+ (5, 1),
+ (5, MONTH_SCALE),
+ (5, DAY_SCALE),
}
)
createTestDbTable(
@@ -34,7 +79,8 @@ class TestReduceData(unittest.TestCase):
'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)',
'INSERT INTO event_imgs VALUES (?, ?)',
{
- (1, 10),
+ (1, 11),
+ (2, 21),
}
)
createTestDbTable(
@@ -42,7 +88,8 @@ class TestReduceData(unittest.TestCase):
'CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)',
'INSERT INTO images VALUES (?, ?, ?, ?, ?)',
{
- (10, 'example.com/1', 'cc0', 'artist one', 'credits one'),
+ (11, 'example.com/1', 'cc0', 'artist one', 'credits one'),
+ (21, 'example.com/1', 'cc0', 'artist two', 'credits two'),
}
)
createTestDbTable(
@@ -54,17 +101,41 @@ class TestReduceData(unittest.TestCase):
}
)
# Run
- reduceData(dbFile)
+ reduceData(dbFile, [1, MONTH_SCALE, DAY_SCALE])
# Check
self.assertEqual(
- readTestDbTable(dbFile, 'SELECT id, title, start, start_upper, end, end_upper, fmt, ctg FROM events'),
+ readTestDbTable(dbFile, 'SELECT * FROM events'),
{
(1, 'event one', 1900, None, None, None, 0, 'event'),
+ (2, 'event two', 2452594, None, 2455369, None, 3, 'human'),
}
)
self.assertEqual(
- readTestDbTable(dbFile, 'SELECT id, pop from pop'),
+ readTestDbTable(dbFile, 'SELECT * from pop'),
{
- (1, 11),
+ (1, 10),
+ (2, 20),
+ }
+ )
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT * from dist'),
+ {
+ (1, 1900, 1),
+ (1, 2002, 1),
+ (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 1),
+ (MONTH_SCALE, julianToJdn(2002, 11, 1), 1),
+ (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1),
+ (DAY_SCALE, 2452594, 1),
+ }
+ )
+ self.assertEqual(
+ readTestDbTable(dbFile, 'SELECT * from event_disp'),
+ {
+ (1, 1),
+ (1, MONTH_SCALE),
+ (1, DAY_SCALE),
+ (2, 1),
+ (2, MONTH_SCALE),
+ (2, DAY_SCALE),
}
)