diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-12-30 23:28:09 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-12-30 23:33:37 +1100 |
| commit | 0e5e46cedaaeacf59cfd0f2e30c1ae6923466870 (patch) | |
| tree | 016b712ce1d4255895bbba11714e624df09cfc4a /backend/hist_data/reduce_event_data.py | |
| parent | 086b0c30afdf2c0fbff48e1005b2f9220b028094 (diff) | |
Generate event_disp data before image-generation
Make gen_disp_data.py delete non-displayable events
Make reduce_event_data.py also delete from 'dist' and 'event_disp'
Remove MAX_IMGS_PER_CTG from enwiki/gen_img_data.py
Make gen_desc_data.py include events without images
Diffstat (limited to 'backend/hist_data/reduce_event_data.py')
| -rwxr-xr-x | backend/hist_data/reduce_event_data.py | 42 |
1 files changed, 33 insertions, 9 deletions
```diff
diff --git a/backend/hist_data/reduce_event_data.py b/backend/hist_data/reduce_event_data.py
index 15c2ab5..c061f90 100755
--- a/backend/hist_data/reduce_event_data.py
+++ b/backend/hist_data/reduce_event_data.py
@@ -1,23 +1,43 @@
 #!/usr/bin/python3
 
 """
-Delete extraneous events from the database that have no image (and consequently no description)
+Delete events from the database that have no image.
 """
 
+# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+import os, sys
+parentDir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(parentDir)
+
 import argparse
 import sqlite3
+from cal import SCALES, dbDateToHistDate, dateToUnit
 
 DB_FILE = 'data.db'
 
-def reduceData(dbFile: str) -> None:
+def reduceData(dbFile: str, scales: list[int]) -> None:
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
 	#
 	print('Getting events to delete')
-	eventsToDel = set()
-	query = 'SELECT events.id FROM events LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL'
-	for (eventId,) in dbCur.execute(query):
-		eventsToDel.add(eventId)
+	eventsToDel: list[int] = []
+	scaleUnitToDelCount: dict[tuple[int, int], int] = {} # Stores counts to subtract from entries in 'dist'
+	query = 'SELECT events.id, events.start, events.fmt FROM events' \
+		' LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL'
+	iterNum = 0
+	for (eventId, start, fmt) in dbCur.execute(query):
+		if iterNum % 1e5 == 0:
+			print(f'At iteration {iterNum}')
+		#
+		eventsToDel.append(eventId)
+		date = dbDateToHistDate(start, fmt)
+		for scale in scales:
+			unit = dateToUnit(date, scale)
+			if (scale, unit) not in scaleUnitToDelCount:
+				scaleUnitToDelCount[(scale, unit)] = 1
+			else:
+				scaleUnitToDelCount[(scale, unit)] += 1
+	print(f'Found {len(eventsToDel)}')
 	#
 	print('Deleting events')
 	iterNum = 0
@@ -26,8 +46,12 @@ def reduceData(dbFile: str) -> None:
 		if iterNum % 1e5 == 0:
 			print(f'At iteration {iterNum}')
 		#
-		dbCur.execute('DELETE from events where id = ?', (eventId,))
-		dbCur.execute('DELETE from pop where id = ?', (eventId,))
+		dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
+		dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
+		dbCur.execute('DELETE FROM event_disp WHERE id = ?', (eventId,))
+	for (scale, unit), delCount in scaleUnitToDelCount.items():
+		dbCur.execute('UPDATE dist SET count = count - ? WHERE scale = ? AND unit = ?', (delCount, scale, unit))
+	dbCur.execute('DELETE FROM dist WHERE count < 1')
 	#
 	dbCon.commit()
 	dbCon.close()
@@ -36,4 +60,4 @@ if __name__ == '__main__':
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
 	#
-	reduceData(DB_FILE)
+	reduceData(DB_FILE, SCALES)
```
