17 files changed, 222 insertions, 209 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index b557b14..9ae7811 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -11,16 +11,22 @@ This directory holds files used to generate the history database data.db.
         If `start_upper` is present, it and `start` denote an uncertain range of start times.
         Similarly for 'end' and 'end_upper'.
     -   `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`.
+        -   If 0, they denote a number of years AD (if positive) or BC (if negative).
         -   If 1, they denote a Julian date number.
-            This allows simple comparison of events with day-level precision, but only goes back to 4713 BCE.
-        -   If 2, same as 1, but dates are preferably displayed using the Gregorian calendar, not the Julian calendar.
+            This allows simple comparison of events with day-level precision, but only goes back to 4713 BC.
+        -   If 2, same as 1, but with a preference for display using the Julian calendar, not the Gregorian calendar.
             For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not.
-        -   If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'.
+        -   If 3, same as 2, but where only 'start' and 'start_upper' are 'preferably Julian'.
             For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not.
-        -   If 0, they denote a number of years CE (if positive) or BCE (if negative).
 -   `pop`: <br>
     Format: `id INT PRIMARY KEY, pop INT` <br>
     Associates each event with a popularity measure (currently an average monthly viewcount)
+-   `dist`: <br>
+    Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br>
+    Maps scale units to counts of events in them.
+-   `event_disp`: <br>
+    Format: `id INT, scale INT, PRIMARY KEY (id, scale)` <br>
+    Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time).
 -   `images`: <br>
     Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br>
     Holds metadata for available images
@@ -30,12 +36,6 @@ This directory holds files used to generate the history database data.db.
 -   `descs`: <br>
     Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT` <br>
     Associates an event's enwiki title with a short description.
--   `dist`: <br>
-    Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br>
-    Maps scale units to event counts.
--   `event_disp`: <br>
-    Format: `id INT, scale INT, PRIMARY KEY (id, scale)` <br>
-    Maps events to scales they are 'displayable' on (used to make displayed events more uniform across time).
 
 # Generating the Database
 
@@ -44,36 +44,38 @@ Some of the scripts use third-party packages:
 -   `indexed_bzip2`: For parallelised bzip2 processing
 -   `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps
 -   `requests`: For downloading data
--   `Pillow`: For image processing
 
 ## Generate Event Data
 1.  Obtain a Wikidata JSON dump in wikidata/, as specified in it's README.
 1.  Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table.
+    You might want to set WIKIDATA_FILE in the script to the dump file's name.
 
 ## Generate Popularity Data
-1.  Obtain 'page view files' in enwiki/, as specified in it's README.
+1.  Obtain an enwiki dump and 'page view files' in enwiki/, as specified in that directory's README.
 1.  Run `gen_pop_data.py`, which adds the `pop` table, using data in enwiki/ and the `events` table.
 
+## Generate Event Display Data, and Reduce Dataset
+1.  Run `gen_disp_data.py`, which adds the `dist` and `event_disp` tables, and removes events not in `event_disp`.
+
 ## Generate Image Data and Popularity Data
 1.  In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`,
     looks for infobox image names, and stores them in an image database.
-    Uses popularity data in enwiki/ to find the top N events in each event category.
 1.  In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found
-    images, and adds them to the image database.
-1.  In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/.
+    images, and adds them to the image database. You should probably first change the USER_AGENT
+    script variable to identify yourself to the online API (this is expected
+    [best practice](https://www.mediawiki.org/wiki/API:Etiquette)).
+1.  In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. Setting the
+    USER_AGENT variable applies here as well.
 1.  Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/.
     Adds the `imgs` and `event_imgs` tables. <br>
-    The outputs will likely need additional manual changes:
+    The output images may need additional manual changes:
     -   An input image might have no output produced, possibly due to
         data incompatibilities, memory limits, etc.
     -   An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg.
 
 ## Generate Description Data
-1.  Obtain an enwiki dump in enwiki/, as specified in the README.
-1.  In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump.
 1.  In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database.
-1.  Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/,
-    and the `events` and `images` tables (only adds descriptions for events with images).
+1.  Run `gen_desc_data.py`, which adds the `descs` table, using data in enwiki/, and the `events` table.
 
 ## Optionally Add Extra Event Data
 1.  Additional events can be described in `picked/events.json`, with images for them put
@@ -81,7 +83,4 @@ Some of the scripts use third-party packages:
 1.  Can run `gen_picked_data.py` to add those described events to the database.
 
 ## Remove Events Without Images/Descs
-1.  Run `reduce_event_data.py` to remove data for events that have no image/description.
-
-## Generate Distribution and Displayability Data
-1.  Run `gen_disp_data.py`, which add the `dist` and `event_disp` tables.
+1.  Run `reduce_event_data.py` to remove data for events that have no image.
diff --git a/backend/hist_data/cal.py b/backend/hist_data/cal.py
index 3b65205..550303e 100644
--- a/backend/hist_data/cal.py
+++ b/backend/hist_data/cal.py
@@ -1,14 +1,14 @@
 """
 Provides date conversion functions, HistDate, and date scales.
-Algorithms for converting between calendars and Julian day number values were obtained from
-https://en.wikipedia.org/wiki/Julian_day#Converting_Gregorian_calendar_date_to_Julian_Day_Number.
 """
 
+# For conversion between calendars and Julian day numbers.  Algorithms were obtained from
+# https://en.wikipedia.org/wiki/Julian_day#Converting_Gregorian_calendar_date_to_Julian_Day_Number.
 def gregorianToJdn(year: int, month: int, day: int) -> int:
 	"""
 	Converts a Gregorian calendar date to a Julian day number,
 	denoting the noon-to-noon 'Julian day' that starts within the input day.
-	A year of 1 means 1 CE, and -1 means 1 BC (0 is treated like -1).
+	A year of 1 means 1 AD, and -1 means 1 BC (0 is treated like -1).
 	A month of 1 means January. Can use a month of 13 and a day of 0.
 	Valid for dates from 24th Nov 4714 BC onwards.
 	"""
@@ -20,7 +20,6 @@ def gregorianToJdn(year: int, month: int, day: int) -> int:
 	jdn -= int((3 * int((year + 4900 + x) / 100)) / 4)
 	jdn += day - 32075
 	return jdn
-
 def julianToJdn(year: int, month: int, day: int) -> int:
 	"""
 	Like gregorianToJdn(), but converts a Julian calendar date.
@@ -33,7 +32,6 @@ def julianToJdn(year: int, month: int, day: int) -> int:
 	jdn += int(275 * month / 9)
 	jdn += day + 1729777
 	return jdn
-
 def jdnToGregorian(jdn: int) -> tuple[int, int, int]:
 	"""
 	Converts a Julian day number to a Gregorian calendar date, denoting the
@@ -50,7 +48,6 @@ def jdnToGregorian(jdn: int) -> tuple[int, int, int]:
 	if Y <= 0:
 		Y -= 1
 	return Y, M, D
-
 def jdnToJulian(jdn: int) -> tuple[int, int, int]:
 	""" Like jdnToGregorian(), but converts to a Julian calendar date """
 	f = jdn + 1401
@@ -63,26 +60,25 @@ def jdnToJulian(jdn: int) -> tuple[int, int, int]:
 	if Y <= 0:
 		Y -= 1
 	return Y, M, D
-
 def julianToGregorian(year: int, month: int, day: int) -> tuple[int, int, int]:
 	return jdnToGregorian(julianToJdn(year, month, day))
-
 def gregorianToJulian(year: int, month: int, day: int) -> tuple[int, int, int]:
 	return jdnToJulian(gregorianToJdn(year, month, day))
 
-MIN_CAL_YEAR = -4713 # Disallow within-year dates before this year
+# For date representation
+MIN_CAL_YEAR = -4713 # Year before which JDNs are not usable
 MONTH_SCALE = -1;
 DAY_SCALE = -2;
-SCALES: list[int] = [int(x) for x in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]];
+SCALES: list[int] = [int(s) for s in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]];
 class HistDate:
 	"""
 	Represents a historical date
-	- 'year' may be negative (-1 means 1 BCE)
+	- 'year' may be negative (-1 means 1 BC)
 	- 'month' and 'day' are at least 1, if given
 	- 'gcal' may be:
 		- True: Indicates a Gregorian calendar date
 		- False: Means the date should, for display, be converted to a Julian calendar date
-		- None: 'month' and 'day' are 1 (used for dates before the Julian period starting year 4713 BCE)
+		- None: 'month' and 'day' are 1 (required for dates before MIN_CAL_YEAR)
 	"""
 	def __init__(self, gcal: bool | None, year: int, month=1, day=1):
 		self.gcal = gcal
@@ -96,22 +92,24 @@ class HistDate:
 	def __repr__(self):
 		return str(self.__dict__)
 def dbDateToHistDate(n: int, fmt: int, end=False) -> HistDate:
+	""" Converts a start/start_upper/etc and fmt value in the 'events' db table, into a HistDate """
 	if fmt == 0: # year
 		if n >= MIN_CAL_YEAR:
 			return HistDate(True, n, 1, 1)
 		else:
 			return HistDate(None, n)
-	elif fmt == 1 or fmt == 3 and not end: # jdn for julian calendar
-		return HistDate(False, *jdnToJulian(n))
-	else: # fmt == 2 or fmt == 3 and end
+	elif fmt == 1 or fmt == 3 and end: # jdn for gregorian calendar
 		return HistDate(True, *jdnToGregorian(n))
+	else: # fmt == 2 or fmt == 3 and not end
+		return HistDate(False, *jdnToJulian(n))
 def dateToUnit(date: HistDate, scale: int) -> int:
+	""" Converts a date to an int representing a unit on a scale """
 	if scale >= 1:
 		return date.year // scale
 	elif scale == MONTH_SCALE:
 		if date.gcal == False:
 			return julianToJdn(date.year, date.month, 1)
-		else:
+		else: # True or None
 			return gregorianToJdn(date.year, date.month, 1)
 	else: # scale == DAY_SCALE
 		if date.gcal == False:
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index 29fc2ff..76d33e5 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -33,12 +33,12 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
 
 # Image Files
 -   `gen_img_data.py` <br>
-    Used to find infobox image names for page IDs, and store them into a database.
+    Finds infobox image names for page IDs, and stores them into a database.
 -   `download_img_license_info.py` <br>
-    Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database.
+    Downloads licensing metadata for image names, via wikipedia's online API, and stores them into a database.
 -   `img_data.db` <br>
-    Used to hold metadata about infobox images for a set of page IDs.
-    Generated using `get_enwiki_img_data.py` and `download_img_license_info.py`. <br>
+    Holds metadata about infobox images for a set of page IDs.
+    Generated using `gen_img_data.py` and `download_img_license_info.py`. <br>
     Tables: <br>
     -   `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br>
         `img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs.
@@ -47,7 +47,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
             <br>
         Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
 -   `download_imgs.py` <br>
-    Used to download image files into imgs/.
+    Downloads image files into imgs/.
 
 # Description Files
 -   `gen_desc_data.py` <br>
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 1217caf..43f2c43 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks
 at already-processed names to decide what to skip.
 """
 
-import re
+import argparse
+import re, time, signal
 import sqlite3, urllib.parse, html
 import requests
-import time, signal
 
 IMG_DB = 'img_data.db'
 #
@@ -150,7 +150,6 @@ def downloadInfo(imgDb: str) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index bbd2cda..7dd0771 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks
 in the output directory do decide what to skip.
 """
 
-import re, os
+import argparse
+import re, os, time, signal
 import sqlite3
 import urllib.parse, requests
-import time, signal
 
 IMG_DB = 'img_data.db' # About 130k image names
 OUT_DIR = 'imgs'
@@ -22,7 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
 TIMEOUT = 1
 	# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
 	# It's unclear how to properly check for cache misses, so we just aim for 1 per sec
-BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit)
+EXP_BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit)
 
 def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 	if not os.path.exists(outDir):
@@ -84,7 +84,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 			time.sleep(timeout)
 		except Exception as e:
 			print(f'Error while downloading to {outFile}: {e}')
-			if not BACKOFF:
+			if not EXP_BACKOFF:
 				return
 			else:
 				timeout *= 2
@@ -94,7 +94,6 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index b3fde52..bb2b845 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -7,14 +7,14 @@ and adds them to a database
 
 # In testing, this script took over 10 hours to run, and generated about 5GB
 
+import argparse
 import sys, os, re
-import bz2
-import html, mwxml, mwparserfromhell
+import bz2, html, mwxml, mwparserfromhell
 import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
 DB_FILE = 'desc_data.db'
-
+# Regexps
 DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
 EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
 	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
@@ -119,7 +119,6 @@ def convertTitle(title: str) -> str:
 	return html.unescape(title).replace('_', ' ')
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 5778680..6be8bc5 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -1,11 +1,12 @@
 #!/usr/bin/python3
 
 """
-Adds data from the wiki dump index-file into a database
+Adds data from the wiki-dump index-file into a database
 """
+
+import argparse
 import sys, os, re
-import bz2
-import sqlite3
+import bz2, sqlite3
 
 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
 DB_FILE = 'dump_index.db'
@@ -53,7 +54,6 @@ def genData(indexFile: str, dbFile: str) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index b4ade9f..9aa3863 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -8,17 +8,15 @@ The program can be re-run with an updated set of page IDs, and
 will skip already-processed page IDs.
 """
 
-import re
-import os, bz2, html, urllib.parse
+import os, re
+import bz2, html, urllib.parse
 import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
 INDEX_DB = 'dump_index.db'
-PAGEVIEW_DB = 'pageview_data.db'
 IMG_DB = 'img_data.db' # The database to create
 DB_FILE = os.path.join('..', 'data.db')
-MAX_IMGS_PER_CTG = 20000
-#
+# Regexps
 ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
 IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
 BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
@@ -35,7 +33,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
 	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
 		# Create tables if not present
 		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)')
-			# 'img_name' may be NULL
+			# 'img_name' values are set to NULL to indicate page IDs where no image was found
 		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
 	else:
 		# Check for already-processed page IDs
@@ -179,49 +177,28 @@ def getImageName(content: list[str]) -> str | None:
 			return None
 	return None
 
-def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]:
+def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
 	print('Getting event data')
-	titleToCtg: dict[str, str] = {}
+	titles: set[str] = set()
 	dbCon = sqlite3.connect(dbFile)
-	for title, ctg in dbCon.execute('SELECT title, ctg from events'):
-		titleToCtg[title] = ctg
+	for (title,) in dbCon.execute('SELECT title from events'):
+		titles.add(title)
 	dbCon.close()
-	print('Getting top images for each event category')
-	ctgToTitles: dict[str, list[str]] = {}
-	dbCon = sqlite3.connect(pageviewDb)
-	for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'):
-		if title not in titleToCtg:
-			continue
-		ctg = titleToCtg[title]
-		if ctg not in ctgToTitles:
-			ctgToTitles[ctg] = []
-		elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
-			continue
-		ctgToTitles[ctg].append(title)
-		del titleToCtg[title]
-	dbCon.close()
-	for title, ctg in titleToCtg.items(): # Account for titles without view counts
-		if ctg not in ctgToTitles:
-			ctgToTitles[ctg] = []
-		elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
-			continue
-		ctgToTitles[ctg].append(title)
 	print('Getting page IDs')
 	pageIds: set[int] = set()
 	dbCon = sqlite3.connect(indexDb)
 	dbCur = dbCon.cursor()
-	for ctg in ctgToTitles:
-		for title in ctgToTitles[ctg]:
-			row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
-			if row:
-				pageIds.add(row[0])
+	for title in titles:
+		row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+		if row:
+			pageIds.add(row[0])
 	dbCon.close()
-	print(f'Result: {len(pageIds)} out of {len(titleToCtg)}')
+	print(f'Result: {len(pageIds)} out of {len(titles)}')
 	return pageIds
 if __name__ == '__main__':
 	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
-	pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG)
+	pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB)
 	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 90ec925..935b303 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -36,9 +36,13 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
 				if not line.startswith(linePrefix):
 					continue
 				# Get second and second-last fields
-				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
-				title = line[:line.find(b' ')].decode('utf-8')
-				viewCount = int(line[line.rfind(b' ')+1:])
+				linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
+				title = linePart[:linePart.find(b' ')].decode('utf-8')
+				try:
+					viewCount = int(linePart[linePart.rfind(b' ')+1:])
+				except ValueError:
+					print(f'Unable to read count in line {lineNum}: {line}')
+					continue
 				if namespaceRegex.match(title) is not None:
 					continue
 				# Update map
diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py
index 68f9e56..6c9fee2 100755
--- a/backend/hist_data/gen_desc_data.py
+++ b/backend/hist_data/gen_desc_data.py
@@ -1,10 +1,10 @@
 #!/usr/bin/python3
 
 """
-Maps events to short descriptions from Wikipedia,
-and stores them in the database.
+Maps events to short descriptions from Wikipedia, and stores them in the database.
 """
 
+import argparse
 import os, sqlite3
 
 ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -16,10 +16,9 @@ def genData(enwikiDb: str, dbFile: str) -> None:
 	dbCur = dbCon.cursor()
 	dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)')
 	#
-	print('Getting events with images')
+	print('Getting events')
 	titleToId: dict[str, int] = {}
-	query = 'SELECT events.id, events.title FROM events INNER JOIN event_imgs ON events.id = event_imgs.id'
-	for eventId, title in dbCur.execute(query):
+	for eventId, title in dbCur.execute('SELECT id, title FROM events'):
 		titleToId[title] = eventId
 	#
 	print('Getting Wikipedia descriptions')
@@ -53,7 +52,6 @@ def genData(enwikiDb: str, dbFile: str) -> None:
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
 	#
diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py
index a81263f..d796d92 100644..100755
--- a/backend/hist_data/gen_disp_data.py
+++ b/backend/hist_data/gen_disp_data.py
@@ -1,15 +1,18 @@
 #!/usr/bin/python3
 
 """
-Adds data about event distribution and displayability to the database.
+Adds data about event distribution to the database,
+and removes events not eligible for display
 """
 
-# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+# Code used in unit testing (for resolving imports of modules within this directory)
 import os, sys
 parentDir = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(parentDir)
-
+# Standard imports
+import argparse
 import sqlite3
+# Local imports
 from cal import SCALES, dbDateToHistDate, dateToUnit
 
 MAX_DISPLAYED_PER_UNIT = 4
@@ -23,12 +26,12 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None:
 	scaleUnitToCounts: dict[tuple[int, int], list[int]] = {}
 		# Maps scale and unit to two counts (num events in that unit, num events displayable for that unit)
 		# Only includes events with popularity values
-	idScales: set[tuple[int, int]] = set() # Maps event ids to scales they are displayable on
+	idScales: dict[int, list[int]] = {} # Maps event ids to scales they are displayable on
 	iterNum = 0
 	query = 'SELECT events.id, start, fmt FROM events INNER JOIN pop ON events.id = pop.id ORDER BY pop.pop DESC'
 	for eventId, eventStart, fmt in dbCur.execute(query):
 		iterNum += 1
-		if iterNum % 1e3 == 0:
+		if iterNum % 1e5 == 0:
 			print(f'At iteration {iterNum}')
 		# For each scale
 		for scale in scales:
@@ -42,23 +45,55 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None:
 				counts = [1, 0]
 			if counts[1] < maxDisplayedPerUnit:
 				counts[1] += 1
-				idScales.add((eventId, scale))
+				if eventId not in idScales:
+					idScales[eventId] = []
+				idScales[eventId].append(scale)
 			scaleUnitToCounts[(scale, unit)] = counts
+	print(f'Results: {len(idScales)} displayable events')
+	#
+	print('Looking for non-displayable events')
+	eventsToDel: list[int] = []
+	for eventId, eventStart, fmt in dbCur.execute(query):
+		if eventId in idScales:
+			continue
+		eventsToDel.append(eventId)
+		# Remove from data to be added to 'dist'
+		for scale in scales:
+			unit = dateToUnit(dbDateToHistDate(eventStart, fmt), scale)
+			count = scaleUnitToCounts[(scale, unit)][0] - 1
+			if count == 0:
+				del scaleUnitToCounts[(scale, unit)]
+			else:
+				scaleUnitToCounts[(scale, unit)][0] = count
+	for (eventId,) in dbCur.execute( # Find events without scores
+		'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL'):
+		eventsToDel.append(eventId)
+	print(f'Found {len(eventsToDel)}')
+	#
+	print(f'Deleting {len(eventsToDel)} events')
+	iterNum = 0
+	for eventId in eventsToDel:
+		iterNum += 1
+		if iterNum % 1e5 == 0:
+			print(f'At iteration {iterNum}')
+		#
+		dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
+		dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
 	#
 	print('Writing to db')
 	dbCur.execute('CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))')
 	for (scale, unit), (count, _) in scaleUnitToCounts.items():
 		dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, count))
 	dbCur.execute('CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))')
-	for eventId, scale in idScales:
-		dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale))
+	for eventId, scales in idScales.items():
+		for scale in scales:
+			dbCur.execute('INSERT INTO event_disp VALUES (?, ?)', (eventId, scale))
 	#
 	print('Closing db')
 	dbCon.commit()
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
 	#
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py
index f054f76..deaf794 100755
--- a/backend/hist_data/gen_events_data.py
+++ b/backend/hist_data/gen_events_data.py
@@ -2,8 +2,8 @@
 
 """
 Reads a Wikidata JSON dump, looking for entities usable as historical events.  For each such
-entity, finds a start date (may be a range), optional end date, and event category (eg: normal
-event, person with birth/death date, country, etc).  Writes the results into a database.
+entity, finds a start date (may be a range), optional end date, and event category (eg: discovery,
+person with birth/death date, etc).  Writes the results into a database.
 
 The JSON dump contains an array of objects, each of which describes a Wikidata item item1,
 and takes up it's own line.
@@ -12,11 +12,11 @@ and takes up it's own line.
 - Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue']
 	'idx1' indexes an array of statements
 
-Value objects have a 'type' and 'value' field.
+'datavalue' objects have a 'type' and 'value' field.
 Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates
 	An example:
 		{"value":{
-			"time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits (-0001 means 1 BCE)
+			"time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits (-0001 means 1 BC)
 			"timezone":0, # Unused
 			"before":0,   # Unused
 			"after":0,    # Unused
@@ -52,30 +52,33 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or
 		"http://www.wikidata.org/entity/Q524410"   - gigaannum (1e9 yrs)
 """
 
-# On Linux, running on the full dataset seems to make the processes hang when done. This was resolved by:
-# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock.
-# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state.
+# On Linux, running on the full dataset seems to make the processes hang when done.  This was resolved by:
+# - Storing subprocess results in temp files.  Apparently passing large objects through pipes can cause deadlock.
+# - Using set_start_method('spawn').  Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state.
 #   Related: https://bugs.python.org/issue6721
 # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
 #   Possibly related: https://github.com/python/cpython/issues/72882
 
-# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+# Took about 4.5 hours to run
+
+# Code used in unit testing (for resolving imports of modules within this directory)
 import os, sys
 parentDir = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(parentDir)
+# Standard imports
+import argparse
+import math, re
+import io, bz2, json, sqlite3
+import indexed_bzip2, pickle, multiprocessing, tempfile
+# Local imports
+from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR
 
-import io, math, re, argparse
-import bz2, json, sqlite3
-import multiprocessing, indexed_bzip2, pickle, tempfile
-# Modules in this directory
-from cal import gregorianToJdn, julianToJdn
-
+# Constants
 WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2')
 DUMP_YEAR = 2022 # Used for converting 'age' values into dates
 OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat')
 DB_FILE = 'data.db'
-N_PROCS = 6
-
+N_PROCS = 6 # Number of processes to use
 # For getting Wikidata entity IDs
 INSTANCE_OF = 'P31'
 EVENT_CTG: dict[str, dict[str, str]] = {
@@ -91,31 +94,33 @@ EVENT_CTG: dict[str, dict[str, str]] = {
 		'recurring event': 'Q15275719',
 		'event sequence': 'Q15900616',
 		'incident': 'Q18669875',
+		'project': 'Q170584',
+		'number of deaths': 'P1120',
 	},
-	'human': {
-		'human': 'Q5',
-	},
-	'country': {
+	'place': {
 		'country': 'Q6256',
 		'state': 'Q7275',
 		'sovereign state': 'Q3624078',
+		'city': 'Q515',
+		'tourist attraction': 'Q570116',
+		'heritage site': 'Q358',
+		'terrestrial planet': 'Q128207',
+		'navigational star': 'Q108171565',
+		'G-type main-sequence star': 'Q5864',
+	},
+	'organism': {
+		'taxon': 'Q16521',
+	},
+	'person': {
+		'human': 'Q5',
+	},
+	'work': {
+		'creator': 'P170',
+		'genre': 'P136',
 	},
 	'discovery': {
 		'time of discovery or invention': 'P575',
 	},
-	'media': {
-		'work of art': 'Q4502142',
-		'literary work': 'Q7725634',
-		'comic book series': 'Q14406742',
-		'painting': 'Q3305213',
-		'musical work/composition': 'Q105543609',
-		'film': 'Q11424',
-		'animated film': 'Q202866',
-		'television series': 'Q16401',
-		'anime television series': 'Q63952888',
-		'video game': 'Q7889',
-		'video game series': 'Q7058673',
-	},
 }
 ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()}
 EVENT_PROP: dict[str, str] = {
@@ -148,14 +153,14 @@ PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [
 	('time of discovery or invention',),
 	('publication date',),
 ]
-UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' values (found in type=quantity value objects) to numbers of years
+UNIT_TO_SCALE: dict[str, int] = {
+	# Maps 'unit' values (found in 'datavalue' objects with type=quantity) to numbers of years
 	'http://www.wikidata.org/entity/Q577':          1, # 'year'
 	'http://www.wikidata.org/entity/Q24564698':     1, # 'years old'
 	'http://www.wikidata.org/entity/Q3013059':  10**3, # 'kiloannum' (1e3 yrs)
 	'http://www.wikidata.org/entity/Q20764':    10**6, # 'megaannum' (1e6 yrs)
 	'http://www.wikidata.org/entity/Q524410':   10**9, # 'gigaannum' (1e9 yrs)
 }
-
 # For filtering lines before parsing JSON
 TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode()
 PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode()
@@ -183,12 +188,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 						# The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects)
 	else:
 		if not os.path.exists(offsetsFile):
-			print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours)
+			print('Creating offsets file') # For indexed access used in multiprocessing (may take about 7 hours)
 			with indexed_bzip2.open(wikidataFile) as file:
 				with open(offsetsFile, 'wb') as file2:
 					pickle.dump(file.block_offsets(), file2)
 		print('Allocating file into chunks')
-		fileSz: int # About 1.4 TB
+		fileSz: int # Was about 1.4 TB
 		with indexed_bzip2.open(wikidataFile) as file:
 			with open(offsetsFile, 'rb') as file2:
 				file.set_block_offsets(pickle.load(file2))
@@ -206,15 +211,15 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
 						chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]):
 					# Add entries from subprocess output file
 					with open(outFile, 'rb') as file:
-						for entry in pickle.load(file):
-							dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry)
+						for item in pickle.load(file):
+							dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item)
 	dbCon.commit()
 	dbCon.close()
 
 # For data extraction
 def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None:
 	""" Parses a Wikidata dump line, returning an entry to add to the db """
-	# Check with regex
+	# Check with regexes
 	if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None:
 		return None
 	# Decode
@@ -283,7 +288,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
 	#
 	return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg)
 def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None:
-	""" Obtains event start+end data from value objects with type 'time', according to 'timeType' """
+	""" Obtains event start+end data from 'datavalue' objects with type 'time', according to 'timeType' """
 	# Values to return
 	start: int
 	startUpper: int | None = None
@@ -317,7 +322,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
 		else:
 			start = DUMP_YEAR - upperBound * scale
 			startUpper = DUMP_YEAR - lowerBound * scale
-		# Account for non-existence of 0 CE
+		# Account for non-existence of 0 AD
 		if start <= 0:
 			start -= 1
 		if startUpper is not None and startUpper <= 0:
@@ -342,7 +347,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
 			return None
 		end, _, timeFmt2 = endTimeVals
 		if timeFmt != timeFmt2:
-			if timeFmt == 1 and timeFmt2 == 2:
+			if timeFmt == 2 and timeFmt2 == 1:
 				timeFmt = 3
 			else:
 				return None
@@ -359,13 +364,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
 				return None
 			end, endUpper, timeFmt2 = endTimeVals
 			if timeFmt != timeFmt2:
-				if timeFmt == 1 and timeFmt2 == 2:
+				if timeFmt == 2 and timeFmt2 == 1:
 					timeFmt = 3
 				else:
 					return None
 	return start, startUpper, end, endUpper, timeFmt
 def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
-	""" Obtains event start (or end) data from a value object with type 'time' """
+	""" Obtains event start (or end) data from a 'datavalue' object with type 'time' """
 	if 'type' not in dataVal or dataVal['type'] != 'time':
 		return None
 	# Get time data
@@ -385,20 +390,20 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
 	startUpper: int | None = None
 	timeFmt: int
 	if precision in [10, 11]: # 'month' or 'day' precision
-		if year < -4713: # If before 4713 BCE (start of valid julian date period)
-			print(f'WARNING: Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}')
+		if year < MIN_CAL_YEAR: # If before start of valid julian date period
+			print(f'WARNING: Skipping sub-year-precision date before {-MIN_CAL_YEAR} BC: {json.dumps(dataVal)}')
 			return None
 		day = max(day, 1) # With month-precision, entry may have a 'day' of 0
 		if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar'
 			start = gregorianToJdn(year, month, day)
 			if precision == 10:
 				startUpper = gregorianToJdn(year, month+1, 0)
-			timeFmt = 2
+			timeFmt = 1
 		else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar')
 			start = julianToJdn(year, month, day)
 			if precision == 10:
 				startUpper = julianToJdn(year, month+1, 0)
-			timeFmt = 1
+			timeFmt = 2
 	elif 0 <= precision < 10: # 'year' to 'gigaannum' precision
 		scale: int = 10 ** (9 - precision)
 		start = year // scale * scale
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 817de03..6d57180 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -10,18 +10,16 @@ processing. It uses already-existing database entries to decide what
 to skip.
 """
 
-import os, math, subprocess
+import argparse
+import os, subprocess, signal
 import sqlite3, urllib.parse
-import signal
-from PIL import Image
 
 IMG_DIR = os.path.join('enwiki', 'imgs')
 IMG_DB = os.path.join('enwiki', 'img_data.db')
 OUT_DIR = 'img'
 DB_FILE = 'data.db'
 #
-MAX_MINOR_DIM = 200
-MAX_DIM_RATIO = 3/2
+IMG_OUT_SZ = 200
 
 def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
 	""" Converts images and updates db, checking for entries to skip """
@@ -110,32 +108,9 @@ def convertImage(imgPath: str, outPath: str):
 	if os.path.exists(outPath):
 		print('ERROR: Output image already exists')
 		return False
-	# Get image dims
-	width: int
-	height: int
-	try:
-		with Image.open(imgPath) as image:
-			width, height = image.size
-	except Exception as e: # Being more specific runs the risk of ending the program without committing to db
-		print(f'ERROR: Unable to open {imgPath}: {e}')
-		return False
-	# Limit output dims
-	if width > height:
-		if height > MAX_MINOR_DIM:
-			width = math.ceil(width * height / MAX_MINOR_DIM)
-			height = MAX_MINOR_DIM
-		if width / height > MAX_DIM_RATIO:
-			width = math.ceil(height * MAX_DIM_RATIO)
-	else:
-		if width > MAX_MINOR_DIM:
-			height = math.ceil(height * width / MAX_MINOR_DIM)
-			width = MAX_MINOR_DIM
-		if height / width > MAX_DIM_RATIO:
-			height = math.ceil(width * MAX_DIM_RATIO)
-	# Convert image
 	try:
 		completedProcess = subprocess.run(
-			['npx', 'smartcrop-cli', '--width', str(width), '--height', str(height), imgPath, outPath],
+			['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
 			stdout=subprocess.DEVNULL
 		)
 	except Exception as e:
@@ -147,7 +122,6 @@ def convertImage(imgPath: str, outPath: str):
 	return True
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	parser.parse_args()
 	#
diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py
index 7d6071a..933af24 100755
--- a/backend/hist_data/gen_picked_data.py
+++ b/backend/hist_data/gen_picked_data.py
@@ -4,12 +4,14 @@
 Adds additional manually-picked events to the database
 """
 
-# Enable unit testing code to, when running this script, resolve imports of modules within this directory
+# Code used in unit testing (for resolving imports of modules within this directory)
 import os, sys
 parentDir = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(parentDir)
-
+# Standard imports
+import argparse
 import json, sqlite3
+# Local imports
 from gen_imgs import convertImage
 
 PICKED_DIR = 'picked'
@@ -55,7 +57,6 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str) ->
 	dbCon.close()
 
 if __name__ == '__main__':
-	import argparse
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
 	#
diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py
index 8eaa142..aaaf69d 100755
--- a/backend/hist_data/gen_pop_data.py
+++ b/backend/hist_data/gen_pop_data.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 
 """
-Adds Wikipedia page view info to the database as popularity values.
+Adds Wikipedia page view info to the database as popularity values
 """
 
 import os, sqlite3
diff --git a/backend/hist_data/picked/README.md b/backend/hist_data/picked/README.md
index becbd24..395fd9d 100644
--- a/backend/hist_data/picked/README.md
+++ b/backend/hist_data/picked/README.md
@@ -1,4 +1,4 @@
-This directory holds data for additional events
+This directory holds data for additional manually-picked events.
 
 Files
 =====
diff --git a/backend/hist_data/reduce_event_data.py b/backend/hist_data/reduce_event_data.py
index 15c2ab5..5801f4d 100755
--- a/backend/hist_data/reduce_event_data.py
+++ b/backend/hist_data/reduce_event_data.py
@@ -1,23 +1,44 @@
 #!/usr/bin/python3
 
 """
-Delete extraneous events from the database that have no image (and consequently no description)
+Delete events from the database that have no image
 """
 
+# Code used in unit testing (for resolving imports of modules within this directory)
+import os, sys
+parentDir = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(parentDir)
+# Standard imports
 import argparse
 import sqlite3
+# Local imports
+from cal import SCALES, dbDateToHistDate, dateToUnit
 
 DB_FILE = 'data.db'
 
-def reduceData(dbFile: str) -> None:
+def reduceData(dbFile: str, scales: list[int]) -> None:
 	dbCon = sqlite3.connect(dbFile)
 	dbCur = dbCon.cursor()
 	#
 	print('Getting events to delete')
-	eventsToDel = set()
-	query = 'SELECT events.id FROM events LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL'
-	for (eventId,) in dbCur.execute(query):
-		eventsToDel.add(eventId)
+	eventsToDel: list[int] = []
+	scaleUnitToDelCount: dict[tuple[int, int], int] = {} # Stores counts to subtract from entries in 'dist'
+	query = 'SELECT events.id, events.start, events.fmt FROM events' \
+		' LEFT JOIN event_imgs ON events.id = event_imgs.id WHERE event_imgs.id IS NULL'
+	# Number rows from 1 so that the progress message appears once per 1e5 rows (a never-incremented counter printed it for every row)
+	for iterNum, (eventId, start, fmt) in enumerate(dbCur.execute(query), 1):
+		if iterNum % 1e5 == 0:
+			print(f'At iteration {iterNum}')
+		#
+		eventsToDel.append(eventId)
+		date = dbDateToHistDate(start, fmt)
+		for scale in scales:
+			unit = dateToUnit(date, scale)
+			if (scale, unit) not in scaleUnitToDelCount:
+				scaleUnitToDelCount[(scale, unit)] = 1
+			else:
+				scaleUnitToDelCount[(scale, unit)] += 1
+	print(f'Found {len(eventsToDel)}')
 	#
 	print('Deleting events')
 	iterNum = 0
@@ -26,8 +47,12 @@ def reduceData(dbFile: str) -> None:
 		if iterNum % 1e5 == 0:
 			print(f'At iteration {iterNum}')
 		#
-		dbCur.execute('DELETE from events where id = ?', (eventId,))
-		dbCur.execute('DELETE from pop where id = ?', (eventId,))
+		dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
+		dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
+		dbCur.execute('DELETE FROM event_disp WHERE id = ?', (eventId,))
+	for (scale, unit), delCount in scaleUnitToDelCount.items():
+		dbCur.execute('UPDATE dist SET count = count - ? WHERE scale = ? AND unit = ?', (delCount, scale, unit))
+	dbCur.execute('DELETE FROM dist WHERE count < 1')
 	#
 	dbCon.commit()
 	dbCon.close()
@@ -36,4 +61,4 @@ if __name__ == '__main__':
 	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
 	args = parser.parse_args()
 	#
-	reduceData(DB_FILE)
+	reduceData(DB_FILE, SCALES)