From 4edb7998012bcc804482a76277cd25b90fb373c9 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 1 Oct 2022 21:08:59 +1000 Subject: Update READMEs and .gitignore --- backend/hist_data/README.md | 44 +++ backend/hist_data/gen_events_data.py | 454 ++++++++++++++++++++++++ backend/hist_data/wikidata/README.md | 19 +- backend/hist_data/wikidata/gen_events_data.py | 456 ------------------------- backend/tests/test_gen_events_data.py | 171 ++++++++++ backend/tests/wikidata/test_gen_events_data.py | 171 ---------- 6 files changed, 670 insertions(+), 645 deletions(-) create mode 100644 backend/hist_data/README.md create mode 100755 backend/hist_data/gen_events_data.py delete mode 100755 backend/hist_data/wikidata/gen_events_data.py create mode 100644 backend/tests/test_gen_events_data.py delete mode 100644 backend/tests/wikidata/test_gen_events_data.py (limited to 'backend') diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md new file mode 100644 index 0000000..c55549e --- /dev/null +++ b/backend/hist_data/README.md @@ -0,0 +1,44 @@ +This directory holds files used to generate the history database data.db. + +# Database Tables +- `events`
+ Format: + `id INT PRIMARY KEY, title TEXT UNIQUE, start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT` +
+ Each row has a Wikidata ID, Wikipedia title, start and end dates, and an event category. + - `start*` and `end*` specify start and end dates. + `start_upper`, `end`, and `end_upper` are optional. + If `start_upper` is present, it and `start` denote an uncertain range of start times. + Similarly for 'end' and 'end_upper'. + - `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`. + - If 1, they denote a Julian date (with 0.5 removed to align with midnight). + This allows simple comparison of events with day-level precision, but only goes back to 4713 BCE. + - If 2, same as 1, but dates are preferably displayed using the Gregorian calendar, not the Julian calendar. + For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not. + - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. + For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. + - If 0, they denote a number of years CE (if positive) or BCE (if negative). + +# Generating the Database + +## Generate Event Data +1. Obtain a Wikidata JSON dump in wikidata/, as specified in its README. +1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table. + +## Generate Description Data +1. Obtain an enwiki dump in enwiki/, as specified in the README. +1. In enwiki/, run `gen_dump_index.db.py`, which generates a database for indexing the dump. +1. In enwiki/, run `gen_desc_data.py`, which extracts page descriptions into a database. +1. Run + +## Generate Popularity Data +1. Obtain 'page view files' in enwiki/, as specified in its README. +1. Run + +## Generate Image Data and Popularity Data +1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`, + looks for infobox image names, and stores them in an image database. +1. 
In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found + images, and adds them to the image database. +1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. +1. Run diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py new file mode 100755 index 0000000..1f990d0 --- /dev/null +++ b/backend/hist_data/gen_events_data.py @@ -0,0 +1,454 @@ +#!/usr/bin/python3 + +""" +Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such +entity, finds a start date (may be a range), optional end date, and event category (eg: normal +event, person with birth/death date, country, etc). Writes the results into a database. + +The JSON dump contains an array of objects, each of which describes a Wikidata item item1, +and takes up it's own line. +- Getting item1's Wikidata ID: item1['id'] (eg: "Q144") +- Checking for a property: item1['claims'][prop1] == array1 +- Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue'] + 'idx1' indexes an array of statements + +Value objects have a 'type' and 'value' field. +Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates + An example: + {"value":{ + "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits + "timezone":0, # Unused + "before":0, # Unused + "after":0, # Unused + "precision":11, + "calendarmodel":"http://www.wikidata.org/entity/Q1985727" + }, "type":"time"} + 'precision' can be one of: + 0 - billion years (timestamp eg: -5000000000-00-00T00:00:00Z) + 1 - hundred million years + ... 
+ 6 - millenium (warning: represents ranges from *1 to *0, eg: 1001-2000) + 7 - century (warning: represents ranges from *1 to *0, eg: 1801-1900) + 8 - decade (represents ranges from *0 to *9, eg: 2010-2019) + 9 - year + 10 - month + 11 - day + 'calendarmodel' can be one of: + "http://www.wikidata.org/entity/Q1985727" - proleptic Gregorian calendar + "http://www.wikidata.org/entity/Q1985786" - proleptic Julian calendar +Info about objects with type 'quantity' can be found at: https://www.wikidata.org/wiki/Help:Data_type#Quantity + An example: + {"value":{ + "amount":"+10.9", + "unit":"http://www.wikidata.org/entity/Q20764", + "lowerBound":"+170.1", # May be absent + "upperBound":"+470", # May be absent + }, "type":"quantity"} + 'unit' can be one of: + "http://www.wikidata.org/entity/Q577" - year + "http://www.wikidata.org/entity/Q24564698" - years old + "http://www.wikidata.org/entity/Q3013059" - kiloannum (1e3 yrs) + "http://www.wikidata.org/entity/Q20764" - megaannum (1e6 yrs) + "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs) +""" + +# On Linux, running on the full dataset seems to make the processes hang when done. This was resolved by: +# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. +# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. +# Related: https://bugs.python.org/issue6721 +# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). 
+# Possibly related: https://github.com/python/cpython/issues/72882 + +import os, io, math, re, argparse +import bz2, json, sqlite3 +import multiprocessing, indexed_bzip2, pickle, tempfile +from jdcal import gcal2jd, jcal2jd + +WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2') +DUMP_YEAR = 2022 # Used for converting 'age' values into dates +OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat') +DB_FILE = 'data.db' +N_PROCS = 6 + +# For getting Wikidata entity IDs +INSTANCE_OF = 'P31' +EVENT_CTG: dict[str, dict[str, str]] = { + # Maps event-categories to dicts that map event-indicative entity names to their IDs + # If the ID starts with 'Q', it expects entities to be an 'instance of' that ID + # If the ID starts with 'P', it expects entities to have a property with that ID + 'event': { + 'occurrence': 'Q1190554', + 'time interval': 'Q186081', + 'historical period': 'Q11514315', + 'era': 'Q6428674', + 'event': 'Q1656682', + 'recurring event': 'Q15275719', + 'event sequence': 'Q15900616', + 'incident': 'Q18669875', + }, + 'human': { + 'human': 'Q5', + }, + 'country': { + 'country': 'Q6256', + 'state': 'Q7275', + 'sovereign state': 'Q3624078', + }, + 'discovery': { + 'time of discovery or invention': 'P575', + }, + 'media': { + 'work of art': 'Q4502142', + 'literary work': 'Q7725634', + 'comic book series': 'Q14406742', + 'painting': 'Q3305213', + 'musical work/composition': 'Q105543609', + 'film': 'Q11424', + 'animated film': 'Q202866', + 'television series': 'Q16401', + 'anime television series': 'Q63952888', + 'video game': 'Q7889', + 'video game series': 'Q7058673', + }, +} +ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()} +EVENT_PROP: dict[str, str] = { + # Maps event-start/end-indicative property names to their IDs + 'start time': 'P580', + 'end time': 'P582', + 'point in time': 'P585', + 'inception': 'P571', + 'age estimated by a dating method': 'P7584', + 'temporal range start': 'P523', + 'temporal range 
end': 'P524', + 'earliest date': 'P1319', + 'latest date': 'P1326', + 'date of birth': 'P569', + 'date of death': 'P570', + 'time of discovery or invention': 'P575', + 'publication date': 'P577', +} +PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [ + # Indicates how event start/end data should be obtained from EVENT_PROP props + # Each tuple starts with a start-time prop to check for, followed by an optional + # end-time prop, and an optional 'both props must be present' boolean indicator + ('start time', 'end time'), + ('point in time',), + ('inception',), + ('age estimated by a dating method',), + ('temporal range start', 'temporal range end'), + ('earliest date', 'latest date', True), + ('date of birth', 'date of death'), + ('time of discovery or invention',), + ('publication date',), +] +UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' values (found in type=quantity value objects) to numbers of years + 'http://www.wikidata.org/entity/Q577': 1, # 'year' + 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old' + 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs) + 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) + 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) +} + +# For filtering lines before parsing JSON +TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() +PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() + +def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: + """ Reads the dump and writes to db """ + # Check db + if os.path.exists(dbFile): + print('ERROR: Database already exists') + return + # Read dump, and write to db + print('Writing to db') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ + 'start INT, 
start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)') + if nProcs == 1: + with bz2.open(wikidataFile, mode='rb') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e4 == 0: + print(f'At line {lineNum}') + entry = readDumpLine(line) + if entry: + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + # The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects) + else: + if not os.path.exists(offsetsFile): + print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'wb') as file2: + pickle.dump(file.block_offsets(), file2) + print('Allocating file into chunks') + fileSz: int # About 1.4 TB + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'rb') as file2: + file.set_block_offsets(pickle.load(file2)) + fileSz = file.seek(0, io.SEEK_END) + chunkSz = fileSz // nProcs + chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1] + # Each adjacent pair specifies a start+end byte index for readDumpChunk() + print(f'- Chunk size: {chunkSz:,}') + print('Starting processes to read dump') + with tempfile.TemporaryDirectory() as tempDirName: + with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: + # Used maxtasksperchild=1 to free resources on task completion + for outFile in pool.map(readDumpChunkOneParam, + [(i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'), + chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]): + # Add entries from subprocess output file + with open(outFile, 'rb') as file: + for entry in pickle.load(file): + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + dbCon.commit() + dbCon.close() + +# For data extraction +def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: + """ Parses a 
Wikidata dump line, returning an entry to add to the db """ + # Check with regex + if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: + return None + # Decode + try: + line = lineBytes.decode('utf-8').rstrip().rstrip(',') + jsonItem = json.loads(line) + except json.JSONDecodeError: + print(f'Unable to parse line {line} as JSON') + return None + if 'claims' not in jsonItem: + return None + claims = jsonItem['claims'] + # Get wikidata ID, enwiki title + try: + itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' + itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] + except (KeyError, ValueError): + return None + # Get event category + eventCtg: str | None = None + if INSTANCE_OF in claims: # Check types + for statement in claims[INSTANCE_OF]: + try: + itemType = statement['mainsnak']['datavalue']['value']['id'] + except KeyError: + return None + if itemType in ID_TO_CTG: + eventCtg = ID_TO_CTG[itemType] + break + if not eventCtg: + for prop in claims: # Check props + if prop in ID_TO_CTG: + eventCtg = ID_TO_CTG[prop] + if not eventCtg: + return None + # Check for event-start/end props + startVal: str + endVal: str | None + timeType: str + found = False + for props in PROP_RULES: + startProp: str = EVENT_PROP[props[0]] + endProp = None if len(props) < 2 else EVENT_PROP[props[1]] + needBoth = False if len(props) < 3 else props[2] + if startProp not in claims: + continue + try: + startVal = claims[startProp][0]['mainsnak']['datavalue'] + endVal = None + if endProp and endProp in claims: + endVal = claims[endProp][0]['mainsnak']['datavalue'] + elif needBoth: + continue + except (KeyError, ValueError): + continue + timeType = props[0] + found = True + break + if not found: + return None + # Convert time values + timeData = getTimeData(startVal, endVal, timeType) + if timeData is None: + return None + start, startUpper, end, endUpper, timeFmt = timeData + # + return (itemId, itemTitle, start, startUpper, end, endUpper, 
timeFmt, eventCtg) +def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: + """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """ + # Values to return + start: int + startUpper: int | None = None + end: int | None = None + endUpper: int | None = None + timeFmt: int + # + if timeType == 'age estimated by a dating method': + if 'type' not in startVal or startVal['type'] != 'quantity': + return None + # Get quantity data + try: + value = startVal['value'] + amount = math.ceil(float(value['amount'])) + unit = value['unit'] + if 'lowerBound' in value and 'upperBound' in value: + lowerBound = math.ceil(float(value['lowerBound'])) + upperBound = math.ceil(float(value['upperBound'])) + else: + lowerBound = None + upperBound = None + except (KeyError, ValueError): + return None + # Get unit scale + if unit not in UNIT_TO_SCALE: + return None + scale = UNIT_TO_SCALE[unit] + # Get start+startUpper + if lowerBound is None: + start = DUMP_YEAR - amount * scale + else: + start = DUMP_YEAR - upperBound * scale + startUpper = DUMP_YEAR - lowerBound * scale + # Account for non-existence of 0 CE + if start <= 0: + start -= 1 + if startUpper is not None and startUpper <= 0: + startUpper -= 1 + # Adjust precision + start = start // scale * scale + if startUpper is not None: + startUpper = startUpper // scale * scale + elif scale > 1: + startUpper = start + scale - 1 + # + timeFmt = 0 + elif timeType == 'earliest date': + # Get start + startTimeVals = getEventTime(startVal) + if startTimeVals is None: + return None + start, _, timeFmt = startTimeVals + # Get end + endTimeVals = getEventTime(endVal) + if endTimeVals is None: + return None + end, _, timeFmt2 = endTimeVals + if timeFmt != timeFmt2: + if timeFmt == 1 and timeFmt2 == 2: + timeFmt = 3 + else: + return None + else: + # Get start+startUpper + startTimeVals = getEventTime(startVal) + if startTimeVals is None: + return None + 
start, startUpper, timeFmt = startTimeVals + # Get end+endUpper + if endVal is not None: + endTimeVals = getEventTime(endVal) + if endTimeVals is None: + return None + end, endUpper, timeFmt2 = endTimeVals + if timeFmt != timeFmt2: + if timeFmt == 1 and timeFmt2 == 2: + timeFmt = 3 + else: + return None + return start, startUpper, end, endUpper, timeFmt +def getEventTime(dataVal) -> tuple[int, int | None, int] | None: + """ Obtains event start (or end) data from a value object with type 'time' """ + if 'type' not in dataVal or dataVal['type'] != 'time': + return None + # Get time data + try: + value = dataVal['value'] + time = value['time'] + match = re.match(r'([+-]\d+)-(\d+)-(\d+)', time) + if match is None: + return None + year, month, day = (int(x) for x in match.groups()) + precision = value['precision'] + calendarmodel = value['calendarmodel'] + except (KeyError, ValueError): + return None + # Get start+startUpper + start: int + startUpper: int | None = None + timeFmt: int + if precision in [10, 11]: # 'month' or 'day' precision + if year < -4712: # If before 4713 BCE (start of valid julian date period) + print(f'WARNING: Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}') + return None + day = max(day, 1) # With month-precision, entry may have a 'day' of 0 + if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar' + start = jdPairToJd(gcal2jd(year, month, day)) + if precision == 10: + startUpper = jdPairToJd(gcal2jd(year, month+1, 0)) + timeFmt = 2 + else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar') + start = jdPairToJd(jcal2jd(year, month, day)) + if precision == 10: + startUpper = jdPairToJd(jcal2jd(year, month+1, 0)) + timeFmt = 1 + elif 0 <= precision < 10: # 'year' to 'gigaannum' precision + scale: int = 10 ** (9 - precision) + start = year // scale * scale + if scale > 1: + startUpper = start + scale - 1 + if precision in [6, 7]: # Account for century/millenia 
ranges being from *1 to *0 + start += 1 + if startUpper is not None: + startUpper += 1 + timeFmt = 0 + else: + return None + return start, startUpper, timeFmt +def jdPairToJd(jdPair: tuple[int, int]) -> int: + """ Converts a julian-date-representing value from jdcal into an int """ + return math.floor(sum(jdPair)) + +# For using multiple processes +def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str: + """ Forwards to readDumpChunk() (for use with pool.map()) """ + return readDumpChunk(*params) +def readDumpChunk( + procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: + """ Reads lines in the dump that begin after a start-byte, and not after an end byte. + If startByte is -1, start at the first line. """ + # Read dump + entries = [] + with indexed_bzip2.open(wikidataFile) as file: + # Load offsets file + with open(offsetsFile, 'rb') as file2: + offsets = pickle.load(file2) + file.set_block_offsets(offsets) + # Seek to chunk + if startByte != -1: + file.seek(startByte) + file.readline() + else: + startByte = 0 # Used for progress calculation + # Read lines + count = 0 + while file.tell() <= endByte: + count += 1 + if count % 1e4 == 0: + perc = (file.tell() - startByte) / (endByte - startByte) * 100 + print(f'Thread {procId}: {perc:.2f}%') + entry = readDumpLine(file.readline()) + if entry: + entries.append(entry) + # Output results into file + with open(outFile, 'wb') as file: + pickle.dump(entries, file) + return outFile + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + multiprocessing.set_start_method('spawn') + genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS) diff --git a/backend/hist_data/wikidata/README.md b/backend/hist_data/wikidata/README.md index 35dad34..d5b2c5e 100644 --- a/backend/hist_data/wikidata/README.md +++ b/backend/hist_data/wikidata/README.md 
@@ -6,23 +6,6 @@ This directory holds files obtained/derived from [Wikidata](https://www.wikidata Format info can be found at . # Other Files -- `gen_events_data.py`
- Used to generate a database holding event information from the dump. - `offsets.dat`
Holds bzip2 block offsets for the dump. Generated and used by - gen_events_data.py for parallel processing of the dump. -- `events.db`
- Generated by `gen_events_data.py`.
- Has one table `events`: - - Columns: `id INT, title TEXT, start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT` - - Contains a Wikidata ID, Wikipedia title, start and end dates, and an event category. - - If `start_upper` is not NULL, `start` and `start_upper` denote an uncertain range of start times. - And similarly for 'end' and 'end_upper'. - - `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`. - - If 1, they denote a Julian date (with 0.5 removed to align with midnight). - This allows simple comparison of events with day-level precision, but only goes back to 4713 BCE. - - If 2, same as 1, but dates are preferably displayed using the Gregorian calendar, not the Julian calendar. - For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not. - - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. - For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. - - If 0, they denote a number of years CE (if positive) or BCE (if negative). + `../gen_events_data.py` for parallel processing of the dump. diff --git a/backend/hist_data/wikidata/gen_events_data.py b/backend/hist_data/wikidata/gen_events_data.py deleted file mode 100755 index f4766f0..0000000 --- a/backend/hist_data/wikidata/gen_events_data.py +++ /dev/null @@ -1,456 +0,0 @@ -#!/usr/bin/python3 - -""" -Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such -entity, finds a start date (may be a range), optional end date, and event category (eg: normal -event, person with birth/death date, country, etc). Writes the results into a database. - -The JSON dump contains an array of objects, each of which describes a Wikidata item item1, -and takes up it's own line. 
-- Getting item1's Wikidata ID: item1['id'] (eg: "Q144") -- Checking for a property: item1['claims'][prop1] == array1 -- Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue'] - 'idx1' indexes an array of statements - -Value objects have a 'type' and 'value' field. -Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates - An example: - {"value":{ - "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits - "timezone":0, # Unused - "before":0, # Unused - "after":0, # Unused - "precision":11, - "calendarmodel":"http://www.wikidata.org/entity/Q1985727" - }, "type":"time"} - 'precision' can be one of: - 0 - billion years (timestamp eg: -5000000000-00-00T00:00:00Z) - 1 - hundred million years - ... - 6 - millenium (warning: represents ranges from *1 to *0, eg: 1001-2000) - 7 - century (warning: represents ranges from *1 to *0, eg: 1801-1900) - 8 - decade (represents ranges from *0 to *9, eg: 2010-2019) - 9 - year - 10 - month - 11 - day - 'calendarmodel' can be one of: - "http://www.wikidata.org/entity/Q1985727" - proleptic Gregorian calendar - "http://www.wikidata.org/entity/Q1985786" - proleptic Julian calendar -Info about objects with type 'quantity' can be found at: https://www.wikidata.org/wiki/Help:Data_type#Quantity - An example: - {"value":{ - "amount":"+10.9", - "unit":"http://www.wikidata.org/entity/Q20764", - "lowerBound":"+170.1", # May be absent - "upperBound":"+470", # May be absent - }, "type":"quantity"} - 'unit' can be one of: - "http://www.wikidata.org/entity/Q577" - year - "http://www.wikidata.org/entity/Q24564698" - years old - "http://www.wikidata.org/entity/Q3013059" - kiloannum (1e3 yrs) - "http://www.wikidata.org/entity/Q20764" - megaannum (1e6 yrs) - "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs) -""" - -# On Linux, running on the full dataset seems to make the processes hang when done. 
This was resolved by: -# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. -# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. -# Related: https://bugs.python.org/issue6721 -# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). -# Possibly related: https://github.com/python/cpython/issues/72882 - -import os, io, math, re, argparse -import bz2, json, sqlite3 -import multiprocessing, indexed_bzip2, pickle, tempfile -from jdcal import gcal2jd, jcal2jd - -WIKIDATA_FILE = 'latest-all.json.bz2' -DUMP_YEAR = 2022 # Used for converting 'age' values into dates -OFFSETS_FILE = 'offsets.dat' -DB_FILE = 'events.db' -N_PROCS = 6 - -# For getting Wikidata entity IDs -INSTANCE_OF = 'P31' -EVENT_CTG: dict[str, dict[str, str]] = { - # Maps event-categories to dicts that map event-indicative entity names to their IDs - # If the ID starts with 'Q', it expects entities to be an 'instance of' that ID - # If the ID starts with 'P', it expects entities to have a property with that ID - 'event': { - 'occurrence': 'Q1190554', - 'time interval': 'Q186081', - 'historical period': 'Q11514315', - 'era': 'Q6428674', - 'event': 'Q1656682', - 'recurring event': 'Q15275719', - 'event sequence': 'Q15900616', - 'incident': 'Q18669875', - }, - 'human': { - 'human': 'Q5', - }, - 'country': { - 'country': 'Q6256', - 'state': 'Q7275', - 'sovereign state': 'Q3624078', - }, - 'discovery': { - 'time of discovery or invention': 'P575', - }, - 'media': { - 'work of art': 'Q4502142', - 'literary work': 'Q7725634', - 'comic book series': 'Q14406742', - 'painting': 'Q3305213', - 'musical work/composition': 'Q105543609', - 'film': 'Q11424', - 'animated film': 'Q202866', - 'television series': 'Q16401', - 'anime television series': 'Q63952888', - 'video game': 'Q7889', - 'video game series': 'Q7058673', - }, -} -ID_TO_CTG = {id: ctg 
for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()} -EVENT_PROP: dict[str, str] = { - # Maps event-start/end-indicative property names to their IDs - 'start time': 'P580', - 'end time': 'P582', - 'point in time': 'P585', - 'inception': 'P571', - 'age estimated by a dating method': 'P7584', - 'temporal range start': 'P523', - 'temporal range end': 'P524', - 'earliest date': 'P1319', - 'latest date': 'P1326', - 'date of birth': 'P569', - 'date of death': 'P570', - 'time of discovery or invention': 'P575', - 'publication date': 'P577', -} -PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [ - # Indicates how event start/end data should be obtained from EVENT_PROP props - # Each tuple starts with a start-time prop to check for, followed by an optional - # end-time prop, and an optional 'both props must be present' boolean indicator - ('start time', 'end time'), - ('point in time',), - ('inception',), - ('age estimated by a dating method',), - ('temporal range start', 'temporal range end'), - ('earliest date', 'latest date', True), - ('date of birth', 'date of death'), - ('time of discovery or invention',), - ('publication date',), -] -UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' values (found in type=quantity value objects) to numbers of years - 'http://www.wikidata.org/entity/Q577': 1, # 'year' - 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old' - 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs) - 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) - 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) -} - -# For filtering lines before parsing JSON -TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() -PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() - -def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> 
None: - """ Reads the dump and writes to db """ - # Check db - if os.path.exists(dbFile): - print('ERROR: Database already exists') - return - # Read dump, and write to db - print('Writing to db') - dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() - dbCur.execute('CREATE TABLE events (' \ - 'id INT NOT NULL PRIMARY KEY, title TEXT NOT NULL UNIQUE,' \ - 'start INT NOT NULL, start_upper INT, end INT, end_upper INT,' \ - 'fmt INT, ctg TEXT NOT NULL)') - if nProcs == 1: - with bz2.open(wikidataFile, mode='rb') as file: - for lineNum, line in enumerate(file, 1): - if lineNum % 1e4 == 0: - print(f'At line {lineNum}') - entry = readDumpLine(line) - if entry: - dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) - # The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects) - else: - if not os.path.exists(offsetsFile): - print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) - with indexed_bzip2.open(wikidataFile) as file: - with open(offsetsFile, 'wb') as file2: - pickle.dump(file.block_offsets(), file2) - print('Allocating file into chunks') - fileSz: int # About 1.4 TB - with indexed_bzip2.open(wikidataFile) as file: - with open(offsetsFile, 'rb') as file2: - file.set_block_offsets(pickle.load(file2)) - fileSz = file.seek(0, io.SEEK_END) - chunkSz = fileSz // nProcs - chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1] - # Each adjacent pair specifies a start+end byte index for readDumpChunk() - print(f'- Chunk size: {chunkSz:,}') - print('Starting processes to read dump') - with tempfile.TemporaryDirectory() as tempDirName: - with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: - # Used maxtasksperchild=1 to free resources on task completion - for outFile in pool.map(readDumpChunkOneParam, - [(i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'), - chunkIdxs[i], chunkIdxs[i+1]) for i in 
range(nProcs)]): - # Add entries from subprocess output file - with open(outFile, 'rb') as file: - for entry in pickle.load(file): - dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) - dbCon.commit() - dbCon.close() - -# For data extraction -def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: - """ Parses a Wikidata dump line, returning an entry to add to the db """ - # Check with regex - if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: - return None - # Decode - try: - line = lineBytes.decode('utf-8').rstrip().rstrip(',') - jsonItem = json.loads(line) - except json.JSONDecodeError: - print(f'Unable to parse line {line} as JSON') - return None - if 'claims' not in jsonItem: - return None - claims = jsonItem['claims'] - # Get wikidata ID, enwiki title - try: - itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' - itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] - except (KeyError, ValueError): - return None - # Get event category - eventCtg: str | None = None - if INSTANCE_OF in claims: # Check types - for statement in claims[INSTANCE_OF]: - try: - itemType = statement['mainsnak']['datavalue']['value']['id'] - except KeyError: - return None - if itemType in ID_TO_CTG: - eventCtg = ID_TO_CTG[itemType] - break - if not eventCtg: - for prop in claims: # Check props - if prop in ID_TO_CTG: - eventCtg = ID_TO_CTG[prop] - if not eventCtg: - return None - # Check for event-start/end props - startVal: str - endVal: str | None - timeType: str - found = False - for props in PROP_RULES: - startProp: str = EVENT_PROP[props[0]] - endProp = None if len(props) < 2 else EVENT_PROP[props[1]] - needBoth = False if len(props) < 3 else props[2] - if startProp not in claims: - continue - try: - startVal = claims[startProp][0]['mainsnak']['datavalue'] - endVal = None - if endProp and endProp in claims: - endVal = 
claims[endProp][0]['mainsnak']['datavalue'] - elif needBoth: - continue - except (KeyError, ValueError): - continue - timeType = props[0] - found = True - break - if not found: - return None - # Convert time values - timeData = getTimeData(startVal, endVal, timeType) - if timeData is None: - return None - start, startUpper, end, endUpper, timeFmt = timeData - # - return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) -def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: - """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """ - # Values to return - start: int - startUpper: int | None = None - end: int | None = None - endUpper: int | None = None - timeFmt: int - # - if timeType == 'age estimated by a dating method': - if 'type' not in startVal or startVal['type'] != 'quantity': - return None - # Get quantity data - try: - value = startVal['value'] - amount = math.ceil(float(value['amount'])) - unit = value['unit'] - if 'lowerBound' in value and 'upperBound' in value: - lowerBound = math.ceil(float(value['lowerBound'])) - upperBound = math.ceil(float(value['upperBound'])) - else: - lowerBound = None - upperBound = None - except (KeyError, ValueError): - return None - # Get unit scale - if unit not in UNIT_TO_SCALE: - return None - scale = UNIT_TO_SCALE[unit] - # Get start+startUpper - if lowerBound is None: - start = DUMP_YEAR - amount * scale - else: - start = DUMP_YEAR - upperBound * scale - startUpper = DUMP_YEAR - lowerBound * scale - # Account for non-existence of 0 CE - if start <= 0: - start -= 1 - if startUpper is not None and startUpper <= 0: - startUpper -= 1 - # Adjust precision - start = start // scale * scale - if startUpper is not None: - startUpper = startUpper // scale * scale - elif scale > 1: - startUpper = start + scale - 1 - # - timeFmt = 0 - elif timeType == 'earliest date': - # Get start - startTimeVals = 
getEventTime(startVal) - if startTimeVals is None: - return None - start, _, timeFmt = startTimeVals - # Get end - endTimeVals = getEventTime(endVal) - if endTimeVals is None: - return None - end, _, timeFmt2 = endTimeVals - if timeFmt != timeFmt2: - if timeFmt == 1 and timeFmt2 == 2: - timeFmt = 3 - else: - return None - else: - # Get start+startUpper - startTimeVals = getEventTime(startVal) - if startTimeVals is None: - return None - start, startUpper, timeFmt = startTimeVals - # Get end+endUpper - if endVal is not None: - endTimeVals = getEventTime(endVal) - if endTimeVals is None: - return None - end, endUpper, timeFmt2 = endTimeVals - if timeFmt != timeFmt2: - if timeFmt == 1 and timeFmt2 == 2: - timeFmt = 3 - else: - return None - return start, startUpper, end, endUpper, timeFmt -def getEventTime(dataVal) -> tuple[int, int | None, int] | None: - """ Obtains event start (or end) data from a value object with type 'time' """ - if 'type' not in dataVal or dataVal['type'] != 'time': - return None - # Get time data - try: - value = dataVal['value'] - time = value['time'] - match = re.match(r'([+-]\d+)-(\d+)-(\d+)', time) - if match is None: - return None - year, month, day = (int(x) for x in match.groups()) - precision = value['precision'] - calendarmodel = value['calendarmodel'] - except (KeyError, ValueError): - return None - # Get start+startUpper - start: int - startUpper: int | None = None - timeFmt: int - if precision in [10, 11]: # 'month' or 'day' precision - if year < -4712: # If before 4713 BCE (start of valid julian date period) - print(f'WARNING: Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}') - return None - day = max(day, 1) # With month-precision, entry may have a 'day' of 0 - if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar' - start = jdPairToJd(gcal2jd(year, month, day)) - if precision == 10: - startUpper = jdPairToJd(gcal2jd(year, month+1, 0)) - timeFmt = 2 - else: # 
"http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar') - start = jdPairToJd(jcal2jd(year, month, day)) - if precision == 10: - startUpper = jdPairToJd(jcal2jd(year, month+1, 0)) - timeFmt = 1 - elif 0 <= precision < 10: # 'year' to 'gigaannum' precision - scale: int = 10 ** (9 - precision) - start = year // scale * scale - if scale > 1: - startUpper = start + scale - 1 - if precision in [6, 7]: # Account for century/millenia ranges being from *1 to *0 - start += 1 - if startUpper is not None: - startUpper += 1 - timeFmt = 0 - else: - return None - return start, startUpper, timeFmt -def jdPairToJd(jdPair: tuple[int, int]) -> int: - """ Converts a julian-date-representing value from jdcal into an int """ - return math.floor(sum(jdPair)) - -# For using multiple processes -def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str: - """ Forwards to readDumpChunk() (for use with pool.map()) """ - return readDumpChunk(*params) -def readDumpChunk( - procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: - """ Reads lines in the dump that begin after a start-byte, and not after an end byte. - If startByte is -1, start at the first line. 
""" - # Read dump - entries = [] - with indexed_bzip2.open(wikidataFile) as file: - # Load offsets file - with open(offsetsFile, 'rb') as file2: - offsets = pickle.load(file2) - file.set_block_offsets(offsets) - # Seek to chunk - if startByte != -1: - file.seek(startByte) - file.readline() - else: - startByte = 0 # Used for progress calculation - # Read lines - count = 0 - while file.tell() <= endByte: - count += 1 - if count % 1e4 == 0: - perc = (file.tell() - startByte) / (endByte - startByte) * 100 - print(f'Thread {procId}: {perc:.2f}%') - entry = readDumpLine(file.readline()) - if entry: - entries.append(entry) - # Output results into file - with open(outFile, 'wb') as file: - pickle.dump(entries, file) - return outFile - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - args = parser.parse_args() - # - multiprocessing.set_start_method('spawn') - genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS) diff --git a/backend/tests/test_gen_events_data.py b/backend/tests/test_gen_events_data.py new file mode 100644 index 0000000..37b24a3 --- /dev/null +++ b/backend/tests/test_gen_events_data.py @@ -0,0 +1,171 @@ +import unittest +import tempfile, os, json, bz2, pickle, indexed_bzip2 + +from tests.common import readTestDbTable +from hist_data.gen_events_data import genData + +def runGenData(wikiItemArray: str, preGenOffsets: bool, nProcs: int): + """ Sets up wikidata file to be read by genData(), runs it, and returns the output database's contents. + If 'preGenOffsets' is True, generates a bz2 offsets file before running genData(). 
""" + with tempfile.TemporaryDirectory() as tempDir: + # Create temp wikidata file + wikidataFile = os.path.join(tempDir, 'dump.json.bz2') + with bz2.open(wikidataFile, mode='wb') as file: + file.write(b'[\n') + for i in range(len(wikiItemArray)): + file.write(json.dumps(wikiItemArray[i], separators=(',',':')).encode()) + if i < len(wikiItemArray) - 1: + file.write(b',') + file.write(b'\n') + file.write(b']\n') + # Create temp offsets file if requested + offsetsFile = os.path.join(tempDir, 'offsets.dat') + if preGenOffsets: + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'wb') as file2: + pickle.dump(file.block_offsets(), file2) + # Run genData() + dbFile = os.path.join(tempDir, 'events.db') + genData(wikidataFile, offsetsFile, dbFile, nProcs) + # Read db + return readTestDbTable(dbFile, 'SELECT * FROM events') + +class TestGenData(unittest.TestCase): + def setUp(self): + self.maxDiff = None # Remove output-diff size limit + self.testWikiItems = [ + { + 'id': 'Q1', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q1656682'}}}}], # 'instance of' 'event' + 'P585': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'point in time' + 'time':'+1950-12-00T00:00:00Z', + 'timezone':0, + 'before':0, + 'after':0, + 'precision':10, # month precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar' + }}}}], + 'P141': [{'mainsnak': {'datavalue': {'value': {'id': 'Q211005'}}}}], # Other random property + }, + 'sitelinks': {'enwiki': {'title': 'event one'}}, + }, + { + 'id': 'Q2', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q5'}}}}], # 'instance of' 'human' + 'P569': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'date of birth' + 'time':'+2002-11-02T00:00:00Z', + 'precision':11, # day precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' + }}}}], + 'P570': [{'mainsnak': {'datavalue': {'type': 'time', 
'value': { # 'date of death' + 'time':'+2010-06-21T00:00:01Z', + 'timezone':1, + 'precision':11, + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'Human One'}}, + }, + { + 'id': 'Q3', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7275'}}}}], # 'instance of' 'state' + 'P580': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'start time' + 'time':'-1001-00-00T00:00:00Z', + 'precision':9, # year precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' + }}}}], + 'P582': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'end time' + 'time':'-99-00-00T00:00:01Z', + 'precision':9, + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'country one'}}, + }, + { + 'id': 'Q4', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q6256'}}}}], # 'instance of' 'country' + 'P7584': [{'mainsnak': {'datavalue': {'type': 'quantity', 'value': { + # 'age estimated by a dating method' + "amount":"+10.9", + "unit":"http://www.wikidata.org/entity/Q3013059", # kiloannum + "lowerBound":"+9", + "upperBound":"+11", + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'country two'}}, + }, + { + 'id': 'Q5', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11019'}}}}], # 'instance of' 'machine' + 'P575': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'time of discovery or invention' + 'time':'+0101-00-00T00:00:01Z', + 'precision':6, # millenium precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'discovery one'}}, + }, + { + 'id': 'Q6', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7725634'}}}}], # 'instance of' 'literary work' + 'P1319': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'earliest date' + 'time':'-0020-08-01T00:00:00Z', + 
'precision':11, # day precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' + }}}}], + 'P1326': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'latest date' + 'time':'-0020-09-01T00:00:00Z', + 'precision':11, + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'media one'}}, + }, + { + 'id': 'Q7', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11424'}}}}], # 'instance of' 'film' + 'P577': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'publication date' + 'time':'-2103-00-00T00:00:00Z', + 'precision':7, # century precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'media two'}}, + }, + { + 'id': 'Q8', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # 'instance of' 'taxon' + } + # No title + }, + ] + self.expectedRows = { + (1, 'event one', 2433616, 2433646, None, None, 2, 'event'), + (2, 'Human One', 2452593, None, 2455368, None, 3, 'human'), + (3, 'country one', -1001, None, -99, None, 0, 'country'), + (4, 'country two', -9000, -7000, None, None, 0, 'country'), + (5, 'discovery one', 1, 1000, None, None, 0, 'discovery'), + (6, 'media one', 1713965, None, 1713996, None, 1, 'media'), + (7, 'media two', -2199, -2100, None, None, 0, 'media'), + } + def test_wikiItems(self): + rows = runGenData(self.testWikiItems, False, 1) + self.assertEqual(rows, self.expectedRows) + def test_empty_dump(self): + rows = runGenData([{}], False, 1) + self.assertEqual(rows, set()) + def test_multiprocessing(self): + rows = runGenData(self.testWikiItems, False, 4) + self.assertEqual(rows, self.expectedRows) + def test_existing_offsets(self): + rows = runGenData(self.testWikiItems, True, 3) + self.assertEqual(rows, self.expectedRows) diff --git a/backend/tests/wikidata/test_gen_events_data.py 
b/backend/tests/wikidata/test_gen_events_data.py deleted file mode 100644 index faa19c9..0000000 --- a/backend/tests/wikidata/test_gen_events_data.py +++ /dev/null @@ -1,171 +0,0 @@ -import unittest -import tempfile, os, json, bz2, pickle, indexed_bzip2 - -from tests.common import readTestDbTable -from hist_data.wikidata.gen_events_data import genData - -def runGenData(wikiItemArray: str, preGenOffsets: bool, nProcs: int): - """ Sets up wikidata file to be read by genData(), runs it, and returns the output database's contents. - If 'preGenOffsets' is True, generates a bz2 offsets file before running genData(). """ - with tempfile.TemporaryDirectory() as tempDir: - # Create temp wikidata file - wikidataFile = os.path.join(tempDir, 'dump.json.bz2') - with bz2.open(wikidataFile, mode='wb') as file: - file.write(b'[\n') - for i in range(len(wikiItemArray)): - file.write(json.dumps(wikiItemArray[i], separators=(',',':')).encode()) - if i < len(wikiItemArray) - 1: - file.write(b',') - file.write(b'\n') - file.write(b']\n') - # Create temp offsets file if requested - offsetsFile = os.path.join(tempDir, 'offsets.dat') - if preGenOffsets: - with indexed_bzip2.open(wikidataFile) as file: - with open(offsetsFile, 'wb') as file2: - pickle.dump(file.block_offsets(), file2) - # Run genData() - dbFile = os.path.join(tempDir, 'events.db') - genData(wikidataFile, offsetsFile, dbFile, nProcs) - # Read db - return readTestDbTable(dbFile, 'SELECT * FROM events') - -class TestGenData(unittest.TestCase): - def setUp(self): - self.maxDiff = None # Remove output-diff size limit - self.testWikiItems = [ - { - 'id': 'Q1', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q1656682'}}}}], # 'instance of' 'event' - 'P585': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'point in time' - 'time':'+1950-12-00T00:00:00Z', - 'timezone':0, - 'before':0, - 'after':0, - 'precision':10, # month precision - 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 
'proleptic gregorian calendar' - }}}}], - 'P141': [{'mainsnak': {'datavalue': {'value': {'id': 'Q211005'}}}}], # Other random property - }, - 'sitelinks': {'enwiki': {'title': 'event one'}}, - }, - { - 'id': 'Q2', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q5'}}}}], # 'instance of' 'human' - 'P569': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'date of birth' - 'time':'+2002-11-02T00:00:00Z', - 'precision':11, # day precision - 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' - }}}}], - 'P570': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'date of death' - 'time':'+2010-06-21T00:00:01Z', - 'timezone':1, - 'precision':11, - 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar' - }}}}], - }, - 'sitelinks': {'enwiki': {'title': 'Human One'}}, - }, - { - 'id': 'Q3', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7275'}}}}], # 'instance of' 'state' - 'P580': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'start time' - 'time':'-1001-00-00T00:00:00Z', - 'precision':9, # year precision - 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' - }}}}], - 'P582': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'end time' - 'time':'-99-00-00T00:00:01Z', - 'precision':9, - 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' - }}}}], - }, - 'sitelinks': {'enwiki': {'title': 'country one'}}, - }, - { - 'id': 'Q4', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q6256'}}}}], # 'instance of' 'country' - 'P7584': [{'mainsnak': {'datavalue': {'type': 'quantity', 'value': { - # 'age estimated by a dating method' - "amount":"+10.9", - "unit":"http://www.wikidata.org/entity/Q3013059", # kiloannum - "lowerBound":"+9", - "upperBound":"+11", - }}}}], - }, - 'sitelinks': {'enwiki': {'title': 'country two'}}, - }, - { - 'id': 'Q5', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': 
{'id': 'Q11019'}}}}], # 'instance of' 'machine' - 'P575': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'time of discovery or invention' - 'time':'+0101-00-00T00:00:01Z', - 'precision':6, # millenium precision - 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' - }}}}], - }, - 'sitelinks': {'enwiki': {'title': 'discovery one'}}, - }, - { - 'id': 'Q6', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7725634'}}}}], # 'instance of' 'literary work' - 'P1319': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'earliest date' - 'time':'-0020-08-01T00:00:00Z', - 'precision':11, # day precision - 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' - }}}}], - 'P1326': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'latest date' - 'time':'-0020-09-01T00:00:00Z', - 'precision':11, - 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' - }}}}], - }, - 'sitelinks': {'enwiki': {'title': 'media one'}}, - }, - { - 'id': 'Q7', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11424'}}}}], # 'instance of' 'film' - 'P577': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'publication date' - 'time':'-2103-00-00T00:00:00Z', - 'precision':7, # century precision - 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' - }}}}], - }, - 'sitelinks': {'enwiki': {'title': 'media two'}}, - }, - { - 'id': 'Q8', - 'claims': { - 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # 'instance of' 'taxon' - } - # No title - }, - ] - self.expectedRows = { - (1, 'event one', 2433616, 2433646, None, None, 2, 'event'), - (2, 'Human One', 2452593, None, 2455368, None, 3, 'human'), - (3, 'country one', -1001, None, -99, None, 0, 'country'), - (4, 'country two', -9000, -7000, None, None, 0, 'country'), - (5, 'discovery one', 1, 1000, None, None, 0, 'discovery'), - (6, 'media one', 1713965, None, 1713996, None, 1, 'media'), - (7, 
'media two', -2199, -2100, None, None, 0, 'media'), - } - def test_wikiItems(self): - rows = runGenData(self.testWikiItems, False, 1) - self.assertEqual(rows, self.expectedRows) - def test_empty_dump(self): - rows = runGenData([{}], False, 1) - self.assertEqual(rows, set()) - def test_multiprocessing(self): - rows = runGenData(self.testWikiItems, False, 4) - self.assertEqual(rows, self.expectedRows) - def test_existing_offsets(self): - rows = runGenData(self.testWikiItems, True, 3) - self.assertEqual(rows, self.expectedRows) -- cgit v1.2.3