From 55ffd41c5a9257ea44afc3b6a44499359790c3fb Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 1 Oct 2022 13:58:17 +1000 Subject: Add conversions for wikidata time data --- backend/hist_data/wikidata/README.md | 28 +++ backend/hist_data/wikidata/gen_events_data.py | 305 +++++++++++++++++++++----- 2 files changed, 275 insertions(+), 58 deletions(-) create mode 100644 backend/hist_data/wikidata/README.md (limited to 'backend') diff --git a/backend/hist_data/wikidata/README.md b/backend/hist_data/wikidata/README.md new file mode 100644 index 0000000..35dad34 --- /dev/null +++ b/backend/hist_data/wikidata/README.md @@ -0,0 +1,28 @@ +This directory holds files obtained/derived from [Wikidata](https://www.wikidata.org/). + +# Downloaded Files +- `latest-all.json.bz2`
+  Obtained from <https://dumps.wikimedia.org/wikidatawiki/entities/> (on 23/08/22).
+  Format info can be found at <https://doc.wikimedia.org/Wikibase/master/php/docs_topics_json.html>.
+
+# Other Files
+- `gen_events_data.py`<br>
+ Used to generate a database holding event information from the dump. +- `offsets.dat`
+  Holds bzip2 block offsets for the dump. Generated and used by
+  `gen_events_data.py` for parallel processing of the dump.
- `events.db`<br>
+ Generated by `gen_events_data.py`.
+ Has one table `events`: + - Columns: `id INT, title TEXT, start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT` + - Contains a Wikidata ID, Wikipedia title, start and end dates, and an event category. + - If `start_upper` is not NULL, `start` and `start_upper` denote an uncertain range of start times. + And similarly for 'end' and 'end_upper'. + - `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`. + - If 1, they denote a Julian date (with 0.5 removed to align with midnight). + This allows simple comparison of events with day-level precision, but only goes back to 4713 BCE. + - If 2, same as 1, but dates are preferably displayed using the Gregorian calendar, not the Julian calendar. + For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not. + - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. + For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. + - If 0, they denote a number of years CE (if positive) or BCE (if negative). diff --git a/backend/hist_data/wikidata/gen_events_data.py b/backend/hist_data/wikidata/gen_events_data.py index 84dbb5f..f4766f0 100755 --- a/backend/hist_data/wikidata/gen_events_data.py +++ b/backend/hist_data/wikidata/gen_events_data.py @@ -1,36 +1,79 @@ #!/usr/bin/python3 """ -Reads a wikidata JSON dump, looking for entities usable as historical events. -Writes results into a database. +Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such +entity, finds a start date (may be a range), optional end date, and event category (eg: normal +event, person with birth/death date, country, etc). Writes the results into a database. -The JSON dump contains an array of objects, each of which describes a -Wikidata item item1, and takes up it's own line. 
+The JSON dump contains an array of objects, each of which describes a Wikidata item item1, +and takes up it's own line. - Getting item1's Wikidata ID: item1['id'] (eg: "Q144") - Checking for a property: item1['claims'][prop1] == array1 - Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue'] 'idx1' indexes an array of statements + +Value objects have a 'type' and 'value' field. +Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates + An example: + {"value":{ + "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits + "timezone":0, # Unused + "before":0, # Unused + "after":0, # Unused + "precision":11, + "calendarmodel":"http://www.wikidata.org/entity/Q1985727" + }, "type":"time"} + 'precision' can be one of: + 0 - billion years (timestamp eg: -5000000000-00-00T00:00:00Z) + 1 - hundred million years + ... + 6 - millenium (warning: represents ranges from *1 to *0, eg: 1001-2000) + 7 - century (warning: represents ranges from *1 to *0, eg: 1801-1900) + 8 - decade (represents ranges from *0 to *9, eg: 2010-2019) + 9 - year + 10 - month + 11 - day + 'calendarmodel' can be one of: + "http://www.wikidata.org/entity/Q1985727" - proleptic Gregorian calendar + "http://www.wikidata.org/entity/Q1985786" - proleptic Julian calendar +Info about objects with type 'quantity' can be found at: https://www.wikidata.org/wiki/Help:Data_type#Quantity + An example: + {"value":{ + "amount":"+10.9", + "unit":"http://www.wikidata.org/entity/Q20764", + "lowerBound":"+170.1", # May be absent + "upperBound":"+470", # May be absent + }, "type":"quantity"} + 'unit' can be one of: + "http://www.wikidata.org/entity/Q577" - year + "http://www.wikidata.org/entity/Q24564698" - years old + "http://www.wikidata.org/entity/Q3013059" - kiloannum (1e3 yrs) + "http://www.wikidata.org/entity/Q20764" - megaannum (1e6 yrs) + "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs) """ # On Linux, 
running on the full dataset seems to make the processes hang when done. This was resolved by: +# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. # - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. # Related: https://bugs.python.org/issue6721 # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 -import os, io, re, argparse +import os, io, math, re, argparse import bz2, json, sqlite3 import multiprocessing, indexed_bzip2, pickle, tempfile +from jdcal import gcal2jd, jcal2jd WIKIDATA_FILE = 'latest-all.json.bz2' +DUMP_YEAR = 2022 # Used for converting 'age' values into dates OFFSETS_FILE = 'offsets.dat' DB_FILE = 'events.db' N_PROCS = 6 -# For handling Wikidata entity IDs +# For getting Wikidata entity IDs INSTANCE_OF = 'P31' EVENT_CTG: dict[str, dict[str, str]] = { - # Map from event-categories to dicts that map event-indicative entity names to their IDs + # Maps event-categories to dicts that map event-indicative entity names to their IDs # If the ID starts with 'Q', it expects entities to be an 'instance of' that ID # If the ID starts with 'P', it expects entities to have a property with that ID 'event': { @@ -69,7 +112,8 @@ EVENT_CTG: dict[str, dict[str, str]] = { }, } ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()} -EVENT_PROP: dict[str, str] = { # Maps event-start/end-indicative property names to their IDs +EVENT_PROP: dict[str, str] = { + # Maps event-start/end-indicative property names to their IDs 'start time': 'P580', 'end time': 'P582', 'point in time': 'P585', @@ -98,14 +142,20 @@ PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [ ('time of discovery or invention',), ('publication date',), ] +UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' 
values (found in type=quantity value objects) to numbers of years + 'http://www.wikidata.org/entity/Q577': 1, # 'year' + 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old' + 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs) + 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) + 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) +} + # For filtering lines before parsing JSON -TYPE_ID_REGEX = re.compile( - ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode()) -PROP_ID_REGEX = re.compile( - ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode()) +TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() +PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: - """ Reads the dump and writes info to db """ + """ Reads the dump and writes to db """ # Check db if os.path.exists(dbFile): print('ERROR: Database already exists') @@ -113,22 +163,20 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # Read dump, and write to db print('Writing to db') dbCon = sqlite3.connect(dbFile) - dbCon.execute('CREATE TABLE events (' \ - 'id INT PRIMARY KEY, title TEXT, start TEXT, end TEXT, time_type TEXT, ctg TEXT)') - dbCon.commit() - dbCon.close() + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE events (' \ + 'id INT NOT NULL PRIMARY KEY, title TEXT NOT NULL UNIQUE,' \ + 'start INT NOT NULL, start_upper INT, end INT, end_upper INT,' \ + 'fmt INT, ctg TEXT NOT NULL)') if nProcs == 1: - dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() with bz2.open(wikidataFile, mode='rb') as file: for lineNum, line in enumerate(file, 1): if lineNum % 1e4 == 0: print(f'At line {lineNum}') entry = 
readDumpLine(line) if entry: - dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?)', entry) - dbCon.commit() - dbCon.close() + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + # The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects) else: if not os.path.exists(offsetsFile): print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) @@ -146,23 +194,24 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # Each adjacent pair specifies a start+end byte index for readDumpChunk() print(f'- Chunk size: {chunkSz:,}') print('Starting processes to read dump') - dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() with tempfile.TemporaryDirectory() as tempDirName: with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: # Used maxtasksperchild=1 to free resources on task completion for outFile in pool.map(readDumpChunkOneParam, - ((i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'), - chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs))): + [(i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'), + chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]): # Add entries from subprocess output file with open(outFile, 'rb') as file: - for e in pickle.load(file): - dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?)', e) - dbCon.commit() - dbCon.close() -def readDumpLine(lineBytes: bytes) -> tuple[int, str, str, str, str, str] | None: + for entry in pickle.load(file): + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + dbCon.commit() + dbCon.close() + +# For data extraction +def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: + """ Parses a Wikidata dump line, returning an entry to add to the db """ # Check with regex - if TYPE_ID_REGEX.search(lineBytes) is None and 
PROP_ID_REGEX.search(lineBytes) is None: + if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: return None # Decode try: @@ -174,41 +223,46 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, str, str, str, str] | None if 'claims' not in jsonItem: return None claims = jsonItem['claims'] + # Get wikidata ID, enwiki title + try: + itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' + itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] + except (KeyError, ValueError): + return None # Get event category eventCtg: str | None = None - if INSTANCE_OF not in claims: - return None - for statement in claims[INSTANCE_OF]: - try: - itemType = statement['mainsnak']['datavalue']['value']['id'] - except KeyError: - return None - if itemType in ID_TO_CTG: - eventCtg = ID_TO_CTG[itemType] - break + if INSTANCE_OF in claims: # Check types + for statement in claims[INSTANCE_OF]: + try: + itemType = statement['mainsnak']['datavalue']['value']['id'] + except KeyError: + return None + if itemType in ID_TO_CTG: + eventCtg = ID_TO_CTG[itemType] + break if not eventCtg: - for prop in claims: + for prop in claims: # Check props if prop in ID_TO_CTG: eventCtg = ID_TO_CTG[prop] if not eventCtg: return None - # Check for event props - start: str - end: str | None + # Check for event-start/end props + startVal: str + endVal: str | None timeType: str found = False for props in PROP_RULES: startProp: str = EVENT_PROP[props[0]] - endProp = None if len(props) < 2 else EVENT_PROP[props[1]] # type: ignore - needBoth = False if len(props) < 3 else props[2] # type: ignore + endProp = None if len(props) < 2 else EVENT_PROP[props[1]] + needBoth = False if len(props) < 3 else props[2] if startProp not in claims: continue try: - start = json.dumps(claims[startProp][0]['mainsnak']['datavalue'], separators=(',', ':')) - end = None + startVal = claims[startProp][0]['mainsnak']['datavalue'] + endVal = None if endProp and endProp in claims: - end = 
json.dumps(claims[endProp][0]['mainsnak']['datavalue'], separators=(',', ':')) - if needBoth and end == None: + endVal = claims[endProp][0]['mainsnak']['datavalue'] + elif needBoth: continue except (KeyError, ValueError): continue @@ -217,18 +271,153 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, str, str, str, str] | None break if not found: return None - # Get wikidata ID, enwiki title + # Convert time values + timeData = getTimeData(startVal, endVal, timeType) + if timeData is None: + return None + start, startUpper, end, endUpper, timeFmt = timeData + # + return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) +def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: + """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """ + # Values to return + start: int + startUpper: int | None = None + end: int | None = None + endUpper: int | None = None + timeFmt: int + # + if timeType == 'age estimated by a dating method': + if 'type' not in startVal or startVal['type'] != 'quantity': + return None + # Get quantity data + try: + value = startVal['value'] + amount = math.ceil(float(value['amount'])) + unit = value['unit'] + if 'lowerBound' in value and 'upperBound' in value: + lowerBound = math.ceil(float(value['lowerBound'])) + upperBound = math.ceil(float(value['upperBound'])) + else: + lowerBound = None + upperBound = None + except (KeyError, ValueError): + return None + # Get unit scale + if unit not in UNIT_TO_SCALE: + return None + scale = UNIT_TO_SCALE[unit] + # Get start+startUpper + if lowerBound is None: + start = DUMP_YEAR - amount * scale + else: + start = DUMP_YEAR - upperBound * scale + startUpper = DUMP_YEAR - lowerBound * scale + # Account for non-existence of 0 CE + if start <= 0: + start -= 1 + if startUpper is not None and startUpper <= 0: + startUpper -= 1 + # Adjust precision + start = start // scale * scale + if 
startUpper is not None: + startUpper = startUpper // scale * scale + elif scale > 1: + startUpper = start + scale - 1 + # + timeFmt = 0 + elif timeType == 'earliest date': + # Get start + startTimeVals = getEventTime(startVal) + if startTimeVals is None: + return None + start, _, timeFmt = startTimeVals + # Get end + endTimeVals = getEventTime(endVal) + if endTimeVals is None: + return None + end, _, timeFmt2 = endTimeVals + if timeFmt != timeFmt2: + if timeFmt == 1 and timeFmt2 == 2: + timeFmt = 3 + else: + return None + else: + # Get start+startUpper + startTimeVals = getEventTime(startVal) + if startTimeVals is None: + return None + start, startUpper, timeFmt = startTimeVals + # Get end+endUpper + if endVal is not None: + endTimeVals = getEventTime(endVal) + if endTimeVals is None: + return None + end, endUpper, timeFmt2 = endTimeVals + if timeFmt != timeFmt2: + if timeFmt == 1 and timeFmt2 == 2: + timeFmt = 3 + else: + return None + return start, startUpper, end, endUpper, timeFmt +def getEventTime(dataVal) -> tuple[int, int | None, int] | None: + """ Obtains event start (or end) data from a value object with type 'time' """ + if 'type' not in dataVal or dataVal['type'] != 'time': + return None + # Get time data try: - itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' - itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] + value = dataVal['value'] + time = value['time'] + match = re.match(r'([+-]\d+)-(\d+)-(\d+)', time) + if match is None: + return None + year, month, day = (int(x) for x in match.groups()) + precision = value['precision'] + calendarmodel = value['calendarmodel'] except (KeyError, ValueError): return None - # Return result - return (itemId, itemTitle, start, end, timeType, eventCtg) # type: ignore + # Get start+startUpper + start: int + startUpper: int | None = None + timeFmt: int + if precision in [10, 11]: # 'month' or 'day' precision + if year < -4712: # If before 4713 BCE (start of valid julian date period) + print(f'WARNING: 
Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}') + return None + day = max(day, 1) # With month-precision, entry may have a 'day' of 0 + if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar' + start = jdPairToJd(gcal2jd(year, month, day)) + if precision == 10: + startUpper = jdPairToJd(gcal2jd(year, month+1, 0)) + timeFmt = 2 + else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar') + start = jdPairToJd(jcal2jd(year, month, day)) + if precision == 10: + startUpper = jdPairToJd(jcal2jd(year, month+1, 0)) + timeFmt = 1 + elif 0 <= precision < 10: # 'year' to 'gigaannum' precision + scale: int = 10 ** (9 - precision) + start = year // scale * scale + if scale > 1: + startUpper = start + scale - 1 + if precision in [6, 7]: # Account for century/millenia ranges being from *1 to *0 + start += 1 + if startUpper is not None: + startUpper += 1 + timeFmt = 0 + else: + return None + return start, startUpper, timeFmt +def jdPairToJd(jdPair: tuple[int, int]) -> int: + """ Converts a julian-date-representing value from jdcal into an int """ + return math.floor(sum(jdPair)) + +# For using multiple processes def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str: """ Forwards to readDumpChunk() (for use with pool.map()) """ return readDumpChunk(*params) -def readDumpChunk(procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: +def readDumpChunk( + procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: """ Reads lines in the dump that begin after a start-byte, and not after an end byte. If startByte is -1, start at the first line. """ # Read dump -- cgit v1.2.3