author     Terry Truong <terry06890@gmail.com>  2022-10-01 21:08:59 +1000
committer  Terry Truong <terry06890@gmail.com>  2022-10-01 21:08:59 +1000
commit     4edb7998012bcc804482a76277cd25b90fb373c9 (patch)
tree       0e978eab34667d6ab6a2fa4453171e3fdb7fc42e /backend/hist_data/gen_events_data.py
parent     a0b1e1a8a303504dd2cc743ab72937aee7f60f4d (diff)
Update READMEs and .gitignore
Diffstat (limited to 'backend/hist_data/gen_events_data.py')
-rwxr-xr-x  backend/hist_data/gen_events_data.py  454
1 file changed, 454 insertions, 0 deletions
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py
new file mode 100755
index 0000000..1f990d0
--- /dev/null
+++ b/backend/hist_data/gen_events_data.py
@@ -0,0 +1,454 @@
+#!/usr/bin/python3
+
+"""
+Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such
+entity, finds a start date (may be a range), optional end date, and event category (eg: normal
+event, person with birth/death date, country, etc). Writes the results into a database.
+
+The JSON dump contains an array of objects, each of which describes a Wikidata item
+(denoted item1 below), and takes up its own line.
+- Getting item1's Wikidata ID: item1['id'] (eg: "Q144")
+- Checking for a property: prop1 in item1['claims'], which maps property IDs to statement arrays
+- Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue']
+	'idx1' indexes the statement array for prop1
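+	eg: item1['claims']['P585'][0]['mainsnak']['datavalue'] is the first 'point in time' value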
+
+Value objects have a 'type' and 'value' field.
+Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates
+ An example:
+ {"value":{
+ "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits
+ "timezone":0, # Unused
+ "before":0, # Unused
+ "after":0, # Unused
+ "precision":11,
+ "calendarmodel":"http://www.wikidata.org/entity/Q1985727"
+ }, "type":"time"}
+ 'precision' can be one of:
+ 0 - billion years (timestamp eg: -5000000000-00-00T00:00:00Z)
+ 1 - hundred million years
+ ...
+	6 - millennium (warning: represents ranges from *1 to *0, eg: 1001-2000)
+ 7 - century (warning: represents ranges from *1 to *0, eg: 1801-1900)
+ 8 - decade (represents ranges from *0 to *9, eg: 2010-2019)
+ 9 - year
+ 10 - month
+ 11 - day
+ 'calendarmodel' can be one of:
+ "http://www.wikidata.org/entity/Q1985727" - proleptic Gregorian calendar
+ "http://www.wikidata.org/entity/Q1985786" - proleptic Julian calendar
+Info about objects with type 'quantity' can be found at: https://www.wikidata.org/wiki/Help:Data_type#Quantity
+ An example:
+ {"value":{
+ "amount":"+10.9",
+ "unit":"http://www.wikidata.org/entity/Q20764",
+ "lowerBound":"+170.1", # May be absent
+ "upperBound":"+470", # May be absent
+ }, "type":"quantity"}
+ 'unit' can be one of:
+ "http://www.wikidata.org/entity/Q577" - year
+ "http://www.wikidata.org/entity/Q24564698" - years old
+ "http://www.wikidata.org/entity/Q3013059" - kiloannum (1e3 yrs)
+ "http://www.wikidata.org/entity/Q20764" - megaannum (1e6 yrs)
+ "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs)
+"""
+
+# On Linux, running on the full dataset would make the worker processes hang after finishing. This was resolved by:
+# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock.
+# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state.
+# Related: https://bugs.python.org/issue6721
+# - Using pool.map() instead of pool.imap_unordered(), which seemed to hang in some cases (observed with Python 3.8).
+# Possibly related: https://github.com/python/cpython/issues/72882
+
+import os, io, math, re, argparse
+import bz2, json, sqlite3
+import multiprocessing, indexed_bzip2, pickle, tempfile
+from jdcal import gcal2jd, jcal2jd
+
+WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2')
+DUMP_YEAR = 2022 # Used for converting 'age' values into dates
+OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat')
+DB_FILE = 'data.db'
+N_PROCS = 6
+
+# For getting Wikidata entity IDs
+INSTANCE_OF = 'P31'
+EVENT_CTG: dict[str, dict[str, str]] = {
+	# Maps event categories to dicts that map event-indicative entity names to their IDs
+	# An ID starting with 'Q' means a qualifying entity must be an 'instance of' that entity
+	# An ID starting with 'P' means a qualifying entity must have a property with that ID
+ 'event': {
+ 'occurrence': 'Q1190554',
+ 'time interval': 'Q186081',
+ 'historical period': 'Q11514315',
+ 'era': 'Q6428674',
+ 'event': 'Q1656682',
+ 'recurring event': 'Q15275719',
+ 'event sequence': 'Q15900616',
+ 'incident': 'Q18669875',
+ },
+ 'human': {
+ 'human': 'Q5',
+ },
+ 'country': {
+ 'country': 'Q6256',
+ 'state': 'Q7275',
+ 'sovereign state': 'Q3624078',
+ },
+ 'discovery': {
+ 'time of discovery or invention': 'P575',
+ },
+ 'media': {
+ 'work of art': 'Q4502142',
+ 'literary work': 'Q7725634',
+ 'comic book series': 'Q14406742',
+ 'painting': 'Q3305213',
+ 'musical work/composition': 'Q105543609',
+ 'film': 'Q11424',
+ 'animated film': 'Q202866',
+ 'television series': 'Q16401',
+ 'anime television series': 'Q63952888',
+ 'video game': 'Q7889',
+ 'video game series': 'Q7058673',
+ },
+}
+ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()}
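+# eg: ID_TO_CTG['Q5'] == 'human', and ID_TO_CTG['P575'] == 'discovery'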
+EVENT_PROP: dict[str, str] = {
+ # Maps event-start/end-indicative property names to their IDs
+ 'start time': 'P580',
+ 'end time': 'P582',
+ 'point in time': 'P585',
+ 'inception': 'P571',
+ 'age estimated by a dating method': 'P7584',
+ 'temporal range start': 'P523',
+ 'temporal range end': 'P524',
+ 'earliest date': 'P1319',
+ 'latest date': 'P1326',
+ 'date of birth': 'P569',
+ 'date of death': 'P570',
+ 'time of discovery or invention': 'P575',
+ 'publication date': 'P577',
+}
+PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [
+ # Indicates how event start/end data should be obtained from EVENT_PROP props
+ # Each tuple starts with a start-time prop to check for, followed by an optional
+ # end-time prop, and an optional 'both props must be present' boolean indicator
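+	# eg: ('earliest date', 'latest date', True) uses P1319 for the start, P1326 for the end, and requires both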
+ ('start time', 'end time'),
+ ('point in time',),
+ ('inception',),
+ ('age estimated by a dating method',),
+ ('temporal range start', 'temporal range end'),
+ ('earliest date', 'latest date', True),
+ ('date of birth', 'date of death'),
+ ('time of discovery or invention',),
+ ('publication date',),
+]
+UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' values (found in type=quantity value objects) to numbers of years
+ 'http://www.wikidata.org/entity/Q577': 1, # 'year'
+ 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old'
+ 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs)
+ 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs)
+ 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs)
+}
+
+# For filtering lines before parsing JSON
+TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode()
+PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + r'"):\[{"mainsnak"').encode()
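+# eg: TYPE_ID_REGEX matches lines containing b'"id":"Q5"', and PROP_ID_REGEX lines containing b'"P575":[{"mainsnak"'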
+
+def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
+ """ Reads the dump and writes to db """
+ # Check db
+ if os.path.exists(dbFile):
+ print('ERROR: Database already exists')
+ return
+ # Read dump, and write to db
+ print('Writing to db')
+ dbCon = sqlite3.connect(dbFile)
+ dbCur = dbCon.cursor()
+ dbCur.execute('CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \
+ 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)')
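+	# 'fmt' indicates how start/end values are encoded: 0 means years, 1 and 2 mean Julian-day
+	# numbers derived from Julian/Gregorian calendar dates, and 3 means a start/end pair derived
+	# from mixed Julian and Gregorian dates (see getTimeData())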
+ if nProcs == 1:
+ with bz2.open(wikidataFile, mode='rb') as file:
+ for lineNum, line in enumerate(file, 1):
+				if lineNum % 10_000 == 0:
+ print(f'At line {lineNum}')
+ entry = readDumpLine(line)
+ if entry:
+ dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry)
+ # The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects)
+ else:
+ if not os.path.exists(offsetsFile):
+			print('Creating offsets file') # Enables indexed access for multiprocessing (creation took about 6.7 hours)
+ with indexed_bzip2.open(wikidataFile) as file:
+ with open(offsetsFile, 'wb') as file2:
+ pickle.dump(file.block_offsets(), file2)
+		print('Splitting file into chunks')
+ fileSz: int # About 1.4 TB
+ with indexed_bzip2.open(wikidataFile) as file:
+ with open(offsetsFile, 'rb') as file2:
+ file.set_block_offsets(pickle.load(file2))
+ fileSz = file.seek(0, io.SEEK_END)
+ chunkSz = fileSz // nProcs
+ chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1]
+ # Each adjacent pair specifies a start+end byte index for readDumpChunk()
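+			# eg: with fileSz=100 and nProcs=4, chunkIdxs is [-1, 25, 50, 75, 99]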
+ print(f'- Chunk size: {chunkSz:,}')
+ print('Starting processes to read dump')
+ with tempfile.TemporaryDirectory() as tempDirName:
+ with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool:
+ # Used maxtasksperchild=1 to free resources on task completion
+ for outFile in pool.map(readDumpChunkOneParam,
+ [(i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'),
+ chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]):
+ # Add entries from subprocess output file
+ with open(outFile, 'rb') as file:
+ for entry in pickle.load(file):
+ dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry)
+ dbCon.commit()
+ dbCon.close()
+
+# For data extraction
+def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None:
+ """ Parses a Wikidata dump line, returning an entry to add to the db """
+ # Check with regex
+ if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None:
+ return None
+ # Decode
+	try:
+		line = lineBytes.decode('utf-8').rstrip().rstrip(',')
+		jsonItem = json.loads(line)
+	except (UnicodeDecodeError, json.JSONDecodeError):
+		print(f'Unable to parse line as JSON: {lineBytes!r}')
+		return None
+ if 'claims' not in jsonItem:
+ return None
+ claims = jsonItem['claims']
+ # Get wikidata ID, enwiki title
+ try:
+ itemId = int(jsonItem['id'][1:]) # Skip initial 'Q'
+ itemTitle: str = jsonItem['sitelinks']['enwiki']['title']
+ except (KeyError, ValueError):
+ return None
+ # Get event category
+ eventCtg: str | None = None
+ if INSTANCE_OF in claims: # Check types
+ for statement in claims[INSTANCE_OF]:
+ try:
+ itemType = statement['mainsnak']['datavalue']['value']['id']
+ except KeyError:
+ return None
+ if itemType in ID_TO_CTG:
+ eventCtg = ID_TO_CTG[itemType]
+ break
+ if not eventCtg:
+ for prop in claims: # Check props
+ if prop in ID_TO_CTG:
+ eventCtg = ID_TO_CTG[prop]
+ if not eventCtg:
+ return None
+ # Check for event-start/end props
+	startVal: dict
+	endVal: dict | None
+ timeType: str
+ found = False
+ for props in PROP_RULES:
+ startProp: str = EVENT_PROP[props[0]]
+ endProp = None if len(props) < 2 else EVENT_PROP[props[1]]
+ needBoth = False if len(props) < 3 else props[2]
+ if startProp not in claims:
+ continue
+ try:
+ startVal = claims[startProp][0]['mainsnak']['datavalue']
+ endVal = None
+ if endProp and endProp in claims:
+ endVal = claims[endProp][0]['mainsnak']['datavalue']
+ elif needBoth:
+ continue
+ except (KeyError, ValueError):
+ continue
+ timeType = props[0]
+ found = True
+ break
+ if not found:
+ return None
+ # Convert time values
+ timeData = getTimeData(startVal, endVal, timeType)
+ if timeData is None:
+ return None
+ start, startUpper, end, endUpper, timeFmt = timeData
+ #
+ return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg)
+def getTimeData(startVal: dict, endVal: dict | None, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None:
+ """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """
+ # Values to return
+ start: int
+ startUpper: int | None = None
+ end: int | None = None
+ endUpper: int | None = None
+ timeFmt: int
+ #
+ if timeType == 'age estimated by a dating method':
+ if 'type' not in startVal or startVal['type'] != 'quantity':
+ return None
+ # Get quantity data
+ try:
+ value = startVal['value']
+ amount = math.ceil(float(value['amount']))
+ unit = value['unit']
+ if 'lowerBound' in value and 'upperBound' in value:
+ lowerBound = math.ceil(float(value['lowerBound']))
+ upperBound = math.ceil(float(value['upperBound']))
+ else:
+ lowerBound = None
+ upperBound = None
+ except (KeyError, ValueError):
+ return None
+ # Get unit scale
+ if unit not in UNIT_TO_SCALE:
+ return None
+ scale = UNIT_TO_SCALE[unit]
+ # Get start+startUpper
+ if lowerBound is None:
+ start = DUMP_YEAR - amount * scale
+ else:
+ start = DUMP_YEAR - upperBound * scale
+ startUpper = DUMP_YEAR - lowerBound * scale
+ # Account for non-existence of 0 CE
+ if start <= 0:
+ start -= 1
+ if startUpper is not None and startUpper <= 0:
+ startUpper -= 1
+ # Adjust precision
+ start = start // scale * scale
+ if startUpper is not None:
+ startUpper = startUpper // scale * scale
+ elif scale > 1:
+ startUpper = start + scale - 1
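+		# eg: an amount of "+10.9" megaannum yields start=-11000000, startUpper=-10000001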
+ #
+ timeFmt = 0
+ elif timeType == 'earliest date':
+ # Get start
+ startTimeVals = getEventTime(startVal)
+ if startTimeVals is None:
+ return None
+ start, _, timeFmt = startTimeVals
+ # Get end
+ endTimeVals = getEventTime(endVal)
+ if endTimeVals is None:
+ return None
+ end, _, timeFmt2 = endTimeVals
+ if timeFmt != timeFmt2:
+ if timeFmt == 1 and timeFmt2 == 2:
+ timeFmt = 3
+ else:
+ return None
+ else:
+ # Get start+startUpper
+ startTimeVals = getEventTime(startVal)
+ if startTimeVals is None:
+ return None
+ start, startUpper, timeFmt = startTimeVals
+ # Get end+endUpper
+ if endVal is not None:
+ endTimeVals = getEventTime(endVal)
+ if endTimeVals is None:
+ return None
+ end, endUpper, timeFmt2 = endTimeVals
+ if timeFmt != timeFmt2:
+ if timeFmt == 1 and timeFmt2 == 2:
+ timeFmt = 3
+ else:
+ return None
+ return start, startUpper, end, endUpper, timeFmt
+def getEventTime(dataVal: dict) -> tuple[int, int | None, int] | None:
+ """ Obtains event start (or end) data from a value object with type 'time' """
+ if 'type' not in dataVal or dataVal['type'] != 'time':
+ return None
+ # Get time data
+ try:
+ value = dataVal['value']
+ time = value['time']
+ match = re.match(r'([+-]\d+)-(\d+)-(\d+)', time)
+ if match is None:
+ return None
+ year, month, day = (int(x) for x in match.groups())
+ precision = value['precision']
+ calendarmodel = value['calendarmodel']
+ except (KeyError, ValueError):
+ return None
+ # Get start+startUpper
+ start: int
+ startUpper: int | None = None
+ timeFmt: int
+ if precision in [10, 11]: # 'month' or 'day' precision
+		if year < -4712: # If before 4713 BCE (start of the valid Julian-date period)
+			print(f'WARNING: Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}')
+ return None
+ day = max(day, 1) # With month-precision, entry may have a 'day' of 0
+ if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar'
+ start = jdPairToJd(gcal2jd(year, month, day))
+ if precision == 10:
+ startUpper = jdPairToJd(gcal2jd(year, month+1, 0))
+ timeFmt = 2
+ else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar')
+ start = jdPairToJd(jcal2jd(year, month, day))
+ if precision == 10:
+ startUpper = jdPairToJd(jcal2jd(year, month+1, 0))
+ timeFmt = 1
+ elif 0 <= precision < 10: # 'year' to 'gigaannum' precision
+ scale: int = 10 ** (9 - precision)
+ start = year // scale * scale
+ if scale > 1:
+ startUpper = start + scale - 1
+		if precision in [6, 7]: # Account for century/millennium ranges being from *1 to *0
+ start += 1
+ if startUpper is not None:
+ startUpper += 1
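+			# eg: year 1850 with precision 7 becomes start=1801, startUpper=1900 (the 19th century)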
+ timeFmt = 0
+ else:
+ return None
+ return start, startUpper, timeFmt
+def jdPairToJd(jdPair: tuple[int, int]) -> int:
+	""" Converts a (jd1, jd2) Julian-date pair from jdcal into an integer Julian day """
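+	# jdcal functions return pairs like (2400000.5, mjd), whose sum is the Julian date at midnight
+	# eg: gcal2jd(1969, 7, 20) sums to 2440422.5, which becomes 2440422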
+ return math.floor(sum(jdPair))
+
+# For using multiple processes
+def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str:
+ """ Forwards to readDumpChunk() (for use with pool.map()) """
+ return readDumpChunk(*params)
+def readDumpChunk(
+ procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str:
+	""" Reads dump lines that begin after startByte and at or before endByte, pickling
+		resulting entries to outFile. If startByte is -1, starts at the first line. """
+ # Read dump
+ entries = []
+ with indexed_bzip2.open(wikidataFile) as file:
+ # Load offsets file
+ with open(offsetsFile, 'rb') as file2:
+ offsets = pickle.load(file2)
+ file.set_block_offsets(offsets)
+ # Seek to chunk
+ if startByte != -1:
+ file.seek(startByte)
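+			# Skip the (likely partial) line containing startByte; the previous chunk's reader consumes it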
+ file.readline()
+ else:
+ startByte = 0 # Used for progress calculation
+ # Read lines
+ count = 0
+ while file.tell() <= endByte:
+ count += 1
+			if count % 10_000 == 0:
+ perc = (file.tell() - startByte) / (endByte - startByte) * 100
+ print(f'Thread {procId}: {perc:.2f}%')
+ entry = readDumpLine(file.readline())
+ if entry:
+ entries.append(entry)
+ # Output results into file
+ with open(outFile, 'wb') as file:
+ pickle.dump(entries, file)
+ return outFile
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ args = parser.parse_args()
+ #
+ multiprocessing.set_start_method('spawn')
+ genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS)