From 55ffd41c5a9257ea44afc3b6a44499359790c3fb Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sat, 1 Oct 2022 13:58:17 +1000 Subject: Add conversions for wikidata time data --- backend/hist_data/wikidata/README.md | 28 +++ backend/hist_data/wikidata/gen_events_data.py | 305 +++++++++++++++++++++----- 2 files changed, 275 insertions(+), 58 deletions(-) create mode 100644 backend/hist_data/wikidata/README.md (limited to 'backend') diff --git a/backend/hist_data/wikidata/README.md b/backend/hist_data/wikidata/README.md new file mode 100644 index 0000000..35dad34 --- /dev/null +++ b/backend/hist_data/wikidata/README.md @@ -0,0 +1,28 @@ +This directory holds files obtained/derived from [Wikidata](https://www.wikidata.org/). + +# Downloaded Files +- `latest-all.json.bz2`
+  Obtained from <https://dumps.wikimedia.org/wikidatawiki/entities/> (on 23/08/22).
+  Format info can be found at <https://doc.wikimedia.org/Wikibase/master/php/docs_topics_json.html>.
+
+# Other Files
+- `gen_events_data.py`<br>
+ Used to generate a database holding event information from the dump. +- `offsets.dat`
+  Holds bzip2 block offsets for the dump. Generated and used by
+  `gen_events_data.py` for parallel processing of the dump.
- `events.db`<br>
+ Generated by `gen_events_data.py`.
+ Has one table `events`: + - Columns: `id INT, title TEXT, start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT` + - Contains a Wikidata ID, Wikipedia title, start and end dates, and an event category. + - If `start_upper` is not NULL, `start` and `start_upper` denote an uncertain range of start times. + And similarly for 'end' and 'end_upper'. + - `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`. + - If 1, they denote a Julian date (with 0.5 removed to align with midnight). + This allows simple comparison of events with day-level precision, but only goes back to 4713 BCE. + - If 2, same as 1, but dates are preferably displayed using the Gregorian calendar, not the Julian calendar. + For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not. + - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. + For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. + - If 0, they denote a number of years CE (if positive) or BCE (if negative). diff --git a/backend/hist_data/wikidata/gen_events_data.py b/backend/hist_data/wikidata/gen_events_data.py index 84dbb5f..f4766f0 100755 --- a/backend/hist_data/wikidata/gen_events_data.py +++ b/backend/hist_data/wikidata/gen_events_data.py @@ -1,36 +1,79 @@ #!/usr/bin/python3 """ -Reads a wikidata JSON dump, looking for entities usable as historical events. -Writes results into a database. +Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such +entity, finds a start date (may be a range), optional end date, and event category (eg: normal +event, person with birth/death date, country, etc). Writes the results into a database. -The JSON dump contains an array of objects, each of which describes a -Wikidata item item1, and takes up it's own line. 
+The JSON dump contains an array of objects, each of which describes a Wikidata item item1, +and takes up it's own line. - Getting item1's Wikidata ID: item1['id'] (eg: "Q144") - Checking for a property: item1['claims'][prop1] == array1 - Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue'] 'idx1' indexes an array of statements + +Value objects have a 'type' and 'value' field. +Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates + An example: + {"value":{ + "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits + "timezone":0, # Unused + "before":0, # Unused + "after":0, # Unused + "precision":11, + "calendarmodel":"http://www.wikidata.org/entity/Q1985727" + }, "type":"time"} + 'precision' can be one of: + 0 - billion years (timestamp eg: -5000000000-00-00T00:00:00Z) + 1 - hundred million years + ... + 6 - millenium (warning: represents ranges from *1 to *0, eg: 1001-2000) + 7 - century (warning: represents ranges from *1 to *0, eg: 1801-1900) + 8 - decade (represents ranges from *0 to *9, eg: 2010-2019) + 9 - year + 10 - month + 11 - day + 'calendarmodel' can be one of: + "http://www.wikidata.org/entity/Q1985727" - proleptic Gregorian calendar + "http://www.wikidata.org/entity/Q1985786" - proleptic Julian calendar +Info about objects with type 'quantity' can be found at: https://www.wikidata.org/wiki/Help:Data_type#Quantity + An example: + {"value":{ + "amount":"+10.9", + "unit":"http://www.wikidata.org/entity/Q20764", + "lowerBound":"+170.1", # May be absent + "upperBound":"+470", # May be absent + }, "type":"quantity"} + 'unit' can be one of: + "http://www.wikidata.org/entity/Q577" - year + "http://www.wikidata.org/entity/Q24564698" - years old + "http://www.wikidata.org/entity/Q3013059" - kiloannum (1e3 yrs) + "http://www.wikidata.org/entity/Q20764" - megaannum (1e6 yrs) + "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs) """ # On Linux, 
running on the full dataset seems to make the processes hang when done. This was resolved by: +# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. # - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. # Related: https://bugs.python.org/issue6721 # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 -import os, io, re, argparse +import os, io, math, re, argparse import bz2, json, sqlite3 import multiprocessing, indexed_bzip2, pickle, tempfile +from jdcal import gcal2jd, jcal2jd WIKIDATA_FILE = 'latest-all.json.bz2' +DUMP_YEAR = 2022 # Used for converting 'age' values into dates OFFSETS_FILE = 'offsets.dat' DB_FILE = 'events.db' N_PROCS = 6 -# For handling Wikidata entity IDs +# For getting Wikidata entity IDs INSTANCE_OF = 'P31' EVENT_CTG: dict[str, dict[str, str]] = { - # Map from event-categories to dicts that map event-indicative entity names to their IDs + # Maps event-categories to dicts that map event-indicative entity names to their IDs # If the ID starts with 'Q', it expects entities to be an 'instance of' that ID # If the ID starts with 'P', it expects entities to have a property with that ID 'event': { @@ -69,7 +112,8 @@ EVENT_CTG: dict[str, dict[str, str]] = { }, } ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()} -EVENT_PROP: dict[str, str] = { # Maps event-start/end-indicative property names to their IDs +EVENT_PROP: dict[str, str] = { + # Maps event-start/end-indicative property names to their IDs 'start time': 'P580', 'end time': 'P582', 'point in time': 'P585', @@ -98,14 +142,20 @@ PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [ ('time of discovery or invention',), ('publication date',), ] +UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' 
values (found in type=quantity value objects) to numbers of years + 'http://www.wikidata.org/entity/Q577': 1, # 'year' + 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old' + 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs) + 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) + 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) +} + # For filtering lines before parsing JSON -TYPE_ID_REGEX = re.compile( - ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode()) -PROP_ID_REGEX = re.compile( - ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode()) +TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() +PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: - """ Reads the dump and writes info to db """ + """ Reads the dump and writes to db """ # Check db if os.path.exists(dbFile): print('ERROR: Database already exists') @@ -113,22 +163,20 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # Read dump, and write to db print('Writing to db') dbCon = sqlite3.connect(dbFile) - dbCon.execute('CREATE TABLE events (' \ - 'id INT PRIMARY KEY, title TEXT, start TEXT, end TEXT, time_type TEXT, ctg TEXT)') - dbCon.commit() - dbCon.close() + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE events (' \ + 'id INT NOT NULL PRIMARY KEY, title TEXT NOT NULL UNIQUE,' \ + 'start INT NOT NULL, start_upper INT, end INT, end_upper INT,' \ + 'fmt INT, ctg TEXT NOT NULL)') if nProcs == 1: - dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() with bz2.open(wikidataFile, mode='rb') as file: for lineNum, line in enumerate(file, 1): if lineNum % 1e4 == 0: print(f'At line {lineNum}') entry = 
readDumpLine(line) if entry: - dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?)', entry) - dbCon.commit() - dbCon.close() + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + # The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects) else: if not os.path.exists(offsetsFile): print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) @@ -146,23 +194,24 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # Each adjacent pair specifies a start+end byte index for readDumpChunk() print(f'- Chunk size: {chunkSz:,}') print('Starting processes to read dump') - dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() with tempfile.TemporaryDirectory() as tempDirName: with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: # Used maxtasksperchild=1 to free resources on task completion for outFile in pool.map(readDumpChunkOneParam, - ((i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'), - chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs))): + [(i, wikidataFile, offsetsFile, os.path.join(tempDirName, f'{i}.pickle'), + chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]): # Add entries from subprocess output file with open(outFile, 'rb') as file: - for e in pickle.load(file): - dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?)', e) - dbCon.commit() - dbCon.close() -def readDumpLine(lineBytes: bytes) -> tuple[int, str, str, str, str, str] | None: + for entry in pickle.load(file): + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + dbCon.commit() + dbCon.close() + +# For data extraction +def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: + """ Parses a Wikidata dump line, returning an entry to add to the db """ # Check with regex - if TYPE_ID_REGEX.search(lineBytes) is None and 
PROP_ID_REGEX.search(lineBytes) is None: + if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: return None # Decode try: @@ -174,41 +223,46 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, str, str, str, str] | None if 'claims' not in jsonItem: return None claims = jsonItem['claims'] + # Get wikidata ID, enwiki title + try: + itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' + itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] + except (KeyError, ValueError): + return None # Get event category eventCtg: str | None = None - if INSTANCE_OF not in claims: - return None - for statement in claims[INSTANCE_OF]: - try: - itemType = statement['mainsnak']['datavalue']['value']['id'] - except KeyError: - return None - if itemType in ID_TO_CTG: - eventCtg = ID_TO_CTG[itemType] - break + if INSTANCE_OF in claims: # Check types + for statement in claims[INSTANCE_OF]: + try: + itemType = statement['mainsnak']['datavalue']['value']['id'] + except KeyError: + return None + if itemType in ID_TO_CTG: + eventCtg = ID_TO_CTG[itemType] + break if not eventCtg: - for prop in claims: + for prop in claims: # Check props if prop in ID_TO_CTG: eventCtg = ID_TO_CTG[prop] if not eventCtg: return None - # Check for event props - start: str - end: str | None + # Check for event-start/end props + startVal: str + endVal: str | None timeType: str found = False for props in PROP_RULES: startProp: str = EVENT_PROP[props[0]] - endProp = None if len(props) < 2 else EVENT_PROP[props[1]] # type: ignore - needBoth = False if len(props) < 3 else props[2] # type: ignore + endProp = None if len(props) < 2 else EVENT_PROP[props[1]] + needBoth = False if len(props) < 3 else props[2] if startProp not in claims: continue try: - start = json.dumps(claims[startProp][0]['mainsnak']['datavalue'], separators=(',', ':')) - end = None + startVal = claims[startProp][0]['mainsnak']['datavalue'] + endVal = None if endProp and endProp in claims: - end = 
json.dumps(claims[endProp][0]['mainsnak']['datavalue'], separators=(',', ':')) - if needBoth and end == None: + endVal = claims[endProp][0]['mainsnak']['datavalue'] + elif needBoth: continue except (KeyError, ValueError): continue @@ -217,18 +271,153 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, str, str, str, str] | None break if not found: return None - # Get wikidata ID, enwiki title + # Convert time values + timeData = getTimeData(startVal, endVal, timeType) + if timeData is None: + return None + start, startUpper, end, endUpper, timeFmt = timeData + # + return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) +def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: + """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """ + # Values to return + start: int + startUpper: int | None = None + end: int | None = None + endUpper: int | None = None + timeFmt: int + # + if timeType == 'age estimated by a dating method': + if 'type' not in startVal or startVal['type'] != 'quantity': + return None + # Get quantity data + try: + value = startVal['value'] + amount = math.ceil(float(value['amount'])) + unit = value['unit'] + if 'lowerBound' in value and 'upperBound' in value: + lowerBound = math.ceil(float(value['lowerBound'])) + upperBound = math.ceil(float(value['upperBound'])) + else: + lowerBound = None + upperBound = None + except (KeyError, ValueError): + return None + # Get unit scale + if unit not in UNIT_TO_SCALE: + return None + scale = UNIT_TO_SCALE[unit] + # Get start+startUpper + if lowerBound is None: + start = DUMP_YEAR - amount * scale + else: + start = DUMP_YEAR - upperBound * scale + startUpper = DUMP_YEAR - lowerBound * scale + # Account for non-existence of 0 CE + if start <= 0: + start -= 1 + if startUpper is not None and startUpper <= 0: + startUpper -= 1 + # Adjust precision + start = start // scale * scale + if 
startUpper is not None: + startUpper = startUpper // scale * scale + elif scale > 1: + startUpper = start + scale - 1 + # + timeFmt = 0 + elif timeType == 'earliest date': + # Get start + startTimeVals = getEventTime(startVal) + if startTimeVals is None: + return None + start, _, timeFmt = startTimeVals + # Get end + endTimeVals = getEventTime(endVal) + if endTimeVals is None: + return None + end, _, timeFmt2 = endTimeVals + if timeFmt != timeFmt2: + if timeFmt == 1 and timeFmt2 == 2: + timeFmt = 3 + else: + return None + else: + # Get start+startUpper + startTimeVals = getEventTime(startVal) + if startTimeVals is None: + return None + start, startUpper, timeFmt = startTimeVals + # Get end+endUpper + if endVal is not None: + endTimeVals = getEventTime(endVal) + if endTimeVals is None: + return None + end, endUpper, timeFmt2 = endTimeVals + if timeFmt != timeFmt2: + if timeFmt == 1 and timeFmt2 == 2: + timeFmt = 3 + else: + return None + return start, startUpper, end, endUpper, timeFmt +def getEventTime(dataVal) -> tuple[int, int | None, int] | None: + """ Obtains event start (or end) data from a value object with type 'time' """ + if 'type' not in dataVal or dataVal['type'] != 'time': + return None + # Get time data try: - itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' - itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] + value = dataVal['value'] + time = value['time'] + match = re.match(r'([+-]\d+)-(\d+)-(\d+)', time) + if match is None: + return None + year, month, day = (int(x) for x in match.groups()) + precision = value['precision'] + calendarmodel = value['calendarmodel'] except (KeyError, ValueError): return None - # Return result - return (itemId, itemTitle, start, end, timeType, eventCtg) # type: ignore + # Get start+startUpper + start: int + startUpper: int | None = None + timeFmt: int + if precision in [10, 11]: # 'month' or 'day' precision + if year < -4712: # If before 4713 BCE (start of valid julian date period) + print(f'WARNING: 
Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}') + return None + day = max(day, 1) # With month-precision, entry may have a 'day' of 0 + if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar' + start = jdPairToJd(gcal2jd(year, month, day)) + if precision == 10: + startUpper = jdPairToJd(gcal2jd(year, month+1, 0)) + timeFmt = 2 + else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar') + start = jdPairToJd(jcal2jd(year, month, day)) + if precision == 10: + startUpper = jdPairToJd(jcal2jd(year, month+1, 0)) + timeFmt = 1 + elif 0 <= precision < 10: # 'year' to 'gigaannum' precision + scale: int = 10 ** (9 - precision) + start = year // scale * scale + if scale > 1: + startUpper = start + scale - 1 + if precision in [6, 7]: # Account for century/millenia ranges being from *1 to *0 + start += 1 + if startUpper is not None: + startUpper += 1 + timeFmt = 0 + else: + return None + return start, startUpper, timeFmt +def jdPairToJd(jdPair: tuple[int, int]) -> int: + """ Converts a julian-date-representing value from jdcal into an int """ + return math.floor(sum(jdPair)) + +# For using multiple processes def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str: """ Forwards to readDumpChunk() (for use with pool.map()) """ return readDumpChunk(*params) -def readDumpChunk(procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: +def readDumpChunk( + procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: """ Reads lines in the dump that begin after a start-byte, and not after an end byte. If startByte is -1, start at the first line. """ # Read dump -- cgit v1.2.3