diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-02 14:51:53 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-02 14:51:53 +1100 |
| commit | 56369bccd977ac726bef70895883e79da4e1edd8 (patch) | |
| tree | 67a894fe1579f2da150f0162ccbdc8a0a19ef9be /backend/hist_data/gen_events_data.py | |
| parent | 0e5e46cedaaeacf59cfd0f2e30c1ae6923466870 (diff) | |
Adjust wikidata event specifiers
Do minor refactors:
- Swap fmt=1 and fmt=2 in 'events' table
- Make documentation consistently use BC and AD
- Import argparse at start of scripts
Diffstat (limited to 'backend/hist_data/gen_events_data.py')
| -rwxr-xr-x | backend/hist_data/gen_events_data.py | 101 |
1 files changed, 52 insertions, 49 deletions
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py index f054f76..51d6940 100755 --- a/backend/hist_data/gen_events_data.py +++ b/backend/hist_data/gen_events_data.py @@ -2,8 +2,8 @@ """ Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such -entity, finds a start date (may be a range), optional end date, and event category (eg: normal -event, person with birth/death date, country, etc). Writes the results into a database. +entity, finds a start date (may be a range), optional end date, and event category (eg: discovery, +person with birth/death date, etc). Writes the results into a database. The JSON dump contains an array of objects, each of which describes a Wikidata item item1, and takes up it's own line. @@ -12,11 +12,11 @@ and takes up it's own line. - Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue'] 'idx1' indexes an array of statements -Value objects have a 'type' and 'value' field. +'datavalue' objects have a 'type' and 'value' field. Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates An example: {"value":{ - "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits (-0001 means 1 BCE) + "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits (-0001 means 1 BC) "timezone":0, # Unused "before":0, # Unused "after":0, # Unused @@ -52,30 +52,31 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs) """ -# On Linux, running on the full dataset seems to make the processes hang when done. This was resolved by: -# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. -# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. 
+# On Linux, running on the full dataset seems to make the processes hang when done. This was resolved by: +# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. +# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. # Related: https://bugs.python.org/issue6721 # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 -# Enable unit testing code to, when running this script, resolve imports of modules within this directory +# Code used in unit testing (for resolving imports of modules within this directory) import os, sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) +# Standard imports +import argparse +import math, re +import io, bz2, json, sqlite3 +import indexed_bzip2, pickle, multiprocessing, tempfile +# Local imports +from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR -import io, math, re, argparse -import bz2, json, sqlite3 -import multiprocessing, indexed_bzip2, pickle, tempfile -# Modules in this directory -from cal import gregorianToJdn, julianToJdn - +# Constants WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2') DUMP_YEAR = 2022 # Used for converting 'age' values into dates OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat') DB_FILE = 'data.db' -N_PROCS = 6 - +N_PROCS = 6 # Number of processes to use # For getting Wikidata entity IDs INSTANCE_OF = 'P31' EVENT_CTG: dict[str, dict[str, str]] = { @@ -91,31 +92,33 @@ EVENT_CTG: dict[str, dict[str, str]] = { 'recurring event': 'Q15275719', 'event sequence': 'Q15900616', 'incident': 'Q18669875', + 'project': 'Q170584', + 'number of deaths': 'P1120', }, - 'human': { - 'human': 'Q5', - }, - 'country': { + 'place': { 'country': 'Q6256', 'state': 'Q7275', 'sovereign state': 'Q3624078', + 'city': 'Q515', + 'tourist 
attraction': 'Q570116', + 'heritage site': 'Q358', + 'terrestrial planet': 'Q128207', + 'navigational star': 'Q108171565', + 'G-type main-sequence star': 'Q5864', + }, + 'organism': { + 'taxon': 'Q16521', + }, + 'person': { + 'human': 'Q5', + }, + 'work': { + 'creator': 'P170', + 'genre': 'P136', }, 'discovery': { 'time of discovery or invention': 'P575', }, - 'media': { - 'work of art': 'Q4502142', - 'literary work': 'Q7725634', - 'comic book series': 'Q14406742', - 'painting': 'Q3305213', - 'musical work/composition': 'Q105543609', - 'film': 'Q11424', - 'animated film': 'Q202866', - 'television series': 'Q16401', - 'anime television series': 'Q63952888', - 'video game': 'Q7889', - 'video game series': 'Q7058673', - }, } ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()} EVENT_PROP: dict[str, str] = { @@ -148,14 +151,14 @@ PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [ ('time of discovery or invention',), ('publication date',), ] -UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' values (found in type=quantity value objects) to numbers of years +UNIT_TO_SCALE: dict[str, int] = { + # Maps 'unit' values (found in 'datavalue' objects with type=quantity) to numbers of years 'http://www.wikidata.org/entity/Q577': 1, # 'year' 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old' 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs) 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) } - # For filtering lines before parsing JSON TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() @@ -183,12 +186,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # The 'OR IGNORE' is for a few entries 
that share the same title (and seem like redirects) else: if not os.path.exists(offsetsFile): - print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) + print('Creating offsets file') # For indexed access used in multiprocessing (may take about 7 hours) with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) print('Allocating file into chunks') - fileSz: int # About 1.4 TB + fileSz: int # Was about 1.4 TB with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'rb') as file2: file.set_block_offsets(pickle.load(file2)) @@ -206,15 +209,15 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]): # Add entries from subprocess output file with open(outFile, 'rb') as file: - for entry in pickle.load(file): - dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + for item in pickle.load(file): + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item) dbCon.commit() dbCon.close() # For data extraction def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: """ Parses a Wikidata dump line, returning an entry to add to the db """ - # Check with regex + # Check with regexes if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: return None # Decode @@ -283,7 +286,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non # return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: - """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """ + """ Obtains event start+end data from 'datavalue' objects with type 
'time', according to 'timeType' """ # Values to return start: int startUpper: int | None = None @@ -317,7 +320,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | else: start = DUMP_YEAR - upperBound * scale startUpper = DUMP_YEAR - lowerBound * scale - # Account for non-existence of 0 CE + # Account for non-existence of 0 AD if start <= 0: start -= 1 if startUpper is not None and startUpper <= 0: @@ -342,7 +345,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | return None end, _, timeFmt2 = endTimeVals if timeFmt != timeFmt2: - if timeFmt == 1 and timeFmt2 == 2: + if timeFmt == 2 and timeFmt2 == 1: timeFmt = 3 else: return None @@ -359,13 +362,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | return None end, endUpper, timeFmt2 = endTimeVals if timeFmt != timeFmt2: - if timeFmt == 1 and timeFmt2 == 2: + if timeFmt == 2 and timeFmt2 == 1: timeFmt = 3 else: return None return start, startUpper, end, endUpper, timeFmt def getEventTime(dataVal) -> tuple[int, int | None, int] | None: - """ Obtains event start (or end) data from a value object with type 'time' """ + """ Obtains event start (or end) data from a 'datavalue' object with type 'time' """ if 'type' not in dataVal or dataVal['type'] != 'time': return None # Get time data @@ -385,20 +388,20 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None: startUpper: int | None = None timeFmt: int if precision in [10, 11]: # 'month' or 'day' precision - if year < -4713: # If before 4713 BCE (start of valid julian date period) - print(f'WARNING: Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}') + if year < MIN_CAL_YEAR: # If before start of valid julian date period + print(f'WARNING: Skipping sub-year-precision date before {-MIN_CAL_YEAR} BC: {json.dumps(dataVal)}') return None day = max(day, 1) # With month-precision, entry may have a 'day' of 0 if calendarmodel == 
'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar' start = gregorianToJdn(year, month, day) if precision == 10: startUpper = gregorianToJdn(year, month+1, 0) - timeFmt = 2 + timeFmt = 1 else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar') start = julianToJdn(year, month, day) if precision == 10: startUpper = julianToJdn(year, month+1, 0) - timeFmt = 1 + timeFmt = 2 elif 0 <= precision < 10: # 'year' to 'gigaannum' precision scale: int = 10 ** (9 - precision) start = year // scale * scale |
