diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:21:03 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:32:01 +1100 |
| commit | 0a9b2c2e5eca8a04e37fbdd423379882863237c2 (patch) | |
| tree | 1812bdb6bb13e4f76fdd7ef04075b291f775c213 /backend/hist_data/gen_events_data.py | |
| parent | 8321e2f92dbc073b8f1de87895d6620a2021b22e (diff) | |
Adjust backend coding style
Increase line spacing, add section comments, etc
Diffstat (limited to 'backend/hist_data/gen_events_data.py')
| -rwxr-xr-x | backend/hist_data/gen_events_data.py | 85 |
1 file changed, 63 insertions, 22 deletions
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py index 60402b5..453a9ad 100755 --- a/backend/hist_data/gen_events_data.py +++ b/backend/hist_data/gen_events_data.py @@ -59,26 +59,37 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 -# Took about 4.5 hours to run +# Note: Took about 4.5 hours to run -# Code used in unit testing (for resolving imports of modules within this directory) -import os, sys +# For unit testing, resolve imports of modules within this directory +import os +import sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) -# Standard imports + from typing import cast import argparse -import math, re -import io, bz2, json, sqlite3 -import indexed_bzip2, pickle, multiprocessing, tempfile -# Local imports +import math +import re +import io +import bz2 +import json +import sqlite3 + +import indexed_bzip2 +import pickle +import multiprocessing +import tempfile + from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR -# Constants +# ========== Constants ========== + WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2') OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat') DB_FILE = 'data.db' N_PROCS = 6 # Number of processes to use + # For getting Wikidata entity IDs INSTANCE_OF = 'P31' EVENT_CTG: dict[str, dict[str, str]] = { @@ -173,24 +184,28 @@ UNIT_TO_SCALE: dict[str, int] = { 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) } + # For filtering lines before parsing JSON TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + 
'"):\[{"mainsnak"').encode() +# ========== Main function ========== + def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: """ Reads the dump and writes to db """ - # Check db if os.path.exists(dbFile): print('ERROR: Database already exists') return - # Read dump, and write to db - print('Writing to db') + + print('Opening db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)') dbCur.execute('CREATE INDEX events_id_start_idx ON events(id, start)') dbCur.execute('CREATE INDEX events_title_nocase_idx ON events(title COLLATE NOCASE)') + if nProcs == 1: with bz2.open(wikidataFile, mode='rb') as file: for lineNum, line in enumerate(file, 1): @@ -206,6 +221,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) + print('Allocating file into chunks') fileSz: int # Was about 1.4 TB with indexed_bzip2.open(wikidataFile) as file: @@ -216,6 +232,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1] # Each adjacent pair specifies a start+end byte index for readDumpChunk() print(f'- Chunk size: {chunkSz:,}') + print('Starting processes to read dump') with tempfile.TemporaryDirectory() as tempDirName: with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: @@ -227,15 +244,19 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No with open(outFile, 'rb') as file: for item in pickle.load(file): dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item) + + print('Closing db') dbCon.commit() dbCon.close() -# For data extraction +# ========== For data extraction 
========== + def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: """ Parses a Wikidata dump line, returning an entry to add to the db """ # Check with regexes if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: return None + # Decode try: line = lineBytes.decode('utf-8').rstrip().rstrip(',') @@ -246,12 +267,14 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non if 'claims' not in jsonItem: return None claims = jsonItem['claims'] + # Get wikidata ID, enwiki title try: itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] except (KeyError, ValueError): return None + # Get event category eventCtg: str | None = None if INSTANCE_OF in claims: # Check types @@ -269,6 +292,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non eventCtg = ID_TO_CTG[prop] if not eventCtg: return None + # Check for event-start/end props startVal: str endVal: str | None @@ -297,13 +321,15 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non break if not found: return None + # Convert time values timeData = getTimeData(startVal, endVal, timeType) if timeData is None: return None start, startUpper, end, endUpper, timeFmt = timeData - # + return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) + def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: """ Obtains event start+end data from 'datavalue' objects with type 'time', according to 'timeType' """ # Values to return @@ -312,13 +338,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | end: int | None = None endUpper: int | None = None timeFmt: int - # + if timeType == 'age estimated by a dating method': + # Note: Ages are interpreted relative to 1 AD. 
Using a year like 2020 results in + # 'datedness' and undesirable small offsets to values like '1 billion years old'. if 'type' not in startVal or startVal['type'] != 'quantity': return None - # Get quantity data - # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in - # 'datedness' and undesirable small offsets to values like '1 billion years old'. + try: value = startVal['value'] amount = math.ceil(float(value['amount'])) @@ -331,23 +357,26 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | upperBound = None except (KeyError, ValueError): return None - # Get unit scale + + # Get scale if unit not in UNIT_TO_SCALE: return None scale = UNIT_TO_SCALE[unit] + # Get start+startUpper if lowerBound is None: start = -amount * scale else: start = -cast(int, upperBound) * scale startUpper = -lowerBound * scale + # Adjust precision start = start // scale * scale if startUpper is not None: startUpper = startUpper // scale * scale elif scale > 1: startUpper = start + scale - 1 - # + timeFmt = 0 elif timeType == 'earliest date': # Get start @@ -355,6 +384,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | if startTimeVals is None: return None start, _, timeFmt = startTimeVals + # Get end endTimeVals = getEventTime(endVal) if endTimeVals is None: @@ -371,6 +401,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | if startTimeVals is None: return None start, startUpper, timeFmt = startTimeVals + # Get end+endUpper if endVal is not None: endTimeVals = getEventTime(endVal) @@ -383,6 +414,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | else: return None return start, startUpper, end, endUpper, timeFmt + def getEventTime(dataVal) -> tuple[int, int | None, int] | None: """ Obtains event start (or end) data from a 'datavalue' object with type 'time' """ if 'type' not in dataVal or dataVal['type'] != 'time': @@ -399,6 
+431,7 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None: calendarmodel = value['calendarmodel'] except (KeyError, ValueError): return None + # Get start+startUpper start: int startUpper: int | None = None @@ -430,12 +463,15 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None: timeFmt = 0 else: return None + return start, startUpper, timeFmt -# For using multiple processes +# ========== For using multiple processes ========== + def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str: """ Forwards to readDumpChunk() (for use with pool.map()) """ return readDumpChunk(*params) + def readDumpChunk( procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: """ Reads lines in the dump that begin after a start-byte, and not after an end byte. @@ -447,12 +483,14 @@ def readDumpChunk( with open(offsetsFile, 'rb') as file2: offsets = pickle.load(file2) file.set_block_offsets(offsets) + # Seek to chunk if startByte != -1: file.seek(startByte) file.readline() else: startByte = 0 # Used for progress calculation + # Read lines count = 0 while file.tell() <= endByte: @@ -463,14 +501,17 @@ def readDumpChunk( entry = readDumpLine(file.readline()) if entry: entries.append(entry) + # Output results into file with open(outFile, 'wb') as file: pickle.dump(entries, file) return outFile +# ========== Main block ========== + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + multiprocessing.set_start_method('spawn') genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS) |
