Diffstat (limited to 'backend/hist_data/gen_events_data.py')
-rwxr-xr-x  backend/hist_data/gen_events_data.py  85
1 file changed, 63 insertions, 22 deletions
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py
index 60402b5..453a9ad 100755
--- a/backend/hist_data/gen_events_data.py
+++ b/backend/hist_data/gen_events_data.py
@@ -59,26 +59,37 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or
# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
# Possibly related: https://github.com/python/cpython/issues/72882
-# Took about 4.5 hours to run
+# Note: Took about 4.5 hours to run
-# Code used in unit testing (for resolving imports of modules within this directory)
-import os, sys
+# For unit testing, resolve imports of modules within this directory
+import os
+import sys
parentDir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parentDir)
-# Standard imports
+
from typing import cast
import argparse
-import math, re
-import io, bz2, json, sqlite3
-import indexed_bzip2, pickle, multiprocessing, tempfile
-# Local imports
+import math
+import re
+import io
+import bz2
+import json
+import sqlite3
+
+import indexed_bzip2
+import pickle
+import multiprocessing
+import tempfile
+
from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR
-# Constants
+# ========== Constants ==========
+
WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2')
OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat')
DB_FILE = 'data.db'
N_PROCS = 6 # Number of processes to use
+
# For getting Wikidata entity IDs
INSTANCE_OF = 'P31'
EVENT_CTG: dict[str, dict[str, str]] = {
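
The pool.map() choice mentioned in the comment above follows a common pattern: wrap the multi-argument worker so a single tuple can be mapped over. A minimal runnable sketch (the worker and arguments are illustrative, not from this file):

import multiprocessing

def workOneParam(params: tuple[int, int]) -> int:
    """ Forwards a tuple of arguments to the real work (cf. readDumpChunkOneParam below) """
    lo, hi = params
    return sum(range(lo, hi))

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    # maxtasksperchild=1 recycles each worker after one task, bounding memory use
    with multiprocessing.Pool(processes=2, maxtasksperchild=1) as pool:
        print(pool.map(workOneParam, [(0, 10), (10, 20)]))  # [45, 145]
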
@@ -173,24 +184,28 @@ UNIT_TO_SCALE: dict[str, int] = {
'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs)
'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs)
}
+
# For filtering lines before parsing JSON
TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode()
PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode()
+# ========== Main function ==========
+
def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
""" Reads the dump and writes to db """
- # Check db
if os.path.exists(dbFile):
print('ERROR: Database already exists')
return
- # Read dump, and write to db
- print('Writing to db')
+
+ print('Opening db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
dbCur.execute('CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \
'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)')
dbCur.execute('CREATE INDEX events_id_start_idx ON events(id, start)')
dbCur.execute('CREATE INDEX events_title_nocase_idx ON events(title COLLATE NOCASE)')
+
if nProcs == 1:
with bz2.open(wikidataFile, mode='rb') as file:
for lineNum, line in enumerate(file, 1):
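
The TYPE_ID_REGEX/PROP_ID_REGEX pre-filter built above exists so that most lines never reach json.loads(): a cheap bytes-level re.search() rejects them first. The same construction on a toy ID table (IDs made up):

import re

idToCtg = {'Q123': 'event', 'P456': 'event'}  # hypothetical stand-in for ID_TO_CTG
typeIdRegex = ('"id":(?:"' + '"|"'.join([i for i in idToCtg if i.startswith('Q')]) + '")').encode()

line = b'{"id":"Q123","claims":{}},'
print(re.search(typeIdRegex, line) is not None)  # True: only now parse the JSON
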
@@ -206,6 +221,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
with indexed_bzip2.open(wikidataFile) as file:
with open(offsetsFile, 'wb') as file2:
pickle.dump(file.block_offsets(), file2)
+
print('Allocating file into chunks')
fileSz: int # Was about 1.4 TB
with indexed_bzip2.open(wikidataFile) as file:
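
For context, pickling the block offsets lets later runs seek into the archive without re-scanning it. A sketch of the round trip, using the same calls as this script (paths illustrative):

import pickle
import indexed_bzip2

# One-time pass: save the block index
with indexed_bzip2.open('latest-all.json.bz2') as f:
    with open('offsets.dat', 'wb') as out:
        pickle.dump(f.block_offsets(), out)

# Later runs: restore the index, after which seek() is cheap
with indexed_bzip2.open('latest-all.json.bz2') as f:
    with open('offsets.dat', 'rb') as inp:
        f.set_block_offsets(pickle.load(inp))
    f.seek(10**9)  # jump deep into the decompressed stream
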
@@ -216,6 +232,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1]
# Each adjacent pair specifies a start+end byte index for readDumpChunk()
print(f'- Chunk size: {chunkSz:,}')
+
print('Starting processes to read dump')
with tempfile.TemporaryDirectory() as tempDirName:
with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool:
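
A worked example of the chunk-boundary arithmetic, assuming chunkSz = fileSz // nProcs as the surrounding code suggests (toy sizes):

fileSz = 100  # the real decompressed dump was about 1.4 TB
nProcs = 4
chunkSz = fileSz // nProcs  # 25
chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz - 1]
print(chunkIdxs)  # [-1, 25, 50, 75, 99]
# Adjacent pairs become readDumpChunk()'s (startByte, endByte):
# (-1, 25), (25, 50), (50, 75), (75, 99)
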
@@ -227,15 +244,19 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
with open(outFile, 'rb') as file:
for item in pickle.load(file):
dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item)
+
+ print('Closing db')
dbCon.commit()
dbCon.close()
-# For data extraction
+# ========== For data extraction ==========
+
def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None:
""" Parses a Wikidata dump line, returning an entry to add to the db """
# Check with regexes
if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None:
return None
+
# Decode
try:
line = lineBytes.decode('utf-8').rstrip().rstrip(',')
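
The double rstrip() above reflects the dump's layout: it is one huge JSON array with one entity per line, so nearly every line carries a trailing comma that json.loads() would reject. A tiny illustration:

import json

lineBytes = b'{"id":"Q42","claims":{}},\n'  # typical entity line (toy content)
line = lineBytes.decode('utf-8').rstrip().rstrip(',')
print(json.loads(line))  # {'id': 'Q42', 'claims': {}}
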
@@ -246,12 +267,14 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
if 'claims' not in jsonItem:
return None
claims = jsonItem['claims']
+
# Get wikidata ID, enwiki title
try:
itemId = int(jsonItem['id'][1:]) # Skip initial 'Q'
itemTitle: str = jsonItem['sitelinks']['enwiki']['title']
except (KeyError, ValueError):
return None
+
# Get event category
eventCtg: str | None = None
if INSTANCE_OF in claims: # Check types
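
The ID/title extraction above, run on a hand-made entity; entities lacking an enwiki sitelink raise KeyError and are skipped:

jsonItem = {
    'id': 'Q42',
    'sitelinks': {'enwiki': {'title': 'Douglas Adams'}},
}
itemId = int(jsonItem['id'][1:])  # 42: drop the leading 'Q'
itemTitle = jsonItem['sitelinks']['enwiki']['title']
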
@@ -269,6 +292,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
eventCtg = ID_TO_CTG[prop]
if not eventCtg:
return None
+
# Check for event-start/end props
startVal: str
endVal: str | None
@@ -297,13 +321,15 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
break
if not found:
return None
+
# Convert time values
timeData = getTimeData(startVal, endVal, timeType)
if timeData is None:
return None
start, startUpper, end, endUpper, timeFmt = timeData
- #
+
return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg)
+
def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None:
""" Obtains event start+end data from 'datavalue' objects with type 'time', according to 'timeType' """
# Values to return
@@ -312,13 +338,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
end: int | None = None
endUpper: int | None = None
timeFmt: int
- #
+
if timeType == 'age estimated by a dating method':
+ # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in
+ # 'datedness' and undesirable small offsets to values like '1 billion years old'.
if 'type' not in startVal or startVal['type'] != 'quantity':
return None
- # Get quantity data
- # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in
- # 'datedness' and undesirable small offsets to values like '1 billion years old'.
+
try:
value = startVal['value']
amount = math.ceil(float(value['amount']))
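
The fields read above come from a 'quantity' datavalue; its rough shape, with illustrative numbers (bounds are optional, hence the KeyError fallback):

import math

startVal = {
    'type': 'quantity',
    'value': {
        'amount': '+66.5',
        'unit': 'http://www.wikidata.org/entity/Q20764',  # megaannum
        'lowerBound': '+65.0',  # optional
        'upperBound': '+68.0',  # optional
    },
}
amount = math.ceil(float(startVal['value']['amount']))  # 67
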
@@ -331,23 +357,26 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
upperBound = None
except (KeyError, ValueError):
return None
- # Get unit scale
+
+ # Get scale
if unit not in UNIT_TO_SCALE:
return None
scale = UNIT_TO_SCALE[unit]
+
# Get start+startUpper
if lowerBound is None:
start = -amount * scale
else:
start = -cast(int, upperBound) * scale
startUpper = -lowerBound * scale
+
# Adjust precision
start = start // scale * scale
if startUpper is not None:
startUpper = startUpper // scale * scale
elif scale > 1:
startUpper = start + scale - 1
- #
+
timeFmt = 0
elif timeType == 'earliest date':
# Get start
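
A worked example of the scale and precision arithmetic above (Python's floor division rounds toward negative infinity, which is what makes this a safe "round earlier" for negative years):

scale = 10**6                     # megaannum
start = -67_123_456
start = start // scale * scale    # -68000000
startUpper = start + scale - 1    # -67000001, when no explicit bounds exist
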
@@ -355,6 +384,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
if startTimeVals is None:
return None
start, _, timeFmt = startTimeVals
+
# Get end
endTimeVals = getEventTime(endVal)
if endTimeVals is None:
@@ -371,6 +401,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
if startTimeVals is None:
return None
start, startUpper, timeFmt = startTimeVals
+
# Get end+endUpper
if endVal is not None:
endTimeVals = getEventTime(endVal)
@@ -383,6 +414,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
else:
return None
return start, startUpper, end, endUpper, timeFmt
+
def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
""" Obtains event start (or end) data from a 'datavalue' object with type 'time' """
if 'type' not in dataVal or dataVal['type'] != 'time':
@@ -399,6 +431,7 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
calendarmodel = value['calendarmodel']
except (KeyError, ValueError):
return None
+
# Get start+startUpper
start: int
startUpper: int | None = None
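
For reference, the rough shape of a 'time' datavalue as getEventTime() consumes it; the precision codes and calendar-model entity are per the Wikidata data model as I understand it, so treat those comments as assumptions:

dataVal = {
    'type': 'time',
    'value': {
        'time': '+1969-07-20T00:00:00Z',
        'precision': 11,  # assumed: 11 = day, 10 = month, 9 = year
        'calendarmodel': 'http://www.wikidata.org/entity/Q1985727',  # assumed: proleptic Gregorian
    },
}
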
@@ -430,12 +463,15 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
timeFmt = 0
else:
return None
+
return start, startUpper, timeFmt
-# For using multiple processes
+# ========== For using multiple processes ==========
+
def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str:
""" Forwards to readDumpChunk() (for use with pool.map()) """
return readDumpChunk(*params)
+
def readDumpChunk(
procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str:
""" Reads lines in the dump that begin after a start-byte, and not after an end byte.
@@ -447,12 +483,14 @@ def readDumpChunk(
with open(offsetsFile, 'rb') as file2:
offsets = pickle.load(file2)
file.set_block_offsets(offsets)
+
# Seek to chunk
if startByte != -1:
file.seek(startByte)
file.readline()
else:
startByte = 0 # Used for progress calculation
+
# Read lines
count = 0
while file.tell() <= endByte:
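
The seek-then-readline idiom above is what keeps chunks from splitting or duplicating lines: each process discards the partial line it lands on, because the previous chunk's `while file.tell() <= endByte` loop reads through the line straddling its end byte. A standalone illustration:

import io

buf = io.BytesIO(b'alpha\nbeta\ngamma\n')  # stand-in for the dump
buf.seek(3)            # lands mid-'alpha', like an arbitrary chunk boundary
buf.readline()         # discard the partial line; the previous chunk owns it
print(buf.readline())  # b'beta\n': the first line fully inside this chunk
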
@@ -463,14 +501,17 @@ def readDumpChunk(
entry = readDumpLine(file.readline())
if entry:
entries.append(entry)
+
# Output results into file
with open(outFile, 'wb') as file:
pickle.dump(entries, file)
return outFile
+# ========== Main block ==========
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
multiprocessing.set_start_method('spawn')
genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS)
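
Once generated, data.db can be queried directly; a hypothetical lookup using the columns from the CREATE TABLE statement above (the search pattern is illustrative):

import sqlite3

dbCon = sqlite3.connect('data.db')
dbCur = dbCon.cursor()
dbCur.execute(
    'SELECT id, title, start, end, ctg FROM events '
    'WHERE title LIKE ? ORDER BY start LIMIT 5', ('%rome%',))
for row in dbCur.fetchall():
    print(row)
dbCon.close()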