diff options
Diffstat (limited to 'backend/tests/wikidata/test_gen_events_data.py')
| -rw-r--r-- | backend/tests/wikidata/test_gen_events_data.py | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/backend/tests/wikidata/test_gen_events_data.py b/backend/tests/wikidata/test_gen_events_data.py new file mode 100644 index 0000000..faa19c9 --- /dev/null +++ b/backend/tests/wikidata/test_gen_events_data.py @@ -0,0 +1,171 @@ +import unittest +import tempfile, os, json, bz2, pickle, indexed_bzip2 + +from tests.common import readTestDbTable +from hist_data.wikidata.gen_events_data import genData + +def runGenData(wikiItemArray: str, preGenOffsets: bool, nProcs: int): + """ Sets up wikidata file to be read by genData(), runs it, and returns the output database's contents. + If 'preGenOffsets' is True, generates a bz2 offsets file before running genData(). """ + with tempfile.TemporaryDirectory() as tempDir: + # Create temp wikidata file + wikidataFile = os.path.join(tempDir, 'dump.json.bz2') + with bz2.open(wikidataFile, mode='wb') as file: + file.write(b'[\n') + for i in range(len(wikiItemArray)): + file.write(json.dumps(wikiItemArray[i], separators=(',',':')).encode()) + if i < len(wikiItemArray) - 1: + file.write(b',') + file.write(b'\n') + file.write(b']\n') + # Create temp offsets file if requested + offsetsFile = os.path.join(tempDir, 'offsets.dat') + if preGenOffsets: + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'wb') as file2: + pickle.dump(file.block_offsets(), file2) + # Run genData() + dbFile = os.path.join(tempDir, 'events.db') + genData(wikidataFile, offsetsFile, dbFile, nProcs) + # Read db + return readTestDbTable(dbFile, 'SELECT * FROM events') + +class TestGenData(unittest.TestCase): + def setUp(self): + self.maxDiff = None # Remove output-diff size limit + self.testWikiItems = [ + { + 'id': 'Q1', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q1656682'}}}}], # 'instance of' 'event' + 'P585': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'point in time' + 'time':'+1950-12-00T00:00:00Z', + 'timezone':0, + 'before':0, + 'after':0, + 'precision':10, # month precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar' + }}}}], + 'P141': [{'mainsnak': {'datavalue': {'value': {'id': 'Q211005'}}}}], # Other random property + }, + 'sitelinks': {'enwiki': {'title': 'event one'}}, + }, + { + 'id': 'Q2', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q5'}}}}], # 'instance of' 'human' + 'P569': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'date of birth' + 'time':'+2002-11-02T00:00:00Z', + 'precision':11, # day precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' + }}}}], + 'P570': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'date of death' + 'time':'+2010-06-21T00:00:01Z', + 'timezone':1, + 'precision':11, + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'Human One'}}, + }, + { + 'id': 'Q3', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7275'}}}}], # 'instance of' 'state' + 'P580': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'start time' + 'time':'-1001-00-00T00:00:00Z', + 'precision':9, # year precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' + }}}}], + 'P582': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'end time' + 'time':'-99-00-00T00:00:01Z', + 'precision':9, + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'country one'}}, + }, + { + 'id': 'Q4', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q6256'}}}}], # 'instance of' 'country' + 'P7584': [{'mainsnak': {'datavalue': {'type': 'quantity', 'value': { + # 'age estimated by a dating method' + "amount":"+10.9", + "unit":"http://www.wikidata.org/entity/Q3013059", # kiloannum + "lowerBound":"+9", + "upperBound":"+11", + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'country two'}}, + }, + { + 'id': 'Q5', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11019'}}}}], # 'instance of' 'machine' + 'P575': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'time of discovery or invention' + 'time':'+0101-00-00T00:00:01Z', + 'precision':6, # millenium precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'discovery one'}}, + }, + { + 'id': 'Q6', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7725634'}}}}], # 'instance of' 'literary work' + 'P1319': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'earliest date' + 'time':'-0020-08-01T00:00:00Z', + 'precision':11, # day precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' + }}}}], + 'P1326': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'latest date' + 'time':'-0020-09-01T00:00:00Z', + 'precision':11, + 'calendarmodel':'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'media one'}}, + }, + { + 'id': 'Q7', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11424'}}}}], # 'instance of' 'film' + 'P577': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'publication date' + 'time':'-2103-00-00T00:00:00Z', + 'precision':7, # century precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'media two'}}, + }, + { + 'id': 'Q8', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # 'instance of' 'taxon' + } + # No title + }, + ] + self.expectedRows = { + (1, 'event one', 2433616, 2433646, None, None, 2, 'event'), + (2, 'Human One', 2452593, None, 2455368, None, 3, 'human'), + (3, 'country one', -1001, None, -99, None, 0, 'country'), + (4, 'country two', -9000, -7000, None, None, 0, 'country'), + (5, 'discovery one', 1, 1000, None, None, 0, 'discovery'), + (6, 'media one', 1713965, None, 1713996, None, 1, 'media'), + (7, 'media two', -2199, -2100, None, None, 0, 'media'), + } + def test_wikiItems(self): + rows = runGenData(self.testWikiItems, False, 1) + self.assertEqual(rows, self.expectedRows) + def test_empty_dump(self): + rows = runGenData([{}], False, 1) + self.assertEqual(rows, set()) + def test_multiprocessing(self): + rows = runGenData(self.testWikiItems, False, 4) + self.assertEqual(rows, self.expectedRows) + def test_existing_offsets(self): + rows = runGenData(self.testWikiItems, True, 3) + self.assertEqual(rows, self.expectedRows) |
