aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore8
-rw-r--r--backend/hist_data/__init__.py0
-rw-r--r--backend/hist_data/wikidata/__init__.py0
-rw-r--r--backend/tests/__init__.py0
-rw-r--r--backend/tests/common.py49
-rw-r--r--backend/tests/wikidata/__init__.py0
-rw-r--r--backend/tests/wikidata/test_gen_events_data.py171
7 files changed, 228 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3127a4c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/package-lock.json
+/node_modules/
+/dist/
+__pycache__
+.venv
+
+/backend/hist_data/wikidata/events.db
+/backend/hist_data/wikidata/*.json.bz2
diff --git a/backend/hist_data/__init__.py b/backend/hist_data/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/hist_data/__init__.py
diff --git a/backend/hist_data/wikidata/__init__.py b/backend/hist_data/wikidata/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/hist_data/wikidata/__init__.py
diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/tests/__init__.py
diff --git a/backend/tests/common.py b/backend/tests/common.py
new file mode 100644
index 0000000..cb455e4
--- /dev/null
+++ b/backend/tests/common.py
@@ -0,0 +1,49 @@
+"""
+Utilities for testing
+"""
+
+from typing import Any
+import bz2, gzip, sqlite3
+
def createTestFile(filename: str, content: str) -> None:
    """ Creates (or overwrites) a text file with the given name and contents.
    The text is always written as UTF-8, so the bytes on disk do not depend on the platform's locale. """
    # Without an explicit encoding, open() falls back to the locale's preferred
    # encoding, which can reject or mis-encode non-ASCII fixture text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)
+
def readTestFile(filename: str) -> str:
    """ Returns the contents of the text file with the given name.
    The file is always decoded as UTF-8, so the result does not depend on the platform's locale. """
    # Without an explicit encoding, open() falls back to the locale's preferred
    # encoding, which can mis-decode non-ASCII output
    with open(filename, encoding='utf-8') as file:
        return file.read()
+
def createTestBz2(filename: str, content: str) -> None:
    """ Writes 'content', encoded as UTF-8, into a bzip2-compressed file called 'filename' """
    with bz2.open(filename, 'wb') as archive:
        # Binary mode, so the text has to be turned into bytes first
        rawBytes = content.encode('utf-8')
        archive.write(rawBytes)
+
def createTestGzip(filename: str, content: str) -> None:
    """ Creates a gzip file with the given name and contents.
    The text is encoded as UTF-8 before compression, regardless of the platform's locale. """
    # Text mode without an explicit encoding would use the locale's preferred
    # encoding, making the compressed fixture bytes platform-dependent
    with gzip.open(filename, mode='wt', encoding='utf-8') as file:
        file.write(content)
+
+TableRows = set[tuple[Any, ...]]
+def createTestDbTable(filename: str, createCmd: str | None, insertCmd: str, rows: TableRows) -> None:
+ """ Creates an sqlite db with a table specified by creation+insertion commands and records.
+ If 'createCmd' is None, just insert into an existing table."""
+ dbCon = sqlite3.connect(filename)
+ dbCur = dbCon.cursor()
+ if createCmd is not None:
+ dbCur.execute(createCmd)
+ for row in rows:
+ dbCur.execute(insertCmd, row)
+ dbCon.commit()
+ dbCon.close()
+
+def readTestDbTable(filename: str, selectCmd: str) -> TableRows:
+ """ Returns the records in a sqlite db with the given name, using the given select command """
+ rows: set[tuple[Any, ...]] = set()
+ dbCon = sqlite3.connect(filename)
+ dbCur = dbCon.cursor()
+ for row in dbCur.execute(selectCmd):
+ rows.add(row)
+ dbCon.close()
+ return rows
diff --git a/backend/tests/wikidata/__init__.py b/backend/tests/wikidata/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/tests/wikidata/__init__.py
diff --git a/backend/tests/wikidata/test_gen_events_data.py b/backend/tests/wikidata/test_gen_events_data.py
new file mode 100644
index 0000000..faa19c9
--- /dev/null
+++ b/backend/tests/wikidata/test_gen_events_data.py
@@ -0,0 +1,171 @@
+import unittest
+import tempfile, os, json, bz2, pickle, indexed_bzip2
+
+from tests.common import readTestDbTable
+from hist_data.wikidata.gen_events_data import genData
+
def runGenData(wikiItemArray: list[dict], preGenOffsets: bool, nProcs: int) -> set[tuple]:
    """ Sets up wikidata file to be read by genData(), runs it, and returns the output database's contents.
    'wikiItemArray' is a list of JSON-serialisable item objects. They are written to a bzip2-compressed
    file as a JSON array, with one item per line between a '[' line and a ']' line.
    If 'preGenOffsets' is True, generates a bz2 offsets file before running genData().
    'nProcs' is passed through to genData().
    Returns the rows selected by 'SELECT * FROM events' from the database that genData() writes. """
    with tempfile.TemporaryDirectory() as tempDir:
        # Create temp wikidata file
        wikidataFile = os.path.join(tempDir, 'dump.json.bz2')
        lastIdx = len(wikiItemArray) - 1
        with bz2.open(wikidataFile, mode='wb') as dumpFile:
            dumpFile.write(b'[\n')
            for idx, item in enumerate(wikiItemArray):
                dumpFile.write(json.dumps(item, separators=(',',':')).encode())
                # Every item except the last is followed by a comma
                dumpFile.write(b',\n' if idx < lastIdx else b'\n')
            dumpFile.write(b']\n')
        # Create temp offsets file if requested
        # (the path is handed to genData() either way, whether or not the file exists)
        offsetsFile = os.path.join(tempDir, 'offsets.dat')
        if preGenOffsets:
            with indexed_bzip2.open(wikidataFile) as bz2Reader, open(offsetsFile, 'wb') as offsetsOut:
                pickle.dump(bz2Reader.block_offsets(), offsetsOut)
        # Run genData()
        dbFile = os.path.join(tempDir, 'events.db')
        genData(wikidataFile, offsetsFile, dbFile, nProcs)
        # Read db (must happen before the temp directory is removed)
        return readTestDbTable(dbFile, 'SELECT * FROM events')
+
class TestGenData(unittest.TestCase):
    """ Runs genData() (via runGenData()) on a small set of wikidata-style items and checks the 'events' rows it yields """
    def setUp(self):
        """ Builds the input items and the rows expected for them """
        self.maxDiff = None # Remove output-diff size limit

        gregorian = 'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar'
        julian = 'http://www.wikidata.org/entity/Q1985786' # 'proleptic julian calendar'

        def entityClaim(qid):
            # Claim list holding a single statement whose value is another item
            return [{'mainsnak': {'datavalue': {'value': {'id': qid}}}}]

        def typedClaim(dataType, value):
            # Claim list holding a single statement with an explicit datavalue type
            return [{'mainsnak': {'datavalue': {'type': dataType, 'value': value}}}]

        def titledItem(qid, claims, title):
            # Item with an enwiki sitelink
            return {'id': qid, 'claims': claims, 'sitelinks': {'enwiki': {'title': title}}}

        self.testWikiItems = [
            titledItem('Q1', {
                'P31': entityClaim('Q1656682'), # 'instance of' 'event'
                'P585': typedClaim('time', { # 'point in time'
                    'time': '+1950-12-00T00:00:00Z',
                    'timezone': 0,
                    'before': 0,
                    'after': 0,
                    'precision': 10, # month precision
                    'calendarmodel': gregorian,
                }),
                'P141': entityClaim('Q211005'), # Other random property
            }, 'event one'),
            titledItem('Q2', {
                'P31': entityClaim('Q5'), # 'instance of' 'human'
                'P569': typedClaim('time', { # 'date of birth'
                    'time': '+2002-11-02T00:00:00Z',
                    'precision': 11, # day precision
                    'calendarmodel': julian,
                }),
                'P570': typedClaim('time', { # 'date of death'
                    'time': '+2010-06-21T00:00:01Z',
                    'timezone': 1,
                    'precision': 11,
                    'calendarmodel': gregorian,
                }),
            }, 'Human One'),
            titledItem('Q3', {
                'P31': entityClaim('Q7275'), # 'instance of' 'state'
                'P580': typedClaim('time', { # 'start time'
                    'time': '-1001-00-00T00:00:00Z',
                    'precision': 9, # year precision
                    'calendarmodel': gregorian,
                }),
                'P582': typedClaim('time', { # 'end time'
                    'time': '-99-00-00T00:00:01Z',
                    'precision': 9,
                    'calendarmodel': julian,
                }),
            }, 'country one'),
            titledItem('Q4', {
                'P31': entityClaim('Q6256'), # 'instance of' 'country'
                'P7584': typedClaim('quantity', { # 'age estimated by a dating method'
                    'amount': '+10.9',
                    'unit': 'http://www.wikidata.org/entity/Q3013059', # kiloannum
                    'lowerBound': '+9',
                    'upperBound': '+11',
                }),
            }, 'country two'),
            titledItem('Q5', {
                'P31': entityClaim('Q11019'), # 'instance of' 'machine'
                'P575': typedClaim('time', { # 'time of discovery or invention'
                    'time': '+0101-00-00T00:00:01Z',
                    'precision': 6, # millennium precision
                    'calendarmodel': julian,
                }),
            }, 'discovery one'),
            titledItem('Q6', {
                'P31': entityClaim('Q7725634'), # 'instance of' 'literary work'
                'P1319': typedClaim('time', { # 'earliest date'
                    'time': '-0020-08-01T00:00:00Z',
                    'precision': 11, # day precision
                    'calendarmodel': julian,
                }),
                'P1326': typedClaim('time', { # 'latest date'
                    'time': '-0020-09-01T00:00:00Z',
                    'precision': 11,
                    'calendarmodel': julian,
                }),
            }, 'media one'),
            titledItem('Q7', {
                'P31': entityClaim('Q11424'), # 'instance of' 'film'
                'P577': typedClaim('time', { # 'publication date'
                    'time': '-2103-00-00T00:00:00Z',
                    'precision': 7, # century precision
                    'calendarmodel': gregorian,
                }),
            }, 'media two'),
            {
                'id': 'Q8',
                'claims': {
                    'P31': entityClaim('Q16521'), # 'instance of' 'taxon'
                },
                # No title
            },
        ]
        # One row per titled item above; there is no row for Q8
        self.expectedRows = {
            (1, 'event one', 2433616, 2433646, None, None, 2, 'event'),
            (2, 'Human One', 2452593, None, 2455368, None, 3, 'human'),
            (3, 'country one', -1001, None, -99, None, 0, 'country'),
            (4, 'country two', -9000, -7000, None, None, 0, 'country'),
            (5, 'discovery one', 1, 1000, None, None, 0, 'discovery'),
            (6, 'media one', 1713965, None, 1713996, None, 1, 'media'),
            (7, 'media two', -2199, -2100, None, None, 0, 'media'),
        }

    def _assertGenerates(self, expected, wikiItems, preGenOffsets, nProcs):
        # Shared check: run the generator on 'wikiItems' and compare the resulting rows
        self.assertEqual(runGenData(wikiItems, preGenOffsets, nProcs), expected)

    def test_wikiItems(self):
        # One process, no pre-generated offsets file
        self._assertGenerates(self.expectedRows, self.testWikiItems, False, 1)

    def test_empty_dump(self):
        # A dump whose only item is an empty object should produce no rows
        self._assertGenerates(set(), [{}], False, 1)

    def test_multiprocessing(self):
        # Four processes, no pre-generated offsets file
        self._assertGenerates(self.expectedRows, self.testWikiItems, False, 4)

    def test_existing_offsets(self):
        # Three processes, with the offsets file generated beforehand
        self._assertGenerates(self.expectedRows, self.testWikiItems, True, 3)