diff options
Diffstat (limited to 'backend')
25 files changed, 179 insertions, 167 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index d05016c..a3ae6c1 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -11,13 +11,13 @@ This directory holds files used to generate the history database data.db. If `start_upper` is present, it and `start` denote an uncertain range of start times. Similarly for 'end' and 'end_upper'. - `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`. + - If 0, they denote a number of years AD (if positive) or BC (if negative). - If 1, they denote a Julian date number. - This allows simple comparison of events with day-level precision, but only goes back to 4713 BCE. - - If 2, same as 1, but dates are preferably displayed using the Gregorian calendar, not the Julian calendar. + This allows simple comparison of events with day-level precision, but only goes back to 4713 BC. + - If 2, same as 1, but with a preference for display using the Julian calendar, not the Gregorian calendar. For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not. - - If 3, same as 1, but 'end' and 'end_upper' are 'preferably Gregorian'. + - If 3, same as 2, but where 'start' and 'start_upper' are 'preferably Julian'. For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. - - If 0, they denote a number of years CE (if positive) or BCE (if negative). - `pop`: <br> Format: `id INT PRIMARY KEY, pop INT` <br> Associates each event with a popularity measure (currently an average monthly viewcount) @@ -49,6 +49,7 @@ Some of the scripts use third-party packages: ## Generate Event Data 1. Obtain a Wikidata JSON dump in wikidata/, as specified in it's README. 1. Run `gen_events_data.py`, which creates `data.db`, and adds the `events` table. + You might want to set WIKIDATA_FILE in the script to the dump file's name. ## Generate Popularity Data 1. Obtain an enwiki dump and 'page view files' in enwiki/, as specified in the README. @@ -61,11 +62,14 @@ Some of the scripts use third-party packages: 1. In enwiki/, run `gen_img_data.py` which looks at pages in the dump that match entries in `events`, looks for infobox image names, and stores them in an image database. 1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found - images, and adds them to the image database. -1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. + images, and adds them to the image database. You should probably first change the USER_AGENT + script variable to identify yourself to the online API (this is expected + [best practice](https://www.mediawiki.org/wiki/API:Etiquette)). +1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. Setting the + USER_AGENT variable applies here as well. 1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/. Adds the `imgs` and `event_imgs` tables. <br> - The outputs will likely need additional manual changes: + The output images may need additional manual changes: - An input image might have no output produced, possibly due to data incompatibilities, memory limits, etc. - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. diff --git a/backend/hist_data/cal.py b/backend/hist_data/cal.py index 3b65205..550303e 100644 --- a/backend/hist_data/cal.py +++ b/backend/hist_data/cal.py @@ -1,14 +1,14 @@ """ Provides date conversion functions, HistDate, and date scales. -Algorithms for converting between calendars and Julian day number values were obtained from -https://en.wikipedia.org/wiki/Julian_day#Converting_Gregorian_calendar_date_to_Julian_Day_Number. """ +# For conversion between calendars and Julian day numbers. Algorithms were obtained from +# https://en.wikipedia.org/wiki/Julian_day#Converting_Gregorian_calendar_date_to_Julian_Day_Number. def gregorianToJdn(year: int, month: int, day: int) -> int: """ Converts a Gregorian calendar date to a Julian day number, denoting the noon-to-noon 'Julian day' that starts within the input day. - A year of 1 means 1 CE, and -1 means 1 BC (0 is treated like -1). + A year of 1 means 1 AD, and -1 means 1 BC (0 is treated like -1). A month of 1 means January. Can use a month of 13 and a day of 0. Valid for dates from 24th Nov 4714 BC onwards. """ @@ -20,7 +20,6 @@ def gregorianToJdn(year: int, month: int, day: int) -> int: jdn -= int((3 * int((year + 4900 + x) / 100)) / 4) jdn += day - 32075 return jdn - def julianToJdn(year: int, month: int, day: int) -> int: """ Like gregorianToJdn(), but converts a Julian calendar date. @@ -33,7 +32,6 @@ def julianToJdn(year: int, month: int, day: int) -> int: jdn += int(275 * month / 9) jdn += day + 1729777 return jdn - def jdnToGregorian(jdn: int) -> tuple[int, int, int]: """ Converts a Julian day number to a Gregorian calendar date, denoting the @@ -50,7 +48,6 @@ def jdnToGregorian(jdn: int) -> tuple[int, int, int]: if Y <= 0: Y -= 1 return Y, M, D - def jdnToJulian(jdn: int) -> tuple[int, int, int]: """ Like jdnToGregorian(), but converts to a Julian calendar date """ f = jdn + 1401 @@ -63,26 +60,25 @@ def jdnToJulian(jdn: int) -> tuple[int, int, int]: if Y <= 0: Y -= 1 return Y, M, D - def julianToGregorian(year: int, month: int, day: int) -> tuple[int, int, int]: return jdnToGregorian(julianToJdn(year, month, day)) - def gregorianToJulian(year: int, month: int, day: int) -> tuple[int, int, int]: return jdnToJulian(gregorianToJdn(year, month, day)) -MIN_CAL_YEAR = -4713 # Disallow within-year dates before this year +# For date representation +MIN_CAL_YEAR = -4713 # Year before which JDNs are not usable MONTH_SCALE = -1; DAY_SCALE = -2; -SCALES: list[int] = [int(x) for x in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]]; +SCALES: list[int] = [int(s) for s in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]]; class HistDate: """ Represents a historical date - - 'year' may be negative (-1 means 1 BCE) + - 'year' may be negative (-1 means 1 BC) - 'month' and 'day' are at least 1, if given - 'gcal' may be: - True: Indicates a Gregorian calendar date - False: Means the date should, for display, be converted to a Julian calendar date - - None: 'month' and 'day' are 1 (used for dates before the Julian period starting year 4713 BCE) + - None: 'month' and 'day' are 1 (required for dates before MIN_CAL_YEAR) """ def __init__(self, gcal: bool | None, year: int, month=1, day=1): self.gcal = gcal @@ -96,22 +92,24 @@ class HistDate: def __repr__(self): return str(self.__dict__) def dbDateToHistDate(n: int, fmt: int, end=False) -> HistDate: + """ Converts a start/start_upper/etc and fmt value in the 'events' db table, into a HistDate """ if fmt == 0: # year if n >= MIN_CAL_YEAR: return HistDate(True, n, 1, 1) else: return HistDate(None, n) - elif fmt == 1 or fmt == 3 and not end: # jdn for julian calendar - return HistDate(False, *jdnToJulian(n)) - else: # fmt == 2 or fmt == 3 and end + elif fmt == 1 or fmt == 3 and end: # jdn for gregorian calendar return HistDate(True, *jdnToGregorian(n)) + else: # fmt == 2 or fmt == 3 and not end + return HistDate(False, *jdnToJulian(n)) def dateToUnit(date: HistDate, scale: int) -> int: + """ Converts a date to an int representing a unit on a scale """ if scale >= 1: return date.year // scale elif scale == MONTH_SCALE: if date.gcal == False: return julianToJdn(date.year, date.month, 1) - else: + else: # True or None return gregorianToJdn(date.year, date.month, 1) else: # scale == DAY_SCALE if date.gcal == False: diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index 262ebdb..76d33e5 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -33,11 +33,11 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Image Files - `gen_img_data.py` <br> - Used to find infobox image names for page IDs, and store them into a database. + Finds infobox image names for page IDs, and stores them into a database. - `download_img_license_info.py` <br> - Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database. + Downloads licensing metadata for image names, via wikipedia's online API, and stores them into a database. - `img_data.db` <br> - Used to hold metadata about infobox images for a set of page IDs. + Holds metadata about infobox images for a set of page IDs. Generated using `gen_img_data.py` and `download_img_license_info.py`. <br> Tables: <br> - `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br> @@ -47,7 +47,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. <br> Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. - `download_imgs.py` <br> - Used to download image files into imgs/. + Downloads image files into imgs/. # Description Files - `gen_desc_data.py` <br> diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 1217caf..43f2c43 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. """ -import re +import argparse +import re, time, signal import sqlite3, urllib.parse, html import requests -import time, signal IMG_DB = 'img_data.db' # @@ -150,7 +150,6 @@ def downloadInfo(imgDb: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index bbd2cda..7dd0771 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. """ -import re, os +import argparse +import re, os, time, signal import sqlite3 import urllib.parse, requests -import time, signal IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' @@ -22,7 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' # It's unclear how to properly check for cache misses, so we just aim for 1 per sec -BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) +EXP_BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): @@ -84,7 +84,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: time.sleep(timeout) except Exception as e: print(f'Error while downloading to {outFile}: {e}') - if not BACKOFF: + if not EXP_BACKOFF: return else: timeout *= 2 @@ -94,7 +94,6 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index b3fde52..bb2b845 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ -7,14 +7,14 @@ and adds them to a database # In testing, this script took over 10 hours to run, and generated about 5GB +import argparse import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell +import bz2, html, mwxml, mwparserfromhell import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' - +# Regexps DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag @@ -119,7 +119,6 @@ def convertTitle(title: str) -> str: return html.unescape(title).replace('_', ' ') if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 5778680..6be8bc5 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 """ -Adds data from the wiki dump index-file into a database +Adds data from the wiki-dump index-file into a database """ + +import argparse import sys, os, re -import bz2 -import sqlite3 +import bz2, sqlite3 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines DB_FILE = 'dump_index.db' @@ -53,7 +54,6 @@ def genData(indexFile: str, dbFile: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 922b893..9aa3863 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -8,15 +8,15 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. """ -import re -import os, bz2, html, urllib.parse +import os, re +import bz2, html, urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# +# Regexps ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') @@ -33,7 +33,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)') - # 'img_name' may be NULL + # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') else: # Check for already-processed page IDs diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py index 0d7ee88..6c9fee2 100755 --- a/backend/hist_data/gen_desc_data.py +++ b/backend/hist_data/gen_desc_data.py @@ -1,10 +1,10 @@ #!/usr/bin/python3 """ -Maps events to short descriptions from Wikipedia, -and stores them in the database. +Maps events to short descriptions from Wikipedia, and stores them in the database. """ +import argparse import os, sqlite3 ENWIKI_DB = os.path.join('enwiki', 'desc_data.db') @@ -52,7 +52,6 @@ def genData(enwikiDb: str, dbFile: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() # diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py index e771e57..d796d92 100644..100755 --- a/backend/hist_data/gen_disp_data.py +++ b/backend/hist_data/gen_disp_data.py @@ -1,15 +1,18 @@ #!/usr/bin/python3 """ -Adds data about event distribution to the database, and removes events not eligible for display. +Adds data about event distribution to the database, +and removes events not eligible for display """ -# Enable unit testing code to, when running this script, resolve imports of modules within this directory +# Code used in unit testing (for resolving imports of modules within this directory) import os, sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) - +# Standard imports +import argparse import sqlite3 +# Local imports from cal import SCALES, dbDateToHistDate, dateToUnit MAX_DISPLAYED_PER_UNIT = 4 @@ -62,8 +65,8 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: del scaleUnitToCounts[(scale, unit)] else: scaleUnitToCounts[(scale, unit)][0] = count - query2 = 'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL' - for (eventId,) in dbCur.execute(query2): # Include events without scores + for (eventId,) in dbCur.execute( # Find events without scores + 'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL'): eventsToDel.append(eventId) print(f'Found {len(eventsToDel)}') # @@ -91,7 +94,6 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() # diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py index f054f76..51d6940 100755 --- a/backend/hist_data/gen_events_data.py +++ b/backend/hist_data/gen_events_data.py @@ -2,8 +2,8 @@ """ Reads a Wikidata JSON dump, looking for entities usable as historical events. For each such -entity, finds a start date (may be a range), optional end date, and event category (eg: normal -event, person with birth/death date, country, etc). Writes the results into a database. +entity, finds a start date (may be a range), optional end date, and event category (eg: discovery, +person with birth/death date, etc). Writes the results into a database. The JSON dump contains an array of objects, each of which describes a Wikidata item item1, and takes up it's own line. @@ -12,11 +12,11 @@ and takes up it's own line. - Getting a property statement value: item1['claims'][prop1][idx1]['mainsnak']['datavalue'] 'idx1' indexes an array of statements -Value objects have a 'type' and 'value' field. +'datavalue' objects have a 'type' and 'value' field. Info about objects with type 'time' can be found at: https://www.wikidata.org/wiki/Help:Dates An example: {"value":{ - "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits (-0001 means 1 BCE) + "time":"+1830-10-04T00:00:00Z", # The year is always signed and padded to 4-16 digits (-0001 means 1 BC) "timezone":0, # Unused "before":0, # Unused "after":0, # Unused @@ -52,30 +52,31 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or "http://www.wikidata.org/entity/Q524410" - gigaannum (1e9 yrs) """ -# On Linux, running on the full dataset seems to make the processes hang when done. This was resolved by: -# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. -# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. +# On Linux, running on the full dataset seems to make the processes hang when done. This was resolved by: +# - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. +# - Using set_start_method('spawn'). Apparently 'fork' can cause unexpected copying of lock/semaphore/etc state. # Related: https://bugs.python.org/issue6721 # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 -# Enable unit testing code to, when running this script, resolve imports of modules within this directory +# Code used in unit testing (for resolving imports of modules within this directory) import os, sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) +# Standard imports +import argparse +import math, re +import io, bz2, json, sqlite3 +import indexed_bzip2, pickle, multiprocessing, tempfile +# Local imports +from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR -import io, math, re, argparse -import bz2, json, sqlite3 -import multiprocessing, indexed_bzip2, pickle, tempfile -# Modules in this directory -from cal import gregorianToJdn, julianToJdn - +# Constants WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2') DUMP_YEAR = 2022 # Used for converting 'age' values into dates OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat') DB_FILE = 'data.db' -N_PROCS = 6 - +N_PROCS = 6 # Number of processes to use # For getting Wikidata entity IDs INSTANCE_OF = 'P31' EVENT_CTG: dict[str, dict[str, str]] = { @@ -91,31 +92,33 @@ EVENT_CTG: dict[str, dict[str, str]] = { 'recurring event': 'Q15275719', 'event sequence': 'Q15900616', 'incident': 'Q18669875', + 'project': 'Q170584', + 'number of deaths': 'P1120', }, - 'human': { - 'human': 'Q5', - }, - 'country': { + 'place': { 'country': 'Q6256', 'state': 'Q7275', 'sovereign state': 'Q3624078', + 'city': 'Q515', + 'tourist attraction': 'Q570116', + 'heritage site': 'Q358', + 'terrestrial planet': 'Q128207', + 'navigational star': 'Q108171565', + 'G-type main-sequence star': 'Q5864', + }, + 'organism': { + 'taxon': 'Q16521', + }, + 'person': { + 'human': 'Q5', + }, + 'work': { + 'creator': 'P170', + 'genre': 'P136', }, 'discovery': { 'time of discovery or invention': 'P575', }, - 'media': { - 'work of art': 'Q4502142', - 'literary work': 'Q7725634', - 'comic book series': 'Q14406742', - 'painting': 'Q3305213', - 'musical work/composition': 'Q105543609', - 'film': 'Q11424', - 'animated film': 'Q202866', - 'television series': 'Q16401', - 'anime television series': 'Q63952888', - 'video game': 'Q7889', - 'video game series': 'Q7058673', - }, } ID_TO_CTG = {id: ctg for ctg, nmToId in EVENT_CTG.items() for name, id in nmToId.items()} EVENT_PROP: dict[str, str] = { @@ -148,14 +151,14 @@ PROP_RULES: list[tuple[str] | tuple[str, str] | tuple[str, str, bool]] = [ ('time of discovery or invention',), ('publication date',), ] -UNIT_TO_SCALE: dict[str, int] = { # Maps 'unit' values (found in type=quantity value objects) to numbers of years +UNIT_TO_SCALE: dict[str, int] = { + # Maps 'unit' values (found in 'datavalue' objects with type=quantity) to numbers of years 'http://www.wikidata.org/entity/Q577': 1, # 'year' 'http://www.wikidata.org/entity/Q24564698': 1, # 'years old' 'http://www.wikidata.org/entity/Q3013059': 10**3, # 'kiloannum' (1e3 yrs) 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) } - # For filtering lines before parsing JSON TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() @@ -183,12 +186,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # The 'OR IGNORE' is for a few entries that share the same title (and seem like redirects) else: if not os.path.exists(offsetsFile): - print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) + print('Creating offsets file') # For indexed access used in multiprocessing (may take about 7 hours) with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) print('Allocating file into chunks') - fileSz: int # About 1.4 TB + fileSz: int # Was about 1.4 TB with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'rb') as file2: file.set_block_offsets(pickle.load(file2)) @@ -206,15 +209,15 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No chunkIdxs[i], chunkIdxs[i+1]) for i in range(nProcs)]): # Add entries from subprocess output file with open(outFile, 'rb') as file: - for entry in pickle.load(file): - dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', entry) + for item in pickle.load(file): + dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item) dbCon.commit() dbCon.close() # For data extraction def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: """ Parses a Wikidata dump line, returning an entry to add to the db """ - # Check with regex + # Check with regexes if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: return None # Decode @@ -283,7 +286,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non # return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: - """ Obtains event start+end data from value objects with type 'time', according to 'timeType' """ + """ Obtains event start+end data from 'datavalue' objects with type 'time', according to 'timeType' """ # Values to return start: int startUpper: int | None = None @@ -317,7 +320,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | else: start = DUMP_YEAR - upperBound * scale startUpper = DUMP_YEAR - lowerBound * scale - # Account for non-existence of 0 CE + # Account for non-existence of 0 AD if start <= 0: start -= 1 if startUpper is not None and startUpper <= 0: @@ -342,7 +345,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | return None end, _, timeFmt2 = endTimeVals if timeFmt != timeFmt2: - if timeFmt == 1 and timeFmt2 == 2: + if timeFmt == 2 and timeFmt2 == 1: timeFmt = 3 else: return None @@ -359,13 +362,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | return None end, endUpper, timeFmt2 = endTimeVals if timeFmt != timeFmt2: - if timeFmt == 1 and timeFmt2 == 2: + if timeFmt == 2 and timeFmt2 == 1: timeFmt = 3 else: return None return start, startUpper, end, endUpper, timeFmt def getEventTime(dataVal) -> tuple[int, int | None, int] | None: - """ Obtains event start (or end) data from a value object with type 'time' """ + """ Obtains event start (or end) data from a 'datavalue' object with type 'time' """ if 'type' not in dataVal or dataVal['type'] != 'time': return None # Get time data @@ -385,20 +388,20 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None: startUpper: int | None = None timeFmt: int if precision in [10, 11]: # 'month' or 'day' precision - if year < -4713: # If before 4713 BCE (start of valid julian date period) - print(f'WARNING: Skipping sub-year-precision date before 4713 BCE: {json.dumps(dataVal)}') + if year < MIN_CAL_YEAR: # If before start of valid julian date period + print(f'WARNING: Skipping sub-year-precision date before {-MIN_CAL_YEAR} BC: {json.dumps(dataVal)}') return None day = max(day, 1) # With month-precision, entry may have a 'day' of 0 if calendarmodel == 'http://www.wikidata.org/entity/Q1985727': # 'proleptic gregorian calendar' start = gregorianToJdn(year, month, day) if precision == 10: startUpper = gregorianToJdn(year, month+1, 0) - timeFmt = 2 + timeFmt = 1 else: # "http://www.wikidata.org/entity/Q1985786" ('proleptic julian calendar') start = julianToJdn(year, month, day) if precision == 10: startUpper = julianToJdn(year, month+1, 0) - timeFmt = 1 + timeFmt = 2 elif 0 <= precision < 10: # 'year' to 'gigaannum' precision scale: int = 10 ** (9 - precision) start = year // scale * scale diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py index 817de03..bf3bcd0 100755 --- a/backend/hist_data/gen_imgs.py +++ b/backend/hist_data/gen_imgs.py @@ -10,9 +10,9 @@ processing. It uses already-existing database entries to decide what to skip. """ -import os, math, subprocess +import argparse +import os, math, subprocess, signal import sqlite3, urllib.parse -import signal from PIL import Image IMG_DIR = os.path.join('enwiki', 'imgs') @@ -147,7 +147,6 @@ def convertImage(imgPath: str, outPath: str): return True if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py index 7d6071a..933af24 100755 --- a/backend/hist_data/gen_picked_data.py +++ b/backend/hist_data/gen_picked_data.py @@ -4,12 +4,14 @@ Adds additional manually-picked events to the database """ -# Enable unit testing code to, when running this script, resolve imports of modules within this directory +# Code used in unit testing (for resolving imports of modules within this directory) import os, sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) - +# Standard imports +import argparse import json, sqlite3 +# Local imports from gen_imgs import convertImage PICKED_DIR = 'picked' @@ -55,7 +57,6 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str) -> dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() # diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py index 8eaa142..aaaf69d 100755 --- a/backend/hist_data/gen_pop_data.py +++ b/backend/hist_data/gen_pop_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 """ -Adds Wikipedia page view info to the database as popularity values. +Adds Wikipedia page view info to the database as popularity values """ import os, sqlite3 diff --git a/backend/hist_data/picked/README.md b/backend/hist_data/picked/README.md index becbd24..395fd9d 100644 --- a/backend/hist_data/picked/README.md +++ b/backend/hist_data/picked/README.md @@ -1,4 +1,4 @@ -This directory holds data for additional events +This directory holds data for additional manually-picked events. Files ===== diff --git a/backend/hist_data/reduce_event_data.py b/backend/hist_data/reduce_event_data.py index c061f90..5801f4d 100755 --- a/backend/hist_data/reduce_event_data.py +++ b/backend/hist_data/reduce_event_data.py @@ -1,16 +1,17 @@ #!/usr/bin/python3 """ -Delete events from the database that have no image. +Delete events from the database that have no image """ -# Enable unit testing code to, when running this script, resolve imports of modules within this directory +# Code used in unit testing (for resolving imports of modules within this directory) import os, sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) - +# Standard imports import argparse import sqlite3 +# Local imports from cal import SCALES, dbDateToHistDate, dateToUnit DB_FILE = 'data.db' diff --git a/backend/histplorer.py b/backend/histplorer.py index d397c17..20c63de 100755 --- a/backend/histplorer.py +++ b/backend/histplorer.py @@ -1,29 +1,29 @@ """ -WSGI script that serves historical data +WSGI script that serves historical data. Expected HTTP query parameters: - type: - If 'events', reply with list of event objects, within a date range, for a given scale + If 'events', reply with information on events within a date range, for a given scale If 'info', reply with information about a given event If 'sugg', reply with search suggestions for an event search string - range: With type=events, specifies a historical-date range - If absent, the default is 'all of time' + If absent, the default is 'all of time'. Examples: - range=1000.1910-10-09 means '1000 CE to 09/10/1910 (inclusive)' - range=-13000. means '13000 BCE onwards' -- scale: With type=events, specifies a date scale (matched against 'scale' column in 'event_disp' table) + range=1000.1910-10-09 means '1000 AD to 09/10/1910 (inclusive)' + range=-13000. means '13000 BC onwards' +- scale: With type=events, specifies a date scale - incl: With type=events, specifies an event to include, as an event ID - event: With type=info, specifies the event to get info for - input: With type=sugg, specifies a search string to suggest for - limit: With type=events or type=sugg, specifies the max number of results -- ctg: With type=events or type=sugg, specifies event categories to restrict results to +- ctg: With type=events or type=sugg, specifies an event category to restrict results to """ from typing import Iterable import sys, re import urllib.parse, sqlite3 import gzip, jsonpickle -from hist_data.cal import gregorianToJdn, HistDate, dbDateToHistDate, dateToUnit +from hist_data.cal import gregorianToJdn, HistDate, MIN_CAL_YEAR, dbDateToHistDate, dateToUnit DB_FILE = 'hist_data/data.db' MAX_REQ_EVENTS = 500 @@ -32,7 +32,7 @@ DEFAULT_REQ_EVENTS = 20 MAX_REQ_SUGGS = 50 DEFAULT_REQ_SUGGS = 5 -# Classes for objects sent as responses +# Classes for values sent as responses class Event: """ Represents an historical event """ def __init__( @@ -146,17 +146,6 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | EventResponse | Ev elif reqType == 'sugg': return handleSuggReq(params, dbCur) return None -def reqParamToHistDate(s: str): - """ Produces a HistDate from strings like '2010-10-3', '-8000', and '' (throws ValueError if invalid) """ - if not s: - return None - m = re.match(r'(-?\d+)(?:-(\d+)-(\d+))?', s) - if m is None: - raise ValueError('Invalid HistDate string') - if m.lastindex == 1: - return HistDate(None, int(m.group(1))) - else: - return HistDate(True, int(m.group(1)), int(m.group(2)), int(m.group(3))) # For type=events def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor) -> EventResponse | None: @@ -201,6 +190,17 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor) -> EventRespo events = lookupEvents(start, end, scale, ctg, incl, resultLimit, dbCur) unitCounts = lookupUnitCounts(start, end, scale, dbCur) return EventResponse(events, unitCounts) +def reqParamToHistDate(s: str): + """ Produces a HistDate from strings like '2010-10-3', '-8000', and '' (throws ValueError if invalid) """ + if not s: + return None + m = re.match(r'(-?\d+)(?:-(\d+)-(\d+))?', s) + if m is None: + raise ValueError('Invalid HistDate string') + if m.lastindex == 1: + return HistDate(None, int(m.group(1))) + else: + return HistDate(True, int(m.group(1)), int(m.group(2)), int(m.group(3))) def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: str | None, incl: int | None, resultLimit: int, dbCur: sqlite3.Cursor) -> list[Event]: """ Looks for events within a date range, in given scale, @@ -217,7 +217,7 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: if start is not None: constraint = '(start >= ? AND fmt > 0 OR start >= ? AND fmt = 0)' if start.gcal is None: - startJdn = gregorianToJdn(start.year, 1, 1) if start.year >= -4713 else 0 + startJdn = gregorianToJdn(start.year, 1, 1) if start.year >= MIN_CAL_YEAR else 0 constraints.append(constraint) params.extend([startJdn, start.year]) else: @@ -228,7 +228,7 @@ def lookupEvents(start: HistDate | None, end: HistDate | None, scale: int, ctg: if end is not None: constraint = '(start <= ? AND fmt > 0 OR start <= ? AND fmt = 0)' if end.gcal is None: - endJdn = gregorianToJdn(end.year, 1, 1) if end.year >= -4713 else -1 + endJdn = gregorianToJdn(end.year, 1, 1) if end.year >= MIN_CAL_YEAR else -1 constraints.append(constraint) params.extend([endJdn, end.year]) else: @@ -269,7 +269,7 @@ def eventEntryToResults( dateVals: list[int | None] = [start, startUpper, end, endUpper] newDates: list[HistDate | None] = [None for n in dateVals] for i, n in enumerate(dateVals): - if n: + if n is not None: newDates[i] = dbDateToHistDate(n, fmt, i < 2) # return Event(eventId, title, newDates[0], newDates[1], newDates[2], newDates[3], ctg, imageId, pop) diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py index 04fdd69..d18dddf 100644 --- a/backend/tests/enwiki/test_gen_img_data.py +++ b/backend/tests/enwiki/test_gen_img_data.py @@ -17,11 +17,11 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', { - (1, 'Belgium', 2389729, None, None, None, 2, 'country'), - (2, 'George Washington', 2353711, None, 2378478, None, 2, 'human'), - (3, 'Douglas Adams', 2434082, None, 2452040, None, 2, 'human'), - (4, 'World War II', 2429507, None, 2431700, None, 2, 'event'), - (5, 'Marie Curie', 2403277, None, 2427622, None, 2, 'human'), + (1, 'Belgium', 2389729, None, None, None, 1, 'country'), + (2, 'George Washington', 2353711, None, 2378478, None, 1, 'human'), + (3, 'Douglas Adams', 2434082, None, 2452040, None, 1, 'human'), + (4, 'World War II', 2429507, None, 2431700, None, 1, 'event'), + (5, 'Marie Curie', 2403277, None, 2427622, None, 1, 'human'), } ) # Create temp dump-index db diff --git a/backend/tests/test_cal.py b/backend/tests/test_cal.py index d5f2860..78b2c8b 100644 --- a/backend/tests/test_cal.py +++ b/backend/tests/test_cal.py @@ -30,8 +30,8 @@ class TestCal(unittest.TestCase): self.assertEqual(julianToGregorian(1616, 4, 23), (1616, 5, 3)) def test_db_to_hist_date(self): self.assertEqual(dbDateToHistDate(2001, 0), HistDate(True, 2001, 1, 1)) - self.assertEqual(dbDateToHistDate(1721455, 1), HistDate(False, 1, 2, 1)) - self.assertEqual(dbDateToHistDate(1356438, 2), HistDate(True, -1000, 9, 13)) + self.assertEqual(dbDateToHistDate(1356438, 1), HistDate(True, -1000, 9, 13)) + self.assertEqual(dbDateToHistDate(1721455, 2), HistDate(False, 1, 2, 1)) self.assertEqual(dbDateToHistDate(2268942, 3, False), HistDate(False, 1500, 1, 10)) self.assertEqual(dbDateToHistDate(2268933, 3, True), HistDate(True, 1500, 1, 10)) def test_date_to_unit(self): diff --git a/backend/tests/test_gen_desc_data.py b/backend/tests/test_gen_desc_data.py index eabe644..4c902ad 100644 --- a/backend/tests/test_gen_desc_data.py +++ b/backend/tests/test_gen_desc_data.py @@ -50,7 +50,7 @@ class TestGenData(unittest.TestCase): (10, 'I', 100, None, None, None, 0, 'event'), (20, 'II', 200, None, None, None, 0, 'discovery'), (30, 'III', 300, None, 350, None, 0, 'event'), - (50, 'V', 5, 10, None, None, 1, 'human'), + (50, 'V', 5, 10, None, None, 2, 'human'), } ) # Run diff --git a/backend/tests/test_gen_disp_data.py b/backend/tests/test_gen_disp_data.py index c39c962..db6ddc0 100644 --- a/backend/tests/test_gen_disp_data.py +++ b/backend/tests/test_gen_disp_data.py @@ -20,13 +20,13 @@ class TestGenData(unittest.TestCase): (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), # 15/11/2002 (3, 'event three', 1900, None, 2000, None, 0, 'event'), # version of 1 without pop score (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), - (5, 'event five', 2415307, None, None, None, 1, 'event'), # 01/10/1900 - (6, 'event six', 2415030, None, None, None, 2, 'event'), # 10/01/1900 + (5, 'event five', 2415307, None, None, None, 2, 'event'), # 01/10/1900 + (6, 'event six', 2415030, None, None, None, 1, 'event'), # 10/01/1900 (7, 'event seven', 1900, None, None, None, 0, 'event'), # popular version of 1 (8, 'event eight', 1900, None, None, None, 0, 'event'), # less popular version of 1 (9, 'event nine', 1900, None, None, None, 0, 'event'), # less popular version of 1 - (10, 'event ten', 2415307, None, None, None, 1, 'event'), # less popular version of 5 - (11, 'event eleven', 2415307, None, None, None, 1, 'event'), # slightly less popular version of 5 + (10, 'event ten', 2415307, None, None, None, 2, 'event'), # less popular version of 5 + (11, 'event eleven', 2415307, None, None, None, 2, 'event'), # slightly less popular version of 5 } ) createTestDbTable( @@ -55,10 +55,10 @@ class TestGenData(unittest.TestCase): (1, 'event one', 1900, None, None, None, 0, 'event'), (2, 'event two', 2452607, None, 2455369, None, 3, 'human'), (4, 'event four', 1901, None, 2000, 2010, 0, 'event'), - (5, 'event five', 2415307, None, None, None, 1, 'event'), - (6, 'event six', 2415030, None, None, None, 2, 'event'), + (5, 'event five', 2415307, None, None, None, 2, 'event'), + (6, 'event six', 2415030, None, None, None, 1, 'event'), (7, 'event seven', 1900, None, None, None, 0, 'event'), - (11, 'event eleven', 2415307, None, None, None, 1, 'event'), # 01/10/1900 + (11, 'event eleven', 2415307, None, None, None, 2, 'event'), # 01/10/1900 } ) self.assertEqual( diff --git a/backend/tests/test_gen_events_data.py b/backend/tests/test_gen_events_data.py index 0f298ca..b3dfddc 100644 --- a/backend/tests/test_gen_events_data.py +++ b/backend/tests/test_gen_events_data.py @@ -1,6 +1,6 @@ import unittest import tempfile, os, json, bz2, pickle, indexed_bzip2 - +# Local imports from tests.common import readTestDbTable from hist_data.gen_events_data import genData @@ -115,6 +115,7 @@ class TestGenData(unittest.TestCase): 'id': 'Q6', 'claims': { 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q7725634'}}}}], # 'instance of' 'literary work' + 'P170': [{'mainsnak': {'datavalue': {'value': {'id': 'Q180'}}}}], # 'creator' 'P1319': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'earliest date' 'time':'-0020-08-01T00:00:00Z', 'precision':11, # day precision @@ -132,6 +133,7 @@ class TestGenData(unittest.TestCase): 'id': 'Q7', 'claims': { 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11424'}}}}], # 'instance of' 'film' + 'P136': [{'mainsnak': {'datavalue': {'value': {'id': 'Q157394'}}}}], # 'genre' 'P577': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'publication date' 'time':'-2103-00-00T00:00:00Z', 'precision':7, # century precision @@ -144,18 +146,24 @@ class TestGenData(unittest.TestCase): 'id': 'Q8', 'claims': { 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # 'instance of' 'taxon' - } - # No title + 'P571': [{'mainsnak': {'datavalue': {'type': 'time', 'value': { # 'inception' + 'time':'-400000000-00-00T00:00:01Z', + 'precision':1, # hundred million years precision + 'calendarmodel':'http://www.wikidata.org/entity/Q1985727' # 'proleptic gregorian calendar' + }}}}], + }, + 'sitelinks': {'enwiki': {'title': 'organism one'}}, }, ] self.expectedRows = { - (1, 'event one', 2433617, 2433647, None, None, 2, 'event'), - (2, 'Human One', 2452594, None, 2455369, None, 3, 'human'), - (3, 'country one', -1001, None, -99, None, 0, 'country'), - (4, 'country two', -9000, -7000, None, None, 0, 'country'), + (1, 'event one', 2433617, 2433647, None, None, 1, 'event'), + (2, 'Human One', 2452594, None, 2455369, None, 3, 'person'), + (3, 'country one', -1001, None, -99, None, 0, 'place'), + (4, 'country two', -9000, -7000, None, None, 0, 'place'), (5, 'discovery one', 1, 1000, None, None, 0, 'discovery'), - (6, 'media one', 1714331, None, 1714362, None, 1, 'media'), - (7, 'media two', -2199, -2100, None, None, 0, 'media'), + (6, 'media one', 1714331, None, 1714362, None, 2, 'work'), + (7, 'media two', -2199, -2100, None, None, 0, 'work'), + (8, 'organism one', -400000000, -300000001, None, None, 0, 'organism'), } def test_wikiItems(self): rows = runGenData(self.testWikiItems, False, 1) diff --git a/backend/tests/test_gen_imgs.py b/backend/tests/test_gen_imgs.py index f8bfeb6..ea4bd70 100644 --- a/backend/tests/test_gen_imgs.py +++ b/backend/tests/test_gen_imgs.py @@ -50,7 +50,7 @@ class TestGenImgs(unittest.TestCase): { (10, 'first', 100, 1000, None, None, 0, 'event'), (20, 'second', 10, 20, None, None, 0, 'event'), - (30, 'third', 1, 20, 30, 40, 2, 'event'), + (30, 'third', 1, 20, 30, 40, 1, 'event'), } ) # Run diff --git a/backend/tests/test_histplorer.py b/backend/tests/test_histplorer.py index be01a90..592d534 100644 --- a/backend/tests/test_histplorer.py +++ b/backend/tests/test_histplorer.py @@ -13,8 +13,8 @@ def initTestDb(dbFile: str) -> None: { (1, 'event one', 1900, None, None, None, 0, 'event'), (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002 to 21/06/2010 - (3, 'event three', 2448175, 2451828, None, None, 2, 'discovery'), # 10/10/1990 til 10/10/2000 - (4, 'event four', 991206, None, 1721706, None, 1, 'event'), # 10/10/-2000 to 10/10/1 + (3, 'event three', 2448175, 2451828, None, None, 1, 'discovery'), # 10/10/1990 til 10/10/2000 + (4, 'event four', 991206, None, 1721706, None, 2, 'event'), # 10/10/-2000 to 10/10/1 (5, 'event five', 2000, None, 2001, None, 0, 'event'), (6, 'event six', 1900, None, 2000, None, 0, 'event'), } diff --git a/backend/tests/test_reduce_event_data.py b/backend/tests/test_reduce_event_data.py index 7f1ce73..22fe204 100644 --- a/backend/tests/test_reduce_event_data.py +++ b/backend/tests/test_reduce_event_data.py @@ -18,7 +18,7 @@ class TestReduceData(unittest.TestCase): { (1, 'event one', 1900, None, None, None, 0, 'event'), (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002 - (3, 'event three', 2448175, 2448200, None, None, 2, 'discovery'), # 10/10/1990 + (3, 'event three', 2448175, 2448200, None, None, 1, 'discovery'), # 10/10/1990 (4, 'event four', 1900, None, None, None, 0, 'event'), # Copy of 1 (5, 'event five', 2452595, None, 2455369, None, 3, 'human'), # Day after 2 } |
