diff options
Diffstat (limited to 'backend')
32 files changed, 479 insertions, 346 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md index 9fe2d0e..09a71fc 100644 --- a/backend/hist_data/README.md +++ b/backend/hist_data/README.md @@ -9,37 +9,38 @@ This directory holds files used to generate the history database data.db. - `start*` and `end*` specify start and end dates. `start_upper`, `end`, and `end_upper`, are optional. If `start_upper` is present, it and `start` denote an uncertain range of start times. - Similarly for 'end' and 'end_upper'. + Similarly for `end` and `end_upper`. - `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`. - If 0, they denote a number of years AD (if positive) or BC (if negative). - If 1, they denote a Julian date number. This allows simple comparison of events with day-level precision, but only goes back to 4713 BC. - If 2, same as 1, but with a preference for display using the Julian calendar, not the Gregorian calendar. For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not. - - If 3, same as 2, but where 'start' and 'start_upper' are 'preferably Julian'. + - If 3, same as 2, but where only `start` and `start_upper` are 'preferably Julian'. For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not. - `pop`: <br> Format: `id INT PRIMARY KEY, pop INT` <br> - Associates each event with a popularity measure (currently an average monthly viewcount) + Associates each event with a popularity measure (currently an average monthly viewcount). - `dist`: <br> Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br> - Maps scale units to counts of events in them. + For each scale, maps its units to event counts. + For example, on the monthly scale, the unit for Jan 2010 might have 10 events. 
- `event_disp`: <br> Format: `id INT, scale INT, unit INT, PRIMARY KEY (id, scale)` <br> Maps events to scales+units they are 'displayable' on (used to make displayed events more uniform across time). -- `img_dist`: <br> - Like `dist`, but only counts events with images. -- `img_disp`: <br> - Like `events_disp`, but only counts events with images. - `images`: <br> Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br> - Holds metadata for available images + Holds metadata for available images. - `event_imgs`: <br> Format: `id INT PRIMARY KEY, img_id INT` <br> - Assocates events with images + Associates events with images. - `descs`: <br> Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT` <br> Associates an event's enwiki title with a short description. +- `img_dist`: <br> + Like `dist`, but only counts events with images. +- `img_disp`: <br> + Like `event_disp`, but only counts events with images. # Generating the Database @@ -66,12 +67,12 @@ Some of the scripts use third-party packages: looks for infobox image names, and stores them in an image database. 1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found images, and adds them to the image database. You should probably first change the USER_AGENT - script variable to identify yourself to the online API (this is expected - [best practice](https://www.mediawiki.org/wiki/API:Etiquette)). + script variable to identify yourself to the online API (this is + [expected best practice](https://www.mediawiki.org/wiki/API:Etiquette)). 1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. Setting the USER_AGENT variable applies here as well. <br> In some rare cases, the download won't produce an image file, but a text file containing - 'File not found: ...'. These can simply be deleted. + 'File not found: ...'. These can be deleted. 1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/. 
Adds the `imgs` and `event_imgs` tables. <br> The output images might need additional manual changes: diff --git a/backend/hist_data/cal.py b/backend/hist_data/cal.py index efb5bab..d86589b 100644 --- a/backend/hist_data/cal.py +++ b/backend/hist_data/cal.py @@ -2,8 +2,11 @@ Provides date conversion functions, HistDate, and date scales. """ -# For conversion between calendars and Julian day numbers. Algorithms were obtained from +# ========== For conversion between calendars and Julian day numbers. ========== + +# Algorithms were obtained from: # https://en.wikipedia.org/wiki/Julian_day#Converting_Gregorian_calendar_date_to_Julian_Day_Number. + def gregorianToJdn(year: int, month: int, day: int) -> int: """ Converts a Gregorian calendar date to a Julian day number, @@ -20,6 +23,7 @@ def gregorianToJdn(year: int, month: int, day: int) -> int: jdn -= int((3 * int((year + 4900 + x) / 100)) / 4) jdn += day - 32075 return jdn + def julianToJdn(year: int, month: int, day: int) -> int: """ Like gregorianToJdn(), but converts a Julian calendar date. 
@@ -32,6 +36,7 @@ def julianToJdn(year: int, month: int, day: int) -> int: jdn += int(275 * month / 9) jdn += day + 1729777 return jdn + def jdnToGregorian(jdn: int) -> tuple[int, int, int]: """ Converts a Julian day number to a Gregorian calendar date, denoting the @@ -48,6 +53,7 @@ def jdnToGregorian(jdn: int) -> tuple[int, int, int]: if Y <= 0: Y -= 1 return Y, M, D + def jdnToJulian(jdn: int) -> tuple[int, int, int]: """ Like jdnToGregorian(), but converts to a Julian calendar date """ f = jdn + 1401 @@ -60,16 +66,20 @@ def jdnToJulian(jdn: int) -> tuple[int, int, int]: if Y <= 0: Y -= 1 return Y, M, D + def julianToGregorian(year: int, month: int, day: int) -> tuple[int, int, int]: return jdnToGregorian(julianToJdn(year, month, day)) + def gregorianToJulian(year: int, month: int, day: int) -> tuple[int, int, int]: return jdnToJulian(gregorianToJdn(year, month, day)) -# For date representation +# ========== For date representation ========== + MIN_CAL_YEAR = -4713 # Year before which JDNs are not usable MONTH_SCALE = -1; DAY_SCALE = -2; SCALES: list[int] = [int(s) for s in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]]; + class HistDate: """ Represents a historical date @@ -85,12 +95,14 @@ class HistDate: self.year = year self.month = month self.day = day - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, HistDate) and \ (self.gcal, self.year, self.month, self.day) == (other.gcal, other.year, other.month, other.day) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + def dbDateToHistDate(n: int, fmt: int, end=False) -> HistDate: """ Converts a start/start_upper/etc and fmt value in the 'events' db table, into a HistDate """ if fmt == 0: # year @@ -99,6 +111,7 @@ def dbDateToHistDate(n: int, fmt: int, end=False) -> HistDate: return HistDate(True, *jdnToGregorian(n)) else: # fmt == 2 or fmt == 3 and not end return 
HistDate(False, *jdnToJulian(n)) + def dateToUnit(date: HistDate, scale: int) -> int: """ Converts a date to an int representing a unit on a scale """ if scale >= 1: diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 43f2c43..6fd710c 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -10,12 +10,16 @@ at already-processed names to decide what to skip. """ import argparse -import re, time, signal -import sqlite3, urllib.parse, html +import re +import time +import signal +import sqlite3 +import urllib.parse +import html import requests IMG_DB = 'img_data.db' -# + API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' BATCH_SZ = 50 # Max 50 @@ -26,17 +30,18 @@ def downloadInfo(imgDb: str) -> None: print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Checking for table') if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \ 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - # + print('Reading image names') imgNames: set[str] = set() for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) print(f'Found {len(imgNames)}') - # + print('Checking for already-processed images') nextImgId = 1 oldSz = len(imgNames) @@ -45,7 +50,7 @@ def downloadInfo(imgDb: str) -> None: if imgId >= nextImgId: nextImgId = imgId + 1 print(f'Found {oldSz - len(imgNames)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -54,7 +59,7 @@ def downloadInfo(imgDb: str) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Iterating through image names') 
imgNameList = list(imgNames) iterNum = 0 @@ -65,9 +70,11 @@ def downloadInfo(imgDb: str) -> None: if interrupted: print(f'Exiting loop at iteration {iterNum}') break + # Get batch imgBatch = imgNameList[i:i+BATCH_SZ] imgBatch = ['File:' + x for x in imgBatch] + # Make request headers = { 'user-agent': USER_AGENT, @@ -90,6 +97,7 @@ def downloadInfo(imgDb: str) -> None: print(f'ERROR: Exception while downloading info: {e}') print('\tImage batch: ' + '|'.join(imgBatch)) continue + # Parse response-object if 'query' not in responseObj or 'pages' not in responseObj['query']: print('WARNING: Response object doesn\'t have page data') @@ -120,6 +128,7 @@ def downloadInfo(imgDb: str) -> None: if title not in imgNames: print(f'WARNING: Got title "{title}" not in image-name list') continue + if 'imageinfo' not in page: print(f'WARNING: No imageinfo section for page "{title}"') continue @@ -129,6 +138,7 @@ def downloadInfo(imgDb: str) -> None: artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup if artist is not None: artist = TAG_REGEX.sub(' ', artist).strip() @@ -140,11 +150,12 @@ def downloadInfo(imgDb: str) -> None: credit = WHITESPACE_REGEX.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) + # Add to db dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)', (nextImgId, title, license, artist, credit, restrictions, url)) nextImgId += 1 - # + print('Closing database') dbCon.commit() dbCon.close() @@ -152,5 +163,5 @@ def downloadInfo(imgDb: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadInfo(IMG_DB) diff --git a/backend/hist_data/enwiki/download_imgs.py 
b/backend/hist_data/enwiki/download_imgs.py index df40bae..e484b33 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,33 +9,38 @@ The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. """ -# Took about a week to downloaded about 60k images +# Note: Took about a week to download about 60k images import argparse -import re, os, time, signal +import re +import os +import time +import signal import sqlite3 -import urllib.parse, requests +import urllib.parse +import requests IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' - # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'. + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec. 
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): os.mkdir(outDir) + print('Checking for already-downloaded images') fileList = os.listdir(outDir) imgIdsDone: set[int] = set() for filename in fileList: imgIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(imgIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Starting downloads') iterNum = 0 query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs' @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for image ID {imgId}') @@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: timeout *= 2 print(f'New timeout: {timeout}') continue + print('Closing database') dbCon.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index bb2b845..194afe8 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ 
-5,30 +5,40 @@ Reads through the wiki dump, attempts to parse short-descriptions, and adds them to a database """ -# In testing, this script took over 10 hours to run, and generated about 5GB +# Note: In testing, this script took over 10 hours to run, and generated about 5GB import argparse -import sys, os, re -import bz2, html, mwxml, mwparserfromhell +import sys +import os +import re import sqlite3 +import bz2 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' -# Regexps + DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: + """ Reads dump, parses descriptions, and writes to db """ print('Creating database') if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') @@ -39,13 +49,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') 
with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +70,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. 
+ + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell. + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,18 +125,24 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: + """ Tries to remove markup from wikitext content """ content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: + """ Unescapes HTML entities and replaces underscores with spaces in a wiki item title """ return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 6be8bc5..8872171 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -1,24 +1,28 @@ #!/usr/bin/python3 """ -Adds data from the wiki-dump index-file into a database +Converts data from the wiki-dump index-file into a database """ import argparse -import sys, os, re -import bz2, sqlite3 +import sys +import os +import re +import bz2 +import sqlite3 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines DB_FILE = 'dump_index.db' def genData(indexFile: str, dbFile: str) -> None: - """ Reads the index file and creates the db """ if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') + 
print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') lastOffset = 0 @@ -29,7 +33,7 @@ def genData(indexFile: str, dbFile: str) -> None: lineNum += 1 if lineNum % 1e5 == 0: print(f'At line {lineNum}') - # + match = lineRegex.fullmatch(line.rstrip()) assert match is not None offsetStr, pageId, title = match.group(1,2,3) @@ -49,6 +53,7 @@ def genData(indexFile: str, dbFile: str) -> None: dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) except sqlite3.IntegrityError as e: print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') dbCon.commit() dbCon.close() @@ -56,5 +61,5 @@ def genData(indexFile: str, dbFile: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 9aa3863..05df63d 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -8,35 +8,42 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. 
""" -import os, re -import bz2, html, urllib.parse +import argparse +import os +import re +import bz2 +import html +import urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# Regexps + ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) +# ========== For data generation ========== + def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: + """ Looks up page IDs in dump and creates database """ print('Opening databases') indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + print('Checking tables') if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)') # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') - else: - # Check for already-processed page IDs + else: # Check for already-processed page IDs numSkipped = 0 for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'): if pid in pageIds: @@ -45,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: else: print(f'Found already-processed page ID {pid} which was not in input set') print(f'Will skip {numSkipped} already-processed page IDs') - # + print('Getting dump-file offsets') offsetToPageId: dict[int, list[int]] = {} offsetToEnd: dict[int, int] = {} 
# Maps chunk-start offsets to their chunk-end offsets @@ -55,7 +62,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') - # + query = 'SELECT offset, next_offset, title FROM offsets WHERE id = ?' row = indexDbCur.execute(query, (pageId,)).fetchone() if row is None: @@ -68,7 +75,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: offsetToPageId[chunkOffset].append(pageId) pageIdToTitle[pageId] = title print(f'Found {len(offsetToEnd)} chunks to check') - # + print('Iterating through chunks in dump file') with open(dumpFile, mode='rb') as file: iterNum = 0 @@ -76,7 +83,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 100 == 0: print(f'At iteration {iterNum}') - # + chunkPageIds = offsetToPageId[pageOffset] # Jump to chunk file.seek(pageOffset) @@ -122,21 +129,24 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: content.append(line[:line.rfind('</text>')]) # Look for image-filename imageName = getImageName(content) - imgDbCur.execute('INSERT into page_imgs VALUES (?, ?, ?)', (pageId, None if imageName is None else pageIdToTitle[pageId], imageName)) + imgDbCur.execute( + 'INSERT into page_imgs VALUES (?, ?, ?)', + (pageId, None if imageName is None else pageIdToTitle[pageId], imageName)) break if not foundTextEnd: print(f'WARNING: Did not find </text> for page id {pageId}') break if not foundText: print(f'WARNING: Did not find <text> for page id {pageId}') - # + print('Closing databases') indexDbCon.close() imgDbCon.commit() imgDbCon.close() + def getImageName(content: list[str]) -> str | None: """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ - # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections + # Note: Doesn't try and find images in outside-infobox [[File:...]] 
and <imagemap> sections for line in content: match = IMG_LINE_REGEX.match(line) if match is not None: @@ -177,6 +187,8 @@ def getImageName(content: list[str]) -> str | None: return None return None +# ========== For getting input page IDs ========== + def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: print('Getting event data') titles: set[str] = set() @@ -184,6 +196,7 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: for (title,) in dbCon.execute('SELECT title from events'): titles.add(title) dbCon.close() + print('Getting page IDs') pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) @@ -193,12 +206,15 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: if row: pageIds.add(row[0]) dbCon.close() + print(f'Result: {len(pageIds)} out of {len(titles)}') return pageIds + +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 935b303..57d6c7b 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -3,27 +3,34 @@ """ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database + +Each pageview file has lines that seem to hold these space-separated fields: + wiki code (eg: en.wikipedia), article title, page ID (may be: null), + platform (eg: mobile-web), monthly view count, + hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) """ -# Took about 10min per file (each had about 180e6 lines) +# Note: Took about 10min per file (each had about 180e6 lines) -import sys, os, glob, math, re +import argparse +import sys +import os 
+import glob +import math +import re from collections import defaultdict -import bz2, sqlite3 +import bz2 +import sqlite3 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') DUMP_INDEX_DB = 'dump_index.db' DB_FILE = 'pageview_data.db' def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: - # Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) - # + namespaceRegex = re.compile(r'[a-zA-Z]+:') titleToViews: dict[str, int] = defaultdict(int) linePrefix = b'en.wikipedia ' @@ -35,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: print(f'At line {lineNum}') if not line.startswith(linePrefix): continue + # Get second and second-last fields linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields title = linePart[:linePart.find(b' ')].decode('utf-8') @@ -45,11 +53,12 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: continue if namespaceRegex.match(title) is not None: continue + # Update map title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -66,8 +75,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: idbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py index 6c9fee2..bcd8870 100755 --- 
a/backend/hist_data/gen_desc_data.py +++ b/backend/hist_data/gen_desc_data.py @@ -5,7 +5,8 @@ Maps events to short descriptions from Wikipedia, and stores them in the databas """ import argparse -import os, sqlite3 +import os +import sqlite3 ENWIKI_DB = os.path.join('enwiki', 'desc_data.db') DB_FILE = 'data.db' @@ -15,12 +16,12 @@ def genData(enwikiDb: str, dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)') - # + print('Getting events') titleToId: dict[str, int] = {} for eventId, title in dbCur.execute('SELECT id, title FROM events'): titleToId[title] = eventId - # + print('Getting Wikipedia descriptions') enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() @@ -29,11 +30,13 @@ def genData(enwikiDb: str, dbFile: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') + # Get wiki ID row = enwikiCur.execute('SELECT id FROM pages WHERE title = ?', (title,)).fetchone() if row is None: continue wikiId = row[0] + # Check for redirect wikiIdToGet = wikiId query = \ @@ -41,12 +44,13 @@ def genData(enwikiDb: str, dbFile: str) -> None: row = enwikiCur.execute(query, (wikiId,)).fetchone() if row is not None: wikiIdToGet = row[0] + # Get desc row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone() if row is None: continue dbCur.execute('INSERT INTO descs VALUES (?, ?, ?)', (eventId, wikiId, row[0])) - # + print('Closing databases') dbCon.commit() dbCon.close() @@ -54,5 +58,5 @@ def genData(enwikiDb: str, dbFile: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(ENWIKI_DB, DB_FILE) diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py index 193adbb..6bb84ad 100755 --- a/backend/hist_data/gen_disp_data.py +++ 
b/backend/hist_data/gen_disp_data.py @@ -5,14 +5,15 @@ Adds data about event distribution to the database, and removes events not eligible for display """ -# Code used in unit testing (for resolving imports of modules within this directory) -import os, sys +# For unit testing, resolve imports of modules within this directory +import os +import sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) -# Standard imports + import argparse import sqlite3 -# Local imports + from cal import SCALES, dbDateToHistDate, dateToUnit MAX_DISPLAYED_PER_UNIT = 4 @@ -21,7 +22,7 @@ DB_FILE = 'data.db' def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTables: bool) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Reading through events') scaleUnitToCounts: dict[tuple[int, int], list[int]] = {} # Maps scale and unit to two counts (num events in that unit, num events displayable for that unit) @@ -35,7 +36,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa iterNum += 1 if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') - # For each scale + for scale in scales: unit = dateToUnit(dbDateToHistDate(eventStart, fmt), scale) # Update maps @@ -52,7 +53,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa idScales[eventId].append((scale, unit)) scaleUnitToCounts[(scale, unit)] = counts print(f'Results: {len(idScales)} displayable events') - # + print('Looking for non-displayable events') eventsToDel: list[int] = [] for eventId, eventStart, fmt in dbCur.execute(query): @@ -71,7 +72,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa 'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL'): eventsToDel.append(eventId) print(f'Found {len(eventsToDel)}') - # + if not forImageTables: print(f'Deleting {len(eventsToDel)} events') iterNum = 0 @@ -82,7 +83,7 @@ def 
genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa # dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,)) dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,)) - # + print('Writing to db') distTable = 'dist' if not forImageTables else 'img_dist' dispTable = 'event_disp' if not forImageTables else 'img_disp' @@ -94,7 +95,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa for eventId, scaleUnits in idScales.items(): for [scale, unit] in scaleUnits: dbCur.execute(f'INSERT INTO {dispTable} VALUES (?, ?, ?)', (eventId, scale, unit)) - # + print('Closing db') dbCon.commit() dbCon.close() @@ -104,5 +105,5 @@ if __name__ == '__main__': parser.add_argument( 'type', nargs='?', choices=['event', 'img'], default='event', help='The type of tables to generate') args = parser.parse_args() - # + genData(DB_FILE, SCALES, MAX_DISPLAYED_PER_UNIT, args.type == 'img') diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py index 60402b5..453a9ad 100755 --- a/backend/hist_data/gen_events_data.py +++ b/backend/hist_data/gen_events_data.py @@ -59,26 +59,37 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). 
# Possibly related: https://github.com/python/cpython/issues/72882 -# Took about 4.5 hours to run +# Note: Took about 4.5 hours to run -# Code used in unit testing (for resolving imports of modules within this directory) -import os, sys +# For unit testing, resolve imports of modules within this directory +import os +import sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) -# Standard imports + from typing import cast import argparse -import math, re -import io, bz2, json, sqlite3 -import indexed_bzip2, pickle, multiprocessing, tempfile -# Local imports +import math +import re +import io +import bz2 +import json +import sqlite3 + +import indexed_bzip2 +import pickle +import multiprocessing +import tempfile + from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR -# Constants +# ========== Constants ========== + WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2') OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat') DB_FILE = 'data.db' N_PROCS = 6 # Number of processes to use + # For getting Wikidata entity IDs INSTANCE_OF = 'P31' EVENT_CTG: dict[str, dict[str, str]] = { @@ -173,24 +184,28 @@ UNIT_TO_SCALE: dict[str, int] = { 'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs) 'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs) } + # For filtering lines before parsing JSON TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode() PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode() +# ========== Main function ========== + def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: """ Reads the dump and writes to db """ - # Check db if os.path.exists(dbFile): print('ERROR: Database already exists') return - # Read dump, and write to db - print('Writing to db') + + print('Opening db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() + 
dbCur.execute('CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)') dbCur.execute('CREATE INDEX events_id_start_idx ON events(id, start)') dbCur.execute('CREATE INDEX events_title_nocase_idx ON events(title COLLATE NOCASE)') + if nProcs == 1: with bz2.open(wikidataFile, mode='rb') as file: for lineNum, line in enumerate(file, 1): @@ -206,6 +221,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) + print('Allocating file into chunks') fileSz: int # Was about 1.4 TB with indexed_bzip2.open(wikidataFile) as file: @@ -216,6 +232,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1] # Each adjacent pair specifies a start+end byte index for readDumpChunk() print(f'- Chunk size: {chunkSz:,}') + print('Starting processes to read dump') with tempfile.TemporaryDirectory() as tempDirName: with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: @@ -227,15 +244,19 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No with open(outFile, 'rb') as file: for item in pickle.load(file): dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item) + + print('Closing db') dbCon.commit() dbCon.close() -# For data extraction +# ========== For data extraction ========== + def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None: """ Parses a Wikidata dump line, returning an entry to add to the db """ # Check with regexes if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None: return None + # Decode try: line = lineBytes.decode('utf-8').rstrip().rstrip(',') @@ -246,12 +267,14 @@ def 
readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non if 'claims' not in jsonItem: return None claims = jsonItem['claims'] + # Get wikidata ID, enwiki title try: itemId = int(jsonItem['id'][1:]) # Skip initial 'Q' itemTitle: str = jsonItem['sitelinks']['enwiki']['title'] except (KeyError, ValueError): return None + # Get event category eventCtg: str | None = None if INSTANCE_OF in claims: # Check types @@ -269,6 +292,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non eventCtg = ID_TO_CTG[prop] if not eventCtg: return None + # Check for event-start/end props startVal: str endVal: str | None @@ -297,13 +321,15 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non break if not found: return None + # Convert time values timeData = getTimeData(startVal, endVal, timeType) if timeData is None: return None start, startUpper, end, endUpper, timeFmt = timeData - # + return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg) + def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None: """ Obtains event start+end data from 'datavalue' objects with type 'time', according to 'timeType' """ # Values to return @@ -312,13 +338,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | end: int | None = None endUpper: int | None = None timeFmt: int - # + if timeType == 'age estimated by a dating method': + # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in + # 'datedness' and undesirable small offsets to values like '1 billion years old'. if 'type' not in startVal or startVal['type'] != 'quantity': return None - # Get quantity data - # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in - # 'datedness' and undesirable small offsets to values like '1 billion years old'. 
+ try: value = startVal['value'] amount = math.ceil(float(value['amount'])) @@ -331,23 +357,26 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | upperBound = None except (KeyError, ValueError): return None - # Get unit scale + + # Get scale if unit not in UNIT_TO_SCALE: return None scale = UNIT_TO_SCALE[unit] + # Get start+startUpper if lowerBound is None: start = -amount * scale else: start = -cast(int, upperBound) * scale startUpper = -lowerBound * scale + # Adjust precision start = start // scale * scale if startUpper is not None: startUpper = startUpper // scale * scale elif scale > 1: startUpper = start + scale - 1 - # + timeFmt = 0 elif timeType == 'earliest date': # Get start @@ -355,6 +384,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | if startTimeVals is None: return None start, _, timeFmt = startTimeVals + # Get end endTimeVals = getEventTime(endVal) if endTimeVals is None: @@ -371,6 +401,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | if startTimeVals is None: return None start, startUpper, timeFmt = startTimeVals + # Get end+endUpper if endVal is not None: endTimeVals = getEventTime(endVal) @@ -383,6 +414,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | else: return None return start, startUpper, end, endUpper, timeFmt + def getEventTime(dataVal) -> tuple[int, int | None, int] | None: """ Obtains event start (or end) data from a 'datavalue' object with type 'time' """ if 'type' not in dataVal or dataVal['type'] != 'time': @@ -399,6 +431,7 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None: calendarmodel = value['calendarmodel'] except (KeyError, ValueError): return None + # Get start+startUpper start: int startUpper: int | None = None @@ -430,12 +463,15 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None: timeFmt = 0 else: return None + return start, startUpper, timeFmt -# For 
using multiple processes +# ========== For using multiple processes ========== + def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str: """ Forwards to readDumpChunk() (for use with pool.map()) """ return readDumpChunk(*params) + def readDumpChunk( procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str: """ Reads lines in the dump that begin after a start-byte, and not after an end byte. @@ -447,12 +483,14 @@ def readDumpChunk( with open(offsetsFile, 'rb') as file2: offsets = pickle.load(file2) file.set_block_offsets(offsets) + # Seek to chunk if startByte != -1: file.seek(startByte) file.readline() else: startByte = 0 # Used for progress calculation + # Read lines count = 0 while file.tell() <= endByte: @@ -463,14 +501,17 @@ def readDumpChunk( entry = readDumpLine(file.readline()) if entry: entries.append(entry) + # Output results into file with open(outFile, 'wb') as file: pickle.dump(entries, file) return outFile +# ========== Main block ========== + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + multiprocessing.set_start_method('spawn') genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS) diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py index 46cf6ee..44c0020 100755 --- a/backend/hist_data/gen_imgs.py +++ b/backend/hist_data/gen_imgs.py @@ -10,17 +10,20 @@ processing. It uses already-existing database entries to decide what to skip. 
""" -# Took about 10 hours to process about 60k images +# Note: Took about 10 hours to process about 60k images import argparse -import os, subprocess, signal -import sqlite3, urllib.parse +import os +import subprocess +import signal +import sqlite3 +import urllib.parse IMG_DIR = os.path.join('enwiki', 'imgs') IMG_DB = os.path.join('enwiki', 'img_data.db') OUT_DIR = 'img' DB_FILE = 'data.db' -# + IMG_OUT_SZ = 200 def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): @@ -29,7 +32,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): os.mkdir(outDir) dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Checking for image tables') eventsDone: set[int] = set() imgsDone: set[int] = set() @@ -45,23 +48,26 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str): for (imgId,) in dbCur.execute('SELECT id from images'): imgsDone.add(imgId) print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip') - # + print('Processing images') processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone) - # + dbCon.commit() dbCon.close() + def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, eventsDone: set[int], imgsDone: set[int]) -> bool: """ Converts images and updates db, returning False upon interruption or failure """ imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + # Set SIGINT handler interrupted = False def onSigint(sig, frame): nonlocal interrupted interrupted = True signal.signal(signal.SIGINT, onSigint) + # Convert images flag = False # Set to True upon interruption or failure for imgFile in os.listdir(imgDir): @@ -70,9 +76,11 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, print('Exiting') flag = True break + # Get image ID imgIdStr, _ = os.path.splitext(imgFile) imgId = int(imgIdStr) + # Get associated events eventIds: set[int] = set() query = 'SELECT title FROM page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name WHERE imgs.id 
= ?' @@ -85,12 +93,14 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, eventIds = eventIds.difference(eventsDone) if not eventIds: continue + # Convert image if imgId not in imgsDone: success = convertImage(os.path.join(imgDir, imgFile), os.path.join(outDir, str(imgId) + '.jpg')) if not success: flag = True break + # Add image to db row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone() if row is None: @@ -100,16 +110,21 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor, name, license, artist, credit = row url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name) dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit)) + # Add event association to db for eventId in eventIds: dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId)) + imgDbCon.close() return not flag + def convertImage(imgPath: str, outPath: str): + """ Converts an image using smartcrop """ print(f'Converting {imgPath} to {outPath}') if os.path.exists(outPath): print('ERROR: Output image already exists') return False + try: completedProcess = subprocess.run( ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath], @@ -126,5 +141,5 @@ def convertImage(imgPath: str, outPath: str): if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genImgs(IMG_DIR, IMG_DB, OUT_DIR, DB_FILE) diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py index c5f4577..a6bb8f8 100755 --- a/backend/hist_data/gen_picked_data.py +++ b/backend/hist_data/gen_picked_data.py @@ -4,14 +4,15 @@ Adds additional manually-picked events to the database """ -# Code used in unit testing (for resolving imports of modules within this directory) -import os, sys +# For unit testing, 
resolve imports of modules within this directory +import os +import sys parentDir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parentDir) -# Standard imports + import argparse import json, sqlite3 -# Local imports + from gen_imgs import convertImage from cal import SCALES, dbDateToHistDate, dateToUnit @@ -23,7 +24,7 @@ IMG_OUT_DIR = 'img' def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, scales: list[int]) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + with open(os.path.join(pickedDir, pickedEvtFile)) as f: eventsToAdd = json.load(f) nextId = -1 @@ -33,7 +34,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca if eventId is None and title is None: print(f'ERROR: Entry with no ID or title: {event}') break - # + doAdd = eventId is None and len(event) > 1 doModify = eventId is not None and len(event) > 1 doDelete = not doModify and not doAdd @@ -42,6 +43,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', (nextId, event['title'], event['start'], event['start_upper'], event['end'], event['end_upper'], event['fmt'], event['ctg'])) + # Update image, description, and popularity tables if 'image' in event: print('> Adding image') @@ -57,6 +59,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca if 'desc' in event: dbCur.execute('INSERT INTO descs VALUES (?, ?, ?)', (nextId, nextId, event['desc'])) dbCur.execute('INSERT INTO pop VALUES (?, ?)', (nextId, event['pop'])) + # Update event distribution tables for scale in scales: unit = dateToUnit(dbDateToHistDate(event['start'], event['fmt']), scale) @@ -65,7 +68,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca else: dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, 1)) dbCur.execute('INSERT INTO event_disp VALUES (?, ?, ?)', (nextId, scale, 
unit)) - # + nextId -= 1 elif doDelete: if eventId: @@ -78,6 +81,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca print(f'ERROR: Could not find event with title {title}') break eventId, eventStart, eventFmt = row + # Note: Intentionally not deleting entries or files for images that become unused. dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,)) dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,)) @@ -93,15 +97,18 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca dbCur.execute('UPDATE dist SET count = count - 1 WHERE scale = ? AND unit = ?', (scale, unit)) dbCur.execute('DELETE FROM event_disp WHERE id = ?', (eventId,)) else: # doModify + # Note: Intentionally not updating 'event_disp' table to account for 'indirect event displayability' print(f'Modifying event with ID {eventId}') row = dbCur.execute('SELECT start, fmt FROM events WHERE id = ?', (eventId,)).fetchone() if row is None: print(f'ERROR: Could not find event with ID {eventId}') break oldStart, oldFmt = row + for field in ['title', 'start', 'start_upper', 'end', 'end_upper', 'fmt', 'ctg']: if field in event: dbCur.execute(f'UPDATE events SET {field} = ? WHERE id = ?', (event[field], eventId,)) + if 'image' in event: print('> Adding image') image = event['image'] @@ -117,16 +124,19 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca # Note: Intentionally not deleting entries or files for images that become unused. else: dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, nextId)) + if 'desc' in event: if dbCur.execute('SELECT desc FROM descs WHERE id = ?', (eventId,)).fetchone(): dbCur.execute('UPDATE event_imgs SET desc = ? WHERE id = ?', (event['desc'], eventId)) else: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (eventId, event['desc'])) + if 'pop' in event: if dbCur.execute('SELECT pop FROM pop WHERE id = ?', (eventId,)).fetchone(): dbCur.execute('UPDATE pop SET pop = ? 
WHERE id = ?', (event['pop'], eventId)) else: dbCur.execute('INSERT INTO pop VALUES (?, ?)', (eventId, event['pop'])) + if 'start' in event: # Remove old distribution data for scale in scales: @@ -147,14 +157,14 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca else: dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, 1)) dbCur.execute('INSERT INTO event_disp VALUES (?, ?, ?)', (eventId, scale, unit)) - # Note: Intentionally not updating 'event_disp' table to account for 'indirect event displayability' + nextId -= 1 - # + dbCon.commit() dbCon.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PICKED_DIR, PICKED_EVT_FILE, DB_FILE, IMG_OUT_DIR, SCALES) diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py index aaaf69d..8d50b6b 100755 --- a/backend/hist_data/gen_pop_data.py +++ b/backend/hist_data/gen_pop_data.py @@ -4,7 +4,9 @@ Adds Wikipedia page view info to the database as popularity values """ -import os, sqlite3 +import argparse +import os +import sqlite3 PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db') DB_FILE = 'data.db' @@ -12,12 +14,12 @@ DB_FILE = 'data.db' def genData(pageviewsDb: str, dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Getting event data') titleToId: dict[str, int] = {} for eventId, title in dbCur.execute('SELECT id, title FROM events'): titleToId[title] = eventId - # + print('Getting view counts') pdbCon = sqlite3.connect(pageviewsDb) pdbCur = pdbCon.cursor() @@ -27,24 +29,23 @@ def genData(pageviewsDb: str, dbFile: str) -> None: iterNum += 1 if iterNum % 1e6 == 0: print(f'At iteration {iterNum}') - # + if title not in titleToId: continue titleToViews[title] = views pdbCon.close() - # + print(f'Result: {len(titleToViews)} out of {len(titleToId)}') dbCur.execute('CREATE TABLE pop (id 
INT PRIMARY KEY, pop INT)') dbCur.execute('CREATE INDEX pop_idx ON pop(pop)') for title, views in titleToViews.items(): dbCur.execute('INSERT INTO pop VALUES (?, ?)', (titleToId[title], views)) - # + dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEWS_DB, DB_FILE) diff --git a/backend/histplorer.py b/backend/histplorer.py index 26e131e..ab061d1 100755 --- a/backend/histplorer.py +++ b/backend/histplorer.py @@ -3,28 +3,33 @@ WSGI script that serves historical data. Expected HTTP query parameters: - type: - If 'events', reply with information on events within a date range, for a given scale - If 'info', reply with information about a given event - If 'sugg', reply with search suggestions for an event search string -- range: With type=events, specifies a historical-date range + If 'events', reply with information on events within a date range, for a given scale. + If 'info', reply with information about a given event. + If 'sugg', reply with search suggestions for an event search string. +- range: With type=events, specifies a historical-date range. If absent, the default is 'all of time'. Examples: range=1000.1910-10-09 means '1000 AD up to and excluding 09/10/1910' range=-13000. means '13000 BC onwards' -- scale: With type=events, specifies a date scale -- incl: With type=events, specifies an event to include, as an event ID -- event: With type=info, specifies the event title to get info for -- input: With type=sugg, specifies a search string to suggest for -- limit: With type=events or type=sugg, specifies the max number of results -- ctgs: With type=events|info|sugg, specifies event categories to restrict results to - Interpreted as a period-separated list of category names (eg: person.place). An empty string is ignored. 
+- scale: With type=events, specifies a date scale (see SCALES in hist_data/cal.py). +- incl: With type=events, specifies an event to include, as an event ID. +- event: With type=info, specifies the title of an event to get info for. +- input: With type=sugg, specifies a search string to suggest for. +- limit: With type=events or type=sugg, specifies the max number of results. +- ctgs: With type=events|info|sugg, specifies event categories to restrict results to. + Interpreted as a period-separated list of category names (eg: person.place). + An empty string is ignored. - imgonly: With type=events|info|sugg, if present, restricts results to events with images. """ from typing import Iterable, cast -import sys, re -import urllib.parse, sqlite3 -import gzip, jsonpickle +import sys +import re +import urllib.parse +import sqlite3 +import gzip +import jsonpickle + from hist_data.cal import HistDate, dbDateToHistDate, dateToUnit DB_FILE = 'hist_data/data.db' @@ -34,7 +39,8 @@ DEFAULT_REQ_EVENTS = 20 MAX_REQ_SUGGS = 50 DEFAULT_REQ_SUGGS = 5 -# Classes for values sent as responses +# ========== Classes for values sent as responses ========== + class Event: """ Represents an historical event """ def __init__( @@ -57,26 +63,30 @@ class Event: self.ctg = ctg self.imgId = imgId self.pop = pop - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, Event) and \ (self.id, self.title, self.start, self.startUpper, self.end, self.endUpper, \ self.ctg, self.pop, self.imgId) == \ (other.id, other.title, other.start, other.startUpper, other.end, other.endUpper, \ other.ctg, other.pop, other.imgId) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class EventResponse: """ Used when responding to type=events requests """ def __init__(self, events: list[Event], unitCounts: dict[int, int] | None): self.events = events self.unitCounts = unitCounts # None indicates 
exceeding MAX_REQ_UNIT_COUNTS - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, EventResponse) and \ (self.events, self.unitCounts) == (other.events, other.unitCounts) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class ImgInfo: """ Represents an event's associated image """ def __init__(self, url: str, license: str, artist: str, credit: str): @@ -84,13 +94,15 @@ class ImgInfo: self.license = license self.artist = artist self.credit = credit - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, ImgInfo) and \ (self.url, self.license, self.artist, self.credit) == \ (other.url, other.license, other.artist, other.credit) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class EventInfo: """ Used when responding to type=info requests """ def __init__(self, event: Event, desc: str | None, wikiId: int, imgInfo: ImgInfo | None): @@ -98,29 +110,34 @@ class EventInfo: self.desc = desc self.wikiId = wikiId self.imgInfo = imgInfo - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, EventInfo) and \ (self.event, self.desc, self.wikiId, self.imgInfo) == (other.event, other.desc, other.wikiId, other.imgInfo) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class SuggResponse: """ Used when responding to type=sugg requests """ def __init__(self, suggs: list[str], hasMore: bool): self.suggs = suggs self.hasMore = hasMore - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, SuggResponse) and \ (self.suggs, self.hasMore) == (other.suggs, other.hasMore) - def __repr__(self): + + def __repr__(self): # Used in unit testing return 
str(self.__dict__) -# Entry point +# ========== Entry point ========== + def application(environ: dict[str, str], start_response) -> Iterable[bytes]: """ Entry point for the WSGI script """ # Get response object val = handleReq(DB_FILE, environ) + # Construct response data = jsonpickle.encode(val, unpicklable=False).encode() headers = [('Content-type', 'application/json')] @@ -130,16 +147,20 @@ def application(environ: dict[str, str], start_response) -> Iterable[bytes]: headers.append(('Content-encoding', 'gzip')) headers.append(('Content-Length', str(len(data)))) start_response('200 OK', headers) + return [data] + def handleReq(dbFile: str, environ: dict[str, str]) -> None | EventResponse | EventInfo | SuggResponse: """ Queries the database, and constructs a response object """ # Open db dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() + # Get query params queryStr = environ['QUERY_STRING'] if 'QUERY_STRING' in environ else '' queryDict = urllib.parse.parse_qs(queryStr) params = {k: v[0] for k, v in queryDict.items()} + # Get data of requested type reqType = queryDict['type'][0] if 'type' in queryDict else None if reqType == 'events': @@ -150,7 +171,8 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | EventResponse | Ev return handleSuggReq(params, dbCur) return None -# For type=events +# ========== For handling type=events ========== + def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor) -> EventResponse | None: """ Generates a response for a type=events request """ # Get dates @@ -163,6 +185,7 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor) -> EventRespo except ValueError: print(f'INFO: Invalid date-range value {dateRange}', file=sys.stderr) return None + # Get scale if 'scale' not in params: print('INFO: No scale provided', file=sys.stderr) @@ -172,12 +195,14 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor) -> EventRespo except ValueError: print('INFO: Invalid scale value', 
file=sys.stderr) return None + # Get incl value try: incl = int(params['incl']) if 'incl' in params else None except ValueError: print('INFO: Invalid incl value', file=sys.stderr) return None + # Get result set limit try: resultLimit = int(params['limit']) if 'limit' in params else DEFAULT_REQ_EVENTS @@ -187,13 +212,15 @@ def handleEventsReq(params: dict[str, str], dbCur: sqlite3.Cursor) -> EventRespo if resultLimit <= 0 or resultLimit > MAX_REQ_EVENTS: print(f'INFO: Invalid results limit {resultLimit}', file=sys.stderr) return None - # + ctgs = params['ctgs'].split('.') if 'ctgs' in params else None imgonly = 'imgonly' in params - # + events = lookupEvents(start, end, scale, incl, resultLimit, ctgs, imgonly, dbCur) unitCounts = lookupUnitCounts(start, end, scale, imgonly, dbCur) + return EventResponse(events, unitCounts) + def reqParamToHistDate(s: str): """ Produces a HistDate from strings like '2010-10-3', '-8000', and '' (throws ValueError if invalid) """ if not s: @@ -205,6 +232,7 @@ def reqParamToHistDate(s: str): return HistDate(None, int(m.group(1))) else: return HistDate(True, int(m.group(1)), int(m.group(2)), int(m.group(3))) + def lookupEvents( start: HistDate | None, end: HistDate | None, scale: int, incl: int | None, resultLimit: int, ctgs: list[str] | None, imgonly: bool, dbCur: sqlite3.Cursor) -> list[Event]: @@ -219,6 +247,7 @@ def lookupEvents( ' LEFT JOIN images ON event_imgs.img_id = images.id' constraints = [f'{dispTable}.scale = ?'] params: list[str | int] = [scale] + # Constrain by start/end startUnit = dateToUnit(start, scale) if start is not None else None endUnit = dateToUnit(end, scale) if end is not None else None @@ -232,22 +261,26 @@ def lookupEvents( if endUnit is not None: constraints.append(f'{dispTable}.unit < ?') params.append(endUnit) + # Constrain by event category if ctgs is not None: constraints.append('ctg IN (' + ','.join('?' 
* len(ctgs)) + ')') params.extend(ctgs) + # Add constraints to query query2 = query if constraints: query2 += ' WHERE ' + ' AND '.join(constraints) query2 += ' ORDER BY pop.pop DESC' query2 += f' LIMIT {resultLimit}' + # Run query results: list[Event] = [] for row in dbCur.execute(query2, params): results.append(eventEntryToResults(row)) if incl is not None and incl == row[0]: incl = None + # Get any additional inclusion if incl is not None: row = dbCur.execute(query + ' WHERE events.id = ?', (incl,)).fetchone() @@ -255,8 +288,9 @@ def lookupEvents( if len(results) == resultLimit: results.pop() results.append(eventEntryToResults(row)) - # + return results + def eventEntryToResults( row: tuple[int, str, int, int | None, int | None, int | None, int, str, int | None, int]) -> Event: eventId, title, start, startUpper, end, endUpper, fmt, ctg, imageId, pop = row @@ -267,11 +301,13 @@ def eventEntryToResults( for i, n in enumerate(dateVals): if n is not None: newDates[i] = dbDateToHistDate(n, fmt, i < 2) - # + return Event(eventId, title, cast(HistDate, newDates[0]), newDates[1], newDates[2], newDates[3], ctg, imageId, pop) + def lookupUnitCounts( start: HistDate | None, end: HistDate | None, scale: int, imgonly: bool, dbCur: sqlite3.Cursor) -> dict[int, int] | None: + """ Return list of units with counts given scale and a date range """ # Build query distTable = 'dist' if not imgonly else 'img_dist' query = f'SELECT unit, count FROM {distTable} WHERE scale = ?' @@ -283,13 +319,15 @@ def lookupUnitCounts( query += ' AND unit < ?' 
params.append(dateToUnit(end, scale)) query += ' ORDER BY unit ASC LIMIT ' + str(MAX_REQ_UNIT_COUNTS + 1) + # Get results unitCounts: dict[int, int] = {} for unit, count in dbCur.execute(query, params): unitCounts[unit] = count return unitCounts if len(unitCounts) <= MAX_REQ_UNIT_COUNTS else None -# For type=info +# ========== For handling type=info ========== + def handleInfoReq(params: dict[str, str], dbCur: sqlite3.Cursor): """ Generates a response for a type=info request """ if 'event' not in params: @@ -298,6 +336,7 @@ def handleInfoReq(params: dict[str, str], dbCur: sqlite3.Cursor): ctgs = params['ctgs'].split('.') if 'ctgs' in params else None imgonly = 'imgonly' in params return lookupEventInfo(params['event'], ctgs, imgonly, dbCur) + def lookupEventInfo(eventTitle: str, ctgs: list[str] | None, imgonly: bool, dbCur: sqlite3.Cursor) -> EventInfo | None: """ Look up an event with given title, and return a descriptive EventInfo """ imgJoin = 'INNER JOIN' if imgonly else 'LEFT JOIN' @@ -320,7 +359,8 @@ def lookupEventInfo(eventTitle: str, ctgs: list[str] | None, imgonly: bool, dbCu else: return None -# For type=sugg +# ========== For handling type=sugg ========== + def handleSuggReq(params: dict[str, str], dbCur: sqlite3.Cursor): """ Generates a response for a type=sugg request """ # Get search string @@ -331,6 +371,7 @@ def handleSuggReq(params: dict[str, str], dbCur: sqlite3.Cursor): if not searchStr: print('INFO: Empty \'input\' parameter for type=sugg request', file=sys.stderr) return None + # Get result limit try: resultLimit = int(params['limit']) if 'limit' in params else DEFAULT_REQ_SUGGS @@ -340,10 +381,11 @@ def handleSuggReq(params: dict[str, str], dbCur: sqlite3.Cursor): if resultLimit <= 0 or resultLimit > MAX_REQ_SUGGS: print(f'INFO: Invalid suggestion limit {resultLimit}', file=sys.stderr) return None - # + ctgs = params['ctgs'].split('.') if 'ctgs' in params else None imgonly = 'imgonly' in params return lookupSuggs(searchStr, resultLimit, ctgs, 
imgonly, dbCur) + def lookupSuggs( searchStr: str, resultLimit: int, ctgs: list[str] | None, imgonly: bool, dbCur: sqlite3.Cursor) -> SuggResponse: """ For a search string, returns a SuggResponse describing search suggestions """ @@ -355,10 +397,12 @@ def lookupSuggs( query += ' AND ctg IN (' + ','.join('?' * len(ctgs)) + ')' query += f' ORDER BY pop.pop DESC LIMIT {tempLimit}' suggs: list[str] = [] + # Prefix search params = [searchStr + '%'] + (ctgs if ctgs is not None else []) for (title,) in dbCur.execute(query, params): suggs.append(title) + # If insufficient results, try substring search if len(suggs) < tempLimit: existing = set(suggs) @@ -368,5 +412,5 @@ def lookupSuggs( suggs.append(title) if len(suggs) == tempLimit: break - # + return SuggResponse(suggs[:resultLimit], len(suggs) > resultLimit) diff --git a/backend/server.py b/backend/server.py index 70e847b..5c3904a 100755 --- a/backend/server.py +++ b/backend/server.py @@ -18,10 +18,8 @@ def wrappingApp(environ: dict[str, str], start_response) -> Iterable[bytes]: """ WSGI handler that uses 'application', but also serves image files """ urlPath = environ['PATH_INFO'] if urlPath.startswith('/data/'): - # Run WSGI script - return application(environ, start_response) - elif urlPath.startswith('/hist_data/img/'): - # Serve image file + return application(environ, start_response) # Run WSGI script + elif urlPath.startswith('/hist_data/img/'): # Serve image file imgPath = os.path.join(os.getcwd(), urlPath[1:]) if os.path.exists(imgPath): imgType = mimetypes.guess_type(imgPath)[0] @@ -33,6 +31,7 @@ def wrappingApp(environ: dict[str, str], start_response) -> Iterable[bytes]: else: start_response('404 Not Found', [('Content-type', 'text/plain')]) return [b'Unrecognised path'] + # Start server with simple_server.make_server('', 8000, wrappingApp) as httpd: print('Serving HTTP on port 8000...') diff --git a/backend/tests/common.py b/backend/tests/common.py index cb455e4..abfa471 100644 --- a/backend/tests/common.py 
+++ b/backend/tests/common.py @@ -3,7 +3,9 @@ Utilities for testing """ from typing import Any -import bz2, gzip, sqlite3 +import bz2 +import gzip +import sqlite3 def createTestFile(filename: str, content: str) -> None: """ Creates a file with the given name and contents """ diff --git a/backend/tests/enwiki/test_download_img_license_info.py b/backend/tests/enwiki/test_download_img_license_info.py index f285d55..ad6fa52 100644 --- a/backend/tests/enwiki/test_download_img_license_info.py +++ b/backend/tests/enwiki/test_download_img_license_info.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import Mock, patch -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from hist_data.enwiki.download_img_license_info import downloadInfo @@ -53,6 +54,7 @@ TEST_RESPONSE1 = { } } } + TEST_RESPONSE2 = { 'batchcomplete': '', 'query': { @@ -152,6 +154,7 @@ class TestDownloadInfo(unittest.TestCase): (1, 'Octopus2.jpg'), } ) + # Run downloadInfo(imgDb) # Check @@ -162,6 +165,7 @@ class TestDownloadInfo(unittest.TestCase): 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'), } ) + # Run with updated image-data db createTestDbTable( imgDb, diff --git a/backend/tests/enwiki/test_download_imgs.py b/backend/tests/enwiki/test_download_imgs.py index 823ac37..949d885 100644 --- a/backend/tests/enwiki/test_download_imgs.py +++ b/backend/tests/enwiki/test_download_imgs.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import Mock, patch -import tempfile, os +import tempfile +import os from tests.common import readTestFile, createTestDbTable from hist_data.enwiki.download_imgs import downloadImgs @@ -40,6 +41,7 @@ class TestDownloadInfo(unittest.TestCase): (16, 'six','cc-by','','fred','','https://upload.wikimedia.org/6.png'), } ) + # Create temp output directory with tempfile.TemporaryDirectory() as outDir: # Run diff --git a/backend/tests/enwiki/test_gen_desc_data.py b/backend/tests/enwiki/test_gen_desc_data.py 
index f6d4250..e777a6a 100644 --- a/backend/tests/enwiki/test_gen_desc_data.py +++ b/backend/tests/enwiki/test_gen_desc_data.py @@ -1,5 +1,6 @@ import unittest -import os, tempfile +import os +import tempfile from tests.common import readTestDbTable from hist_data.enwiki.gen_desc_data import genData @@ -12,6 +13,7 @@ class TestGenData(unittest.TestCase): # Run dbFile = os.path.join(tempDir, 'descData.db') genData(TEST_DUMP_FILE, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT id, title FROM pages'), diff --git a/backend/tests/enwiki/test_gen_dump_index_db.py b/backend/tests/enwiki/test_gen_dump_index_db.py index 64053c4..5281911 100644 --- a/backend/tests/enwiki/test_gen_dump_index_db.py +++ b/backend/tests/enwiki/test_gen_dump_index_db.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestBz2, readTestDbTable from hist_data.enwiki.gen_dump_index_db import genData @@ -10,15 +11,18 @@ def runGenData(indexFileContents: str): # Create temp index file indexFile = os.path.join(tempDir, 'index.txt.bz2') createTestBz2(indexFile, indexFileContents) + # Run dbFile = os.path.join(tempDir, 'data.db') genData(indexFile, dbFile) + # Read db return readTestDbTable(dbFile, 'SELECT title, id, offset, next_offset FROM offsets') class TestGenData(unittest.TestCase): def setUp(self): self.maxDiff = None # Remove output-diff size limit + def test_index_file(self): indexFileContents = ( '100:10:apple\n' @@ -33,6 +37,7 @@ class TestGenData(unittest.TestCase): ('banana ice-cream', 99, 300, 1000), ('Custard!', 2030, 1000, -1), }) + def test_emp_index(self): offsetsMap = runGenData('') self.assertEqual(offsetsMap, set()) diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py index d18dddf..91ba481 100644 --- a/backend/tests/enwiki/test_gen_img_data.py +++ b/backend/tests/enwiki/test_gen_img_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import 
tempfile +import os from tests.common import createTestDbTable, readTestDbTable from hist_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData @@ -24,6 +25,7 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): (5, 'Marie Curie', 2403277, None, 2427622, None, 1, 'human'), } ) + # Create temp dump-index db indexDb = os.path.join(tempDir, 'dump_index.db') createTestDbTable( @@ -38,6 +40,7 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): ('Autism',25,0,-1), } ) + # Run pageIds = getInputPageIdsFromDb(dbFile, indexDb) # Check @@ -58,6 +61,7 @@ class TestGenData(unittest.TestCase): ('Autism',25,0,-1), } ) + # Run imgDb = os.path.join(tempDir, 'imgData.db') genData({10, 25}, TEST_DUMP_FILE, indexDb, imgDb) @@ -69,6 +73,7 @@ class TestGenData(unittest.TestCase): (25, 'Autism', 'Autism-stacking-cans 2nd edit.jpg'), } ) + # Run with updated page-ids set genData({13, 10}, TEST_DUMP_FILE, indexDb, imgDb) # Check diff --git a/backend/tests/enwiki/test_gen_pageview_data.py b/backend/tests/enwiki/test_gen_pageview_data.py index 154953e..3209cce 100644 --- a/backend/tests/enwiki/test_gen_pageview_data.py +++ b/backend/tests/enwiki/test_gen_pageview_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestBz2, createTestDbTable, readTestDbTable from hist_data.enwiki.gen_pageview_data import genData @@ -18,6 +19,7 @@ class TestGenData(unittest.TestCase): 'fr.wikipedia Four null desktop 12 T6U6\n' 'en.wikipedia Three null desktop 10 E4G5Z61\n' )) + # Create temp dump-index db dumpIndexDb = os.path.join(tempDir, 'dump_index.db') createTestDbTable( @@ -31,9 +33,11 @@ class TestGenData(unittest.TestCase): ('Four', 4, 0, -1), } ) + # Run dbFile = os.path.join(tempDir, 'data.db') genData(pageviewFiles, dumpIndexDb, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT title, id, views from views'), diff --git a/backend/tests/test_cal.py b/backend/tests/test_cal.py index 78b2c8b..9d481e7 
100644 --- a/backend/tests/test_cal.py +++ b/backend/tests/test_cal.py @@ -10,30 +10,37 @@ class TestCal(unittest.TestCase): self.assertEqual(gregorianToJdn(2010, 11, 3), 2455504) self.assertEqual(gregorianToJdn(-4714, 11, 24), 0) self.assertEqual(gregorianToJdn(-1, 1, 1), 1721060) + def test_julian_to_jdn(self): self.assertEqual(julianToJdn(2010, 11, 3), 2455517) self.assertEqual(julianToJdn(-4713, 1, 1), 0) self.assertEqual(julianToJdn(-1, 1, 1), 1721058) + def test_jdn_to_gregorian(self): self.assertEqual(jdnToGregorian(2455504), (2010, 11, 3)) self.assertEqual(jdnToGregorian(0), (-4714, 11, 24)) self.assertEqual(jdnToGregorian(1721060), (-1, 1, 1)) + def test_jdn_to_julian(self): self.assertEqual(jdnToJulian(2455517), (2010, 11, 3)) self.assertEqual(jdnToJulian(0), (-4713, 1, 1)) self.assertEqual(jdnToJulian(1721058), (-1, 1, 1)) + def test_gregorian_to_julian(self): self.assertEqual(gregorianToJulian(2022, 9, 30), (2022, 9, 17)) self.assertEqual(gregorianToJulian(1616, 5, 3), (1616, 4, 23)) + def test_julian_to_gregorian(self): self.assertEqual(julianToGregorian(2022, 9, 17), (2022, 9, 30)) self.assertEqual(julianToGregorian(1616, 4, 23), (1616, 5, 3)) + def test_db_to_hist_date(self): - self.assertEqual(dbDateToHistDate(2001, 0), HistDate(True, 2001, 1, 1)) + self.assertEqual(dbDateToHistDate(2001, 0), HistDate(None, 2001, 1, 1)) self.assertEqual(dbDateToHistDate(1356438, 1), HistDate(True, -1000, 9, 13)) self.assertEqual(dbDateToHistDate(1721455, 2), HistDate(False, 1, 2, 1)) self.assertEqual(dbDateToHistDate(2268942, 3, False), HistDate(False, 1500, 1, 10)) self.assertEqual(dbDateToHistDate(2268933, 3, True), HistDate(True, 1500, 1, 10)) + def test_date_to_unit(self): self.assertEqual(dateToUnit(HistDate(None, 1914, 1, 1), 10), 191) self.assertEqual(dateToUnit(HistDate(True, 1500, 10, 5), MONTH_SCALE), 2269197) diff --git a/backend/tests/test_gen_desc_data.py b/backend/tests/test_gen_desc_data.py index 4c902ad..8fb6ce0 100644 --- 
a/backend/tests/test_gen_desc_data.py +++ b/backend/tests/test_gen_desc_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from hist_data.gen_desc_data import genData @@ -39,6 +40,7 @@ class TestGenData(unittest.TestCase): (5, 'Five'), } ) + # Create temp history db dbFile = os.path.join(tempDir, 'data.db') createTestDbTable( @@ -53,6 +55,7 @@ class TestGenData(unittest.TestCase): (50, 'V', 5, 10, None, None, 2, 'human'), } ) + # Run genData(enwikiDb, dbFile) # Check diff --git a/backend/tests/test_gen_disp_data.py b/backend/tests/test_gen_disp_data.py index 0d54eb0..8fe13e4 100644 --- a/backend/tests/test_gen_disp_data.py +++ b/backend/tests/test_gen_disp_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from hist_data.gen_disp_data import genData @@ -58,9 +59,11 @@ class TestGenData(unittest.TestCase): (7, 70), } ) + # Run genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2, False) genData(dbFile, [10, 1, MONTH_SCALE, DAY_SCALE], 2, True) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT * FROM events'), diff --git a/backend/tests/test_gen_events_data.py b/backend/tests/test_gen_events_data.py index a94bc89..9622dc2 100644 --- a/backend/tests/test_gen_events_data.py +++ b/backend/tests/test_gen_events_data.py @@ -1,6 +1,11 @@ import unittest -import tempfile, os, json, bz2, pickle, indexed_bzip2 -# Local imports +import tempfile +import os +import json +import bz2 +import pickle +import indexed_bzip2 + from tests.common import readTestDbTable from hist_data.gen_events_data import genData @@ -18,15 +23,18 @@ def runGenData(wikiItemArray: str, preGenOffsets: bool, nProcs: int): file.write(b',') file.write(b'\n') file.write(b']\n') + # Create temp offsets file if requested offsetsFile = os.path.join(tempDir, 'offsets.dat') if preGenOffsets: with 
indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) + # Run genData() dbFile = os.path.join(tempDir, 'events.db') genData(wikidataFile, offsetsFile, dbFile, nProcs) + # Read db return readTestDbTable(dbFile, 'SELECT * FROM events') @@ -164,15 +172,19 @@ class TestGenData(unittest.TestCase): (7, 'media two', -2199, -2100, None, None, 0, 'work'), (8, 'organism one', -400000000, -300000001, None, None, 0, 'organism'), } + def test_wikiItems(self): rows = runGenData(self.testWikiItems, False, 1) self.assertEqual(rows, self.expectedRows) + def test_empty_dump(self): rows = runGenData([{}], False, 1) self.assertEqual(rows, set()) + def test_multiprocessing(self): rows = runGenData(self.testWikiItems, False, 4) self.assertEqual(rows, self.expectedRows) + def test_existing_offsets(self): rows = runGenData(self.testWikiItems, True, 3) self.assertEqual(rows, self.expectedRows) diff --git a/backend/tests/test_gen_imgs.py b/backend/tests/test_gen_imgs.py index ea4bd70..893be5c 100644 --- a/backend/tests/test_gen_imgs.py +++ b/backend/tests/test_gen_imgs.py @@ -1,6 +1,8 @@ import unittest from unittest.mock import patch -import tempfile, os, shutil +import tempfile +import os +import shutil from tests.common import createTestDbTable, readTestDbTable from hist_data.gen_imgs import genImgs @@ -12,12 +14,14 @@ class TestGenImgs(unittest.TestCase): def test_gen(self, convertImageMock): with tempfile.TemporaryDirectory() as tempDir: convertImageMock.side_effect = lambda imgPath, outPath: shutil.copy(imgPath, outPath) + # Create temp images imgDir = os.path.join(tempDir, 'enwiki_imgs') os.mkdir(imgDir) shutil.copy(TEST_IMG, os.path.join(imgDir, '100.jpg')) shutil.copy(TEST_IMG, os.path.join(imgDir, '200.jpeg')) shutil.copy(TEST_IMG, os.path.join(imgDir, '400.png')) + # Create temp image db imgDb = os.path.join(tempDir, 'img_data.db') createTestDbTable( @@ -40,6 +44,7 @@ class TestGenImgs(unittest.TestCase): (200, 
'two.jpeg', 'cc-by', 'author2', 'credits2', '', 'https://upload.wikimedia.org/two.jpeg'), } ) + # Create temp history db dbFile = os.path.join(tempDir, 'data.db') createTestDbTable( @@ -53,9 +58,11 @@ class TestGenImgs(unittest.TestCase): (30, 'third', 1, 20, 30, 40, 1, 'event'), } ) + # Run outDir = os.path.join(tempDir, 'imgs') genImgs(imgDir, imgDb, outDir, dbFile) + # Check self.assertEqual(set(os.listdir(outDir)), { '100.jpg', diff --git a/backend/tests/test_gen_picked_data.py b/backend/tests/test_gen_picked_data.py index ec1203b..e40c3c8 100644 --- a/backend/tests/test_gen_picked_data.py +++ b/backend/tests/test_gen_picked_data.py @@ -1,6 +1,8 @@ import unittest from unittest.mock import patch -import tempfile, os, shutil +import tempfile +import os +import shutil from tests.common import createTestFile, createTestDbTable, readTestDbTable from hist_data.gen_picked_data import genData @@ -12,6 +14,7 @@ class TestGenImgs(unittest.TestCase): def test_gen(self, convertImageMock): with tempfile.TemporaryDirectory() as tempDir: convertImageMock.side_effect = lambda imgPath, outPath: shutil.copy(imgPath, outPath) + # Create picked-event file pickedDir = os.path.join(tempDir, 'picked') os.mkdir(pickedDir) @@ -51,9 +54,11 @@ class TestGenImgs(unittest.TestCase): "title": "event three" }] ''') + # Create picked images shutil.copy(TEST_IMG, os.path.join(pickedDir, 'covid.jpg')) shutil.copy(TEST_IMG, os.path.join(pickedDir, 'foo.jpg')) + # Create temp history db dbFile = os.path.join(tempDir, 'data.db') createTestDbTable( @@ -126,12 +131,15 @@ class TestGenImgs(unittest.TestCase): (3, 1, 3), } ) + # Create existing event images imgOutDir = os.path.join(tempDir, 'imgs') os.mkdir(imgOutDir) shutil.copy(TEST_IMG, os.path.join(imgOutDir, '10.jpg')) + # Run genData(pickedDir, pickedEvtFile, dbFile, imgOutDir, [10, 1]) + # Check self.assertEqual(set(os.listdir(imgOutDir)), { '10.jpg', diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py index 
2f505f0..5080dd9 100644 --- a/backend/tests/test_gen_pop_data.py +++ b/backend/tests/test_gen_pop_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from hist_data.gen_pop_data import genData @@ -19,6 +20,7 @@ class TestGenData(unittest.TestCase): ('three', 3, 30), } ) + # Create temp history db dbFile = os.path.join(tempDir, 'data.db') createTestDbTable( @@ -31,8 +33,10 @@ class TestGenData(unittest.TestCase): (33, 'three', 100, None, None, None, 0, 'event'), } ) + # Run genData(pageviewsDb, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT id, pop from pop'), diff --git a/backend/tests/test_histplorer.py b/backend/tests/test_histplorer.py index 8f7e281..cd52c67 100644 --- a/backend/tests/test_histplorer.py +++ b/backend/tests/test_histplorer.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable from histplorer import handleReq, HistDate, Event, ImgInfo, EventInfo, SuggResponse @@ -105,8 +106,10 @@ class TestHandleReq(unittest.TestCase): self.tempDir = tempfile.TemporaryDirectory() self.dbFile = os.path.join(self.tempDir.name, 'data.db') initTestDb(self.dbFile) + def tearDown(self): self.tempDir.cleanup() + def test_events_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'type=events&range=-1999.2002-11-1&scale=1&incl=3&limit=2'}) self.assertEqual(response.events, [ @@ -123,6 +126,7 @@ class TestHandleReq(unittest.TestCase): Event(1, 'event one', HistDate(None, 1900, 1, 1), None, None, None, 'event', 10, 11), ]) self.assertEqual(response.unitCounts, {-2000: 1, 1900: 2, 1990: 1}) + def test_info_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'type=info&event=event%20three'}) self.assertEqual(response, @@ -136,6 +140,7 @@ class TestHandleReq(unittest.TestCase): Event(4, 'event four', HistDate(False, -2000, 10, 10), None, HistDate(False, 1, 10, 10), 
None, 'event', 20, 1000), 'desc four', 400, ImgInfo('example.com/2', 'cc-by', 'artist two', 'credits two'))) + def test_sugg_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'type=sugg&input=event t'}) self.assertEqual(response, SuggResponse(['event two', 'event three'], False)) diff --git a/backend/tests/test_reduce_event_data.py b/backend/tests/test_reduce_event_data.py deleted file mode 100644 index 22fe204..0000000 --- a/backend/tests/test_reduce_event_data.py +++ /dev/null @@ -1,141 +0,0 @@ -import unittest -import tempfile, os - -from tests.common import createTestDbTable, readTestDbTable -from hist_data.reduce_event_data import reduceData -from hist_data.cal import gregorianToJdn, julianToJdn, MONTH_SCALE, DAY_SCALE - -class TestReduceData(unittest.TestCase): - def test_reduce(self): - with tempfile.TemporaryDirectory() as tempDir: - # Create temp history db - dbFile = os.path.join(tempDir, 'data.db') - createTestDbTable( - dbFile, - 'CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \ - 'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)', - 'INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', - { - (1, 'event one', 1900, None, None, None, 0, 'event'), - (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), # 2/11/2002 - (3, 'event three', 2448175, 2448200, None, None, 1, 'discovery'), # 10/10/1990 - (4, 'event four', 1900, None, None, None, 0, 'event'), # Copy of 1 - (5, 'event five', 2452595, None, 2455369, None, 3, 'human'), # Day after 2 - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE pop (id INT PRIMARY KEY, pop INT)', - 'INSERT INTO pop VALUES (?, ?)', - { - (1, 10), - (2, 20), - (3, 30), - (4, 40), - (5, 50), - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE dist (scale INT, unit INT, count INT, PRIMARY KEY (scale, unit))', - 'INSERT INTO dist VALUES (?, ?, ?)', - { - (1, 1900, 2), - (1, 1990, 1), - (1, 2002, 2), - (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 2), - (MONTH_SCALE, 
gregorianToJdn(1990, 10, 1), 1), - (MONTH_SCALE, julianToJdn(2002, 11, 1), 2), - (DAY_SCALE, gregorianToJdn(1900, 1, 1), 2), - (DAY_SCALE, gregorianToJdn(1990, 10, 10), 1), - (DAY_SCALE, 2452594, 1), - (DAY_SCALE, 2452595, 1), - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE event_disp (id INT, scale INT, PRIMARY KEY (id, scale))', - 'INSERT INTO event_disp VALUES (?, ?)', - { - (1, 1), - (1, MONTH_SCALE), - (1, DAY_SCALE), - (2, 1), - (2, MONTH_SCALE), - (2, DAY_SCALE), - (3, 1), - (3, MONTH_SCALE), - (3, DAY_SCALE), - (4, 1), - (4, MONTH_SCALE), - (4, DAY_SCALE), - (5, 1), - (5, MONTH_SCALE), - (5, DAY_SCALE), - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE event_imgs (id INT PRIMARY KEY, img_id INT)', - 'INSERT INTO event_imgs VALUES (?, ?)', - { - (1, 11), - (2, 21), - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE images (id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT)', - 'INSERT INTO images VALUES (?, ?, ?, ?, ?)', - { - (11, 'example.com/1', 'cc0', 'artist one', 'credits one'), - (21, 'example.com/1', 'cc0', 'artist two', 'credits two'), - } - ) - createTestDbTable( - dbFile, - 'CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)', - 'INSERT INTO descs VALUES (?, ?, ?)', - { - (1, 100, 'desc one'), - } - ) - # Run - reduceData(dbFile, [1, MONTH_SCALE, DAY_SCALE]) - # Check - self.assertEqual( - readTestDbTable(dbFile, 'SELECT * FROM events'), - { - (1, 'event one', 1900, None, None, None, 0, 'event'), - (2, 'event two', 2452594, None, 2455369, None, 3, 'human'), - } - ) - self.assertEqual( - readTestDbTable(dbFile, 'SELECT * from pop'), - { - (1, 10), - (2, 20), - } - ) - self.assertEqual( - readTestDbTable(dbFile, 'SELECT * from dist'), - { - (1, 1900, 1), - (1, 2002, 1), - (MONTH_SCALE, gregorianToJdn(1900, 1, 1), 1), - (MONTH_SCALE, julianToJdn(2002, 11, 1), 1), - (DAY_SCALE, gregorianToJdn(1900, 1, 1), 1), - (DAY_SCALE, 2452594, 1), - } - ) - self.assertEqual( - readTestDbTable(dbFile, 'SELECT * from 
event_disp'), - { - (1, 1), - (1, MONTH_SCALE), - (1, DAY_SCALE), - (2, 1), - (2, MONTH_SCALE), - (2, DAY_SCALE), - } - ) |
