aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data')
-rw-r--r--backend/hist_data/README.md27
-rw-r--r--backend/hist_data/cal.py23
-rwxr-xr-xbackend/hist_data/enwiki/download_img_license_info.py29
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py27
-rwxr-xr-xbackend/hist_data/enwiki/gen_desc_data.py51
-rwxr-xr-xbackend/hist_data/enwiki/gen_dump_index_db.py17
-rwxr-xr-xbackend/hist_data/enwiki/gen_img_data.py44
-rwxr-xr-xbackend/hist_data/enwiki/gen_pageview_data.py30
-rwxr-xr-xbackend/hist_data/gen_desc_data.py14
-rwxr-xr-xbackend/hist_data/gen_disp_data.py23
-rwxr-xr-xbackend/hist_data/gen_events_data.py85
-rwxr-xr-xbackend/hist_data/gen_imgs.py31
-rwxr-xr-xbackend/hist_data/gen_picked_data.py30
-rwxr-xr-xbackend/hist_data/gen_pop_data.py17
14 files changed, 303 insertions, 145 deletions
diff --git a/backend/hist_data/README.md b/backend/hist_data/README.md
index 9fe2d0e..09a71fc 100644
--- a/backend/hist_data/README.md
+++ b/backend/hist_data/README.md
@@ -9,37 +9,38 @@ This directory holds files used to generate the history database data.db.
- `start*` and `end*` specify start and end dates.
`start_upper`, `end`, and `end_upper`, are optional.
If `start_upper` is present, it and `start` denote an uncertain range of start times.
- Similarly for 'end' and 'end_upper'.
+ Similarly for `end` and `end_upper`.
- `fmt` indicates format info for `start`, `start_upper`, `end`, and `end_upper`.
- If 0, they denote a number of years AD (if positive) or BC (if negative).
- If 1, they denote a Julian date number.
This allows simple comparison of events with day-level precision, but only goes back to 4713 BC.
- If 2, same as 1, but with a preference for display using the Julian calendar, not the Gregorian calendar.
For example, William Shakespeare's birth appears 'preferably Julian', but Samuel Johnson's does not.
- - If 3, same as 2, but where 'start' and 'start_upper' are 'preferably Julian'.
+ - If 3, same as 2, but where only `start` and `start_upper` are 'preferably Julian'.
For example, Galileo Galilei's birth date appears 'preferably Julian', but his death date does not.
- `pop`: <br>
Format: `id INT PRIMARY KEY, pop INT` <br>
- Associates each event with a popularity measure (currently an average monthly viewcount)
+ Associates each event with a popularity measure (currently an average monthly viewcount).
- `dist`: <br>
Format: `scale INT, unit INT, count INT, PRIMARY KEY (scale, unit)` <br>
- Maps scale units to counts of events in them.
+ For each scale, maps its units to event counts.
+ For example, on the monthly scale, the unit for Jan 2010 might have 10 events.
- `event_disp`: <br>
Format: `id INT, scale INT, unit INT, PRIMARY KEY (id, scale)` <br>
Maps events to scales+units they are 'displayable' on (used to make displayed events more uniform across time).
-- `img_dist`: <br>
- Like `dist`, but only counts events with images.
-- `img_disp`: <br>
- Like `events_disp`, but only counts events with images.
- `images`: <br>
Format: `id INT PRIMARY KEY, url TEXT, license TEXT, artist TEXT, credit TEXT` <br>
- Holds metadata for available images
+ Holds metadata for available images.
- `event_imgs`: <br>
Format: `id INT PRIMARY KEY, img_id INT` <br>
- Assocates events with images
+  Associates events with images.
- `descs`: <br>
Format: `id INT PRIMARY KEY, wiki_id INT, desc TEXT` <br>
Associates an event's enwiki title with a short description.
+- `img_dist`: <br>
+ Like `dist`, but only counts events with images.
+- `img_disp`: <br>
+  Like `event_disp`, but only includes events with images.
# Generating the Database
@@ -66,12 +67,12 @@ Some of the scripts use third-party packages:
looks for infobox image names, and stores them in an image database.
1. In enwiki/, run `download_img_license_info.py`, which downloads licensing info for found
images, and adds them to the image database. You should probably first change the USER_AGENT
- script variable to identify yourself to the online API (this is expected
- [best practice](https://www.mediawiki.org/wiki/API:Etiquette)).
+ script variable to identify yourself to the online API (this is
+ [expected best practice](https://www.mediawiki.org/wiki/API:Etiquette)).
1. In enwiki/, run `download_imgs.py`, which downloads images into enwiki/imgs/. Setting the
USER_AGENT variable applies here as well. <br>
In some rare cases, the download won't produce an image file, but a text file containing
- 'File not found: ...'. These can simply be deleted.
+ 'File not found: ...'. These can be deleted.
1. Run `gen_imgs.py`, which creates resized/cropped images in img/, from images in enwiki/imgs/.
Adds the `imgs` and `event_imgs` tables. <br>
The output images might need additional manual changes:
diff --git a/backend/hist_data/cal.py b/backend/hist_data/cal.py
index efb5bab..d86589b 100644
--- a/backend/hist_data/cal.py
+++ b/backend/hist_data/cal.py
@@ -2,8 +2,11 @@
Provides date conversion functions, HistDate, and date scales.
"""
-# For conversion between calendars and Julian day numbers. Algorithms were obtained from
+# ========== For conversion between calendars and Julian day numbers. ==========
+
+# Algorithms were obtained from:
# https://en.wikipedia.org/wiki/Julian_day#Converting_Gregorian_calendar_date_to_Julian_Day_Number.
+
def gregorianToJdn(year: int, month: int, day: int) -> int:
"""
Converts a Gregorian calendar date to a Julian day number,
@@ -20,6 +23,7 @@ def gregorianToJdn(year: int, month: int, day: int) -> int:
jdn -= int((3 * int((year + 4900 + x) / 100)) / 4)
jdn += day - 32075
return jdn
+
def julianToJdn(year: int, month: int, day: int) -> int:
"""
Like gregorianToJdn(), but converts a Julian calendar date.
@@ -32,6 +36,7 @@ def julianToJdn(year: int, month: int, day: int) -> int:
jdn += int(275 * month / 9)
jdn += day + 1729777
return jdn
+
def jdnToGregorian(jdn: int) -> tuple[int, int, int]:
"""
Converts a Julian day number to a Gregorian calendar date, denoting the
@@ -48,6 +53,7 @@ def jdnToGregorian(jdn: int) -> tuple[int, int, int]:
if Y <= 0:
Y -= 1
return Y, M, D
+
def jdnToJulian(jdn: int) -> tuple[int, int, int]:
""" Like jdnToGregorian(), but converts to a Julian calendar date """
f = jdn + 1401
@@ -60,16 +66,20 @@ def jdnToJulian(jdn: int) -> tuple[int, int, int]:
if Y <= 0:
Y -= 1
return Y, M, D
+
def julianToGregorian(year: int, month: int, day: int) -> tuple[int, int, int]:
return jdnToGregorian(julianToJdn(year, month, day))
+
def gregorianToJulian(year: int, month: int, day: int) -> tuple[int, int, int]:
return jdnToJulian(gregorianToJdn(year, month, day))
-# For date representation
+# ========== For date representation ==========
+
MIN_CAL_YEAR = -4713 # Year before which JDNs are not usable
MONTH_SCALE = -1;
DAY_SCALE = -2;
SCALES: list[int] = [int(s) for s in [1e9, 1e8, 1e7, 1e6, 1e5, 1e4, 1e3, 100, 10, 1, MONTH_SCALE, DAY_SCALE]];
+
class HistDate:
"""
Represents a historical date
@@ -85,12 +95,14 @@ class HistDate:
self.year = year
self.month = month
self.day = day
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, HistDate) and \
(self.gcal, self.year, self.month, self.day) == (other.gcal, other.year, other.month, other.day)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
+
def dbDateToHistDate(n: int, fmt: int, end=False) -> HistDate:
""" Converts a start/start_upper/etc and fmt value in the 'events' db table, into a HistDate """
if fmt == 0: # year
@@ -99,6 +111,7 @@ def dbDateToHistDate(n: int, fmt: int, end=False) -> HistDate:
return HistDate(True, *jdnToGregorian(n))
else: # fmt == 2 or fmt == 3 and not end
return HistDate(False, *jdnToJulian(n))
+
def dateToUnit(date: HistDate, scale: int) -> int:
""" Converts a date to an int representing a unit on a scale """
if scale >= 1:
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 43f2c43..6fd710c 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -10,12 +10,16 @@ at already-processed names to decide what to skip.
"""
import argparse
-import re, time, signal
-import sqlite3, urllib.parse, html
+import re
+import time
+import signal
+import sqlite3
+import urllib.parse
+import html
import requests
IMG_DB = 'img_data.db'
-#
+
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
BATCH_SZ = 50 # Max 50
@@ -26,17 +30,18 @@ def downloadInfo(imgDb: str) -> None:
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
+
print('Checking for table')
if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \
'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
- #
+
print('Reading image names')
imgNames: set[str] = set()
for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
imgNames.add(imgName)
print(f'Found {len(imgNames)}')
- #
+
print('Checking for already-processed images')
nextImgId = 1
oldSz = len(imgNames)
@@ -45,7 +50,7 @@ def downloadInfo(imgDb: str) -> None:
if imgId >= nextImgId:
nextImgId = imgId + 1
print(f'Found {oldSz - len(imgNames)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -54,7 +59,7 @@ def downloadInfo(imgDb: str) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Iterating through image names')
imgNameList = list(imgNames)
iterNum = 0
@@ -65,9 +70,11 @@ def downloadInfo(imgDb: str) -> None:
if interrupted:
print(f'Exiting loop at iteration {iterNum}')
break
+
# Get batch
imgBatch = imgNameList[i:i+BATCH_SZ]
imgBatch = ['File:' + x for x in imgBatch]
+
# Make request
headers = {
'user-agent': USER_AGENT,
@@ -90,6 +97,7 @@ def downloadInfo(imgDb: str) -> None:
print(f'ERROR: Exception while downloading info: {e}')
print('\tImage batch: ' + '|'.join(imgBatch))
continue
+
# Parse response-object
if 'query' not in responseObj or 'pages' not in responseObj['query']:
print('WARNING: Response object doesn\'t have page data')
@@ -120,6 +128,7 @@ def downloadInfo(imgDb: str) -> None:
if title not in imgNames:
print(f'WARNING: Got title "{title}" not in image-name list')
continue
+
if 'imageinfo' not in page:
print(f'WARNING: No imageinfo section for page "{title}"')
continue
@@ -129,6 +138,7 @@ def downloadInfo(imgDb: str) -> None:
artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
# Remove markup
if artist is not None:
artist = TAG_REGEX.sub(' ', artist).strip()
@@ -140,11 +150,12 @@ def downloadInfo(imgDb: str) -> None:
credit = WHITESPACE_REGEX.sub(' ', credit)
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
+
# Add to db
dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)',
(nextImgId, title, license, artist, credit, restrictions, url))
nextImgId += 1
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
@@ -152,5 +163,5 @@ def downloadInfo(imgDb: str) -> None:
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadInfo(IMG_DB)
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index df40bae..e484b33 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,33 +9,38 @@ The program can be re-run to continue downloading, and looks
in the output directory do decide what to skip.
"""
-# Took about a week to downloaded about 60k images
+# Note: Took about a week to download about 60k images
import argparse
-import re, os, time, signal
+import re
+import os
+import time
+import signal
import sqlite3
-import urllib.parse, requests
+import urllib.parse
+import requests
IMG_DB = 'img_data.db' # About 130k image names
OUT_DIR = 'imgs'
-#
+
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
TIMEOUT = 1
- # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
- # It's unclear how to properly check for cache misses, so we just aim for 1 per sec
+ # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'.
+ # It's unclear how to properly check for cache misses, so we just aim for 1 per sec.
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit)
def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if not os.path.exists(outDir):
os.mkdir(outDir)
+
print('Checking for already-downloaded images')
fileList = os.listdir(outDir)
imgIdsDone: set[int] = set()
for filename in fileList:
imgIdsDone.add(int(os.path.splitext(filename)[0]))
print(f'Found {len(imgIdsDone)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
+
print('Starting downloads')
iterNum = 0
query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs'
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if interrupted:
print('Exiting loop')
break
+
# Check for problematic attributes
if license is None or LICENSE_REGEX.fullmatch(license) is None:
continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
continue
if restrictions is not None and restrictions != '':
continue
+
# Download image
iterNum += 1
print(f'Iteration {iterNum}: Downloading for image ID {imgId}')
@@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
timeout *= 2
print(f'New timeout: {timeout}')
continue
+
print('Closing database')
dbCon.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index bb2b845..194afe8 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -5,30 +5,40 @@ Reads through the wiki dump, attempts to parse short-descriptions,
and adds them to a database
"""
-# In testing, this script took over 10 hours to run, and generated about 5GB
+# Note: In testing, this script took over 10 hours to run, and generated about 5GB
import argparse
-import sys, os, re
-import bz2, html, mwxml, mwparserfromhell
+import sys
+import os
+import re
import sqlite3
+import bz2
+import html
+
+import mwxml
+import mwparserfromhell
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'
-# Regexps
+
DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
def convertTemplateReplace(match):
""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
if match.group(2) is None:
return f'{match.group(1)} {match.group(4)}'
else:
return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'
-PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
-LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+# ========== For data generation ==========
def genData(dumpFile: str, dbFile: str) -> None:
+ """ Reads dump, parses descriptions, and writes to db """
print('Creating database')
if os.path.exists(dbFile):
raise Exception(f'ERROR: Existing {dbFile}')
@@ -39,13 +49,13 @@ def genData(dumpFile: str, dbFile: str) -> None:
dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
- #
+
print('Iterating through dump file')
with bz2.open(dumpFile, mode='rt') as file:
for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
if pageNum % 1e4 == 0:
print(f'At page {pageNum}')
- # Parse page
+
if page.namespace == 0:
try:
dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
@@ -60,15 +70,22 @@ def genData(dumpFile: str, dbFile: str) -> None:
desc = parseDesc(revision.text)
if desc is not None:
dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseDesc(text: str) -> str | None:
- # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
- # and then accumulate lines until a blank one.
- # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
- # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
+ """
+ Looks for a description in wikitext content.
+
+ Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+ and then accumulates lines until a blank one.
+
+ Some cases not accounted for include:
+ disambiguation pages, abstracts with sentences split-across-lines,
+	nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell,
+ """
lines: list[str] = []
openBraceCount = 0
openBracketCount = 0
@@ -108,18 +125,24 @@ def parseDesc(text: str) -> str | None:
if lines:
return removeMarkup(' '.join(lines))
return None
+
def removeMarkup(content: str) -> str:
+ """ Tries to remove markup from wikitext content """
content = EMBEDDED_HTML_REGEX.sub('', content)
content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
content = PARENS_GROUP_REGEX.sub('', content)
content = LEFTOVER_BRACE_REGEX.sub('', content)
return content
+
def convertTitle(title: str) -> str:
+ """ Replaces underscores in wiki item title """
return html.unescape(title).replace('_', ' ')
+# ========== Main block ==========
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(DUMP_FILE, DB_FILE)
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 6be8bc5..8872171 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -1,24 +1,28 @@
#!/usr/bin/python3
"""
-Adds data from the wiki-dump index-file into a database
+Converts data from the wiki-dump index-file into a database
"""
import argparse
-import sys, os, re
-import bz2, sqlite3
+import sys
+import os
+import re
+import bz2
+import sqlite3
INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
DB_FILE = 'dump_index.db'
def genData(indexFile: str, dbFile: str) -> None:
- """ Reads the index file and creates the db """
if os.path.exists(dbFile):
raise Exception(f'ERROR: Existing {dbFile}')
+
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
print('Iterating through index file')
lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
lastOffset = 0
@@ -29,7 +33,7 @@ def genData(indexFile: str, dbFile: str) -> None:
lineNum += 1
if lineNum % 1e5 == 0:
print(f'At line {lineNum}')
- #
+
match = lineRegex.fullmatch(line.rstrip())
assert match is not None
offsetStr, pageId, title = match.group(1,2,3)
@@ -49,6 +53,7 @@ def genData(indexFile: str, dbFile: str) -> None:
dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
except sqlite3.IntegrityError as e:
print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
print('Closing database')
dbCon.commit()
dbCon.close()
@@ -56,5 +61,5 @@ def genData(indexFile: str, dbFile: str) -> None:
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(INDEX_FILE, DB_FILE)
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index 9aa3863..05df63d 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -8,35 +8,42 @@ The program can be re-run with an updated set of page IDs, and
will skip already-processed page IDs.
"""
-import os, re
-import bz2, html, urllib.parse
+import argparse
+import os
+import re
+import bz2
+import html
+import urllib.parse
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dump_index.db'
IMG_DB = 'img_data.db' # The database to create
DB_FILE = os.path.join('..', 'data.db')
-# Regexps
+
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+# ========== For data generation ==========
+
def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
+ """ Looks up page IDs in dump and creates database """
print('Opening databases')
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
+
print('Checking tables')
if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
# Create tables if not present
imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)')
# 'img_name' values are set to NULL to indicate page IDs where no image was found
imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
- else:
- # Check for already-processed page IDs
+ else: # Check for already-processed page IDs
numSkipped = 0
for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
if pid in pageIds:
@@ -45,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
else:
print(f'Found already-processed page ID {pid} which was not in input set')
print(f'Will skip {numSkipped} already-processed page IDs')
- #
+
print('Getting dump-file offsets')
offsetToPageId: dict[int, list[int]] = {}
offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
@@ -55,7 +62,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
- #
+
query = 'SELECT offset, next_offset, title FROM offsets WHERE id = ?'
row = indexDbCur.execute(query, (pageId,)).fetchone()
if row is None:
@@ -68,7 +75,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
offsetToPageId[chunkOffset].append(pageId)
pageIdToTitle[pageId] = title
print(f'Found {len(offsetToEnd)} chunks to check')
- #
+
print('Iterating through chunks in dump file')
with open(dumpFile, mode='rb') as file:
iterNum = 0
@@ -76,7 +83,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 100 == 0:
print(f'At iteration {iterNum}')
- #
+
chunkPageIds = offsetToPageId[pageOffset]
# Jump to chunk
file.seek(pageOffset)
@@ -122,21 +129,24 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
content.append(line[:line.rfind('</text>')])
# Look for image-filename
imageName = getImageName(content)
- imgDbCur.execute('INSERT into page_imgs VALUES (?, ?, ?)', (pageId, None if imageName is None else pageIdToTitle[pageId], imageName))
+ imgDbCur.execute(
+ 'INSERT into page_imgs VALUES (?, ?, ?)',
+ (pageId, None if imageName is None else pageIdToTitle[pageId], imageName))
break
if not foundTextEnd:
print(f'WARNING: Did not find </text> for page id {pageId}')
break
if not foundText:
print(f'WARNING: Did not find <text> for page id {pageId}')
- #
+
print('Closing databases')
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()
+
def getImageName(content: list[str]) -> str | None:
""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+	# Note: Doesn't try to find images in outside-infobox [[File:...]] and <imagemap> sections
for line in content:
match = IMG_LINE_REGEX.match(line)
if match is not None:
@@ -177,6 +187,8 @@ def getImageName(content: list[str]) -> str | None:
return None
return None
+# ========== For getting input page IDs ==========
+
def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
print('Getting event data')
titles: set[str] = set()
@@ -184,6 +196,7 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
for (title,) in dbCon.execute('SELECT title from events'):
titles.add(title)
dbCon.close()
+
print('Getting page IDs')
pageIds: set[int] = set()
dbCon = sqlite3.connect(indexDb)
@@ -193,12 +206,15 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
if row:
pageIds.add(row[0])
dbCon.close()
+
print(f'Result: {len(pageIds)} out of {len(titles)}')
return pageIds
+
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB)
genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
index 935b303..57d6c7b 100755
--- a/backend/hist_data/enwiki/gen_pageview_data.py
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+ wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+ platform (eg: mobile-web), monthly view count,
+ hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
"""
-# Took about 10min per file (each had about 180e6 lines)
+# Note: Took about 10min per file (each had about 180e6 lines)
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
DUMP_INDEX_DB = 'dump_index.db'
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
- # Each pageview file has lines that seem to hold these space-separated fields:
- # wiki code (eg: en.wikipedia), article title, page ID (may be: null),
- # platform (eg: mobile-web), monthly view count,
- # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
- #
+
namespaceRegex = re.compile(r'[a-zA-Z]+:')
titleToViews: dict[str, int] = defaultdict(int)
linePrefix = b'en.wikipedia '
@@ -35,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
print(f'At line {lineNum}')
if not line.startswith(linePrefix):
continue
+
# Get second and second-last fields
linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
title = linePart[:linePart.find(b' ')].decode('utf-8')
@@ -45,11 +53,12 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
continue
if namespaceRegex.match(title) is not None:
continue
+
# Update map
title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -66,8 +75,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
idbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/hist_data/gen_desc_data.py b/backend/hist_data/gen_desc_data.py
index 6c9fee2..bcd8870 100755
--- a/backend/hist_data/gen_desc_data.py
+++ b/backend/hist_data/gen_desc_data.py
@@ -5,7 +5,8 @@ Maps events to short descriptions from Wikipedia, and stores them in the databas
"""
import argparse
-import os, sqlite3
+import os
+import sqlite3
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
DB_FILE = 'data.db'
@@ -15,12 +16,12 @@ def genData(enwikiDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, wiki_id INT, desc TEXT)')
- #
+
print('Getting events')
titleToId: dict[str, int] = {}
for eventId, title in dbCur.execute('SELECT id, title FROM events'):
titleToId[title] = eventId
- #
+
print('Getting Wikipedia descriptions')
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
@@ -29,11 +30,13 @@ def genData(enwikiDb: str, dbFile: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
+
# Get wiki ID
row = enwikiCur.execute('SELECT id FROM pages WHERE title = ?', (title,)).fetchone()
if row is None:
continue
wikiId = row[0]
+
# Check for redirect
wikiIdToGet = wikiId
query = \
@@ -41,12 +44,13 @@ def genData(enwikiDb: str, dbFile: str) -> None:
row = enwikiCur.execute(query, (wikiId,)).fetchone()
if row is not None:
wikiIdToGet = row[0]
+
# Get desc
row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone()
if row is None:
continue
dbCur.execute('INSERT INTO descs VALUES (?, ?, ?)', (eventId, wikiId, row[0]))
- #
+
print('Closing databases')
dbCon.commit()
dbCon.close()
@@ -54,5 +58,5 @@ def genData(enwikiDb: str, dbFile: str) -> None:
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(ENWIKI_DB, DB_FILE)
diff --git a/backend/hist_data/gen_disp_data.py b/backend/hist_data/gen_disp_data.py
index 193adbb..6bb84ad 100755
--- a/backend/hist_data/gen_disp_data.py
+++ b/backend/hist_data/gen_disp_data.py
@@ -5,14 +5,15 @@ Adds data about event distribution to the database,
and removes events not eligible for display
"""
-# Code used in unit testing (for resolving imports of modules within this directory)
-import os, sys
+# For unit testing, resolve imports of modules within this directory
+import os
+import sys
parentDir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parentDir)
-# Standard imports
+
import argparse
import sqlite3
-# Local imports
+
from cal import SCALES, dbDateToHistDate, dateToUnit
MAX_DISPLAYED_PER_UNIT = 4
@@ -21,7 +22,7 @@ DB_FILE = 'data.db'
def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTables: bool) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Reading through events')
scaleUnitToCounts: dict[tuple[int, int], list[int]] = {}
# Maps scale and unit to two counts (num events in that unit, num events displayable for that unit)
@@ -35,7 +36,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa
iterNum += 1
if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
- # For each scale
+
for scale in scales:
unit = dateToUnit(dbDateToHistDate(eventStart, fmt), scale)
# Update maps
@@ -52,7 +53,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa
idScales[eventId].append((scale, unit))
scaleUnitToCounts[(scale, unit)] = counts
print(f'Results: {len(idScales)} displayable events')
- #
+
print('Looking for non-displayable events')
eventsToDel: list[int] = []
for eventId, eventStart, fmt in dbCur.execute(query):
@@ -71,7 +72,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa
'SELECT events.id FROM events LEFT JOIN pop ON events.id = pop.id WHERE pop.id IS NULL'):
eventsToDel.append(eventId)
print(f'Found {len(eventsToDel)}')
- #
+
if not forImageTables:
print(f'Deleting {len(eventsToDel)} events')
iterNum = 0
@@ -82,7 +83,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa
#
dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
- #
+
print('Writing to db')
distTable = 'dist' if not forImageTables else 'img_dist'
dispTable = 'event_disp' if not forImageTables else 'img_disp'
@@ -94,7 +95,7 @@ def genData(dbFile: str, scales: list[int], maxDisplayedPerUnit: int, forImageTa
for eventId, scaleUnits in idScales.items():
for [scale, unit] in scaleUnits:
dbCur.execute(f'INSERT INTO {dispTable} VALUES (?, ?, ?)', (eventId, scale, unit))
- #
+
print('Closing db')
dbCon.commit()
dbCon.close()
@@ -104,5 +105,5 @@ if __name__ == '__main__':
parser.add_argument(
'type', nargs='?', choices=['event', 'img'], default='event', help='The type of tables to generate')
args = parser.parse_args()
- #
+
genData(DB_FILE, SCALES, MAX_DISPLAYED_PER_UNIT, args.type == 'img')
diff --git a/backend/hist_data/gen_events_data.py b/backend/hist_data/gen_events_data.py
index 60402b5..453a9ad 100755
--- a/backend/hist_data/gen_events_data.py
+++ b/backend/hist_data/gen_events_data.py
@@ -59,26 +59,37 @@ Info about objects with type 'quantity' can be found at: https://www.wikidata.or
# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
# Possibly related: https://github.com/python/cpython/issues/72882
-# Took about 4.5 hours to run
+# Note: Took about 4.5 hours to run
-# Code used in unit testing (for resolving imports of modules within this directory)
-import os, sys
+# For unit testing, resolve imports of modules within this directory
+import os
+import sys
parentDir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parentDir)
-# Standard imports
+
from typing import cast
import argparse
-import math, re
-import io, bz2, json, sqlite3
-import indexed_bzip2, pickle, multiprocessing, tempfile
-# Local imports
+import math
+import re
+import io
+import bz2
+import json
+import sqlite3
+
+import indexed_bzip2
+import pickle
+import multiprocessing
+import tempfile
+
from cal import gregorianToJdn, julianToJdn, MIN_CAL_YEAR
-# Constants
+# ========== Constants ==========
+
WIKIDATA_FILE = os.path.join('wikidata', 'latest-all.json.bz2')
OFFSETS_FILE = os.path.join('wikidata', 'offsets.dat')
DB_FILE = 'data.db'
N_PROCS = 6 # Number of processes to use
+
# For getting Wikidata entity IDs
INSTANCE_OF = 'P31'
EVENT_CTG: dict[str, dict[str, str]] = {
@@ -173,24 +184,28 @@ UNIT_TO_SCALE: dict[str, int] = {
'http://www.wikidata.org/entity/Q20764': 10**6, # 'megaannum' (1e6 yrs)
'http://www.wikidata.org/entity/Q524410': 10**9, # 'gigaannum' (1e9 yrs)
}
+
# For filtering lines before parsing JSON
TYPE_ID_REGEX = ('"id":(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('Q')]) + '")').encode()
PROP_ID_REGEX = ('(?:"' + '"|"'.join([id for id in ID_TO_CTG if id.startswith('P')]) + '"):\[{"mainsnak"').encode()
+# ========== Main function ==========
+
def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
""" Reads the dump and writes to db """
- # Check db
if os.path.exists(dbFile):
print('ERROR: Database already exists')
return
- # Read dump, and write to db
- print('Writing to db')
+
+ print('Opening db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
dbCur.execute('CREATE TABLE events (id INT PRIMARY KEY, title TEXT UNIQUE, ' \
'start INT, start_upper INT, end INT, end_upper INT, fmt INT, ctg TEXT)')
dbCur.execute('CREATE INDEX events_id_start_idx ON events(id, start)')
dbCur.execute('CREATE INDEX events_title_nocase_idx ON events(title COLLATE NOCASE)')
+
if nProcs == 1:
with bz2.open(wikidataFile, mode='rb') as file:
for lineNum, line in enumerate(file, 1):
@@ -206,6 +221,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
with indexed_bzip2.open(wikidataFile) as file:
with open(offsetsFile, 'wb') as file2:
pickle.dump(file.block_offsets(), file2)
+
print('Allocating file into chunks')
fileSz: int # Was about 1.4 TB
with indexed_bzip2.open(wikidataFile) as file:
@@ -216,6 +232,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1]
# Each adjacent pair specifies a start+end byte index for readDumpChunk()
print(f'- Chunk size: {chunkSz:,}')
+
print('Starting processes to read dump')
with tempfile.TemporaryDirectory() as tempDirName:
with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool:
@@ -227,15 +244,19 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
with open(outFile, 'rb') as file:
for item in pickle.load(file):
dbCur.execute('INSERT OR IGNORE INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)', item)
+
+ print('Closing db')
dbCon.commit()
dbCon.close()
-# For data extraction
+# ========== For data extraction ==========
+
def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | None, int | None, int, str] | None:
""" Parses a Wikidata dump line, returning an entry to add to the db """
# Check with regexes
if re.search(TYPE_ID_REGEX, lineBytes) is None and re.search(PROP_ID_REGEX, lineBytes) is None:
return None
+
# Decode
try:
line = lineBytes.decode('utf-8').rstrip().rstrip(',')
@@ -246,12 +267,14 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
if 'claims' not in jsonItem:
return None
claims = jsonItem['claims']
+
# Get wikidata ID, enwiki title
try:
itemId = int(jsonItem['id'][1:]) # Skip initial 'Q'
itemTitle: str = jsonItem['sitelinks']['enwiki']['title']
except (KeyError, ValueError):
return None
+
# Get event category
eventCtg: str | None = None
if INSTANCE_OF in claims: # Check types
@@ -269,6 +292,7 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
eventCtg = ID_TO_CTG[prop]
if not eventCtg:
return None
+
# Check for event-start/end props
startVal: str
endVal: str | None
@@ -297,13 +321,15 @@ def readDumpLine(lineBytes: bytes) -> tuple[int, str, int, int | None, int | Non
break
if not found:
return None
+
# Convert time values
timeData = getTimeData(startVal, endVal, timeType)
if timeData is None:
return None
start, startUpper, end, endUpper, timeFmt = timeData
- #
+
return (itemId, itemTitle, start, startUpper, end, endUpper, timeFmt, eventCtg)
+
def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int | None, int | None, int] | None:
""" Obtains event start+end data from 'datavalue' objects with type 'time', according to 'timeType' """
# Values to return
@@ -312,13 +338,13 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
end: int | None = None
endUpper: int | None = None
timeFmt: int
- #
+
if timeType == 'age estimated by a dating method':
+ # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in
+ # 'datedness' and undesirable small offsets to values like '1 billion years old'.
if 'type' not in startVal or startVal['type'] != 'quantity':
return None
- # Get quantity data
- # Note: Ages are interpreted relative to 1 AD. Using a year like 2020 results in
- # 'datedness' and undesirable small offsets to values like '1 billion years old'.
+
try:
value = startVal['value']
amount = math.ceil(float(value['amount']))
@@ -331,23 +357,26 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
upperBound = None
except (KeyError, ValueError):
return None
- # Get unit scale
+
+ # Get scale
if unit not in UNIT_TO_SCALE:
return None
scale = UNIT_TO_SCALE[unit]
+
# Get start+startUpper
if lowerBound is None:
start = -amount * scale
else:
start = -cast(int, upperBound) * scale
startUpper = -lowerBound * scale
+
# Adjust precision
start = start // scale * scale
if startUpper is not None:
startUpper = startUpper // scale * scale
elif scale > 1:
startUpper = start + scale - 1
- #
+
timeFmt = 0
elif timeType == 'earliest date':
# Get start
@@ -355,6 +384,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
if startTimeVals is None:
return None
start, _, timeFmt = startTimeVals
+
# Get end
endTimeVals = getEventTime(endVal)
if endTimeVals is None:
@@ -371,6 +401,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
if startTimeVals is None:
return None
start, startUpper, timeFmt = startTimeVals
+
# Get end+endUpper
if endVal is not None:
endTimeVals = getEventTime(endVal)
@@ -383,6 +414,7 @@ def getTimeData(startVal, endVal, timeType: str) -> tuple[int, int | None, int |
else:
return None
return start, startUpper, end, endUpper, timeFmt
+
def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
""" Obtains event start (or end) data from a 'datavalue' object with type 'time' """
if 'type' not in dataVal or dataVal['type'] != 'time':
@@ -399,6 +431,7 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
calendarmodel = value['calendarmodel']
except (KeyError, ValueError):
return None
+
# Get start+startUpper
start: int
startUpper: int | None = None
@@ -430,12 +463,15 @@ def getEventTime(dataVal) -> tuple[int, int | None, int] | None:
timeFmt = 0
else:
return None
+
return start, startUpper, timeFmt
-# For using multiple processes
+# ========== For using multiple processes ==========
+
def readDumpChunkOneParam(params: tuple[int, str, str, str, int, int]) -> str:
""" Forwards to readDumpChunk() (for use with pool.map()) """
return readDumpChunk(*params)
+
def readDumpChunk(
procId: int, wikidataFile: str, offsetsFile: str, outFile: str, startByte: int, endByte: int) -> str:
""" Reads lines in the dump that begin after a start-byte, and not after an end byte.
@@ -447,12 +483,14 @@ def readDumpChunk(
with open(offsetsFile, 'rb') as file2:
offsets = pickle.load(file2)
file.set_block_offsets(offsets)
+
# Seek to chunk
if startByte != -1:
file.seek(startByte)
file.readline()
else:
startByte = 0 # Used for progress calculation
+
# Read lines
count = 0
while file.tell() <= endByte:
@@ -463,14 +501,17 @@ def readDumpChunk(
entry = readDumpLine(file.readline())
if entry:
entries.append(entry)
+
# Output results into file
with open(outFile, 'wb') as file:
pickle.dump(entries, file)
return outFile
+# ========== Main block ==========
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
multiprocessing.set_start_method('spawn')
genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS)
diff --git a/backend/hist_data/gen_imgs.py b/backend/hist_data/gen_imgs.py
index 46cf6ee..44c0020 100755
--- a/backend/hist_data/gen_imgs.py
+++ b/backend/hist_data/gen_imgs.py
@@ -10,17 +10,20 @@ processing. It uses already-existing database entries to decide what
to skip.
"""
-# Took about 10 hours to process about 60k images
+# Note: Took about 10 hours to process about 60k images
import argparse
-import os, subprocess, signal
-import sqlite3, urllib.parse
+import os
+import subprocess
+import signal
+import sqlite3
+import urllib.parse
IMG_DIR = os.path.join('enwiki', 'imgs')
IMG_DB = os.path.join('enwiki', 'img_data.db')
OUT_DIR = 'img'
DB_FILE = 'data.db'
-#
+
IMG_OUT_SZ = 200
def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
@@ -29,7 +32,7 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
os.mkdir(outDir)
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Checking for image tables')
eventsDone: set[int] = set()
imgsDone: set[int] = set()
@@ -45,23 +48,26 @@ def genImgs(imgDir: str, imgDb: str, outDir: str, dbFile: str):
for (imgId,) in dbCur.execute('SELECT id from images'):
imgsDone.add(imgId)
print(f'Found {len(eventsDone)} events and {len(imgsDone)} images to skip')
- #
+
print('Processing images')
processImgs(imgDir, imgDb, outDir, dbCur, eventsDone, imgsDone)
- #
+
dbCon.commit()
dbCon.close()
+
def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
eventsDone: set[int], imgsDone: set[int]) -> bool:
""" Converts images and updates db, returning False upon interruption or failure """
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
+
# Set SIGINT handler
interrupted = False
def onSigint(sig, frame):
nonlocal interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
+
# Convert images
flag = False # Set to True upon interruption or failure
for imgFile in os.listdir(imgDir):
@@ -70,9 +76,11 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
print('Exiting')
flag = True
break
+
# Get image ID
imgIdStr, _ = os.path.splitext(imgFile)
imgId = int(imgIdStr)
+
# Get associated events
eventIds: set[int] = set()
query = 'SELECT title FROM page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name WHERE imgs.id = ?'
@@ -85,12 +93,14 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
eventIds = eventIds.difference(eventsDone)
if not eventIds:
continue
+
# Convert image
if imgId not in imgsDone:
success = convertImage(os.path.join(imgDir, imgFile), os.path.join(outDir, str(imgId) + '.jpg'))
if not success:
flag = True
break
+
# Add image to db
row = imgDbCur.execute('SELECT name, license, artist, credit FROM imgs WHERE id = ?', (imgId,)).fetchone()
if row is None:
@@ -100,16 +110,21 @@ def processImgs(imgDir: str, imgDb: str, outDir: str, dbCur: sqlite3.Cursor,
name, license, artist, credit = row
url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?)', (imgId, url, license, artist, credit))
+
# Add event association to db
for eventId in eventIds:
dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, imgId))
+
imgDbCon.close()
return not flag
+
def convertImage(imgPath: str, outPath: str):
+ """ Converts an image using smartcrop """
print(f'Converting {imgPath} to {outPath}')
if os.path.exists(outPath):
print('ERROR: Output image already exists')
return False
+
try:
completedProcess = subprocess.run(
['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
@@ -126,5 +141,5 @@ def convertImage(imgPath: str, outPath: str):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genImgs(IMG_DIR, IMG_DB, OUT_DIR, DB_FILE)
diff --git a/backend/hist_data/gen_picked_data.py b/backend/hist_data/gen_picked_data.py
index c5f4577..a6bb8f8 100755
--- a/backend/hist_data/gen_picked_data.py
+++ b/backend/hist_data/gen_picked_data.py
@@ -4,14 +4,15 @@
Adds additional manually-picked events to the database
"""
-# Code used in unit testing (for resolving imports of modules within this directory)
-import os, sys
+# For unit testing, resolve imports of modules within this directory
+import os
+import sys
parentDir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parentDir)
-# Standard imports
+
import argparse
import json, sqlite3
-# Local imports
+
from gen_imgs import convertImage
from cal import SCALES, dbDateToHistDate, dateToUnit
@@ -23,7 +24,7 @@ IMG_OUT_DIR = 'img'
def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, scales: list[int]) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
with open(os.path.join(pickedDir, pickedEvtFile)) as f:
eventsToAdd = json.load(f)
nextId = -1
@@ -33,7 +34,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
if eventId is None and title is None:
print(f'ERROR: Entry with no ID or title: {event}')
break
- #
+
doAdd = eventId is None and len(event) > 1
doModify = eventId is not None and len(event) > 1
doDelete = not doModify and not doAdd
@@ -42,6 +43,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
dbCur.execute('INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
(nextId, event['title'], event['start'], event['start_upper'], event['end'], event['end_upper'],
event['fmt'], event['ctg']))
+
# Update image, description, and popularity tables
if 'image' in event:
print('> Adding image')
@@ -57,6 +59,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
if 'desc' in event:
dbCur.execute('INSERT INTO descs VALUES (?, ?, ?)', (nextId, nextId, event['desc']))
dbCur.execute('INSERT INTO pop VALUES (?, ?)', (nextId, event['pop']))
+
# Update event distribution tables
for scale in scales:
unit = dateToUnit(dbDateToHistDate(event['start'], event['fmt']), scale)
@@ -65,7 +68,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
else:
dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, 1))
dbCur.execute('INSERT INTO event_disp VALUES (?, ?, ?)', (nextId, scale, unit))
- #
+
nextId -= 1
elif doDelete:
if eventId:
@@ -78,6 +81,7 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
print(f'ERROR: Could not find event with title {title}')
break
eventId, eventStart, eventFmt = row
+
# Note: Intentionally not deleting entries or files for images that become unused.
dbCur.execute('DELETE FROM events WHERE id = ?', (eventId,))
dbCur.execute('DELETE FROM pop WHERE id = ?', (eventId,))
@@ -93,15 +97,18 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
dbCur.execute('UPDATE dist SET count = count - 1 WHERE scale = ? AND unit = ?', (scale, unit))
dbCur.execute('DELETE FROM event_disp WHERE id = ?', (eventId,))
else: # doModify
+ # Note: Intentionally not updating 'event_disp' table to account for 'indirect event displayability'
print(f'Modifying event with ID {eventId}')
row = dbCur.execute('SELECT start, fmt FROM events WHERE id = ?', (eventId,)).fetchone()
if row is None:
print(f'ERROR: Could not find event with ID {eventId}')
break
oldStart, oldFmt = row
+
for field in ['title', 'start', 'start_upper', 'end', 'end_upper', 'fmt', 'ctg']:
if field in event:
dbCur.execute(f'UPDATE events SET {field} = ? WHERE id = ?', (event[field], eventId,))
+
if 'image' in event:
print('> Adding image')
image = event['image']
@@ -117,16 +124,19 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
# Note: Intentionally not deleting entries or files for images that become unused.
else:
dbCur.execute('INSERT INTO event_imgs VALUES (?, ?)', (eventId, nextId))
+
if 'desc' in event:
if dbCur.execute('SELECT desc FROM descs WHERE id = ?', (eventId,)).fetchone():
dbCur.execute('UPDATE event_imgs SET desc = ? WHERE id = ?', (event['desc'], eventId))
else:
dbCur.execute('INSERT INTO descs VALUES (?, ?)', (eventId, event['desc']))
+
if 'pop' in event:
if dbCur.execute('SELECT pop FROM pop WHERE id = ?', (eventId,)).fetchone():
dbCur.execute('UPDATE pop SET pop = ? WHERE id = ?', (event['pop'], eventId))
else:
dbCur.execute('INSERT INTO pop VALUES (?, ?)', (eventId, event['pop']))
+
if 'start' in event:
# Remove old distribution data
for scale in scales:
@@ -147,14 +157,14 @@ def genData(pickedDir: str, pickedEvtFile: str, dbFile: str, imgOutDir: str, sca
else:
dbCur.execute('INSERT INTO dist VALUES (?, ?, ?)', (scale, unit, 1))
dbCur.execute('INSERT INTO event_disp VALUES (?, ?, ?)', (eventId, scale, unit))
- # Note: Intentionally not updating 'event_disp' table to account for 'indirect event displayability'
+
nextId -= 1
- #
+
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PICKED_DIR, PICKED_EVT_FILE, DB_FILE, IMG_OUT_DIR, SCALES)
diff --git a/backend/hist_data/gen_pop_data.py b/backend/hist_data/gen_pop_data.py
index aaaf69d..8d50b6b 100755
--- a/backend/hist_data/gen_pop_data.py
+++ b/backend/hist_data/gen_pop_data.py
@@ -4,7 +4,9 @@
Adds Wikipedia page view info to the database as popularity values
"""
-import os, sqlite3
+import argparse
+import os
+import sqlite3
PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
DB_FILE = 'data.db'
@@ -12,12 +14,12 @@ DB_FILE = 'data.db'
def genData(pageviewsDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Getting event data')
titleToId: dict[str, int] = {}
for eventId, title in dbCur.execute('SELECT id, title FROM events'):
titleToId[title] = eventId
- #
+
print('Getting view counts')
pdbCon = sqlite3.connect(pageviewsDb)
pdbCur = pdbCon.cursor()
@@ -27,24 +29,23 @@ def genData(pageviewsDb: str, dbFile: str) -> None:
iterNum += 1
if iterNum % 1e6 == 0:
print(f'At iteration {iterNum}')
- #
+
if title not in titleToId:
continue
titleToViews[title] = views
pdbCon.close()
- #
+
print(f'Result: {len(titleToViews)} out of {len(titleToId)}')
dbCur.execute('CREATE TABLE pop (id INT PRIMARY KEY, pop INT)')
dbCur.execute('CREATE INDEX pop_idx ON pop(pop)')
for title, views in titleToViews.items():
dbCur.execute('INSERT INTO pop VALUES (?, ?)', (titleToId[title], views))
- #
+
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEWS_DB, DB_FILE)