aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data')
-rw-r--r--backend/hist_data/enwiki/README.md60
-rwxr-xr-xbackend/hist_data/enwiki/download_img_license_info.py157
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py95
-rwxr-xr-xbackend/hist_data/enwiki/gen_desc_data.py126
-rwxr-xr-xbackend/hist_data/enwiki/gen_dump_index_db.py60
-rwxr-xr-xbackend/hist_data/enwiki/gen_img_data.py203
-rwxr-xr-xbackend/hist_data/enwiki/gen_pageview_data.py68
7 files changed, 769 insertions, 0 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
new file mode 100644
index 0000000..e50c7e2
--- /dev/null
+++ b/backend/hist_data/enwiki/README.md
@@ -0,0 +1,60 @@
+This directory holds files obtained/derived from [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page).
+
+# Downloaded Files
+- `enwiki-20220501-pages-articles-multistream.xml.bz2` <br>
+ Contains text content and metadata for pages in enwiki.
+ Obtained via <https://dumps.wikimedia.org/backup-index.html>.
+ Some file content and format information was available from
+ <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>.
+- `enwiki-20220501-pages-articles-multistream-index.txt.bz2` <br>
+ Obtained like above. Holds lines of the form offset1:pageId1:title1,
+ providing, for each page, an offset into the dump file of a chunk of
+ 100 pages that includes it.
+
+# Dump-Index Files
+- `gen_dump_index_db.py` <br>
+ Creates a database version of the enwiki-dump index file.
+- `dump_index.db` <br>
+ Generated by `gen_dump_index_db.py`. <br>
+ Tables: <br>
+  - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT`
+
+# Description Files
+- `gen_desc_data.py` <br>
+ Reads through pages in the dump file, and adds short-description info to a database.
+- `desc_data.db` <br>
+ Generated by `gen_desc_data.py`. <br>
+ Tables: <br>
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
+ - `redirects`: `id INT PRIMARY KEY, target TEXT`
+ - `descs`: `id INT PRIMARY KEY, desc TEXT`
+
+# Image Files
+- `gen_img_data.py` <br>
+ Used to find infobox image names for page IDs, and store them into a database.
+- `download_img_license_info.py` <br>
+ Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database.
+- `img_data.db` <br>
+ Used to hold metadata about infobox images for a set of page IDs.
+  Generated using `gen_img_data.py` and `download_img_license_info.py`. <br>
+ Tables: <br>
+  - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT` <br>
+ `img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs.
+ - `imgs`:
+ `id INT PRIMARY KEY, name TEXT UNIQUE, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
+ <br>
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
+- `download_imgs.py` <br>
+ Used to download image files into imgs/.
+
+# Page View Files
+- `pageviews/pageviews-*-user.bz2`
+ Each holds wikimedia article page view data for some month.
+ Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>.
+ Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>.
+- `gen_pageview_data.py` <br>
+  Reads `pageviews/*` and `dump_index.db`, and creates a database holding average monthly pageview counts.
+- `pageview_data.db` <br>
+ Generated using `gen_pageview_data.py`. <br>
+ Tables: <br>
+ - `views`: `title TEXT PRIMARY KEY, id INT UNIQUE, views INT`
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
new file mode 100755
index 0000000..1217caf
--- /dev/null
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -0,0 +1,157 @@
+#!/usr/bin/python3
+
+"""
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
+
+import re
+import sqlite3, urllib.parse, html
+import requests
+import time, signal
+
+IMG_DB = 'img_data.db'
+#
+API_URL = 'https://en.wikipedia.org/w/api.php'
+USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
+BATCH_SZ = 50 # Max 50
+TAG_REGEX = re.compile(r'<[^<]+>')
+WHITESPACE_REGEX = re.compile(r'\s+')
+
def downloadInfo(imgDb: str) -> None:
	"""Downloads licensing metadata for image names in imgDb's 'page_imgs' table.

	Queries the enwiki API in batches of BATCH_SZ, and inserts one row per image
	into the 'imgs' table (created if absent). Already-present names in 'imgs'
	are skipped, so the function can be interrupted (SIGINT) and re-run.
	"""
	print('Opening database')
	dbCon = sqlite3.connect(imgDb)
	dbCur = dbCon.cursor()
	print('Checking for table')
	if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
		dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \
			'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
	#
	print('Reading image names')
	imgNames: set[str] = set()
	for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
		imgNames.add(imgName)
	print(f'Found {len(imgNames)}')
	#
	print('Checking for already-processed images')
	nextImgId = 1  # Next free 'imgs.id' value (one past the largest existing id)
	oldSz = len(imgNames)
	for (imgId, imgName,) in dbCur.execute('SELECT id, name FROM imgs'):
		imgNames.discard(imgName)
		if imgId >= nextImgId:
			nextImgId = imgId + 1
	print(f'Found {oldSz - len(imgNames)}')
	#
	# Set SIGINT handler (lets the current batch finish; a second SIGINT kills normally)
	interrupted = False
	oldHandler = None
	def onSigint(sig, frame):
		nonlocal interrupted
		interrupted = True
		signal.signal(signal.SIGINT, oldHandler)
	oldHandler = signal.signal(signal.SIGINT, onSigint)
	#
	print('Iterating through image names')
	imgNameList = list(imgNames)
	iterNum = 0
	for i in range(0, len(imgNameList), BATCH_SZ):
		iterNum += 1
		# NOTE(review): '% 1' is always 0, so this prints every iteration —
		# presumably a larger interval was intended; confirm before changing
		if iterNum % 1 == 0:
			print(f'At iteration {iterNum} (after {(iterNum - 1) * BATCH_SZ} images)')
		if interrupted:
			print(f'Exiting loop at iteration {iterNum}')
			break
		# Get batch ('File:' prefix is the enwiki namespace for image pages)
		imgBatch = imgNameList[i:i+BATCH_SZ]
		imgBatch = ['File:' + x for x in imgBatch]
		# Make request
		headers = {
			'user-agent': USER_AGENT,
			'accept-encoding': 'gzip',
		}
		params = {
			'action': 'query',
			'format': 'json',
			'prop': 'imageinfo',
			'iiprop': 'extmetadata|url',
			'maxlag': '5',
			'titles': '|'.join(imgBatch),
			'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions',
		}
		responseObj = None
		try:
			response = requests.get(API_URL, params=params, headers=headers)
			responseObj = response.json()
		except Exception as e:
			# Network/JSON failure: skip this batch (it will be retried on a re-run)
			print(f'ERROR: Exception while downloading info: {e}')
			print('\tImage batch: ' + '|'.join(imgBatch))
			continue
		# Parse response-object
		if 'query' not in responseObj or 'pages' not in responseObj['query']:
			print('WARNING: Response object doesn\'t have page data')
			print('\tImage batch: ' + '|'.join(imgBatch))
			if 'error' in responseObj:
				errorCode = responseObj['error']['code']
				print(f'\tError code: {errorCode}')
				if errorCode == 'maxlag':
					# Server asked us to back off — wait before the next batch
					time.sleep(5)
			continue
		pages = responseObj['query']['pages']
		# The API may normalise titles (eg: underscores to spaces);
		# map normalised titles back to the names we queried with
		normalisedToInput: dict[str, str] = {}
		if 'normalized' in responseObj['query']:
			for entry in responseObj['query']['normalized']:
				normalisedToInput[entry['to']] = entry['from']
		for page in pages.values():
			# Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
			# LicenseShortName: short human-readable license name, apparently more reliable than 'License',
			# Artist: author name (might contain complex html, multiple authors, etc)
			# Credit: 'source'
			# For image-map-like images, can be quite large/complex html, crediting each sub-image
			# May be <a href='text1'>text2</a>, where the text2 might be non-indicative
			# Restrictions: specifies non-copyright legal restrictions
			title: str = page['title']
			if title in normalisedToInput:
				title = normalisedToInput[title]
			title = title[5:] # Remove 'File:'
			if title not in imgNames:
				print(f'WARNING: Got title "{title}" not in image-name list')
				continue
			if 'imageinfo' not in page:
				print(f'WARNING: No imageinfo section for page "{title}"')
				continue
			metadata = page['imageinfo'][0]['extmetadata']
			url: str = page['imageinfo'][0]['url']
			license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
			artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
			credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
			restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
			# Remove markup (strip tags, collapse whitespace, decode entities/percent-escapes)
			if artist is not None:
				artist = TAG_REGEX.sub(' ', artist).strip()
				artist = WHITESPACE_REGEX.sub(' ', artist)
				artist = html.unescape(artist)
				artist = urllib.parse.unquote(artist)
			if credit is not None:
				credit = TAG_REGEX.sub(' ', credit).strip()
				credit = WHITESPACE_REGEX.sub(' ', credit)
				credit = html.unescape(credit)
				credit = urllib.parse.unquote(credit)
			# Add to db
			dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)',
				(nextImgId, title, license, artist, credit, restrictions, url))
			nextImgId += 1
	#
	print('Closing database')
	dbCon.commit()
	dbCon.close()
+
if __name__ == '__main__':
	import argparse
	# No arguments; argparse only provides a --help message built from the module docstring
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	downloadInfo(IMG_DB)
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
new file mode 100755
index 0000000..664dd28
--- /dev/null
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python3
+
+"""
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'imgId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
+
+import re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+IMG_DB = 'img_data.db' # About 130k image names
+OUT_DIR = 'imgs'
+#
+LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
+USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
+TIMEOUT = 1
+ # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
+ # It's unclear how to properly check for cache misses, so we just aim for 1 per sec
+
def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
	"""Downloads images listed in imgDb's 'imgs' table into outDir.

	Each image is saved as '<imgId><ext>'; IDs with an existing file in outDir
	are skipped, so the function can be interrupted (SIGINT) and re-run.
	Rows with unusable license/artist/credit/restrictions fields are skipped.
	Sleeps 'timeout' seconds after each download (rate limiting).
	"""
	if not os.path.exists(outDir):
		os.mkdir(outDir)
	print('Checking for already-downloaded images')
	fileList = os.listdir(outDir)
	imgIdsDone: set[int] = set()
	for filename in fileList:
		imgIdsDone.add(int(os.path.splitext(filename)[0]))
	print(f'Found {len(imgIdsDone)}')
	#
	# Set SIGINT handler (lets an in-progress download finish; a second SIGINT kills normally)
	interrupted = False
	oldHandler = None
	def onSigint(sig, frame):
		nonlocal interrupted
		interrupted = True
		signal.signal(signal.SIGINT, oldHandler)
	oldHandler = signal.signal(signal.SIGINT, onSigint)
	#
	print('Opening database')
	dbCon = sqlite3.connect(imgDb)
	dbCur = dbCon.cursor()
	try:
		print('Starting downloads')
		iterNum = 0
		query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs'
		for imgId, license, artist, credit, restrictions, url in dbCur.execute(query):
			if imgId in imgIdsDone:
				continue
			if interrupted:
				print('Exiting loop')
				break
			# Check for problematic attributes (unrecognised license, suspicious or
			# oversized artist/credit strings, or any legal restrictions)
			if license is None or LICENSE_REGEX.fullmatch(license) is None:
				continue
			if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None:
				continue
			if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None:
				continue
			if restrictions is not None and restrictions != '':
				continue
			# Download image
			iterNum += 1
			print(f'Iteration {iterNum}: Downloading for image ID {imgId}')
			urlParts = urllib.parse.urlparse(url)
			extension = os.path.splitext(urlParts.path)[1]
			if len(extension) <= 1:
				print(f'WARNING: No filename extension found in URL {url}')
				continue
			outFile = os.path.join(outDir, f'{imgId}{extension}')
			headers = {
				'user-agent': USER_AGENT,
				'accept-encoding': 'gzip',
			}
			try:
				response = requests.get(url, headers=headers)
				# Don't save an HTTP error page as if it were the image
				response.raise_for_status()
				with open(outFile, 'wb') as file:
					file.write(response.content)
				time.sleep(timeout)
			except Exception as e:
				print(f'Error while downloading to {outFile}: {e}')
				# Remove any partially-written file, so a re-run won't skip this image
				if os.path.exists(outFile):
					os.remove(outFile)
				return
	finally:
		# Ensure the (read-only) connection is released even on the error-return path
		print('Closing database')
		dbCon.close()
+
if __name__ == '__main__':
	import argparse
	# No arguments; argparse only provides a --help message built from the module docstring
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
new file mode 100755
index 0000000..b3fde52
--- /dev/null
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -0,0 +1,126 @@
+#!/usr/bin/python3
+
+"""
+Reads through the wiki dump, attempts to parse short-descriptions,
+and adds them to a database
+"""
+
+# In testing, this script took over 10 hours to run, and generated about 5GB
+
+import sys, os, re
+import bz2
+import html, mwxml, mwparserfromhell
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
+DB_FILE = 'desc_data.db'
+
+DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
+EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
+ # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
+CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
def convertTemplateReplace(match):
	"""Used in regex-substitution with CONVERT_TEMPLATE_REGEX.

	Renders a {{convert|...}} template match as plain text, eg:
	'5 km', or '5 to 10 km' when a range was given.
	"""
	quantity, joiner, quantity2, unit = match.group(1, 2, 3, 4)
	if joiner is None:
		return f'{quantity} {unit}'
	return f'{quantity} {joiner} {quantity2} {unit}'
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
def genData(dumpFile: str, dbFile: str) -> None:
	"""Reads the bz2 enwiki dump, and creates dbFile with three tables:
	'pages' (id/title), 'redirects' (id/target title), and 'descs'
	(id/short description parsed from the page's lead section).
	Raises if dbFile already exists.
	"""
	print('Creating database')
	if os.path.exists(dbFile):
		raise Exception(f'ERROR: Existing {dbFile}')
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()
	dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)')
	dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)')
	dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
	dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
	dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
	#
	print('Iterating through dump file')
	with bz2.open(dumpFile, mode='rt') as file:
		for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
			if pageNum % 1e4 == 0:
				print(f'At page {pageNum}')
			# Parse page (namespace 0 holds the actual articles)
			if page.namespace == 0:
				try:
					dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
				except sqlite3.IntegrityError as e:
					# Accounts for certain pages that have the same title
					print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr)
					continue
				if page.redirect is not None:
					dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect)))
				else:
					# Only the first (latest) revision's text is used
					revision = next(page)
					desc = parseDesc(revision.text)
					if desc is not None:
						dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
	#
	print('Closing database')
	dbCon.commit()
	dbCon.close()
def parseDesc(text: str) -> str | None:
	"""Tries to extract a short description from a page's wikitext, returning
	None on failure (eg: apparent disambiguation pages, no matching lines).
	"""
	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
	# and then accumulate lines until a blank one.
	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
	# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
	lines: list[str] = []
	openBraceCount = 0    # Unbalanced '{' count while skipping template/table constructs
	openBracketCount = 0  # Unbalanced '[' count while skipping link constructs
	inComment = False     # True while inside a multi-line <!-- --> comment
	skip = False          # Set when the current line is part of a skipped construct
	for line in text.splitlines():
		line = line.strip()
		if not lines:
			if line:
				if openBraceCount > 0 or line[0] == '{':
					openBraceCount += line.count('{')
					openBraceCount -= line.count('}')
					skip = True
				if openBracketCount > 0 or line[0] == '[':
					openBracketCount += line.count('[')
					openBracketCount -= line.count(']')
					skip = True
				if inComment or line.find('<!--') != -1:
					if line.find('-->') != -1:
						if inComment:
							inComment = False
						skip = True
					else:
						inComment = True
						skip = True
				if skip:
					skip = False
					continue
				if line[-1] == ':': # Seems to help avoid disambiguation pages
					return None
				if DESC_LINE_REGEX.match(line) is not None:
					lines.append(line)
		else:
			# Already accumulating: a blank line ends the description
			if not line:
				return removeMarkup(' '.join(lines))
			lines.append(line)
	if lines:
		return removeMarkup(' '.join(lines))
	return None
def removeMarkup(content: str) -> str:
	"""Strips embedded HTML, {{convert}} templates, wikitext markup,
	parenthesised asides, and leftover brace constructs from a description.
	"""
	content = EMBEDDED_HTML_REGEX.sub('', content)
	content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
	content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
	content = PARENS_GROUP_REGEX.sub('', content)
	content = LEFTOVER_BRACE_REGEX.sub('', content)
	return content
def convertTitle(title: str) -> str:
	"""Normalises a page title: decodes HTML entities, and maps underscores to spaces."""
	unescaped = html.unescape(title)
	return unescaped.replace('_', ' ')
+
if __name__ == '__main__':
	import argparse
	# No arguments; argparse only provides a --help message built from the module docstring
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	genData(DUMP_FILE, DB_FILE)
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
new file mode 100755
index 0000000..5778680
--- /dev/null
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python3
+
+"""
+Adds data from the wiki dump index-file into a database
+"""
+import sys, os, re
+import bz2
+import sqlite3
+
+INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
+DB_FILE = 'dump_index.db'
+
def genData(indexFile: str, dbFile: str) -> None:
	"""Reads the bz2 index file (lines of the form 'offset:pageId:title'),
	and creates dbFile with an 'offsets' table mapping each title to its
	page ID, its chunk's start offset, and the next chunk's offset
	(-1 for the final chunk). Raises if dbFile already exists.
	"""
	if os.path.exists(dbFile):
		raise Exception(f'ERROR: Existing {dbFile}')
	print('Creating database')
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()
	dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
	print('Iterating through index file')
	# The title itself may contain colons, hence the greedy final group
	lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
	lastOffset = 0
	lineNum = 0
	# Entries of the current chunk, buffered until the next chunk's offset is known
	entriesToAdd: list[tuple[str, str]] = []
	with bz2.open(indexFile, mode='rt') as file:
		for line in file:
			lineNum += 1
			if lineNum % 1e5 == 0:
				print(f'At line {lineNum}')
			#
			match = lineRegex.fullmatch(line.rstrip())
			assert match is not None
			offsetStr, pageId, title = match.group(1,2,3)
			offset = int(offsetStr)
			if offset > lastOffset:
				# Reached a new chunk: flush the buffered entries, using 'offset' as their chunk-end
				for t, p in entriesToAdd:
					try:
						dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset))
					except sqlite3.IntegrityError as e:
						# Accounts for certain entries in the file that have the same title
						print(f'Failed on title "{t}": {e}', file=sys.stderr)
				entriesToAdd = []
				lastOffset = offset
			entriesToAdd.append((title, pageId))
	# Flush the final chunk (next_offset of -1 means 'extends to end of dump')
	for title, pageId in entriesToAdd:
		try:
			dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
		except sqlite3.IntegrityError as e:
			# Fixed: previously printed 't' here, an inner-loop variable that may
			# be unbound (NameError) or hold a stale title from an earlier chunk
			print(f'Failed on title "{title}": {e}', file=sys.stderr)
	print('Closing database')
	dbCon.commit()
	dbCon.close()
+
if __name__ == '__main__':
	import argparse
	# No arguments; argparse only provides a --help message built from the module docstring
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	genData(INDEX_FILE, DB_FILE)
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
new file mode 100755
index 0000000..29ae7b6
--- /dev/null
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -0,0 +1,203 @@
+#!/usr/bin/python3
+
+"""
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
+
+import re
+import os, bz2, html, urllib.parse
+import sqlite3
+
+DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+INDEX_DB = 'dump_index.db'
+IMG_DB = 'img_data.db' # The database to create
+DB_FILE = os.path.join('..', 'data.db')
+#
+ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
+IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
+BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
+IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
+CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+
def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
	"""For each page ID, locates its chunk in the multistream dump via indexDb,
	parses the page's infobox image name, and records it in imgDb's
	'page_imgs' table (NULL img_name means 'none found').

	Mutates pageIds: already-processed IDs found in imgDb are removed from it.
	"""
	print('Opening databases')
	indexDbCon = sqlite3.connect(indexDb)
	indexDbCur = indexDbCon.cursor()
	imgDbCon = sqlite3.connect(imgDb)
	imgDbCur = imgDbCon.cursor()
	print('Checking tables')
	if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
		# Create tables if not present
		imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
		imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
	else:
		# Check for already-processed page IDs
		numSkipped = 0
		for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
			if pid in pageIds:
				pageIds.remove(pid)
				numSkipped += 1
			else:
				print(f'Found already-processed page ID {pid} which was not in input set')
		print(f'Will skip {numSkipped} already-processed page IDs')
	#
	print('Getting dump-file offsets')
	offsetToPageids: dict[int, list[int]] = {}  # Maps chunk-start offsets to page IDs within that chunk
	offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
	iterNum = 0
	for pageId in pageIds:
		iterNum += 1
		if iterNum % 1e4 == 0:
			print(f'At iteration {iterNum}')
		#
		query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
		row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
		if row is None:
			print(f'WARNING: Page ID {pageId} not found')
			continue
		chunkOffset, endOffset = row
		offsetToEnd[chunkOffset] = endOffset
		if chunkOffset not in offsetToPageids:
			offsetToPageids[chunkOffset] = []
		offsetToPageids[chunkOffset].append(pageId)
	print(f'Found {len(offsetToEnd)} chunks to check')
	#
	print('Iterating through chunks in dump file')
	with open(dumpFile, mode='rb') as file:
		iterNum = 0
		for pageOffset, endOffset in offsetToEnd.items():
			iterNum += 1
			if iterNum % 100 == 0:
				print(f'At iteration {iterNum}')
			#
			chunkPageIds = offsetToPageids[pageOffset]
			# Jump to chunk (each chunk is an independent bz2 stream; -1 end means 'to EOF')
			file.seek(pageOffset)
			compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
			data = bz2.BZ2Decompressor().decompress(compressedData).decode()
			# Look in chunk for pages
			lines = data.splitlines()
			lineIdx = 0
			while lineIdx < len(lines):
				# Look for <page>
				if lines[lineIdx].lstrip() != '<page>':
					lineIdx += 1
					continue
				# Check page id (assumes the <id> line is 3 lines after <page> — TODO confirm
				# this holds for all dump pages)
				lineIdx += 3
				idLine = lines[lineIdx].lstrip()
				match = ID_LINE_REGEX.fullmatch(idLine)
				if match is None or int(match.group(1)) not in chunkPageIds:
					lineIdx += 1
					continue
				pageId = int(match.group(1))
				lineIdx += 1
				# Look for <text> in <page>
				foundText = False
				while lineIdx < len(lines):
					if not lines[lineIdx].lstrip().startswith('<text '):
						lineIdx += 1
						continue
					foundText = True
					# Get text content (starts after the '>' that closes the <text ...> tag)
					content: list[str] = []
					line = lines[lineIdx]
					content.append(line[line.find('>') + 1:])
					lineIdx += 1
					foundTextEnd = False
					while lineIdx < len(lines):
						line = lines[lineIdx]
						if not line.endswith('</text>'):
							content.append(line)
							lineIdx += 1
							continue
						foundTextEnd = True
						content.append(line[:line.rfind('</text>')])
						# Look for image-filename
						imageName = getImageName(content)
						imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName))
						break
					if not foundTextEnd:
						print(f'WARNING: Did not find </text> for page id {pageId}')
					break
				if not foundText:
					print(f'WARNING: Did not find <text> for page id {pageId}')
	#
	print('Closing databases')
	indexDbCon.close()
	imgDbCon.commit()
	imgDbCon.close()
def getImageName(content: list[str]) -> str | None:
	""" Given an array of text-content lines, tries to return an infobox image name, or None """
	# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
	for line in content:
		match = IMG_LINE_REGEX.match(line)
		if match is not None:
			imageName = match.group(1).strip()
			if imageName == '':
				return None
			imageName = html.unescape(imageName)
			# Account for {{... (only {{css image crop|...}} is handled)
			if imageName.startswith('{'):
				match = CSS_IMG_CROP_REGEX.match(imageName)
				if match is None:
					return None
				imageName = match.group(1)
			# Account for [[File:...|...]]
			if imageName.startswith('['):
				match = BRACKET_IMG_REGEX.match(imageName)
				if match is None:
					return None
				imageName = match.group(1)
			# Account for <!--
			if imageName.find('<!--') != -1:
				return None
			# Remove an initial 'File:'
			if imageName.startswith('File:'):
				imageName = imageName[5:]
			# Remove an initial 'Image:'
			if imageName.startswith('Image:'):
				imageName = imageName[6:]
			# Check for extension (also truncates trailing parameters after the filename)
			match = IMG_NAME_REGEX.match(imageName)
			if match is not None:
				imageName = match.group(0)
				imageName = urllib.parse.unquote(imageName)
				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
				imageName = imageName.replace('_', ' ')
				return imageName
			# Exclude lines like: | image = &lt;imagemap&gt;
			return None
	return None
+
def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
	"""Reads event titles from dbFile's 'events' table, and resolves each to a
	page ID via indexDb's 'offsets' table. Titles with no matching offsets
	entry are omitted from the returned set.
	"""
	print('Getting input page-ids')
	pageTitles: set[str] = set()
	pageIds: set[int] = set()
	print('Reading event titles')
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()
	# Renamed loop variable: the query returns titles, not page IDs
	for (title,) in dbCur.execute('SELECT title from events'):
		pageTitles.add(title)
	dbCon.close()
	print('Getting event page IDs')
	dbCon = sqlite3.connect(indexDb)
	dbCur = dbCon.cursor()
	for pageTitle in pageTitles:
		row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (pageTitle,)).fetchone()
		if row:
			pageIds.add(row[0])
	dbCon.close()
	print(f'Found {len(pageIds)} out of {len(pageTitles)}')
	return pageIds
if __name__ == '__main__':
	import argparse
	# No arguments; argparse only provides a --help message built from the module docstring
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB)
	genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py
new file mode 100755
index 0000000..b37a107
--- /dev/null
+++ b/backend/hist_data/enwiki/gen_pageview_data.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+"""
+Reads through wikimedia files containing pageview counts,
+computes average counts, and adds them to a database
+"""
+
+# Took about 15min per file (each had about 180e6 lines)
+
+import sys, os, glob, math, re
+from collections import defaultdict
+import bz2, sqlite3
+
+PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
+DUMP_INDEX_DB = 'dump_index.db'
+DB_FILE = 'pageview_data.db'
+
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
	"""Sums per-title enwiki article view counts across the given pageview
	files, and writes monthly averages into dbFile's 'views' table,
	resolving page IDs via dumpIndexDb's 'offsets' table.
	Exits if dbFile already exists, or no pageview files were given.
	"""
	# Each pageview file has lines that seem to hold these space-separated fields:
	# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
	# platform (eg: mobile-web), monthly view count,
	# hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
	if os.path.exists(dbFile):
		print('ERROR: Database already exists')
		sys.exit(1)
	if not pageviewFiles:
		# Guards against a division-by-zero when averaging below
		print('ERROR: No pageview files given')
		sys.exit(1)
	#
	namespaceRegex = re.compile(r'[a-zA-Z]+:') # Matches non-article titles like 'Talk:X'
	titleToViews: dict[str, int] = defaultdict(int)
	linePrefix = b'en.wikipedia '
	for filename in pageviewFiles:
		print(f'Reading from {filename}') # Fixed: filename was not interpolated
		with bz2.open(filename, 'rb') as file:
			for lineNum, line in enumerate(file, 1):
				if lineNum % 1e6 == 0:
					print(f'At line {lineNum}')
				if not line.startswith(linePrefix):
					continue
				# Get second and second-last fields
				line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
				title = line[:line.find(b' ')].decode('utf-8')
				viewCount = int(line[line.rfind(b' ')+1:])
				if namespaceRegex.match(title) is not None:
					continue
				# Update map
				titleToViews[title] += viewCount
	print(f'Found {len(titleToViews)} titles')
	#
	print('Writing to db')
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()
	idbCon = sqlite3.connect(dumpIndexDb)
	idbCur = idbCon.cursor()
	# 'id INT UNIQUE' matches the documented schema (ids are unique in 'offsets')
	dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT UNIQUE, views INT)')
	for title, views in titleToViews.items():
		row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
		if row is not None:
			wikiId = int(row[0])
			dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles))))
	dbCon.commit()
	dbCon.close()
	idbCon.close()
+
if __name__ == '__main__':
	import argparse
	# No arguments; argparse only provides a --help message built from the module docstring
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	args = parser.parse_args()
	#
	genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)