#!/usr/bin/python3

"""
Reads through the wiki dump, attempts to parse short-descriptions,
and adds them to a database
"""

# In testing, this script took over 10 hours to run, and generated about 5GB

import argparse
import sys
import os
import re
import bz2
import sqlite3
import html

import mwxml
import mwparserfromhell

DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'

DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
	# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')

def convertTemplateReplace(match):
	""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
	if match.group(2) is None:
		return f'{match.group(1)} {match.group(4)}'
	else:
		return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'

# ========== For data generation ==========

def genData(dumpFile: str, dbFile: str) -> None:
	print('Creating database')
	if os.path.exists(dbFile):
		raise Exception(f'ERROR: Existing {dbFile}')
	dbCon = sqlite3.connect(dbFile)
	dbCur = dbCon.cursor()
	dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)')
	dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)')
	dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
	dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
	dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')

	print('Iterating through dump file')
	with bz2.open(dumpFile, mode='rt') as file:
		for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
			if pageNum % 1e4 == 0:
				print(f'At page {pageNum}')

			if page.namespace == 0:
				try:
					dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
				except sqlite3.IntegrityError as e:
					# Accounts for certain pages that have the same title
					print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr)
					continue
				if page.redirect is not None:
					dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect)))
				else:
					revision = next(page)
					desc = parseDesc(revision.text)
					if desc is not None:
						dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))

	print('Closing database')
	dbCon.commit()
	dbCon.close()

def parseDesc(text: str) -> str | None:
	"""
	Looks for a description in wikitext content.

	Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
	and then accumulates lines until a blank one.

	Some cases not accounted for include:
		disambiguation pages, abstracts with sentences split-across-lines, 
		nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, 
	"""
	lines: list[str] = []
	openBraceCount = 0
	openBracketCount = 0
	inComment = False
	skip = False
	for line in text.splitlines():
		line = line.strip()
		if not lines:
			if line:
				if openBraceCount > 0 or line[0] == '{':
					openBraceCount += line.count('{')
					openBraceCount -= line.count('}')
					skip = True
				if openBracketCount > 0 or line[0] == '[':
					openBracketCount += line.count('[')
					openBracketCount -= line.count(']')
					skip = True
				if inComment or line.find('<!--') != -1:
					if line.find('-->') != -1:
						if inComment:
							inComment = False
							skip = True
					else:
						inComment = True
						skip = True
				if skip:
					skip = False
					continue
				if line[-1] == ':': # Seems to help avoid disambiguation pages
					return None
				if DESC_LINE_REGEX.match(line) is not None:
					lines.append(line)
		else:
			if not line:
				return removeMarkup(' '.join(lines))
			lines.append(line)
	if lines:
		return removeMarkup(' '.join(lines))
	return None

def removeMarkup(content: str) -> str:
	content = EMBEDDED_HTML_REGEX.sub('', content)
	content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
	content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
	content = PARENS_GROUP_REGEX.sub('', content)
	content = LEFTOVER_BRACE_REGEX.sub('', content)
	return content

def convertTitle(title: str) -> str:
	return html.unescape(title).replace('_', ' ')

# ========== Main block ==========

if __name__ == '__main__':
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()

	genData(DUMP_FILE, DB_FILE)