aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/enwiki
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tol_data/enwiki')
-rwxr-xr-xbackend/tol_data/enwiki/download_img_license_info.py30
-rwxr-xr-xbackend/tol_data/enwiki/download_imgs.py24
-rwxr-xr-xbackend/tol_data/enwiki/gen_desc_data.py45
-rwxr-xr-xbackend/tol_data/enwiki/gen_dump_index_db.py16
-rwxr-xr-xbackend/tol_data/enwiki/gen_img_data.py36
-rwxr-xr-xbackend/tol_data/enwiki/gen_pageview_data.py28
-rwxr-xr-xbackend/tol_data/enwiki/lookup_page.py9
7 files changed, 127 insertions, 61 deletions
diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py
index 17e15b4..6efc7a4 100755
--- a/backend/tol_data/enwiki/download_img_license_info.py
+++ b/backend/tol_data/enwiki/download_img_license_info.py
@@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""
+import argparse
import re
-import sqlite3, urllib.parse, html
+import sqlite3
+
import requests
-import time, signal
+import urllib.parse
+import html
+
+import time
+import signal
IMG_DB = 'img_data.db'
-#
+
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
BATCH_SZ = 50 # Max 50
@@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None:
if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
dbCur.execute('CREATE TABLE imgs (' \
'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
- #
+
print('Reading image names')
imgNames: set[str] = set()
for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
imgNames.add(imgName)
print(f'Found {len(imgNames)}')
- #
+
print('Checking for already-processed images')
oldSz = len(imgNames)
for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
imgNames.discard(imgName)
print(f'Found {oldSz - len(imgNames)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Iterating through image names')
imgNameList = list(imgNames)
iterNum = 0
@@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None:
if interrupted:
print(f'Exiting loop at iteration {iterNum}')
break
+
# Get batch
imgBatch = imgNameList[i:i+BATCH_SZ]
imgBatch = ['File:' + x for x in imgBatch]
+
# Make request
headers = {
'user-agent': USER_AGENT,
@@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None:
print(f'ERROR: Exception while downloading info: {e}')
print('\tImage batch: ' + '|'.join(imgBatch))
continue
+
# Parse response-object
if 'query' not in responseObj or 'pages' not in responseObj['query']:
print('WARNING: Response object doesn\'t have page data')
@@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None:
artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
# Remove markup
if artist is not None:
artist = TAG_REGEX.sub(' ', artist).strip()
@@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None:
credit = WHITESPACE_REGEX.sub(' ', credit)
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
+
# Add to db
dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
(title, license, artist, credit, restrictions, url))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadInfo(IMG_DB)
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
index c6a1c21..164289d 100755
--- a/backend/tol_data/enwiki/download_imgs.py
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -11,14 +11,20 @@ in the output directory to decide what to skip.
# In testing, this downloaded about 100k images, over several days
-import re, os
+import argparse
+import re
+import os
import sqlite3
-import urllib.parse, requests
-import time, signal
+
+import requests
+import urllib.parse
+
+import time
+import signal
IMG_DB = 'img_data.db' # About 130k image names
OUT_DIR = 'imgs'
-#
+
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
TIMEOUT = 1
@@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
for filename in fileList:
pageIdsDone.add(int(os.path.splitext(filename)[0]))
print(f'Found {len(pageIdsDone)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if interrupted:
print('Exiting loop')
break
+
# Check for problematic attributes
if license is None or LICENSE_REGEX.fullmatch(license) is None:
continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
continue
if restrictions is not None and restrictions != '':
continue
+
# Download image
iterNum += 1
print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
@@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
except Exception as e:
print(f'Error while downloading to {outFile}: {e}')
return
+
print('Closing database')
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py
index b3fde52..44e4d6f 100755
--- a/backend/tol_data/enwiki/gen_desc_data.py
+++ b/backend/tol_data/enwiki/gen_desc_data.py
@@ -7,10 +7,16 @@ and adds them to a database
# In testing, this script took over 10 hours to run, and generated about 5GB
-import sys, os, re
+import argparse
+import sys
+import os
+import re
import bz2
-import html, mwxml, mwparserfromhell
import sqlite3
+import html
+
+import mwxml
+import mwparserfromhell
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'
@@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
def convertTemplateReplace(match):
""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
if match.group(2) is None:
return f'{match.group(1)} {match.group(4)}'
else:
return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'
-PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
-LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+# ========== For data generation ==========
def genData(dumpFile: str, dbFile: str) -> None:
print('Creating database')
@@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None:
dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
- #
+
print('Iterating through dump file')
with bz2.open(dumpFile, mode='rt') as file:
for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
if pageNum % 1e4 == 0:
print(f'At page {pageNum}')
- # Parse page
+
if page.namespace == 0:
try:
dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
@@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None:
desc = parseDesc(revision.text)
if desc is not None:
dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseDesc(text: str) -> str | None:
- # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
- # and then accumulate lines until a blank one.
- # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
- # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
+ """
+ Looks for a description in wikitext content.
+
+ Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+ and then accumulates lines until a blank one.
+
+ Some cases not accounted for include:
+ disambiguation pages, abstracts with sentences split-across-lines,
+	nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell,
+ """
lines: list[str] = []
openBraceCount = 0
openBracketCount = 0
@@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None:
if lines:
return removeMarkup(' '.join(lines))
return None
+
def removeMarkup(content: str) -> str:
content = EMBEDDED_HTML_REGEX.sub('', content)
content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
@@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str:
content = PARENS_GROUP_REGEX.sub('', content)
content = LEFTOVER_BRACE_REGEX.sub('', content)
return content
+
def convertTitle(title: str) -> str:
return html.unescape(title).replace('_', ' ')
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(DUMP_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py
index 5778680..12a8a10 100755
--- a/backend/tol_data/enwiki/gen_dump_index_db.py
+++ b/backend/tol_data/enwiki/gen_dump_index_db.py
@@ -1,9 +1,13 @@
#!/usr/bin/python3
"""
-Adds data from the wiki dump index-file into a database
+Converts data from the wiki-dump index-file into a database
"""
-import sys, os, re
+
+import argparse
+import sys
+import os
+import re
import bz2
import sqlite3
@@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None:
""" Reads the index file and creates the db """
if os.path.exists(dbFile):
raise Exception(f'ERROR: Existing {dbFile}')
+
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
print('Iterating through index file')
lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
lastOffset = 0
@@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None:
lineNum += 1
if lineNum % 1e5 == 0:
print(f'At line {lineNum}')
- #
+
match = lineRegex.fullmatch(line.rstrip())
assert match is not None
offsetStr, pageId, title = match.group(1,2,3)
@@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None:
dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
except sqlite3.IntegrityError as e:
print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(INDEX_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
index 040f223..2c243f3 100755
--- a/backend/tol_data/enwiki/gen_img_data.py
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and
will skip already-processed page IDs.
"""
+import argparse
import re
-import os, bz2, html, urllib.parse
+import os
+import bz2
+import html
+import urllib.parse
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dump_index.db'
IMG_DB = 'img_data.db' # The database to create
DB_FILE = os.path.join('..', 'data.db')
-#
+
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+# ========== For data generation ==========
+
def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
print('Opening databases')
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
+
print('Checking tables')
if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
# Create tables if not present
- imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+ imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)')
+ # 'img_name' values are set to NULL to indicate page IDs where no image was found
imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
else:
# Check for already-processed page IDs
@@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
else:
print(f'Found already-processed page ID {pid} which was not in input set')
print(f'Will skip {numSkipped} already-processed page IDs')
- #
+
print('Getting dump-file offsets')
offsetToPageids: dict[int, list[int]] = {}
offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
@@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
- #
+
query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
if row is None:
@@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
print(f'Found {len(offsetToEnd)} chunks to check')
- #
+
print('Iterating through chunks in dump file')
with open(dumpFile, mode='rb') as file:
iterNum = 0
@@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 100 == 0:
print(f'At iteration {iterNum}')
- #
+
chunkPageIds = offsetToPageids[pageOffset]
# Jump to chunk
file.seek(pageOffset)
@@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
break
if not foundText:
print(f'WARNING: Did not find <text> for page id {pageId}')
- #
+
print('Closing databases')
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()
+
def getImageName(content: list[str]) -> str | None:
""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+ # Note: Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
for line in content:
match = IMG_LINE_REGEX.match(line)
if match is not None:
@@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None:
return None
return None
+# ========== For getting input page IDs ==========
+
def getInputPageIdsFromDb(dbFile: str) -> set[int]:
print('Getting input page-ids')
pageIds: set[int] = set()
@@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]:
for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
pageIds.add(pageId)
dbCon.close()
+
print(f'Found {len(pageIds)}')
return pageIds
+
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
pageIds = getInputPageIdsFromDb(DB_FILE)
genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
index 8aee1cc..95b4a60 100755
--- a/backend/tol_data/enwiki/gen_pageview_data.py
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+ wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+ platform (eg: mobile-web), monthly view count,
+ hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
"""
# Took about 15min per file (each had about 180e6 lines)
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
DUMP_INDEX_DB = 'dump_index.db'
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
- # Each pageview file has lines that seem to hold these space-separated fields:
- # wiki code (eg: en.wikipedia), article title, page ID (may be: null),
- # platform (eg: mobile-web), monthly view count,
- # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
- #
+
namespaceRegex = re.compile(r'[a-zA-Z]+:')
titleToViews: dict[str, int] = defaultdict(int)
linePrefix = b'en.wikipedia '
@@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
print(f'At line {lineNum}')
if not line.startswith(linePrefix):
continue
+
# Get second and second-last fields
line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
title = line[:line.find(b' ')].decode('utf-8')
viewCount = int(line[line.rfind(b' ')+1:])
if namespaceRegex.match(title) is not None:
continue
+
# Update map
title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
idbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
index f744818..c4d0932 100755
--- a/backend/tol_data/enwiki/lookup_page.py
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index
db, and prints the corresponding <page>.
"""
+import argparse
import sys
import bz2
import sqlite3
@@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
_, pageOffset, endOffset = row
dbCon.close()
print(f'Found chunk at offset {pageOffset}')
- #
+
print('Reading from wiki dump')
content: list[str] = []
with open(dumpFile, mode='rb') as file:
@@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
file.seek(pageOffset)
compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+
# Look in chunk for page
lines = data.splitlines()
lineIdx = 0
@@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
if line.lstrip() == '</page>':
break
lineIdx += 1
- #
+
print('Content: ')
print('\n'.join(content))
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('title', help='The title to look up')
args = parser.parse_args()
- #
+
lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' '))