aboutsummaryrefslogtreecommitdiff
path: root/backend/data/enwiki
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/enwiki')
-rw-r--r--backend/data/enwiki/README.md2
-rwxr-xr-xbackend/data/enwiki/downloadImgLicenseInfo.py60
-rwxr-xr-xbackend/data/enwiki/downloadImgs.py (renamed from backend/data/enwiki/downloadEnwikiImgs.py)37
-rwxr-xr-xbackend/data/enwiki/genDescData.py43
-rwxr-xr-xbackend/data/enwiki/genDumpIndexDb.py26
-rwxr-xr-xbackend/data/enwiki/genImgData.py72
-rwxr-xr-xbackend/data/enwiki/lookupPage.py22
7 files changed, 148 insertions, 114 deletions
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index 1c16a2e..90d16c7 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -42,7 +42,7 @@ This directory holds files obtained from/using [English Wikipedia](https://en.wi
`img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
- `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br>
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
-- downloadEnwikiImgs.py <br>
+- downloadImgs.py <br>
Used to download image files into imgs/.
# Other Files
diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py
index 097304b..399922e 100755
--- a/backend/data/enwiki/downloadImgLicenseInfo.py
+++ b/backend/data/enwiki/downloadImgLicenseInfo.py
@@ -5,41 +5,48 @@ import sqlite3, urllib.parse, html
import requests
import time, signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads image names from a file, and uses enwiki's API to obtain\n"
-usageInfo += "licensing information for them, adding the info to a sqlite db.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "at names added to the db to decide what to skip.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-imgDb = "imgData.db" # About 130k image names
+imgDb = "imgData.db"
apiUrl = "https://en.wikipedia.org/w/api.php"
+userAgent = "terryt.dev (terry06890@gmail.com)"
batchSz = 50 # Max 50
tagRegex = re.compile(r"<[^<]+>")
whitespaceRegex = re.compile(r"\s+")
-# Open db
+print("Opening database")
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
dbCur2 = dbCon.cursor()
-# Create table if it doesn't exist
+print("Checking for table")
if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
dbCur.execute("CREATE TABLE imgs(" \
"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
-# Get image names
+
print("Reading image names")
imgNames = set()
for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
imgNames.add(imgName)
-print(f"Found {len(imgNames)} images")
+print(f"Found {len(imgNames)}")
+
+print("Checking for already-processed images")
oldSz = len(imgNames)
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
imgNames.discard(imgName)
-print(f"Skipping {oldSz - len(imgNames)} already-done images")
+print(f"Found {oldSz - len(imgNames)}")
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -48,7 +55,8 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Iterate through image names, making API requests
+
+print("Iterating through image names")
imgNames = list(imgNames)
iterNum = 0
for i in range(0, len(imgNames), batchSz):
@@ -63,7 +71,7 @@ for i in range(0, len(imgNames), batchSz):
imgBatch = ["File:" + x for x in imgBatch]
# Make request
headers = {
- "user-agent": "terryt.dev (terry06890@gmail.com)",
+ "user-agent": userAgent,
"accept-encoding": "gzip",
}
params = {
@@ -80,16 +88,16 @@ for i in range(0, len(imgNames), batchSz):
response = requests.get(apiUrl, params=params, headers=headers)
responseObj = response.json()
except Exception as e:
- print(f"Error while downloading info: {e}", file=sys.stderr)
- print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
+ print(f"ERROR: Exception while downloading info: {e}")
+ print(f"\tImage batch: " + "|".join(imgBatch))
continue
# Parse response-object
if "query" not in responseObj or "pages" not in responseObj["query"]:
- print("WARNING: Response object for doesn't have page data", file=sys.stderr)
- print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
+		print("WARNING: Response object doesn't have page data")
+ print("\tImage batch: " + "|".join(imgBatch))
if "error" in responseObj:
errorCode = responseObj["error"]["code"]
- print(f"\tError code: {errorCode}", file=sys.stderr)
+ print(f"\tError code: {errorCode}")
if errorCode == "maxlag":
time.sleep(5)
continue
@@ -111,10 +119,10 @@ for i in range(0, len(imgNames), batchSz):
title = normalisedToInput[title]
title = title[5:] # Remove 'File:'
if title not in imgNames:
- print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr)
+ print(f"WARNING: Got title \"{title}\" not in image-name list")
continue
if "imageinfo" not in page:
- print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr)
+ print(f"WARNING: No imageinfo section for page \"{title}\"")
continue
metadata = page["imageinfo"][0]["extmetadata"]
url = page["imageinfo"][0]["url"]
@@ -122,7 +130,7 @@ for i in range(0, len(imgNames), batchSz):
artist = metadata['Artist']['value'] if 'Artist' in metadata else None
credit = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
- # Remove newlines
+ # Remove markup
if artist != None:
artist = tagRegex.sub(" ", artist)
artist = whitespaceRegex.sub(" ", artist)
@@ -134,7 +142,9 @@ for i in range(0, len(imgNames), batchSz):
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
# Add to db
- dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", (title, license, artist, credit, restrictions, url))
-# Close db
+ dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+ (title, license, artist, credit, restrictions, url))
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadImgs.py
index 2929a0d..8fb605f 100755
--- a/backend/data/enwiki/downloadEnwikiImgs.py
+++ b/backend/data/enwiki/downloadImgs.py
@@ -5,13 +5,16 @@ import sqlite3
import urllib.parse, requests
import time, signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an sqlite db,\n"
-usageInfo += "into a specified directory.'\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "in the output directory do decide what to skip.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -19,18 +22,18 @@ if len(sys.argv) > 1:
imgDb = "imgData.db" # About 130k image names
outDir = "imgs"
licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+# In testing, this downloaded about 100k images, over several days
-# Create output directory if not present
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Get existing image names
-print("Gettings already-downloaded images")
+print("Checking for already-downloaded images")
fileList = os.listdir(outDir)
pageIdsDone = set()
for filename in fileList:
(basename, extension) = os.path.splitext(filename)
pageIdsDone.add(int(basename))
-print(f"Found {len(pageIdsDone)} already-downloaded images")
+print(f"Found {len(pageIdsDone)}")
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -39,10 +42,10 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Open db
+
+print("Opening database")
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
-# Start downloads
print("Starting downloads")
iterNum = 0
query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
@@ -68,7 +71,7 @@ for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query)
urlParts = urllib.parse.urlparse(url)
extension = os.path.splitext(urlParts.path)[1]
if len(extension) <= 1:
- print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
+ print(f"WARNING: No filename extension found in URL {url}")
sys.exit(1)
outFile = f"{outDir}/{pageId}{extension}"
headers = {
@@ -81,8 +84,8 @@ for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query)
file.write(response.content)
time.sleep(1)
# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
- # It's unclear how to properly check for cache misses, so just do about <=1 per sec
+ # It's unclear how to properly check for cache misses, so this just aims for 1 per sec
except Exception as e:
- print(f"Error while downloading to {outFile}: {e}", file=sys.stderr)
-# Close db
+ print(f"Error while downloading to {outFile}: {e}")
+print("Closing database")
dbCon.close()
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
index 032dbed..b0ca272 100755
--- a/backend/data/enwiki/genDescData.py
+++ b/backend/data/enwiki/genDescData.py
@@ -5,31 +5,36 @@ import bz2
import html, mwxml, mwparserfromhell
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n"
-usageInfo += "and short-description info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads through the wiki dump, and attempts to
+parse short-descriptions, and add them to a database.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages
enwikiDb = "descData.db"
+# In testing, this script took over 10 hours to run, and generated about 5GB
-# Some regexps and functions for parsing wikitext
descLineRegex = re.compile("^ *[A-Z'\"]")
embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
-parensGrpRegex = re.compile(r" \([^()]*\)")
-leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
def convertTemplateReplace(match):
if match.group(2) == None:
return f"{match.group(1)} {match.group(4)}"
else:
return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}"
+parensGroupRegex = re.compile(r" \([^()]*\)")
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
+
def parseDesc(text):
- # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank
- # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines,
+ # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+ # and then accumulate lines until a blank one.
+ # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
lines = []
openBraceCount = 0
@@ -74,18 +79,15 @@ def removeMarkup(content):
content = embeddedHtmlRegex.sub("", content)
content = convertTemplateRegex.sub(convertTemplateReplace, content)
content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
- content = parensGrpRegex.sub("", content)
+ content = parensGroupRegex.sub("", content)
content = leftoverBraceRegex.sub("", content)
return content
-# Other helper functions
def convertTitle(title):
return html.unescape(title).replace("_", " ")
-# Check for existing db
+print("Creating database")
if os.path.exists(enwikiDb):
- print(f"ERROR: Existing {enwikiDb}", file=sys.stderr)
- sys.exit(1)
-# Create db
+ raise Exception(f"ERROR: Existing {enwikiDb}")
dbCon = sqlite3.connect(enwikiDb)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
@@ -93,8 +95,8 @@ dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)")
dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
-# Read through dump file
-print("Reading dump file")
+
+print("Iterating through dump file")
with bz2.open(dumpFile, mode='rt') as file:
dump = mwxml.Dump.from_file(file)
pageNum = 0
@@ -102,13 +104,15 @@ with bz2.open(dumpFile, mode='rt') as file:
pageNum += 1
if pageNum % 1e4 == 0:
print(f"At page {pageNum}")
+ if pageNum > 3e4:
+ break
# Parse page
if page.namespace == 0:
try:
dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
except sqlite3.IntegrityError as e:
# Accounts for certain pages that have the same title
- print(f"Failed to add page with title \"{page.title}\": {e}")
+ print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr)
continue
if page.redirect != None:
dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
@@ -117,6 +121,7 @@ with bz2.open(dumpFile, mode='rt') as file:
desc = parseDesc(revision.text)
if desc != None:
dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py
index ee3e813..3955885 100755
--- a/backend/data/enwiki/genDumpIndexDb.py
+++ b/backend/data/enwiki/genDumpIndexDb.py
@@ -4,25 +4,26 @@ import sys, os, re
import bz2
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a Wikimedia enwiki dump index file,\n"
-usageInfo += "and stores it's offset and title data to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Adds data from the wiki dump index-file into a database.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines
+indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
indexDb = "dumpIndex.db"
-# Check for existing db
if os.path.exists(indexDb):
- print(f"ERROR: Existing {indexDb}", file=sys.stderr)
- sys.exit(1)
-# Create db
+ raise Exception(f"ERROR: Existing {indexDb}")
+print("Creating database")
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
-# Reading index file
+
+print("Iterating through index file")
lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
lastOffset = 0
lineNum = 0
@@ -42,7 +43,7 @@ with bz2.open(indexFile, mode='rt') as file:
dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
except sqlite3.IntegrityError as e:
# Accounts for certain entries in the file that have the same title
- print(f"Failed on title \"{t}\": {e}")
+ print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
entriesToAdd = []
lastOffset = offset
entriesToAdd.append([title, pageId])
@@ -50,7 +51,8 @@ for (title, pageId) in entriesToAdd:
try:
dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
except sqlite3.IntegrityError as e:
- print(f"Failed on title \"{t}\": {e}")
-# Close db
+ print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py
index 9bd28f4..dedfe14 100755
--- a/backend/data/enwiki/genImgData.py
+++ b/backend/data/enwiki/genImgData.py
@@ -4,9 +4,15 @@ import sys, re
import bz2, html, urllib.parse
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
-usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -21,58 +27,64 @@ def getInputPageIds():
return pageIds
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
-imgDb = "imgData.db" # Output db
+imgDb = "imgData.db" # The database to create
idLineRegex = re.compile(r"<id>(.*)</id>")
imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+# In testing, got about 360k image names
-# Open dbs
+print("Getting input page-ids")
+pageIds = getInputPageIds()
+print(f"Found {len(pageIds)}")
+
+print("Opening databases")
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
-# Create image-db table
-pidsDone = set()
+print("Checking tables")
if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ # Create tables if not present
imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
else:
+ # Check for already-processed page IDs
+ numSkipped = 0
for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
- pidsDone.add(pid)
- print(f"Will skip {len(pidsDone)} already-processed page-ids")
-# Get input pageIds
-print("Getting input page-ids", file=sys.stderr)
-pageIds = getInputPageIds()
-for pid in pidsDone:
- pageIds.remove(pid)
-print(f"Found {len(pageIds)} page-ids to process")
-# Get page-id dump-file offsets
-print("Getting dump-file offsets", file=sys.stderr)
+ if pid in pageIds:
+ pageIds.remove(pid)
+ numSkipped += 1
+ else:
+ print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
+ print(f"Will skip {numSkipped} already-processed page IDs")
+
+print("Getting dump-file offsets")
offsetToPageids = {}
-offsetToEnd = {}
+offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
iterNum = 0
for pageId in pageIds:
iterNum += 1
if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}", file=sys.stderr)
+ print(f"At iteration {iterNum}")
#
query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
row = indexDbCur.execute(query, (pageId,)).fetchone()
if row == None:
- print(f"WARNING: Page id {pageId} not found", file=sys.stderr)
+ print(f"WARNING: Page ID {pageId} not found")
continue
(chunkOffset, endOffset) = row
offsetToEnd[chunkOffset] = endOffset
if chunkOffset not in offsetToPageids:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
-print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr)
-# Look through dump file, jumping to chunks containing relevant pages
-print("Reading through dump file", file=sys.stderr)
+print(f"Found {len(offsetToEnd)} chunks to check")
+
+print("Iterating through chunks in dump file")
def getImageName(content):
- """ Given an array of text-content lines, returns an image-filename, or None """
+	" Given an array of text-content lines, tries to return an infobox image name, or None "
+ # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
for line in content:
match = imageLineRegex.match(line)
if match != None:
@@ -109,16 +121,15 @@ def getImageName(content):
imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
imageName = imageName.replace("_", " ")
return imageName
- # Skip lines like: | image = &lt;imagemap&gt;
+ # Exclude lines like: | image = &lt;imagemap&gt;
return None
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
return None
with open(dumpFile, mode='rb') as file:
iterNum = 0
for (pageOffset, endOffset) in offsetToEnd.items():
iterNum += 1
if iterNum % 100 == 0:
- print(f"At iteration {iterNum}", file=sys.stderr)
+ print(f"At iteration {iterNum}")
#
pageIds = offsetToPageids[pageOffset]
# Jump to chunk
@@ -168,11 +179,12 @@ with open(dumpFile, mode='rb') as file:
imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
break
if not foundTextEnd:
- print(f"Did not find </text> for page id {pageId}", file=sys.stderr)
+ print(f"WARNING: Did not find </text> for page id {pageId}")
break
if not foundText:
- print(f"Did not find <text> for page id {pageId}", file=sys.stderr)
-# Close dbs
+ print(f"WARNING: Did not find <text> for page id {pageId}")
+
+print("Closing databases")
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()
diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py
index 76f2f95..1a90851 100755
--- a/backend/data/enwiki/lookupPage.py
+++ b/backend/data/enwiki/lookupPage.py
@@ -4,9 +4,12 @@ import sys, re
import bz2
import sqlite3
-usageInfo = f"usage: {sys.argv[0]} title1\n"
-usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n"
-usageInfo += "using a dump index db, and prints the corresponding <page>.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]} title1
+
+Looks up a page with title title1 in the wiki dump, using
+the dump-index db, and prints the corresponding <page>.
+"""
if len(sys.argv) != 2:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -15,20 +18,19 @@ dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
pageTitle = sys.argv[1].replace("_", " ")
-# Searching index file
-print("Lookup offset in index db")
+print("Looking up offset in index db")
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
row = dbCur.execute(query, (pageTitle,)).fetchone()
if row == None:
print("Title not found")
- sys.exit(1)
-(_, pageOffset, endOffset) = row
+ sys.exit(0)
+_, pageOffset, endOffset = row
dbCon.close()
print(f"Found chunk at offset {pageOffset}")
-# Read dump file
-print("Reading dump file")
+
+print("Reading from wiki dump")
content = []
with open(dumpFile, mode='rb') as file:
# Get uncompressed chunk
@@ -61,6 +63,6 @@ with open(dumpFile, mode='rb') as file:
if line.lstrip() == "</page>":
break
lineIdx += 1
-# Print content
+
print("Content: ")
print("\n".join(content))