aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-07-11 01:54:08 +1000
committerTerry Truong <terry06890@gmail.com>2022-07-11 01:54:08 +1000
commit5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/tolData/enwiki
parenta8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/tolData/enwiki')
-rw-r--r--backend/tolData/enwiki/README.md52
-rwxr-xr-xbackend/tolData/enwiki/downloadImgLicenseInfo.py150
-rwxr-xr-xbackend/tolData/enwiki/downloadImgs.py91
-rwxr-xr-xbackend/tolData/enwiki/genDescData.py127
-rwxr-xr-xbackend/tolData/enwiki/genDumpIndexDb.py58
-rwxr-xr-xbackend/tolData/enwiki/genImgData.py190
-rwxr-xr-xbackend/tolData/enwiki/lookupPage.py68
7 files changed, 736 insertions, 0 deletions
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md
new file mode 100644
index 0000000..90d16c7
--- /dev/null
+++ b/backend/tolData/enwiki/README.md
@@ -0,0 +1,52 @@
+This directory holds files obtained from/using [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page).
+
+# Downloaded Files
+- enwiki-20220501-pages-articles-multistream.xml.bz2 <br>
+ Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror).
+ Contains text content and metadata for pages in enwiki.
+ Some file content and format information was available from
+ <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>.
+- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br>
+ Obtained like above. Holds lines of the form offset1:pageId1:title1,
+ providing, for each page, an offset into the dump file of a chunk of
+ 100 pages that includes it.
+
+# Generated Dump-Index Files
+- genDumpIndexDb.py <br>
+ Creates an sqlite-database version of the enwiki-dump index file.
+- dumpIndex.db <br>
+ Generated by genDumpIndexDb.py. <br>
+ Tables: <br>
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT`
+
+# Description Database Files
+- genDescData.py <br>
+ Reads through pages in the dump file, and adds short-description info to a database.
+- descData.db <br>
+ Generated by genDescData.py. <br>
+ Tables: <br>
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE`
+ - `redirects`: `id INT PRIMARY KEY, target TEXT`
+ - `descs`: `id INT PRIMARY KEY, desc TEXT`
+
+# Image Database Files
+- genImgData.py <br>
+ Used to find infobox image names for page IDs, storing them into a database.
+- downloadImgLicenseInfo.py <br>
+ Used to download licensing metadata for image names, via wikipedia's online API, storing them into a database.
+- imgData.db <br>
+ Used to hold metadata about infobox images for a set of pageIDs.
+  Generated using genImgData.py and downloadImgLicenseInfo.py. <br>
+ Tables: <br>
+  - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT` <br>
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
+ - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br>
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
+- downloadImgs.py <br>
+ Used to download image files into imgs/.
+
+# Other Files
+- lookupPage.py <br>
+ Running `lookupPage.py title1` looks in the dump for a page with a given title,
+ and prints the contents to stdout. Uses dumpIndex.db.
+
diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
new file mode 100755
index 0000000..399922e
--- /dev/null
+++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3, urllib.parse, html
+import requests
+import time, signal
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+imgDb = "imgData.db"
+apiUrl = "https://en.wikipedia.org/w/api.php"
+userAgent = "terryt.dev (terry06890@gmail.com)"
+batchSz = 50 # Max 50
+tagRegex = re.compile(r"<[^<]+>")
+whitespaceRegex = re.compile(r"\s+")
+
+print("Opening database")
+dbCon = sqlite3.connect(imgDb)
+dbCur = dbCon.cursor()
+dbCur2 = dbCon.cursor()
+print("Checking for table")
+if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
+ dbCur.execute("CREATE TABLE imgs(" \
+ "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
+
+print("Reading image names")
+imgNames = set()
+for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
+ imgNames.add(imgName)
+print(f"Found {len(imgNames)}")
+
+print("Checking for already-processed images")
+oldSz = len(imgNames)
+for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
+ imgNames.discard(imgName)
+print(f"Found {oldSz - len(imgNames)}")
+
+# Set SIGINT handler
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+	# Flag the main loop to stop after its current batch, then restore the
+	# previous handler so a second SIGINT interrupts the process immediately.
+	global interrupted
+	interrupted = True
+	signal.signal(signal.SIGINT, oldHandler)
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+
+print("Iterating through image names")
+imgNames = list(imgNames)
+iterNum = 0
+for i in range(0, len(imgNames), batchSz):
+ iterNum += 1
+ if iterNum % 1 == 0:
+ print(f"At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)")
+ if interrupted:
+ print(f"Exiting loop at iteration {iterNum}")
+ break
+ # Get batch
+ imgBatch = imgNames[i:i+batchSz]
+ imgBatch = ["File:" + x for x in imgBatch]
+ # Make request
+ headers = {
+ "user-agent": userAgent,
+ "accept-encoding": "gzip",
+ }
+ params = {
+ "action": "query",
+ "format": "json",
+ "prop": "imageinfo",
+ "iiprop": "extmetadata|url",
+ "maxlag": "5",
+ "titles": "|".join(imgBatch),
+ "iiextmetadatafilter": "Artist|Credit|LicenseShortName|Restrictions",
+ }
+ responseObj = None
+ try:
+ response = requests.get(apiUrl, params=params, headers=headers)
+ responseObj = response.json()
+ except Exception as e:
+ print(f"ERROR: Exception while downloading info: {e}")
+ print(f"\tImage batch: " + "|".join(imgBatch))
+ continue
+ # Parse response-object
+ if "query" not in responseObj or "pages" not in responseObj["query"]:
+		print("WARNING: Response object doesn't have page data")
+ print("\tImage batch: " + "|".join(imgBatch))
+ if "error" in responseObj:
+ errorCode = responseObj["error"]["code"]
+ print(f"\tError code: {errorCode}")
+ if errorCode == "maxlag":
+ time.sleep(5)
+ continue
+ pages = responseObj["query"]["pages"]
+ normalisedToInput = {}
+ if "normalized" in responseObj["query"]:
+ for entry in responseObj["query"]["normalized"]:
+ normalisedToInput[entry["to"]] = entry["from"]
+ for (_, page) in pages.items():
+ # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data
+ # LicenseShortName: short human-readable license name, apparently more reliable than 'License',
+ # Artist: author name (might contain complex html, multiple authors, etc)
+ # Credit: 'source'
+		# For image-map-like images, can be quite large/complex html, crediting each sub-image
+ # May be <a href="text1">text2</a>, where the text2 might be non-indicative
+ # Restrictions: specifies non-copyright legal restrictions
+ title = page["title"]
+ if title in normalisedToInput:
+ title = normalisedToInput[title]
+ title = title[5:] # Remove 'File:'
+ if title not in imgNames:
+ print(f"WARNING: Got title \"{title}\" not in image-name list")
+ continue
+ if "imageinfo" not in page:
+ print(f"WARNING: No imageinfo section for page \"{title}\"")
+ continue
+ metadata = page["imageinfo"][0]["extmetadata"]
+ url = page["imageinfo"][0]["url"]
+ license = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None
+ artist = metadata['Artist']['value'] if 'Artist' in metadata else None
+ credit = metadata['Credit']['value'] if 'Credit' in metadata else None
+ restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+ # Remove markup
+ if artist != None:
+ artist = tagRegex.sub(" ", artist)
+ artist = whitespaceRegex.sub(" ", artist)
+ artist = html.unescape(artist)
+ artist = urllib.parse.unquote(artist)
+ if credit != None:
+ credit = tagRegex.sub(" ", credit)
+ credit = whitespaceRegex.sub(" ", credit)
+ credit = html.unescape(credit)
+ credit = urllib.parse.unquote(credit)
+ # Add to db
+ dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+ (title, license, artist, credit, restrictions, url))
+
+print("Closing database")
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py
new file mode 100755
index 0000000..8fb605f
--- /dev/null
+++ b/backend/tolData/enwiki/downloadImgs.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python3
+
+import sys, re, os
+import sqlite3
+import urllib.parse, requests
+import time, signal
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory to decide what to skip.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+imgDb = "imgData.db" # About 130k image names
+outDir = "imgs"
+licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+# In testing, this downloaded about 100k images, over several days
+
+if not os.path.exists(outDir):
+ os.mkdir(outDir)
+print("Checking for already-downloaded images")
+fileList = os.listdir(outDir)
+pageIdsDone = set()
+for filename in fileList:
+ (basename, extension) = os.path.splitext(filename)
+ pageIdsDone.add(int(basename))
+print(f"Found {len(pageIdsDone)}")
+
+# Set SIGINT handler
+interrupted = False
+oldHandler = None
+def onSigint(sig, frame):
+	# Flag the main loop to stop after its current download, then restore the
+	# previous handler so a second SIGINT interrupts the process immediately.
+	global interrupted
+	interrupted = True
+	signal.signal(signal.SIGINT, oldHandler)
+oldHandler = signal.signal(signal.SIGINT, onSigint)
+
+print("Opening database")
+dbCon = sqlite3.connect(imgDb)
+dbCur = dbCon.cursor()
+print("Starting downloads")
+iterNum = 0
+query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
+ " imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name"
+for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query):
+ if pageId in pageIdsDone:
+ continue
+ if interrupted:
+ print(f"Exiting loop")
+ break
+ # Check for problematic attributes
+ if license == None or licenseRegex.fullmatch(license) == None:
+ continue
+ if artist == None or artist == "" or len(artist) > 100 or re.match(r"(\d\. )?File:", artist) != None:
+ continue
+ if credit == None or len(credit) > 300 or re.match(r"File:", credit) != None:
+ continue
+ if restrictions != None and restrictions != "":
+ continue
+ # Download image
+ iterNum += 1
+ print(f"Iteration {iterNum}: Downloading for page-id {pageId}")
+ urlParts = urllib.parse.urlparse(url)
+ extension = os.path.splitext(urlParts.path)[1]
+ if len(extension) <= 1:
+ print(f"WARNING: No filename extension found in URL {url}")
+ sys.exit(1)
+ outFile = f"{outDir}/{pageId}{extension}"
+ headers = {
+ "user-agent": "terryt.dev (terry06890@gmail.com)",
+ "accept-encoding": "gzip",
+ }
+ try:
+ response = requests.get(url, headers=headers)
+ with open(outFile, 'wb') as file:
+ file.write(response.content)
+ time.sleep(1)
+ # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
+ # It's unclear how to properly check for cache misses, so this just aims for 1 per sec
+ except Exception as e:
+ print(f"Error while downloading to {outFile}: {e}")
+print("Closing database")
+dbCon.close()
diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py
new file mode 100755
index 0000000..b0ca272
--- /dev/null
+++ b/backend/tolData/enwiki/genDescData.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import bz2
+import html, mwxml, mwparserfromhell
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads through the wiki dump, and attempts to
+parse short-descriptions, and add them to a database.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages
+enwikiDb = "descData.db"
+# In testing, this script took over 10 hours to run, and generated about 5GB
+
+descLineRegex = re.compile("^ *[A-Z'\"]")
+embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
+ # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
+convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
def convertTemplateReplace(match):
	"""Turn a matched {{convert|...}} template into plain text (e.g. '5 km' or '5 to 10 km')."""
	quantity, joiner, quantity2, unit = match.group(1, 2, 3, 4)
	if joiner is None:
		# Single-quantity form: {{convert|5|km|...}}
		return f"{quantity} {unit}"
	# Range form: {{convert|5|to|10|km|...}} or {{convert|5|-|10|km|...}}
	return f"{quantity} {joiner} {quantity2} {unit}"
+parensGroupRegex = re.compile(r" \([^()]*\)")
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
+
+def parseDesc(text):
+	"""Try to extract a short-description/abstract from raw wikitext; returns cleaned text or None."""
+	# Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+	# and then accumulate lines until a blank one.
+	# Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
+	# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
+	lines = []
+	openBraceCount = 0
+	openBracketCount = 0
+	inComment = False
+	skip = False
+	for line in text.splitlines():
+		line = line.strip()
+		if len(lines) == 0:
+			if len(line) > 0:
+				# Skip lines inside (possibly multi-line) {{...}} templates, tracking nesting depth
+				if openBraceCount > 0 or line[0] == "{":
+					openBraceCount += line.count("{")
+					openBraceCount -= line.count("}")
+					skip = True
+				# Likewise for [[...]] links/images
+				if openBracketCount > 0 or line[0] == "[":
+					openBracketCount += line.count("[")
+					openBracketCount -= line.count("]")
+					skip = True
+				# Likewise for <!-- ... --> comments (a line may both open and close one)
+				if inComment or line.find("<!--") != -1:
+					if line.find("-->") != -1:
+						if inComment:
+							inComment = False
+							skip = True
+					else:
+						inComment = True
+					skip = True
+				if skip:
+					skip = False
+					continue
+				if line[-1] == ":": # Seems to help avoid disambiguation pages
+					return None
+				# First 'real' abstract line must look like prose (starts with capital/quote)
+				if descLineRegex.match(line) != None:
+					lines.append(line)
+		else:
+			# Already accumulating: a blank line marks the end of the abstract
+			if len(line) == 0:
+				return removeMarkup(" ".join(lines))
+			lines.append(line)
+	if len(lines) > 0:
+		return removeMarkup(" ".join(lines))
+	return None
def removeMarkup(content):
	"""Strip embedded HTML, convert-templates, wikitext markup, parenthesised groups, and leftover braces."""
	cleaned = embeddedHtmlRegex.sub("", content)
	cleaned = convertTemplateRegex.sub(convertTemplateReplace, cleaned)
	# mwparserfromhell removes general wikitext markup ('''bold''', [[links]], etc.)
	cleaned = mwparserfromhell.parse(cleaned).strip_code()
	cleaned = parensGroupRegex.sub("", cleaned)
	return leftoverBraceRegex.sub("", cleaned)
def convertTitle(title):
	"""Return the title with HTML entities unescaped and underscores replaced by spaces."""
	unescaped = html.unescape(title)
	return unescaped.replace("_", " ")
+
+print("Creating database")
+if os.path.exists(enwikiDb):
+ raise Exception(f"ERROR: Existing {enwikiDb}")
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
+dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
+dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
+dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)")
+dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
+
print("Iterating through dump file")
with bz2.open(dumpFile, mode='rt') as file:
	dump = mwxml.Dump.from_file(file)
	pageNum = 0
	for page in dump:
		pageNum += 1
		if pageNum % 1e4 == 0:
			print(f"At page {pageNum}")
		# NOTE(review): removed leftover debug limit (`if pageNum > 3e4: break`) that
		# stopped after 30k pages of a ~22e6-page dump, contradicting the script's purpose.
		# Parse page (namespace 0 == article pages)
		if page.namespace == 0:
			try:
				dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
			except sqlite3.IntegrityError as e:
				# Accounts for certain pages that have the same title
				print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr)
				continue
			if page.redirect != None:
				dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
			else:
				# First revision holds the page's current wikitext
				revision = next(page)
				desc = parseDesc(revision.text)
				if desc != None:
					dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))

print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py
new file mode 100755
index 0000000..3955885
--- /dev/null
+++ b/backend/tolData/enwiki/genDumpIndexDb.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import bz2
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Adds data from the wiki dump index-file into a database.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
+indexDb = "dumpIndex.db"
+
+if os.path.exists(indexDb):
+ raise Exception(f"ERROR: Existing {indexDb}")
+print("Creating database")
+dbCon = sqlite3.connect(indexDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
+
+print("Iterating through index file")
+lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
+lastOffset = 0
+lineNum = 0
+entriesToAdd = []
+with bz2.open(indexFile, mode='rt') as file:
+ for line in file:
+ lineNum += 1
+ if lineNum % 1e5 == 0:
+ print(f"At line {lineNum}")
+ #
+ match = lineRegex.fullmatch(line.rstrip())
+ (offset, pageId, title) = match.group(1,2,3)
+ offset = int(offset)
+ if offset > lastOffset:
+ for (t, p) in entriesToAdd:
+ try:
+ dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
+ except sqlite3.IntegrityError as e:
+ # Accounts for certain entries in the file that have the same title
+ print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+ entriesToAdd = []
+ lastOffset = offset
+ entriesToAdd.append([title, pageId])
# Flush entries from the final chunk; next_offset of -1 marks 'runs to end of file'
for (title, pageId) in entriesToAdd:
	try:
		dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
	except sqlite3.IntegrityError as e:
		# BUG FIX: previously printed undefined name 't' here (loop variable is 'title'),
		# which raised NameError whenever a duplicate title appeared in the final chunk.
		print(f"Failed on title \"{title}\": {e}", file=sys.stderr)
+
+print("Closing database")
+dbCon.commit()
+dbCon.close()
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
new file mode 100755
index 0000000..dedfe14
--- /dev/null
+++ b/backend/tolData/enwiki/genImgData.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python3
+
+import sys, re
+import bz2, html, urllib.parse
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
def getInputPageIds():
	"""Return the set of wiki page IDs to process, read from the wiki_ids table of ../data.db."""
	pageIds = set()
	dbCon = sqlite3.connect("../data.db")
	try:
		dbCur = dbCon.cursor()
		for (pageId,) in dbCur.execute("SELECT id from wiki_ids"):
			pageIds.add(pageId)
	finally:
		# Close even if the query raises (original leaked the connection on error)
		dbCon.close()
	return pageIds
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
+indexDb = "dumpIndex.db"
+imgDb = "imgData.db" # The database to create
+idLineRegex = re.compile(r"<id>(.*)</id>")
+imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
+bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
+imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
+cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+# In testing, got about 360k image names
+
+print("Getting input page-ids")
+pageIds = getInputPageIds()
+print(f"Found {len(pageIds)}")
+
+print("Opening databases")
+indexDbCon = sqlite3.connect(indexDb)
+indexDbCur = indexDbCon.cursor()
+imgDbCon = sqlite3.connect(imgDb)
+imgDbCur = imgDbCon.cursor()
+print("Checking tables")
+if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ # Create tables if not present
+ imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
+ imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+else:
+ # Check for already-processed page IDs
+ numSkipped = 0
+ for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
+ if pid in pageIds:
+ pageIds.remove(pid)
+ numSkipped += 1
+ else:
+ print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
+ print(f"Will skip {numSkipped} already-processed page IDs")
+
+print("Getting dump-file offsets")
+offsetToPageids = {}
+offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
+iterNum = 0
+for pageId in pageIds:
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
+ row = indexDbCur.execute(query, (pageId,)).fetchone()
+ if row == None:
+ print(f"WARNING: Page ID {pageId} not found")
+ continue
+ (chunkOffset, endOffset) = row
+ offsetToEnd[chunkOffset] = endOffset
+ if chunkOffset not in offsetToPageids:
+ offsetToPageids[chunkOffset] = []
+ offsetToPageids[chunkOffset].append(pageId)
+print(f"Found {len(offsetToEnd)} chunks to check")
+
+print("Iterating through chunks in dump file")
+def getImageName(content):
+	"""Given a list of text-content lines, tries to return an infobox image name, or None.
+
+	Doesn't try to find images in outside-infobox [[File:...]] and <imagemap> sections.
+	"""
+	for line in content:
+		# Look for an infobox '| image = ...' line
+		match = imageLineRegex.match(line)
+		if match != None:
+			imageName = match.group(1).strip()
+			if imageName == "":
+				return None
+			imageName = html.unescape(imageName)
+			# Account for {{css image crop|image=...}} templates
+			if imageName.startswith("{"):
+				match = cssImgCropRegex.match(imageName)
+				if match == None:
+					return None
+				imageName = match.group(1)
+			# Account for [[File:...|...]]
+			if imageName.startswith("["):
+				match = bracketImageRegex.match(imageName)
+				if match == None:
+					return None
+				imageName = match.group(1)
+			# Account for <!--
+			if imageName.find("<!--") != -1:
+				return None
+			# Remove an initial 'File:'
+			if imageName.startswith("File:"):
+				imageName = imageName[5:]
+			# Remove an initial 'Image:'
+			if imageName.startswith("Image:"):
+				imageName = imageName[6:]
+			# Check for extension (jpg/jpeg/png/gif/tiff/tif); normalise and return if found
+			match = imageNameRegex.match(imageName)
+			if match != None:
+				imageName = match.group(0)
+				imageName = urllib.parse.unquote(imageName)
+				imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
+				imageName = imageName.replace("_", " ")
+				return imageName
+			# Exclude lines like: | image = &lt;imagemap&gt;
+			return None
+	return None
+with open(dumpFile, mode='rb') as file:
+ iterNum = 0
+ for (pageOffset, endOffset) in offsetToEnd.items():
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print(f"At iteration {iterNum}")
+ #
+ pageIds = offsetToPageids[pageOffset]
+ # Jump to chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for pages
+ lines = data.splitlines()
+ lineIdx = 0
+ while lineIdx < len(lines):
+ # Look for <page>
+ if lines[lineIdx].lstrip() != "<page>":
+ lineIdx += 1
+ continue
+ # Check page id
+ lineIdx += 3
+ idLine = lines[lineIdx].lstrip()
+ match = idLineRegex.fullmatch(idLine)
+ if match == None or int(match.group(1)) not in pageIds:
+ lineIdx += 1
+ continue
+ pageId = int(match.group(1))
+ lineIdx += 1
+ # Look for <text> in <page>
+ foundText = False
+ while lineIdx < len(lines):
+ if not lines[lineIdx].lstrip().startswith("<text "):
+ lineIdx += 1
+ continue
+ foundText = True
+ # Get text content
+ content = []
+ line = lines[lineIdx]
+ content.append(line[line.find(">") + 1:])
+ lineIdx += 1
+ foundTextEnd = False
+ while lineIdx < len(lines):
+ line = lines[lineIdx]
+ if not line.endswith("</text>"):
+ content.append(line)
+ lineIdx += 1
+ continue
+ foundTextEnd = True
+ content.append(line[:line.rfind("</text>")])
+ # Look for image-filename
+ imageName = getImageName(content)
+ imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
+ break
+ if not foundTextEnd:
+ print(f"WARNING: Did not find </text> for page id {pageId}")
+ break
+ if not foundText:
+ print(f"WARNING: Did not find <text> for page id {pageId}")
+
+print("Closing databases")
+indexDbCon.close()
+imgDbCon.commit()
+imgDbCon.close()
diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
new file mode 100755
index 0000000..1a90851
--- /dev/null
+++ b/backend/tolData/enwiki/lookupPage.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+import sys, re
+import bz2
+import sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]} title1
+
+Looks up a page with title title1 in the wiki dump, using
+the dump-index db, and prints the corresponding <page>.
+"""
+if len(sys.argv) != 2:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
+indexDb = "dumpIndex.db"
+pageTitle = sys.argv[1].replace("_", " ")
+
+print("Looking up offset in index db")
+dbCon = sqlite3.connect(indexDb)
+dbCur = dbCon.cursor()
+query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
+row = dbCur.execute(query, (pageTitle,)).fetchone()
+if row == None:
+ print("Title not found")
+ sys.exit(0)
+_, pageOffset, endOffset = row
+dbCon.close()
+print(f"Found chunk at offset {pageOffset}")
+
+print("Reading from wiki dump")
+content = []
+with open(dumpFile, mode='rb') as file:
+ # Get uncompressed chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for page
+ lines = data.splitlines()
+ lineIdx = 0
+ found = False
+ pageNum = 0
+ while not found:
+ line = lines[lineIdx]
+ if line.lstrip() == "<page>":
+ pageNum += 1
+ if pageNum > 100:
+ print("ERROR: Did not find title after 100 pages")
+ break
+ lineIdx += 1
+ titleLine = lines[lineIdx]
+ if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
+ found = True
+ print(f"Found title in chunk as page {pageNum}")
+ content.append(line)
+ content.append(titleLine)
+ while True:
+ lineIdx += 1
+ line = lines[lineIdx]
+ content.append(line)
+ if line.lstrip() == "</page>":
+ break
+ lineIdx += 1
+
+print("Content: ")
+print("\n".join(content))