aboutsummaryrefslogtreecommitdiff
path: root/backend/data/enwiki/genImgData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/enwiki/genImgData.py')
-rwxr-xr-xbackend/data/enwiki/genImgData.py72
1 files changed, 42 insertions, 30 deletions
diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py
index 9bd28f4..dedfe14 100755
--- a/backend/data/enwiki/genImgData.py
+++ b/backend/data/enwiki/genImgData.py
@@ -4,9 +4,15 @@ import sys, re
import bz2, html, urllib.parse
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
-usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -21,58 +27,64 @@ def getInputPageIds():
return pageIds
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
-imgDb = "imgData.db" # Output db
+imgDb = "imgData.db" # The database to create
idLineRegex = re.compile(r"<id>(.*)</id>")
imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+# In testing, got about 360k image names
-# Open dbs
+print("Getting input page-ids")
+pageIds = getInputPageIds()
+print(f"Found {len(pageIds)}")
+
+print("Opening databases")
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
-# Create image-db table
-pidsDone = set()
+print("Checking tables")
if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ # Create tables if not present
imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
else:
+ # Check for already-processed page IDs
+ numSkipped = 0
for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
- pidsDone.add(pid)
- print(f"Will skip {len(pidsDone)} already-processed page-ids")
-# Get input pageIds
-print("Getting input page-ids", file=sys.stderr)
-pageIds = getInputPageIds()
-for pid in pidsDone:
- pageIds.remove(pid)
-print(f"Found {len(pageIds)} page-ids to process")
-# Get page-id dump-file offsets
-print("Getting dump-file offsets", file=sys.stderr)
+ if pid in pageIds:
+ pageIds.remove(pid)
+ numSkipped += 1
+ else:
+ print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
+ print(f"Will skip {numSkipped} already-processed page IDs")
+
+print("Getting dump-file offsets")
offsetToPageids = {}
-offsetToEnd = {}
+offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
iterNum = 0
for pageId in pageIds:
iterNum += 1
if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}", file=sys.stderr)
+ print(f"At iteration {iterNum}")
#
query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
row = indexDbCur.execute(query, (pageId,)).fetchone()
if row == None:
- print(f"WARNING: Page id {pageId} not found", file=sys.stderr)
+ print(f"WARNING: Page ID {pageId} not found")
continue
(chunkOffset, endOffset) = row
offsetToEnd[chunkOffset] = endOffset
if chunkOffset not in offsetToPageids:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
-print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr)
-# Look through dump file, jumping to chunks containing relevant pages
-print("Reading through dump file", file=sys.stderr)
+print(f"Found {len(offsetToEnd)} chunks to check")
+
+print("Iterating through chunks in dump file")
def getImageName(content):
- """ Given an array of text-content lines, returns an image-filename, or None """
+ " Given an array of text-content lines, tries to return an infobox image name, or None "
+ # Doesn't try to find images in outside-infobox [[File:...]] or <imagemap> sections
for line in content:
match = imageLineRegex.match(line)
if match != None:
@@ -109,16 +121,15 @@ def getImageName(content):
imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
imageName = imageName.replace("_", " ")
return imageName
- # Skip lines like: | image = &lt;imagemap&gt;
+ # Exclude lines like: | image = &lt;imagemap&gt;
return None
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
return None
with open(dumpFile, mode='rb') as file:
iterNum = 0
for (pageOffset, endOffset) in offsetToEnd.items():
iterNum += 1
if iterNum % 100 == 0:
- print(f"At iteration {iterNum}", file=sys.stderr)
+ print(f"At iteration {iterNum}")
#
pageIds = offsetToPageids[pageOffset]
# Jump to chunk
@@ -168,11 +179,12 @@ with open(dumpFile, mode='rb') as file:
imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
break
if not foundTextEnd:
- print(f"Did not find </text> for page id {pageId}", file=sys.stderr)
+ print(f"WARNING: Did not find </text> for page id {pageId}")
break
if not foundText:
- print(f"Did not find <text> for page id {pageId}", file=sys.stderr)
-# Close dbs
+ print(f"WARNING: Did not find <text> for page id {pageId}")
+
+print("Closing databases")
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()