about · summary · refs · log · tree · commit · diff
path: root/backend/tolData/enwiki/genImgData.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-07 11:37:37 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-07 11:37:37 +1000
commitdaccbbd9c73a5292ea9d6746560d7009e5aa666d (patch)
tree9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/enwiki/genImgData.py
parent1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff)
Add python type annotations
Also use consistent quote symbols. Also use 'is None' instead of '== None'. Also use 'if list1' instead of 'if len(list1) > 0'.
Diffstat (limited to 'backend/tolData/enwiki/genImgData.py')
-rwxr-xr-xbackend/tolData/enwiki/genImgData.py118
1 file changed, 59 insertions(+), 59 deletions(-)
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
index b5d546d..00140f6 100755
--- a/backend/tolData/enwiki/genImgData.py
+++ b/backend/tolData/enwiki/genImgData.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-import sys, re
+import re
import bz2, html, urllib.parse
import sqlite3
@@ -15,117 +15,117 @@ will skip already-processed page IDs.
parser.parse_args()
def getInputPageIds():
- pageIds = set()
- dbCon = sqlite3.connect("../data.db")
+ pageIds: set[int] = set()
+ dbCon = sqlite3.connect('../data.db')
dbCur = dbCon.cursor()
- for (pageId,) in dbCur.execute("SELECT id from wiki_ids"):
+ for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
pageIds.add(pageId)
dbCon.close()
return pageIds
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
-indexDb = "dumpIndex.db"
-imgDb = "imgData.db" # The database to create
-idLineRegex = re.compile(r"<id>(.*)</id>")
-imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
-bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
-imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
-cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
+indexDb = 'dumpIndex.db'
+imgDb = 'imgData.db' # The database to create
+idLineRegex = re.compile(r'<id>(.*)</id>')
+imageLineRegex = re.compile(r'.*\| *image *= *([^|]*)')
+bracketImageRegex = re.compile(r'\[\[(File:[^|]*).*]]')
+imageNameRegex = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
+cssImgCropRegex = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
-print("Getting input page-ids")
+print('Getting input page-ids')
pageIds = getInputPageIds()
-print(f"Found {len(pageIds)}")
+print(f'Found {len(pageIds)}')
-print("Opening databases")
+print('Opening databases')
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
-print("Checking tables")
-if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+print('Checking tables')
+if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
# Create tables if not present
- imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
- imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+ imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+ imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
else:
# Check for already-processed page IDs
numSkipped = 0
- for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
+ for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'):
if pid in pageIds:
pageIds.remove(pid)
numSkipped += 1
else:
- print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
- print(f"Will skip {numSkipped} already-processed page IDs")
+ print(f'WARNING: Found already-processed page ID {pid} which was not in input set')
+ print(f'Will skip {numSkipped} already-processed page IDs')
-print("Getting dump-file offsets")
-offsetToPageids = {}
-offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
+print('Getting dump-file offsets')
+offsetToPageids: dict[int, list[int]] = {}
+offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
iterNum = 0
for pageId in pageIds:
iterNum += 1
if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
+ print(f'At iteration {iterNum}')
#
- query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
- row = indexDbCur.execute(query, (pageId,)).fetchone()
- if row == None:
- print(f"WARNING: Page ID {pageId} not found")
+ query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
+ row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
+ if row is None:
+ print(f'WARNING: Page ID {pageId} not found')
continue
- (chunkOffset, endOffset) = row
+ chunkOffset, endOffset = row
offsetToEnd[chunkOffset] = endOffset
if chunkOffset not in offsetToPageids:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
-print(f"Found {len(offsetToEnd)} chunks to check")
+print(f'Found {len(offsetToEnd)} chunks to check')
-print("Iterating through chunks in dump file")
-def getImageName(content):
- " Given an array of text-content lines, tries to return an infoxbox image name, or None "
+print('Iterating through chunks in dump file')
+def getImageName(content: list[str]) -> str | None:
+ """ Given an array of text-content lines, tries to return an infoxbox image name, or None """
# Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
for line in content:
match = imageLineRegex.match(line)
- if match != None:
+ if match is not None:
imageName = match.group(1).strip()
- if imageName == "":
+ if imageName == '':
return None
imageName = html.unescape(imageName)
# Account for {{...
- if imageName.startswith("{"):
+ if imageName.startswith('{'):
match = cssImgCropRegex.match(imageName)
- if match == None:
+ if match is None:
return None
imageName = match.group(1)
# Account for [[File:...|...]]
- if imageName.startswith("["):
+ if imageName.startswith('['):
match = bracketImageRegex.match(imageName)
- if match == None:
+ if match is None:
return None
imageName = match.group(1)
# Account for <!--
- if imageName.find("<!--") != -1:
+ if imageName.find('<!--') != -1:
return None
# Remove an initial 'File:'
- if imageName.startswith("File:"):
+ if imageName.startswith('File:'):
imageName = imageName[5:]
# Remove an initial 'Image:'
- if imageName.startswith("Image:"):
+ if imageName.startswith('Image:'):
imageName = imageName[6:]
# Check for extension
match = imageNameRegex.match(imageName)
- if match != None:
+ if match is not None:
imageName = match.group(0)
imageName = urllib.parse.unquote(imageName)
imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
- imageName = imageName.replace("_", " ")
+ imageName = imageName.replace('_', ' ')
return imageName
# Exclude lines like: | image = &lt;imagemap&gt;
return None
return None
with open(dumpFile, mode='rb') as file:
iterNum = 0
- for (pageOffset, endOffset) in offsetToEnd.items():
+ for pageOffset, endOffset in offsetToEnd.items():
iterNum += 1
if iterNum % 100 == 0:
- print(f"At iteration {iterNum}")
+ print(f'At iteration {iterNum}')
#
pageIds = offsetToPageids[pageOffset]
# Jump to chunk
@@ -137,14 +137,14 @@ with open(dumpFile, mode='rb') as file:
lineIdx = 0
while lineIdx < len(lines):
# Look for <page>
- if lines[lineIdx].lstrip() != "<page>":
+ if lines[lineIdx].lstrip() != '<page>':
lineIdx += 1
continue
# Check page id
lineIdx += 3
idLine = lines[lineIdx].lstrip()
match = idLineRegex.fullmatch(idLine)
- if match == None or int(match.group(1)) not in pageIds:
+ if match is None or int(match.group(1)) not in pageIds:
lineIdx += 1
continue
pageId = int(match.group(1))
@@ -152,35 +152,35 @@ with open(dumpFile, mode='rb') as file:
# Look for <text> in <page>
foundText = False
while lineIdx < len(lines):
- if not lines[lineIdx].lstrip().startswith("<text "):
+ if not lines[lineIdx].lstrip().startswith('<text '):
lineIdx += 1
continue
foundText = True
# Get text content
- content = []
+ content: list[str] = []
line = lines[lineIdx]
- content.append(line[line.find(">") + 1:])
+ content.append(line[line.find('>') + 1:])
lineIdx += 1
foundTextEnd = False
while lineIdx < len(lines):
line = lines[lineIdx]
- if not line.endswith("</text>"):
+ if not line.endswith('</text>'):
content.append(line)
lineIdx += 1
continue
foundTextEnd = True
- content.append(line[:line.rfind("</text>")])
+ content.append(line[:line.rfind('</text>')])
# Look for image-filename
imageName = getImageName(content)
- imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
+ imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName))
break
if not foundTextEnd:
- print(f"WARNING: Did not find </text> for page id {pageId}")
+ print(f'WARNING: Did not find </text> for page id {pageId}')
break
if not foundText:
- print(f"WARNING: Did not find <text> for page id {pageId}")
+ print(f'WARNING: Did not find <text> for page id {pageId}')
-print("Closing databases")
+print('Closing databases')
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()