about | summary | refs | log | tree | commit | diff
path: root/backend/data/enwiki
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/enwiki')
-rwxr-xr-x backend/data/enwiki/genData.py 10
-rwxr-xr-x backend/data/enwiki/genDumpIndexDb.py 8
-rwxr-xr-x backend/data/enwiki/lookupPage.py 4
3 files changed, 11 insertions, 11 deletions
diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py
index 646292c..f439d11 100755
--- a/backend/data/enwiki/genData.py
+++ b/backend/data/enwiki/genData.py
@@ -24,9 +24,9 @@ parensGrpRegex = re.compile(r" \([^()]*\)")
leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
def convertTemplateReplace(match):
if match.group(2) == None:
- return "{} {}".format(match.group(1), match.group(4))
+ return f"{match.group(1)} {match.group(4)}"
else:
- return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))
+ return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}"
def parseDesc(text):
# Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank
# Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines,
@@ -83,7 +83,7 @@ def convertTitle(title):
# Check for existing db
if os.path.exists(enwikiDb):
- print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr)
+ print(f"ERROR: Existing {enwikiDb}", file=sys.stderr)
sys.exit(1)
# Create db
dbCon = sqlite3.connect(enwikiDb)
@@ -101,14 +101,14 @@ with bz2.open(dumpFile, mode='rt') as file:
for page in dump:
pageNum += 1
if pageNum % 1e4 == 0:
- print("At page {}".format(pageNum))
+ print(f"At page {pageNum}")
# Parse page
if page.namespace == 0:
try:
dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
except sqlite3.IntegrityError as e:
# Accounts for certain pages that have the same title
- print("Failed to add page with title \"{}\": {}".format(page.title, e))
+ print(f"Failed to add page with title \"{page.title}\": {e}")
continue
if page.redirect != None:
dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py
index 13f7eb6..450754b 100755
--- a/backend/data/enwiki/genDumpIndexDb.py
+++ b/backend/data/enwiki/genDumpIndexDb.py
@@ -16,7 +16,7 @@ indexDb = "dumpIndex.db"
# Check for existing db
if os.path.exists(indexDb):
- print("ERROR: Existing {}".format(indexDb), file=sys.stderr)
+ print(f"ERROR: Existing {indexDb}", file=sys.stderr)
sys.exit(1)
# Create db
dbCon = sqlite3.connect(indexDb)
@@ -31,7 +31,7 @@ with bz2.open(indexFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print("At line {}".format(lineNum))
+ print(f"At line {lineNum}")
#
match = lineRegex.fullmatch(line.rstrip())
(offset, _, title) = match.group(1,2,3)
@@ -42,7 +42,7 @@ with bz2.open(indexFile, mode='rt') as file:
dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset))
except sqlite3.IntegrityError as e:
# Accounts for certain entries in the file that have the same title
- print("Failed on title \"{}\": {}".format(t, e))
+ print(f"Failed on title \"{t}\": {e}")
titlesToAdd = []
lastOffset = offset
titlesToAdd.append(title)
@@ -50,7 +50,7 @@ for title in titlesToAdd:
try:
dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1))
except sqlite3.IntegrityError as e:
- print("Failed on title \"{}\": {}".format(t, e))
+ print(f"Failed on title \"{t}\": {e}")
# Close db
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py
index c795c35..1d379e7 100755
--- a/backend/data/enwiki/lookupPage.py
+++ b/backend/data/enwiki/lookupPage.py
@@ -26,7 +26,7 @@ if row == None:
sys.exit(0)
(_, pageOffset, endOffset) = row
dbCon.close()
-print("Found chunk at offset {}".format(pageOffset))
+print(f"Found chunk at offset {pageOffset}")
# Read dump file
print("Reading dump file")
content = []
@@ -51,7 +51,7 @@ with open(dumpFile, mode='rb') as file:
titleLine = lines[lineIdx]
if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
found = True
- print("Found title in chunk as page {}".format(pageNum))
+ print(f"Found title in chunk as page {pageNum}")
content.append(line)
content.append(titleLine)
while True: