about | summary | refs | log | tree | commit | diff
path: root/backend/tolData/enwiki/genDumpIndexDb.py
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com> 2022-09-07 11:37:37 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-09-07 11:37:37 +1000
commit: daccbbd9c73a5292ea9d6746560d7009e5aa666d (patch)
tree: 9156bf011ab6302eb3c0d219d40587d594f51841 /backend/tolData/enwiki/genDumpIndexDb.py
parent: 1a7fe33edafa68a6f759d124bdeee673ff9cf9ff (diff)
Add python type annotations
Also use consistent quote symbols
Also use 'is None' instead of '== None'
Also use 'if list1' instead of 'if len(list1) > 0'
Diffstat (limited to 'backend/tolData/enwiki/genDumpIndexDb.py')
-rwxr-xr-x backend/tolData/enwiki/genDumpIndexDb.py 39
1 files changed, 20 insertions, 19 deletions
diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py
index 1bffb27..3bd129f 100755
--- a/backend/tolData/enwiki/genDumpIndexDb.py
+++ b/backend/tolData/enwiki/genDumpIndexDb.py
@@ -10,46 +10,47 @@ Adds data from the wiki dump index-file into a database
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
-indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
-indexDb = "dumpIndex.db"
+indexFile = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
+indexDb = 'dumpIndex.db'
if os.path.exists(indexDb):
- raise Exception(f"ERROR: Existing {indexDb}")
-print("Creating database")
+ raise Exception(f'ERROR: Existing {indexDb}')
+print('Creating database')
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
+dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
-print("Iterating through index file")
-lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
+print('Iterating through index file')
+lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
lastOffset = 0
lineNum = 0
-entriesToAdd = []
+entriesToAdd: list[tuple[str, str]] = []
with bz2.open(indexFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"At line {lineNum}")
+ print(f'At line {lineNum}')
#
match = lineRegex.fullmatch(line.rstrip())
- (offset, pageId, title) = match.group(1,2,3)
- offset = int(offset)
+ assert match is not None
+ offsetStr, pageId, title = match.group(1,2,3)
+ offset = int(offsetStr)
if offset > lastOffset:
- for (t, p) in entriesToAdd:
+ for t, p in entriesToAdd:
try:
- dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
+ dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset))
except sqlite3.IntegrityError as e:
# Accounts for certain entries in the file that have the same title
- print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+ print(f'Failed on title "{t}": {e}', file=sys.stderr)
entriesToAdd = []
lastOffset = offset
- entriesToAdd.append([title, pageId])
-for (title, pageId) in entriesToAdd:
+ entriesToAdd.append((title, pageId))
+for title, pageId in entriesToAdd:
try:
- dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
+ dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
except sqlite3.IntegrityError as e:
- print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
+ print(f'Failed on title "{t}": {e}', file=sys.stderr)
-print("Closing database")
+print('Closing database')
dbCon.commit()
dbCon.close()