diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-03 11:03:25 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-03 11:03:25 +1000 |
| commit | 811946498edc472d91e5ca8d41a4a0568e0d6e8f (patch) | |
| tree | 3ec4f0950950bc7b3cff782f4f9c2b13f9b51cb3 /backend/data/enwiki/genDumpIndexDb.py | |
| parent | 515e02b9453f7740d7429ad7e11d913e32e5ffdb (diff) | |
Adjust enwiki dump-index-db and lookup script to include wiki-ids
Diffstat (limited to 'backend/data/enwiki/genDumpIndexDb.py')
| -rwxr-xr-x | backend/data/enwiki/genDumpIndexDb.py | 18 |
1 file changed, 9 insertions, 9 deletions
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py index 450754b..ee3e813 100755 --- a/backend/data/enwiki/genDumpIndexDb.py +++ b/backend/data/enwiki/genDumpIndexDb.py @@ -21,12 +21,12 @@ if os.path.exists(indexDb): # Create db dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)") +dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)") # Reading index file lineRegex = re.compile(r"([^:]+):([^:]+):(.*)") lastOffset = 0 lineNum = 0 -titlesToAdd = [] +entriesToAdd = [] with bz2.open(indexFile, mode='rt') as file: for line in file: lineNum += 1 @@ -34,21 +34,21 @@ with bz2.open(indexFile, mode='rt') as file: print(f"At line {lineNum}") # match = lineRegex.fullmatch(line.rstrip()) - (offset, _, title) = match.group(1,2,3) + (offset, pageId, title) = match.group(1,2,3) offset = int(offset) if offset > lastOffset: - for t in titlesToAdd: + for (t, p) in entriesToAdd: try: - dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset)) + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset)) except sqlite3.IntegrityError as e: # Accounts for certain entries in the file that have the same title print(f"Failed on title \"{t}\": {e}") - titlesToAdd = [] + entriesToAdd = [] lastOffset = offset - titlesToAdd.append(title) -for title in titlesToAdd: + entriesToAdd.append([title, pageId]) +for (title, pageId) in entriesToAdd: try: - dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1)) + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1)) except sqlite3.IntegrityError as e: print(f"Failed on title \"{t}\": {e}") # Close db |
