diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
| commit | 29940d51eb8b6b220d53940ecbc212cea78159ae (patch) | |
| tree | bfa698c17525de7876b80ad37d8f7777b9505ba0 /backend/data/enwiki/genDumpIndexDb.py | |
| parent | a840a16c6bd5aef906bd5cbce8293fc863cb5a5d (diff) | |
Improve enwiki description extraction
Adjust enwiki code to handle single dump file, and add scripts for
'convenient' page-content lookup.
Diffstat (limited to 'backend/data/enwiki/genDumpIndexDb.py')
| -rwxr-xr-x | backend/data/enwiki/genDumpIndexDb.py | 56 |
1 files changed, 56 insertions, 0 deletions
#!/usr/bin/python3
"""Generate an sqlite database from a Wikimedia enwiki dump index file.

Each line of the bz2-compressed index file has the form
"offset:field:title" (the middle field is not used here). Lines are
collected into groups: a new group starts whenever a line's offset exceeds
the current group's offset. Every title in a group is stored along with the
group's offset and the offset of the following group (-1 for the last group).
"""

import sys
import os
import re
import bz2
import sqlite3

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads a Wikimedia enwiki dump index file,\n"
usageInfo += "and stores it's offset and title data to an sqlite db.\n"

indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines
indexDb = "dumpIndex.db"

# Splits a line into offset, an unused middle field, and the title.
# Only the first two ':' act as separators, so titles may contain ':'.
lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")

# How often (in lines) to print a progress message
progressInterval = 100000


def _insertTitles(dbCur, titles, offset, nextOffset):
    """Insert one row per title, reporting duplicate titles instead of failing.

    dbCur: cursor for a db holding the 'offsets' table
    titles: titles that share the same offset
    offset: value for the 'offset' column
    nextOffset: value for the 'next_offset' column
    """
    for title in titles:
        try:
            dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, offset, nextOffset))
        except sqlite3.IntegrityError as e:
            # Accounts for certain entries in the file that have the same title
            print("Failed on title \"{}\": {}".format(title, e))


def genDumpIndexDb(indexFile, indexDb):
    """Read the index file and write its offset/title data into a new sqlite db.

    indexFile: path to the bz2-compressed dump index file
    indexDb: path of the sqlite db to create (should not already hold an 'offsets' table)

    Raises ValueError if a line does not have the "offset:field:title" form.
    Duplicate titles are reported on stdout and skipped (the first one wins).
    """
    dbCon = sqlite3.connect(indexDb)
    try:
        dbCur = dbCon.cursor()
        dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)")
        lastOffset = 0
        titlesToAdd = [] # Titles seen for lastOffset, held until the next offset is known
        # Decode explicitly rather than relying on the locale's default encoding
        # (assumes the dump index is UTF-8 -- TODO confirm)
        with bz2.open(indexFile, mode='rt', encoding='utf-8') as file:
            for lineNum, line in enumerate(file, start=1):
                if lineNum % progressInterval == 0:
                    print("At line {}".format(lineNum))
                #
                match = lineRegex.fullmatch(line.rstrip())
                if match is None:
                    raise ValueError("Malformed index line {}: {!r}".format(lineNum, line))
                (offset, _, title) = match.group(1, 2, 3)
                offset = int(offset)
                if offset > lastOffset:
                    # The previous group's next_offset is now known, so flush it
                    _insertTitles(dbCur, titlesToAdd, lastOffset, offset)
                    titlesToAdd = []
                    lastOffset = offset
                titlesToAdd.append(title)
        # The final group has no following offset, which is marked with -1
        _insertTitles(dbCur, titlesToAdd, lastOffset, -1)
        dbCon.commit()
    finally:
        # Release the connection even if reading or parsing failed
        dbCon.close()


def main():
    """Check the command line and output path, then generate the db."""
    if len(sys.argv) > 1:
        print(usageInfo, file=sys.stderr)
        sys.exit(1)
    # Check for existing db
    if os.path.exists(indexDb):
        print("ERROR: Existing {}".format(indexDb), file=sys.stderr)
        sys.exit(1)
    genDumpIndexDb(indexFile, indexDb)


if __name__ == "__main__":
    main()
