diff options
Diffstat (limited to 'backend/data/enwiki/genRedirectData.py')
| -rwxr-xr-x | backend/data/enwiki/genRedirectData.py | 39 |
1 files changed, 39 insertions, 0 deletions
diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py new file mode 100755 index 0000000..e1aadc8 --- /dev/null +++ b/backend/data/enwiki/genRedirectData.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import sys, os.path +from mwsql import Dump +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n" +usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n" +usageInfo += "a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +redirectDumpFile = "enwiki-20220420-redirect.sql.gz" +enwikiDb = "enwikiData.db" + +# Open db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)") +dbCur2 = dbCon.cursor() +# Parse redirect data +dump = Dump.from_file(redirectDumpFile) +iterationNum = 0 +for row in dump.rows(convert_dtypes=True): + iterationNum += 1 + if iterationNum % 1e6 == 0: + print("At iteration {}".format(iterationNum)) + # Add to map + [pageId, namespace, title] = row[:3] + if namespace == 0: # If page is in the article namespace + row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone() + if row != None: + targetId = row[0] + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId)) +# Close db +dbCon.commit() +dbCon.close() |
