diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
| commit | 29940d51eb8b6b220d53940ecbc212cea78159ae (patch) | |
| tree | bfa698c17525de7876b80ad37d8f7777b9505ba0 /backend/data/enwiki/genRedirectData.py | |
| parent | a840a16c6bd5aef906bd5cbce8293fc863cb5a5d (diff) | |
Improve enwiki description extraction
Adjust enwiki code to handle single dump file, and add scripts for
'convenient' page-content lookup.
Diffstat (limited to 'backend/data/enwiki/genRedirectData.py')
| -rwxr-xr-x | backend/data/enwiki/genRedirectData.py | 39 |
1 files changed, 0 insertions, 39 deletions
diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py deleted file mode 100755 index e1aadc8..0000000 --- a/backend/data/enwiki/genRedirectData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path -from mwsql import Dump -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n" -usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n" -usageInfo += "a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -redirectDumpFile = "enwiki-20220420-redirect.sql.gz" -enwikiDb = "enwikiData.db" - -# Open db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)") -dbCur2 = dbCon.cursor() -# Parse redirect data -dump = Dump.from_file(redirectDumpFile) -iterationNum = 0 -for row in dump.rows(convert_dtypes=True): - iterationNum += 1 - if iterationNum % 1e6 == 0: - print("At iteration {}".format(iterationNum)) - # Add to map - [pageId, namespace, title] = row[:3] - if namespace == 0: # If page is in the article namespace - row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone() - if row != None: - targetId = row[0] - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId)) -# Close db -dbCon.commit() -dbCon.close() |
