about | summary | refs | log | tree | commit | diff
path: root/backend/data/enwiki/genRedirectData.py
diff options
context:
space:
mode:
author: Terry Truong <terry06890@gmail.com>, 2022-05-17 10:41:12 +1000
committer: Terry Truong <terry06890@gmail.com>, 2022-05-17 10:41:12 +1000
commit: 29940d51eb8b6b220d53940ecbc212cea78159ae (patch)
tree: bfa698c17525de7876b80ad37d8f7777b9505ba0 /backend/data/enwiki/genRedirectData.py
parent: a840a16c6bd5aef906bd5cbce8293fc863cb5a5d (diff)
Improve enwiki description extraction
Adjust enwiki code to handle single dump file, and add scripts for 'convenient' page-content lookup.
Diffstat (limited to 'backend/data/enwiki/genRedirectData.py')
-rwxr-xr-x backend/data/enwiki/genRedirectData.py | 39
1 files changed, 0 insertions, 39 deletions
diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py
deleted file mode 100755
index e1aadc8..0000000
--- a/backend/data/enwiki/genRedirectData.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os.path
-from mwsql import Dump
-import sqlite3
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n"
-usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n"
-usageInfo += "a sqlite db.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-redirectDumpFile = "enwiki-20220420-redirect.sql.gz"
-enwikiDb = "enwikiData.db"
-
-# Open db
-dbCon = sqlite3.connect(enwikiDb)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)")
-dbCur2 = dbCon.cursor()
-# Parse redirect data
-dump = Dump.from_file(redirectDumpFile)
-iterationNum = 0
-for row in dump.rows(convert_dtypes=True):
- iterationNum += 1
- if iterationNum % 1e6 == 0:
- print("At iteration {}".format(iterationNum))
- # Add to map
- [pageId, namespace, title] = row[:3]
- if namespace == 0: # If page is in the article namespace
- row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone()
- if row != None:
- targetId = row[0]
- dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId))
-# Close db
-dbCon.commit()
-dbCon.close()