aboutsummaryrefslogtreecommitdiff
path: root/backend/data/enwiki/genRedirectData.py
blob: e1aadc8f71641f24e182c81d805988e1c9aba578 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/python3

import sys, os.path
from mwsql import Dump
import sqlite3

# Usage text; printed to stderr when any command-line argument is supplied,
# since the script itself takes none.
usageInfo =  f"usage: {sys.argv[0]}\n"
usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n"
usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n"
usageInfo += "a sqlite db.\n"
if len(sys.argv) > 1:
	print(usageInfo, file=sys.stderr)
	sys.exit(1)

# Input dump and output database, both resolved relative to the working directory
redirectDumpFile = "enwiki-20220420-redirect.sql.gz"
enwikiDb = "enwikiData.db"

# Open db
# The db is expected to already hold a 'pages' table with 'id' and 'title'
# columns (queried below); the 'redirects' table is created here, so the
# CREATE TABLE fails if it already exists.
dbCon = sqlite3.connect(enwikiDb)
try:
	dbCur = dbCon.cursor()
	dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)")
	# Separate cursor for the per-row title lookups
	dbCur2 = dbCon.cursor()
	# Parse redirect data
	dump = Dump.from_file(redirectDumpFile)
	iterationNum = 0
	for row in dump.rows(convert_dtypes=True):
		iterationNum += 1
		if iterationNum % 1_000_000 == 0:
			print(f"At iteration {iterationNum}")
		# Only the first three columns of each dump row are used
		pageId, namespace, title = row[:3]
		if namespace != 0: # Only keep entries in the article namespace
			continue
		# Resolve the title to a page id; the dump title has its underscores
		# replaced with spaces before matching against pages.title
		targetRow = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone()
		if targetRow is not None: # Skip redirects whose target title is not in 'pages'
			targetId = targetRow[0]
			dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId))
	# Commit only after the whole dump was processed without error
	dbCon.commit()
finally:
	# Close db, even if parsing or a query raised
	dbCon.close()