diff options
Diffstat (limited to 'backend/data/enwiki/genPageData.py')
| -rwxr-xr-x | backend/data/enwiki/genPageData.py | 39 |
1 files changed, 0 insertions, 39 deletions
diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py deleted file mode 100755 index 7522f1f..0000000 --- a/backend/data/enwiki/genPageData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path -from mwsql import Dump -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n" -usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n" -usageInfo += "a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -pageDumpFile = "enwiki-20220420-page.sql.gz" -enwikiDb = "enwikiData.db" - -# Check for existing db -if os.path.exists(enwikiDb): - print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -# Parse page data -dump = Dump.from_file(pageDumpFile) -iterationNum = 0 -for row in dump.rows(convert_dtypes=True): - iterationNum += 1 - if iterationNum % 1e6 == 0: - print("At iteration {}".format(iterationNum)) - # Add to map - if row[1] == 0: # If page in article namespace - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " "))) -# Close db -dbCon.commit() -dbCon.close() |
