about | summary | refs | log | tree | commit | diff
path: root/backend/data/genEnwikiDescData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/genEnwikiDescData.py')
-rwxr-xr-x backend/data/genEnwikiDescData.py 102
1 files changed, 0 insertions, 102 deletions
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
deleted file mode 100755
index d3f93ed..0000000
--- a/backend/data/genEnwikiDescData.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import sqlite3
-
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Reads a database containing data from Wikipedia, and tries to associate
-wiki pages with nodes in the database, and add descriptions for nodes
-that don't have them.
-"""
-if len(sys.argv) > 1:
-    print(usageInfo, file=sys.stderr)
-    sys.exit(1)
-
-enwikiDb = "enwiki/descData.db"
-dbFile = "data.db"
-namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
-pickedLabelsFile = "pickedEnwikiLabels.txt"
-# Got about 25k descriptions when testing
-
-print("Opening databases")
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Checking for names to skip")
-namesToSkip = set()
-if os.path.exists(namesToSkipFile):
-    # Optional file: one node name per line
-    with open(namesToSkipFile) as file:
-        for line in file:
-            namesToSkip.add(line.rstrip())
-    print(f"Found {len(namesToSkip)}")
-print("Checking for picked-titles")
-nameToPickedTitle = {}
-if os.path.exists(pickedLabelsFile):
-    # Optional file: one "name|title" pair per line, keyed by the lower-cased name
-    # NOTE(review): lookups below use the node name as-is; confirm node names are lower-case
-    with open(pickedLabelsFile) as file:
-        for line in file:
-            (name, _, title) = line.rstrip().partition("|")
-            nameToPickedTitle[name.lower()] = title
-print(f"Found {len(nameToPickedTitle)}")
-
-print("Getting names of nodes without descriptions")
-nodeNames = set()
-query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL"
-for (name,) in dbCur.execute(query):
-    nodeNames.add(name)
-print(f"Found {len(nodeNames)}")
-nodeNames.difference_update(namesToSkip)
-
-print("Associating nodes with page IDs")
-nodeToPageId = {}
-for (iterNum, name) in enumerate(nodeNames, 1):
-    if iterNum % 1e4 == 0:
-        print(f"At iteration {iterNum}")
-    # Without a picked title, match the node name against page titles (COLLATE NOCASE)
-    if name not in nameToPickedTitle:
-        row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
-        if row is not None:
-            nodeToPageId[name] = row[0]
-    else:
-        # A picked title has to match a page title exactly
-        title = nameToPickedTitle[name]
-        row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone()
-        if row is not None:
-            nodeToPageId[name] = row[0]
-        else:
-            print(f"WARNING: Picked title {title} not found", file=sys.stderr)
-
-print("Resolving redirects")
-redirectingNames = set()
-for (iterNum, (name, pageId)) in enumerate(nodeToPageId.items(), 1):
-    if iterNum % 1e3 == 0:
-        print(f"At iteration {iterNum}")
-    # If the page is a redirect, switch to the ID of the page it targets
-    # (only values of existing keys are replaced, so iterating items() stays valid)
-    query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?"
-    row = enwikiCur.execute(query, (pageId,)).fetchone()
-    if row is not None:
-        nodeToPageId[name] = row[0]
-        redirectingNames.add(name)
-
-print("Adding description data")
-for (iterNum, (name, pageId)) in enumerate(nodeToPageId.items(), 1):
-    if iterNum % 1e3 == 0:
-        print(f"At iteration {iterNum}")
-    # Nodes whose page has no description row get no wiki_ids entry from this script
-    row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
-    if row is not None:
-        # Third wiki_ids value is 1 when the name was resolved via a redirect, else 0
-        dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0))
-        dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-enwikiCon.close()