about | summary | refs | log | tree | commit | diff
path: root/backend/data/genEnwikiDescData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/genEnwikiDescData.py')
-rwxr-xr-x backend/data/genEnwikiDescData.py 54
1 files changed, 30 insertions, 24 deletions
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index dbc8d6b..d3f93ed 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -3,10 +3,13 @@
import sys, re, os
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data"
-usageInfo += "from a sqlite database, and adds description data for names that\n"
-usageInfo += "don't have them.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads a database containing data from Wikipedia, and tries to associate
+wiki pages with nodes in the database, and add descriptions for nodes
+that don't have them.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -15,36 +18,39 @@ enwikiDb = "enwiki/descData.db"
dbFile = "data.db"
namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
pickedLabelsFile = "pickedEnwikiLabels.txt"
+# Got about 25k descriptions when testing
-# Open dbs
+print("Opening databases")
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Read name/title files
+
+print("Checking for names to skip")
namesToSkip = set()
-nameToPickedTitle = {} # Maps names to titles to be used for them
if os.path.exists(namesToSkipFile):
with open(namesToSkipFile) as file:
for line in file:
namesToSkip.add(line.rstrip())
- print(f"Read in {len(namesToSkip)} names to skip")
+ print(f"Found {len(namesToSkip)}")
+print("Checking for picked-titles")
+nameToPickedTitle = {}
if os.path.exists(pickedLabelsFile):
with open(pickedLabelsFile) as file:
for line in file:
(name, _, title) = line.rstrip().partition("|")
nameToPickedTitle[name.lower()] = title
-print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
-# Get node names without descriptions
-print("Getting node names")
+print(f"Found {len(nameToPickedTitle)}")
+
+print("Getting names of nodes without descriptions")
nodeNames = set()
query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL"
-for row in dbCur.execute(query):
- nodeNames.add(row[0])
-print(f"Found {len(nodeNames)} names")
+for (name,) in dbCur.execute(query):
+ nodeNames.add(name)
+print(f"Found {len(nodeNames)}")
nodeNames.difference_update(namesToSkip)
-# Find page id for each node name
-print("Getting node page-ids")
+
+print("Associating nodes with page IDs")
nodeToPageId = {}
iterNum = 0
for name in nodeNames:
@@ -63,34 +69,34 @@ for name in nodeNames:
nodeToPageId[name] = row[0]
else:
print("WARNING: Picked title {title} not found", file=sys.stderr)
-# Resolve redirects
+
print("Resolving redirects")
redirectingNames = set()
iterNum = 0
for (name, pageId) in nodeToPageId.items():
iterNum += 1
- if iterNum % 1000 == 0:
+ if iterNum % 1e3 == 0:
print(f"At iteration {iterNum}")
#
- row = enwikiCur.execute(
- "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?",
- (pageId,)).fetchone()
+ query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?"
+ row = enwikiCur.execute(query, (pageId,)).fetchone()
if row != None:
nodeToPageId[name] = row[0]
redirectingNames.add(name)
-# Add descriptions for each node
+
print("Adding description data")
iterNum = 0
for (name, pageId) in nodeToPageId.items():
iterNum += 1
- if iterNum % 1000 == 0:
+ if iterNum % 1e3 == 0:
print(f"At iteration {iterNum}")
#
row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
if row != None:
dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0))
dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0))
-# Close dbs
+
+print("Closing databases")
dbCon.commit()
dbCon.close()
enwikiCon.close()