aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genEnwikiDescData.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-05-26 13:22:36 +1000
committer Terry Truong <terry06890@gmail.com> 2022-05-26 13:22:36 +1000
commit 07397961bfb113bd9c03883f2b24e6d287f989ca (patch)
tree a5a4fc18b54689497eae85f269e9467e1a0068aa /backend/data/genEnwikiDescData.py
parent 2d67e54dc91708eaf89eca9dca27cec126f7f465 (diff)
Add some enwiki redirect data as alt-names
Diffstat (limited to 'backend/data/genEnwikiDescData.py')
-rwxr-xr-x backend/data/genEnwikiDescData.py 71
1 files changed, 71 insertions, 0 deletions
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
new file mode 100755
index 0000000..40a6c92
--- /dev/null
+++ b/backend/data/genEnwikiDescData.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data"
+usageInfo += "from a sqlite database, and adds description data for names that\n"
+usageInfo += "don't have them.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+enwikiDb = "enwiki/enwikiData.db"
+dbFile = "data.db"
+
+# Open dbs
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get node names without descriptions
+print("Getting node names")
+nodeNames = set()
+query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL"
+for row in dbCur.execute(query):
+ nodeNames.add(row[0])
+print("Found {} names".format(len(nodeNames)))
+# Find page id for each node name
+print("Getting node page-ids")
+nodeToPageId = {}
+iterNum = 0
+for name in nodeNames:
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
+ if row != None:
+ nodeToPageId[name] = row[0]
+# Resolve redirects
+print("Resolving redirects")
+redirectingNames = set()
+iterNum = 0
+for (name, pageId) in nodeToPageId.items():
+ iterNum += 1
+ if iterNum % 1000 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ row = enwikiCur.execute(
+ "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?",
+ (pageId,)).fetchone()
+ if row != None:
+ nodeToPageId[name] = row[0]
+ redirectingNames.add(name)
+# Add descriptions for each node
+print("Adding description data")
+iterNum = 0
+for (name, pageId) in nodeToPageId.items():
+ iterNum += 1
+ if iterNum % 1000 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
+ if row != None:
+ dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
+ (name, row[0], 1 if name in redirectingNames else 0, pageId, 0))
+# Close dbs
+dbCon.commit()
+dbCon.close()
+enwikiCon.close()