From 07397961bfb113bd9c03883f2b24e6d287f989ca Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Thu, 26 May 2022 13:22:36 +1000 Subject: Add some enwiki redirect data as alt-names --- backend/data/README.md | 17 ++++----- backend/data/enwiki/genData.py | 1 + backend/data/genEnwikiData.py | 72 --------------------------------------- backend/data/genEnwikiDescData.py | 71 ++++++++++++++++++++++++++++++++++++++ backend/data/genEnwikiNameData.py | 71 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+), 80 deletions(-) delete mode 100755 backend/data/genEnwikiData.py create mode 100755 backend/data/genEnwikiDescData.py create mode 100755 backend/data/genEnwikiNameData.py (limited to 'backend') diff --git a/backend/data/README.md b/backend/data/README.md index 8cfa960..d444e4f 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -21,14 +21,15 @@ File Generation Process which uses 'nodes', 'edges', 'eol_ids', and 'images', to associate nodes without images to child images. 4 Node Description Data - - Using DBpedia - 1 Obtain data in dbpedia/, as specified in it's README. - 2 Run genDbpData.py, which adds a 'descs' table to data.db, using - data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table. - - Supplementing with Wikipedia dump - 1 Obtain data in enwiki/, as specified in it's README. - 2 Run genEnwikiData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, and the 'nodes' table. + 1 Obtain data in dbpedia/, as specified in its README. + 2 Run genDbpData.py, which adds a 'descs' table to data.db, using + data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table. +5 Supplementary Name/Description Data + 1 Obtain data in enwiki/, as specified in its README. + 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in + enwiki/enwikiData.db, and the 'nodes' table. 
+ 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in + enwiki/enwikiData.db, and the 'names' and 'descs' tables. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py index 4f0d62e..646292c 100755 --- a/backend/data/enwiki/genData.py +++ b/backend/data/enwiki/genData.py @@ -91,6 +91,7 @@ dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") # Read through dump file print("Reading dump file") diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py deleted file mode 100755 index d33fd5d..0000000 --- a/backend/data/genEnwikiData.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python3 - -import sys, re -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data" -usageInfo += "from a sqlite database, and adds description data for names that\n" -usageInfo += "don't have them.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -enwikiDb = "enwiki/enwikiData.db" -dbFile = "data.db" - -# Open dbs -enwikiCon = sqlite3.connect(enwikiDb) -enwikiCur = enwikiCon.cursor() -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -# Get node names without descriptions -print("Getting node names") -nodeNames = set() -query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL" -for row in dbCur.execute(query): - nodeNames.add(row[0]) -print("Found {} names".format(len(nodeNames))) 
-# Find page id for each node name
-print("Getting node page-ids")
-nodeToPageId = {}
-iterNum = 0
-for name in nodeNames:
-    iterNum += 1
-    if iterNum % 1e4 == 0:
-        print("At iteration {}".format(iterNum))
-    #
-    row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
-    if row != None:
-        nodeToPageId[name] = row[0]
-# Resolve redirects
-print("Resolving redirects")
-redirectingNames = set()
-iterNum = 0
-for (name, pageId) in nodeToPageId.items():
-    iterNum += 1
-    if iterNum % 1000 == 0:
-        print("At iteration {}".format(iterNum))
-    #
-    row = enwikiCur.execute(
-        "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?",
-        (pageId,)).fetchone()
-    if row != None:
-        nodeToPageId[name] = row[0]
-        redirectingNames.add(name)
-# Add descriptions for each node
-print("Adding description data")
-iterNum = 0
-for (name, pageId) in nodeToPageId.items():
-    iterNum += 1
-    if iterNum % 1000 == 0:
-        print("At iteration {}".format(iterNum))
-    #
-    row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
-    if row != None:
-        dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
-            (name, row[0], 1 if name in redirectingNames else 0, pageId, 0))
-# Close dbs
-dbCon.commit()
-dbCon.close()
-enwikiCon.commit()
-enwikiCon.close()
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
new file mode 100755
index 0000000..40a6c92
--- /dev/null
+++ b/backend/data/genEnwikiDescData.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data\n"
+usageInfo += "from a sqlite database, and adds description data for names that\n"
+usageInfo += "don't have them.\n"
+if len(sys.argv) > 1:
+    print(usageInfo, file=sys.stderr)
+    sys.exit(1)
+
+enwikiDb = "enwiki/enwikiData.db"
+dbFile = "data.db"
+
+# Open dbs (the enwiki db is only queried; data.db is queried and updated)
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get node names without descriptions (the LEFT JOIN leaves desc NULL for them)
+print("Getting node names")
+nodeNames = set()
+query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL"
+for row in dbCur.execute(query):
+    nodeNames.add(row[0])
+print("Found {} names".format(len(nodeNames)))
+# Find page id for each node name (case-insensitive title match, first hit only)
+print("Getting node page-ids")
+nodeToPageId = {}
+iterNum = 0
+for name in nodeNames:
+    iterNum += 1
+    if iterNum % 10000 == 0:
+        print("At iteration {}".format(iterNum))
+    # Names with no matching page title are simply left out of nodeToPageId
+    row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
+    if row is not None:
+        nodeToPageId[name] = row[0]
+# Resolve redirects: if a found page is itself a redirect, use its target page's id
+print("Resolving redirects")
+redirectingNames = set()
+iterNum = 0
+for (name, pageId) in nodeToPageId.items():
+    iterNum += 1
+    if iterNum % 1000 == 0:
+        print("At iteration {}".format(iterNum))
+    # Only one redirect hop is followed; only existing keys are re-assigned while iterating
+    row = enwikiCur.execute(
+        "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?",
+        (pageId,)).fetchone()
+    if row is not None:
+        nodeToPageId[name] = row[0]
+        redirectingNames.add(name)
+# Add descriptions for each node whose page has an entry in the enwiki 'descs' table
+print("Adding description data")
+iterNum = 0
+for (name, pageId) in nodeToPageId.items():
+    iterNum += 1
+    if iterNum % 1000 == 0:
+        print("At iteration {}".format(iterNum))
+    # Inserted values: name, description, 1 if reached via a redirect else 0, page id, constant 0
+    row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
+    if row is not None:
+        dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
+            (name, row[0], 1 if name in redirectingNames else 0, pageId, 0))
+# Close dbs (only data.db was modified, so only it is committed)
+dbCon.commit()
+dbCon.close()
+enwikiCon.close()
diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py
new file mode 100755
index 0000000..dfed46c
--- /dev/null
+++ b/backend/data/genEnwikiNameData.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki redirect data from enwiki/, and node and wiki-id\n"
+usageInfo += "data from a sqlite database, and adds supplementary alt-name data.\n"
+if len(sys.argv) > 1:
+    print(usageInfo, file=sys.stderr)
+    sys.exit(1)
+
+enwikiDb = "enwiki/enwikiData.db"
+dbFile = "data.db"
+altNameRegex = re.compile(r"[a-zA-Z]+")
+    # Applied with fullmatch(): avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)'
+
+# Open dbs (the enwiki db is only queried; data.db is queried and updated)
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get nodes with wiki-ids (every name/wiki_id pair in the 'descs' table)
+print("Getting nodes with wiki IDs")
+nodeToWikiId = {}
+for row in dbCur.execute("SELECT name, wiki_id from descs"):
+    nodeToWikiId[row[0]] = row[1]
+print("Found {} nodes".format(len(nodeToWikiId)))
+# Find titles of pages that redirect to each node's page, as lower-cased candidate alt-names
+print("Finding redirecter names")
+nodeToAltNames = {}
+numAltNames = 0
+iterNum = 0
+for (nodeName, wikiId) in nodeToWikiId.items():
+    iterNum += 1
+    if iterNum % 10000 == 0:
+        print("At iteration {}".format(iterNum))
+    # p1 = redirecting page, r1 = its redirect entry, p2 = the redirect's target (the node's page)
+    nodeToAltNames[nodeName] = set()
+    query = "SELECT p1.title FROM pages p1" \
+        " INNER JOIN redirects r1 ON p1.id = r1.id" \
+        " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
+    for (name,) in enwikiCur.execute(query, (wikiId,)):
+        if altNameRegex.fullmatch(name) is not None:
+            nodeToAltNames[nodeName].add(name.lower())
+    numAltNames += len(nodeToAltNames[nodeName])  # Count distinct names, not matching rows
+print("Found {} alt-names".format(numAltNames))
+# Remove candidates already present as an alt_name in the 'names' table (for any node)
+print("Removing existing alt-names")
+query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
+iterNum = 0
+for altNames in nodeToAltNames.values():
+    iterNum += 1
+    if iterNum % 10000 == 0:
+        print("At iteration {}".format(iterNum))
+    # One '?' placeholder per candidate, so the names are bound as parameters
+    placeholders = ",".join(["?"] * len(altNames))
+    rows = dbCur.execute(query.format(placeholders), list(altNames))
+    existingNames = {name for (name,) in rows}
+    numAltNames -= len(existingNames)
+    altNames.difference_update(existingNames)
+print("Left with {} alt-names".format(numAltNames))
+# Add the remaining alt-names; the third column is always written as 0 here
+print("Adding alt-names")
+for (nodeName, altNames) in nodeToAltNames.items():
+    for altName in altNames:
+        dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (nodeName, altName, 0))
+# Close dbs (only data.db was modified, so only it is committed)
+dbCon.commit()
+dbCon.close()
+enwikiCon.close()
-- 
cgit v1.2.3