aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/gen_desc_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tol_data/gen_desc_data.py')
-rwxr-xr-xbackend/tol_data/gen_desc_data.py92
1 files changed, 92 insertions, 0 deletions
diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py
new file mode 100755
index 0000000..fa08a8c
--- /dev/null
+++ b/backend/tol_data/gen_desc_data.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python3
+
+"""
+Maps nodes to short descriptions, using data from DBpedia and
+Wikipedia, and stores results in the database.
+"""
+
+import os, sqlite3
+
+DBPEDIA_DB = os.path.join('dbpedia', 'desc_data.db')
+ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
+DB_FILE = 'data.db'
+
+def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
+ print('Creating table')
+ dbCon = sqlite3.connect(dbFile)
+ dbCur = dbCon.cursor()
+ dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
+ #
+ print('Getting node mappings')
+ nodeToWikiId: dict[str, int] = {}
+ for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
+ nodeToWikiId[name] = wikiId
+ #
+ print('Reading data from DBpedia')
+ dbpCon = sqlite3.connect(dbpediaDb)
+ dbpCur = dbpCon.cursor()
+ print('Getting node IRIs')
+ nodeToIri: dict[str, str] = {}
+ iterNum = 0
+ for name, wikiId in nodeToWikiId.items():
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
+ if row is not None:
+ nodeToIri[name] = row[0]
+ print('Resolving redirects')
+ iterNum = 0
+ for name, iri in nodeToIri.items():
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
+ if row is not None:
+ nodeToIri[name] = row[0]
+ print('Adding descriptions')
+ iterNum = 0
+ for name, iri in nodeToIri.items():
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print(f'At iteration {iterNum}')
+ #
+ row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
+ if row is not None:
+ dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
+ del nodeToWikiId[name]
+ dbpCon.close()
+ #
+ print('Reading data from Wikipedia')
+ enwikiCon = sqlite3.connect(enwikiDb)
+ enwikiCur = enwikiCon.cursor()
+ print('Adding descriptions')
+ iterNum = 0
+ for name, wikiId in nodeToWikiId.items():
+ iterNum += 1
+ if iterNum % 1e3 == 0:
+ print(f'At iteration {iterNum}')
+ # Check for redirect
+ wikiIdToGet = wikiId
+ query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title' \
+ ' WHERE redirects.id = ?'
+ row = enwikiCur.execute(query, (wikiId,)).fetchone()
+ if row is not None:
+ wikiIdToGet = row[0]
+ #
+ row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone()
+ if row is not None:
+ dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
+ #
+ print('Closing databases')
+ dbCon.commit()
+ dbCon.close()
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+ args = parser.parse_args()
+ #
+ genData(DBPEDIA_DB, ENWIKI_DB, DB_FILE)