about summary refs log tree commit diff
path: root/backend/tolData/genDescData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/genDescData.py')
-rwxr-xr-x backend/tolData/genDescData.py 90
1 files changed, 0 insertions, 90 deletions
diff --git a/backend/tolData/genDescData.py b/backend/tolData/genDescData.py
deleted file mode 100755
index bb1cbc8..0000000
--- a/backend/tolData/genDescData.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/python3
-# Builds the 'descs' table in data.db: descriptions come from dbpedia/descData.db first, then from enwiki/descData.db for nodes still left in nodeToWikiId.
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Maps nodes to short descriptions, using data from DBpedia and
-Wikipedia, and stores results in the database.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-args = parser.parse_args()
-# No options are defined above; 'args' is not read again in this script.
-dbpediaDb = 'dbpedia/descData.db'
-enwikiDb = 'enwiki/descData.db'
-dbFile = 'data.db'
-# NOTE(review): CREATE TABLE below has no IF NOT EXISTS, so it raises if 'descs' already exists in data.db.
-print('Creating table')
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
-# 'wiki_ids' is read through dbCur, i.e. from data.db itself; the result maps node name -> wiki ID.
-print('Getting node mappings')
-nodeToWikiId: dict[str, int] = {}
-for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
- nodeToWikiId[name] = wikiId
-# Phase 1 (DBpedia): find each node's IRI by wiki ID, follow one redirect, then copy the abstract for that IRI.
-print('Reading data from DBpedia')
-dbpCon = sqlite3.connect(dbpediaDb)
-dbpCur = dbpCon.cursor()
-print('Getting node IRIs')
-nodeToIri: dict[str, str] = {}
-iterNum = 0
-for name, wikiId in nodeToWikiId.items():
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f'At iteration {iterNum}')
- # Nodes whose wiki ID has no row in 'ids' get no entry in nodeToIri.
- row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
- if row is not None:
- nodeToIri[name] = row[0]
-print('Resolving redirects')
-iterNum = 0
-for name, iri in nodeToIri.items():
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f'At iteration {iterNum}')
- # Only values of existing keys are reassigned while iterating; each IRI is looked up in 'redirects' once (a single level).
- row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
- if row is not None:
- nodeToIri[name] = row[0]
-print('Adding descriptions')
-iterNum = 0
-for name, iri in nodeToIri.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f'At iteration {iterNum}')
- # Rows are stored with from_dbp = 1. NOTE(review): indentation is collapsed in this view; the 'del' is presumably inside the 'if' -- confirm.
- row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
- if row is not None:
- dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
- del nodeToWikiId[name]
-dbpCon.close()
-# Phase 2 (Wikipedia): runs over whatever entries remain in nodeToWikiId after the deletions above.
-print('Reading data from Wikipedia')
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-print('Resolving redirects')
-iterNum = 0
-for name, wikiId in nodeToWikiId.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f'At iteration {iterNum}')
- # If wikiId has a row in 'redirects', replace it with the ID of the page whose title equals the redirect target.
- query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?'
- row = enwikiCur.execute(query, (wikiId,)).fetchone()
- if row is not None:
- nodeToWikiId[name] = row[0]
-print('Adding descriptions')
-iterNum = 0
-for name, wikiId in nodeToWikiId.items():
- iterNum += 1
- if iterNum % 1e3 == 0:
- print(f'At iteration {iterNum}')
- # Rows are stored with from_dbp = 0; INSERT OR IGNORE skips a wiki_id that is already present (wiki_id is the primary key).
- row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiId,)).fetchone()
- if row is not None:
- dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
-# NOTE(review): enwikiCon is never closed; only dbCon is committed and closed below (dbpCon was closed after phase 1).
-print('Closing databases')
-dbCon.commit()
-dbCon.close()