diff options
Diffstat (limited to 'backend/tol_data/gen_desc_data.py')
| -rwxr-xr-x | backend/tol_data/gen_desc_data.py | 23 |
1 files changed, 14 insertions, 9 deletions
diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py index fa08a8c..69efe79 100755 --- a/backend/tol_data/gen_desc_data.py +++ b/backend/tol_data/gen_desc_data.py @@ -5,7 +5,9 @@ Maps nodes to short descriptions, using data from DBpedia and Wikipedia, and stores results in the database. """ -import os, sqlite3 +import argparse +import os +import sqlite3 DBPEDIA_DB = os.path.join('dbpedia', 'desc_data.db') ENWIKI_DB = os.path.join('enwiki', 'desc_data.db') @@ -16,12 +18,12 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)') - # + print('Getting node mappings') nodeToWikiId: dict[str, int] = {} for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'): nodeToWikiId[name] = wikiId - # + print('Reading data from DBpedia') dbpCon = sqlite3.connect(dbpediaDb) dbpCur = dbpCon.cursor() @@ -32,20 +34,22 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: iterNum += 1 if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') - # + row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone() if row is not None: nodeToIri[name] = row[0] + print('Resolving redirects') iterNum = 0 for name, iri in nodeToIri.items(): iterNum += 1 if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') - # + row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone() if row is not None: nodeToIri[name] = row[0] + print('Adding descriptions') iterNum = 0 for name, iri in nodeToIri.items(): @@ -57,11 +61,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: if row is not None: dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1)) del nodeToWikiId[name] + dbpCon.close() - # + print('Reading data from Wikipedia') enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() + print('Adding descriptions') iterNum = 0 for name, wikiId in nodeToWikiId.items(): @@ -79,14 +85,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone() if row is not None: dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0)) - # + print('Closing databases') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(DBPEDIA_DB, ENWIKI_DB, DB_FILE) |
