diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
| commit | e8e58a3bb9dc233dacf573973457c5b48d369503 (patch) | |
| tree | 242500ca304c5afbb7e6506e61da4c4dfff0b175 /backend/tolData/genDescData.py | |
| parent | 930c12d33e1093f874a4beb4d6376621e464e8c0 (diff) | |
Add scripts for generating eol/enwiki mappings
- New data sources: OTOL taxonomy, EOL provider-ids, Wikidata dump
- Add 'node_iucn' table
- Remove 'redirected' field from 'wiki_ids' table
- Make 'eol_ids' table have 'name' as the primary key
- Combine name-generation scripts into genNameData.py
- Combine description-generation scripts into genDescData.py
Diffstat (limited to 'backend/tolData/genDescData.py')
| -rwxr-xr-x | backend/tolData/genDescData.py | 91 |
1 file changed, 91 insertions, 0 deletions
#!/usr/bin/python3

import sys, os, re
import sqlite3

import argparse
parser = argparse.ArgumentParser(description='''
Maps nodes to short descriptions, using data from DBpedia and
Wikipedia, and stores results in the database.
''', formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()

# Input databases (produced by earlier pipeline stages) and the output database
dbpediaDb = 'dbpedia/descData.db'
enwikiDb = 'enwiki/descData.db'
dbFile = 'data.db'

print('Creating table')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
# from_dbp records the source of the description: 1 = DBpedia, 0 = enwiki
dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')

print('Getting node mappings')
nodeToWikiId = {}  # Maps node name to Wikipedia page id (from the 'wiki_ids' table)
for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
    nodeToWikiId[name] = wikiId

print('Reading data from DBpedia')
dbpCon = sqlite3.connect(dbpediaDb)
dbpCur = dbpCon.cursor()
print('Getting node IRIs')
nodeToIri = {}  # Maps node name to DBpedia IRI
iterNum = 0
for name, wikiId in nodeToWikiId.items():
    iterNum += 1
    if iterNum % 100_000 == 0:  # progress indicator (was float '1e5'; int modulus is exact)
        print(f'At iteration {iterNum}')
    #
    row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
    if row is not None:
        nodeToIri[name] = row[0]
print('Resolving redirects')
iterNum = 0
for name, iri in nodeToIri.items():
    iterNum += 1
    if iterNum % 100_000 == 0:
        print(f'At iteration {iterNum}')
    #
    # NOTE(review): only one level of redirection is followed — assumes no redirect chains
    row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
    if row is not None:
        nodeToIri[name] = row[0]  # value-only updates during iteration are safe
print('Adding descriptions')
iterNum = 0
for name, iri in nodeToIri.items():
    iterNum += 1
    if iterNum % 10_000 == 0:
        print(f'At iteration {iterNum}')
    #
    row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
    if row is not None:
        dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
        # Remove handled nodes so the enwiki pass below only covers the remainder
        # (deleting from nodeToWikiId while iterating nodeToIri is safe — different dicts)
        del nodeToWikiId[name]
dbpCon.close()

print('Reading data from Wikipedia')
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
print('Resolving redirects')
iterNum = 0
for name, wikiId in nodeToWikiId.items():
    iterNum += 1
    if iterNum % 10_000 == 0:
        print(f'At iteration {iterNum}')
    #
    query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?'
    row = enwikiCur.execute(query, (wikiId,)).fetchone()
    if row is not None:
        nodeToWikiId[name] = row[0]
print('Adding descriptions')
iterNum = 0
for name, wikiId in nodeToWikiId.items():
    iterNum += 1
    if iterNum % 1_000 == 0:
        print(f'At iteration {iterNum}')
    #
    row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiId,)).fetchone()
    if row is not None:
        dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
enwikiCon.close()  # Fix: this connection was previously never closed

print('Closing databases')
dbCon.commit()
dbCon.close()
