diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-09-11 14:55:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-09-11 15:04:14 +1000 |
| commit | 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch) | |
| tree | 2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/genDescData.py | |
| parent | daccbbd9c73a5292ea9d6746560d7009e5aa666d (diff) | |
Add backend unit tests
- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps
Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/genDescData.py')
| -rwxr-xr-x | backend/tolData/genDescData.py | 90 |
1 files changed, 0 insertions, 90 deletions
```diff
diff --git a/backend/tolData/genDescData.py b/backend/tolData/genDescData.py
deleted file mode 100755
index bb1cbc8..0000000
--- a/backend/tolData/genDescData.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/python3
-
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Maps nodes to short descriptions, using data from DBpedia and
-Wikipedia, and stores results in the database.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-args = parser.parse_args()
-
-dbpediaDb = 'dbpedia/descData.db'
-enwikiDb = 'enwiki/descData.db'
-dbFile = 'data.db'
-
-print('Creating table')
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
-
-print('Getting node mappings')
-nodeToWikiId: dict[str, int] = {}
-for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
-    nodeToWikiId[name] = wikiId
-
-print('Reading data from DBpedia')
-dbpCon = sqlite3.connect(dbpediaDb)
-dbpCur = dbpCon.cursor()
-print('Getting node IRIs')
-nodeToIri: dict[str, str] = {}
-iterNum = 0
-for name, wikiId in nodeToWikiId.items():
-    iterNum += 1
-    if iterNum % 1e5 == 0:
-        print(f'At iteration {iterNum}')
-    #
-    row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
-    if row is not None:
-        nodeToIri[name] = row[0]
-print('Resolving redirects')
-iterNum = 0
-for name, iri in nodeToIri.items():
-    iterNum += 1
-    if iterNum % 1e5 == 0:
-        print(f'At iteration {iterNum}')
-    #
-    row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
-    if row is not None:
-        nodeToIri[name] = row[0]
-print('Adding descriptions')
-iterNum = 0
-for name, iri in nodeToIri.items():
-    iterNum += 1
-    if iterNum % 1e4 == 0:
-        print(f'At iteration {iterNum}')
-    #
-    row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
-    if row is not None:
-        dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
-        del nodeToWikiId[name]
-dbpCon.close()
-
-print('Reading data from Wikipedia')
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-print('Resolving redirects')
-iterNum = 0
-for name, wikiId in nodeToWikiId.items():
-    iterNum += 1
-    if iterNum % 1e4 == 0:
-        print(f'At iteration {iterNum}')
-    #
-    query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?'
-    row = enwikiCur.execute(query, (wikiId,)).fetchone()
-    if row is not None:
-        nodeToWikiId[name] = row[0]
-print('Adding descriptions')
-iterNum = 0
-for name, wikiId in nodeToWikiId.items():
-    iterNum += 1
-    if iterNum % 1e3 == 0:
-        print(f'At iteration {iterNum}')
-    #
-    row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiId,)).fetchone()
-    if row is not None:
-        dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
-
-print('Closing databases')
-dbCon.commit()
-dbCon.close()
```
