about summary refs log tree commit diff
path: root/backend/tolData/genDescData.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tolData/genDescData.py
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tolData/genDescData.py')
-rwxr-xr-xbackend/tolData/genDescData.py90
1 files changed, 0 insertions, 90 deletions
diff --git a/backend/tolData/genDescData.py b/backend/tolData/genDescData.py
deleted file mode 100755
index bb1cbc8..0000000
--- a/backend/tolData/genDescData.py
+++ /dev/null
@@ -1,90 +0,0 @@
#!/usr/bin/python3
"""
Maps nodes to short descriptions, using data from DBpedia and
Wikipedia, and stores results in the database.
"""

import argparse
import sqlite3

# Input databases holding candidate description text.
DBPEDIA_DB = 'dbpedia/descData.db'
ENWIKI_DB = 'enwiki/descData.db'
# Output database; must already contain the wiki_ids table.
DB_FILE = 'data.db'


def _print_progress(iter_num: int, interval: int) -> None:
	"""Print a progress line every `interval` iterations."""
	if iter_num % interval == 0:
		print(f'At iteration {iter_num}')


def get_node_mappings(db_cur: sqlite3.Cursor) -> dict[str, int]:
	"""Read the wiki_ids table and return a name -> wiki-id dict."""
	return dict(db_cur.execute('SELECT name, id from wiki_ids'))


def add_dbpedia_descs(dbp_cur: sqlite3.Cursor, db_cur: sqlite3.Cursor,
		node_to_wiki_id: dict[str, int]) -> None:
	"""Insert DBpedia abstracts into the output descs table.

	Resolves each node's wiki-id to a DBpedia IRI, follows one level of
	redirect, then looks up the abstract. Nodes that receive a description
	are REMOVED from node_to_wiki_id, so the caller can pass the remainder
	to the Wikipedia fallback.
	"""
	print('Getting node IRIs')
	node_to_iri: dict[str, str] = {}
	for iter_num, (name, wiki_id) in enumerate(node_to_wiki_id.items(), 1):
		_print_progress(iter_num, 100_000)
		row = dbp_cur.execute('SELECT iri FROM ids where id = ?', (wiki_id,)).fetchone()
		if row is not None:
			node_to_iri[name] = row[0]

	print('Resolving redirects')
	# NOTE(review): only one level of redirect is followed; chained
	# redirects stay unresolved — confirm upstream data has no chains.
	for iter_num, (name, iri) in enumerate(node_to_iri.items(), 1):
		_print_progress(iter_num, 100_000)
		row = dbp_cur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
		if row is not None:
			node_to_iri[name] = row[0]

	print('Adding descriptions')
	for iter_num, (name, iri) in enumerate(node_to_iri.items(), 1):
		_print_progress(iter_num, 10_000)
		row = dbp_cur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
		if row is not None:
			db_cur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)',
				(node_to_wiki_id[name], row[0], 1))
			# Mark this node as satisfied so the Wikipedia pass skips it.
			# (Safe: we iterate node_to_iri, not node_to_wiki_id.)
			del node_to_wiki_id[name]


def add_enwiki_descs(enwiki_cur: sqlite3.Cursor, db_cur: sqlite3.Cursor,
		node_to_wiki_id: dict[str, int]) -> None:
	"""Insert Wikipedia descriptions for nodes without a DBpedia abstract.

	First rewrites each node's wiki-id through the enwiki redirects table
	(in place, in node_to_wiki_id), then copies any matching description
	into the output descs table with from_dbp = 0.
	"""
	print('Resolving redirects')
	for iter_num, (name, wiki_id) in enumerate(node_to_wiki_id.items(), 1):
		_print_progress(iter_num, 10_000)
		query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?'
		row = enwiki_cur.execute(query, (wiki_id,)).fetchone()
		if row is not None:
			# Value-only update of an existing key: legal mid-iteration.
			node_to_wiki_id[name] = row[0]

	print('Adding descriptions')
	for iter_num, (name, wiki_id) in enumerate(node_to_wiki_id.items(), 1):
		_print_progress(iter_num, 1_000)
		row = enwiki_cur.execute('SELECT desc FROM descs where id = ?', (wiki_id,)).fetchone()
		if row is not None:
			db_cur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wiki_id, row[0], 0))


def main() -> None:
	"""Create the descs table and fill it from DBpedia, then Wikipedia."""
	parser = argparse.ArgumentParser(description="""
Maps nodes to short descriptions, using data from DBpedia and
Wikipedia, and stores results in the database.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()

	print('Creating table')
	db_con = sqlite3.connect(DB_FILE)
	db_cur = db_con.cursor()
	db_cur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')

	print('Getting node mappings')
	node_to_wiki_id = get_node_mappings(db_cur)

	print('Reading data from DBpedia')
	dbp_con = sqlite3.connect(DBPEDIA_DB)
	add_dbpedia_descs(dbp_con.cursor(), db_cur, node_to_wiki_id)
	dbp_con.close()

	print('Reading data from Wikipedia')
	enwiki_con = sqlite3.connect(ENWIKI_DB)
	add_enwiki_descs(enwiki_con.cursor(), db_cur, node_to_wiki_id)
	enwiki_con.close()  # fix: original leaked this connection

	print('Closing databases')
	db_con.commit()
	db_con.close()


if __name__ == '__main__':
	main()