diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
| commit | 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch) | |
| tree | ffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data/dbpedia | |
| parent | f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff) | |
Adjust backend coding style
Add line spacing, section comments, and import consistency
Diffstat (limited to 'backend/tol_data/dbpedia')
| -rwxr-xr-x | backend/tol_data/dbpedia/gen_desc_data.py | 21 |
1 file changed, 11 insertions, 10 deletions
```diff
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py
index 50418e0..f8a665a 100755
--- a/backend/tol_data/dbpedia/gen_desc_data.py
+++ b/backend/tol_data/dbpedia/gen_desc_data.py
@@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database
 
 # In testing, this script took a few hours to run, and generated about 10GB
 
+import argparse
 import re
-import bz2, sqlite3
+import bz2
+import sqlite3
 
 LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries
 IDS_FILE = 'page_lang=en_ids.ttl.bz2'
@@ -24,7 +26,7 @@ def genData(
     print('Creating database')
     dbCon = sqlite3.connect(dbFile)
     dbCur = dbCon.cursor()
-    #
+
     print('Reading/storing label data')
     dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)')
     dbCur.execute('CREATE INDEX labels_idx ON labels(label)')
@@ -38,7 +40,7 @@ def genData(
             if match is None:
                 raise Exception(f'ERROR: Line {lineNum} has unexpected format')
             dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2)))
-    #
+
     print('Reading/storing wiki page ids')
     dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)')
     dbCur.execute('CREATE INDEX ids_idx ON ids(id)')
@@ -55,7 +57,7 @@ def genData(
             except sqlite3.IntegrityError as e:
                 # Accounts for certain lines that have the same IRI
                 print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}')
-    #
+
     print('Reading/storing redirection data')
     dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)')
     redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')
@@ -67,7 +69,7 @@ def genData(
             if match is None:
                 raise Exception(f'ERROR: Line {lineNum} has unexpected format')
             dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2)))
-    #
+
     print('Reading/storing diambiguation-page data')
     dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)')
     disambigLineRegex = redirLineRegex
@@ -79,7 +81,7 @@ def genData(
             if match is None:
                 raise Exception(f'ERROR: Line {lineNum} has unexpected format')
             dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),))
-    #
+
     print('Reading/storing instance-type data')
     dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)')
     dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)')
@@ -92,7 +94,7 @@ def genData(
             if match is None:
                 raise Exception(f'ERROR: Line {lineNum} has unexpected format')
             dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2)))
-    #
+
     print('Reading/storing abstracts')
     dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)')
     descLineRegex = labelLineRegex
@@ -107,14 +109,13 @@ def genData(
                 raise Exception(f'ERROR: Line {lineNum} has unexpected format')
             dbCur.execute('INSERT INTO abstracts VALUES (?, ?)',
                 (match.group(1), match.group(2).replace(r'\"', '"')))
-    #
+
     print('Closing database')
     dbCon.commit()
     dbCon.close()
 
 if __name__ == '__main__':
-    import argparse
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.parse_args()
-    #
+
     genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE)
```
