diff options
Diffstat (limited to 'backend/tol_data/dbpedia')
| -rwxr-xr-x | backend/tol_data/dbpedia/gen_desc_data.py | 21 |
1 files changed, 11 insertions, 10 deletions
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py index 50418e0..f8a665a 100755 --- a/backend/tol_data/dbpedia/gen_desc_data.py +++ b/backend/tol_data/dbpedia/gen_desc_data.py @@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database # In testing, this script took a few hours to run, and generated about 10GB +import argparse import re -import bz2, sqlite3 +import bz2 +import sqlite3 LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries IDS_FILE = 'page_lang=en_ids.ttl.bz2' @@ -24,7 +26,7 @@ def genData( print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Reading/storing label data') dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)') dbCur.execute('CREATE INDEX labels_idx ON labels(label)') @@ -38,7 +40,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2))) - # + print('Reading/storing wiki page ids') dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)') dbCur.execute('CREATE INDEX ids_idx ON ids(id)') @@ -55,7 +57,7 @@ def genData( except sqlite3.IntegrityError as e: # Accounts for certain lines that have the same IRI print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}') - # + print('Reading/storing redirection data') dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)') redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') @@ -67,7 +69,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2))) - # + print('Reading/storing diambiguation-page data') dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)') disambigLineRegex = redirLineRegex @@ -79,7 +81,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),)) - # + print('Reading/storing instance-type data') dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)') dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)') @@ -92,7 +94,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2))) - # + print('Reading/storing abstracts') dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)') descLineRegex = labelLineRegex @@ -107,14 +109,13 @@ def genData( raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO abstracts VALUES (?, ?)', (match.group(1), match.group(2).replace(r'\"', '"'))) - # + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE) |
