about summary refs log tree commit diff
path: root/backend/tol_data/dbpedia/gen_desc_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tol_data/dbpedia/gen_desc_data.py')
-rwxr-xr-x backend/tol_data/dbpedia/gen_desc_data.py 21
1 files changed, 11 insertions, 10 deletions
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py
index 50418e0..f8a665a 100755
--- a/backend/tol_data/dbpedia/gen_desc_data.py
+++ b/backend/tol_data/dbpedia/gen_desc_data.py
@@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database
# In testing, this script took a few hours to run, and generated about 10GB
+import argparse
import re
-import bz2, sqlite3
+import bz2
+import sqlite3
LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries
IDS_FILE = 'page_lang=en_ids.ttl.bz2'
@@ -24,7 +26,7 @@ def genData(
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Reading/storing label data')
dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)')
dbCur.execute('CREATE INDEX labels_idx ON labels(label)')
@@ -38,7 +40,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing wiki page ids')
dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX ids_idx ON ids(id)')
@@ -55,7 +57,7 @@ def genData(
except sqlite3.IntegrityError as e:
# Accounts for certain lines that have the same IRI
print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}')
- #
+
print('Reading/storing redirection data')
dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)')
redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')
@@ -67,7 +69,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing diambiguation-page data')
dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)')
disambigLineRegex = redirLineRegex
@@ -79,7 +81,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),))
- #
+
print('Reading/storing instance-type data')
dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)')
dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)')
@@ -92,7 +94,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing abstracts')
dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)')
descLineRegex = labelLineRegex
@@ -107,14 +109,13 @@ def genData(
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO abstracts VALUES (?, ?)',
(match.group(1), match.group(2).replace(r'\"', '"')))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE)