aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data/dbpedia
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-09-11 14:55:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-09-11 15:04:14 +1000
commit5de5fb93e50fe9006221b30ac4a66f1be0db82e7 (patch)
tree2567c25c902dbb40d44419805cebb38171df47fa /backend/tol_data/dbpedia
parentdaccbbd9c73a5292ea9d6746560d7009e5aa666d (diff)
Add backend unit tests
- Add unit testing code in backend/tests/
- Change to snake-case for script/file/directory names
- Use os.path.join() instead of '/'
- Refactor script code into function defs and a main-guard
- Make global vars all-caps

Some fixes:
- For getting descriptions, some wiki redirects weren't properly resolved
- Linked images were sub-optimally propagated
- Generation of reduced trees assumed a wiki-id association implied a description
- Tilo.py had potential null dereferences by not always using a reduced node set
- EOL image downloading didn't properly wait for all threads to end when finishing
Diffstat (limited to 'backend/tol_data/dbpedia')
-rw-r--r--backend/tol_data/dbpedia/README.md29
-rw-r--r--backend/tol_data/dbpedia/__init__.py0
-rwxr-xr-xbackend/tol_data/dbpedia/gen_desc_data.py120
3 files changed, 149 insertions, 0 deletions
diff --git a/backend/tol_data/dbpedia/README.md b/backend/tol_data/dbpedia/README.md
new file mode 100644
index 0000000..a708122
--- /dev/null
+++ b/backend/tol_data/dbpedia/README.md
@@ -0,0 +1,29 @@
+This directory holds files obtained/derived from [DBpedia](https://www.dbpedia.org).
+
+# Downloaded Files
+- `labels_lang=en.ttl.bz2` <br>
+ Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core.
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/labels/2022.03.01/labels_lang=en.ttl.bz2>.
+- `page_lang=en_ids.ttl.bz2` <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/page/2022.03.01/page_lang=en_ids.ttl.bz2>.
+- `redirects_lang=en_transitive.ttl.bz2` <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/redirects/2022.03.01/redirects_lang=en_transitive.ttl.bz2>.
+- `disambiguations_lang=en.ttl.bz2` <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/disambiguations/2022.03.01/disambiguations_lang=en.ttl.bz2>.
+- `instance-types_lang=en_specific.ttl.bz2` <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/mappings/instance-types/2022.03.01/instance-types_lang=en_specific.ttl.bz2>.
+- `short-abstracts_lang=en.ttl.bz2` <br>
+ Downloaded from <https://databus.dbpedia.org/vehnem/text/short-abstracts/2021.05.01/short-abstracts_lang=en.ttl.bz2>.
+
+# Other Files
+- `gen_desc_data.py` <br>
+ Used to generate a database representing data from the ttl files.
+- `desc_data.db` <br>
+ Generated by `gen_desc_data.py`. <br>
+ Tables: <br>
+ - `labels`: `iri TEXT PRIMARY KEY, label TEXT `
+ - `ids`: `iri TEXT PRIMARY KEY, id INT`
+ - `redirects`: `iri TEXT PRIMARY KEY, target TEXT`
+ - `disambiguations`: `iri TEXT PRIMARY KEY`
+ - `types`: `iri TEXT, type TEXT`
+ - `abstracts`: `iri TEXT PRIMARY KEY, abstract TEXT`
diff --git a/backend/tol_data/dbpedia/__init__.py b/backend/tol_data/dbpedia/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/backend/tol_data/dbpedia/__init__.py
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py
new file mode 100755
index 0000000..50418e0
--- /dev/null
+++ b/backend/tol_data/dbpedia/gen_desc_data.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python3
+
+"""
+Adds DBpedia labels/types/abstracts/etc data into a database
+"""
+
+# In testing, this script took a few hours to run, and generated about 10GB
+
+import re
+import bz2, sqlite3
+
# Default input files (bz2-compressed .ttl dumps), one per table that gets populated
# Labels dump; had about 16e6 entries
LABELS_FILE = 'labels_lang=en.ttl.bz2'
# Wiki page ids dump
IDS_FILE = 'page_lang=en_ids.ttl.bz2'
# Redirects dump
REDIRECTS_FILE = 'redirects_lang=en_transitive.ttl.bz2'
# Disambiguation pages dump
DISAMBIG_FILE = 'disambiguations_lang=en.ttl.bz2'
# Instance types dump
TYPES_FILE = 'instance-types_lang=en_specific.ttl.bz2'
# Short abstracts dump
ABSTRACTS_FILE = 'short-abstracts_lang=en.ttl.bz2'
# Default output sqlite database
DB_FILE = 'desc_data.db'
+
_PROGRESS_INTERVAL = 100_000  # Print a progress message every this many input lines


def _processLines(
        path: str, lineRegex: 're.Pattern[str]', handleMatch: 'Callable[[re.Match[str]], object]',
        skipComments: bool = False) -> None:
    """
    Reads a bz2-compressed text file line by line, and passes each line's regex match to a handler.

    path: The file to read (decoded as UTF-8)
    lineRegex: Pattern that each line, including its trailing newline, must fully match
    handleMatch: Called with the match object of each accepted line, in file order
    skipComments: If true, lines starting with '#' are skipped instead of being matched

    Raises an Exception naming the 1-based line number if a line does not match.
    """
    # The file is closed by the 'with' even if handleMatch raises
    with bz2.open(path, mode='rt', encoding='utf-8') as file:
        for lineNum, line in enumerate(file, 1):
            if lineNum % _PROGRESS_INTERVAL == 0:
                print(f'At line {lineNum}')
            if skipComments and line.startswith('#'):
                continue
            match = lineRegex.fullmatch(line)
            if match is None:
                raise Exception(f'ERROR: Line {lineNum} has unexpected format')
            handleMatch(match)


def genData(
        labelsFile: str, idsFile: str, redirectsFile: str, disambigFile: str,
        typesFile: str, abstractsFile: str, dbFile: str) -> None:
    """
    Reads the files and writes to db.

    Each input is a bz2-compressed file with one triple per line. The tables
    labels, ids, redirects, disambiguations, types, and abstracts are created
    in dbFile and filled in that order, then everything is committed at the end.

    Raises an Exception if an input line has an unexpected format. Database errors
    propagate, except that a repeated IRI in idsFile only produces a warning.
    In every case the database connection is closed before returning; on failure
    it is closed without committing the pending inserts.
    """
    # Line formats: <iri> <predicate> "text"@en . / <iri> <predicate> "digits"... / <iri> <predicate> <iri> .
    labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n')
    idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n')
    iriPairLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')

    print('Creating database')
    dbCon = sqlite3.connect(dbFile)
    try:
        dbCur = dbCon.cursor()
        #
        print('Reading/storing label data')
        dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)')
        dbCur.execute('CREATE INDEX labels_idx ON labels(label)')
        dbCur.execute('CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)')
        _processLines(labelsFile, labelLineRegex,
            lambda match: dbCur.execute('INSERT INTO labels VALUES (?, ?)', match.group(1, 2)))
        #
        print('Reading/storing wiki page ids')
        dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)')
        dbCur.execute('CREATE INDEX ids_idx ON ids(id)')

        def addId(match: 're.Match[str]') -> None:
            iri = match.group(1)
            try:
                dbCur.execute('INSERT INTO ids VALUES (?, ?)', (iri, int(match.group(2))))
            except sqlite3.IntegrityError as e:
                # Accounts for certain lines that have the same IRI
                print(f'WARNING: Failed to add entry with IRI "{iri}": {e}')

        _processLines(idsFile, idLineRegex, addId)
        #
        print('Reading/storing redirection data')
        dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)')
        _processLines(redirectsFile, iriPairLineRegex,
            lambda match: dbCur.execute('INSERT INTO redirects VALUES (?, ?)', match.group(1, 2)))
        #
        print('Reading/storing disambiguation-page data')
        dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)')
        # Only the subject IRI is stored, and repeats of it are ignored
        _processLines(disambigFile, iriPairLineRegex,
            lambda match: dbCur.execute(
                'INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),)))
        #
        print('Reading/storing instance-type data')
        dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)')
        dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)')
        _processLines(typesFile, iriPairLineRegex,
            lambda match: dbCur.execute('INSERT INTO types VALUES (?, ?)', match.group(1, 2)))
        #
        print('Reading/storing abstracts')
        dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)')
        # Lines starting with '#' are skipped for this file, and escaped quotes are unescaped
        _processLines(abstractsFile, labelLineRegex,
            lambda match: dbCur.execute(
                'INSERT INTO abstracts VALUES (?, ?)',
                (match.group(1), match.group(2).replace(r'\"', '"'))),
            skipComments=True)
        #
        print('Closing database')
        dbCon.commit()
    finally:
        dbCon.close()
+
if __name__ == '__main__':
    # No options are defined; the parser is still run on the command line arguments
    import argparse
    argParser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    argParser.parse_args()
    # Run with the module-level default input/output file names
    genData(
        LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE,
        TYPES_FILE, ABSTRACTS_FILE, DB_FILE)