From 52560266b585e63742a81e27a3b6f1ef194470c6 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 18 May 2022 21:09:05 +1000 Subject: Add wikipedia-link to tile-info display Add 'wiki_id' and 'from_dbp' columns to 'descs' table, adjust dbpedia data to include wikipedia IDs, adjust data generations scripts, make server send extra data, and make TileInfo display it. --- backend/data/README.md | 2 +- backend/data/dbpedia/README.md | 3 +++ backend/data/dbpedia/genData.py | 25 ++++++++++++++++++++++++- backend/data/genDbpData.py | 8 +++++--- backend/data/genEnwikiData.py | 3 ++- backend/server.py | 8 +++++--- 6 files changed, 40 insertions(+), 9 deletions(-) (limited to 'backend') diff --git a/backend/data/README.md b/backend/data/README.md index 576c70e..3df4268 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -36,7 +36,7 @@ data.db Tables - names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) - eol\_ids: id INT PRIMARY KEY, name TEXT - images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT -- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT +- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from_dbp INT - r\_nodes: name TEXT PRIMARY KEY, tips INT - r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) diff --git a/backend/data/dbpedia/README.md b/backend/data/dbpedia/README.md index 0e7c266..78e2a90 100644 --- a/backend/data/dbpedia/README.md +++ b/backend/data/dbpedia/README.md @@ -3,6 +3,8 @@ Downloaded Files - labels\_lang=en.ttl.bz2
Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core, using the link . +- page\_lang=en\_ids.ttl.bz2
+ Downloaded from - redirects\_lang=en\_transitive.ttl.bz2
Downloaded from . - disambiguations\_lang=en.ttl.bz2
@@ -19,6 +21,7 @@ Generated Files Generated by running genData.py. Tables - labels: iri TEXT PRIMARY KEY, label TEXT + - ids: iri TEXT PRIMARY KEY, id INT - redirects: iri TEXT PRIMARY KEY, target TEXT - disambiguations: iri TEXT PRIMARY KEY - types: iri TEXT, type TEXT diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py index e147641..3df1961 100755 --- a/backend/data/dbpedia/genData.py +++ b/backend/data/dbpedia/genData.py @@ -4,13 +4,14 @@ import sys, re import bz2, sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads DBpedia labels+types+redirects+abstracts data,\n" +usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n" usageInfo += "and creates a sqlite db containing that data.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines +idsFile = "page_lang=en_ids.ttl.bz2" redirectsFile = "redirects_lang=en_transitive.ttl.bz2" disambigFile = "disambiguations_lang=en.ttl.bz2" typesFile = "instance-types_lang=en_specific.ttl.bz2" @@ -39,6 +40,28 @@ with bz2.open(labelsFile, mode='rt') as file: else: dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) dbCon.commit() +# Read/store wiki page ids +print("Reading/storing wiki page ids") +dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") +idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') +lineNum = 0 +with bz2.open(idsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("Processing line {}".format(lineNum)) + # + match = idLineRegex.fullmatch(line) + if match == None: + print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr) + sys.exit(1) + else: + try: + dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) + except sqlite3.IntegrityError as e: + # Accounts for certain lines that have the same IRI + print("Failed to add entry with IRI \"{}\": {}".format(match.group(1), e)) +dbCon.commit() # Read/store redirects print("Reading/storing redirection data") dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)") diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py index 0ba1ef1..c8394ea 100755 --- a/backend/data/genDbpData.py +++ b/backend/data/genDbpData.py @@ -211,16 +211,18 @@ for (name, iri) in nodeToIri.items(): redirectingIriSet.add(name) # Find descriptions, and add to db print("Adding node description data") -dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)") +dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)") iterNum = 0 for (name, iri) in nodeToIri.items(): iterNum += 1 if iterNum % 1e4 == 0: print("At iteration {}".format(iterNum)) # - row = dbpCur.execute("SELECT abstract FROM abstracts where iri = ?", (iri,)).fetchone() + query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?" + row = dbpCur.execute(query, (iri,)).fetchone() if row != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingIriSet else 0)) + dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)", + (name, row[0], 1 if name in redirectingIriSet else 0, row[1], 1)) # Close dbs dbCon.commit() dbCon.close() diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py index 879ecf6..d33fd5d 100755 --- a/backend/data/genEnwikiData.py +++ b/backend/data/genEnwikiData.py @@ -63,7 +63,8 @@ for (name, pageId) in nodeToPageId.items(): # row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() if row != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0)) + dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)", + (name, row[0], 1 if name in redirectingNames else 0, pageId, 0)) # Close dbs dbCon.commit() dbCon.close() diff --git a/backend/server.py b/backend/server.py index 54e4803..a64a145 100755 --- a/backend/server.py +++ b/backend/server.py @@ -132,8 +132,10 @@ def lookupNodeInfo(name, useReducedTree): temp = lookupNodes([name], useReducedTree) nodeObj = temp[name] if name in temp else None # Get node desc - row = cur.execute("SELECT desc, redirected from descs WHERE descs.name = ?", (name,)).fetchone() - desc = {"text": row[0], "fromRedirect": row[1] == 1} if row != None else None + row = cur.execute("SELECT desc, redirected, wiki_id, from_dbp from descs WHERE descs.name = ?", (name,)).fetchone() + descObj = None + if row != None: + descObj = {"text": row[0], "fromRedirect": row[1] == 1, "wikiId": row[2], "fromDbp": row[3] == 1} # Get img info imgInfo = None if nodeObj != None and nodeObj["imgName"] != None: @@ -142,7 +144,7 @@ def lookupNodeInfo(name, useReducedTree): row = cur.execute(imgInfoQuery, (eolId,)).fetchone() imgInfo = {"eolId": row[0], "sourceUrl": row[1], "license": row[2], "copyrightOwner": row[3]} # - return {"desc": desc, "imgInfo": imgInfo, "nodeObj": nodeObj} + return {"descObj": descObj, "imgInfo": imgInfo, "nodeObj": nodeObj} class DbServer(BaseHTTPRequestHandler): def do_GET(self): -- cgit v1.2.3