aboutsummaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
Diffstat (limited to 'backend')
-rw-r--r--backend/data/README.md2
-rw-r--r--backend/data/dbpedia/README.md3
-rwxr-xr-xbackend/data/dbpedia/genData.py25
-rwxr-xr-xbackend/data/genDbpData.py8
-rwxr-xr-xbackend/data/genEnwikiData.py3
-rwxr-xr-xbackend/server.py8
6 files changed, 40 insertions, 9 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 576c70e..3df4268 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -36,7 +36,7 @@ data.db Tables
- names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name)
- eol\_ids: id INT PRIMARY KEY, name TEXT
- images: eol\_id INT PRIMARY KEY, source\_url TEXT, license TEXT, copyright\_owner TEXT
-- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT
+- descs: name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki\_id INT, from\_dbp INT
- r\_nodes: name TEXT PRIMARY KEY, tips INT
- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child)
diff --git a/backend/data/dbpedia/README.md b/backend/data/dbpedia/README.md
index 0e7c266..78e2a90 100644
--- a/backend/data/dbpedia/README.md
+++ b/backend/data/dbpedia/README.md
@@ -3,6 +3,8 @@ Downloaded Files
- labels\_lang=en.ttl.bz2 <br>
Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core,
using the link <https://databus.dbpedia.org/dbpedia/generic/labels/2022.03.01/labels_lang=en.ttl.bz2>.
+- page\_lang=en\_ids.ttl.bz2 <br>
+ Downloaded from <https://databus.dbpedia.org/dbpedia/generic/page/2022.03.01/page_lang=en_ids.ttl.bz2>.
- redirects\_lang=en\_transitive.ttl.bz2 <br>
Downloaded from <https://databus.dbpedia.org/dbpedia/generic/redirects/2022.03.01/redirects_lang=en_transitive.ttl.bz2>.
- disambiguations\_lang=en.ttl.bz2 <br>
@@ -19,6 +21,7 @@ Generated Files
Generated by running genData.py.
Tables
- labels: iri TEXT PRIMARY KEY, label TEXT
+ - ids: iri TEXT PRIMARY KEY, id INT
- redirects: iri TEXT PRIMARY KEY, target TEXT
- disambiguations: iri TEXT PRIMARY KEY
- types: iri TEXT, type TEXT
diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py
index e147641..3df1961 100755
--- a/backend/data/dbpedia/genData.py
+++ b/backend/data/dbpedia/genData.py
@@ -4,13 +4,14 @@ import sys, re
import bz2, sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads DBpedia labels+types+redirects+abstracts data,\n"
+usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n"
usageInfo += "and creates a sqlite db containing that data.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines
+idsFile = "page_lang=en_ids.ttl.bz2"
redirectsFile = "redirects_lang=en_transitive.ttl.bz2"
disambigFile = "disambiguations_lang=en.ttl.bz2"
typesFile = "instance-types_lang=en_specific.ttl.bz2"
@@ -39,6 +40,28 @@ with bz2.open(labelsFile, mode='rt') as file:
else:
dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2)))
dbCon.commit()
+# Read/store wiki page ids
+print("Reading/storing wiki page ids")
+dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)")
+idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n')
+lineNum = 0
+with bz2.open(idsFile, mode='rt') as file:
+ for line in file:
+ lineNum += 1
+ if lineNum % 1e5 == 0:
+ print("Processing line {}".format(lineNum))
+ #
+ match = idLineRegex.fullmatch(line)
+ if match == None:
+ print("ERROR: Line {} has unexpected format".format(lineNum), file=sys.stderr)
+ sys.exit(1)
+ else:
+ try:
+ dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2))))
+ except sqlite3.IntegrityError as e:
+ # Accounts for certain lines that have the same IRI
+ print("Failed to add entry with IRI \"{}\": {}".format(match.group(1), e))
+dbCon.commit()
# Read/store redirects
print("Reading/storing redirection data")
dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)")
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index 0ba1ef1..c8394ea 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -211,16 +211,18 @@ for (name, iri) in nodeToIri.items():
redirectingIriSet.add(name)
# Find descriptions, and add to db
print("Adding node description data")
-dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)")
+dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT, wiki_id INT, from_dbp INT)")
iterNum = 0
for (name, iri) in nodeToIri.items():
iterNum += 1
if iterNum % 1e4 == 0:
print("At iteration {}".format(iterNum))
#
- row = dbpCur.execute("SELECT abstract FROM abstracts where iri = ?", (iri,)).fetchone()
+ query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
+ row = dbpCur.execute(query, (iri,)).fetchone()
if row != None:
- dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingIriSet else 0))
+ dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
+ (name, row[0], 1 if name in redirectingIriSet else 0, row[1], 1))
# Close dbs
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py
index 879ecf6..d33fd5d 100755
--- a/backend/data/genEnwikiData.py
+++ b/backend/data/genEnwikiData.py
@@ -63,7 +63,8 @@ for (name, pageId) in nodeToPageId.items():
#
row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
if row != None:
- dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0))
+ dbCur.execute("INSERT INTO descs VALUES (?, ?, ?, ?, ?)",
+ (name, row[0], 1 if name in redirectingNames else 0, pageId, 0))
# Close dbs
dbCon.commit()
dbCon.close()
diff --git a/backend/server.py b/backend/server.py
index 54e4803..a64a145 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -132,8 +132,10 @@ def lookupNodeInfo(name, useReducedTree):
temp = lookupNodes([name], useReducedTree)
nodeObj = temp[name] if name in temp else None
# Get node desc
- row = cur.execute("SELECT desc, redirected from descs WHERE descs.name = ?", (name,)).fetchone()
- desc = {"text": row[0], "fromRedirect": row[1] == 1} if row != None else None
+ row = cur.execute("SELECT desc, redirected, wiki_id, from_dbp from descs WHERE descs.name = ?", (name,)).fetchone()
+ descObj = None
+ if row != None:
+ descObj = {"text": row[0], "fromRedirect": row[1] == 1, "wikiId": row[2], "fromDbp": row[3] == 1}
# Get img info
imgInfo = None
if nodeObj != None and nodeObj["imgName"] != None:
@@ -142,7 +144,7 @@ def lookupNodeInfo(name, useReducedTree):
row = cur.execute(imgInfoQuery, (eolId,)).fetchone()
imgInfo = {"eolId": row[0], "sourceUrl": row[1], "license": row[2], "copyrightOwner": row[3]}
#
- return {"desc": desc, "imgInfo": imgInfo, "nodeObj": nodeObj}
+ return {"descObj": descObj, "imgInfo": imgInfo, "nodeObj": nodeObj}
class DbServer(BaseHTTPRequestHandler):
def do_GET(self):