about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- backend/data/enwiki/README.md 4
-rwxr-xr-x backend/data/enwiki/genData.py 2
-rwxr-xr-x backend/data/enwiki/genDumpIndexDb.py 18
-rwxr-xr-x backend/data/enwiki/lookupPage.py 2
4 files changed, 14 insertions, 12 deletions
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index cdabf50..c9615ef 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -17,7 +17,9 @@ Generated Files
- dumpIndex.db <br>
Holds data from the enwiki dump index file. Generated by
genDumpIndexDb.py, and used by lookupPage.py to get content for a
- given page title.
+ given page title. <br>
+ Tables: <br>
+ - offsets: title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next\_offset INT
- enwikiData.db <br>
Holds data obtained from the enwiki dump file, in 'pages',
'redirects', and 'descs' tables. Generated by genData.py, which uses
diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py
index f439d11..3e60bb5 100755
--- a/backend/data/enwiki/genData.py
+++ b/backend/data/enwiki/genData.py
@@ -18,7 +18,7 @@ enwikiDb = "enwikiData.db"
# Some regexps and functions for parsing wikitext
descLineRegex = re.compile("^ *[A-Z'\"]")
embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
- # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag
+ # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
parensGrpRegex = re.compile(r" \([^()]*\)")
leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py
index 450754b..ee3e813 100755
--- a/backend/data/enwiki/genDumpIndexDb.py
+++ b/backend/data/enwiki/genDumpIndexDb.py
@@ -21,12 +21,12 @@ if os.path.exists(indexDb):
# Create db
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)")
+dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
# Reading index file
lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
lastOffset = 0
lineNum = 0
-titlesToAdd = []
+entriesToAdd = []
with bz2.open(indexFile, mode='rt') as file:
for line in file:
lineNum += 1
@@ -34,21 +34,21 @@ with bz2.open(indexFile, mode='rt') as file:
print(f"At line {lineNum}")
#
match = lineRegex.fullmatch(line.rstrip())
- (offset, _, title) = match.group(1,2,3)
+ (offset, pageId, title) = match.group(1,2,3)
offset = int(offset)
if offset > lastOffset:
- for t in titlesToAdd:
+ for (t, p) in entriesToAdd:
try:
- dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset))
+ dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
except sqlite3.IntegrityError as e:
# Accounts for certain entries in the file that have the same title
print(f"Failed on title \"{t}\": {e}")
- titlesToAdd = []
+ entriesToAdd = []
lastOffset = offset
- titlesToAdd.append(title)
-for title in titlesToAdd:
+ entriesToAdd.append([title, pageId])
+for (title, pageId) in entriesToAdd:
try:
- dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1))
+ dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
except sqlite3.IntegrityError as e:
print(f"Failed on title \"{t}\": {e}")
# Close db
diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py
index 1d379e7..76f2f95 100755
--- a/backend/data/enwiki/lookupPage.py
+++ b/backend/data/enwiki/lookupPage.py
@@ -23,7 +23,7 @@ query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
row = dbCur.execute(query, (pageTitle,)).fetchone()
if row == None:
print("Title not found")
- sys.exit(0)
+ sys.exit(1)
(_, pageOffset, endOffset) = row
dbCon.close()
print(f"Found chunk at offset {pageOffset}")