diff options
| -rw-r--r-- | backend/data/enwiki/README.md | 4 |
| -rwxr-xr-x | backend/data/enwiki/genData.py | 2 |
| -rwxr-xr-x | backend/data/enwiki/genDumpIndexDb.py | 18 |
| -rwxr-xr-x | backend/data/enwiki/lookupPage.py | 2 |
4 files changed, 14 insertions, 12 deletions
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index cdabf50..c9615ef 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -17,7 +17,9 @@ Generated Files - dumpIndex.db <br> Holds data from the enwiki dump index file. Generated by genDumpIndexDb.py, and used by lookupPage.py to get content for a - given page title. + given page title. <br> + Tables: <br> + - offsets: title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next\_offset INT - enwikiData.db <br> Holds data obtained from the enwiki dump file, in 'pages', 'redirects', and 'descs' tables. Generated by genData.py, which uses diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py index f439d11..3e60bb5 100755 --- a/backend/data/enwiki/genData.py +++ b/backend/data/enwiki/genData.py @@ -18,7 +18,7 @@ enwikiDb = "enwikiData.db" # Some regexps and functions for parsing wikitext descLineRegex = re.compile("^ *[A-Z'\"]") embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$") - # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") parensGrpRegex = re.compile(r" \([^()]*\)") leftoverBraceRegex = re.compile(r"(?:{\||{{).*") diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py index 450754b..ee3e813 100755 --- a/backend/data/enwiki/genDumpIndexDb.py +++ b/backend/data/enwiki/genDumpIndexDb.py @@ -21,12 +21,12 @@ if os.path.exists(indexDb): # Create db dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)") +dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, 
next_offset INT)") # Reading index file lineRegex = re.compile(r"([^:]+):([^:]+):(.*)") lastOffset = 0 lineNum = 0 -titlesToAdd = [] +entriesToAdd = [] with bz2.open(indexFile, mode='rt') as file: for line in file: lineNum += 1 @@ -34,21 +34,21 @@ with bz2.open(indexFile, mode='rt') as file: print(f"At line {lineNum}") # match = lineRegex.fullmatch(line.rstrip()) - (offset, _, title) = match.group(1,2,3) + (offset, pageId, title) = match.group(1,2,3) offset = int(offset) if offset > lastOffset: - for t in titlesToAdd: + for (t, p) in entriesToAdd: try: - dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset)) + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset)) except sqlite3.IntegrityError as e: # Accounts for certain entries in the file that have the same title print(f"Failed on title \"{t}\": {e}") - titlesToAdd = [] + entriesToAdd = [] lastOffset = offset - titlesToAdd.append(title) -for title in titlesToAdd: + entriesToAdd.append([title, pageId]) +for (title, pageId) in entriesToAdd: try: - dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1)) + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1)) except sqlite3.IntegrityError as e: print(f"Failed on title \"{t}\": {e}") # Close db diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py index 1d379e7..76f2f95 100755 --- a/backend/data/enwiki/lookupPage.py +++ b/backend/data/enwiki/lookupPage.py @@ -23,7 +23,7 @@ query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?" row = dbCur.execute(query, (pageTitle,)).fetchone() if row == None: print("Title not found") - sys.exit(0) + sys.exit(1) (_, pageOffset, endOffset) = row dbCon.close() print(f"Found chunk at offset {pageOffset}") |
