diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
| commit | 29940d51eb8b6b220d53940ecbc212cea78159ae (patch) | |
| tree | bfa698c17525de7876b80ad37d8f7777b9505ba0 | |
| parent | a840a16c6bd5aef906bd5cbce8293fc863cb5a5d (diff) | |
Improve enwiki description extraction
Adjust enwiki code to handle a single dump file, and add scripts for
'convenient' page-content lookup.
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | backend/data/README.md | 2 | ||||
| -rw-r--r-- | backend/data/dbpPickedLabels.txt | 3 | ||||
| -rw-r--r-- | backend/data/enwiki/README.md | 51 | ||||
| -rwxr-xr-x | backend/data/enwiki/genData.py | 121 | ||||
| -rwxr-xr-x | backend/data/enwiki/genDescData.py | 68 | ||||
| -rwxr-xr-x | backend/data/enwiki/genDumpIndexDb.py | 56 | ||||
| -rwxr-xr-x | backend/data/enwiki/genPageData.py | 39 | ||||
| -rwxr-xr-x | backend/data/enwiki/genRedirectData.py | 39 | ||||
| -rwxr-xr-x | backend/data/enwiki/lookupPage.py | 66 | ||||
| -rwxr-xr-x | backend/data/genEnwikiData.py | 49 |
11 files changed, 295 insertions, 201 deletions
@@ -14,7 +14,7 @@ /backend/data/imgsForReview/ /backend/data/imgsReviewed/ /backend/data/img/ -/backend/data/enwiki/*.gz +/backend/data/enwiki/*.bz2 /backend/data/enwiki/*.db /backend/data/enwiki/enwiki_content/ /backend/data/enwiki/.venv/ diff --git a/backend/data/README.md b/backend/data/README.md index cb9cd42..576c70e 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -24,7 +24,7 @@ File Generation Process - Supplementing with Wikipedia dump 1 Obtain data in enwiki/, as specified in it's README. 2 Run genEnwikiData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, reducedTol/names.txt, and the 'nodes' table. + enwiki/enwikiData.db, and the 'nodes' table. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt index 80a4770..d8f939e 100644 --- a/backend/data/dbpPickedLabels.txt +++ b/backend/data/dbpPickedLabels.txt @@ -88,7 +88,6 @@ balfouria (flatworm) ballana (leafhopper) Barcella Baryonyx -basuto (horse) Begonia Belbina belisarius (scorpion) @@ -320,7 +319,6 @@ Gymnopodium habeas corpus (pig) Halenia Halesia -halla (horse) Hallucigenia Harmothoe Harpa @@ -621,7 +619,6 @@ Thyreus tinerfe (ctenophore) Tiso Titanophora -tokara (horse) tortricidae (snakes) Tortrix Triaenophorus diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index e4e1aae..cdabf50 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -1,35 +1,28 @@ Downloaded Files ================ -- enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz <br> - Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror). - Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages). 
- Some file content and format information was available from - https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. -- enwiki-20220420-page.sql.gz <br> - Obtained like above. Contains page-table information including page id, namespace, title, etc. - Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table. -- enwiki-20220420-redirect.sql.gz <br> - Obtained like above. Contains page-redirection info. - Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. +- enwiki-20220501-pages-articles-multistream.xml.bz2 <br> + Obtained via <https://dumps.wikimedia.org/backup-index.html> + (site suggests downloading from a mirror). Contains text + content and metadata for pages in English Wikipedia + (current revision only, excludes talk pages). Some file + content and format information was available from + <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>. +- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br> + Obtained like above. Holds lines of the form offset1:pageId1:title1, + providing offsets, for each page, into the dump file, of a chunk of + 100 pages that includes it. Generated Files =============== -- enwiki\_content/enwiki-*.xml and enwiki-*.sql <br> - Uncompressed versions of downloaded files. +- dumpIndex.db <br> + Holds data from the enwiki dump index file. Generated by + genDumpIndexDb.py, and used by lookupPage.py to get content for a + given page title. - enwikiData.db <br> - An sqlite database representing data from the enwiki dump files. - Generation: - 1 Install python, and packages mwsql, mwxml, and mwparsefromhell. Example: - 1 On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`. - 2 Create a virtual environment in which to install packages via `python3 -m venv .venv`. - 3 Activate the virtual environment via `source .venv/bin/activate`. 
- 4 Install mwsql, mwxml, and mwparsefromhell via `pip install mwsql mwxml mwparsefromhell`. - 2 Run genPageData.py (still under the virtual environment), which creates the database, - reads from the page dump, and creates a 'pages' table. - 3 Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump, - and page ids from the 'pages' table. - 4 Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables, - and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some - wikitext within those pages to obtain the first descriptive paragraph, with markup removed. -- .venv <br> - Provides a python virtual environment for packages needed to generate data. + Holds data obtained from the enwiki dump file, in 'pages', + 'redirects', and 'descs' tables. Generated by genData.py, which uses + python packages mwxml and mwparserfromhell. <br> + Tables: <br> + - pages: id INT PRIMARY KEY, title TEXT UNIQUE + - redirects: id INT PRIMARY KEY, target TEXT + - descs: id INT PRIMARY KEY, desc TEXT diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py new file mode 100755 index 0000000..4f0d62e --- /dev/null +++ b/backend/data/enwiki/genData.py @@ -0,0 +1,121 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" +usageInfo += "and short-description info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +enwikiDb = "enwikiData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = re.compile("^ *[A-Z'\"]") +embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$") + # Recognises a 
self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +parensGrpRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") +def convertTemplateReplace(match): + if match.group(2) == None: + return "{} {}".format(match.group(1), match.group(4)) + else: + return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4)) +def parseDesc(text): + # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank + # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if len(lines) == 0: + if len(line) > 0: + if openBraceCount > 0 or line[0] == "{": + openBraceCount += line.count("{") + openBraceCount -= line.count("}") + skip = True + if openBracketCount > 0 or line[0] == "[": + openBracketCount += line.count("[") + openBracketCount -= line.count("]") + skip = True + if inComment or line.find("<!--") != -1: + if line.find("-->") != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ":": # Seems to help avoid disambiguation pages + return None + if descLineRegex.match(line) != None: + lines.append(line) + else: + if len(line) == 0: + return removeMarkup(" ".join(lines)) + lines.append(line) + if len(lines) > 0: + return removeMarkup(" ".join(lines)) + return None +def removeMarkup(content): + content = embeddedHtmlRegex.sub("", content) + content = convertTemplateRegex.sub(convertTemplateReplace, content) + content = 
mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = parensGrpRegex.sub("", content) + content = leftoverBraceRegex.sub("", content) + return content +# Other helper functions +def convertTitle(title): + return html.unescape(title).replace("_", " ") + +# Check for existing db +if os.path.exists(enwikiDb): + print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Read through dump file +print("Reading dump file") +with bz2.open(dumpFile, mode='rt') as file: + dump = mwxml.Dump.from_file(file) + pageNum = 0 + for page in dump: + pageNum += 1 + if pageNum % 1e4 == 0: + print("At page {}".format(pageNum)) + # Parse page + if page.namespace == 0: + try: + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print("Failed to add page with title \"{}\": {}".format(page.title, e)) + continue + if page.redirect != None: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py deleted file mode 100755 index 3602138..0000000 --- a/backend/data/enwiki/genDescData.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/python3 - -import re -import sys, os.path, glob -import mwxml, mwparserfromhell -import sqlite3 - -usageInfo = 
f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n" -usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml") -wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1))) -enwikiDb = "enwikiData.db" - -# Some regexps and functions for parsing wikitext -descLineRegex = "^ *[A-Z'\"]" -embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$" - # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}" -def convertTemplateReplace(match): - if match.group(2) == None: - return "{} {}".format(match.group(1), match.group(4)) - else: - return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4)) -parenGrpRegex = r" \([^()]*\)" -def parseDesc(text): - prevLine = None - for line in text.splitlines(): - if prevLine != None: - if line.strip() == "" or re.match(descLineRegex, line) != None: - return prevLine - else: - prevLine = None - if re.match(descLineRegex, line) != None: - line = re.sub(embeddedHtmlRegex, "", line) - line = re.sub(convertTemplateRegex, convertTemplateReplace, line) - line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup - prevLine = re.sub(parenGrpRegex, "", line) - if prevLine != None: - return prevLine - return None - -# Open db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Parse data -iterationNum = 0 -for fileName in wikiDumpFiles: - print("Processing file {}".format(fileName)) - dump = mwxml.Dump.from_file(open(fileName)) - for page in dump: - iterationNum += 1 - if iterationNum % 10000 == 0: - 
print("At iteration {}".format(iterationNum)) - # Parse page - if page.namespace == 0 and page.redirect == None: - revision = next(page) - desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py new file mode 100755 index 0000000..13f7eb6 --- /dev/null +++ b/backend/data/enwiki/genDumpIndexDb.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump index file,\n" +usageInfo += "and stores it's offset and title data to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines +indexDb = "dumpIndex.db" + +# Check for existing db +if os.path.exists(indexDb): + print("ERROR: Existing {}".format(indexDb), file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(indexDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)") +# Reading index file +lineRegex = re.compile(r"([^:]+):([^:]+):(.*)") +lastOffset = 0 +lineNum = 0 +titlesToAdd = [] +with bz2.open(indexFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("At line {}".format(lineNum)) + # + match = lineRegex.fullmatch(line.rstrip()) + (offset, _, title) = match.group(1,2,3) + offset = int(offset) + if offset > lastOffset: + for t in titlesToAdd: + try: + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset)) + except sqlite3.IntegrityError as e: + # Accounts for certain entries in the file that have the same title + print("Failed on title \"{}\": {}".format(t, e)) + titlesToAdd = [] + lastOffset = offset + titlesToAdd.append(title) +for title in 
titlesToAdd: + try: + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1)) + except sqlite3.IntegrityError as e: + print("Failed on title \"{}\": {}".format(t, e)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py deleted file mode 100755 index 7522f1f..0000000 --- a/backend/data/enwiki/genPageData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path -from mwsql import Dump -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n" -usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n" -usageInfo += "a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -pageDumpFile = "enwiki-20220420-page.sql.gz" -enwikiDb = "enwikiData.db" - -# Check for existing db -if os.path.exists(enwikiDb): - print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -# Parse page data -dump = Dump.from_file(pageDumpFile) -iterationNum = 0 -for row in dump.rows(convert_dtypes=True): - iterationNum += 1 - if iterationNum % 1e6 == 0: - print("At iteration {}".format(iterationNum)) - # Add to map - if row[1] == 0: # If page in article namespace - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " "))) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py deleted file mode 100755 index e1aadc8..0000000 --- a/backend/data/enwiki/genRedirectData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path -from mwsql import Dump -import sqlite3 - -usageInfo = f"usage: 
{sys.argv[0]}\n" -usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n" -usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n" -usageInfo += "a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -redirectDumpFile = "enwiki-20220420-redirect.sql.gz" -enwikiDb = "enwikiData.db" - -# Open db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)") -dbCur2 = dbCon.cursor() -# Parse redirect data -dump = Dump.from_file(redirectDumpFile) -iterationNum = 0 -for row in dump.rows(convert_dtypes=True): - iterationNum += 1 - if iterationNum % 1e6 == 0: - print("At iteration {}".format(iterationNum)) - # Add to map - [pageId, namespace, title] = row[:3] - if namespace == 0: # If page is in the article namespace - row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone() - if row != None: - targetId = row[0] - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py new file mode 100755 index 0000000..5d6afe9 --- /dev/null +++ b/backend/data/enwiki/lookupPage.py @@ -0,0 +1,66 @@ +#!/usr/bin/python3 + +import sys, re +import bz2 +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]} title1\n" +usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n" +usageInfo += "using a dump index db, and prints the corresponding <page>.\n" +if len(sys.argv) != 2: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" +indexDb = "dumpIndex.db" +pageTitle = sys.argv[1] + +# Searching index file +print("Lookup offset in index db") +dbCon = sqlite3.connect(indexDb) +dbCur = dbCon.cursor() +row = dbCur.execute("SELECT title, offset, next_offset FROM offsets WHERE title = 
?", + (pageTitle.replace("_", " "),)).fetchone() +if row == None: + print("Title not found") + sys.exit(0) +(_, pageOffset, endOffset) = row +dbCon.close() +print("Found chunk at offset {}".format(pageOffset)) +# Read dump file +print("Reading dump file") +content = [] +with open(dumpFile, mode='rb') as file: + # Get uncompressed chunk + file.seek(pageOffset) + compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) + data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for page + lines = data.splitlines() + lineIdx = 0 + found = False + pageNum = 0 + while not found: + line = lines[lineIdx] + if line.lstrip() == "<page>": + pageNum += 1 + if pageNum > 100: + print("ERROR: Did not find title after 100 pages") + break + lineIdx += 1 + titleLine = lines[lineIdx] + if titleLine.lstrip() == '<title>' + pageTitle + '</title>': + found = True + print("Found title in chunk as page {}".format(pageNum)) + content.append(line) + content.append(titleLine) + while True: + lineIdx += 1 + line = lines[lineIdx] + content.append(line) + if line.lstrip() == "</page>": + break + lineIdx += 1 +# Print content +print("Content: ") +print("\n".join(content)) diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py index 48fd2c6..879ecf6 100755 --- a/backend/data/genEnwikiData.py +++ b/backend/data/genEnwikiData.py @@ -4,15 +4,14 @@ import sys, re import sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki data from enwiki/, a list of node names," -usageInfo += "and node and name data from a sqlite database, and adds\n" -usageInfo += "description data for names that don't have them\n" +usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data" +usageInfo += "from a sqlite database, and adds description data for names that\n" +usageInfo += "don't have them.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) enwikiDb = "enwiki/enwikiData.db" 
-namesFile = "reducedTol/names.txt" dbFile = "data.db" # Open dbs @@ -20,40 +19,48 @@ enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Read in names to check -print("Getting names to check") +# Get node names without descriptions +print("Getting node names") nodeNames = set() -with open(namesFile) as file: - for line in file: - nodeNames.add(line.rstrip()) +query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL" +for row in dbCur.execute(query): + nodeNames.add(row[0]) print("Found {} names".format(len(nodeNames))) -# Remove names that have descriptions -print("Checking for existing name descriptions") -namesWithDescs = set() -for name in nodeNames: - row = dbCur.execute("SELECT name FROM descs where name = ?", (name,)).fetchone() - if row != None: - namesWithDescs.add(name) -nodeNames.difference_update(namesWithDescs) -print("Remaining nodes: {}".format(len(nodeNames))) # Find page id for each node name -nodeToPageId = {} print("Getting node page-ids") +nodeToPageId = {} +iterNum = 0 for name in nodeNames: - row = enwikiCur.execute("SELECT id FROM pages where pages.title = ? COLLATE nocase", (name,)).fetchone() + iterNum += 1 + if iterNum % 1e4 == 0: + print("At iteration {}".format(iterNum)) + # + row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? 
COLLATE NOCASE", (name,)).fetchone() if row != None: nodeToPageId[name] = row[0] # Resolve redirects print("Resolving redirects") redirectingNames = set() +iterNum = 0 for (name, pageId) in nodeToPageId.items(): - row = enwikiCur.execute("SELECT target_id FROM redirects where redirects.id = ?", (pageId,)).fetchone() + iterNum += 1 + if iterNum % 1000 == 0: + print("At iteration {}".format(iterNum)) + # + row = enwikiCur.execute( + "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?", + (pageId,)).fetchone() if row != None: nodeToPageId[name] = row[0] redirectingNames.add(name) # Add descriptions for each node print("Adding description data") +iterNum = 0 for (name, pageId) in nodeToPageId.items(): + iterNum += 1 + if iterNum % 1000 == 0: + print("At iteration {}".format(iterNum)) + # row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() if row != None: dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0)) |
