aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-05-17 10:41:12 +1000
committerTerry Truong <terry06890@gmail.com>2022-05-17 10:41:12 +1000
commit29940d51eb8b6b220d53940ecbc212cea78159ae (patch)
treebfa698c17525de7876b80ad37d8f7777b9505ba0 /backend/data
parenta840a16c6bd5aef906bd5cbce8293fc863cb5a5d (diff)
Improve enwiki description extraction
Adjust enwiki code to handle single dump file, and add scripts for 'convenient' page-content lookup.
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md2
-rw-r--r--backend/data/dbpPickedLabels.txt3
-rw-r--r--backend/data/enwiki/README.md51
-rwxr-xr-xbackend/data/enwiki/genData.py121
-rwxr-xr-xbackend/data/enwiki/genDescData.py68
-rwxr-xr-xbackend/data/enwiki/genDumpIndexDb.py56
-rwxr-xr-xbackend/data/enwiki/genPageData.py39
-rwxr-xr-xbackend/data/enwiki/genRedirectData.py39
-rwxr-xr-xbackend/data/enwiki/lookupPage.py66
-rwxr-xr-xbackend/data/genEnwikiData.py49
10 files changed, 294 insertions, 200 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index cb9cd42..576c70e 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -24,7 +24,7 @@ File Generation Process
- Supplementing with Wikipedia dump
1 Obtain data in enwiki/, as specified in it's README.
2 Run genEnwikiData.py, which adds to the 'descs' table, using data in
- enwiki/enwikiData.db, reducedTol/names.txt, and the 'nodes' table.
+ enwiki/enwikiData.db, and the 'nodes' table.
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt
index 80a4770..d8f939e 100644
--- a/backend/data/dbpPickedLabels.txt
+++ b/backend/data/dbpPickedLabels.txt
@@ -88,7 +88,6 @@ balfouria (flatworm)
ballana (leafhopper)
Barcella
Baryonyx
-basuto (horse)
Begonia
Belbina
belisarius (scorpion)
@@ -320,7 +319,6 @@ Gymnopodium
habeas corpus (pig)
Halenia
Halesia
-halla (horse)
Hallucigenia
Harmothoe
Harpa
@@ -621,7 +619,6 @@ Thyreus
tinerfe (ctenophore)
Tiso
Titanophora
-tokara (horse)
tortricidae (snakes)
Tortrix
Triaenophorus
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index e4e1aae..cdabf50 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -1,35 +1,28 @@
Downloaded Files
================
-- enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz <br>
- Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror).
- Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages).
- Some file content and format information was available from
- https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download.
-- enwiki-20220420-page.sql.gz <br>
- Obtained like above. Contains page-table information including page id, namespace, title, etc.
- Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table.
-- enwiki-20220420-redirect.sql.gz <br>
- Obtained like above. Contains page-redirection info.
- Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download.
+- enwiki-20220501-pages-articles-multistream.xml.bz2 <br>
+ Obtained via <https://dumps.wikimedia.org/backup-index.html>
+ (site suggests downloading from a mirror). Contains text
+ content and metadata for pages in English Wikipedia
+ (current revision only, excludes talk pages). Some file
+ content and format information was available from
+ <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>.
+- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br>
+ Obtained like above. Holds lines of the form offset1:pageId1:title1,
+ providing offsets, for each page, into the dump file, of a chunk of
+ 100 pages that includes it.
Generated Files
===============
-- enwiki\_content/enwiki-*.xml and enwiki-*.sql <br>
- Uncompressed versions of downloaded files.
+- dumpIndex.db <br>
+ Holds data from the enwiki dump index file. Generated by
+ genDumpIndexDb.py, and used by lookupPage.py to get content for a
+ given page title.
- enwikiData.db <br>
- An sqlite database representing data from the enwiki dump files.
- Generation:
- 1 Install python, and packages mwsql, mwxml, and mwparsefromhell. Example:
- 1 On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`.
- 2 Create a virtual environment in which to install packages via `python3 -m venv .venv`.
- 3 Activate the virtual environment via `source .venv/bin/activate`.
- 4 Install mwsql, mwxml, and mwparsefromhell via `pip install mwsql mwxml mwparsefromhell`.
- 2 Run genPageData.py (still under the virtual environment), which creates the database,
- reads from the page dump, and creates a 'pages' table.
- 3 Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump,
- and page ids from the 'pages' table.
- 4 Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables,
- and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some
- wikitext within those pages to obtain the first descriptive paragraph, with markup removed.
-- .venv <br>
- Provides a python virtual environment for packages needed to generate data.
+ Holds data obtained from the enwiki dump file, in 'pages',
+ 'redirects', and 'descs' tables. Generated by genData.py, which uses
+ python packages mwxml and mwparserfromhell. <br>
+ Tables: <br>
+ - pages: id INT PRIMARY KEY, title TEXT UNIQUE
+ - redirects: id INT PRIMARY KEY, target TEXT
+ - descs: id INT PRIMARY KEY, desc TEXT
diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py
new file mode 100755
index 0000000..4f0d62e
--- /dev/null
+++ b/backend/data/enwiki/genData.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import bz2
+import html, mwxml, mwparserfromhell
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n"
+usageInfo += "and short-description info to an sqlite db.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages
+enwikiDb = "enwikiData.db"
+
+# Some regexps and functions for parsing wikitext
+descLineRegex = re.compile("^ *[A-Z'\"]")
+embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
+ # Recognises a self-closing HTML tag, a tag with 0 children, a tag with 1 child that has 0 children, or an unclosed tag
+convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
+parensGrpRegex = re.compile(r" \([^()]*\)")
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
+def convertTemplateReplace(match):
+ if match.group(2) == None:
+ return "{} {}".format(match.group(1), match.group(4))
+ else:
+ return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))
+def parseDesc(text):
+ # Find the first matching line outside {{...}} and [[...]] blocks and HTML comments, then accumulate lines until a blank
+ # Some cases not accounted for: disambiguation pages, abstracts with sentences split across lines,
+ # nested embedded html, 'content significant' embedded-html, and markup not removable with mwparserfromhell.
+ lines = []
+ openBraceCount = 0
+ openBracketCount = 0
+ inComment = False
+ skip = False
+ for line in text.splitlines():
+ line = line.strip()
+ if len(lines) == 0:
+ if len(line) > 0:
+ if openBraceCount > 0 or line[0] == "{":
+ openBraceCount += line.count("{")
+ openBraceCount -= line.count("}")
+ skip = True
+ if openBracketCount > 0 or line[0] == "[":
+ openBracketCount += line.count("[")
+ openBracketCount -= line.count("]")
+ skip = True
+ if inComment or line.find("<!--") != -1:
+ if line.find("-->") != -1:
+ if inComment:
+ inComment = False
+ skip = True
+ else:
+ inComment = True
+ skip = True
+ if skip:
+ skip = False
+ continue
+ if line[-1] == ":": # Seems to help avoid disambiguation pages
+ return None
+ if descLineRegex.match(line) != None:
+ lines.append(line)
+ else:
+ if len(line) == 0:
+ return removeMarkup(" ".join(lines))
+ lines.append(line)
+ if len(lines) > 0:
+ return removeMarkup(" ".join(lines))
+ return None
+def removeMarkup(content):
+ content = embeddedHtmlRegex.sub("", content)
+ content = convertTemplateRegex.sub(convertTemplateReplace, content)
+ content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
+ content = parensGrpRegex.sub("", content)
+ content = leftoverBraceRegex.sub("", content)
+ return content
+# Other helper functions
+def convertTitle(title):
+ return html.unescape(title).replace("_", " ")
+
+# Check for existing db
+if os.path.exists(enwikiDb):
+ print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr)
+ sys.exit(1)
+# Create db
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
+dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
+dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
+dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
+# Read through dump file
+print("Reading dump file")
+with bz2.open(dumpFile, mode='rt') as file:
+ dump = mwxml.Dump.from_file(file)
+ pageNum = 0
+ for page in dump:
+ pageNum += 1
+ if pageNum % 1e4 == 0:
+ print("At page {}".format(pageNum))
+ # Parse page
+ if page.namespace == 0:
+ try:
+ dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
+ except sqlite3.IntegrityError as e:
+ # Accounts for certain pages that have the same title
+ print("Failed to add page with title \"{}\": {}".format(page.title, e))
+ continue
+ if page.redirect != None:
+ dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
+ else:
+ revision = next(page)
+ desc = parseDesc(revision.text)
+ if desc != None:
+ dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
deleted file mode 100755
index 3602138..0000000
--- a/backend/data/enwiki/genDescData.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import sys, os.path, glob
-import mwxml, mwparserfromhell
-import sqlite3
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
-usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml")
-wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1)))
-enwikiDb = "enwikiData.db"
-
-# Some regexps and functions for parsing wikitext
-descLineRegex = "^ *[A-Z'\"]"
-embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
- # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag
-convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
-def convertTemplateReplace(match):
- if match.group(2) == None:
- return "{} {}".format(match.group(1), match.group(4))
- else:
- return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))
-parenGrpRegex = r" \([^()]*\)"
-def parseDesc(text):
- prevLine = None
- for line in text.splitlines():
- if prevLine != None:
- if line.strip() == "" or re.match(descLineRegex, line) != None:
- return prevLine
- else:
- prevLine = None
- if re.match(descLineRegex, line) != None:
- line = re.sub(embeddedHtmlRegex, "", line)
- line = re.sub(convertTemplateRegex, convertTemplateReplace, line)
- line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup
- prevLine = re.sub(parenGrpRegex, "", line)
- if prevLine != None:
- return prevLine
- return None
-
-# Open db
-dbCon = sqlite3.connect(enwikiDb)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
-# Parse data
-iterationNum = 0
-for fileName in wikiDumpFiles:
- print("Processing file {}".format(fileName))
- dump = mwxml.Dump.from_file(open(fileName))
- for page in dump:
- iterationNum += 1
- if iterationNum % 10000 == 0:
- print("At iteration {}".format(iterationNum))
- # Parse page
- if page.namespace == 0 and page.redirect == None:
- revision = next(page)
- desc = parseDesc(revision.text)
- if desc != None:
- dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
-# Close db
-dbCon.commit()
-dbCon.close()
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py
new file mode 100755
index 0000000..13f7eb6
--- /dev/null
+++ b/backend/data/enwiki/genDumpIndexDb.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python3
+
+import sys, os, re
+import bz2
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a Wikimedia enwiki dump index file,\n"
+usageInfo += "and stores its offset and title data to an sqlite db.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines
+indexDb = "dumpIndex.db"
+
+# Check for existing db
+if os.path.exists(indexDb):
+ print("ERROR: Existing {}".format(indexDb), file=sys.stderr)
+ sys.exit(1)
+# Create db
+dbCon = sqlite3.connect(indexDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)")
+# Reading index file
+lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
+lastOffset = 0
+lineNum = 0
+titlesToAdd = []
+with bz2.open(indexFile, mode='rt') as file:
+ for line in file:
+ lineNum += 1
+ if lineNum % 1e5 == 0:
+ print("At line {}".format(lineNum))
+ #
+ match = lineRegex.fullmatch(line.rstrip())
+ (offset, _, title) = match.group(1,2,3)
+ offset = int(offset)
+ if offset > lastOffset:
+ for t in titlesToAdd:
+ try:
+ dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset))
+ except sqlite3.IntegrityError as e:
+ # Accounts for certain entries in the file that have the same title
+ print("Failed on title \"{}\": {}".format(t, e))
+ titlesToAdd = []
+ lastOffset = offset
+ titlesToAdd.append(title)
+for title in titlesToAdd:
+ try:
+ dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1))
+ except sqlite3.IntegrityError as e:
+ print("Failed on title \"{}\": {}".format(title, e))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py
deleted file mode 100755
index 7522f1f..0000000
--- a/backend/data/enwiki/genPageData.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os.path
-from mwsql import Dump
-import sqlite3
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n"
-usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n"
-usageInfo += "a sqlite db.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-pageDumpFile = "enwiki-20220420-page.sql.gz"
-enwikiDb = "enwikiData.db"
-
-# Check for existing db
-if os.path.exists(enwikiDb):
- print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr)
- sys.exit(1)
-# Create db
-dbCon = sqlite3.connect(enwikiDb)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
-dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
-# Parse page data
-dump = Dump.from_file(pageDumpFile)
-iterationNum = 0
-for row in dump.rows(convert_dtypes=True):
- iterationNum += 1
- if iterationNum % 1e6 == 0:
- print("At iteration {}".format(iterationNum))
- # Add to map
- if row[1] == 0: # If page in article namespace
- dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " ")))
-# Close db
-dbCon.commit()
-dbCon.close()
diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py
deleted file mode 100755
index e1aadc8..0000000
--- a/backend/data/enwiki/genRedirectData.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/python3
-
-import sys, os.path
-from mwsql import Dump
-import sqlite3
-
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n"
-usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n"
-usageInfo += "a sqlite db.\n"
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-redirectDumpFile = "enwiki-20220420-redirect.sql.gz"
-enwikiDb = "enwikiData.db"
-
-# Open db
-dbCon = sqlite3.connect(enwikiDb)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)")
-dbCur2 = dbCon.cursor()
-# Parse redirect data
-dump = Dump.from_file(redirectDumpFile)
-iterationNum = 0
-for row in dump.rows(convert_dtypes=True):
- iterationNum += 1
- if iterationNum % 1e6 == 0:
- print("At iteration {}".format(iterationNum))
- # Add to map
- [pageId, namespace, title] = row[:3]
- if namespace == 0: # If page is in the article namespace
- row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone()
- if row != None:
- targetId = row[0]
- dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId))
-# Close db
-dbCon.commit()
-dbCon.close()
diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py
new file mode 100755
index 0000000..5d6afe9
--- /dev/null
+++ b/backend/data/enwiki/lookupPage.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python3
+
+import sys, re
+import bz2
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]} title1\n"
+usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n"
+usageInfo += "using a dump index db, and prints the corresponding <page>.\n"
+if len(sys.argv) != 2:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
+indexDb = "dumpIndex.db"
+pageTitle = sys.argv[1]
+
+# Searching index file
+print("Lookup offset in index db")
+dbCon = sqlite3.connect(indexDb)
+dbCur = dbCon.cursor()
+row = dbCur.execute("SELECT title, offset, next_offset FROM offsets WHERE title = ?",
+ (pageTitle.replace("_", " "),)).fetchone()
+if row == None:
+ print("Title not found")
+ sys.exit(0)
+(_, pageOffset, endOffset) = row
+dbCon.close()
+print("Found chunk at offset {}".format(pageOffset))
+# Read dump file
+print("Reading dump file")
+content = []
+with open(dumpFile, mode='rb') as file:
+ # Get uncompressed chunk
+ file.seek(pageOffset)
+ compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
+ data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+ # Look in chunk for page
+ lines = data.splitlines()
+ lineIdx = 0
+ found = False
+ pageNum = 0
+ while not found:
+ line = lines[lineIdx]
+ if line.lstrip() == "<page>":
+ pageNum += 1
+ if pageNum > 100:
+ print("ERROR: Did not find title after 100 pages")
+ break
+ lineIdx += 1
+ titleLine = lines[lineIdx]
+ if titleLine.lstrip() == '<title>' + pageTitle + '</title>':
+ found = True
+ print("Found title in chunk as page {}".format(pageNum))
+ content.append(line)
+ content.append(titleLine)
+ while True:
+ lineIdx += 1
+ line = lines[lineIdx]
+ content.append(line)
+ if line.lstrip() == "</page>":
+ break
+ lineIdx += 1
+# Print content
+print("Content: ")
+print("\n".join(content))
diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py
index 48fd2c6..879ecf6 100755
--- a/backend/data/genEnwikiData.py
+++ b/backend/data/genEnwikiData.py
@@ -4,15 +4,14 @@ import sys, re
import sqlite3
usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads Wikimedia enwiki data from enwiki/, a list of node names,"
-usageInfo += "and node and name data from a sqlite database, and adds\n"
-usageInfo += "description data for names that don't have them\n"
+usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data"
+usageInfo += "from a sqlite database, and adds description data for names that\n"
+usageInfo += "don't have them.\n"
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
enwikiDb = "enwiki/enwikiData.db"
-namesFile = "reducedTol/names.txt"
dbFile = "data.db"
# Open dbs
@@ -20,40 +19,48 @@ enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Read in names to check
-print("Getting names to check")
+# Get node names without descriptions
+print("Getting node names")
nodeNames = set()
-with open(namesFile) as file:
- for line in file:
- nodeNames.add(line.rstrip())
+query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL"
+for row in dbCur.execute(query):
+ nodeNames.add(row[0])
print("Found {} names".format(len(nodeNames)))
-# Remove names that have descriptions
-print("Checking for existing name descriptions")
-namesWithDescs = set()
-for name in nodeNames:
- row = dbCur.execute("SELECT name FROM descs where name = ?", (name,)).fetchone()
- if row != None:
- namesWithDescs.add(name)
-nodeNames.difference_update(namesWithDescs)
-print("Remaining nodes: {}".format(len(nodeNames)))
# Find page id for each node name
-nodeToPageId = {}
print("Getting node page-ids")
+nodeToPageId = {}
+iterNum = 0
for name in nodeNames:
- row = enwikiCur.execute("SELECT id FROM pages where pages.title = ? COLLATE nocase", (name,)).fetchone()
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
if row != None:
nodeToPageId[name] = row[0]
# Resolve redirects
print("Resolving redirects")
redirectingNames = set()
+iterNum = 0
for (name, pageId) in nodeToPageId.items():
- row = enwikiCur.execute("SELECT target_id FROM redirects where redirects.id = ?", (pageId,)).fetchone()
+ iterNum += 1
+ if iterNum % 1000 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ row = enwikiCur.execute(
+ "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?",
+ (pageId,)).fetchone()
if row != None:
nodeToPageId[name] = row[0]
redirectingNames.add(name)
# Add descriptions for each node
print("Adding description data")
+iterNum = 0
for (name, pageId) in nodeToPageId.items():
+ iterNum += 1
+ if iterNum % 1000 == 0:
+ print("At iteration {}".format(iterNum))
+ #
row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
if row != None:
dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0))