diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-17 10:41:12 +1000 |
| commit | 29940d51eb8b6b220d53940ecbc212cea78159ae (patch) | |
| tree | bfa698c17525de7876b80ad37d8f7777b9505ba0 | |
| parent | a840a16c6bd5aef906bd5cbce8293fc863cb5a5d (diff) | |
Improve enwiki description extraction
Adjust enwiki code to handle a single dump file, and add scripts for
'convenient' page-content lookup.
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | backend/data/README.md | 2 | ||||
| -rw-r--r-- | backend/data/dbpPickedLabels.txt | 3 | ||||
| -rw-r--r-- | backend/data/enwiki/README.md | 51 | ||||
| -rwxr-xr-x | backend/data/enwiki/genData.py | 121 | ||||
| -rwxr-xr-x | backend/data/enwiki/genDescData.py | 68 | ||||
| -rwxr-xr-x | backend/data/enwiki/genDumpIndexDb.py | 56 | ||||
| -rwxr-xr-x | backend/data/enwiki/genPageData.py | 39 | ||||
| -rwxr-xr-x | backend/data/enwiki/genRedirectData.py | 39 | ||||
| -rwxr-xr-x | backend/data/enwiki/lookupPage.py | 66 | ||||
| -rwxr-xr-x | backend/data/genEnwikiData.py | 49 |
11 files changed, 295 insertions, 201 deletions
@@ -14,7 +14,7 @@ /backend/data/imgsForReview/ /backend/data/imgsReviewed/ /backend/data/img/ -/backend/data/enwiki/*.gz +/backend/data/enwiki/*.bz2 /backend/data/enwiki/*.db /backend/data/enwiki/enwiki_content/ /backend/data/enwiki/.venv/ diff --git a/backend/data/README.md b/backend/data/README.md index cb9cd42..576c70e 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -24,7 +24,7 @@ File Generation Process - Supplementing with Wikipedia dump 1 Obtain data in enwiki/, as specified in it's README. 2 Run genEnwikiData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, reducedTol/names.txt, and the 'nodes' table. + enwiki/enwikiData.db, and the 'nodes' table. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. diff --git a/backend/data/dbpPickedLabels.txt b/backend/data/dbpPickedLabels.txt index 80a4770..d8f939e 100644 --- a/backend/data/dbpPickedLabels.txt +++ b/backend/data/dbpPickedLabels.txt @@ -88,7 +88,6 @@ balfouria (flatworm) ballana (leafhopper) Barcella Baryonyx -basuto (horse) Begonia Belbina belisarius (scorpion) @@ -320,7 +319,6 @@ Gymnopodium habeas corpus (pig) Halenia Halesia -halla (horse) Hallucigenia Harmothoe Harpa @@ -621,7 +619,6 @@ Thyreus tinerfe (ctenophore) Tiso Titanophora -tokara (horse) tortricidae (snakes) Tortrix Triaenophorus diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index e4e1aae..cdabf50 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -1,35 +1,28 @@ Downloaded Files ================ -- enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz <br> - Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror). - Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages). 
- Some file content and format information was available from - https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. -- enwiki-20220420-page.sql.gz <br> - Obtained like above. Contains page-table information including page id, namespace, title, etc. - Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table. -- enwiki-20220420-redirect.sql.gz <br> - Obtained like above. Contains page-redirection info. - Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. +- enwiki-20220501-pages-articles-multistream.xml.bz2 <br> + Obtained via <https://dumps.wikimedia.org/backup-index.html> + (site suggests downloading from a mirror). Contains text + content and metadata for pages in English Wikipedia + (current revision only, excludes talk pages). Some file + content and format information was available from + <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>. +- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br> + Obtained like above. Holds lines of the form offset1:pageId1:title1, + providing offsets, for each page, into the dump file, of a chunk of + 100 pages that includes it. Generated Files =============== -- enwiki\_content/enwiki-*.xml and enwiki-*.sql <br> - Uncompressed versions of downloaded files. +- dumpIndex.db <br> + Holds data from the enwiki dump index file. Generated by + genDumpIndexDb.py, and used by lookupPage.py to get content for a + given page title. - enwikiData.db <br> - An sqlite database representing data from the enwiki dump files. - Generation: - 1 Install python, and packages mwsql, mwxml, and mwparsefromhell. Example: - 1 On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`. - 2 Create a virtual environment in which to install packages via `python3 -m venv .venv`. - 3 Activate the virtual environment via `source .venv/bin/activate`. 
- 4 Install mwsql, mwxml, and mwparsefromhell via `pip install mwsql mwxml mwparsefromhell`. - 2 Run genPageData.py (still under the virtual environment), which creates the database, - reads from the page dump, and creates a 'pages' table. - 3 Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump, - and page ids from the 'pages' table. - 4 Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables, - and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some - wikitext within those pages to obtain the first descriptive paragraph, with markup removed. -- .venv <br> - Provides a python virtual environment for packages needed to generate data. + Holds data obtained from the enwiki dump file, in 'pages', + 'redirects', and 'descs' tables. Generated by genData.py, which uses + python packages mwxml and mwparserfromhell. <br> + Tables: <br> + - pages: id INT PRIMARY KEY, title TEXT UNIQUE + - redirects: id INT PRIMARY KEY, target TEXT + - descs: id INT PRIMARY KEY, desc TEXT diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py new file mode 100755 index 0000000..4f0d62e --- /dev/null +++ b/backend/data/enwiki/genData.py @@ -0,0 +1,121 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" +usageInfo += "and short-description info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +enwikiDb = "enwikiData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = re.compile("^ *[A-Z'\"]") +embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$") + # Recognises a 
self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +parensGrpRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") +def convertTemplateReplace(match): + if match.group(2) == None: + return "{} {}".format(match.group(1), match.group(4)) + else: + return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4)) +def parseDesc(text): + # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank + # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if len(lines) == 0: + if len(line) > 0: + if openBraceCount > 0 or line[0] == "{": + openBraceCount += line.count("{") + openBraceCount -= line.count("}") + skip = True + if openBracketCount > 0 or line[0] == "[": + openBracketCount += line.count("[") + openBracketCount -= line.count("]") + skip = True + if inComment or line.find("<!--") != -1: + if line.find("-->") != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ":": # Seems to help avoid disambiguation pages + return None + if descLineRegex.match(line) != None: + lines.append(line) + else: + if len(line) == 0: + return removeMarkup(" ".join(lines)) + lines.append(line) + if len(lines) > 0: + return removeMarkup(" ".join(lines)) + return None +def removeMarkup(content): + content = embeddedHtmlRegex.sub("", content) + content = convertTemplateRegex.sub(convertTemplateReplace, content) + content = 
mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = parensGrpRegex.sub("", content) + content = leftoverBraceRegex.sub("", content) + return content +# Other helper functions +def convertTitle(title): + return html.unescape(title).replace("_", " ") + +# Check for existing db +if os.path.exists(enwikiDb): + print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Read through dump file +print("Reading dump file") +with bz2.open(dumpFile, mode='rt') as file: + dump = mwxml.Dump.from_file(file) + pageNum = 0 + for page in dump: + pageNum += 1 + if pageNum % 1e4 == 0: + print("At page {}".format(pageNum)) + # Parse page + if page.namespace == 0: + try: + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print("Failed to add page with title \"{}\": {}".format(page.title, e)) + continue + if page.redirect != None: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py deleted file mode 100755 index 3602138..0000000 --- a/backend/data/enwiki/genDescData.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/python3 - -import re -import sys, os.path, glob -import mwxml, mwparserfromhell -import sqlite3 - -usageInfo = 
f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n" -usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml") -wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1))) -enwikiDb = "enwikiData.db" - -# Some regexps and functions for parsing wikitext -descLineRegex = "^ *[A-Z'\"]" -embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$" - # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}" -def convertTemplateReplace(match): - if match.group(2) == None: - return "{} {}".format(match.group(1), match.group(4)) - else: - return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4)) -parenGrpRegex = r" \([^()]*\)" -def parseDesc(text): - prevLine = None - for line in text.splitlines(): - if prevLine != None: - if line.strip() == "" or re.match(descLineRegex, line) != None: - return prevLine - else: - prevLine = None - if re.match(descLineRegex, line) != None: - line = re.sub(embeddedHtmlRegex, "", line) - line = re.sub(convertTemplateRegex, convertTemplateReplace, line) - line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup - prevLine = re.sub(parenGrpRegex, "", line) - if prevLine != None: - return prevLine - return None - -# Open db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Parse data -iterationNum = 0 -for fileName in wikiDumpFiles: - print("Processing file {}".format(fileName)) - dump = mwxml.Dump.from_file(open(fileName)) - for page in dump: - iterationNum += 1 - if iterationNum % 10000 == 0: - 
print("At iteration {}".format(iterationNum)) - # Parse page - if page.namespace == 0 and page.redirect == None: - revision = next(page) - desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py new file mode 100755 index 0000000..13f7eb6 --- /dev/null +++ b/backend/data/enwiki/genDumpIndexDb.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump index file,\n" +usageInfo += "and stores it's offset and title data to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines +indexDb = "dumpIndex.db" + +# Check for existing db +if os.path.exists(indexDb): + print("ERROR: Existing {}".format(indexDb), file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(indexDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, offset INT, next_offset INT)") +# Reading index file +lineRegex = re.compile(r"([^:]+):([^:]+):(.*)") +lastOffset = 0 +lineNum = 0 +titlesToAdd = [] +with bz2.open(indexFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print("At line {}".format(lineNum)) + # + match = lineRegex.fullmatch(line.rstrip()) + (offset, _, title) = match.group(1,2,3) + offset = int(offset) + if offset > lastOffset: + for t in titlesToAdd: + try: + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (t, lastOffset, offset)) + except sqlite3.IntegrityError as e: + # Accounts for certain entries in the file that have the same title + print("Failed on title \"{}\": {}".format(t, e)) + titlesToAdd = [] + lastOffset = offset + titlesToAdd.append(title) +for title in 
titlesToAdd: + try: + dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?)", (title, lastOffset, -1)) + except sqlite3.IntegrityError as e: + print("Failed on title \"{}\": {}".format(t, e)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py deleted file mode 100755 index 7522f1f..0000000 --- a/backend/data/enwiki/genPageData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path -from mwsql import Dump -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n" -usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n" -usageInfo += "a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -pageDumpFile = "enwiki-20220420-page.sql.gz" -enwikiDb = "enwikiData.db" - -# Check for existing db -if os.path.exists(enwikiDb): - print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -# Parse page data -dump = Dump.from_file(pageDumpFile) -iterationNum = 0 -for row in dump.rows(convert_dtypes=True): - iterationNum += 1 - if iterationNum % 1e6 == 0: - print("At iteration {}".format(iterationNum)) - # Add to map - if row[1] == 0: # If page in article namespace - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " "))) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py deleted file mode 100755 index e1aadc8..0000000 --- a/backend/data/enwiki/genRedirectData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sys, os.path -from mwsql import Dump -import sqlite3 - -usageInfo = f"usage: 
{sys.argv[0]}\n" -usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n" -usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n" -usageInfo += "a sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -redirectDumpFile = "enwiki-20220420-redirect.sql.gz" -enwikiDb = "enwikiData.db" - -# Open db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)") -dbCur2 = dbCon.cursor() -# Parse redirect data -dump = Dump.from_file(redirectDumpFile) -iterationNum = 0 -for row in dump.rows(convert_dtypes=True): - iterationNum += 1 - if iterationNum % 1e6 == 0: - print("At iteration {}".format(iterationNum)) - # Add to map - [pageId, namespace, title] = row[:3] - if namespace == 0: # If page is in the article namespace - row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone() - if row != None: - targetId = row[0] - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py new file mode 100755 index 0000000..5d6afe9 --- /dev/null +++ b/backend/data/enwiki/lookupPage.py @@ -0,0 +1,66 @@ +#!/usr/bin/python3 + +import sys, re +import bz2 +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]} title1\n" +usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n" +usageInfo += "using a dump index db, and prints the corresponding <page>.\n" +if len(sys.argv) != 2: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" +indexDb = "dumpIndex.db" +pageTitle = sys.argv[1] + +# Searching index file +print("Lookup offset in index db") +dbCon = sqlite3.connect(indexDb) +dbCur = dbCon.cursor() +row = dbCur.execute("SELECT title, offset, next_offset FROM offsets WHERE title = 
?", + (pageTitle.replace("_", " "),)).fetchone() +if row == None: + print("Title not found") + sys.exit(0) +(_, pageOffset, endOffset) = row +dbCon.close() +print("Found chunk at offset {}".format(pageOffset)) +# Read dump file +print("Reading dump file") +content = [] +with open(dumpFile, mode='rb') as file: + # Get uncompressed chunk + file.seek(pageOffset) + compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) + data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for page + lines = data.splitlines() + lineIdx = 0 + found = False + pageNum = 0 + while not found: + line = lines[lineIdx] + if line.lstrip() == "<page>": + pageNum += 1 + if pageNum > 100: + print("ERROR: Did not find title after 100 pages") + break + lineIdx += 1 + titleLine = lines[lineIdx] + if titleLine.lstrip() == '<title>' + pageTitle + '</title>': + found = True + print("Found title in chunk as page {}".format(pageNum)) + content.append(line) + content.append(titleLine) + while True: + lineIdx += 1 + line = lines[lineIdx] + content.append(line) + if line.lstrip() == "</page>": + break + lineIdx += 1 +# Print content +print("Content: ") +print("\n".join(content)) diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py index 48fd2c6..879ecf6 100755 --- a/backend/data/genEnwikiData.py +++ b/backend/data/genEnwikiData.py @@ -4,15 +4,14 @@ import sys, re import sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki data from enwiki/, a list of node names," -usageInfo += "and node and name data from a sqlite database, and adds\n" -usageInfo += "description data for names that don't have them\n" +usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data" +usageInfo += "from a sqlite database, and adds description data for names that\n" +usageInfo += "don't have them.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) enwikiDb = "enwiki/enwikiData.db" 
-namesFile = "reducedTol/names.txt" dbFile = "data.db" # Open dbs @@ -20,40 +19,48 @@ enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Read in names to check -print("Getting names to check") +# Get node names without descriptions +print("Getting node names") nodeNames = set() -with open(namesFile) as file: - for line in file: - nodeNames.add(line.rstrip()) +query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name WHERE desc IS NULL" +for row in dbCur.execute(query): + nodeNames.add(row[0]) print("Found {} names".format(len(nodeNames))) -# Remove names that have descriptions -print("Checking for existing name descriptions") -namesWithDescs = set() -for name in nodeNames: - row = dbCur.execute("SELECT name FROM descs where name = ?", (name,)).fetchone() - if row != None: - namesWithDescs.add(name) -nodeNames.difference_update(namesWithDescs) -print("Remaining nodes: {}".format(len(nodeNames))) # Find page id for each node name -nodeToPageId = {} print("Getting node page-ids") +nodeToPageId = {} +iterNum = 0 for name in nodeNames: - row = enwikiCur.execute("SELECT id FROM pages where pages.title = ? COLLATE nocase", (name,)).fetchone() + iterNum += 1 + if iterNum % 1e4 == 0: + print("At iteration {}".format(iterNum)) + # + row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? 
COLLATE NOCASE", (name,)).fetchone() if row != None: nodeToPageId[name] = row[0] # Resolve redirects print("Resolving redirects") redirectingNames = set() +iterNum = 0 for (name, pageId) in nodeToPageId.items(): - row = enwikiCur.execute("SELECT target_id FROM redirects where redirects.id = ?", (pageId,)).fetchone() + iterNum += 1 + if iterNum % 1000 == 0: + print("At iteration {}".format(iterNum)) + # + row = enwikiCur.execute( + "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?", + (pageId,)).fetchone() if row != None: nodeToPageId[name] = row[0] redirectingNames.add(name) # Add descriptions for each node print("Adding description data") +iterNum = 0 for (name, pageId) in nodeToPageId.items(): + iterNum += 1 + if iterNum % 1000 == 0: + print("At iteration {}".format(iterNum)) + # row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() if row != None: dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0)) |
