Add scripts for obtaining/sending/displaying wikipedia descriptions

Add backend/data/enwiki/ directory containing scripts and instructive READMEs. Adjust some other scripts to generate 'eol_ids' sqlite table separate from 'names'. Make server respond to /data/desc requests, and have client TileInfo component display response data. Also adjust .gitignore entries to be root-relative.
author: Terry Truong <terry06890@gmail.com> 2022-05-04 01:17:06 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-05-04 01:17:06 +1000
commit: 90a5e15bb824b84e5bb60337d6a57a1394090dc6 (patch)
tree: 661ea356c8d83b74d16f19d3555b0a1d3eb6eb56 /backend
parent: ec29e5731136c74a1991e2f93b5e233747f2a230 (diff)
11 files changed, 300 insertions, 32 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index e639cb6..8791fb4 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -1,21 +1,31 @@
 File Generation Process
 =======================
-1   Obtain data in otol/ and eol/, as specified in their README files.
-2   Run genOtolData.py, which creates data.db, and adds a 'nodes'
-    table using data in otol/*.
-3   Run genEolNameData.py, which adds a 'names' table to data.db, 
-    using data in eol/vernacularNames.csv and the 'nodes' table.
-4   Run genSpellfixNameData.py, which adds a 'spellfix\_alt\_names'
-    table to data.db, using data in the 'names' table.
-5   Use downloadImgsForReview.py to download EOL images into imgsForReview/.
-    It uses data in eol/imagesList.db, and the 'names' table.
-6   Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique
-    images in imgsReviewed/ (uses 'names' to display common names).
-7   Use genImgsForWeb.py to create cropped/resized images in img/, using
-    images in imgsReviewed, and also to add an 'images' table to data.db.
+
+1   Tree Structure Data
+    1   Obtain data in otol/, as specified in its README.
+    2   Run genOtolData.py, which creates data.db, and adds a 'nodes'
+        table using data in otol/*.
+2   Name Data for Search
+    1   Obtain data in eol/, as specified in its README.
+    2   Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, 
+        using data in eol/vernacularNames.csv and the 'nodes' table.
+    3   Run genSpellfixNameData.py, which adds a 'spellfix\_alt\_names'
+        table to data.db, using data in the 'names' table.
+3   Image Data
+    1   Use downloadImgsForReview.py to download EOL images into imgsForReview/.
+        It uses data in eol/imagesList.db, and the 'eol_ids' table.
+    2   Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique
+        images in imgsReviewed/ (uses 'names' and 'eol_ids' to display extra info).
+    3   Use genImgsForWeb.py to create cropped/resized images in img/, using
+        images in imgsReviewed, and also to add an 'images' table to data.db.
+4   Node Description Data
+    1   Obtain data in enwiki/, as specified in its README.
+    2   Run genEnwikiData.py, which adds a 'descs' table to data.db,
+        using data in enwiki/enwikiData.db, and the 'nodes' table.
 
 spellfix.so
 ===========
+
 This file provides the spellfix1 extension for Sqlite, and
 is used for responding to fuzzy-search requests.
 
diff --git a/backend/data/downloadImgsForReview.py b/backend/data/downloadImgsForReview.py
index 12b52ff..03e22a8 100755
--- a/backend/data/downloadImgsForReview.py
+++ b/backend/data/downloadImgsForReview.py
@@ -31,7 +31,7 @@ eolIds = set()
 print("Reading in EOL IDs")
 dbCon = sqlite3.connect(dbFile)
 dbCur = dbCon.cursor()
-for row in dbCur.execute("SELECT DISTINCT eol_id FROM names"):
+for row in dbCur.execute("SELECT id FROM eol_ids"):
 	eolIds.add(row[0])
 dbCon.close()
 # Get eol-ids from images db
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
new file mode 100644
index 0000000..8e748c9
--- /dev/null
+++ b/backend/data/enwiki/README.md
@@ -0,0 +1,35 @@
+Downloaded Files
+================
+-   enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz:
+    Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror).
+    Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages).
+    Some file content and format information was available from
+    https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download.
+-   enwiki-20220420-page.sql.gz:
+    Obtained like above. Contains page-table information including page id, namespace, title, etc.
+    Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table.
+-   enwiki-20220420-redirect.sql.gz:
+    Obtained like above. Contains page-redirection info.
+    Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download.
+
+Generated Files
+===============
+-   enwiki\_content/enwiki-*.xml and enwiki-*.sql:
+    Uncompressed versions of downloaded files.
+-   enwikiData.db:
+    An sqlite database representing data from the enwiki dump files.
+    Generation: 
+    1   Install python, and packages mwsql, mwxml, and mwparserfromhell. Example:
+        1   On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`.
+        2   Create a virtual environment in which to install packages via `python3 -m venv .venv`.
+        3   Activate the virtual environment via `source .venv/bin/activate`.
+        4   Install mwsql, mwxml, and mwparserfromhell via `pip install mwsql mwxml mwparserfromhell`.
+    2   Run genPageData.py (still under the virtual environment), which creates the database,
+        reads from the page dump, and creates a 'pages' table.
+    3   Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump,
+        and page ids from the 'pages' table.
+    4   Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables,
+        and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some
+        wikitext within those pages to obtain the first descriptive paragraph, with markup removed.
+-   .venv:
+    Provides a python virtual environment for packages needed to generate data.
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
new file mode 100755
index 0000000..3602138
--- /dev/null
+++ b/backend/data/enwiki/genDescData.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+import re
+import sys, os.path, glob
+import mwxml, mwparserfromhell
+import sqlite3
+
+usageInfo =  f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
+usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n"
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml")
+wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1)))
+enwikiDb = "enwikiData.db"
+
+# Some regexps and functions for parsing wikitext
+descLineRegex = "^ *[A-Z'\"]"
+embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
+	# Recognises a self-closing HTML tag, a tag with 0 children, a tag with 1 child that has 0 children, or an unclosed tag
+convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
+def convertTemplateReplace(match):
+	if match.group(2) == None:
+		return "{} {}".format(match.group(1), match.group(4))
+	else:
+		return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))
+parenGrpRegex = r" \([^()]*\)"
+def parseDesc(text):
+	prevLine = None
+	for line in text.splitlines():
+		if prevLine != None:
+			if line.strip() == "" or re.match(descLineRegex, line) != None:
+				return prevLine
+			else:
+				prevLine = None
+		if re.match(descLineRegex, line) != None:
+			line = re.sub(embeddedHtmlRegex, "", line)
+			line = re.sub(convertTemplateRegex, convertTemplateReplace, line)
+			line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup
+			prevLine = re.sub(parenGrpRegex, "", line)
+	if prevLine != None:
+		return prevLine
+	return None
+
+# Open db
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
+# Parse data
+iterationNum = 0
+for fileName in wikiDumpFiles:
+	print("Processing file {}".format(fileName))
+	dump = mwxml.Dump.from_file(open(fileName))
+	for page in dump:
+		iterationNum += 1
+		if iterationNum % 10000 == 0:
+			print("At iteration {}".format(iterationNum))
+		# Parse page
+		if page.namespace == 0 and page.redirect == None:
+			revision = next(page)
+			desc = parseDesc(revision.text)
+			if desc != None:
+				dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py
new file mode 100755
index 0000000..7522f1f
--- /dev/null
+++ b/backend/data/enwiki/genPageData.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import sys, os.path
+from mwsql import Dump
+import sqlite3
+
+usageInfo =  f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n"
+usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n"
+usageInfo += "a sqlite db.\n"
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+pageDumpFile = "enwiki-20220420-page.sql.gz"
+enwikiDb = "enwikiData.db"
+
+# Check for existing db
+if os.path.exists(enwikiDb):
+	print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr)
+	sys.exit(1)
+# Create db
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
+dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
+# Parse page data
+dump = Dump.from_file(pageDumpFile)
+iterationNum = 0
+for row in dump.rows(convert_dtypes=True):
+	iterationNum += 1
+	if iterationNum % 1e6 == 0:
+		print("At iteration {}".format(iterationNum))
+	# Add to map
+	if row[1] == 0: # If page in article namespace
+		dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " ")))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py
new file mode 100755
index 0000000..e1aadc8
--- /dev/null
+++ b/backend/data/enwiki/genRedirectData.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import sys, os.path
+from mwsql import Dump
+import sqlite3
+
+usageInfo =  f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n"
+usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n"
+usageInfo += "a sqlite db.\n"
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+redirectDumpFile = "enwiki-20220420-redirect.sql.gz"
+enwikiDb = "enwikiData.db"
+
+# Open db
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)")
+dbCur2 = dbCon.cursor()
+# Parse redirect data
+dump = Dump.from_file(redirectDumpFile)
+iterationNum = 0
+for row in dump.rows(convert_dtypes=True):
+	iterationNum += 1
+	if iterationNum % 1e6 == 0:
+		print("At iteration {}".format(iterationNum))
+	# Add to map
+	[pageId, namespace, title] = row[:3]
+	if namespace == 0: # If page is in the article namespace
+		row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone()
+		if row != None:
+			targetId = row[0]
+			dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md
index 3ce9799..d863099 100644
--- a/backend/data/eol/README.md
+++ b/backend/data/eol/README.md
@@ -1,15 +1,15 @@
 Downloaded Files
 ================
--   imagesList.tgz
-    Obtained from https://opendata.eol.org/dataset/images-list on 24/04/2022
-    Listed as being last updated on 05/02/2020
--   vernacularNames.csv
-    Obtained from https://opendata.eol.org/dataset/vernacular-names on 24/04/2022
-    Listed as being last updated on 27/10/2020
+-   imagesList.tgz:
+    Obtained from https://opendata.eol.org/dataset/images-list on 24/04/2022.
+    Listed as being last updated on 05/02/2020.
+-   vernacularNames.csv:
+    Obtained from https://opendata.eol.org/dataset/vernacular-names on 24/04/2022.
+    Listed as being last updated on 27/10/2020.
 
 Generated Files
 ===============
--   imagesList/
-    Obtained by extracting imagesList.tgz
--   imagesList.db
-    Represents data from eol/imagesList/*, and is created by genImagesListDb.sh
+-   imagesList/:
+    Obtained by extracting imagesList.tgz.
+-   imagesList.db:
+    Represents data from eol/imagesList/*, and is created by genImagesListDb.sh.
diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py
new file mode 100755
index 0000000..f1490b6
--- /dev/null
+++ b/backend/data/genEnwikiData.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3
+
+usageInfo =  f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki data from enwiki/, along with node and name data\n"
+usageInfo += "from a sqlite database, associates nodes with enwiki pages, and adds\n"
+usageInfo += "alt-name and description information for those nodes.\n"
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+enwikiDb = "enwiki/enwikiData.db"
+dbFile = "data.db"
+
+# Open dbs
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Find page id for each node name
+nodeToPageId = {}
+print("Getting node page-ids")
+iterationNum = 0
+for row in dbCur.execute("SELECT name from nodes"):
+	iterationNum += 1
+	if iterationNum % 1e4 == 0:
+		print("At iteration {}".format(iterationNum))
+	#
+	name = row[0]
+	row = enwikiCur.execute("SELECT id FROM pages where pages.title = ? COLLATE nocase", (name,)).fetchone()
+	if row != None:
+		nodeToPageId[name] = row[0]
+# Resolve redirects
+print("Resolving redirects")
+redirectingNames = set()
+iterationNum = 0
+for (name, pageId) in nodeToPageId.items():
+	iterationNum += 1
+	if iterationNum % 1e4 == 0:
+		print("At iteration {}".format(iterationNum))
+	#
+	row = enwikiCur.execute("SELECT target_id FROM redirects where redirects.id = ?", (pageId,)).fetchone()
+	if row != None:
+		nodeToPageId[name] = row[0]
+		redirectingNames.add(name)
+# Add descriptions for each node
+print("Adding node description data")
+dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)")
+iterationNum = 0
+for (name, pageId) in nodeToPageId.items():
+	iterationNum += 1
+	if iterationNum % 1e4 == 0:
+		print("At iteration {}".format(iterationNum))
+	#
+	row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
+	if row != None:
+		dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0))
+# Close dbs
+dbCon.commit()
+dbCon.close()
+enwikiCon.commit()
+enwikiCon.close()
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index 200b459..74d9329 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -62,8 +62,9 @@ with open(vnamesFile, newline="") as csvfile:
 # Open db connection
 dbCon = sqlite3.connect(dbFile)
 dbCur = dbCon.cursor()
-# Create 'names' table
-dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, pref_alt INT, PRIMARY KEY(name, alt_name))")
+# Create tables
+dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, PRIMARY KEY(name, alt_name))")
+dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
 # Iterate through 'nodes' table, resolving to canonical-names
 usedPids = set()
 unresolvedNodeNames = set()
@@ -85,11 +86,12 @@ for row in dbCur2.execute("SELECT name FROM nodes"):
 			usedPids.add(pidToUse)
 			altNames = {name}
 			preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
+			dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name))
 			for n in pidToNames[pidToUse]:
 				altNames.add(n)
 			for n in altNames:
 				isPreferred = 1 if (n == preferredName) else 0
-				dbCur.execute("INSERT INTO names VALUES (?, ?, ?, ?)", (name, n, pidToUse, isPreferred))
+				dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred))
 	elif name in nameToPids:
 		unresolvedNodeNames.add(name)
 # Iterate through unresolved nodes, resolving to vernacular-names
@@ -108,11 +110,12 @@ for name in unresolvedNodeNames:
 		usedPids.add(pidToUse)
 		altNames = {name}
 		preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
+		dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name)) # Column order is (id, name)
 		for n in pidToNames[pidToUse]:
 			altNames.add(n)
 		for n in altNames:
 			isPreferred = 1 if (n == preferredName) else 0
-			dbCur.execute("INSERT INTO names VALUES (?, ?, ?, ?)", (name, n, pidToUse, isPreferred))
+			dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred))
 # Close db
 dbCon.commit()
 dbCon.close()
diff --git a/backend/data/reviewImgs.py b/backend/data/reviewImgs.py
index 5dcd52e..8987007 100755
--- a/backend/data/reviewImgs.py
+++ b/backend/data/reviewImgs.py
@@ -125,10 +125,12 @@ class EolImgReviewer:
 		# Update title
 		firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
 		lastImgIdx = self.imgListIdx
-		row = dbCur.execute("SELECT alt_name, eol_id, pref_alt FROM names WHERE eol_id = ? and pref_alt = 1",
-			(self.nextEolId,)).fetchone()
+		query = "SELECT eol_ids.id, names.alt_name, names.pref_alt FROM" \
+			" names INNER JOIN eol_ids ON eol_ids.name = names.name" \
+			" WHERE id = ? and pref_alt = 1"
+		row = dbCur.execute(query, (self.nextEolId,)).fetchone()
 		if row != None:
-			commonName = row[0]
+			commonName = row[1]
 			self.root.title("Reviewing EOL ID {}, aka \"{}\" (imgs {} to {} out of {})".format(
 				self.nextEolId, commonName, firstImgIdx, lastImgIdx, len(self.imgList)))
 		else:
diff --git a/backend/server.py b/backend/server.py
index 1c09ad7..580b4fb 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -52,9 +52,10 @@ def lookupNode(name):
 		nodeObj["img"] = nodeNameToFile(match.group(1), cur)
 		if nodeObj["img"] == None:
 			nodeObj["img"] = nodeNameToFile(match.group(2), cur)
+	#
 	return nodeObj;
 def nodeNameToFile(name, cur):
-	row = cur.execute("SELECT name, eol_id FROM names WHERE name = ?", (name,)).fetchone()
+	row = cur.execute("SELECT name, id FROM eol_ids WHERE name = ?", (name,)).fetchone()
 	if row == None:
 		return None
 	eolId = row[1]
@@ -92,6 +93,10 @@ def lookupName(name):
 		hasMore = True
 		del results[-1]
 	return json.dumps([results, hasMore])
+def lookupDesc(name):
+	cur = dbCon.cursor()
+	row = cur.execute("SELECT desc, redirected from descs WHERE descs.name = ?", (name,)).fetchone()
+	return json.dumps([row[0], row[1] == 1] if row != None else None)
 
 class DbServer(BaseHTTPRequestHandler):
 	def do_GET(self):
@@ -158,6 +163,9 @@ class DbServer(BaseHTTPRequestHandler):
 			elif reqType == "search":
 				self.respondJson(lookupName(name))
 				return
+			elif reqType == "desc":
+				self.respondJson(lookupDesc(name))
+				return
 		self.send_response(404)
 		self.end_headers()
 		self.end_headers()
author	Terry Truong <terry06890@gmail.com>	2022-05-04 01:17:06 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-05-04 01:17:06 +1000
commit	90a5e15bb824b84e5bb60337d6a57a1394090dc6 (patch)
tree	661ea356c8d83b74d16f19d3555b0a1d3eb6eb56 /backend
parent	ec29e5731136c74a1991e2f93b5e233747f2a230 (diff)