aboutsummaryrefslogtreecommitdiff
path: root/backend/data/enwiki
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-05-04 01:17:06 +1000
committerTerry Truong <terry06890@gmail.com>2022-05-04 01:17:06 +1000
commit90a5e15bb824b84e5bb60337d6a57a1394090dc6 (patch)
tree661ea356c8d83b74d16f19d3555b0a1d3eb6eb56 /backend/data/enwiki
parentec29e5731136c74a1991e2f93b5e233747f2a230 (diff)
Add scripts for obtaining/sending/displaying wikipedia descriptions
Add backend/data/enwiki/ directory containing scripts and instructive READMEs. Adjust some other scripts to generate 'eol_ids' sqlite table separate from 'names'. Make server respond to /data/desc requests, and have client TileInfo component display response data. Also adjust .gitignore entries to be root-relative.
Diffstat (limited to 'backend/data/enwiki')
-rw-r--r--backend/data/enwiki/README.md35
-rwxr-xr-xbackend/data/enwiki/genDescData.py68
-rwxr-xr-xbackend/data/enwiki/genPageData.py39
-rwxr-xr-xbackend/data/enwiki/genRedirectData.py39
4 files changed, 181 insertions, 0 deletions
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
new file mode 100644
index 0000000..8e748c9
--- /dev/null
+++ b/backend/data/enwiki/README.md
@@ -0,0 +1,35 @@
+Downloaded Files
+================
+- enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz:
+ Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror).
+ Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages).
+ Some file content and format information was available from
+ https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download.
+- enwiki-20220420-page.sql.gz:
+ Obtained like above. Contains page-table information including page id, namespace, title, etc.
+ Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table.
+- enwiki-20220420-redirect.sql.gz:
+ Obtained like above. Contains page-redirection info.
+ Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download.
+
+Generated Files
+===============
+- enwiki\_content/enwiki-*.xml and enwiki-*.sql:
+ Uncompressed versions of downloaded files.
+- enwikiData.db:
+ An sqlite database representing data from the enwiki dump files.
+ Generation:
+ 1 Install python, and packages mwsql, mwxml, and mwparserfromhell. Example:
+ 1 On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`.
+ 2 Create a virtual environment in which to install packages via `python3 -m venv .venv`.
+ 3 Activate the virtual environment via `source .venv/bin/activate`.
+ 4 Install mwsql, mwxml, and mwparserfromhell via `pip install mwsql mwxml mwparserfromhell`.
+ 2 Run genPageData.py (still under the virtual environment), which creates the database,
+ reads from the page dump, and creates a 'pages' table.
+ 3 Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump,
+ and page ids from the 'pages' table.
+ 4 Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables,
+ and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some
+ wikitext within those pages to obtain the first descriptive paragraph, with markup removed.
+- .venv:
+ Provides a python virtual environment for packages needed to generate data.
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
new file mode 100755
index 0000000..3602138
--- /dev/null
+++ b/backend/data/enwiki/genDescData.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3
+
+import re
+import sys, os.path, glob
+import mwxml, mwparserfromhell
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n"
+usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n"
+# The script takes no arguments; any argument just prints the usage message
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+# Input XML dumps, sorted numerically by their 'multistream' suffix so the
+# files are processed in dump order
+wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml")
+wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1)))
+# Output sqlite database (already created by genPageData.py, per the README)
+enwikiDb = "enwikiData.db"
+
+# Some regexps and functions for parsing wikitext
+# Matches a line that starts like a descriptive paragraph (optionally-indented
+# text beginning with a capital letter or a quote character)
+descLineRegex = "^ *[A-Z'\"]"
+embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$"
+	# Recognises a self-closing HTML tag, an HTML comment, a tag with 0 children,
+	# a tag with 1 child with 0 children, or an unclosed tag
+# Matches a wikitext {{convert|...}} template, capturing the first value, an
+# optional range word ('to' or '-') with second value, and the unit
+convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}"
+def convertTemplateReplace(match):
+	# Replace a matched convert-template with plain text (eg: '10 to 20 km')
+	if match.group(2) == None:
+		return "{} {}".format(match.group(1), match.group(4))
+	else:
+		return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4))
+# Matches a trailing parenthesised group, for removal from descriptions
+parenGrpRegex = r" \([^()]*\)"
+def parseDesc(text):
+	"""Given a page's wikitext, attempt to find its first descriptive
+	paragraph, returning it with markup removed, or None on failure."""
+	prevLine = None # Holds a candidate description line awaiting confirmation
+	for line in text.splitlines():
+		if prevLine != None:
+			# Accept the candidate only if followed by a blank line or another
+			# description-like line; otherwise discard it as stray markup
+			if line.strip() == "" or re.match(descLineRegex, line) != None:
+				return prevLine
+			else:
+				prevLine = None
+		if re.match(descLineRegex, line) != None:
+			# Strip embedded HTML, expand convert-templates, then remove
+			# remaining wikitext markup and parenthesised groups
+			line = re.sub(embeddedHtmlRegex, "", line)
+			line = re.sub(convertTemplateRegex, convertTemplateReplace, line)
+			line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup
+			prevLine = re.sub(parenGrpRegex, "", line)
+	# Accept a candidate found on the final line of the text
+	if prevLine != None:
+		return prevLine
+	return None
+
+# Open db, and create the output table
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
+# Parse data
+iterationNum = 0
+for fileName in wikiDumpFiles:
+	print("Processing file {}".format(fileName))
+	# NOTE(review): the opened file handle is never explicitly closed
+	dump = mwxml.Dump.from_file(open(fileName))
+	for page in dump:
+		iterationNum += 1
+		if iterationNum % 10000 == 0:
+			print("At iteration {}".format(iterationNum))
+		# Parse page, skipping non-article and redirect pages
+		if page.namespace == 0 and page.redirect == None:
+			# The dump holds the current revision only (per the README),
+			# so the first revision is the page's text
+			revision = next(page)
+			desc = parseDesc(revision.text)
+			if desc != None:
+				dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py
new file mode 100755
index 0000000..7522f1f
--- /dev/null
+++ b/backend/data/enwiki/genPageData.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import sys, os.path
+from mwsql import Dump
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n"
+usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n"
+usageInfo += "a sqlite db.\n"
+# The script takes no arguments; any argument just prints the usage message
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+# Input dump and output database
+pageDumpFile = "enwiki-20220420-page.sql.gz"
+enwikiDb = "enwikiData.db"
+
+# Check for existing db (this script creates it, so refuse to overwrite)
+if os.path.exists(enwikiDb):
+	print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr)
+	sys.exit(1)
+# Create db
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
+# Case-insensitive title index, used for lookups by genRedirectData.py
+dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
+# Parse page data
+# Row fields used: row[0] = page id, row[1] = namespace, row[2] = title
+# (per the MediaWiki page-table format referenced in the README)
+dump = Dump.from_file(pageDumpFile)
+iterationNum = 0
+for row in dump.rows(convert_dtypes=True):
+	iterationNum += 1
+	if iterationNum % 1e6 == 0:
+		print("At iteration {}".format(iterationNum))
+	# Add to map
+	if row[1] == 0: # If page in article namespace
+		# Titles are stored with underscores converted to spaces
+		dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " ")))
+# Close db
+dbCon.commit()
+dbCon.close()
diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py
new file mode 100755
index 0000000..e1aadc8
--- /dev/null
+++ b/backend/data/enwiki/genRedirectData.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import sys, os.path
+from mwsql import Dump
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n"
+usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n"
+usageInfo += "a sqlite db.\n"
+# The script takes no arguments; any argument just prints the usage message
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+# Input dump and output database
+redirectDumpFile = "enwiki-20220420-redirect.sql.gz"
+enwikiDb = "enwikiData.db"
+
+# Open db (created by genPageData.py), and create the output table
+dbCon = sqlite3.connect(enwikiDb)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)")
+# Second cursor, for reading the 'pages' table while inserting with dbCur
+dbCur2 = dbCon.cursor()
+# Parse redirect data
+dump = Dump.from_file(redirectDumpFile)
+iterationNum = 0
+for row in dump.rows(convert_dtypes=True):
+	iterationNum += 1
+	if iterationNum % 1e6 == 0:
+		print("At iteration {}".format(iterationNum))
+	# Add to map
+	# Row fields used: redirecting page id, target namespace, target title
+	[pageId, namespace, title] = row[:3]
+	if namespace == 0: # If page is in the article namespace
+		# Resolve the target title (underscores converted to spaces, matching
+		# genPageData.py) to a page id; redirects to unknown titles are skipped
+		row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone()
+		if row != None:
+			targetId = row[0]
+			dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId))
+# Close db
+dbCon.commit()
+dbCon.close()