From 90a5e15bb824b84e5bb60337d6a57a1394090dc6 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 4 May 2022 01:17:06 +1000 Subject: Add scripts for obtaining/sending/displaying wikipedia descriptions Add backend/data/enwiki/ directory containing scripts and instructive READMEs. Adjust some other scripts to generate 'eol_ids' sqlite table separate from 'names'. Make server respond to /data/desc requests, and have client TileInfo component display response data. Also adjust .gitignore entries to be root-relative. --- backend/data/enwiki/README.md | 35 +++++++++++++++++ backend/data/enwiki/genDescData.py | 68 ++++++++++++++++++++++++++++++++++ backend/data/enwiki/genPageData.py | 39 +++++++++++++++++++ backend/data/enwiki/genRedirectData.py | 39 +++++++++++++++++++ 4 files changed, 181 insertions(+) create mode 100644 backend/data/enwiki/README.md create mode 100755 backend/data/enwiki/genDescData.py create mode 100755 backend/data/enwiki/genPageData.py create mode 100755 backend/data/enwiki/genRedirectData.py (limited to 'backend/data/enwiki') diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md new file mode 100644 index 0000000..8e748c9 --- /dev/null +++ b/backend/data/enwiki/README.md @@ -0,0 +1,35 @@ +Downloaded Files +================ +- enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz: + Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror). + Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages). + Some file content and format information was available from + https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. +- enwiki-20220420-page.sql.gz: + Obtained like above. Contains page-table information including page id, namespace, title, etc. + Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table. +- enwiki-20220420-redirect.sql.gz: + Obtained like above. 
Contains page-redirection info. + Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. + +Generated Files +=============== +- enwiki\_content/enwiki-*.xml and enwiki-*.sql: + Uncompressed versions of downloaded files. +- enwikiData.db: + An sqlite database representing data from the enwiki dump files. + Generation: + 1 Install python, and packages mwsql, mwxml, and mwparserfromhell. Example: + 1 On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`. + 2 Create a virtual environment in which to install packages via `python3 -m venv .venv`. + 3 Activate the virtual environment via `source .venv/bin/activate`. + 4 Install mwsql, mwxml, and mwparserfromhell via `pip install mwsql mwxml mwparserfromhell`. + 2 Run genPageData.py (still under the virtual environment), which creates the database, + reads from the page dump, and creates a 'pages' table. + 3 Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump, + and page ids from the 'pages' table. + 4 Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables, + and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some + wikitext within those pages to obtain the first descriptive paragraph, with markup removed. +- .venv: + Provides a python virtual environment for packages needed to generate data. 
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py new file mode 100755 index 0000000..3602138 --- /dev/null +++ b/backend/data/enwiki/genDescData.py @@ -0,0 +1,68 @@ +#!/usr/bin/python3 + +import re +import sys, os.path, glob +import mwxml, mwparserfromhell +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n" +usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml") # Process dump parts in numeric order of their 'multistreamN' suffix +wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1))) +enwikiDb = "enwikiData.db" + +# Regexes and helper functions for reducing wikitext to a plain-text description +descLineRegex = "^ *[A-Z'\"]" # A line that starts like descriptive prose (optionally-indented capital/quote) +embeddedHtmlRegex = r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$" + # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag (NOTE(review): this regex looks garbled — it contains an empty '||' alternative and an unbalanced '(' — angle-bracketed text may have been stripped in transit; verify against the original script) +convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}" +def convertTemplateReplace(match): # Replaces a {{convert}} template match with plain text, e.g. '10 km' or '1 to 2 mi' + if match.group(2) == None: + return "{} {}".format(match.group(1), match.group(4)) + else: + return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4)) +parenGrpRegex = r" \([^()]*\)" # A space followed by a parenthesized group (removed from descriptions) +def parseDesc(text): # Returns the first descriptive paragraph of wikitext 'text' with markup removed, or None + prevLine = None + for line in text.splitlines(): + if prevLine != None: + if line.strip() == "" or re.match(descLineRegex, line) != None: + return prevLine + else: + prevLine = None + if re.match(descLineRegex, line) != None: + line = re.sub(embeddedHtmlRegex, "", line) + line = re.sub(convertTemplateRegex, convertTemplateReplace, line) + line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup + prevLine = re.sub(parenGrpRegex, "", line) + if prevLine != None: + return prevLine + return None + +# Open db (created beforehand by genPageData.py) +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() 
+dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Parse data: walk each XML dump file and store a description per qualifying page +iterationNum = 0 +for fileName in wikiDumpFiles: + print("Processing file {}".format(fileName)) + dump = mwxml.Dump.from_file(open(fileName)) + for page in dump: + iterationNum += 1 + if iterationNum % 10000 == 0: + print("At iteration {}".format(iterationNum)) + # Parse page: article namespace only, skipping redirect pages + if page.namespace == 0 and page.redirect == None: + revision = next(page) # dump holds the current revision only, so take the first + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py new file mode 100755 index 0000000..7522f1f --- /dev/null +++ b/backend/data/enwiki/genPageData.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import sys, os.path +from mwsql import Dump +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n" +usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n" +usageInfo += "a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +pageDumpFile = "enwiki-20220420-page.sql.gz" +enwikiDb = "enwikiData.db" + +# Refuse to clobber an existing db +if os.path.exists(enwikiDb): + print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") # NOTE(review): index is COLLATE NOCASE but the title lookup in genRedirectData.py compares with the default collation — confirm this index is actually used +# Parse page data +dump = Dump.from_file(pageDumpFile) +iterationNum = 0 +for row in dump.rows(convert_dtypes=True): + iterationNum += 1 + if iterationNum % 1e6 == 0: # NOTE(review): 1e6 is a float; works, but 1000000 would be clearer + print("At iteration {}".format(iterationNum)) + # row holds (page id, namespace, title, ...) per the MediaWiki page-table schema + if row[1] == 0: # If page in article namespace + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], row[2].replace("_", " "))) +# Close db 
+dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py new file mode 100755 index 0000000..e1aadc8 --- /dev/null +++ b/backend/data/enwiki/genRedirectData.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import sys, os.path +from mwsql import Dump +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n" +usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n" +usageInfo += "a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +redirectDumpFile = "enwiki-20220420-redirect.sql.gz" +enwikiDb = "enwikiData.db" + +# Open the db created by genPageData.py (the 'pages' table is queried below) +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)") +dbCur2 = dbCon.cursor() # separate cursor so lookups don't disturb dbCur's inserts +# Parse redirect data, resolving each target title to a page id +dump = Dump.from_file(redirectDumpFile) +iterationNum = 0 +for row in dump.rows(convert_dtypes=True): + iterationNum += 1 + if iterationNum % 1e6 == 0: # NOTE(review): 1e6 is a float; works, but 1000000 would be clearer + print("At iteration {}".format(iterationNum)) + # row holds (source page id, target namespace, target title, ...) from the redirect table + [pageId, namespace, title] = row[:3] + if namespace == 0: # If redirect targets the article namespace (NOTE(review): per the redirect-table schema this is the target's namespace, not the source page's — confirm) + row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone() + if row != None: + targetId = row[0] + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId)) +# Close db +dbCon.commit() +dbCon.close() -- cgit v1.2.3