diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-04 01:17:06 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-04 01:17:06 +1000 |
| commit | 90a5e15bb824b84e5bb60337d6a57a1394090dc6 (patch) | |
| tree | 661ea356c8d83b74d16f19d3555b0a1d3eb6eb56 /backend | |
| parent | ec29e5731136c74a1991e2f93b5e233747f2a230 (diff) | |
Add scripts for obtaining/sending/displaying wikipedia descriptions
Add backend/data/enwiki/ directory containing scripts and instructive
READMEs. Adjust some other scripts to generate 'eol_ids' sqlite table
separate from 'names'. Make server respond to /data/desc requests,
and have client TileInfo component display response data.
Also adjust .gitignore entries to be root-relative.
Diffstat (limited to 'backend')
| -rw-r--r-- | backend/data/README.md | 36 | ||||
| -rwxr-xr-x | backend/data/downloadImgsForReview.py | 2 | ||||
| -rw-r--r-- | backend/data/enwiki/README.md | 35 | ||||
| -rwxr-xr-x | backend/data/enwiki/genDescData.py | 68 | ||||
| -rwxr-xr-x | backend/data/enwiki/genPageData.py | 39 | ||||
| -rwxr-xr-x | backend/data/enwiki/genRedirectData.py | 39 | ||||
| -rw-r--r-- | backend/data/eol/README.md | 20 | ||||
| -rwxr-xr-x | backend/data/genEnwikiData.py | 64 | ||||
| -rwxr-xr-x | backend/data/genEolNameData.py | 11 | ||||
| -rwxr-xr-x | backend/data/reviewImgs.py | 8 | ||||
| -rwxr-xr-x | backend/server.py | 10 |
11 files changed, 300 insertions, 32 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index e639cb6..8791fb4 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -1,21 +1,31 @@ File Generation Process ======================= -1 Obtain data in otol/ and eol/, as specified in their README files. -2 Run genOtolData.py, which creates data.db, and adds a 'nodes' - table using data in otol/*. -3 Run genEolNameData.py, which adds a 'names' table to data.db, - using data in eol/vernacularNames.csv and the 'nodes' table. -4 Run genSpellfixNameData.py, which adds a 'spellfix\_alt\_names' - table to data.db, using data in the 'names' table. -5 Use downloadImgsForReview.py to download EOL images into imgsForReview/. - It uses data in eol/imagesList.db, and the 'names' table. -6 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique - images in imgsReviewed/ (uses 'names' to display common names). -7 Use genImgsForWeb.py to create cropped/resized images in img/, using - images in imgsReviewed, and also to add an 'images' table to data.db. + +1 Tree Structure Data + 1 Obtain data in otol/, as specified in its README. + 2 Run genOtolData.py, which creates data.db, and adds a 'nodes' + table using data in otol/*. +2 Name Data for Search + 1 Obtain data in eol/, as specified in its README. + 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, + using data in eol/vernacularNames.csv and the 'nodes' table. + 3 Run genSpellfixNameData.py, which adds a 'spellfix\_alt\_names' + table to data.db, using data in the 'names' table. +3 Image Data + 1 Use downloadImgsForReview.py to download EOL images into imgsForReview/. + It uses data in eol/imagesList.db, and the 'eol_ids' table. + 2 Use reviewImgs.py to filter images in imgsForReview/ into EOL-id-unique + images in imgsReviewed/ (uses 'names' and 'eol_ids' to display extra info). 
+ 3 Use genImgsForWeb.py to create cropped/resized images in img/, using + images in imgsReviewed, and also to add an 'images' table to data.db. +4 Node Description Data + 1 Obtain data in enwiki/, as specified in its README. + 2 Run genEnwikiData.py, which adds a 'descs' table to data.db, + using data in enwiki/enwikiData.db, and the 'nodes' table. spellfix.so =========== + This file provides the spellfix1 extension for Sqlite, and is used for responding to fuzzy-search requests. diff --git a/backend/data/downloadImgsForReview.py b/backend/data/downloadImgsForReview.py index 12b52ff..03e22a8 100755 --- a/backend/data/downloadImgsForReview.py +++ b/backend/data/downloadImgsForReview.py @@ -31,7 +31,7 @@ eolIds = set() print("Reading in EOL IDs") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -for row in dbCur.execute("SELECT DISTINCT eol_id FROM names"): +for row in dbCur.execute("SELECT id FROM eol_ids"): eolIds.add(row[0]) dbCon.close() # Get eol-ids from images db diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md new file mode 100644 index 0000000..8e748c9 --- /dev/null +++ b/backend/data/enwiki/README.md @@ -0,0 +1,35 @@ +Downloaded Files +================ +- enwiki\_content/enwiki-20220420-pages-articles-*.xml.gz: + Obtained via https://dumps.wikimedia.org/backup-index.html (site suggests downloading from a mirror). + Contains text content and metadata for pages in English Wikipedia (current revision only, excludes talk pages). + Some file content and format information was available from + https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. +- enwiki-20220420-page.sql.gz: + Obtained like above. Contains page-table information including page id, namespace, title, etc. + Format information was found at https://www.mediawiki.org/wiki/Manual:Page_table. +- enwiki-20220420-redirect.sql.gz: + Obtained like above. Contains page-redirection info. 
+ Format information was found at https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download. + +Generated Files +=============== +- enwiki\_content/enwiki-*.xml and enwiki-*.sql: + Uncompressed versions of downloaded files. +- enwikiData.db: + An sqlite database representing data from the enwiki dump files. + Generation: + 1 Install python, and packages mwsql, mwxml, and mwparserfromhell. Example: + 1 On Ubuntu, install python3, python3-pip, and python3-venv via `apt-get update; apt-get ...`. + 2 Create a virtual environment in which to install packages via `python3 -m venv .venv`. + 3 Activate the virtual environment via `source .venv/bin/activate`. + 4 Install mwsql, mwxml, and mwparserfromhell via `pip install mwsql mwxml mwparserfromhell`. + 2 Run genPageData.py (still under the virtual environment), which creates the database, + reads from the page dump, and creates a 'pages' table. + 3 Run genRedirectData.py, which creates a 'redirects' table, using information in the redirects dump, + and page ids from the 'pages' table. + 4 Run genDescData.py, which reads the page-content xml dumps, and the 'pages' and 'redirects' tables, + and associates page ids with (potentially redirect-resolved) pages, and attempts to parse some + wikitext within those pages to obtain the first descriptive paragraph, with markup removed. +- .venv: + Provides a python virtual environment for packages needed to generate data. 
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py new file mode 100755 index 0000000..3602138 --- /dev/null +++ b/backend/data/enwiki/genDescData.py @@ -0,0 +1,68 @@ +#!/usr/bin/python3 + +import re +import sys, os.path, glob +import mwxml, mwparserfromhell +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads Wikimedia enwiki pages-articles XML dumps, obtaining\n" +usageInfo += "descriptions for page-ids, and adds them to a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +wikiDumpFiles = glob.glob("enwiki_content/enwiki-*-pages-articles-multistream*.xml") +wikiDumpFiles.sort(key = lambda x: int(re.search(r"multistream(\d+)", x).group(1))) +enwikiDb = "enwikiData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = "^ *[A-Z'\"]" +embeddedHtmlRegex = r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$" + # Recognises a self-closing HTML tag, a tag 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}" +def convertTemplateReplace(match): + if match.group(2) == None: + return "{} {}".format(match.group(1), match.group(4)) + else: + return "{} {} {} {}".format(match.group(1), match.group(2), match.group(3), match.group(4)) +parenGrpRegex = r" \([^()]*\)" +def parseDesc(text): + prevLine = None + for line in text.splitlines(): + if prevLine != None: + if line.strip() == "" or re.match(descLineRegex, line) != None: + return prevLine + else: + prevLine = None + if re.match(descLineRegex, line) != None: + line = re.sub(embeddedHtmlRegex, "", line) + line = re.sub(convertTemplateRegex, convertTemplateReplace, line) + line = mwparserfromhell.parse(line).strip_code() # Remove wikitext markup + prevLine = re.sub(parenGrpRegex, "", line) + if prevLine != None: + return prevLine + return None + +# Open db +dbCon = sqlite3.connect(enwikiDb) 
+dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Parse data +iterationNum = 0 +for fileName in wikiDumpFiles: + print("Processing file {}".format(fileName)) + dump = mwxml.Dump.from_file(open(fileName)) + for page in dump: + iterationNum += 1 + if iterationNum % 10000 == 0: + print("At iteration {}".format(iterationNum)) + # Parse page + if page.namespace == 0 and page.redirect == None: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genPageData.py b/backend/data/enwiki/genPageData.py new file mode 100755 index 0000000..7522f1f --- /dev/null +++ b/backend/data/enwiki/genPageData.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import sys, os.path +from mwsql import Dump +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a gzipped Wikimedia enwiki 'page' table MySql dump,\n" +usageInfo += "obtaining a page-id to page-title mapping, and adds it to\n" +usageInfo += "a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +pageDumpFile = "enwiki-20220420-page.sql.gz" +enwikiDb = "enwikiData.db" + +# Check for existing db +if os.path.exists(enwikiDb): + print("ERROR: Existing {}".format(enwikiDb), file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +# Parse page data +dump = Dump.from_file(pageDumpFile) +iterationNum = 0 +for row in dump.rows(convert_dtypes=True): + iterationNum += 1 + if iterationNum % 1e6 == 0: + print("At iteration {}".format(iterationNum)) + # Add to map + if row[1] == 0: # If page in article namespace + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (row[0], 
row[2].replace("_", " "))) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genRedirectData.py b/backend/data/enwiki/genRedirectData.py new file mode 100755 index 0000000..e1aadc8 --- /dev/null +++ b/backend/data/enwiki/genRedirectData.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +import sys, os.path +from mwsql import Dump +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a gzipped Wikimedia enwiki 'redirect' table MySql dump,\n" +usageInfo += "obtaining a page-id to redirect-page-id mapping, and adds it to\n" +usageInfo += "a sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +redirectDumpFile = "enwiki-20220420-redirect.sql.gz" +enwikiDb = "enwikiData.db" + +# Open db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target_id INT)") +dbCur2 = dbCon.cursor() +# Parse redirect data +dump = Dump.from_file(redirectDumpFile) +iterationNum = 0 +for row in dump.rows(convert_dtypes=True): + iterationNum += 1 + if iterationNum % 1e6 == 0: + print("At iteration {}".format(iterationNum)) + # Add to map + [pageId, namespace, title] = row[:3] + if namespace == 0: # If page is in the article namespace + row = dbCur2.execute("SELECT id from pages where pages.title = ?", (title.replace("_", " "),)).fetchone() + if row != None: + targetId = row[0] + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (pageId, targetId)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md index 3ce9799..d863099 100644 --- a/backend/data/eol/README.md +++ b/backend/data/eol/README.md @@ -1,15 +1,15 @@ Downloaded Files ================ -- imagesList.tgz - Obtained from https://opendata.eol.org/dataset/images-list on 24/04/2022 - Listed as being last updated on 05/02/2020 -- vernacularNames.csv - Obtained from https://opendata.eol.org/dataset/vernacular-names on 24/04/2022 - Listed 
as being last updated on 27/10/2020 +- imagesList.tgz: + Obtained from https://opendata.eol.org/dataset/images-list on 24/04/2022. + Listed as being last updated on 05/02/2020. +- vernacularNames.csv: + Obtained from https://opendata.eol.org/dataset/vernacular-names on 24/04/2022. + Listed as being last updated on 27/10/2020. Generated Files =============== -- imagesList/ - Obtained by extracting imagesList.tgz -- imagesList.db - Represents data from eol/imagesList/*, and is created by genImagesListDb.sh +- imagesList/: + Obtained by extracting imagesList.tgz. +- imagesList.db: + Represents data from eol/imagesList/*, and is created by genImagesListDb.sh. diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiData.py new file mode 100755 index 0000000..f1490b6 --- /dev/null +++ b/backend/data/genEnwikiData.py @@ -0,0 +1,64 @@ +#!/usr/bin/python3 + +import sys, re +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads Wikimedia enwiki data from enwiki/, along with node and name data\n" +usageInfo += "from a sqlite database, associates nodes with enwiki pages, and adds\n" +usageInfo += "alt-name and description information for those nodes.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +enwikiDb = "enwiki/enwikiData.db" +dbFile = "data.db" + +# Open dbs +enwikiCon = sqlite3.connect(enwikiDb) +enwikiCur = enwikiCon.cursor() +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Find page id for each node name +nodeToPageId = {} +print("Getting node page-ids") +iterationNum = 0 +for row in dbCur.execute("SELECT name from nodes"): + iterationNum += 1 + if iterationNum % 1e4 == 0: + print("At iteration {}".format(iterationNum)) + # + name = row[0] + row = enwikiCur.execute("SELECT id FROM pages where pages.title = ? 
COLLATE nocase", (name,)).fetchone() + if row != None: + nodeToPageId[name] = row[0] +# Resolve redirects +print("Resolving redirects") +redirectingNames = set() +iterationNum = 0 +for (name, pageId) in nodeToPageId.items(): + iterationNum += 1 + if iterationNum % 1e4 == 0: + print("At iteration {}".format(iterationNum)) + # + row = enwikiCur.execute("SELECT target_id FROM redirects where redirects.id = ?", (pageId,)).fetchone() + if row != None: + nodeToPageId[name] = row[0] + redirectingNames.add(name) +# Add descriptions for each node +print("Adding node description data") +dbCur.execute("CREATE TABLE descs (name TEXT PRIMARY KEY, desc TEXT, redirected INT)") +iterationNum = 0 +for (name, pageId) in nodeToPageId.items(): + iterationNum += 1 + if iterationNum % 1e4 == 0: + print("At iteration {}".format(iterationNum)) + # + row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() + if row != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?, ?)", (name, row[0], 1 if name in redirectingNames else 0)) +# Close dbs +dbCon.commit() +dbCon.close() +enwikiCon.commit() +enwikiCon.close() diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py index 200b459..74d9329 100755 --- a/backend/data/genEolNameData.py +++ b/backend/data/genEolNameData.py @@ -62,8 +62,9 @@ with open(vnamesFile, newline="") as csvfile: # Open db connection dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Create 'names' table -dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, pref_alt INT, PRIMARY KEY(name, alt_name))") +# Create tables +dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, PRIMARY KEY(name, alt_name))") +dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)") # Iterate through 'nodes' table, resolving to canonical-names usedPids = set() unresolvedNodeNames = set() @@ -85,11 +86,12 @@ for row in dbCur2.execute("SELECT name FROM nodes"): usedPids.add(pidToUse) 
altNames = {name} preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None + dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, name)) for n in pidToNames[pidToUse]: altNames.add(n) for n in altNames: isPreferred = 1 if (n == preferredName) else 0 - dbCur.execute("INSERT INTO names VALUES (?, ?, ?, ?)", (name, n, pidToUse, isPreferred)) + dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred)) elif name in nameToPids: unresolvedNodeNames.add(name) # Iterate through unresolved nodes, resolving to vernacular-names @@ -108,11 +110,12 @@ for name in unresolvedNodeNames: usedPids.add(pidToUse) altNames = {name} preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None + dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (name, pidToUse)) for n in pidToNames[pidToUse]: altNames.add(n) for n in altNames: isPreferred = 1 if (n == preferredName) else 0 - dbCur.execute("INSERT INTO names VALUES (?, ?, ?, ?)", (name, n, pidToUse, isPreferred)) + dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, isPreferred)) # Close db dbCon.commit() dbCon.close() diff --git a/backend/data/reviewImgs.py b/backend/data/reviewImgs.py index 5dcd52e..8987007 100755 --- a/backend/data/reviewImgs.py +++ b/backend/data/reviewImgs.py @@ -125,10 +125,12 @@ class EolImgReviewer: # Update title firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 lastImgIdx = self.imgListIdx - row = dbCur.execute("SELECT alt_name, eol_id, pref_alt FROM names WHERE eol_id = ? and pref_alt = 1", - (self.nextEolId,)).fetchone() + query = "SELECT eol_ids.id, names.alt_name, names.pref_alt FROM" \ + " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ + " WHERE id = ? 
and pref_alt = 1" + row = dbCur.execute(query, (self.nextEolId,)).fetchone() if row != None: - commonName = row[0] + commonName = row[1] self.root.title("Reviewing EOL ID {}, aka \"{}\" (imgs {} to {} out of {})".format( self.nextEolId, commonName, firstImgIdx, lastImgIdx, len(self.imgList))) else: diff --git a/backend/server.py b/backend/server.py index 1c09ad7..580b4fb 100755 --- a/backend/server.py +++ b/backend/server.py @@ -52,9 +52,10 @@ def lookupNode(name): nodeObj["img"] = nodeNameToFile(match.group(1), cur) if nodeObj["img"] == None: nodeObj["img"] = nodeNameToFile(match.group(2), cur) + # return nodeObj; def nodeNameToFile(name, cur): - row = cur.execute("SELECT name, eol_id FROM names WHERE name = ?", (name,)).fetchone() + row = cur.execute("SELECT name, id FROM eol_ids WHERE name = ?", (name,)).fetchone() if row == None: return None eolId = row[1] @@ -92,6 +93,10 @@ def lookupName(name): hasMore = True del results[-1] return json.dumps([results, hasMore]) +def lookupDesc(name): + cur = dbCon.cursor() + row = cur.execute("SELECT desc, redirected from descs WHERE descs.name = ?", (name,)).fetchone() + return json.dumps([row[0], row[1] == 1] if row != None else None) class DbServer(BaseHTTPRequestHandler): def do_GET(self): @@ -158,6 +163,9 @@ class DbServer(BaseHTTPRequestHandler): elif reqType == "search": self.respondJson(lookupName(name)) return + elif reqType == "desc": + self.respondJson(lookupDesc(name)) + return self.send_response(404) self.end_headers() self.end_headers() |
