From e78c4df403e5f98afa08f7a0841ff233d5f6d05b Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 01:42:41 +1000 Subject: Update backend READMEs, rename some files for consistency --- backend/data/enwiki/README.md | 73 ++++++----- backend/data/enwiki/downloadEnwikiImgs.py | 2 +- backend/data/enwiki/downloadImgLicenseInfo.py | 2 +- backend/data/enwiki/genData.py | 122 ------------------ backend/data/enwiki/genDescData.py | 122 ++++++++++++++++++ backend/data/enwiki/genImgData.py | 178 ++++++++++++++++++++++++++ backend/data/enwiki/getEnwikiImgData.py | 178 -------------------------- 7 files changed, 345 insertions(+), 332 deletions(-) delete mode 100755 backend/data/enwiki/genData.py create mode 100755 backend/data/enwiki/genDescData.py create mode 100755 backend/data/enwiki/genImgData.py delete mode 100755 backend/data/enwiki/getEnwikiImgData.py (limited to 'backend/data/enwiki') diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 6462d7d..1c16a2e 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -1,39 +1,52 @@ -Downloaded Files -================ +This directory holds files obtained from/using [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). + +# Downloaded Files - enwiki-20220501-pages-articles-multistream.xml.bz2
- Obtained via - (site suggests downloading from a mirror). Contains text - content and metadata for pages in English Wikipedia - (current revision only, excludes talk pages). Some file - content and format information was available from - . + Obtained via https://dumps.wikimedia.org/ (the site suggests downloading from a mirror). + Contains text content and metadata for pages in enwiki. + Some file content and format information was available from the accompanying dump documentation. - enwiki-20220501-pages-articles-multistream-index.txt.bz2
Obtained like above. Holds lines of the form offset1:pageId1:title1, - providing offsets, for each page, into the dump file, of a chunk of + providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it. -Generated Files -=============== +# Generated Dump-Index Files +- genDumpIndexDb.py
+ Creates an sqlite-database version of the enwiki-dump index file. - dumpIndex.db
- Holds data from the enwiki dump index file. Generated by - genDumpIndexDb.py, and used by lookupPage.py to get content for a - given page title.
+ Generated by genDumpIndexDb.py.
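For orientation, here is a minimal sketch (not code from this repo) of how dumpIndex.db and the multistream dump file can be combined to pull out the bz2 chunk containing a given page, using the `offsets` table listed below; the function name and the end-of-dump handling are illustrative assumptions.

```python
#!/usr/bin/python3
# Hypothetical helper: look up a title's chunk offsets in dumpIndex.db,
# then decompress that one ~100-page bz2 stream from the multistream dump.
import bz2, sqlite3

def getChunkXml(title, dumpFile="enwiki-20220501-pages-articles-multistream.xml.bz2"):
    dbCon = sqlite3.connect("dumpIndex.db")
    row = dbCon.execute(
        "SELECT offset, next_offset FROM offsets WHERE title = ?", (title,)).fetchone()
    dbCon.close()
    if row is None:
        return None
    offset, nextOffset = row
    with open(dumpFile, "rb") as f:
        f.seek(offset)
        # Assumes next_offset is NULL for the last chunk, so read to EOF in that case
        data = f.read() if nextOffset is None else f.read(nextOffset - offset)
    return bz2.decompress(data).decode("utf-8")  # XML <page> elements for that chunk
```

The returned text still has to be scanned for the `<page>` element whose title matches; lookupPage.py (under Other Files below) covers this lookup end to end.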
Tables:
- - offsets: title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next\_offset INT -- enwikiData.db
- Holds data obtained from the enwiki dump file, in 'pages', - 'redirects', and 'descs' tables. Generated by genData.py, which uses - python packages mwxml and mwparserfromhell.
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` + +# Description Database Files +- genDescData.py
+ Reads through pages in the dump file, and adds short-description info to a database. +- descData.db
+ Generated by genDescData.py.
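As a usage illustration only (not code from this repo), a short description could be looked up by title roughly as follows, using the tables listed below; the single-step redirect handling and the quoting of the `desc` column are my own choices.

```python
#!/usr/bin/python3
# Hypothetical lookup: resolve a title to a page id via 'pages', follow a
# redirect if one exists, then read the short description from 'descs'.
import sqlite3

def getDesc(title):
    dbCon = sqlite3.connect("descData.db")
    cur = dbCon.cursor()
    row = cur.execute("SELECT id FROM pages WHERE title = ?", (title,)).fetchone()
    if row is None:
        dbCon.close()
        return None
    pageId = row[0]
    # If the page is a redirect, switch to the target page's id
    redir = cur.execute("SELECT target FROM redirects WHERE id = ?", (pageId,)).fetchone()
    if redir is not None:
        target = cur.execute("SELECT id FROM pages WHERE title = ?", (redir[0],)).fetchone()
        if target is None:
            dbCon.close()
            return None
        pageId = target[0]
    row = cur.execute('SELECT "desc" FROM descs WHERE id = ?', (pageId,)).fetchone()
    dbCon.close()
    return row[0] if row is not None else None
```

For example, `getDesc("Earth")` would return that page's extracted lead-paragraph summary, if one was stored.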
Tables:
- - pages: id INT PRIMARY KEY, title TEXT UNIQUE - - redirects: id INT PRIMARY KEY, target TEXT - - descs: id INT PRIMARY KEY, desc TEXT -- enwikiImgs.db
- Holds infobox-images obtained for some set of wiki page-ids. - Generated by running getEnwikiImgData.py, which uses the enwiki dump - file and dumpIndex.db.
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` + +# Image Database Files +- genImgData.py
+ Used to find infobox image names for page IDs, storing them in a database. +- downloadImgLicenseInfo.py
+ Used to download licensing metadata for image names via Wikipedia's online API, storing it in a database. +- imgData.db
+ Used to hold metadata about infobox images for a set of page IDs. + Generated using genImgData.py and downloadImgLicenseInfo.py.
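To make the downloadImgLicenseInfo.py entry above concrete, here is a rough single-image sketch of the kind of request it issues against Wikipedia's API (the real script batches up to 50 titles per request and strips HTML from the returned fields); the function name, User-Agent string, and choice of extmetadata fields are illustrative assumptions, not the script's exact behaviour.

```python
#!/usr/bin/python3
# Hypothetical single-title query for image license metadata via the MediaWiki API.
import json, urllib.parse, urllib.request

def getLicenseInfo(imgName):
    params = {
        "action": "query", "format": "json",
        "titles": "File:" + imgName,
        "prop": "imageinfo", "iiprop": "url|extmetadata",
    }
    url = "https://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": "imgDataSketch/0.1 (example)"})
    with urllib.request.urlopen(req) as resp:
        data = json.load(resp)
    page = next(iter(data["query"]["pages"].values()))
    info = page.get("imageinfo")
    if not info:
        return None  # e.g. the file does not exist
    meta = info[0].get("extmetadata", {})
    return {  # returned values may still contain HTML markup
        "license": meta.get("LicenseShortName", {}).get("value"),
        "artist": meta.get("Artist", {}).get("value"),
        "credit": meta.get("Credit", {}).get("value"),
        "restrictions": meta.get("Restrictions", {}).get("value"),
        "url": info[0].get("url"),
    }
```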
Tables:
- - page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT - (img\_name may be null, which is used to avoid re-processing the page-id on a second pass) - - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT - (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info) + - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT`
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. + - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
May lack rows for some `img_name` values in `page_imgs`, where license info could not be obtained. +- downloadEnwikiImgs.py
+ Used to download image files into imgs/. + +# Other Files +- lookupPage.py
+ Running `lookupPage.py title1` looks in the dump for a page with a given title, + and prints the contents to stdout. Uses dumpIndex.db. + diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py index de9b862..2929a0d 100755 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ b/backend/data/enwiki/downloadEnwikiImgs.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names outDir = "imgs" licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 8231fbb..097304b 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names apiUrl = "https://en.wikipedia.org/w/api.php" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py deleted file mode 100755 index 3e60bb5..0000000 --- a/backend/data/enwiki/genData.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" -usageInfo += "and short-description info to an sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages -enwikiDb = "enwikiData.db" - -# Some regexps and functions for parsing wikitext -descLineRegex = re.compile("^ *[A-Z'\"]") -embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") - # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") -parensGrpRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") -def convertTemplateReplace(match): - if match.group(2) == None: - return f"{match.group(1)} {match.group(4)}" - else: - return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" -def parseDesc(text): - # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank - # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, - lines = [] - openBraceCount = 0 - openBracketCount = 0 - inComment = False - skip = False - for line in text.splitlines(): - line = line.strip() - if len(lines) == 0: - if len(line) > 0: - if openBraceCount > 0 or line[0] == "{": - openBraceCount += line.count("{") - openBraceCount -= line.count("}") - skip = True - if openBracketCount > 0 or line[0] == "[": - openBracketCount += line.count("[") - openBracketCount -= line.count("]") - skip = True - if inComment or line.find("") != -1: - if inComment: - inComment = False - skip = True - else: - inComment = True - skip = True - if skip: - skip = False - continue - if line[-1] == ":": # Seems to 
help avoid disambiguation pages - return None - if descLineRegex.match(line) != None: - lines.append(line) - else: - if len(line) == 0: - return removeMarkup(" ".join(lines)) - lines.append(line) - if len(lines) > 0: - return removeMarkup(" ".join(lines)) - return None -def removeMarkup(content): - content = embeddedHtmlRegex.sub("", content) - content = convertTemplateRegex.sub(convertTemplateReplace, content) - content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGrpRegex.sub("", content) - content = leftoverBraceRegex.sub("", content) - return content -# Other helper functions -def convertTitle(title): - return html.unescape(title).replace("_", " ") - -# Check for existing db -if os.path.exists(enwikiDb): - print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") -dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Read through dump file -print("Reading dump file") -with bz2.open(dumpFile, mode='rt') as file: - dump = mwxml.Dump.from_file(file) - pageNum = 0 - for page in dump: - pageNum += 1 - if pageNum % 1e4 == 0: - print(f"At page {pageNum}") - # Parse page - if page.namespace == 0: - try: - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) - except sqlite3.IntegrityError as e: - # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": {e}") - continue - if page.redirect != None: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) - else: - revision = next(page) - desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py new file mode 100755 index 0000000..032dbed --- /dev/null +++ b/backend/data/enwiki/genDescData.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" +usageInfo += "and short-description info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +enwikiDb = "descData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = re.compile("^ *[A-Z'\"]") +embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +parensGrpRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") +def convertTemplateReplace(match): + if match.group(2) == None: + return f"{match.group(1)} {match.group(4)}" + else: + return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" +def parseDesc(text): + # Find first matching line 
outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank + # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if len(lines) == 0: + if len(line) > 0: + if openBraceCount > 0 or line[0] == "{": + openBraceCount += line.count("{") + openBraceCount -= line.count("}") + skip = True + if openBracketCount > 0 or line[0] == "[": + openBracketCount += line.count("[") + openBracketCount -= line.count("]") + skip = True + if inComment or line.find("") != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ":": # Seems to help avoid disambiguation pages + return None + if descLineRegex.match(line) != None: + lines.append(line) + else: + if len(line) == 0: + return removeMarkup(" ".join(lines)) + lines.append(line) + if len(lines) > 0: + return removeMarkup(" ".join(lines)) + return None +def removeMarkup(content): + content = embeddedHtmlRegex.sub("", content) + content = convertTemplateRegex.sub(convertTemplateReplace, content) + content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = parensGrpRegex.sub("", content) + content = leftoverBraceRegex.sub("", content) + return content +# Other helper functions +def convertTitle(title): + return html.unescape(title).replace("_", " ") + +# Check for existing db +if os.path.exists(enwikiDb): + print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Read through dump file +print("Reading dump file") +with bz2.open(dumpFile, mode='rt') as file: + dump = mwxml.Dump.from_file(file) + pageNum = 0 + for page in dump: + pageNum += 1 + if pageNum % 1e4 == 0: + print(f"At page {pageNum}") + # Parse page + if page.namespace == 0: + try: + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print(f"Failed to add page with title \"{page.title}\": {e}") + continue + if page.redirect != None: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py new file mode 100755 index 0000000..9bd28f4 --- /dev/null +++ b/backend/data/enwiki/genImgData.py @@ -0,0 +1,178 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, html, urllib.parse +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" +usageInfo += "trying to get infobox image filenames, adding info 
to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +def getInputPageIds(): + pageIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (pageId,) in dbCur.execute("SELECT id from wiki_ids"): + pageIds.add(pageId) + dbCon.close() + return pageIds +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" +indexDb = "dumpIndex.db" +imgDb = "imgData.db" # Output db +idLineRegex = re.compile(r"(.*)") +imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") +bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") +imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) +cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) + +# Open dbs +indexDbCon = sqlite3.connect(indexDb) +indexDbCur = indexDbCon.cursor() +imgDbCon = sqlite3.connect(imgDb) +imgDbCur = imgDbCon.cursor() +# Create image-db table +pidsDone = set() +if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL + imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +else: + for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"): + pidsDone.add(pid) + print(f"Will skip {len(pidsDone)} already-processed page-ids") +# Get input pageIds +print("Getting input page-ids", file=sys.stderr) +pageIds = getInputPageIds() +for pid in pidsDone: + pageIds.remove(pid) +print(f"Found {len(pageIds)} page-ids to process") +# Get page-id dump-file offsets +print("Getting dump-file offsets", file=sys.stderr) +offsetToPageids = {} +offsetToEnd = {} +iterNum = 0 +for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}", file=sys.stderr) + # + query = "SELECT offset, next_offset FROM offsets WHERE id = ?" + row = indexDbCur.execute(query, (pageId,)).fetchone() + if row == None: + print(f"WARNING: Page id {pageId} not found", file=sys.stderr) + continue + (chunkOffset, endOffset) = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) +print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr) +# Look through dump file, jumping to chunks containing relevant pages +print("Reading through dump file", file=sys.stderr) +def getImageName(content): + """ Given an array of text-content lines, returns an image-filename, or None """ + for line in content: + match = imageLineRegex.match(line) + if match != None: + imageName = match.group(1).strip() + if imageName == "": + return None + imageName = html.unescape(imageName) + # Account for {{... + if imageName.startswith("{"): + match = cssImgCropRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for [[File:...|...]] + if imageName.startswith("["): + match = bracketImageRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for