diff options
Diffstat (limited to 'backend/hist_data/enwiki')
-rw-r--r--  backend/hist_data/enwiki/README.md                    | 10
-rwxr-xr-x  backend/hist_data/enwiki/download_img_license_info.py |  5
-rwxr-xr-x  backend/hist_data/enwiki/download_imgs.py             |  9
-rwxr-xr-x  backend/hist_data/enwiki/gen_desc_data.py             |  7
-rwxr-xr-x  backend/hist_data/enwiki/gen_dump_index_db.py         |  8
-rwxr-xr-x  backend/hist_data/enwiki/gen_img_data.py              | 51
-rwxr-xr-x  backend/hist_data/enwiki/gen_pageview_data.py         | 10
7 files changed, 39 insertions, 61 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index 29fc2ff..76d33e5 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -33,12 +33,12 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Image Files - `gen_img_data.py` <br> - Used to find infobox image names for page IDs, and store them into a database. + Finds infobox image names for page IDs, and stores them into a database. - `download_img_license_info.py` <br> - Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database. + Downloads licensing metadata for image names, via wikipedia's online API, and stores them into a database. - `img_data.db` <br> - Used to hold metadata about infobox images for a set of page IDs. - Generated using `get_enwiki_img_data.py` and `download_img_license_info.py`. <br> + Holds metadata about infobox images for a set of page IDs. + Generated using `gen_img_data.py` and `download_img_license_info.py`. <br> Tables: <br> - `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br> `img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs. @@ -47,7 +47,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. <br> Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. - `download_imgs.py` <br> - Used to download image files into imgs/. + Downloads image files into imgs/. 
# Description Files - `gen_desc_data.py` <br> diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 1217caf..43f2c43 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. """ -import re +import argparse +import re, time, signal import sqlite3, urllib.parse, html import requests -import time, signal IMG_DB = 'img_data.db' # @@ -150,7 +150,6 @@ def downloadInfo(imgDb: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index bbd2cda..7dd0771 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. 
""" -import re, os +import argparse +import re, os, time, signal import sqlite3 import urllib.parse, requests -import time, signal IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' @@ -22,7 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' # It's unclear how to properly check for cache misses, so we just aim for 1 per sec -BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) +EXP_BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): @@ -84,7 +84,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: time.sleep(timeout) except Exception as e: print(f'Error while downloading to {outFile}: {e}') - if not BACKOFF: + if not EXP_BACKOFF: return else: timeout *= 2 @@ -94,7 +94,6 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index b3fde52..bb2b845 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ -7,14 +7,14 @@ and adds them to a database # In testing, this script took over 10 hours to run, and generated about 5GB +import argparse import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell +import bz2, html, mwxml, mwparserfromhell import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' - +# Regexps DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = 
re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag @@ -119,7 +119,6 @@ def convertTitle(title: str) -> str: return html.unescape(title).replace('_', ' ') if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 5778680..6be8bc5 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 """ -Adds data from the wiki dump index-file into a database +Adds data from the wiki-dump index-file into a database """ + +import argparse import sys, os, re -import bz2 -import sqlite3 +import bz2, sqlite3 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines DB_FILE = 'dump_index.db' @@ -53,7 +54,6 @@ def genData(indexFile: str, dbFile: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index b4ade9f..9aa3863 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -8,17 +8,15 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. 
""" -import re -import os, bz2, html, urllib.parse +import os, re +import bz2, html, urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' -PAGEVIEW_DB = 'pageview_data.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -MAX_IMGS_PER_CTG = 20000 -# +# Regexps ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') @@ -35,7 +33,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)') - # 'img_name' may be NULL + # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') else: # Check for already-processed page IDs @@ -179,49 +177,28 @@ def getImageName(content: list[str]) -> str | None: return None return None -def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]: +def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: print('Getting event data') - titleToCtg: dict[str, str] = {} + titles: set[str] = set() dbCon = sqlite3.connect(dbFile) - for title, ctg in dbCon.execute('SELECT title, ctg from events'): - titleToCtg[title] = ctg + for (title,) in dbCon.execute('SELECT title from events'): + titles.add(title) dbCon.close() - print('Getting top images for each event category') - ctgToTitles: dict[str, list[str]] = {} - dbCon = sqlite3.connect(pageviewDb) - for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'): - if title not in titleToCtg: - continue - ctg = titleToCtg[title] - if ctg not in ctgToTitles: - 
ctgToTitles[ctg] = [] - elif len(ctgToTitles[ctg]) == maxImgsPerCtg: - continue - ctgToTitles[ctg].append(title) - del titleToCtg[title] - dbCon.close() - for title, ctg in titleToCtg.items(): # Account for titles without view counts - if ctg not in ctgToTitles: - ctgToTitles[ctg] = [] - elif len(ctgToTitles[ctg]) == maxImgsPerCtg: - continue - ctgToTitles[ctg].append(title) print('Getting page IDs') pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() - for ctg in ctgToTitles: - for title in ctgToTitles[ctg]: - row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() - if row: - pageIds.add(row[0]) + for title in titles: + row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() + if row: + pageIds.add(row[0]) dbCon.close() - print(f'Result: {len(pageIds)} out of {len(titleToCtg)}') + print(f'Result: {len(pageIds)} out of {len(titles)}') return pageIds if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # - pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG) + pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 90ec925..935b303 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -36,9 +36,13 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: if not line.startswith(linePrefix): continue # Get second and second-last fields - line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields - title = line[:line.find(b' ')].decode('utf-8') - viewCount = int(line[line.rfind(b' ')+1:]) + linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields + title = 
linePart[:linePart.find(b' ')].decode('utf-8') + try: + viewCount = int(linePart[linePart.rfind(b' ')+1:]) + except ValueError: + print(f'Unable to read count in line {lineNum}: {line}') + continue if namespaceRegex.match(title) is not None: continue # Update map
