| | | |
|---|---|---|
| author | Terry Truong <terry06890@gmail.com> | 2022-10-01 23:14:08 +1000 |
| committer | Terry Truong <terry06890@gmail.com> | 2022-10-02 00:01:01 +1000 |
| commit | 1b4fc8667714ef4ce9f326bd14f795fc2417ecb9 (patch) | |
| tree | 474f54e527a1f26e33c92fd54a718e697de75744 /backend/hist_data/enwiki | |
| parent | 4edb7998012bcc804482a76277cd25b90fb373c9 (diff) | |
Add per-event-category image limit
Diffstat (limited to 'backend/hist_data/enwiki')
| | | |
|---|---|---|
| -rw-r--r-- | backend/hist_data/enwiki/README.md | 24 |
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 51 |
2 files changed, 48 insertions, 27 deletions
```diff
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index e50c7e2..dd090ca 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -29,6 +29,18 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
 - `redirects`: `id INT PRIMARY KEY, target TEXT`
 - `descs`: `id INT PRIMARY KEY, desc TEXT`
 
+# Page View Files
+- `pageviews/pageviews-*-user.bz2`
+  Each holds wikimedia article page view data for some month.
+  Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>.
+  Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>.
+- `gen_pageview_data.py` <br>
+  Reads pageview/* and `dump_index.db`, and creates a database holding average monthly pageview counts.
+- `pageview_data.db` <br>
+  Generated using `gen_pageview_data.py`. <br>
+  Tables: <br>
+  - `views`: `title TEXT PRIMARY KEY, id INT UNIQUE, views INT`
+
 # Image Files
 - `gen_img_data.py` <br>
   Used to find infobox image names for page IDs, and store them into a database.
@@ -46,15 +58,3 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
   Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
 - `download_imgs.py` <br>
   Used to download image files into imgs/.
-
-# Page View Files
-- `pageviews/pageviews-*-user.bz2`
-  Each holds wikimedia article page view data for some month.
-  Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>.
-  Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>.
-- `gen_pageview_data.py` <br>
-  Reads pageview/* and `dump_index.db`, and creates a database holding average monthly pageview counts.
-- `pageview_data.db` <br>
-  Generated using `gen_pageview_data.py`. <br>
-  Tables: <br>
-  - `views`: `title TEXT PRIMARY KEY, id INT UNIQUE, views INT`
```
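The README change above moves the `pageview_data.db` documentation ahead of the image files section, since the image pipeline now depends on it. As a quick illustration of the documented schema, here is a minimal sketch of reading the top-viewed titles back out; it assumes the database has already been built by `gen_pageview_data.py`, and the `LIMIT` value is arbitrary:

```python
import sqlite3

# Read back the highest-viewed titles from pageview_data.db.
# Schema per the README entry above:
#   views(title TEXT PRIMARY KEY, id INT UNIQUE, views INT)
con = sqlite3.connect('pageview_data.db')
for title, views in con.execute(
        'SELECT title, views FROM views ORDER BY views DESC LIMIT 5'):
    print(f'{title}: {views} avg monthly views')
con.close()
```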
```diff
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index 29ae7b6..762952e 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -14,8 +14,10 @@ import sqlite3
 
 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
 INDEX_DB = 'dump_index.db'
+PAGEVIEW_DB = 'pageview_data.db'
 IMG_DB = 'img_data.db' # The database to create
 DB_FILE = os.path.join('..', 'data.db')
+MAX_IMGS_PER_CTG = 20000
 #
 ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
 IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
@@ -174,30 +176,49 @@ def getImageName(content: list[str]) -> str | None:
             return None
     return None
 
-def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]:
-    print('Getting input page-ids')
-    pageTitles: set[str] = set()
-    pageIds: set[int] = set()
-    print('Reading event titles')
+def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]:
+    print('Getting event data')
+    titleToCtg: dict[str, str] = {}
     dbCon = sqlite3.connect(dbFile)
-    dbCur = dbCon.cursor()
-    for (pageId,) in dbCur.execute('SELECT title from events'):
-        pageTitles.add(pageId)
+    for title, ctg in dbCon.execute('SELECT title, ctg from events'):
+        titleToCtg[title] = ctg
     dbCon.close()
-    print('Getting event page IDs')
+    print('Getting top images for each event category')
+    ctgToTitles: dict[str, list[str]] = {}
+    dbCon = sqlite3.connect(pageviewDb)
+    for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'):
+        if title not in titleToCtg:
+            continue
+        ctg = titleToCtg[title]
+        if ctg not in ctgToTitles:
+            ctgToTitles[ctg] = []
+        elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
+            continue
+        ctgToTitles[ctg].append(title)
+        del titleToCtg[title]
+    dbCon.close()
+    for title, ctg in titleToCtg.items(): # Account for titles without view counts
+        if ctg not in ctgToTitles:
+            ctgToTitles[ctg] = []
+        elif len(ctgToTitles[ctg]) == maxImgsPerCtg:
+            continue
+        ctgToTitles[ctg].append(title)
+    print('Getting page IDs')
+    pageIds: set[int] = set()
     dbCon = sqlite3.connect(indexDb)
     dbCur = dbCon.cursor()
-    for pageTitle in pageTitles:
-        row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (pageTitle,)).fetchone()
-        if row:
-            pageIds.add(row[0])
+    for ctg in ctgToTitles:
+        for title in ctgToTitles[ctg]:
+            row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+            if row:
+                pageIds.add(row[0])
     dbCon.close()
-    print(f'Found {len(pageIds)} out of {len(pageTitles)}')
+    print(f'Result: {len(pageIds)} out of {len(titleToCtg)}')
     return pageIds
 
 if __name__ == '__main__':
     import argparse
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.parse_args() #
-    pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB)
+    pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG)
     genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
```
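The heart of the commit is the new selection loop in `getInputPageIdsFromDb`: event titles are bucketed by category (`ctg`), visited in descending pageview order, and each bucket stops accepting titles once it holds `MAX_IMGS_PER_CTG`; titles with no pageview record then fill whatever space remains. A minimal self-contained sketch of that logic, using made-up in-memory data and a cap of 2 in place of 20000:

```python
MAX_PER_CTG = 2  # stands in for MAX_IMGS_PER_CTG (20000 in the commit)

# title -> category, as loaded from the events table (toy data)
title_to_ctg = {
    'Battle of Hastings': 'battle',
    'Moon landing': 'space',
    'Battle of Waterloo': 'battle',
    'Battle of Agincourt': 'battle',
    'Sputnik 1': 'space',
}
# Titles in descending view order, as the views table would yield them
# ('Sputnik 1' deliberately has no pageview record)
titles_by_views = ['Moon landing', 'Battle of Hastings',
                   'Battle of Waterloo', 'Battle of Agincourt']

ctg_to_titles: dict[str, list[str]] = {}
for title in titles_by_views:
    if title not in title_to_ctg:
        continue  # not an event title
    bucket = ctg_to_titles.setdefault(title_to_ctg[title], [])
    if len(bucket) == MAX_PER_CTG:
        continue  # this category already has its quota
    bucket.append(title)
    del title_to_ctg[title]  # mark the title as handled

# Titles without view counts fill any remaining space in their category
for title, ctg in title_to_ctg.items():
    bucket = ctg_to_titles.setdefault(ctg, [])
    if len(bucket) < MAX_PER_CTG:
        bucket.append(title)

print(ctg_to_titles)
# {'space': ['Moon landing', 'Sputnik 1'],
#  'battle': ['Battle of Hastings', 'Battle of Waterloo']}
```

Because the `views` table is scanned with `ORDER BY views DESC`, the first `maxImgsPerCtg` titles kept per category are exactly its most-viewed ones, which is what caps the number of images downloaded per event category.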
