diff options
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | backend/hist_data/enwiki/README.md | 24 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 51 | ||||
| -rw-r--r-- | backend/tests/enwiki/test_gen_img_data.py | 22 |
4 files changed, 70 insertions, 29 deletions
@@ -9,3 +9,5 @@ __pycache__ /backend/hist_data/wikidata/*.json.bz2 /backend/hist_data/enwiki/*.db /backend/hist_data/enwiki/pageviews/ +/backend/hist_data/enwiki/*.bz2 +/backend/hist_data/enwiki/imgs/ diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index e50c7e2..dd090ca 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -29,6 +29,18 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. - `redirects`: `id INT PRIMARY KEY, target TEXT` - `descs`: `id INT PRIMARY KEY, desc TEXT` +# Page View Files +- `pageviews/pageviews-*-user.bz2` + Each holds wikimedia article page view data for some month. + Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>. + Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>. +- `gen_pageview_data.py` <br> + Reads pageview/* and `dump_index.db`, and creates a database holding average monthly pageview counts. +- `pageview_data.db` <br> + Generated using `gen_pageview_data.py`. <br> + Tables: <br> + - `views`: `title TEXT PRIMARY KEY, id INT UNIQUE, views INT` + # Image Files - `gen_img_data.py` <br> Used to find infobox image names for page IDs, and store them into a database. @@ -46,15 +58,3 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. - `download_imgs.py` <br> Used to download image files into imgs/. - -# Page View Files -- `pageviews/pageviews-*-user.bz2` - Each holds wikimedia article page view data for some month. - Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>. - Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>. -- `gen_pageview_data.py` <br> - Reads pageview/* and `dump_index.db`, and creates a database holding average monthly pageview counts. 
-- `pageview_data.db` <br> - Generated using `gen_pageview_data.py`. <br> - Tables: <br> - - `views`: `title TEXT PRIMARY KEY, id INT UNIQUE, views INT` diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 29ae7b6..762952e 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -14,8 +14,10 @@ import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' +PAGEVIEW_DB = 'pageview_data.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') +MAX_IMGS_PER_CTG = 20000 # ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') @@ -174,30 +176,49 @@ def getImageName(content: list[str]) -> str | None: return None return None -def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: - print('Getting input page-ids') - pageTitles: set[str] = set() - pageIds: set[int] = set() - print('Reading event titles') +def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]: + print('Getting event data') + titleToCtg: dict[str, str] = {} dbCon = sqlite3.connect(dbFile) - dbCur = dbCon.cursor() - for (pageId,) in dbCur.execute('SELECT title from events'): - pageTitles.add(pageId) + for title, ctg in dbCon.execute('SELECT title, ctg from events'): + titleToCtg[title] = ctg dbCon.close() - print('Getting event page IDs') + print('Getting top images for each event category') + ctgToTitles: dict[str, list[str]] = {} + dbCon = sqlite3.connect(pageviewDb) + for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'): + if title not in titleToCtg: + continue + ctg = titleToCtg[title] + if ctg not in ctgToTitles: + ctgToTitles[ctg] = [] + elif len(ctgToTitles[ctg]) == maxImgsPerCtg: + continue + ctgToTitles[ctg].append(title) + del titleToCtg[title] + dbCon.close() + for title, ctg in titleToCtg.items(): 
# Account for titles without view counts + if ctg not in ctgToTitles: + ctgToTitles[ctg] = [] + elif len(ctgToTitles[ctg]) == maxImgsPerCtg: + continue + ctgToTitles[ctg].append(title) + print('Getting page IDs') + pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() - for pageTitle in pageTitles: - row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (pageTitle,)).fetchone() - if row: - pageIds.add(row[0]) + for ctg in ctgToTitles: + for title in ctgToTitles[ctg]: + row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() + if row: + pageIds.add(row[0]) dbCon.close() - print(f'Found {len(pageIds)} out of {len(pageTitles)}') + print(f'Result: {len(pageIds)} out of {len(titleToCtg)}') return pageIds if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # - pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) + pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py index 019b757..dc77fe6 100644 --- a/backend/tests/enwiki/test_gen_img_data.py +++ b/backend/tests/enwiki/test_gen_img_data.py @@ -19,6 +19,22 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): { (1, 'Belgium', 2389729, None, None, None, 2, 'country'), (2, 'George Washington', 2353711, None, 2378478, None, 2, 'human'), + (3, 'Douglas Adams', 2434082, None, 2452040, None, 2, 'human'), + (4, 'World War II', 2429507, None, 2431700, None, 2, 'event'), + (5, 'Marie Curie', 2403277, None, 2427622, None, 2, 'human'), + } + ) + # Create temp pageviews db + pageviewDb = os.path.join(tempDir, 'pageview_data.db') + createTestDbTable( + pageviewDb, + 'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)', + 'INSERT INTO views VALUES (?, ?, ?)', + { + ('George 
Washington', 2, 8), + ('Marie Curie', 5, 10), + ('Douglas Adams', 3, 5), + ('Belgium', 1, 100), } ) # Create temp dump-index db @@ -30,13 +46,15 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): { ('Belgium',10,0,-1), ('George Washington',20,0,-1), + ('Douglas Adams',30,0,-1), + ('Marie Curie',50,0,-1), ('Autism',25,0,-1), } ) # Run - pageIds = getInputPageIdsFromDb(dbFile, indexDb) + pageIds = getInputPageIdsFromDb(dbFile, pageviewDb, indexDb, 2) # Check - self.assertEqual(pageIds, {10, 20}) + self.assertEqual(pageIds, {50, 20, 10}) class TestGenData(unittest.TestCase): def test_gen(self): |
