From 0e5e46cedaaeacf59cfd0f2e30c1ae6923466870 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Fri, 30 Dec 2022 23:28:09 +1100 Subject: Generate event_disp data before image-generation Make gen_disp_data.py delete non-displayable events Make reduce_event_data.py also delete from 'dist' and 'event_disp' Remove MAX_IMGS_PER_CTG from enwiki/gen_img_data.py Make gen_desc_data.py include events without images --- backend/hist_data/enwiki/README.md | 2 +- backend/hist_data/enwiki/gen_img_data.py | 43 ++++++++------------------------ 2 files changed, 11 insertions(+), 34 deletions(-) (limited to 'backend/hist_data/enwiki') diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index 29fc2ff..262ebdb 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -38,7 +38,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database. - `img_data.db`
Used to hold metadata about infobox images for a set of page IDs. - Generated using `get_enwiki_img_data.py` and `download_img_license_info.py`.
+ Generated using `gen_img_data.py` and `download_img_license_info.py`.
Tables:
- `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT`
`img_name` may be NULL, which means 'none found', and is used to avoid re-processing page IDs. diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index b4ade9f..922b893 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -14,10 +14,8 @@ import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' -PAGEVIEW_DB = 'pageview_data.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -MAX_IMGS_PER_CTG = 20000 # ID_LINE_REGEX = re.compile(r'(.*)') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') @@ -179,49 +177,28 @@ def getImageName(content: list[str]) -> str | None: return None return None -def getInputPageIdsFromDb(dbFile: str, pageviewDb: str, indexDb: str, maxImgsPerCtg: int) -> set[int]: +def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: print('Getting event data') - titleToCtg: dict[str, str] = {} + titles: set[str] = set() dbCon = sqlite3.connect(dbFile) - for title, ctg in dbCon.execute('SELECT title, ctg from events'): - titleToCtg[title] = ctg + for (title,) in dbCon.execute('SELECT title from events'): + titles.add(title) dbCon.close() - print('Getting top images for each event category') - ctgToTitles: dict[str, list[str]] = {} - dbCon = sqlite3.connect(pageviewDb) - for (title,) in dbCon.execute('SELECT title FROM views ORDER BY views DESC'): - if title not in titleToCtg: - continue - ctg = titleToCtg[title] - if ctg not in ctgToTitles: - ctgToTitles[ctg] = [] - elif len(ctgToTitles[ctg]) == maxImgsPerCtg: - continue - ctgToTitles[ctg].append(title) - del titleToCtg[title] - dbCon.close() - for title, ctg in titleToCtg.items(): # Account for titles without view counts - if ctg not in ctgToTitles: - ctgToTitles[ctg] = [] - elif len(ctgToTitles[ctg]) == maxImgsPerCtg: - continue - ctgToTitles[ctg].append(title) print('Getting page IDs') pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() - for ctg in ctgToTitles: - for title in ctgToTitles[ctg]: - row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() - if row: - pageIds.add(row[0]) + for title in titles: + row = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() + if row: + pageIds.add(row[0]) dbCon.close() - print(f'Result: {len(pageIds)} out of {len(titleToCtg)}') + print(f'Result: {len(pageIds)} out of {len(titles)}') return pageIds if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # - pageIds = getInputPageIdsFromDb(DB_FILE, PAGEVIEW_DB, INDEX_DB, MAX_IMGS_PER_CTG) + pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) -- cgit v1.2.3