diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:21:03 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-21 12:32:01 +1100 |
| commit | 0a9b2c2e5eca8a04e37fbdd423379882863237c2 (patch) | |
| tree | 1812bdb6bb13e4f76fdd7ef04075b291f775c213 /backend/hist_data/enwiki | |
| parent | 8321e2f92dbc073b8f1de87895d6620a2021b22e (diff) | |
Adjust backend coding style
Increase line spacing, add section comments, etc.
Diffstat (limited to 'backend/hist_data/enwiki')
| -rwxr-xr-x | backend/hist_data/enwiki/download_img_license_info.py | 29 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_imgs.py | 27 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_desc_data.py | 51 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_dump_index_db.py | 17 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 44 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_pageview_data.py | 30 |
6 files changed, 135 insertions, 63 deletions
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 43f2c43..6fd710c 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -10,12 +10,16 @@ at already-processed names to decide what to skip. """ import argparse -import re, time, signal -import sqlite3, urllib.parse, html +import re +import time +import signal +import sqlite3 +import urllib.parse +import html import requests IMG_DB = 'img_data.db' -# + API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' BATCH_SZ = 50 # Max 50 @@ -26,17 +30,18 @@ def downloadInfo(imgDb: str) -> None: print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Checking for table') if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: dbCur.execute('CREATE TABLE imgs (id INT PRIMARY KEY, name TEXT UNIQUE, ' \ 'license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - # + print('Reading image names') imgNames: set[str] = set() for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) print(f'Found {len(imgNames)}') - # + print('Checking for already-processed images') nextImgId = 1 oldSz = len(imgNames) @@ -45,7 +50,7 @@ def downloadInfo(imgDb: str) -> None: if imgId >= nextImgId: nextImgId = imgId + 1 print(f'Found {oldSz - len(imgNames)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -54,7 +59,7 @@ def downloadInfo(imgDb: str) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Iterating through image names') imgNameList = list(imgNames) iterNum = 0 @@ -65,9 +70,11 @@ def downloadInfo(imgDb: str) -> None: if interrupted: print(f'Exiting loop at iteration {iterNum}') break + # Get 
batch imgBatch = imgNameList[i:i+BATCH_SZ] imgBatch = ['File:' + x for x in imgBatch] + # Make request headers = { 'user-agent': USER_AGENT, @@ -90,6 +97,7 @@ def downloadInfo(imgDb: str) -> None: print(f'ERROR: Exception while downloading info: {e}') print('\tImage batch: ' + '|'.join(imgBatch)) continue + # Parse response-object if 'query' not in responseObj or 'pages' not in responseObj['query']: print('WARNING: Response object doesn\'t have page data') @@ -120,6 +128,7 @@ def downloadInfo(imgDb: str) -> None: if title not in imgNames: print(f'WARNING: Got title "{title}" not in image-name list') continue + if 'imageinfo' not in page: print(f'WARNING: No imageinfo section for page "{title}"') continue @@ -129,6 +138,7 @@ def downloadInfo(imgDb: str) -> None: artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup if artist is not None: artist = TAG_REGEX.sub(' ', artist).strip() @@ -140,11 +150,12 @@ def downloadInfo(imgDb: str) -> None: credit = WHITESPACE_REGEX.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) + # Add to db dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?, ?)', (nextImgId, title, license, artist, credit, restrictions, url)) nextImgId += 1 - # + print('Closing database') dbCon.commit() dbCon.close() @@ -152,5 +163,5 @@ def downloadInfo(imgDb: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadInfo(IMG_DB) diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index df40bae..e484b33 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,33 +9,38 @@ The 
program can be re-run to continue downloading, and looks in the output directory do decide what to skip. """ -# Took about a week to downloaded about 60k images +# Note: Took about a week to downloaded about 60k images import argparse -import re, os, time, signal +import re +import os +import time +import signal import sqlite3 -import urllib.parse, requests +import urllib.parse +import requests IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' - # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + # Note: https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'. + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec. 
EXP_BACKOFF = True # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): os.mkdir(outDir) + print('Checking for already-downloaded images') fileList = os.listdir(outDir) imgIdsDone: set[int] = set() for filename in fileList: imgIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(imgIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -44,10 +49,11 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() + print('Starting downloads') iterNum = 0 query = 'SELECT id, license, artist, credit, restrictions, url FROM imgs' @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for image ID {imgId}') @@ -92,11 +100,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: timeout *= 2 print(f'New timeout: {timeout}') continue + print('Closing database') dbCon.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index bb2b845..194afe8 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ 
-5,30 +5,40 @@ Reads through the wiki dump, attempts to parse short-descriptions, and adds them to a database """ -# In testing, this script took over 10 hours to run, and generated about 5GB +# Note: In testing, this script took over 10 hours to run, and generated about 5GB import argparse -import sys, os, re -import bz2, html, mwxml, mwparserfromhell +import sys +import os +import re import sqlite3 +import bz2 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' -# Regexps + DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: + """ Reads dump, parses descriptions, and writes to db """ print('Creating database') if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') @@ -39,13 +49,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') 
with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +70,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. 
+ + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,18 +125,24 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: + """ Tries to remove markup from wikitext content """ content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: + """ Replaces underscores in wiki item title """ return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 6be8bc5..8872171 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -1,24 +1,28 @@ #!/usr/bin/python3 """ -Adds data from the wiki-dump index-file into a database +Converts data from the wiki-dump index-file into a database """ import argparse -import sys, os, re -import bz2, sqlite3 +import sys +import os +import re +import bz2 +import sqlite3 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines DB_FILE = 'dump_index.db' def genData(indexFile: str, dbFile: str) -> None: - """ Reads the index file and creates the db """ if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') + 
print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') lastOffset = 0 @@ -29,7 +33,7 @@ def genData(indexFile: str, dbFile: str) -> None: lineNum += 1 if lineNum % 1e5 == 0: print(f'At line {lineNum}') - # + match = lineRegex.fullmatch(line.rstrip()) assert match is not None offsetStr, pageId, title = match.group(1,2,3) @@ -49,6 +53,7 @@ def genData(indexFile: str, dbFile: str) -> None: dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) except sqlite3.IntegrityError as e: print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') dbCon.commit() dbCon.close() @@ -56,5 +61,5 @@ def genData(indexFile: str, dbFile: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 9aa3863..05df63d 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -8,35 +8,42 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. 
""" -import os, re -import bz2, html, urllib.parse +import argparse +import os +import re +import bz2 +import html +import urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# Regexps + ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) +# ========== For data generation ========== + def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: + """ Looks up page IDs in dump and creates database """ print('Opening databases') indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + print('Checking tables') if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)') # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') - else: - # Check for already-processed page IDs + else: # Check for already-processed page IDs numSkipped = 0 for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'): if pid in pageIds: @@ -45,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: else: print(f'Found already-processed page ID {pid} which was not in input set') print(f'Will skip {numSkipped} already-processed page IDs') - # + print('Getting dump-file offsets') offsetToPageId: dict[int, list[int]] = {} offsetToEnd: dict[int, int] = {} 
# Maps chunk-start offsets to their chunk-end offsets @@ -55,7 +62,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') - # + query = 'SELECT offset, next_offset, title FROM offsets WHERE id = ?' row = indexDbCur.execute(query, (pageId,)).fetchone() if row is None: @@ -68,7 +75,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: offsetToPageId[chunkOffset].append(pageId) pageIdToTitle[pageId] = title print(f'Found {len(offsetToEnd)} chunks to check') - # + print('Iterating through chunks in dump file') with open(dumpFile, mode='rb') as file: iterNum = 0 @@ -76,7 +83,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 100 == 0: print(f'At iteration {iterNum}') - # + chunkPageIds = offsetToPageId[pageOffset] # Jump to chunk file.seek(pageOffset) @@ -122,21 +129,24 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: content.append(line[:line.rfind('</text>')]) # Look for image-filename imageName = getImageName(content) - imgDbCur.execute('INSERT into page_imgs VALUES (?, ?, ?)', (pageId, None if imageName is None else pageIdToTitle[pageId], imageName)) + imgDbCur.execute( + 'INSERT into page_imgs VALUES (?, ?, ?)', + (pageId, None if imageName is None else pageIdToTitle[pageId], imageName)) break if not foundTextEnd: print(f'WARNING: Did not find </text> for page id {pageId}') break if not foundText: print(f'WARNING: Did not find <text> for page id {pageId}') - # + print('Closing databases') indexDbCon.close() imgDbCon.commit() imgDbCon.close() + def getImageName(content: list[str]) -> str | None: """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ - # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections + # Note: Doesn't try and find images in outside-infobox [[File:...]] 
and <imagemap> sections for line in content: match = IMG_LINE_REGEX.match(line) if match is not None: @@ -177,6 +187,8 @@ def getImageName(content: list[str]) -> str | None: return None return None +# ========== For getting input page IDs ========== + def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: print('Getting event data') titles: set[str] = set() @@ -184,6 +196,7 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: for (title,) in dbCon.execute('SELECT title from events'): titles.add(title) dbCon.close() + print('Getting page IDs') pageIds: set[int] = set() dbCon = sqlite3.connect(indexDb) @@ -193,12 +206,15 @@ def getInputPageIdsFromDb(dbFile: str, indexDb: str) -> set[int]: if row: pageIds.add(row[0]) dbCon.close() + print(f'Result: {len(pageIds)} out of {len(titles)}') return pageIds + +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + pageIds = getInputPageIdsFromDb(DB_FILE, INDEX_DB) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/hist_data/enwiki/gen_pageview_data.py b/backend/hist_data/enwiki/gen_pageview_data.py index 935b303..57d6c7b 100755 --- a/backend/hist_data/enwiki/gen_pageview_data.py +++ b/backend/hist_data/enwiki/gen_pageview_data.py @@ -3,27 +3,34 @@ """ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database + +Each pageview file has lines that seem to hold these space-separated fields: + wiki code (eg: en.wikipedia), article title, page ID (may be: null), + platform (eg: mobile-web), monthly view count, + hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) """ -# Took about 10min per file (each had about 180e6 lines) +# Note: Took about 10min per file (each had about 180e6 lines) -import sys, os, glob, math, re +import argparse +import sys +import os 
+import glob +import math +import re from collections import defaultdict -import bz2, sqlite3 +import bz2 +import sqlite3 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') DUMP_INDEX_DB = 'dump_index.db' DB_FILE = 'pageview_data.db' def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: - # Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) - # + namespaceRegex = re.compile(r'[a-zA-Z]+:') titleToViews: dict[str, int] = defaultdict(int) linePrefix = b'en.wikipedia ' @@ -35,6 +42,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: print(f'At line {lineNum}') if not line.startswith(linePrefix): continue + # Get second and second-last fields linePart = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields title = linePart[:linePart.find(b' ')].decode('utf-8') @@ -45,11 +53,12 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: continue if namespaceRegex.match(title) is not None: continue + # Update map title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -66,8 +75,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: idbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) |
