From 0cd58b3c1a8c5297579ea7a24a14d82ae8fed169 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Tue, 30 Aug 2022 17:54:10 +1000 Subject: Add node-popularity data for search-sugg ordering Add Wikipedia pageview dumps to enwiki/pageview/ Add scripts to generate viewcount averages Update backend to sort search suggestions by popularity --- backend/tilo.py | 37 +++++++++--------- backend/tolData/README.md | 12 +++++- backend/tolData/enwiki/README.md | 16 +++++++- backend/tolData/enwiki/genPageviewData.py | 62 +++++++++++++++++++++++++++++++ backend/tolData/genPopData.py | 40 ++++++++++++++++++++ 5 files changed, 146 insertions(+), 21 deletions(-) create mode 100755 backend/tolData/enwiki/genPageviewData.py create mode 100755 backend/tolData/genPopData.py (limited to 'backend') diff --git a/backend/tilo.py b/backend/tilo.py index 196f193..f08da80 100755 --- a/backend/tilo.py +++ b/backend/tilo.py @@ -47,9 +47,10 @@ class TolNode: self.iucn = iucn # null | string class SearchSugg: " Represents a search suggestion " - def __init__(self, name, canonicalName=None): + def __init__(self, name, canonicalName=None, pop=0): self.name = name # string self.canonicalName = canonicalName # string | null + self.pop = pop # number class SearchSuggResponse: " Sent as responses to 'sugg' requests " def __init__(self, searchSuggs, hasMore): @@ -144,40 +145,42 @@ def lookupSuggs(searchStr, suggLimit, tree, dbCur): " For a search string, returns a SearchSuggResponse describing search suggestions " results = [] hasMore = False - # Get node names and alt-names + # Get node names and alt-names, ordering by popularity query1, query2 = (None, None) nodesTable = f"nodes_{getTableSuffix(tree)}" - query1 = f"SELECT DISTINCT name FROM {nodesTable}" \ - f" WHERE name LIKE ? AND name NOT LIKE '[%' ORDER BY length(name) LIMIT ?" - query2 = f"SELECT DISTINCT alt_name, names.name FROM" \ + query1 = f"SELECT DISTINCT {nodesTable}.name, node_pop.pop FROM {nodesTable}" \ + f" LEFT JOIN node_pop ON {nodesTable}.name = node_pop.name" \ + f" WHERE node_pop.name LIKE ? AND node_pop.name NOT LIKE '[%'" \ + f" ORDER BY node_pop.pop DESC LIMIT ?" + query2 = f"SELECT DISTINCT alt_name, names.name, node_pop.pop FROM" \ f" names INNER JOIN {nodesTable} ON names.name = {nodesTable}.name" \ - f" WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?" - # Join results, and get shortest + f" LEFT JOIN node_pop ON {nodesTable}.name = node_pop.name" \ + f" WHERE alt_name LIKE ? ORDER BY node_pop.pop DESC LIMIT ?" suggs = [] - for (nodeName,) in dbCur.execute(query1, (searchStr + "%", suggLimit + 1)): - suggs.append(SearchSugg(nodeName)) - for (altName, nodeName) in dbCur.execute(query2, (searchStr + "%", suggLimit + 1)): - suggs.append(SearchSugg(altName, nodeName)) + for nodeName, pop in dbCur.execute(query1, (searchStr + "%", suggLimit + 1)): + suggs.append(SearchSugg(nodeName, pop=pop)) + for altName, nodeName, pop in dbCur.execute(query2, (searchStr + "%", suggLimit + 1)): + suggs.append(SearchSugg(altName, nodeName, pop)) # If insufficient results, try substring-search foundNames = {n.name for n in suggs} suggs2 = [] if len(suggs) < suggLimit: newLim = suggLimit + 1 - len(suggs) - for (nodeName,) in dbCur.execute(query1, ("%" + searchStr + "%", newLim)): + for nodeName, pop in dbCur.execute(query1, ("%" + searchStr + "%", newLim)): if nodeName not in foundNames: - suggs2.append(SearchSugg(nodeName)) + suggs2.append(SearchSugg(nodeName, pop=pop)) foundNames.add(nodeName) if len(suggs) + len(suggs2) < suggLimit: newLim = suggLimit + 1 - len(suggs) - len(suggs2) - for (altName, nodeName) in dbCur.execute(query2, ("%" + searchStr + "%", suggLimit + 1)): + for altName, nodeName, pop in dbCur.execute(query2, ("%" + searchStr + "%", suggLimit + 1)): if altName not in foundNames: - suggs2.append(SearchSugg(altName, nodeName)) + suggs2.append(SearchSugg(altName, nodeName, pop)) foundNames.add(altName) # Sort results suggs.sort(key=lambda x: x.name) - suggs.sort(key=lambda x: len(x.name)) + suggs.sort(key=lambda x: x.pop, reverse=True) suggs2.sort(key=lambda x: x.name) - suggs2.sort(key=lambda x: len(x.name)) + suggs2.sort(key=lambda x: x.pop, reverse=True) suggs.extend(suggs2) # Apply suggestion-quantity limit results = suggs[:suggLimit] diff --git a/backend/tolData/README.md b/backend/tolData/README.md index ece07b4..3b78af8 100644 --- a/backend/tolData/README.md +++ b/backend/tolData/README.md @@ -45,7 +45,10 @@ This directory holds files used to generate the tree-of-life database data.db. ## Other - `node_iucn`
Format: `name TEXT PRIMARY KEY, iucn TEXT`
- Associated nodes with IUCN conservation status strings (eg: 'endangered') + Associates nodes with IUCN conservation status strings (eg: 'endangered') +- `node_pop`
+ Format: `name TEXT PRIMARY KEY, pop INT`
+ Associates nodes with popularity values (higher means more popular) # Generating the Database @@ -135,7 +138,12 @@ Some of the scripts use third-party packages: images of it's children. Adds the `linked_imgs` table, and uses the `nodes`, `edges`, and `node_imgs` tables. -## Do some Post-Processing +## Generate Reduced Trees 1. Run genReducedTrees.py, which generates multiple reduced versions of the tree, adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from pickedNodes.txt, which lists names of nodes that must be included (1 per line). + +## Generate Node Popularity Data +1. Obtain 'page view files' in enwiki/Run genPopData.py, as specified in it's README. +2. Run genPopData.py, which adds the `node_pop` table, using data in enwiki/, + and the `wiki_ids` table. diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md index 7df21c9..76f9ee5 100644 --- a/backend/tolData/enwiki/README.md +++ b/backend/tolData/enwiki/README.md @@ -2,8 +2,8 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Downloaded Files - enwiki-20220501-pages-articles-multistream.xml.bz2
- Obtained via (site suggests downloading from a mirror). Contains text content and metadata for pages in enwiki. + Obtained via (site suggests downloading from a mirror). Some file content and format information was available from . - enwiki-20220501-pages-articles-multistream-index.txt.bz2
@@ -13,7 +13,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Dump-Index Files - genDumpIndexDb.py
- Creates an sqlite-database version of the enwiki-dump index file. + Creates a database version of the enwiki-dump index file. - dumpIndex.db
Generated by genDumpIndexDb.py.
Tables:
@@ -45,6 +45,18 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. - downloadImgs.py
Used to download image files into imgs/. +# Page View Files +- pageviews/pageviews-*-user.bz2 + Each holds wikimedia article page view data for some month. + Obtained via . + Some format info was available from . +- genPageviewData.py
+ Reads pageview/*, and creates a database holding average monthly pageview counts. +- pageviewData.db
+ Generated using genPageviewData.py.
+ Tables:
+ - `views`: `title TEXT PRIMARY KEY, id INT, views INT` + # Other Files - lookupPage.py
Running `lookupPage.py title1` looks in the dump for a page with a given title, diff --git a/backend/tolData/enwiki/genPageviewData.py b/backend/tolData/enwiki/genPageviewData.py new file mode 100755 index 0000000..f0901b2 --- /dev/null +++ b/backend/tolData/enwiki/genPageviewData.py @@ -0,0 +1,62 @@ +#!/usr/bin/python3 + +import sys, os, glob, math, re +from collections import defaultdict +import bz2, sqlite3 + +import argparse +parser = argparse.ArgumentParser(description=''' +Reads through wikimedia files containing pageview counts, +computes average counts, and adds them to a database +''', formatter_class=argparse.RawDescriptionHelpFormatter) +args = parser.parse_args() + +pageviewFiles = glob.glob('./pageviews/pageviews-*-user.bz2') +dbFile = 'pageviewData.db' +dumpIndexDb = 'dumpIndex.db' + +# Took about 15min per file (each about 180e6 lines) + +if os.path.exists(dbFile): + print('ERROR: Database already exists') + sys.exit(1) + +# Each pageview file has lines that seem to hold these space-separated fields: + # wiki code (eg: en.wikipedia), article title, page ID (may be: null), + # platform (eg: mobile-web), monthly view count, + # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) +namespaceRegex = re.compile(r'[a-zA-Z]+:') +titleToViews = defaultdict(int) +linePrefix = b'en.wikipedia ' +for filename in pageviewFiles: + print(f'Reading from {filename}') + with bz2.open(filename, 'rb') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e6 == 0: + print(f'At line {lineNum}') + if not line.startswith(linePrefix): + continue + # Get second and second-last fields + line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields + title = line[:line.find(b' ')].decode('utf-8') + viewCount = int(line[line.rfind(b' ')+1:]) + if namespaceRegex.match(title) != None: + continue + # Update map + titleToViews[title] += viewCount +print(f'Found {len(titleToViews)} titles') + +print('Writing to db') +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +idbCon = sqlite3.connect(dumpIndexDb) +idbCur = idbCon.cursor() +dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)') +for title, views in titleToViews.items(): + row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() + if row != None: + wikiId = int(row[0]) + dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles)))) +dbCon.commit() +dbCon.close() +idbCon.close() diff --git a/backend/tolData/genPopData.py b/backend/tolData/genPopData.py new file mode 100755 index 0000000..9c5382c --- /dev/null +++ b/backend/tolData/genPopData.py @@ -0,0 +1,40 @@ +#!/usr/bin/python3 + +import sys +import sqlite3 + +import argparse +parser = argparse.ArgumentParser(description=''' +Reads enwiki page view info from a database, and stores it +as node popularity values in the database. +''', formatter_class=argparse.RawDescriptionHelpFormatter) +args = parser.parse_args() + +pageviewsDb = 'enwiki/pageviewData.db' +dbFile = 'data.db' + +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() + +print('Getting view counts') +pdbCon = sqlite3.connect(pageviewsDb) +pdbCur = pdbCon.cursor() +nodeToViews = {} # Maps node names to counts +iterNum = 0 +for wikiId, views in pdbCur.execute('SELECT id, views from views'): + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') # Reached 1.6e6 + # + row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone() + if row != None: + nodeToViews[row[0]] = views +pdbCon.close() + +print(f'Writing {len(nodeToViews)} entries to db') +dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)') +for nodeName, views in nodeToViews.items(): + dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views)) + +dbCon.commit() +dbCon.close() -- cgit v1.2.3