aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rwxr-xr-xbackend/tilo.py37
-rw-r--r--backend/tolData/README.md12
-rw-r--r--backend/tolData/enwiki/README.md16
-rwxr-xr-xbackend/tolData/enwiki/genPageviewData.py62
-rwxr-xr-xbackend/tolData/genPopData.py40
-rw-r--r--src/lib.ts1
7 files changed, 148 insertions, 21 deletions
diff --git a/.gitignore b/.gitignore
index 2d08c56..70b08a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@
/backend/tolData/enwiki/*.gz
/backend/tolData/enwiki/*.db
/backend/tolData/enwiki/imgs/
+/backend/tolData/enwiki/pageviews/
/backend/tolData/dbpedia/*.bz2
/backend/tolData/dbpedia/*.db
/backend/tolData/wikidata/*.bz2
diff --git a/backend/tilo.py b/backend/tilo.py
index 196f193..f08da80 100755
--- a/backend/tilo.py
+++ b/backend/tilo.py
@@ -47,9 +47,10 @@ class TolNode:
self.iucn = iucn # null | string
class SearchSugg:
" Represents a search suggestion "
- def __init__(self, name, canonicalName=None):
+ def __init__(self, name, canonicalName=None, pop=0):
self.name = name # string
self.canonicalName = canonicalName # string | null
+ self.pop = pop # number
class SearchSuggResponse:
" Sent as responses to 'sugg' requests "
def __init__(self, searchSuggs, hasMore):
@@ -144,40 +145,42 @@ def lookupSuggs(searchStr, suggLimit, tree, dbCur):
 	" For a search string, returns a SearchSuggResponse describing search suggestions "
 	results = []
 	hasMore = False
-	# Get node names and alt-names
+	# Get node names and alt-names, ordering by popularity
 	query1, query2 = (None, None)
 	nodesTable = f"nodes_{getTableSuffix(tree)}"
-	query1 = f"SELECT DISTINCT name FROM {nodesTable}" \
-		f" WHERE name LIKE ? AND name NOT LIKE '[%' ORDER BY length(name) LIMIT ?"
-	query2 = f"SELECT DISTINCT alt_name, names.name FROM" \
+	query1 = f"SELECT DISTINCT {nodesTable}.name, IFNULL(node_pop.pop, 0) FROM {nodesTable}" \
+		f" LEFT JOIN node_pop ON {nodesTable}.name = node_pop.name" \
+		f" WHERE {nodesTable}.name LIKE ? AND {nodesTable}.name NOT LIKE '[%'" \
+		f" ORDER BY node_pop.pop DESC LIMIT ?"
+	query2 = f"SELECT DISTINCT alt_name, names.name, IFNULL(node_pop.pop, 0) FROM" \
 		f" names INNER JOIN {nodesTable} ON names.name = {nodesTable}.name" \
-		f" WHERE alt_name LIKE ? ORDER BY length(alt_name) LIMIT ?"
-	# Join results, and get shortest
+		f" LEFT JOIN node_pop ON {nodesTable}.name = node_pop.name" \
+		f" WHERE alt_name LIKE ? ORDER BY node_pop.pop DESC LIMIT ?"
 	suggs = []
-	for (nodeName,) in dbCur.execute(query1, (searchStr + "%", suggLimit + 1)):
-		suggs.append(SearchSugg(nodeName))
-	for (altName, nodeName) in dbCur.execute(query2, (searchStr + "%", suggLimit + 1)):
-		suggs.append(SearchSugg(altName, nodeName))
+	for nodeName, pop in dbCur.execute(query1, (searchStr + "%", suggLimit + 1)):
+		suggs.append(SearchSugg(nodeName, pop=pop))
+	for altName, nodeName, pop in dbCur.execute(query2, (searchStr + "%", suggLimit + 1)):
+		suggs.append(SearchSugg(altName, nodeName, pop))
 	# If insufficient results, try substring-search
 	foundNames = {n.name for n in suggs}
 	suggs2 = []
 	if len(suggs) < suggLimit:
 		newLim = suggLimit + 1 - len(suggs)
-		for (nodeName,) in dbCur.execute(query1, ("%" + searchStr + "%", newLim)):
+		for nodeName, pop in dbCur.execute(query1, ("%" + searchStr + "%", newLim)):
 			if nodeName not in foundNames:
-				suggs2.append(SearchSugg(nodeName))
+				suggs2.append(SearchSugg(nodeName, pop=pop))
 				foundNames.add(nodeName)
 	if len(suggs) + len(suggs2) < suggLimit:
 		newLim = suggLimit + 1 - len(suggs) - len(suggs2)
-		for (altName, nodeName) in dbCur.execute(query2, ("%" + searchStr + "%", suggLimit + 1)):
+		for altName, nodeName, pop in dbCur.execute(query2, ("%" + searchStr + "%", newLim)):
 			if altName not in foundNames:
-				suggs2.append(SearchSugg(altName, nodeName))
+				suggs2.append(SearchSugg(altName, nodeName, pop))
 				foundNames.add(altName)
 	# Sort results
 	suggs.sort(key=lambda x: x.name)
-	suggs.sort(key=lambda x: len(x.name))
+	suggs.sort(key=lambda x: x.pop, reverse=True)
 	suggs2.sort(key=lambda x: x.name)
-	suggs2.sort(key=lambda x: len(x.name))
+	suggs2.sort(key=lambda x: x.pop, reverse=True)
 	suggs.extend(suggs2)
 	# Apply suggestion-quantity limit
 	results = suggs[:suggLimit]
diff --git a/backend/tolData/README.md b/backend/tolData/README.md
index ece07b4..3b78af8 100644
--- a/backend/tolData/README.md
+++ b/backend/tolData/README.md
@@ -45,7 +45,10 @@ This directory holds files used to generate the tree-of-life database data.db.
## Other
- `node_iucn` <br>
Format: `name TEXT PRIMARY KEY, iucn TEXT` <br>
- Associated nodes with IUCN conservation status strings (eg: 'endangered')
+ Associates nodes with IUCN conservation status strings (eg: 'endangered')
+- `node_pop` <br>
+ Format: `name TEXT PRIMARY KEY, pop INT` <br>
+ Associates nodes with popularity values (higher means more popular)
# Generating the Database
@@ -135,7 +138,12 @@ Some of the scripts use third-party packages:
images of it's children. Adds the `linked_imgs` table, and uses the
`nodes`, `edges`, and `node_imgs` tables.
-## Do some Post-Processing
+## Generate Reduced Trees
1. Run genReducedTrees.py, which generates multiple reduced versions of the tree,
adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from
pickedNodes.txt, which lists names of nodes that must be included (1 per line).
+
+## Generate Node Popularity Data
+1. Obtain 'page view files' into enwiki/pageviews/, as specified in its README.
+2. Run genPopData.py, which adds the `node_pop` table, using data in enwiki/,
+ and the `wiki_ids` table.
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md
index 7df21c9..76f9ee5 100644
--- a/backend/tolData/enwiki/README.md
+++ b/backend/tolData/enwiki/README.md
@@ -2,8 +2,8 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
# Downloaded Files
- enwiki-20220501-pages-articles-multistream.xml.bz2 <br>
- Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror).
Contains text content and metadata for pages in enwiki.
+ Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror).
Some file content and format information was available from
<https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>.
- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br>
@@ -13,7 +13,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
# Dump-Index Files
- genDumpIndexDb.py <br>
- Creates an sqlite-database version of the enwiki-dump index file.
+ Creates a database version of the enwiki-dump index file.
- dumpIndex.db <br>
Generated by genDumpIndexDb.py. <br>
Tables: <br>
@@ -45,6 +45,18 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
- downloadImgs.py <br>
Used to download image files into imgs/.
+# Page View Files
+- pageviews/pageviews-*-user.bz2 <br>
+ Each holds wikimedia article page view data for some month.
+ Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>.
+ Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>.
+- genPageviewData.py <br>
+  Reads pageviews/*, and creates a database holding average monthly pageview counts.
+- pageviewData.db <br>
+ Generated using genPageviewData.py. <br>
+ Tables: <br>
+ - `views`: `title TEXT PRIMARY KEY, id INT, views INT`
+
# Other Files
- lookupPage.py <br>
Running `lookupPage.py title1` looks in the dump for a page with a given title,
diff --git a/backend/tolData/enwiki/genPageviewData.py b/backend/tolData/enwiki/genPageviewData.py
new file mode 100755
index 0000000..f0901b2
--- /dev/null
+++ b/backend/tolData/enwiki/genPageviewData.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+
+import sys, os, glob, math, re
+from collections import defaultdict
+import bz2, sqlite3
+
+import argparse
+parser = argparse.ArgumentParser(description='''
+Reads through wikimedia files containing pageview counts,
+computes average counts, and adds them to a database
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+pageviewFiles = glob.glob('./pageviews/pageviews-*-user.bz2')
+dbFile = 'pageviewData.db'
+dumpIndexDb = 'dumpIndex.db'
+
+# Took about 15min per file (each about 180e6 lines)
+
+if os.path.exists(dbFile):
+	print('ERROR: Database already exists')
+	sys.exit(1)
+
+# Each pageview file has lines that seem to hold these space-separated fields:
+	# wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+	# platform (eg: mobile-web), monthly view count,
+	# hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
+namespaceRegex = re.compile(r'[a-zA-Z]+:')
+titleToViews = defaultdict(int)
+linePrefix = b'en.wikipedia '
+for filename in pageviewFiles:
+	print(f'Reading from {filename}')
+	with bz2.open(filename, 'rb') as file:
+		for lineNum, line in enumerate(file, 1):
+			if lineNum % 1000000 == 0:
+				print(f'At line {lineNum}')
+			if not line.startswith(linePrefix):
+				continue
+			# Get second and second-last fields
+			line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
+			title = line[:line.find(b' ')].decode('utf-8')
+			viewCount = int(line[line.rfind(b' ')+1:])
+			if namespaceRegex.match(title) is not None:
+				continue
+			# Update map
+			titleToViews[title] += viewCount
+print(f'Found {len(titleToViews)} titles')
+
+print('Writing to db')
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+idbCon = sqlite3.connect(dumpIndexDb)
+idbCur = idbCon.cursor()
+dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
+for title, views in titleToViews.items():
+	row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
+	if row is not None:
+		wikiId = int(row[0])
+		dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles))))
+dbCon.commit()
+dbCon.close()
+idbCon.close()
diff --git a/backend/tolData/genPopData.py b/backend/tolData/genPopData.py
new file mode 100755
index 0000000..9c5382c
--- /dev/null
+++ b/backend/tolData/genPopData.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python3
+
+import sys
+import sqlite3
+
+import argparse
+parser = argparse.ArgumentParser(description='''
+Reads enwiki page view info from a database, and stores it
+as node popularity values in the database.
+''', formatter_class=argparse.RawDescriptionHelpFormatter)
+args = parser.parse_args()
+
+pageviewsDb = 'enwiki/pageviewData.db'
+dbFile = 'data.db'
+
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+
+print('Getting view counts')
+pdbCon = sqlite3.connect(pageviewsDb)
+pdbCur = pdbCon.cursor()
+nodeToViews = {} # Maps node names to counts
+iterNum = 0
+for wikiId, views in pdbCur.execute('SELECT id, views from views'):
+	iterNum += 1
+	if iterNum % 10000 == 0:
+		print(f'At iteration {iterNum}') # Reached 1.6e6
+	# Resolve the enwiki page ID to a tree-of-life node name
+	row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone()
+	if row is not None:
+		nodeToViews[row[0]] = views
+pdbCon.close()
+
+print(f'Writing {len(nodeToViews)} entries to db')
+dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)')
+for nodeName, views in nodeToViews.items():
+	dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views))
+
+dbCon.commit()
+dbCon.close()
diff --git a/src/lib.ts b/src/lib.ts
index c9570bc..821a2e1 100644
--- a/src/lib.ts
+++ b/src/lib.ts
@@ -31,6 +31,7 @@ export function getImagePath(imgName: string): string {
export type SearchSugg = { // Represents a search-string suggestion
name: string,
canonicalName: string | null,
+ pop: number,
};
export type SearchSuggResponse = { // Holds search suggestions and an indication of if there was more
suggs: SearchSugg[],