diff options
| author | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2023-01-29 11:30:47 +1100 |
| commit | 8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch) | |
| tree | ffd824aa9b945d69b47f012617ee13d98764d078 | |
| parent | f5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff) | |
Adjust backend coding style
Add line spacing, section comments, and import consistency
44 files changed, 667 insertions, 223 deletions
diff --git a/backend/server.py b/backend/server.py index c953a9f..d7f6309 100755 --- a/backend/server.py +++ b/backend/server.py @@ -18,10 +18,8 @@ def wrappingApp(environ: dict[str, str], start_response) -> Iterable[bytes]: """ WSGI handler that uses 'application', but also serves image files """ urlPath = environ['PATH_INFO'] if urlPath.startswith('/data/'): - # Run WSGI script - return application(environ, start_response) - elif urlPath.startswith('/tol_data/img/'): - # Serve image file + return application(environ, start_response) # Run WSGI script + elif urlPath.startswith('/tol_data/img/'): # Serve image file imgPath = os.path.join(os.getcwd(), urlPath[1:]) if os.path.exists(imgPath): imgType = mimetypes.guess_type(imgPath)[0] @@ -33,6 +31,7 @@ def wrappingApp(environ: dict[str, str], start_response) -> Iterable[bytes]: else: start_response('404 Not Found', [('Content-type', 'text/plain')]) return [b'Unrecognised path'] + # Start server with simple_server.make_server('', 8000, wrappingApp) as httpd: print('Serving HTTP on port 8000...') diff --git a/backend/tests/common.py b/backend/tests/common.py index cb455e4..abfa471 100644 --- a/backend/tests/common.py +++ b/backend/tests/common.py @@ -3,7 +3,9 @@ Utilities for testing """ from typing import Any -import bz2, gzip, sqlite3 +import bz2 +import gzip +import sqlite3 def createTestFile(filename: str, content: str) -> None: """ Creates a file with the given name and contents """ diff --git a/backend/tests/dbpedia/test_gen_desc_data.py b/backend/tests/dbpedia/test_gen_desc_data.py index 7d35677..ae56c5e 100644 --- a/backend/tests/dbpedia/test_gen_desc_data.py +++ b/backend/tests/dbpedia/test_gen_desc_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestBz2, readTestDbTable from tol_data.dbpedia.gen_desc_data import genData @@ -57,9 +58,11 @@ class TestGenData(unittest.TestCase): '<http://dbpedia.org/resource/A_Hat> <http://www.w3.org/2000/01/rdf-schema#comment>' ' "Hats are not parrots, nor are they potatoes."@en .\n' )) + # Run dbFile = os.path.join(tempDir, 'descData.db') genData(labelsFile, idsFile, redirectsFile, disambigFile, typesFile, abstractsFile, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT iri, label from labels'), diff --git a/backend/tests/enwiki/test_download_img_license_info.py b/backend/tests/enwiki/test_download_img_license_info.py index ed6e426..bd91478 100644 --- a/backend/tests/enwiki/test_download_img_license_info.py +++ b/backend/tests/enwiki/test_download_img_license_info.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import Mock, patch -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from tol_data.enwiki.download_img_license_info import downloadInfo @@ -53,6 +54,7 @@ TEST_RESPONSE1 = { } } } + TEST_RESPONSE2 = { 'batchcomplete': '', 'query': { @@ -152,8 +154,10 @@ class TestDownloadInfo(unittest.TestCase): (1, 'Octopus2.jpg'), } ) + # Run downloadInfo(imgDb) + # Check self.assertEqual( readTestDbTable(imgDb, 'SELECT name, license, artist, credit, restrictions, url from imgs'), @@ -162,6 +166,7 @@ class TestDownloadInfo(unittest.TestCase): 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'), } ) + # Run with updated image-data db createTestDbTable( imgDb, @@ -172,6 +177,7 @@ class TestDownloadInfo(unittest.TestCase): } ) downloadInfo(imgDb) + # Check self.assertEqual( readTestDbTable(imgDb, 'SELECT name, license, artist, credit, restrictions, url from imgs'), diff --git a/backend/tests/enwiki/test_download_imgs.py b/backend/tests/enwiki/test_download_imgs.py index 2618b8a..aaf27bc 100644 --- a/backend/tests/enwiki/test_download_imgs.py +++ b/backend/tests/enwiki/test_download_imgs.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import Mock, patch -import tempfile, os +import tempfile +import os from tests.common import readTestFile, createTestDbTable from tol_data.enwiki.download_imgs import downloadImgs @@ -40,10 +41,12 @@ class TestDownloadInfo(unittest.TestCase): ('six','cc-by','','fred','','https://upload.wikimedia.org/6.png'), } ) + # Create temp output directory with tempfile.TemporaryDirectory() as outDir: # Run downloadImgs(imgDb, outDir, 0) + # Check expectedImgs = { '1.jpg': 'img:https://upload.wikimedia.org/1.jpg', diff --git a/backend/tests/enwiki/test_gen_desc_data.py b/backend/tests/enwiki/test_gen_desc_data.py index 801aa69..0d1536b 100644 --- a/backend/tests/enwiki/test_gen_desc_data.py +++ b/backend/tests/enwiki/test_gen_desc_data.py @@ -1,5 +1,6 @@ import unittest -import os, tempfile +import os +import tempfile from tests.common import readTestDbTable from tol_data.enwiki.gen_desc_data import genData @@ -12,6 +13,7 @@ class TestGenData(unittest.TestCase): # Run dbFile = os.path.join(tempDir, 'descData.db') genData(TEST_DUMP_FILE, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT id, title FROM pages'), diff --git a/backend/tests/enwiki/test_gen_dump_index_db.py b/backend/tests/enwiki/test_gen_dump_index_db.py index e0715f3..b918f15 100644 --- a/backend/tests/enwiki/test_gen_dump_index_db.py +++ b/backend/tests/enwiki/test_gen_dump_index_db.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestBz2, readTestDbTable from tol_data.enwiki.gen_dump_index_db import genData @@ -10,15 +11,18 @@ def runGenData(indexFileContents: str): # Create temp index file indexFile = os.path.join(tempDir, 'index.txt.bz2') createTestBz2(indexFile, indexFileContents) + # Run dbFile = os.path.join(tempDir, 'data.db') genData(indexFile, dbFile) + # Read db return readTestDbTable(dbFile, 'SELECT title, id, offset, next_offset FROM offsets') class TestGenData(unittest.TestCase): def setUp(self): self.maxDiff = None # Remove output-diff size limit + def test_index_file(self): indexFileContents = ( '100:10:apple\n' @@ -33,6 +37,7 @@ class TestGenData(unittest.TestCase): ('banana ice-cream', 99, 300, 1000), ('Custard!', 2030, 1000, -1), }) + def test_emp_index(self): offsetsMap = runGenData('') self.assertEqual(offsetsMap, set()) diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py index 1703b78..0a8f79d 100644 --- a/backend/tests/enwiki/test_gen_img_data.py +++ b/backend/tests/enwiki/test_gen_img_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from tol_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData @@ -20,8 +21,10 @@ class TestGetInputPageIdsFromDb(unittest.TestCase): ('and another', 2), } ) + # Run pageIds = getInputPageIdsFromDb(dbFile) + # Check self.assertEqual(pageIds, {1, 2}) @@ -40,9 +43,11 @@ class TestGenData(unittest.TestCase): ('Autism',25,0,-1), } ) + # Run imgDb = os.path.join(tempDir, 'imgData.db') genData({10, 25}, TEST_DUMP_FILE, indexDb, imgDb) + # Check self.assertEqual( readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'), @@ -51,8 +56,10 @@ class TestGenData(unittest.TestCase): (25, 'Autism-stacking-cans 2nd edit.jpg'), } ) + # Run with updated page-ids set genData({13, 10}, TEST_DUMP_FILE, indexDb, imgDb) + # Check self.assertEqual( readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'), diff --git a/backend/tests/enwiki/test_gen_pageview_data.py b/backend/tests/enwiki/test_gen_pageview_data.py index 5002eb0..0c4a35e 100644 --- a/backend/tests/enwiki/test_gen_pageview_data.py +++ b/backend/tests/enwiki/test_gen_pageview_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestBz2, createTestDbTable, readTestDbTable from tol_data.enwiki.gen_pageview_data import genData @@ -18,6 +19,7 @@ class TestGenData(unittest.TestCase): 'fr.wikipedia Four null desktop 12 T6U6\n' 'en.wikipedia Three null desktop 10 E4G5Z61\n' )) + # Create temp dump-index db dumpIndexDb = os.path.join(tempDir, 'dump_index.db') createTestDbTable( @@ -31,9 +33,11 @@ class TestGenData(unittest.TestCase): ('Four', 4, 0, -1), } ) + # Run dbFile = os.path.join(tempDir, 'data.db') genData(pageviewFiles, dumpIndexDb, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT title, id, views from views'), diff --git a/backend/tests/eol/test_download_imgs.py b/backend/tests/eol/test_download_imgs.py index 975d1c7..4872ca3 100644 --- a/backend/tests/eol/test_download_imgs.py +++ b/backend/tests/eol/test_download_imgs.py @@ -1,6 +1,7 @@ import unittest from unittest.mock import Mock, patch -import tempfile, os +import tempfile +import os from tests.common import readTestFile, createTestDbTable from tol_data.eol.download_imgs import getEolIdsFromDb, downloadImgs @@ -19,8 +20,10 @@ class TestGetEolIdsFromDb(unittest.TestCase): ('a second', 2), } ) + # Run eolIds = getEolIdsFromDb(dbFile) + # Check self.assertEqual(eolIds, {1, 2}) @@ -30,6 +33,7 @@ class TestDownloadImgs(unittest.TestCase): requestsGetMock.side_effect = lambda url: Mock(content=('img:' + url).encode()) with tempfile.TemporaryDirectory() as tempDir: eolIds = {1, 2, 4} + # Create temp images-list db imagesListDb = os.path.join(tempDir, 'images_list.db') createTestDbTable( @@ -48,10 +52,12 @@ class TestDownloadImgs(unittest.TestCase): (30, 3, '', 'https://content.eol.org/3.png', 'cc-by', 'owner3'), } ) + # Create temp output dir with tempfile.TemporaryDirectory() as outDir: # Run downloadImgs(eolIds, imagesListDb, outDir) + # Check expectedImgs1 = { '1 10.jpg': 'img:https://content.eol.org/1.jpg', diff --git a/backend/tests/eol/test_gen_images_list_db.py b/backend/tests/eol/test_gen_images_list_db.py index ca9b495..c1c81f3 100644 --- a/backend/tests/eol/test_gen_images_list_db.py +++ b/backend/tests/eol/test_gen_images_list_db.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestFile, readTestDbTable from tol_data.eol.gen_images_list_db import genData @@ -17,9 +18,11 @@ class TestGenData(unittest.TestCase): createTestFile(os.path.join(tempDir, 'imgs-2.csv'), ( '3,30,https://example.com/3/,https://content.eol.org/3.png,public,owner3\n' )) + # Run dbFile = os.path.join(tempDir, 'imagesList.db') genData(imageListsGlob, dbFile) + # Check self.assertEqual( readTestDbTable( diff --git a/backend/tests/eol/test_review_imgs.py b/backend/tests/eol/test_review_imgs.py index 49c09bb..21d4756 100644 --- a/backend/tests/eol/test_review_imgs.py +++ b/backend/tests/eol/test_review_imgs.py @@ -1,5 +1,7 @@ import unittest -import tempfile, os, shutil +import tempfile +import os +import shutil from tests.common import createTestDbTable from tol_data.eol.review_imgs import reviewImgs @@ -19,6 +21,7 @@ class TestReviewImgs(unittest.TestCase): shutil.copy(AVOID_IMG, os.path.join(imgDir, '2 22.jpg')) shutil.copy(AVOID_IMG, os.path.join(imgDir, '3 30.png')) shutil.copy(AVOID_IMG, os.path.join(imgDir, '3 31.jpg')) + # Create temp extra-info db extraInfoDb = os.path.join(tempDir, 'data.db') createTestDbTable( @@ -39,8 +42,10 @@ class TestReviewImgs(unittest.TestCase): ('two','II',1,'eol'), } ) + # Run outDir = os.path.join(tempDir, 'imgs') reviewImgs(imgDir, outDir, extraInfoDb) + # Check self.assertEqual(set(os.listdir(outDir)), {'1 10.jpg', '2 20.jpeg'}) diff --git a/backend/tests/test_gen_desc_data.py b/backend/tests/test_gen_desc_data.py index cc0582d..8d21978 100644 --- a/backend/tests/test_gen_desc_data.py +++ b/backend/tests/test_gen_desc_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from tol_data.gen_desc_data import genData @@ -37,6 +38,7 @@ class TestGenData(unittest.TestCase): ('<http://dbpedia.org/resource/Three>', 'Three from dbp'), } ) + # Create temp enwiki db enwikiDb = os.path.join(tempDir, 'enwiki_descs.db') createTestDbTable( @@ -70,6 +72,7 @@ class TestGenData(unittest.TestCase): (5, 'Five from enwiki'), } ) + # Create temp tree-of-life db dbFile = os.path.join(tempDir, 'data.db') createTestDbTable( @@ -86,8 +89,10 @@ class TestGenData(unittest.TestCase): ('seventh', 7), } ) + # Run genData(dbpediaDb, enwikiDb, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT wiki_id, desc, from_dbp from descs'), diff --git a/backend/tests/test_gen_imgs.py b/backend/tests/test_gen_imgs.py index 1ddd438..efab361 100644 --- a/backend/tests/test_gen_imgs.py +++ b/backend/tests/test_gen_imgs.py @@ -1,6 +1,8 @@ import unittest from unittest.mock import patch -import tempfile, os, shutil +import tempfile +import os +import shutil from tests.common import createTestFile, createTestDbTable, readTestDbTable from tol_data.gen_imgs import genImgs @@ -95,9 +97,11 @@ class TestGenImgs(unittest.TestCase): ('node6', 'ott6', 10), } ) + # Run outDir = os.path.join(tempDir, 'img') genImgs(imgListFile, eolImgDir, outDir, eolImgDb, enwikiImgDb, pickedImgDir, pickedImgsFile, dbFile) + # Check self.assertEqual(set(os.listdir(outDir)), { 'ott1.jpg', diff --git a/backend/tests/test_gen_linked_imgs.py b/backend/tests/test_gen_linked_imgs.py index b989407..be4b0d1 100644 --- a/backend/tests/test_gen_linked_imgs.py +++ b/backend/tests/test_gen_linked_imgs.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from tol_data.gen_linked_imgs import genData @@ -70,8 +71,10 @@ class TestGenData(unittest.TestCase): ('thirteen', 12, 'enwiki'), } ) + # Run genData(dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT name, otol_ids from linked_imgs'), diff --git a/backend/tests/test_gen_mapping_data.py b/backend/tests/test_gen_mapping_data.py index 9aa99b7..57c9ef7 100644 --- a/backend/tests/test_gen_mapping_data.py +++ b/backend/tests/test_gen_mapping_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestFile, createTestGzip, createTestDbTable, readTestDbTable from tol_data.gen_mapping_data import \ @@ -18,10 +19,12 @@ class TestReadTaxonomyFile(unittest.TestCase): SEP.join(['10', '20', 'ten', 'family', 'if:10,if:100', '', '', '\n']), SEP.join(['11', '100', 'eleven', '', 'igloo:1,ncbi:?', '', '', '\n']) ])) + # Run nodeToSrcIds = {} usedSrcIds = set() readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds) + # Check self.assertEqual(nodeToSrcIds, { 1: {'ncbi': 10}, @@ -34,6 +37,7 @@ class TestReadTaxonomyFile(unittest.TestCase): ('gbif', 1), ('if', 10) }) + class TestReadEolIdsFile(unittest.TestCase): def test_read(self): with tempfile.TemporaryDirectory() as tempDir: @@ -51,15 +55,18 @@ class TestReadEolIdsFile(unittest.TestCase): 10: {'ncbi': 10}, 20: {'ncbi': 23, 'gbif': 234} } + # Run usedSrcIds = {('ncbi', 10), ('gbif', 234), ('ncbi', 23)} nodeToEolId = {} readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId) + # Check self.assertEqual(nodeToEolId, { 10: 1, 20: 101, }) + class TestReadWikidataDb(unittest.TestCase): def test_read(self): with tempfile.TemporaryDirectory() as tempDir: @@ -105,10 +112,12 @@ class TestReadWikidataDb(unittest.TestCase): nodeToEolId = { 20: 100, } + # Run nodeToWikiTitle = {} titleToIucnStatus = {} readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId) + # Check self.assertEqual(nodeToWikiTitle, { 10: 'one', @@ -123,6 +132,7 @@ class TestReadWikidataDb(unittest.TestCase): 10: 1, 20: 100, }) + class TestReadPickedMappings(unittest.TestCase): def test_read(self): with tempfile.TemporaryDirectory() as tempDir: @@ -155,8 +165,10 @@ class TestReadPickedMappings(unittest.TestCase): 12: 'two', 35: 'goanna', } + # Run readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle) + # Check self.assertEqual(nodeToEolId, { 1: 1, @@ -170,6 +182,7 @@ class TestReadPickedMappings(unittest.TestCase): 15: 'ghi', 35: 'jkl', }) + class TestReadGetEnwikiPageIds(unittest.TestCase): def test_read(self): with tempfile.TemporaryDirectory() as tempDir: @@ -191,14 +204,17 @@ class TestReadGetEnwikiPageIds(unittest.TestCase): 20: 'two', 30: 'three', } + # Run titleToPageId = {} getEnwikiPageIds(dumpIndexDb, nodeToWikiTitle, titleToPageId) + # Check self.assertEqual(titleToPageId, { 'one': 1, 'two': 22, }) + class TestGenData(unittest.TestCase): def test_mapping(self): with tempfile.TemporaryDirectory() as tempDir: @@ -275,8 +291,10 @@ class TestGenData(unittest.TestCase): ('third', 'ott3', 2), ] ) + # Run genData(taxonomyFile, eolIdsFile, wikidataDb, pickedMappings, dumpIndexDb, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT name, id from eol_ids'), diff --git a/backend/tests/test_gen_name_data.py b/backend/tests/test_gen_name_data.py index 85e81d8..0dab23a 100644 --- a/backend/tests/test_gen_name_data.py +++ b/backend/tests/test_gen_name_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestFile, createTestDbTable, readTestDbTable from tol_data.gen_name_data import genData @@ -78,8 +79,10 @@ class TestGenData(unittest.TestCase): ('three', 2), ] ) + # Run genData(eolNamesFile, enwikiDb, pickedNamesFile, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT name, alt_name, pref_alt, src FROM names'), diff --git a/backend/tests/test_gen_otol_data.py b/backend/tests/test_gen_otol_data.py index 25e65e3..cc0404a 100644 --- a/backend/tests/test_gen_otol_data.py +++ b/backend/tests/test_gen_otol_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestFile, readTestDbTable from tol_data.gen_otol_data import genData @@ -16,9 +17,11 @@ def runGenData(treeFileContents: str, annFileContents: str, pickedFileContents: # Create temp picked names file pickedFile = os.path.join(tempDir, 'pn.txt') createTestFile(pickedFile, pickedFileContents) + # Run genData() dbFile = os.path.join(tempDir, 'data.db') genData(treeFile, annFile, pickedFile, dbFile) + # Read database nodes = readTestDbTable(dbFile, 'SELECT name, id, tips FROM nodes') edges = readTestDbTable(dbFile, 'SELECT parent, child, p_support FROM edges') @@ -27,6 +30,7 @@ def runGenData(treeFileContents: str, annFileContents: str, pickedFileContents: class TestGenData(unittest.TestCase): def setUp(self): self.maxDiff = None # Remove output-diff size limit + def test_newick(self): treeFileContents = """ ( @@ -40,7 +44,9 @@ class TestGenData(unittest.TestCase): )cellular_organisms_ott1;""" annFileContents = '{"nodes": {}}' pickedFileContents = '' + nodes, edges = runGenData(treeFileContents, annFileContents, pickedFileContents) + self.assertEqual(nodes, { ('land plants', 'ott2', 1), ('traveller\'s tree', 'ott100', 1), @@ -66,9 +72,11 @@ class TestGenData(unittest.TestCase): ('citrus', 'lemon', 0), ('citrus', 'orange', 0), }) + def test_newick_invalid(self): with self.assertRaises(Exception): runGenData('(A,B,(C,D));', '{"nodes": {}}', '') + def test_annotations(self): treeFileContents = '(two_ott2, three_ott3, four_ott4)one_ott1;' annFileContents = """ @@ -91,7 +99,9 @@ class TestGenData(unittest.TestCase): } } }""" + nodes, edges = runGenData(treeFileContents, annFileContents, '') + self.assertEqual(nodes, { ('one', 'ott1', 3), ('two', 'ott2', 1), @@ -103,10 +113,13 @@ class TestGenData(unittest.TestCase): ('one', 'three', 1), ('one', 'four', 0), }) + def test_picked_names_file(self): treeFileContents = '(one_ott2, two_ott3)one_ott1;' pickedFileContents = 'one|ott2' + nodes, edges = runGenData(treeFileContents, '{"nodes": {}}', pickedFileContents) + self.assertEqual(nodes, { ('one [2]', 'ott1', 2), ('one', 'ott2', 1), diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py index dd1cb22..b71ebc5 100644 --- a/backend/tests/test_gen_pop_data.py +++ b/backend/tests/test_gen_pop_data.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable, readTestDbTable from tol_data.gen_pop_data import genData @@ -30,8 +31,10 @@ class TestGenData(unittest.TestCase): ('node3', 3), } ) + # Run genData(pageviewsDb, dbFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT name, pop from node_pop'), diff --git a/backend/tests/test_gen_reduced_trees.py b/backend/tests/test_gen_reduced_trees.py index 2ae4dfd..99cbd92 100644 --- a/backend/tests/test_gen_reduced_trees.py +++ b/backend/tests/test_gen_reduced_trees.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestFile, createTestDbTable, readTestDbTable from tol_data.gen_reduced_trees import genData @@ -98,8 +99,10 @@ class TestGenData(unittest.TestCase): 'five\n' 'VIII\n' )) + # Run genData(None, dbFile, pickedNodesFile) + # Check self.assertEqual( readTestDbTable(dbFile, 'SELECT name, id, tips from nodes_p'), diff --git a/backend/tests/test_review_imgs_to_gen.py b/backend/tests/test_review_imgs_to_gen.py index d88523b..e98ab32 100644 --- a/backend/tests/test_review_imgs_to_gen.py +++ b/backend/tests/test_review_imgs_to_gen.py @@ -1,5 +1,7 @@ import unittest -import tempfile, os, shutil +import tempfile +import os +import shutil from tests.common import readTestFile, createTestDbTable from tol_data.review_imgs_to_gen import reviewImgs @@ -62,19 +64,24 @@ class TestReviewImgs(unittest.TestCase): ('four', 4), } ) + # Run outFile = os.path.join(tempDir, 'imgList.txt') reviewImgs(eolImgDir, enwikiImgDir, dbFile, outFile, 'all') + # Check self.assertEqual(set(readTestFile(outFile).splitlines()), { 'ott1 ' + os.path.join(eolImgDir, '1 10.jpg'), 'ott2', 'ott3 ' + os.path.join(enwikiImgDir, '3.png'), }) + # Add extra data createTestDbTable(dbFile, None, 'INSERT INTO nodes VALUES (?, ?, ?)',{('four', 'ott4', 2)}) + # Run reviewImgs(eolImgDir, enwikiImgDir, dbFile, outFile, 'all') + # Check self.assertEqual(set(readTestFile(outFile).splitlines()), { 'ott1 ' + os.path.join(eolImgDir, '1 10.jpg'), diff --git a/backend/tests/test_tilo.py b/backend/tests/test_tilo.py index cfc719a..718fb8b 100644 --- a/backend/tests/test_tilo.py +++ b/backend/tests/test_tilo.py @@ -1,5 +1,6 @@ import unittest -import tempfile, os +import tempfile +import os from tests.common import createTestDbTable from tilo import handleReq, TolNode, SearchSuggResponse, SearchSugg, InfoResponse, NodeInfo, DescInfo, ImgInfo @@ -122,8 +123,10 @@ class TestHandleReq(unittest.TestCase): self.tempDir = tempfile.TemporaryDirectory() self.dbFile = os.path.join(self.tempDir.name, 'data.db') initTestDb(self.dbFile) + def tearDown(self): self.tempDir.cleanup() + def test_node_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'name=two&type=node&tree=trimmed'}) self.assertEqual(response, { @@ -131,6 +134,7 @@ class TestHandleReq(unittest.TestCase): 'three': TolNode('ott3', [], 'two', 1, False, None, None, None), 'four': TolNode('ott4', [], 'two', 1, True, None, 'ott4.jpg', None), }) + def test_node_toroot_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'name=seven&type=node&toroot=1&excl=five&tree=trimmed'}) self.assertEqual(response, { @@ -138,6 +142,7 @@ class TestHandleReq(unittest.TestCase): 'six': TolNode('ott6', ['seven'], 'five', 1, 1, 'VI', 'ott6.jpg', 'endangered'), 'seven': TolNode('ott7', [], 'six', 1, 1, None, None, None), }) + def test_sugg_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'name=t&type=sugg&tree=trimmed'}) self.assertEqual(response, SearchSuggResponse( @@ -148,6 +153,7 @@ class TestHandleReq(unittest.TestCase): ], False )) + def test_info_req(self): response = handleReq(self.dbFile, {'QUERY_STRING': 'name=six&type=info&tree=trimmed'}) self.assertEqual(response, InfoResponse( diff --git a/backend/tilo.py b/backend/tilo.py index 21b5a7f..f33449b 100755 --- a/backend/tilo.py +++ b/backend/tilo.py @@ -18,16 +18,20 @@ Expected HTTP query parameters: """ from typing import Iterable, cast -import sys, re -import urllib.parse, sqlite3 -import gzip, jsonpickle +import sys +import re +import urllib.parse +import sqlite3 +import gzip +import jsonpickle DB_FILE = 'tol_data/data.db' DEFAULT_SUGG_LIM = 5 MAX_SUGG_LIM = 50 ROOT_NAME = 'cellular organisms' -# Classes for objects sent as responses (matches lib.ts types in client-side code) +# ========== Classes for values sent as responses ========== + class TolNode: """ Used when responding to 'node' and 'chain' requests """ def __init__( @@ -48,52 +52,61 @@ class TolNode: self.commonName = commonName self.imgName = imgName self.iucn = iucn - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, TolNode) and \ (self.otolId, set(self.children), self.parent, self.tips, \ self.pSupport, self.commonName, self.imgName, self.iucn) == \ (other.otolId, set(other.children), other.parent, other.tips, \ other.pSupport, other.commonName, other.imgName, other.iucn) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class SearchSugg: """ Represents a search suggestion """ def __init__(self, name: str, canonicalName: str | None = None, pop=0): self.name = name self.canonicalName = canonicalName self.pop = pop if pop is not None else 0 - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, SearchSugg) and \ (self.name, self.canonicalName, self.pop) == (other.name, other.canonicalName, other.pop) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) - def __hash__(self): + + def __hash__(self): # Used in unit testing return (self.name, self.canonicalName, self.pop).__hash__() + class SearchSuggResponse: """ Sent as responses to 'sugg' requests """ def __init__(self, searchSuggs: list[SearchSugg], hasMore: bool): self.suggs = searchSuggs self.hasMore = hasMore - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, SearchSuggResponse) and \ (set(self.suggs), self.hasMore) == (set(other.suggs), other.hasMore) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class DescInfo: """ Represents a node's associated description """ def __init__(self, text: str, wikiId: int, fromDbp: bool): self.text = text self.wikiId = wikiId self.fromDbp = fromDbp - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, DescInfo) and \ (self.text, self.wikiId, self.fromDbp) == (other.text, other.wikiId, other.fromDbp) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class ImgInfo: """ Represents a node's associated image """ def __init__(self, id: int, src: str, url: str, license: str, artist: str, credit: str): @@ -103,38 +116,44 @@ class ImgInfo: self.license = license self.artist = artist self.credit = credit - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, ImgInfo) and \ (self.id, self.src, self.url, self.license, self.artist, self.credit) == \ (other.id, other.src, other.url, other.license, other.artist, other.credit) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class NodeInfo: """ Represents info about a node """ def __init__(self, tolNode: TolNode, descInfo: DescInfo | None, imgInfo: ImgInfo | None): self.tolNode = tolNode self.descInfo = descInfo self.imgInfo = imgInfo - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, NodeInfo) and \ (self.tolNode, self.descInfo, self.imgInfo) == (other.tolNode, other.descInfo, other.imgInfo) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) + class InfoResponse: """ Sent as responses to 'info' requests """ def __init__(self, nodeInfo: NodeInfo, subNodesInfo: tuple[()] | tuple[NodeInfo | None, NodeInfo | None]): self.nodeInfo = nodeInfo self.subNodesInfo = subNodesInfo - # Used in unit testing - def __eq__(self, other): + + def __eq__(self, other): # Used in unit testing return isinstance(other, InfoResponse) and \ (self.nodeInfo, self.subNodesInfo) == (other.nodeInfo, other.subNodesInfo) - def __repr__(self): + + def __repr__(self): # Used in unit testing return str(self.__dict__) -# For data lookup +# ========== For data lookup ========== + def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, TolNode]: """ For a set of node names, returns a name-to-TolNode map that describes those nodes """ # Get node info @@ -146,6 +165,7 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, query = f'SELECT name, id, tips FROM {nodesTable} WHERE name IN ({queryParamStr})' for nodeName, otolId, tips in dbCur.execute(query, names): nameToNodes[nodeName] = TolNode(otolId, [], tips=tips) + # Get child info query = f'SELECT parent, child FROM {edgesTable} WHERE parent IN ({queryParamStr})' for nodeName, childName in dbCur.execute(query, names): @@ -158,11 +178,13 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, for n, tips in dbCur.execute(query, node.children): childToTips[n] = tips node.children.sort(key=lambda n: childToTips[n], reverse=True) + # Get parent info query = f'SELECT parent, child, p_support FROM {edgesTable} WHERE child IN ({queryParamStr})' for nodeName, childName, pSupport in dbCur.execute(query, names): nameToNodes[childName].parent = nodeName nameToNodes[childName].pSupport = pSupport == 1 + # Get image names idsToNames = {nameToNodes[n].otolId: n for n in nameToNodes.keys()} query = f'SELECT {nodesTable}.id from {nodesTable}' \ @@ -170,6 +192,7 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, f' WHERE {nodesTable}.id IN ' '({})'.format(','.join(['?'] * len(idsToNames))) for (otolId,) in dbCur.execute(query, list(idsToNames.keys())): nameToNodes[idsToNames[otolId]].imgName = otolId + '.jpg' + # Get 'linked' images for unresolved names unresolvedNames = [n for n in nameToNodes if nameToNodes[n].imgName is None] query = 'SELECT name, otol_ids from linked_imgs WHERE name IN ({})' @@ -183,21 +206,25 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, id1 + '.jpg' if id1 != '' else None, id2 + '.jpg' if id2 != '' else None, ) + # Get preferred-name info query = f'SELECT name, alt_name FROM names WHERE pref_alt = 1 AND name IN ({queryParamStr})' for name, altName in dbCur.execute(query, names): if name in nameToNodes: nameToNodes[name].commonName = altName + # Get IUCN status query = f'SELECT name, iucn FROM node_iucn WHERE name IN ({queryParamStr})' for name, iucn in dbCur.execute(query, names): if name in nameToNodes: nameToNodes[name].iucn = iucn - # + return nameToNodes + def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor) -> SearchSuggResponse: """ For a search string, returns a SearchSuggResponse describing search suggestions """ hasMore = False + # Get node names and alt-names, ordering by popularity nodesTable = f'nodes_{getTableSuffix(tree)}' nameQuery = f'SELECT {nodesTable}.name, node_pop.pop FROM {nodesTable}' \ @@ -210,6 +237,7 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor f' WHERE alt_name LIKE ? ORDER BY node_pop.pop DESC' suggs: dict[str, SearchSugg] = {} tempLimit = suggLimit + 1 # For determining if 'more suggestions exist' + # Prefix search for altName, nodeName, prefAlt, pop in dbCur.execute(altNameQuery, (searchStr + '%',)): if nodeName not in suggs or prefAlt == 1 and suggs[nodeName].canonicalName is not None: @@ -224,6 +252,7 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor if len(suggs) == tempLimit: break suggList = sorted(suggs.values(), key=lambda x: x.pop, reverse=True) + # If insufficient results, try substring-search if len(suggs) < tempLimit: newNames: set[str] = set() @@ -243,18 +272,21 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor if len(suggs) == tempLimit: break suggList.extend(sorted([suggs[n] for n in newNames], key=lambda x: x.pop, reverse=True)) - # + if len(suggList) > suggLimit: hasMore = True return SearchSuggResponse(suggList[:suggLimit], hasMore) + def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | None: """ For a node name, returns a descriptive InfoResponse, or None """ nodesTable = f'nodes_{getTableSuffix(tree)}' + # Get node info nameToNodes = lookupNodes([name], tree, dbCur) tolNode = nameToNodes[name] if name in nameToNodes else None if tolNode is None: return None + # Check for compound node match = re.fullmatch(r'\[(.+) \+ (.+)]', name) subNames = [match.group(1), match.group(2)] if match is not None else [] @@ -264,6 +296,7 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No subNames = [n if n in nameToSubNodes else None for n in subNames] nameToNodes.update(nameToSubNodes) namesToLookup = [name] if not subNames else [n for n in subNames if n is not None] + # Get desc info nameToDescInfo: dict[str, DescInfo] = {} query = 'SELECT name, desc, wiki_id, from_dbp FROM' \ @@ -271,6 +304,7 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No ' WHERE wiki_ids.name IN ({})'.format(','.join(['?'] * len(namesToLookup))) for nodeName, desc, wikiId, fromDbp in dbCur.execute(query, namesToLookup): nameToDescInfo[nodeName] = DescInfo(desc, wikiId, fromDbp == 1) + # Get image info nameToImgInfo: dict[str, ImgInfo] = {} idsToNames = {cast(str, nameToNodes[n].imgName)[:-4]: n @@ -282,6 +316,7 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No f' WHERE {nodesTable}.id IN ' '({})'.format(','.join(['?'] * len(idsToLookup))) for id, imgId, imgSrc, url, license, artist, credit in dbCur.execute(query, idsToLookup): nameToImgInfo[idsToNames[id]] = ImgInfo(imgId, imgSrc, url, license, artist, credit) + # Construct response nodeInfoObjs = [ NodeInfo( @@ -293,15 +328,19 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No return InfoResponse( nodeInfoObjs[0], cast(tuple[()] | tuple[NodeInfo | None, NodeInfo | None], nodeInfoObjs[1:])) + def getTableSuffix(tree: str) -> str: - """ converts a reduced-tree descriptor into a sql-table-suffix """ + """ Converts a reduced-tree descriptor into a sql-table-suffix """ return 't' if tree == 'trimmed' else 'i' if tree == 'images' else 'p' +# ========== Entry point ========== + def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] | SearchSuggResponse | InfoResponse: """ Queries the database, and constructs a response object """ # Open db dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() + # Get query params queryStr = environ['QUERY_STRING'] if 'QUERY_STRING' in environ else '' queryDict = urllib.parse.parse_qs(queryStr) @@ -313,6 +352,7 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] #(name,) = dbCur.execute(query).fetchone() reqType = queryDict['type'][0] if 'type' in queryDict else None tree = queryDict['tree'][0] if 'tree' in queryDict else 'images' + # Check for valid 'tree' if tree is not None and re.fullmatch(r'trimmed|images|picked', tree) is None: return None @@ -339,7 +379,7 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] parent = row[0] nodesToSkip.add(parent) nodeName = parent - # + results: dict[str, TolNode] = {} ranOnce = False while True: @@ -378,6 +418,7 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] except ValueError: invalidLimit = True print(f'INFO: Invalid limit {suggLimit}', file=sys.stderr) + # Get search suggestions if not invalidLimit: return lookupSuggs(name, suggLimit, tree, dbCur) @@ -385,12 +426,15 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] infoResponse = lookupInfo(name, tree, dbCur) if infoResponse is not None: return infoResponse + # On failure, provide empty response return None + def application(environ: dict[str, str], start_response) -> Iterable[bytes]: """ Entry point for the WSGI script """ # Get response object val = handleReq(DB_FILE, environ) + # Construct response data = jsonpickle.encode(val, unpicklable=False).encode() headers = [('Content-type', 'application/json')] @@ -400,4 +444,5 @@ def application(environ: dict[str, str], start_response) -> Iterable[bytes]: headers.append(('Content-encoding', 'gzip')) headers.append(('Content-Length', str(len(data)))) start_response('200 OK', headers) + return [data] diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py index 50418e0..f8a665a 100755 --- a/backend/tol_data/dbpedia/gen_desc_data.py +++ b/backend/tol_data/dbpedia/gen_desc_data.py @@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database # In testing, this script took a few hours to run, and generated about 10GB +import argparse import re -import bz2, sqlite3 +import bz2 +import sqlite3 LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries IDS_FILE = 'page_lang=en_ids.ttl.bz2' @@ -24,7 +26,7 @@ def genData( print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Reading/storing label data') dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)') dbCur.execute('CREATE INDEX labels_idx ON labels(label)') @@ -38,7 +40,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2))) - # + print('Reading/storing wiki page ids') dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)') dbCur.execute('CREATE INDEX ids_idx ON ids(id)') @@ -55,7 +57,7 @@ def genData( except sqlite3.IntegrityError as e: # Accounts for certain lines that have the same IRI print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}') - # + print('Reading/storing redirection data') dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)') redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') @@ -67,7 +69,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2))) - # + print('Reading/storing diambiguation-page data') dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)') disambigLineRegex = redirLineRegex @@ -79,7 +81,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),)) - # + print('Reading/storing instance-type data') dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)') dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)') @@ -92,7 +94,7 @@ def genData( if match is None: raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2))) - # + print('Reading/storing abstracts') dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)') descLineRegex = labelLineRegex @@ -107,14 +109,13 @@ def genData( raise Exception(f'ERROR: Line {lineNum} has unexpected format') dbCur.execute('INSERT INTO abstracts VALUES (?, ?)', (match.group(1), match.group(2).replace(r'\"', '"'))) - # + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py index 17e15b4..6efc7a4 100755 --- a/backend/tol_data/enwiki/download_img_license_info.py +++ b/backend/tol_data/enwiki/download_img_license_info.py @@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. """ +import argparse import re -import sqlite3, urllib.parse, html +import sqlite3 + import requests -import time, signal +import urllib.parse +import html + +import time +import signal IMG_DB = 'img_data.db' -# + API_URL = 'https://en.wikipedia.org/w/api.php' USER_AGENT = 'terryt.dev (terry06890@gmail.com)' BATCH_SZ = 50 # Max 50 @@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None: if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: dbCur.execute('CREATE TABLE imgs (' \ 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - # + print('Reading image names') imgNames: set[str] = set() for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): imgNames.add(imgName) print(f'Found {len(imgNames)}') - # + print('Checking for already-processed images') oldSz = len(imgNames) for (imgName,) in dbCur.execute('SELECT name FROM imgs'): imgNames.discard(imgName) print(f'Found {oldSz - len(imgNames)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Iterating through image names') imgNameList = list(imgNames) iterNum = 0 @@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None: if interrupted: print(f'Exiting loop at iteration {iterNum}') break + # Get batch imgBatch = imgNameList[i:i+BATCH_SZ] imgBatch = ['File:' + x for x in imgBatch] + # Make request headers = { 'user-agent': USER_AGENT, @@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None: print(f'ERROR: Exception while downloading info: {e}') print('\tImage batch: ' + '|'.join(imgBatch)) continue + # Parse response-object if 'query' not in responseObj or 'pages' not in responseObj['query']: print('WARNING: Response object doesn\'t have page data') @@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None: artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup if artist is not None: artist = TAG_REGEX.sub(' ', artist).strip() @@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None: credit = WHITESPACE_REGEX.sub(' ', credit) credit = html.unescape(credit) credit = urllib.parse.unquote(credit) + # Add to db dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', (title, license, artist, credit, restrictions, url)) - # + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadInfo(IMG_DB) diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py index c6a1c21..164289d 100755 --- a/backend/tol_data/enwiki/download_imgs.py +++ b/backend/tol_data/enwiki/download_imgs.py @@ -11,14 +11,20 @@ in the output directory do decide what to skip. # In testing, this downloaded about 100k images, over several days -import re, os +import argparse +import re +import os import sqlite3 -import urllib.parse, requests -import time, signal + +import requests +import urllib.parse + +import time +import signal IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' -# + LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 @@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: for filename in fileList: pageIdsDone.add(int(os.path.splitext(filename)[0])) print(f'Found {len(pageIdsDone)}') - # + # Set SIGINT handler interrupted = False oldHandler = None @@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) - # + print('Opening database') dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() @@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if interrupted: print('Exiting loop') break + # Check for problematic attributes if license is None or LICENSE_REGEX.fullmatch(license) is None: continue @@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: continue if restrictions is not None and restrictions != '': continue + # Download image iterNum += 1 print(f'Iteration {iterNum}: Downloading for page-id {pageId}') @@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: except Exception as e: print(f'Error while downloading to {outFile}: {e}') return + print('Closing database') dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py index b3fde52..44e4d6f 100755 --- a/backend/tol_data/enwiki/gen_desc_data.py +++ b/backend/tol_data/enwiki/gen_desc_data.py @@ -7,10 +7,16 @@ and adds them to a database # In testing, this script took over 10 hours to run, and generated about 5GB -import sys, os, re +import argparse +import sys +import os +import re import bz2 -import html, mwxml, mwparserfromhell import sqlite3 +import html + +import mwxml +import mwparserfromhell DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' @@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + def convertTemplateReplace(match): """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ if match.group(2) is None: return f'{match.group(1)} {match.group(4)}' else: return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') -LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +# ========== For data generation ========== def genData(dumpFile: str, dbFile: str) -> None: print('Creating database') @@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None: dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - # + print('Iterating through dump file') with bz2.open(dumpFile, mode='rt') as file: for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): if pageNum % 1e4 == 0: print(f'At page {pageNum}') - # Parse page + if page.namespace == 0: try: dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) @@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None: desc = parseDesc(revision.text) if desc is not None: dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - # + print('Closing database') dbCon.commit() dbCon.close() + def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ + Looks for a description in wikitext content. + + Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs, + and then accumulates lines until a blank one. + + Some cases not accounted for include: + disambiguation pages, abstracts with sentences split-across-lines, + nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + """ lines: list[str] = [] openBraceCount = 0 openBracketCount = 0 @@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None: if lines: return removeMarkup(' '.join(lines)) return None + def removeMarkup(content: str) -> str: content = EMBEDDED_HTML_REGEX.sub('', content) content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) @@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str: content = PARENS_GROUP_REGEX.sub('', content) content = LEFTOVER_BRACE_REGEX.sub('', content) return content + def convertTitle(title: str) -> str: return html.unescape(title).replace('_', ' ') +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py index 5778680..12a8a10 100755 --- a/backend/tol_data/enwiki/gen_dump_index_db.py +++ b/backend/tol_data/enwiki/gen_dump_index_db.py @@ -1,9 +1,13 @@ #!/usr/bin/python3 """ -Adds data from the wiki dump index-file into a database +Converts data from the wiki-dump index-file into a database """ -import sys, os, re + +import argparse +import sys +import os +import re import bz2 import sqlite3 @@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None: """ Reads the index file and creates the db """ if os.path.exists(dbFile): raise Exception(f'ERROR: Existing {dbFile}') + print('Creating database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') lastOffset = 0 @@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None: lineNum += 1 if lineNum % 1e5 == 0: print(f'At line {lineNum}') - # + match = lineRegex.fullmatch(line.rstrip()) assert match is not None offsetStr, pageId, title = match.group(1,2,3) @@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None: dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) except sqlite3.IntegrityError as e: print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py index 040f223..2c243f3 100755 --- a/backend/tol_data/enwiki/gen_img_data.py +++ b/backend/tol_data/enwiki/gen_img_data.py @@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. """ +import argparse import re -import os, bz2, html, urllib.parse +import os +import bz2 +import html +import urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# + ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) +# ========== For data generation ========== + def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: print('Opening databases') indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() + print('Checking tables') if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present - imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL + imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') + # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') else: # Check for already-processed page IDs @@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: else: print(f'Found already-processed page ID {pid} which was not in input set') print(f'Will skip {numSkipped} already-processed page IDs') - # + print('Getting dump-file offsets') offsetToPageids: dict[int, list[int]] = {} offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets @@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') - # + query = 'SELECT offset, next_offset FROM offsets WHERE id = ?' row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone() if row is None: @@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: offsetToPageids[chunkOffset] = [] offsetToPageids[chunkOffset].append(pageId) print(f'Found {len(offsetToEnd)} chunks to check') - # + print('Iterating through chunks in dump file') with open(dumpFile, mode='rb') as file: iterNum = 0 @@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: iterNum += 1 if iterNum % 100 == 0: print(f'At iteration {iterNum}') - # + chunkPageIds = offsetToPageids[pageOffset] # Jump to chunk file.seek(pageOffset) @@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: break if not foundText: print(f'WARNING: Did not find <text> for page id {pageId}') - # + print('Closing databases') indexDbCon.close() imgDbCon.commit() imgDbCon.close() + def getImageName(content: list[str]) -> str | None: """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ - # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections + # Note: Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections for line in content: match = IMG_LINE_REGEX.match(line) if match is not None: @@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None: return None return None +# ========== For getting input page IDs ========== + def getInputPageIdsFromDb(dbFile: str) -> set[int]: print('Getting input page-ids') pageIds: set[int] = set() @@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]: for (pageId,) in dbCur.execute('SELECT id from wiki_ids'): pageIds.add(pageId) dbCon.close() + print(f'Found {len(pageIds)}') return pageIds + +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + pageIds = getInputPageIdsFromDb(DB_FILE) genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB) diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py index 8aee1cc..95b4a60 100755 --- a/backend/tol_data/enwiki/gen_pageview_data.py +++ b/backend/tol_data/enwiki/gen_pageview_data.py @@ -3,27 +3,34 @@ """ Reads through wikimedia files containing pageview counts, computes average counts, and adds them to a database + +Each pageview file has lines that seem to hold these space-separated fields: + wiki code (eg: en.wikipedia), article title, page ID (may be: null), + platform (eg: mobile-web), monthly view count, + hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) """ # Took about 15min per file (each had about 180e6 lines) -import sys, os, glob, math, re +import argparse +import sys +import os +import glob +import math +import re from collections import defaultdict -import bz2, sqlite3 +import bz2 +import sqlite3 PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2') DUMP_INDEX_DB = 'dump_index.db' DB_FILE = 'pageview_data.db' def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: - # Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) - # + namespaceRegex = re.compile(r'[a-zA-Z]+:') titleToViews: dict[str, int] = defaultdict(int) linePrefix = b'en.wikipedia ' @@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: print(f'At line {lineNum}') if not line.startswith(linePrefix): continue + # Get second and second-last fields line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields title = line[:line.find(b' ')].decode('utf-8') viewCount = int(line[line.rfind(b' ')+1:]) if namespaceRegex.match(title) is not None: continue + # Update map title = title.replace('_', ' ') titleToViews[title] += viewCount print(f'Found {len(titleToViews)} titles') - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None: idbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE) diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py index f744818..c4d0932 100755 --- a/backend/tol_data/enwiki/lookup_page.py +++ b/backend/tol_data/enwiki/lookup_page.py @@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index db, and prints the corresponding <page>. """ +import argparse import sys import bz2 import sqlite3 @@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None: _, pageOffset, endOffset = row dbCon.close() print(f'Found chunk at offset {pageOffset}') - # + print('Reading from wiki dump') content: list[str] = [] with open(dumpFile, mode='rb') as file: @@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None: file.seek(pageOffset) compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for page lines = data.splitlines() lineIdx = 0 @@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None: if line.lstrip() == '</page>': break lineIdx += 1 - # + print('Content: ') print('\n'.join(content)) if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('title', help='The title to look up') args = parser.parse_args() - # + lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' ')) diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py index 8454a35..5757032 100755 --- a/backend/tol_data/eol/download_imgs.py +++ b/backend/tol_data/eol/download_imgs.py @@ -13,9 +13,16 @@ already-downloaded files, and continues after the one with highest EOL ID. """ -import sys, re, os, random +import argparse +import sys +import re +import os +import random import sqlite3 -import urllib.parse, requests + +import requests +import urllib.parse + import time from threading import Thread import signal @@ -23,7 +30,7 @@ import signal IMAGES_LIST_DB = 'images_list.db' OUT_DIR = 'imgs_for_review' DB_FILE = os.path.join('..', 'data.db') -# + MAX_IMGS_PER_ID = 3 MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) @@ -43,7 +50,7 @@ def downloadImgs(eolIds, imagesListDb, outDir): eolIdList = sorted(eolIds) nextIdx = 0 print(f'Result: {len(eolIdList)} EOL IDs') - # + print('Checking output directory') if not os.path.exists(outDir): os.mkdir(outDir) @@ -57,7 +64,7 @@ def downloadImgs(eolIds, imagesListDb, outDir): if nextIdx == len(eolIdList): print('No IDs left. Exiting...') return - # + print('Starting download threads') numThreads = 0 threadException: Exception | None = None # Used for ending main thread after a non-main thread exception @@ -81,6 +88,7 @@ def downloadImgs(eolIds, imagesListDb, outDir): print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) threadException = e numThreads -= 1 + # Manage downloading for idx in range(nextIdx, len(eolIdList)): eolId = eolIdList[idx] @@ -96,9 +104,11 @@ def downloadImgs(eolIds, imagesListDb, outDir): if len(extension) <= 1: print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) continue + # Check image-quantity limit if len(ownerSet) == MAX_IMGS_PER_ID: break + # Check for skip conditions if re.fullmatch(LICENSE_REGEX, license) is None: continue @@ -107,11 +117,13 @@ def downloadImgs(eolIds, imagesListDb, outDir): if copyrightOwner in ownerSet: continue ownerSet.add(copyrightOwner) + # Determine output filename outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}') if os.path.exists(outPath): print(f'WARNING: {outPath} already exists. Skipping download.') continue + # Check thread limit while numThreads == MAX_THREADS: time.sleep(1) @@ -122,6 +134,7 @@ def downloadImgs(eolIds, imagesListDb, outDir): time.sleep(1) exitLoop = True break + # Perform download print(f'Downloading image to {outPath}') numThreads += 1 @@ -129,6 +142,7 @@ def downloadImgs(eolIds, imagesListDb, outDir): thread.start() if exitLoop: break + # Close images-list db while numThreads > 0: time.sleep(1) @@ -143,10 +157,10 @@ def getEolIdsFromDb(dbFile) -> set[int]: eolIds.add(id) dbCon.close() return eolIds + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + eolIds = getEolIdsFromDb(DB_FILE) downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR) diff --git a/backend/tol_data/eol/gen_images_list_db.py b/backend/tol_data/eol/gen_images_list_db.py index ee57ac6..3e5bea1 100755 --- a/backend/tol_data/eol/gen_images_list_db.py +++ b/backend/tol_data/eol/gen_images_list_db.py @@ -4,8 +4,12 @@ Generates a sqlite db from a directory of CSV files holding EOL image data """ -import os, glob -import csv, re, sqlite3 +import argparse +import os +import glob +import csv +import re +import sqlite3 IMAGE_LISTS_GLOB = os.path.join('imagesList', '*.csv') DB_FILE = 'images_list.db' @@ -18,6 +22,7 @@ def genData(imageListsGlob: str, dbFile: str) -> None: ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \ ' copy_url TEXT, license TEXT, copyright_owner TEXT)') dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)') + print('Reading CSV files') for filename in glob.glob(imageListsGlob): print(f'Processing {filename}') @@ -27,13 +32,13 @@ def genData(imageListsGlob: str, dbFile: str) -> None: continue dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner)) + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(IMAGE_LISTS_GLOB, DB_FILE) diff --git a/backend/tol_data/eol/review_imgs.py b/backend/tol_data/eol/review_imgs.py index 9fb462c..145f338 100755 --- a/backend/tol_data/eol/review_imgs.py +++ b/backend/tol_data/eol/review_imgs.py @@ -7,8 +7,13 @@ choose an image to keep, or reject all. Also provides image rotation. Chosen images are placed in another directory, and rejected ones are deleted. """ -import sys, re, os, time +import argparse +import sys +import re +import os +import time import sqlite3 + import tkinter as tki from tkinter import ttk import PIL @@ -17,7 +22,7 @@ from PIL import ImageTk, Image, ImageOps IMG_DIR = 'imgs_for_review' OUT_DIR = 'imgs' EXTRA_INFO_DB = os.path.join('..', 'data.db') -# + IMG_DISPLAY_SZ = 400 MAX_IMGS_PER_ID = 3 IMG_BG_COLOR = (88, 28, 135) @@ -28,11 +33,13 @@ class EolImgReviewer: def __init__(self, root, imgDir, imgList, extraInfoDb, outDir): self.root = root root.title('EOL Image Reviewer') + # Setup main frame mainFrame = ttk.Frame(root, padding='5 5 5 5') mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S)) root.columnconfigure(0, weight=1) root.rowconfigure(0, weight=1) + # Set up images-to-be-reviewed frames self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter @@ -44,9 +51,11 @@ class EolImgReviewer: label = ttk.Label(frame, image=self.photoImgs[i]) label.grid(column=0, row=0) self.labels.append(label) + # Add padding for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) + # Add keyboard bindings root.bind('<q>', self.quit) root.bind('<Key-j>', lambda evt: self.accept(0)) @@ -59,6 +68,7 @@ class EolImgReviewer: root.bind('<Key-A>', lambda evt: self.rotate(0, True)) root.bind('<Key-S>', lambda evt: self.rotate(1, True)) root.bind('<Key-D>', lambda evt: self.rotate(2, True)) + # Initialise fields self.imgDir = imgDir self.imgList = imgList @@ -67,13 +77,15 @@ class EolImgReviewer: self.nextEolId = 0 self.nextImgNames: list[str] = [] self.rotations: list[int] = [] + # For displaying extra info self.extraInfoDbCon = sqlite3.connect(extraInfoDb) self.extraInfoDbCur = self.extraInfoDbCon.cursor() self.numReviewed = 0 self.startTime = time.time() - # + self.getNextImgs() + def getNextImgs(self): """ Updates display with new images to review, or ends program """ # Gather names of next images to review @@ -95,6 +107,7 @@ class EolImgReviewer: self.nextImgNames.append(imgName) self.rotations.append(0) self.imgListIdx += 1 + # Update displayed images idx = 0 while idx < MAX_IMGS_PER_ID: @@ -113,16 +126,19 @@ class EolImgReviewer: self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx]) self.labels[idx].config(image=self.photoImgs[idx]) idx += 1 + # Restart if all image files non-recognisable if not self.nextImgNames: self.getNextImgs() return + # Update title firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 lastImgIdx = self.imgListIdx title = self.getExtraInfo(self.nextEolId) title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})' self.root.title(title) + def accept(self, imgIdx): """ React to a user selecting an image """ if imgIdx >= len(self.nextImgNames): @@ -142,12 +158,14 @@ class EolImgReviewer: os.remove(inFile) self.numReviewed += 1 self.getNextImgs() + def reject(self): """ React to a user rejecting all images of a set """ for i in range(len(self.nextImgNames)): os.remove(os.path.join(self.imgDir, self.nextImgNames[i])) self.numReviewed += 1 self.getNextImgs() + def rotate(self, imgIdx, anticlockwise = False): """ Respond to a user rotating an image """ deg = -90 if not anticlockwise else 90 @@ -155,6 +173,7 @@ class EolImgReviewer: self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 + def quit(self, e = None): print(f'Number reviewed: {self.numReviewed}') timeElapsed = time.time() - self.startTime @@ -163,7 +182,7 @@ class EolImgReviewer: print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') self.extraInfoDbCon.close() self.root.destroy() - # + def resizeImgForDisplay(self, img): """ Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """ if max(img.width, img.height) > IMG_DISPLAY_SZ: @@ -178,6 +197,7 @@ class EolImgReviewer: int((IMG_DISPLAY_SZ - img.width) / 2), int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg + def getExtraInfo(self, eolId: int) -> str: """ Used to display extra EOL ID info """ query = 'SELECT names.alt_name FROM' \ @@ -193,12 +213,14 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str): print('Checking output directory') if not os.path.exists(outDir): os.mkdir(outDir) + print('Getting input image list') imgList = os.listdir(imgDir) imgList.sort(key=lambda s: int(s.split(' ')[0])) if not imgList: print('No input images found') sys.exit(0) + # Create GUI and defer control print('Starting GUI') root = tki.Tk() @@ -206,8 +228,7 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str): root.mainloop() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB) diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py index fa08a8c..69efe79 100755 --- a/backend/tol_data/gen_desc_data.py +++ b/backend/tol_data/gen_desc_data.py @@ -5,7 +5,9 @@ Maps nodes to short descriptions, using data from DBpedia and Wikipedia, and stores results in the database. """ -import os, sqlite3 +import argparse +import os +import sqlite3 DBPEDIA_DB = os.path.join('dbpedia', 'desc_data.db') ENWIKI_DB = os.path.join('enwiki', 'desc_data.db') @@ -16,12 +18,12 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)') - # + print('Getting node mappings') nodeToWikiId: dict[str, int] = {} for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'): nodeToWikiId[name] = wikiId - # + print('Reading data from DBpedia') dbpCon = sqlite3.connect(dbpediaDb) dbpCur = dbpCon.cursor() @@ -32,20 +34,22 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: iterNum += 1 if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') - # + row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone() if row is not None: nodeToIri[name] = row[0] + print('Resolving redirects') iterNum = 0 for name, iri in nodeToIri.items(): iterNum += 1 if iterNum % 1e5 == 0: print(f'At iteration {iterNum}') - # + row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone() if row is not None: nodeToIri[name] = row[0] + print('Adding descriptions') iterNum = 0 for name, iri in nodeToIri.items(): @@ -57,11 +61,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: if row is not None: dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1)) del nodeToWikiId[name] + dbpCon.close() - # + print('Reading data from Wikipedia') enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() + print('Adding descriptions') iterNum = 0 for name, wikiId in nodeToWikiId.items(): @@ -79,14 +85,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None: row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone() if row is not None: dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0)) - # + print('Closing databases') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(DBPEDIA_DB, ENWIKI_DB, DB_FILE) diff --git a/backend/tol_data/gen_imgs.py b/backend/tol_data/gen_imgs.py index 0ba75ec..2479742 100755 --- a/backend/tol_data/gen_imgs.py +++ b/backend/tol_data/gen_imgs.py @@ -11,8 +11,11 @@ processing. It uses already-existing database entries to decide what to skip. """ -import os, subprocess -import sqlite3, urllib.parse +import argparse +import os +import subprocess +import sqlite3 +import urllib.parse import signal IMG_LIST_FILE = 'img_list.txt' @@ -23,10 +26,11 @@ ENWIKI_IMG_DB = os.path.join('enwiki', 'img_data.db') PICKED_IMGS_DIR = 'picked_imgs' PICKED_IMGS_FILE = 'img_data.txt' DB_FILE = 'data.db' -# + IMG_OUT_SZ = 200 ImgId = tuple[int, str] # Holds an int ID and a source string (eg: 'eol') + class PickedImg: """ Represents a picked-image from pickedImgsDir """ def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str): @@ -44,9 +48,9 @@ def genImgs( """ Reads the image-list file, generates images, and updates db """ if not os.path.exists(outDir): os.mkdir(outDir) - # dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() + print('Checking for image tables') nodesDone: set[str] = set() imgsDone: set[ImgId] = set() @@ -63,15 +67,16 @@ def genImgs( for imgId, imgSrc in dbCur.execute('SELECT id, src from images'): imgsDone.add((imgId, imgSrc)) print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip') - # + print('Processing picked-images') success = processPickedImgs(pickedImgsDir, pickedImgsFile, nodesDone, imgsDone, outDir, dbCur) if success: print('Processing images from eol and enwiki') processImgs(imgListFile, eolImgDir, eolImgDb, enwikiImgDb, nodesDone, imgsDone, outDir, dbCur) - # Close db + dbCon.commit() dbCon.close() + def processPickedImgs( pickedImgsDir: str, pickedImgsFile: str, nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool: @@ -85,25 +90,30 @@ def processPickedImgs( nodeName = os.path.splitext(filename)[0] # Remove extension (otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone() nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit) + # Set SIGINT handler interrupted = False def onSigint(sig, frame): nonlocal interrupted interrupted = True signal.signal(signal.SIGINT, onSigint) + # Convert images for otolId, imgData in nodeToPickedImg.items(): # Check for SIGINT event if interrupted: print('Exiting') return False + # Skip if already processed if otolId in nodesDone: continue + # Convert image success = convertImage(os.path.join(pickedImgsDir, imgData.filename), os.path.join(outDir, otolId + '.jpg')) if not success: return False + # Add entry to db if (imgData.id, 'picked') not in imgsDone: dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', @@ -112,6 +122,7 @@ def processPickedImgs( dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked')) nodesDone.add(otolId) return True + def processImgs( imgListFile: str, eolImgDir: str, eolImgDb: str, enwikiImgDb: str, nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool: @@ -120,12 +131,14 @@ def processImgs( eolCur = eolCon.cursor() enwikiCon = sqlite3.connect(enwikiImgDb) enwikiCur = enwikiCon.cursor() + # Set SIGINT handler interrupted = False def onSigint(sig, frame): nonlocal interrupted interrupted = True signal.signal(signal.SIGINT, onSigint) + # Convert images flag = False # Set to True upon interruption or failure with open(imgListFile) as file: @@ -135,19 +148,24 @@ def processImgs( print('Exiting') flag = True break + # Skip lines without an image path if line.find(' ') == -1: continue + # Get filenames otolId, _, imgPath = line.rstrip().partition(' ') + # Skip if already processed if otolId in nodesDone: continue + # Convert image success = convertImage(imgPath, os.path.join(outDir, otolId + '.jpg')) if not success: flag = True break + # Add entry to db (nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone() fromEol = imgPath.startswith(eolImgDir) @@ -185,14 +203,17 @@ def processImgs( (enwikiId, 'enwiki', url, license, artist, credit)) imgsDone.add((enwikiId, 'enwiki')) dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki')) + eolCon.close() enwikiCon.close() return not flag + def convertImage(imgPath: str, outPath: str): print(f'Converting {imgPath} to {outPath}') if os.path.exists(outPath): print('ERROR: Output image already exists') return False + try: completedProcess = subprocess.run( ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath], @@ -207,8 +228,7 @@ def convertImage(imgPath: str, outPath: str): return True if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genImgs(IMG_LIST_FILE, EOL_IMG_DIR, OUT_DIR, EOL_IMG_DB, ENWIKI_IMG_DB, PICKED_IMGS_DIR, PICKED_IMGS_FILE, DB_FILE) diff --git a/backend/tol_data/gen_linked_imgs.py b/backend/tol_data/gen_linked_imgs.py index 7002e92..c9d7aac 100755 --- a/backend/tol_data/gen_linked_imgs.py +++ b/backend/tol_data/gen_linked_imgs.py @@ -5,11 +5,12 @@ Look for nodes without images in the database, and tries to associate them with images from their children """ +import argparse import re import sqlite3 DB_FILE = 'data.db' -# + COMPOUND_NAME_REGEX = re.compile(r'\[(.+) \+ (.+)]') UP_PROPAGATE_COMPOUND_IMGS = False @@ -18,14 +19,14 @@ def genData(dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute('CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)') - # + print('Getting nodes with images') nodeToUsedId: dict[str, str] = {} # Maps name of node to otol ID of node to use image for query = 'SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name' for name, otolId in dbCur.execute(query): nodeToUsedId[name] = otolId print(f'Found {len(nodeToUsedId)}') - # + print('Getting node depths') nodeToDepth: dict[str, int] = {} maxDepth = 0 @@ -33,6 +34,7 @@ def genData(dbFile: str) -> None: for nodeName in nodeToUsedId.keys(): nodeChain = [nodeName] lastDepth = 0 + # Add ancestors while True: row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone() @@ -45,11 +47,12 @@ def genData(dbFile: str) -> None: if nodeName in nodeToDepth: lastDepth = nodeToDepth[nodeName] break + # Add depths for i in range(len(nodeChain)): nodeToDepth[nodeChain[-i-1]] = i + lastDepth maxDepth = max(maxDepth, lastDepth + len(nodeChain) - 1) - # + print('Finding ancestors to give linked images') depthToNodes: dict[int, list[str]] = {depth: [] for depth in range(maxDepth + 1)} for nodeName, depth in nodeToDepth.items(): @@ -70,12 +73,12 @@ def genData(dbFile: str) -> None: (tips,) = dbCur.execute('SELECT tips FROM nodes WHERE name == ?', (node,)).fetchone() if parent not in parentToCandidate or parentToCandidate[parent][1] < tips: parentToCandidate[parent] = (node, tips) - # + print('Replacing linked-images for compound nodes') for iterNum, node in enumerate(parentToCandidate.keys(), 1): if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') - # + match = COMPOUND_NAME_REGEX.fullmatch(node) if match is not None: # Replace associated image with subname images @@ -85,12 +88,15 @@ def genData(dbFile: str) -> None: otolIdPair[0] = nodeToUsedId[subName1] if subName2 in nodeToUsedId: otolIdPair[1] = nodeToUsedId[subName2] + # Use no image if both subimages not found if otolIdPair[0] == '' and otolIdPair[1] == '': dbCur.execute('DELETE FROM linked_imgs WHERE name = ?', (node,)) continue + # Add to db dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', (','.join(otolIdPair), node)) + # Possibly repeat operation upon parent/ancestors if UP_PROPAGATE_COMPOUND_IMGS: while True: @@ -104,14 +110,13 @@ def genData(dbFile: str) -> None: node = parent continue break - # + print('Closing database') dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(DB_FILE) diff --git a/backend/tol_data/gen_mapping_data.py b/backend/tol_data/gen_mapping_data.py index 4373d1d..1ab577b 100755 --- a/backend/tol_data/gen_mapping_data.py +++ b/backend/tol_data/gen_mapping_data.py @@ -12,9 +12,12 @@ Based on code from https://github.com/OneZoom/OZtree, located in OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022). """ +import argparse import os from collections import defaultdict -import gzip, csv, sqlite3 +import gzip +import csv +import sqlite3 TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv') EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz') @@ -43,27 +46,31 @@ def genData( nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID + # Get mappings from data input readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds) readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId) readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId) readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle) getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId) - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() + # Get otol id-to-name map otolIdToName: dict[int, str] = {} for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'): if nodeId.startswith('ott'): otolIdToName[int(nodeId[3:])] = nodeName + # Add eol mappings dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)') dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)') for otolId, eolId in nodeToEolId.items(): if otolId in otolIdToName: dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId)) + # Add enwiki mappings dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)') dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)') @@ -73,8 +80,10 @@ def genData( dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title])) if title in titleToIucnStatus: dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title])) + dbCon.commit() dbCon.close() + def readTaxonomyFile( taxonomyFile: str, nodeToSrcIds: dict[int, dict[str, int]], @@ -88,9 +97,11 @@ def readTaxonomyFile( for lineNum, line in enumerate(file, 1): if lineNum % 1e5 == 0: print(f'At line {lineNum}') + # Skip header line if lineNum == 1: continue + # Parse line fields = line.split('\t|\t') try: @@ -99,6 +110,7 @@ def readTaxonomyFile( print(f'Skipping non-integral ID {fields[0]} on line {lineNum}') continue srcsField = fields[4] + # Add source IDs for srcPair in srcsField.split(','): src, srcIdStr = srcPair.split(':', 1) @@ -111,6 +123,7 @@ def readTaxonomyFile( nodeToSrcIds[otolId][src] = srcId usedSrcIds.add((src, srcId)) print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6 + def readEolIdsFile( eolIdsFile: str, nodeToSrcIds: dict[int, dict[str, int]], @@ -126,9 +139,11 @@ def readEolIdsFile( for lineNum, row in enumerate(csv.reader(file), 1): if lineNum % 1e6 == 0: print(f'At line {lineNum}') + # Skip header line if lineNum == 1: continue + # Parse line eolId = int(row[3]) srcInt = int(row[2]) @@ -144,7 +159,7 @@ def readEolIdsFile( srcToEolId[src][srcId] = eolId print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries') # Was about 3.5e6 (4.2e6 without usedSrcIds) - # + print('Resolving candidate EOL IDs') # For each otol ID, find eol IDs with matching sources, and choose the 'best' one for otolId, srcInfo in nodeToSrcIds.items(): @@ -161,6 +176,7 @@ def readEolIdsFile( eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount] nodeToEolId[otolId] = min(eolIds) print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6 + def readWikidataDb( wikidataDb: str, nodeToSrcIds: dict[int, dict[str, int]], @@ -185,7 +201,7 @@ def readWikidataDb( # Was about 1.1e6 (1.2e6 without usedSrcIds) print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds) dbCon.close() - # + print('Resolving candidate Wikidata items') # For each otol ID, find wikidata titles with matching sources, and choose the 'best' one for otolId, srcInfo in nodeToSrcIds.items(): @@ -211,7 +227,7 @@ def readWikidataDb( nodeToWikiTitle[otolId] = srcToTitle[src] break print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5 - # + print('Adding extra EOL mappings from Wikidata') wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()} addedEntries: dict[int, int] = {} @@ -222,6 +238,7 @@ def readWikidataDb( nodeToEolId[otolId] = eolId addedEntries[otolId] = eolId print(f'- Added {len(addedEntries):,} entries') # Was about 3e3 + def readPickedMappings( pickedMappings: dict[str, list[str]], nodeToEolId: dict[int, int], @@ -248,6 +265,7 @@ def readPickedMappings( else: if otolId in nodeToWikiTitle: del nodeToWikiTitle[otolId] + def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None: """ Read a db for mappings from enwiki titles to page IDs """ print('Getting enwiki page IDs') @@ -264,8 +282,7 @@ def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], ti print(f'Unable to find IDs for {numNotFound} titles') # Was 2913 if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE) diff --git a/backend/tol_data/gen_name_data.py b/backend/tol_data/gen_name_data.py index 2e92c20..5b6e963 100755 --- a/backend/tol_data/gen_name_data.py +++ b/backend/tol_data/gen_name_data.py @@ -5,8 +5,12 @@ Maps nodes to vernacular names, using data from EOL, enwiki, and a picked-names file, and stores results in the database. """ -import re, os -import html, csv, sqlite3 +import argparse +import re +import os +import html +import csv +import sqlite3 EOL_NAMES_FILE = os.path.join('eol', 'vernacularNames.csv') ENWIKI_DB = os.path.join('enwiki', 'desc_data.db') @@ -17,25 +21,26 @@ def genData(eolNamesFile: str, enwikiDb: str, pickedNamesFile: str, dbFile: str) """ Reads the files and adds to db """ dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Creating table') dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))') dbCur.execute('CREATE INDEX names_idx ON names(name)') dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)') dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)') - # + print('Getting node mappings') nodeToTips: dict[str, int] = {} for name, tips in dbCur.execute('SELECT name, tips from nodes'): nodeToTips[name] = tips - # + addEolNames(eolNamesFile, nodeToTips, dbCur) addEnwikiNames(enwikiDb, nodeToTips, dbCur) addPickedNames(pickedNamesFile, nodeToTips, dbCur) - # + print('Closing database') dbCon.commit() dbCon.close() + def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None: """ Reads EOL names, associates them with otol nodes, and writes to db """ # The CSV file has a header line, then lines with these fields: @@ -47,26 +52,31 @@ def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cu for name, eolId in dbCur.execute('SELECT name, id from eol_ids'): if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]: eolIdToNode[eolId] = name + print('Adding names from EOL') namesToSkip = {'unknown', 'unknown species', 'unidentified species'} with open(eolNamesFile, newline='') as file: for lineNum, fields in enumerate(csv.reader(file), 1): if lineNum % 1e5 == 0: print(f'At line {lineNum}') # Reached about 2.8e6 + # Skip header line if lineNum == 1: continue + # Parse line eolId = int(fields[0]) name = html.unescape(fields[2]).lower() lang = fields[3] isPreferred = 1 if fields[6] == 'preferred' else 0 + # Add to db if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \ and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')' # The 'OR IGNORE' accounts for duplicate lines dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred)) + def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None: """ Reads enwiki names, associates them with otol nodes, and writes to db """ print('Getting enwiki mappings') @@ -74,6 +84,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'): if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]: wikiIdToNode[wikiId] = name + print('Adding names from enwiki') altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)', enwikiCon = sqlite3.connect(enwikiDb) @@ -83,7 +94,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') # Reached about 3.6e5 - # + query = 'SELECT p1.title FROM pages p1' \ ' INNER JOIN redirects r1 ON p1.id = r1.id' \ ' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?' @@ -91,6 +102,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur name = name.lower() if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips: dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0)) + def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None: # File format: # nodename1|altName1|isPreferred1 -> Add an alt-name @@ -121,8 +133,7 @@ def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqli dbCur.execute(cmd, (nodeName,)) if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(EOL_NAMES_FILE, ENWIKI_DB, PICKED_NAMES_FILE, DB_FILE) diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py index eba8779..a67ea4b 100755 --- a/backend/tol_data/gen_otol_data.py +++ b/backend/tol_data/gen_otol_data.py @@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai These help resolve cases where multiple nodes share the same name. """ -import re, os -import json, sqlite3 +import argparse +import re +import os +import json +import sqlite3 TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes ANN_FILE = os.path.join('otol', 'annotations.json') DB_FILE = 'data.db' PICKED_NAMES_FILE = 'picked_otol_names.txt' +# ========== Classes ========== + class Node: """ Represents a tree-of-life node """ def __init__(self, name, childIds, parentId, tips, pSupport): @@ -37,13 +42,16 @@ class Node: self.parentId = parentId self.tips = tips self.pSupport = pSupport + class BasicStream: """ Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """ def __init__(self, data, idx=0): self.data = data self.idx = idx + def hasNext(self) -> bool: return self.idx < len(self.data) + def next(self) -> str: if self.hasNext(): char = self.data[self.idx] @@ -51,30 +59,37 @@ class BasicStream: return char; else: return ''; + def peek(self) -> str: if self.hasNext(): return self.data[self.idx] else: return ''; + def skipWhitespace(self) -> None: while self.hasNext() and self.data[self.idx].isspace(): self.idx += 1 + def progress(self) -> float: return (self.idx / len(self.data)) +# ========== For data generation ========== + def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None: """ Reads the files and stores the tree info """ nodeMap: dict[str, Node] = {} # Maps node IDs to node objects nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs - # + print('Parsing tree file') treeStream: BasicStream with open(treeFile) as file: treeStream = BasicStream(file.read()) + # Parse content parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds) print('Resolving duplicate names') + # Read picked-names file nameToPickedId: dict[str, str] = {} if os.path.exists(pickedNamesFile): @@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N for line in file: name, _, otolId = line.strip().partition('|') nameToPickedId[name] = otolId + # Resolve duplicates for dupName, ids in dupNameToIds.items(): # Check for picked id @@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N if id != idToUse: nodeMap[id].name += f' [{counter}]' counter += 1 + print('Changing mrca* names') for id, node in nodeMap.items(): if node.name.startswith('mrca'): convertMrcaName(id, nodeMap) + print('Parsing annotations file') # Read file with open(annFile) as file: @@ -116,6 +134,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 node.pSupport = supportQty > 0 and conflictQty == 0 + print('Creating nodes and edges tables') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N childNode = nodeMap[childId] dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', (node.name, childNode.name, 1 if childNode.pSupport else 0)) + print('Closing database') dbCon.commit() dbCon.close() + def parseNewick( stream: BasicStream, nodeMap: dict[str, Node], @@ -140,6 +161,7 @@ def parseNewick( """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ if stream.idx % 1e5 == 0: print(f'Progress: {stream.progress() * 100:.2f}%') + # Find node stream.skipWhitespace() if stream.peek() == '': @@ -151,6 +173,7 @@ def parseNewick( # Read child childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds) childIds.append(childId) + # Check for next child or end of node stream.skipWhitespace() if stream.peek() == '': @@ -164,12 +187,15 @@ def parseNewick( stream.skipWhitespace() name, id = parseNewickName(stream) updateNameMaps(name, id, nameToFirstId, dupNameToIds) + # Get child num-tips total tips = 0 for childId in childIds: tips += nodeMap[childId].tips + # Add node to nodeMap nodeMap[id] = Node(name, childIds, None, tips, False) + # Update childrens' parent reference for childId in childIds: nodeMap[childId].parentId = id @@ -179,6 +205,7 @@ def parseNewick( updateNameMaps(name, id, nameToFirstId, dupNameToIds) nodeMap[id] = Node(name, [], None, 1, False) return id + def parseNewickName(stream: BasicStream) -> tuple[str, str]: """ Parses a node name from 'stream', and returns a (name, id) pair """ name: str @@ -202,6 +229,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]: nameChars.append(stream.next()) if stream.peek() == ';': # Ignore trailing input semicolon stream.next() + # Convert to (name, id) name = ''.join(nameChars).rstrip().lower() if name.startswith('mrca'): @@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]: if match is None: raise Exception(f'ERROR: invalid name \'{name}\'') return (match.group(1).replace('_', ' '), match.group(2)) + def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None: """ Update maps upon a newly parsed name """ if name not in nameToFirstId: @@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI dupNameToIds[name] = [nameToFirstId[name], id] else: dupNameToIds[name].append(id) + def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: """ Update a node in a tree to be named after 2 descendants. Returns the name of one such descendant, for use during recursion. """ @@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: childIds = node.childIds if len(childIds) < 2: raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children') + # Get 2 children with most tips childTips = [nodeMap[id].tips for id in childIds] maxIdx1 = childTips.index(max(childTips)) @@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: childId2 = childIds[maxIdx2] childName1 = nodeMap[childId1].name childName2 = nodeMap[childId2].name + # Check for mrca* child names if childName1.startswith('mrca'): childName1 = convertMrcaName(childId1, nodeMap) if childName2.startswith('mrca'): childName2 = convertMrcaName(childId2, nodeMap) + # Check for composite names match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) if match is not None: @@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str: match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) if match is not None: childName2 = match.group(1) + # Create composite name node.name = f'[{childName1} + {childName2}]' return childName1 +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE) diff --git a/backend/tol_data/gen_pop_data.py b/backend/tol_data/gen_pop_data.py index e6a646e..4280a12 100755 --- a/backend/tol_data/gen_pop_data.py +++ b/backend/tol_data/gen_pop_data.py @@ -5,7 +5,9 @@ Reads enwiki page view info from a database, and stores it as node popularity values in the database. """ -import os, sqlite3 +import argparse +import os +import sqlite3 PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db') DB_FILE = 'data.db' @@ -13,7 +15,7 @@ DB_FILE = 'data.db' def genData(pageviewsDb: str, dbFile: str) -> None: dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Getting view counts') pdbCon = sqlite3.connect(pageviewsDb) pdbCur = pdbCon.cursor() @@ -23,23 +25,22 @@ def genData(pageviewsDb: str, dbFile: str) -> None: iterNum += 1 if iterNum % 1e4 == 0: print(f'At iteration {iterNum}') # Reached 1.6e6 - # + row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone() if row is not None: nodeToViews[row[0]] = views pdbCon.close() - # + print(f'Writing {len(nodeToViews)} entries to db') dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)') for nodeName, views in nodeToViews.items(): dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views)) - # + dbCon.commit() dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + genData(PAGEVIEWS_DB, DB_FILE) diff --git a/backend/tol_data/gen_reduced_trees.py b/backend/tol_data/gen_reduced_trees.py index 3742544..ce628f7 100755 --- a/backend/tol_data/gen_reduced_trees.py +++ b/backend/tol_data/gen_reduced_trees.py @@ -14,12 +14,14 @@ Creates reduced versions of the tree in the database: removing some more, despite any node descriptions. """ -import sys, re +import argparse +import sys +import re import sqlite3 DB_FILE = 'data.db' PICKED_NODES_FILE = 'picked_nodes.txt' -# + COMP_NAME_REGEX = re.compile(r'\[.+ \+ .+]') # Used to recognise composite nodes class Node: @@ -30,16 +32,18 @@ class Node: self.tips = tips self.pSupport = pSupport +# ========== For data generation ========== + def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None: print('Opening database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + print('Finding root node') query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1' (rootName,) = dbCur.execute(query).fetchone() print(f'Found \'{rootName}\'') - # + print('=== Getting picked-nodes ===') pickedNames: set[str] = set() pickedTreeExists = False @@ -63,7 +67,7 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None: for (name,) in dbCur.execute('SELECT name FROM nodes_p'): pickedNames.add(name) print(f'Found {len(pickedNames)} names') - # + if (tree == 'picked' or tree is None) and not pickedTreeExists: print('=== Generating picked-nodes tree ===') genPickedNodeTree(dbCur, pickedNames, rootName) @@ -88,22 +92,27 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None: if tree == 'trimmed' or tree is None: print('=== Generating weakly-trimmed tree ===') genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName) - # + print('Closing database') dbCon.commit() dbCon.close() + def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: str) -> None: PREF_NUM_CHILDREN = 3 # Include extra children up to this limit + print('Getting ancestors') nodeMap = genNodeMap(dbCur, pickedNames, 100) print(f'Result has {len(nodeMap)} nodes') + print('Removing composite nodes') removedNames = removeCompositeNodes(nodeMap) print(f'Result has {len(nodeMap)} nodes') + print('Removing \'collapsible\' nodes') temp = removeCollapsibleNodes(nodeMap, pickedNames) removedNames.update(temp) print(f'Result has {len(nodeMap)} nodes') + print('Adding some additional nearby children') namesToAdd: list[str] = [] iterNum = 0 @@ -111,7 +120,7 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st iterNum += 1 if iterNum % 100 == 0: print(f'At iteration {iterNum}') - # + numChildren = len(node.children) if numChildren < PREF_NUM_CHILDREN: children = [row[0] for row in dbCur.execute('SELECT child FROM edges where parent = ?', (name,))] @@ -134,33 +143,44 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st parent = None if parent == '' else parent nodeMap[name] = Node(id, [], parent, 0, pSupport == 1) print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') updateTips(rootName, nodeMap) + print('Creating table') addTreeTables(nodeMap, dbCur, 'p') + def genImagesOnlyTree( dbCur: sqlite3.Cursor, nodesWithImgOrPicked: set[str], pickedNames: set[str], rootName: str) -> None: + print('Getting ancestors') nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4) print(f'Result has {len(nodeMap)} nodes') + print('Removing composite nodes') removeCompositeNodes(nodeMap) print(f'Result has {len(nodeMap)} nodes') + print('Removing \'collapsible\' nodes') removeCollapsibleNodes(nodeMap, pickedNames) print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') # Needed for next trimming step updateTips(rootName, nodeMap) + print('Trimming from nodes with \'many\' children') trimIfManyChildren(nodeMap, rootName, 300, pickedNames) print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') updateTips(rootName, nodeMap) + print('Creating table') addTreeTables(nodeMap, dbCur, 'i') + def genWeaklyTrimmedTree( dbCur: sqlite3.Cursor, nodesWithImgDescOrPicked: set[str], @@ -169,6 +189,7 @@ def genWeaklyTrimmedTree( print('Getting ancestors') nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5) print(f'Result has {len(nodeMap)} nodes') + print('Getting nodes to \'strongly keep\'') iterNum = 0 nodesFromImgOrPicked: set[str] = set() @@ -184,19 +205,26 @@ def genWeaklyTrimmedTree( else: break print(f'Node set has {len(nodesFromImgOrPicked)} nodes') + print('Removing \'collapsible\' nodes') removeCollapsibleNodes(nodeMap, nodesWithImgDescOrPicked) print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') # Needed for next trimming step updateTips(rootName, nodeMap) + print('Trimming from nodes with \'many\' children') trimIfManyChildren(nodeMap, rootName, 600, nodesFromImgOrPicked) print(f'Result has {len(nodeMap)} nodes') + print('Updating \'tips\' values') updateTips(rootName, nodeMap) + print('Creating table') addTreeTables(nodeMap, dbCur, 't') -# Helper functions + +# ========== Helper functions ========== + def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -> dict[str, Node]: """ Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map """ nodeMap: dict[str, Node] = {} @@ -206,7 +234,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) - iterNum += 1 if iterNum % itersBeforePrint == 0: print(f'At iteration {iterNum}') - # + prevName: str | None = None while name is not None: if name not in nodeMap: @@ -227,6 +255,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) - nodeMap[name].children.append(prevName) break return nodeMap + def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]: """ Given a tree, removes composite-name nodes, and returns the removed nodes' names """ namesToRemove: set[str] = set() @@ -244,10 +273,12 @@ def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]: for name in namesToRemove: del nodeMap[name] return namesToRemove + def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set()) -> set[str]: """ Given a tree, removes single-child parents, then only-childs, with given exceptions, and returns the set of removed nodes' names """ namesToRemove: set[str] = set() + # Remove single-child parents for name, node in nodeMap.items(): if len(node.children) == 1 and node.parent is not None and name not in nodesToKeep: @@ -262,6 +293,7 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set namesToRemove.add(name) for name in namesToRemove: del nodeMap[name] + # Remove only-childs (not redundant because 'nodesToKeep' can cause single-child parents to be kept) namesToRemove.clear() for name, node in nodeMap.items(): @@ -277,8 +309,9 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set namesToRemove.add(name) for name in namesToRemove: del nodeMap[name] - # + return namesToRemove + def trimIfManyChildren( nodeMap: dict[str, Node], rootName: str, childThreshold: int, nodesToKeep: set[str] = set()) -> None: namesToRemove: set[str] = set() @@ -299,14 +332,17 @@ def trimIfManyChildren( # Recurse on children for n in node.children: findTrimmables(n) + def markForRemoval(nodeName: str) -> None: nonlocal nodeMap, namesToRemove namesToRemove.add(nodeName) for child in nodeMap[nodeName].children: markForRemoval(child) + findTrimmables(rootName) for nodeName in namesToRemove: del nodeMap[nodeName] + def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int: """ Updates the 'tips' values for a node and it's descendants, returning the node's new 'tips' value """ node = nodeMap[nodeName] @@ -314,6 +350,7 @@ def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int: tips = max(1, tips) node.tips = tips return tips + def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str): """ Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix """ nodesTbl = f'nodes_{suffix}' @@ -328,10 +365,11 @@ def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str): pSupport = 1 if nodeMap[childName].pSupport else 0 dbCur.execute(f'INSERT INTO {edgesTbl} VALUES (?, ?, ?)', (name, childName, pSupport)) +# ========== Main block ========== + if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree') args = parser.parse_args() - # + genData(args.tree, DB_FILE, PICKED_NODES_FILE) diff --git a/backend/tol_data/review_imgs_to_gen.py b/backend/tol_data/review_imgs_to_gen.py index 2283ed7..f384ddf 100755 --- a/backend/tol_data/review_imgs_to_gen.py +++ b/backend/tol_data/review_imgs_to_gen.py @@ -11,8 +11,11 @@ The program looks for an existing output file to determine what choices have already been made. """ -import os, time +import argparse +import os +import time import sqlite3 + import tkinter as tki from tkinter import ttk import PIL @@ -22,7 +25,7 @@ EOL_IMG_DIR = os.path.join('eol', 'imgs') ENWIKI_IMG_DIR = os.path.join('enwiki', 'imgs') DB_FILE = 'data.db' OUT_FILE = 'img_list.txt' -# + IMG_DISPLAY_SZ = 400 PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) REVIEW = 'only pairs' # Can be: 'all', 'only pairs', 'none' @@ -32,11 +35,13 @@ class ImgReviewer: def __init__(self, root, nodeToImgs, eolImgDir, enwikiImgDir, outFile, dbCon, review): self.root = root root.title('Image Reviewer') + # Setup main frame mainFrame = ttk.Frame(root, padding='5 5 5 5') mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S)) root.columnconfigure(0, weight=1) root.rowconfigure(0, weight=1) + # Set up images-to-be-reviewed frames self.eolImg = ImageTk.PhotoImage(PLACEHOLDER_IMG) self.enwikiImg = ImageTk.PhotoImage(PLACEHOLDER_IMG) @@ -47,14 +52,17 @@ class ImgReviewer: label = ttk.Label(frame, image=self.eolImg if i == 0 else self.enwikiImg) label.grid(column=0, row=0) self.labels.append(label) + # Add padding for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) + # Add keyboard bindings root.bind('<q>', self.quit) root.bind('<Key-j>', lambda evt: self.accept(0)) root.bind('<Key-k>', lambda evt: self.accept(1)) root.bind('<Key-l>', lambda evt: self.reject()) + # Set fields self.nodeImgsList = list(nodeToImgs.items()) self.listIdx = -1 @@ -69,8 +77,10 @@ class ImgReviewer: self.enwikiImgPath = None self.numReviewed = 0 self.startTime = time.time() + # Initialise images to review self.getNextImgs() + def getNextImgs(self): """ Updates display with new images to review, or ends program """ # Get next image paths @@ -81,6 +91,7 @@ class ImgReviewer: self.quit() return self.otolId, imgPaths = self.nodeImgsList[self.listIdx] + # Potentially skip user choice if len(imgPaths) == 1 and (self.review == 'only pairs' or self.review == 'none'): with open(self.outFile, 'a') as file: @@ -91,6 +102,7 @@ class ImgReviewer: file.write(f'{self.otolId} {imgPaths[-1]}\n') # Prefer enwiki image continue break + # Update displayed images self.eolImgPath = self.enwikiImgPath = None imageOpenError = False @@ -113,20 +125,24 @@ class ImgReviewer: print(f'Unexpected image path {imgPath}') self.quit() return + # Re-iterate if all image paths invalid if self.eolImgPath is None and self.enwikiImgPath is None: if imageOpenError: self.reject() self.getNextImgs() return + # Add placeholder images if self.eolImgPath is None: self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG)) elif self.enwikiImgPath is None: self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG)) + # Update image-frames self.labels[0].config(image=self.eolImg) self.labels[1].config(image=self.enwikiImg) + # Update title title = f'Images for otol ID {self.otolId}' query = 'SELECT names.alt_name FROM' \ @@ -137,6 +153,7 @@ class ImgReviewer: title += f', aka {row[0]}' title += f' ({self.listIdx + 1} out of {len(self.nodeImgsList)})' self.root.title(title) + def accept(self, imgIdx): """ React to a user selecting an image """ imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath @@ -147,12 +164,14 @@ class ImgReviewer: file.write(f'{self.otolId} {imgPath}\n') self.numReviewed += 1 self.getNextImgs() + def reject(self): """"" React to a user rejecting all images of a set """ with open(self.outFile, 'a') as file: file.write(f'{self.otolId}\n') self.numReviewed += 1 self.getNextImgs() + def quit(self, e = None): print(f'Number reviewed: {self.numReviewed}') timeElapsed = time.time() - self.startTime @@ -161,6 +180,7 @@ class ImgReviewer: print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') self.dbCon.close() self.root.destroy() + def resizeImgForDisplay(self, img): """ Returns a copy of an image, shrunk to fit it's frame (keeps aspect ratio), and with a background """ if max(img.width, img.height) > IMG_DISPLAY_SZ: @@ -180,7 +200,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev print('Opening database') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() - # + nodeToImgs: dict[str, list[str]] = {} # Maps otol-ids to arrays of image paths print('Iterating through images from EOL') if os.path.exists(eolImgDir): @@ -198,6 +218,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev if not found: print(f'WARNING: No node found for {os.path.join(eolImgDir, filename)}') print(f'Result: {len(nodeToImgs)} nodes with images') + print('Iterating through images from Wikipedia') if os.path.exists(enwikiImgDir): for filename in os.listdir(enwikiImgDir): @@ -214,7 +235,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev if not found: print(f'WARNING: No node found for {os.path.join(enwikiImgDir, filename)}') print(f'Result: {len(nodeToImgs)} nodes with images') - # + print('Filtering out already-made image choices') oldSz = len(nodeToImgs) if os.path.exists(outFile): @@ -225,7 +246,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev line = line[:line.find(' ')] del nodeToImgs[line] print(f'Filtered out {oldSz - len(nodeToImgs)} entries') - # + # Create GUI and defer control print('Starting GUI') root = tki.Tk() @@ -234,8 +255,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() - # + reviewImgs(EOL_IMG_DIR, ENWIKI_IMG_DIR, DB_FILE, OUT_FILE, REVIEW) diff --git a/backend/tol_data/wikidata/gen_taxon_src_data.py b/backend/tol_data/wikidata/gen_taxon_src_data.py index 1bddb6e..d2a3811 100755 --- a/backend/tol_data/wikidata/gen_taxon_src_data.py +++ b/backend/tol_data/wikidata/gen_taxon_src_data.py @@ -30,10 +30,21 @@ OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022). # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). # Possibly related: https://github.com/python/cpython/issues/72882 -import sys, os, re, math, io +import argparse +import sys +import os +import re +import math +import io from collections import defaultdict -import bz2, json, sqlite3 -import multiprocessing, indexed_bzip2, pickle, tempfile +import bz2 +import json +import sqlite3 + +import multiprocessing +import indexed_bzip2 +import pickle +import tempfile WIKIDATA_FILE = 'latest-all.json.bz2' OFFSETS_FILE = 'offsets.dat' @@ -49,9 +60,12 @@ IUCN_STATUS_IDS = { 'Q11394': 'endangered', 'Q219127': 'critically endangered', 'Q239509': 'extinct in the wild', 'Q237350': 'extinct species', 'Q3245245': 'data deficient' } + # For filtering lines before parsing JSON LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")').encode()) +# ========== For data generation ========== + def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: """ Reads the dump and writes source/iucn info to db """ # Maps to populate @@ -59,10 +73,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No idToTitle: dict[int, str] = {} # Maps wikidata ID to enwiki title idToAltId: dict[int, int] = {} # Maps taxon-item wikidata ID to taxon-alt ID (eg: 'canis lupus familiaris' -> 'dog') idToIucnStatus: dict[int, str] = {} # Maps wikidata ID to iucn-status string ('least concern', etc) + # Check db if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) + # Read dump if nProcs == 1: with bz2.open(wikidataFile, mode='rb') as file: @@ -76,6 +92,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No with indexed_bzip2.open(wikidataFile) as file: with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) + print('Allocating file into chunks') fileSz: int # About 1.4 TB with indexed_bzip2.open(wikidataFile) as file: @@ -86,6 +103,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1] # Each adjacent pair specifies a start+end byte index for readDumpChunk() print(f'- Chunk size: {chunkSz:,}') + print('Starting processes to read dump') with tempfile.TemporaryDirectory() as tempDirName: # Using maxtasksperchild=1 to free resources on task completion @@ -103,7 +121,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No idToTitle.update(maps[1]) idToAltId.update(maps[2]) idToIucnStatus.update(maps[3]) - # + print('Writing to db') dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() @@ -127,6 +145,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No # The 'OR IGNORE' allows for multiple taxons using the same alt dbCon.commit() dbCon.close() + def readDumpLine( lineBytes: bytes, srcIdToId: dict[str, dict[int, int]], @@ -160,6 +179,7 @@ def readDumpLine( return if not isTaxon and not altTaxa: return + # Get wikidata ID and enwiki title itemId: int | None = None itemTitle: str | None = None @@ -172,11 +192,13 @@ def readDumpLine( itemTitle = None else: return + # Update maps if itemTitle is not None: idToTitle[itemId] = itemTitle for altId in altTaxa: idToAltId[altId] = itemId + # Check for source IDs for srcPropId, src in SRC_PROP_IDS.items(): if srcPropId in claims: @@ -185,6 +207,7 @@ def readDumpLine( srcIdToId[src][srcId] = itemId except (KeyError, ValueError): continue + # Check for IUCN status if 'P141' in claims: # Check for 'iucn conservation status' statement try: @@ -192,9 +215,11 @@ def readDumpLine( idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId] except KeyError: pass + def readDumpChunkOneParam(params: tuple[int, str, str, int, int, str]) -> str: """ Forwards to readDumpChunk(), for use with pool.map() """ return readDumpChunk(*params) + def readDumpChunk( procId: int, wikidataFile: str, offsetsFile: str, startByte: int, endByte: int, outFilename: str) -> str: """ Reads lines in the dump that begin after a start-byte, and not after an end byte. @@ -205,18 +230,21 @@ def readDumpChunk( dict[int, str], dict[int, int], dict[int, str]] = (defaultdict(dict), {}, {}, {}) + # Read dump with indexed_bzip2.open(wikidataFile) as file: # Load offsets file with open(offsetsFile, 'rb') as file2: offsets = pickle.load(file2) file.set_block_offsets(offsets) + # Seek to chunk if startByte != -1: file.seek(startByte) file.readline() else: startByte = 0 # Used for progress calculation + # Read lines count = 0 while file.tell() <= endByte: @@ -225,15 +253,17 @@ def readDumpChunk( perc = (file.tell() - startByte) / (endByte - startByte) * 100 print(f'Thread {procId}: {perc:.2f}%') readDumpLine(file.readline(), *maps) + # Output results into file with open(outFilename, 'wb') as file: pickle.dump(maps, file) return outFilename +# ========== Main block ========== + if __name__ == '__main__': # Guard needed for multiprocessing - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) args = parser.parse_args() - # + multiprocessing.set_start_method('spawn') genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS) |
