diff options
Diffstat (limited to 'backend')
83 files changed, 4758 insertions, 2404 deletions
diff --git a/backend/README.md b/backend/README.md index fc68183..0f3f332 100644 --- a/backend/README.md +++ b/backend/README.md @@ -1,5 +1,11 @@ # Files -- **tolData**: Holds scripts for generating the tree-of-life database -- **tilo.py**: WSGI script that serves data from the tree-of-life database. <br> - Note: Using WSGI instead of CGI to avoid starting a new process for each request. +- **tol_data/**: Holds scripts for generating the tree-of-life database +- **tilo.py**: WSGI script that serves data from the tree-of-life database. <br> + Note: WSGI is used instead of CGI to avoid starting a new process for each request. - **server.py**: Basic dev server that serves the WSGI script and image files +- **tests/**: Holds unit testing scripts.<br> + Running all tests: `python -m unittest discover -s tests` <br> + Running a particular test: `python -m unittest tests/test_script1.py` <br> + Getting code coverage info (uses python package 'coverage'): <br> + 1. `coverage run -m unittest discover -s tests` + 2. 
def createTestFile(filename: str, content: str) -> None:
    """ Creates a text file with the given name and contents.

    The file is always written as UTF-8, so the bytes on disk do not depend on
    the platform's locale encoding (and agree with createTestBz2())."""
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

def readTestFile(filename: str) -> str:
    """ Returns the contents of a text file with the given name, decoded as UTF-8 """
    with open(filename, encoding='utf-8') as file:
        return file.read()

def createTestBz2(filename: str, content: str) -> None:
    """ Creates a bzip2 file with the given name, holding the UTF-8 encoding of the given contents """
    with bz2.open(filename, mode='wb') as file:
        file.write(content.encode('utf-8'))

def createTestGzip(filename: str, content: str) -> None:
    """ Creates a gzip file with the given name, holding the given contents as UTF-8 text """
    # Text mode would otherwise fall back to the locale's preferred encoding
    with gzip.open(filename, mode='wt', encoding='utf-8') as file:
        file.write(content)
class TestGenData(unittest.TestCase):
    """ Runs genData() on small bzip2-compressed dbpedia dump files, and checks the tables it writes """
    def test_gen(self):
        with tempfile.TemporaryDirectory() as tempDir:
            # Create temp labels file
            labelsFile = os.path.join(tempDir, 'labels.ttl.bz2')
            createTestBz2(labelsFile, (
                '<http://dbpedia.org/resource/One> <http://www.w3.org/2000/01/rdf-schema#label> "One"@en .\n'
                '<http://dbpedia.org/resource/Two> <http://www.w3.org/2000/01/rdf-schema#label> "II"@en .\n'
                '<http://dbpedia.org/resource/Three> <http://www.w3.org/2000/01/rdf-schema#label> "three"@en .\n'
                '<http://dbpedia.org/resource/A_Hat> <http://www.w3.org/2000/01/rdf-schema#label> "A Hat"@en .\n'
            ))
            # Create temp ids file
            # (Joined like the other fixtures. Formatting the name directly onto tempDir would omit the
            # path separator, placing the file next to the temp dir, where it is never cleaned up.)
            idsFile = os.path.join(tempDir, 'ids.ttl.bz2')
            createTestBz2(idsFile, (
                '<http://dbpedia.org/resource/One> <http://dbpedia.org/ontology/wikiPageID>'
                ' "1"^^<http://www.w3.org/2001/XMLSchema#integer> .\n'
                '<http://dbpedia.org/resource/Two> <http://dbpedia.org/ontology/wikiPageID>'
                ' "2"^^<http://www.w3.org/2001/XMLSchema#integer> .\n'
                '<http://dbpedia.org/resource/Three> <http://dbpedia.org/ontology/wikiPageID>'
                ' "3"^^<http://www.w3.org/2001/XMLSchema#integer> .\n'
                '<http://dbpedia.org/resource/A_Hat> <http://dbpedia.org/ontology/wikiPageID>'
                ' "210"^^<http://www.w3.org/2001/XMLSchema#integer> .\n'
            ))
            # Create temp redirects file
            redirectsFile = os.path.join(tempDir, 'redirects.ttl.bz2')
            createTestBz2(redirectsFile, (
                '<http://dbpedia.org/resource/Three> <http://dbpedia.org/ontology/wikiPageRedirects>'
                ' <http://dbpedia.org/resource/A_Hat> .\n'
            ))
            # Create temp disambig file
            disambigFile = os.path.join(tempDir, 'disambig.ttl.bz2')
            createTestBz2(disambigFile, (
                '<http://dbpedia.org/resource/Two> <http://dbpedia.org/ontology/wikiPageDisambiguates>'
                ' <http://dbpedia.org/resource/One> .\n'
                '<http://dbpedia.org/resource/Two> <http://dbpedia.org/ontology/wikiPageDisambiguates>'
                ' <http://dbpedia.org/resource/Three> .\n'
            ))
            # Create temp types file
            typesFile = os.path.join(tempDir, 'types.ttl.bz2')
            createTestBz2(typesFile, (
                '<http://dbpedia.org/resource/One> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
                ' <http://dbpedia.org/ontology/Thing> .\n'
                '<http://dbpedia.org/resource/Three> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
                ' <http://dbpedia.org/ontology/Thing> .\n'
            ))
            # Create temp abstracts file
            abstractsFile = os.path.join(tempDir, 'abstracts.ttl.bz2')
            createTestBz2(abstractsFile, (
                '<http://dbpedia.org/resource/One> <http://www.w3.org/2000/01/rdf-schema#comment>'
                ' "One is a number."@en .\n'
                '<http://dbpedia.org/resource/A_Hat> <http://www.w3.org/2000/01/rdf-schema#comment>'
                ' "Hats are not parrots, nor are they potatoes."@en .\n'
            ))
            # Run
            dbFile = os.path.join(tempDir, 'descData.db')
            genData(labelsFile, idsFile, redirectsFile, disambigFile, typesFile, abstractsFile, dbFile)
            # Check
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT iri, label from labels'),
                {
                    ('http://dbpedia.org/resource/One', 'One'),
                    ('http://dbpedia.org/resource/Two', 'II'),
                    ('http://dbpedia.org/resource/Three', 'three'),
                    ('http://dbpedia.org/resource/A_Hat', 'A Hat'),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT iri, id from ids'),
                {
                    ('http://dbpedia.org/resource/One', 1),
                    ('http://dbpedia.org/resource/Two', 2),
                    ('http://dbpedia.org/resource/Three', 3),
                    ('http://dbpedia.org/resource/A_Hat', 210),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT iri, target from redirects'),
                {
                    ('http://dbpedia.org/resource/Three', 'http://dbpedia.org/resource/A_Hat'),
                }
            )
            # Both disambiguation lines share one subject, so a single row is expected
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT iri from disambiguations'),
                {
                    ('http://dbpedia.org/resource/Two',),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT iri, type from types'),
                {
                    ('http://dbpedia.org/resource/One', 'http://dbpedia.org/ontology/Thing'),
                    ('http://dbpedia.org/resource/Three', 'http://dbpedia.org/ontology/Thing'),
                }
            )
            self.assertEqual(
                readTestDbTable(dbFile, 'SELECT iri, abstract from abstracts'),
                {
                    ('http://dbpedia.org/resource/One', 'One is a number.'),
                    ('http://dbpedia.org/resource/A_Hat', 'Hats are not parrots, nor are they potatoes.'),
                }
            )
mode 100644 index 0000000..ed6e426 --- /dev/null +++ b/backend/tests/enwiki/test_download_img_license_info.py @@ -0,0 +1,185 @@ +import unittest +from unittest.mock import Mock, patch +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from tol_data.enwiki.download_img_license_info import downloadInfo + +TEST_RESPONSE1 = { + 'batchcomplete': '', + 'query': { + 'normalized': [ + { + 'from': 'File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'to': 'File:Georgia Aquarium - Giant Grouper edit.jpg' + } + ], + 'pages': { + '-1': { + 'ns': 6, + 'title': 'File:Octopus2.jpg', + 'missing': '', + 'known': '', + 'imagerepository': 'shared', + 'imageinfo': [ + { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg', + 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Octopus2.jpg', + 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=2795257', + 'extmetadata': { + 'Credit': { + 'value': '<span class=\\"int-own-work\\" lang=\\"en\\">Own work</span>', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Artist': { + 'value': 'albert kok', + 'source': 'commons-desc-page' + }, + 'LicenseShortName': { + 'value': 'CC BY-SA 3.0', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Restrictions': { + 'value': '', + 'source': 'commons-desc-page', + 'hidden': '' + } + } + } + ] + } + } + } +} +TEST_RESPONSE2 = { + 'batchcomplete': '', + 'query': { + 'normalized': [ + { + 'from': 'File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'to': 'File:Georgia Aquarium - Giant Grouper edit.jpg' + } + ], + 'pages': { + '-1': { + 'ns': 6, + 'title': 'File:Octopus2.jpg', + 'missing': '', + 'known': '', + 'imagerepository': 'shared', + 'imageinfo': [ + { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg', + 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Octopus2.jpg', + 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=2795257', + 'extmetadata': { + 
'Credit': { + 'value': '<span class=\\"int-own-work\\" lang=\\"en\\">Own work</span>', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Artist': { + 'value': 'albert kok', + 'source': 'commons-desc-page' + }, + 'LicenseShortName': { + 'value': 'CC BY-SA 3.0', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Restrictions': { + 'value': '', + 'source': 'commons-desc-page', + 'hidden': '' + } + } + } + ] + }, + '-2': { + 'ns': 6, + 'title': 'File:Georgia Aquarium - Giant Grouper edit.jpg', + 'missing': '', + 'known': '', + 'imagerepository': 'shared', + 'imageinfo': [ + { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/2/23/Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Georgia_Aquarium_-_Giant_Grouper_edit.jpg', + 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=823649', + 'extmetadata': { + 'Credit': { + "value": "<a href=\"//commons.wikimedia.org/wiki/File:Georgia_Aquarium_-_Giant_Grouper.jpg\" title=\"File:Georgia Aquarium - Giant Grouper.jpg\">File:Georgia Aquarium - Giant Grouper.jpg</a>", + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Artist': { + "value": "Taken by <a href=\"//commons.wikimedia.org/wiki/User:Diliff\" title=\"User:Diliff\">Diliff</a> Edited by <a href=\"//commons.wikimedia.org/wiki/User:Fir0002\" title=\"User:Fir0002\">Fir0002</a>", + 'source': 'commons-desc-page' + }, + 'LicenseShortName': { + 'value': 'CC BY 2.5', + 'source': 'commons-desc-page', + 'hidden': '' + }, + 'Restrictions': { + 'value': '', + 'source': 'commons-desc-page', + 'hidden': '' + } + } + } + ] + } + } + } +} + +class TestDownloadInfo(unittest.TestCase): + @patch('requests.get', autospec=True) + def test_download(self, requestsGetMock): + requestsGetMock.side_effect = [Mock(json=lambda: TEST_RESPONSE1), Mock(json=lambda: TEST_RESPONSE2)] + with tempfile.TemporaryDirectory() as tempDir: + # Create temp image-data db + imgDb = os.path.join(tempDir, 'img_data.db') + 
class TestDownloadInfo(unittest.TestCase):
    """ Runs downloadImgs() against a temp image-data db, with requests.get() replaced by a mock.

    NOTE(review): the class name matches the one in the license-info test; it looks copied,
    and 'TestDownloadImgs' may have been intended - confirm before renaming."""
    @patch('requests.get', autospec=True)
    def test_download(self, requestsGetMock):
        def fakeGet(url, **kwargs):
            # The 'downloaded' content is derived from the URL, so each output file can be checked
            return Mock(content=('img:' + url).encode())
        requestsGetMock.side_effect = fakeGet
        pageImgRows = {
            (1, 'one'),
            (2, 'two'),
            (3, 'three'),
            (4, 'four'),
            (5, 'five'),
            (6, 'six'),
            (7, 'seven'),
        }
        imgRows = {
            ('one','cc-by','alice','anna','','https://upload.wikimedia.org/1.jpg'),
            ('two','???','bob','barbara','','https://upload.wikimedia.org/2.png'),
            ('three','cc-by-sa','clare','File:?','','https://upload.wikimedia.org/3.gif'),
            ('four','cc-by-sa 4.0','dave','dan','all','https://upload.wikimedia.org/4.jpeg'),
            ('five','cc0','eve','eric',None,'https://upload.wikimedia.org/5.png'),
            ('six','cc-by','','fred','','https://upload.wikimedia.org/6.png'),
        }
        # Output file name -> expected file content
        expectedImgs = {
            '1.jpg': 'img:https://upload.wikimedia.org/1.jpg',
            '5.png': 'img:https://upload.wikimedia.org/5.png',
        }
        with tempfile.TemporaryDirectory() as workDir:
            # Create temp image-data db
            imgDb = os.path.join(workDir, 'img_data.db')
            createTestDbTable(
                imgDb,
                'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)',
                'INSERT into page_imgs VALUES (?, ?)',
                pageImgRows
            )
            createTestDbTable(
                imgDb,
                (
                    'CREATE TABLE imgs'
                    '(name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)'
                ),
                'INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
                imgRows
            )
            # Create temp output directory
            with tempfile.TemporaryDirectory() as outDir:
                # Run
                downloadImgs(imgDb, outDir, 0)
                # Check
                self.assertEqual(set(os.listdir(outDir)), set(expectedImgs))
                for fileName in expectedImgs:
                    outPath = os.path.join(outDir, fileName)
                    self.assertEqual(readTestFile(outPath), expectedImgs[fileName])
def runGenData(indexFileContents: str) -> set[tuple]:
    """ Sets up an index file to be read by genData(), runs it, and reads the output database.

    Returns the set of (title, id, offset, next_offset) rows found in the 'offsets' table."""
    with tempfile.TemporaryDirectory() as tempDir:
        # Create temp index file
        indexFile = os.path.join(tempDir, 'index.txt.bz2')
        createTestBz2(indexFile, indexFileContents)
        # Run
        dbFile = os.path.join(tempDir, 'data.db')
        genData(indexFile, dbFile)
        # Read db (must happen before the temp directory is removed)
        return readTestDbTable(dbFile, 'SELECT title, id, offset, next_offset FROM offsets')

class TestGenData(unittest.TestCase):
    """ Checks the offsets table generated from index-file lines of the form 'offset:id:title' """
    def setUp(self):
        self.maxDiff = None # Remove output-diff size limit
    def test_index_file(self):
        indexFileContents = (
            '100:10:apple\n'
            '100:11:ant\n'
            '300:99:banana ice-cream\n'
            '1000:2030:Custard!\n'
        )
        offsets = runGenData(indexFileContents)
        # Each row's next_offset is the next larger offset in the file, or -1 if there is none
        self.assertEqual(offsets, {
            ('apple', 10, 100, 300),
            ('ant', 11, 100, 300),
            ('banana ice-cream', 99, 300, 1000),
            ('Custard!', 2030, 1000, -1),
        })
    def test_emp_index(self):
        # An empty index file should yield an empty offsets table
        offsets = runGenData('')
        self.assertEqual(offsets, set())
class TestGenData(unittest.TestCase):
    """ Runs genData() on two small pageview dumps plus a dump-index db, and checks the 'views' table.

    NOTE(review): the expected view counts (5 and 7) are half of each en.wikipedia title's total
    across the two files, so genData() presumably averages over the input files - confirm."""
    def test_gen(self):
        # Dump file name -> dump file content
        dumps = {
            'pageviews1.bz2': (
                'aa.wikibooks One null desktop 1 W1\n'
                'en.wikipedia Two null mobile-web 10 A9B1\n'
                'en.wikipedia Three null desktop 4 D3\n'
            ),
            'pageviews2.bz2': (
                'fr.wikipedia Four null desktop 12 T6U6\n'
                'en.wikipedia Three null desktop 10 E4G5Z61\n'
            ),
        }
        indexRows = {
            ('One', 1, 0, -1),
            ('Two', 2, 0, -1),
            ('Three', 3, 0, -1),
            ('Four', 4, 0, -1),
        }
        expectedViews = {
            ('Two', 2, 5),
            ('Three', 3, 7),
        }
        with tempfile.TemporaryDirectory() as workDir:
            # Create temp pageview files
            pageviewFiles = [os.path.join(workDir, dumpName) for dumpName in dumps]
            for dumpPath, dumpText in zip(pageviewFiles, dumps.values()):
                createTestBz2(dumpPath, dumpText)
            # Create temp dump-index db
            dumpIndexDb = os.path.join(workDir, 'dump_index.db')
            createTestDbTable(
                dumpIndexDb,
                'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)',
                'INSERT INTO offsets VALUES (?, ?, ?, ?)',
                indexRows
            )
            # Run
            dbFile = os.path.join(workDir, 'data.db')
            genData(pageviewFiles, dumpIndexDb, dbFile)
            # Check
            viewRows = readTestDbTable(dbFile, 'SELECT title, id, views from views')
            self.assertEqual(viewRows, expectedViews)
KEY, page_id INT, source_url TEXT,' \ + ' copy_url TEXT, license TEXT, copyright_owner TEXT)', + 'INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', + { + (10, 1, '???', 'https://content.eol.org/1.jpg', 'cc-by-sa', 'owner1'), + (20, 2, '', 'https://content.eol.org/2.jpg', 'cc-by', 'owner2'), + (21, 2, '', 'https://content.eol.org/2b.jpg', 'public domain', 'owner2'), + (22, 2, '', 'https://content.eol.org/2c.jpg', '???', 'owner3'), + (23, 2, '', 'data/2d.jpg', 'cc-by-nc', 'owner5'), + (24, 2, '', 'https://content.eol.org/2e', 'cc-by', 'owner6'), + (25, 2, '', 'https://content.eol.org/2f.gif', 'cc-by', 'owner7'), + (30, 3, '', 'https://content.eol.org/3.png', 'cc-by', 'owner3'), + } + ) + # Create temp output dir + with tempfile.TemporaryDirectory() as outDir: + # Run + downloadImgs(eolIds, imagesListDb, outDir) + # Check + expectedImgs1 = { + '1 10.jpg': 'img:https://content.eol.org/1.jpg', + '2 20.jpg': 'img:https://content.eol.org/2.jpg', + '2 23.jpg': 'img:https://content.eol.org/data/2d.jpg', + '2 25.gif': 'img:https://content.eol.org/2f.gif', + } + expectedImgs2 = { + '1 10.jpg': 'img:https://content.eol.org/1.jpg', + '2 21.jpg': 'img:https://content.eol.org/2b.jpg', + '2 23.jpg': 'img:https://content.eol.org/data/2d.jpg', + '2 25.gif': 'img:https://content.eol.org/2f.gif', + } + outImgSet = set(os.listdir(outDir)) + expectedImgSet1 = set(expectedImgs1.keys()) + expectedImgSet2 = set(expectedImgs2.keys()) + self.assertIn(outImgSet, (expectedImgSet1, expectedImgSet2)) + matchingImgs = expectedImgs1 if outImgSet == expectedImgSet1 else expectedImgs2 + for imgName, imgContent in matchingImgs.items(): + self.assertEqual(readTestFile(os.path.join(outDir, imgName)), imgContent) diff --git a/backend/tests/eol/test_gen_images_list_db.py b/backend/tests/eol/test_gen_images_list_db.py new file mode 100644 index 0000000..ca9b495 --- /dev/null +++ b/backend/tests/eol/test_gen_images_list_db.py @@ -0,0 +1,32 @@ +import unittest +import tempfile, os + +from tests.common import 
class TestGenData(unittest.TestCase):
    """ Runs genData() on two csv files matched by a glob pattern, and checks the 'images' table """
    def test_gen(self):
        # Csv file name -> csv file content (only the first file starts with a header line)
        csvFiles = {
            'imgs-1.csv': (
                'EOL content ID,EOL page ID,Medium Source URL,EOL Full-Size Copy URL,License Name,Copyright Owner\n'
                '1,10,https://example.com/1/,https://content.eol.org/1.jpg,cc-by,owner1\n'
                '2,20,https://example2.com/2/,https://content.eol.org/2.jpg,cc-by-sa,owner2\n'
            ),
            'imgs-2.csv': (
                '3,30,https://example.com/3/,https://content.eol.org/3.png,public,owner3\n'
            ),
        }
        expectedRows = {
            (1, 10, 'https://example.com/1/', 'https://content.eol.org/1.jpg', 'cc-by', 'owner1'),
            (2, 20, 'https://example2.com/2/', 'https://content.eol.org/2.jpg', 'cc-by-sa', 'owner2'),
            (3, 30, 'https://example.com/3/', 'https://content.eol.org/3.png', 'public', 'owner3'),
        }
        with tempfile.TemporaryDirectory() as workDir:
            # Create temp images-list files
            imageListsGlob = os.path.join(workDir, 'imgs-*.csv')
            for csvName, csvText in csvFiles.items():
                createTestFile(os.path.join(workDir, csvName), csvText)
            # Run
            dbFile = os.path.join(workDir, 'imagesList.db')
            genData(imageListsGlob, dbFile)
            # Check
            selectCmd = 'SELECT content_id, page_id, source_url, copy_url, license, copyright_owner from images'
            self.assertEqual(readTestDbTable(dbFile, selectCmd), expectedRows)
class TestGenData(unittest.TestCase):
    """ Runs genData() on temp dbpedia, enwiki, and tree-of-life dbs, and checks the 'descs' table.

    NOTE(review): the dbpedia IRIs here are wrapped in '<' and '>', while the expected output in
    the dbpedia gen_desc_data test has bare IRIs - confirm which form the real database holds."""
    def test_gen(self):
        with tempfile.TemporaryDirectory() as workDir:
            dbpediaDb = os.path.join(workDir, 'dbp_descs.db')
            enwikiDb = os.path.join(workDir, 'enwiki_descs.db')
            dbFile = os.path.join(workDir, 'data.db')
            # Each entry holds: db file, table creation command, insertion command, rows
            fixtures = [
                # Temp dbpedia db
                (
                    dbpediaDb,
                    'CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)',
                    'INSERT INTO ids VALUES (?, ?)',
                    {
                        ('<http://dbpedia.org/resource/One>', 1),
                        ('<http://dbpedia.org/resource/Two>', 2),
                        ('<http://dbpedia.org/resource/Three>', 3),
                    },
                ),
                (
                    dbpediaDb,
                    'CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)',
                    'INSERT INTO redirects VALUES (?, ?)',
                    {
                        ('<http://dbpedia.org/resource/Two>', '<http://dbpedia.org/resource/Three>'),
                    },
                ),
                (
                    dbpediaDb,
                    'CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)',
                    'INSERT INTO abstracts VALUES (?, ?)',
                    {
                        ('<http://dbpedia.org/resource/One>', 'One from dbp'),
                        ('<http://dbpedia.org/resource/Two>', 'Two from dbp'),
                        ('<http://dbpedia.org/resource/Three>', 'Three from dbp'),
                    },
                ),
                # Temp enwiki db
                (
                    enwikiDb,
                    'CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)',
                    'INSERT INTO pages VALUES (?, ?)',
                    {
                        (1, 'I'),
                        (3, 'III'),
                        (4, 'IV'),
                        (5, 'V'),
                        (6, 'VI'),
                    },
                ),
                (
                    enwikiDb,
                    'CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)',
                    'INSERT INTO redirects VALUES (?, ?)',
                    {
                        (5, 'IV'),
                    },
                ),
                (
                    enwikiDb,
                    'CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)',
                    'INSERT INTO descs VALUES (?, ?)',
                    {
                        (1, 'One from enwiki'),
                        (3, 'Three from enwiki'),
                        (4, 'Four from enwiki'),
                        (5, 'Five from enwiki'),
                    },
                ),
                # Temp tree-of-life db
                (
                    dbFile,
                    'CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)',
                    'INSERT INTO wiki_ids VALUES (?, ?)',
                    {
                        ('first', 1),
                        ('second', 2),
                        ('third', 3),
                        ('fourth', 4),
                        ('fifth', 5),
                        ('sixth', 6),
                        ('seventh', 7),
                    },
                ),
            ]
            for dbPath, createCmd, insertCmd, rows in fixtures:
                createTestDbTable(dbPath, createCmd, insertCmd, rows)
            # Run
            genData(dbpediaDb, enwikiDb, dbFile)
            # Check
            expectedDescs = {
                (1, 'One from dbp', 1),
                (2, 'Three from dbp', 1),
                (3, 'Three from dbp', 1),
                (4, 'Four from enwiki', 0),
                (5, 'Four from enwiki', 0),
            }
            descRows = readTestDbTable(dbFile, 'SELECT wiki_id, desc, from_dbp from descs')
            self.assertEqual(descRows, expectedDescs)
createTestDbTable( + enwikiImgDb, + 'CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)', + 'INSERT INTO page_imgs VALUES (?, ?)', + { + (100, 'one.jpg'), + (200, 'two.jpeg'), + (300, 'two.jpeg'), + (400, 'two.jpeg'), + } + ) + createTestDbTable( + enwikiImgDb, + 'CREATE TABLE imgs (' \ + 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)', + 'INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', + { + ('one.jpg', 'CC BY-SA 3.0', 'author1', 'credits1', '', 'https://upload.wikimedia.org/one.jpg'), + ('two.jpeg', 'cc-by', 'author2', 'credits2', '', 'https://upload.wikimedia.org/two.jpeg'), + ('four.png', 'cc0', 'author3', '', '', 'https://upload.wikimedia.org/x.png'), + } + ) + # Create temp picked-images file + pickedImgsFile = os.path.join(tempDir, 'img_data.txt') + createTestFile(pickedImgsFile, ( + 'node5.jpg|url1|cc-by-sa 4.0|artist1|credit1\n' + )) + # Create temp picked-images + pickedImgDir = os.path.join(tempDir, 'picked_imgs') + os.mkdir(pickedImgDir) + shutil.copy(TEST_IMG, os.path.join(pickedImgDir, 'node5.jpg')) + # Create temp img-list file + imgListFile = os.path.join(tempDir, 'img_list.txt') + createTestFile(imgListFile, ( + 'ott1 ' + os.path.join(eolImgDir, '1 10.jpg') + '\n' + 'ott2 ' + os.path.join(enwikiImgDir, '200.jpeg') + '\n' + 'ott3\n' + 'ott4 ' + os.path.join(enwikiImgDir, '400.png') + '\n' + 'ott5 ' + os.path.join(eolImgDir, '5 50.jpg') + '\n' + )) + # Create temp tree-of-life db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes VALUES (?, ?, ?)', + { + ('node1', 'ott1', 1), + ('node2', 'ott2', 1), + ('node3', 'ott3', 2), + ('node4', 'ott4', 4), + ('node5', 'ott5', 1), + ('node6', 'ott6', 10), + } + ) + # Run + outDir = os.path.join(tempDir, 'img') + genImgs(imgListFile, eolImgDir, outDir, eolImgDb, enwikiImgDb, pickedImgDir, pickedImgsFile, dbFile) + # Check + 
self.assertEqual(set(os.listdir(outDir)), { + 'ott1.jpg', + 'ott2.jpg', + 'ott4.jpg', + 'ott5.jpg', + }) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, img_id, src from node_imgs'), + { + ('node1', 1, 'eol'), + ('node2', 200, 'enwiki'), + ('node4', 400, 'enwiki'), + ('node5', 1, 'picked'), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT id, src, url, license, artist, credit from images'), + { + (1, 'eol', 'https://example.com/1.jpg', 'cc-by', 'eol owner1', ''), + (200, 'enwiki', 'https://en.wikipedia.org/wiki/File:two.jpeg', 'cc-by', 'author2', 'credits2'), + (400, 'enwiki', 'https://en.wikipedia.org/wiki/File:two.jpeg', 'cc-by', 'author2', 'credits2'), + (1, 'picked', 'url1', 'cc-by-sa 4.0', 'artist1', 'credit1'), + } + ) diff --git a/backend/tests/test_gen_linked_imgs.py b/backend/tests/test_gen_linked_imgs.py new file mode 100644 index 0000000..b989407 --- /dev/null +++ b/backend/tests/test_gen_linked_imgs.py @@ -0,0 +1,84 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from tol_data.gen_linked_imgs import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp tree-of-life db + # Test tree ('I' means a node has an image): + # one -> two -> sixI + # -> seven + # -> eight + # -> threeI + # -> [nine + ten] -> nineI + # -> ten + # -> fiveI -> [twelve + thirteen] -> twelveI + # -> thirteenI + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes VALUES (?, ?, ?)', + { + ('one', 'ott1', 8), + ('two', 'ott2', 3), + ('three', 'ott3', 1), + ('[nine + ten]', 'ott4', 2), + ('five', 'ott5', 2), + ('six', 'ott6', 1), + ('seven', 'ott7', 1), + ('eight', 'ott8', 1), + ('nine', 'ott9', 1), + ('ten', 'ott10', 1), + ('[twelve + thirteen]', 'ott11', 2), + ('twelve', 'ott12', 1), + ('thirteen', 'ott13', 1), 
+ } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))', + 'INSERT INTO edges VALUES (?, ?, ?)', + { + ('one', 'two', 1), + ('one', 'three', 1), + ('one', '[nine + ten]', 0), + ('one', 'five', 1), + ('two', 'six', 1), + ('two', 'seven', 1), + ('two', 'eight', 0), + ('[nine + ten]', 'nine', 0), + ('[nine + ten]', 'ten', 1), + ('five', '[twelve + thirteen]', 1), + ('[twelve + thirteen]', 'twelve', 1), + ('[twelve + thirteen]', 'thirteen', 0), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)', + 'INSERT INTO node_imgs VALUES (?, ?, ?)', + { + ('six', 1, 'eol'), + ('three', 10, 'enwiki'), + ('nine', 1, 'picked'), + ('five', 2, 'eol'), + ('twelve', 11, 'enwiki'), + ('thirteen', 12, 'enwiki'), + } + ) + # Run + genData(dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, otol_ids from linked_imgs'), + { + ('one', 'ott6'), + ('two', 'ott6'), + ('[nine + ten]', 'ott9,'), + ('[twelve + thirteen]', 'ott12,ott13'), + } + ) diff --git a/backend/tests/test_gen_mapping_data.py b/backend/tests/test_gen_mapping_data.py new file mode 100644 index 0000000..9aa99b7 --- /dev/null +++ b/backend/tests/test_gen_mapping_data.py @@ -0,0 +1,302 @@ +import unittest +import tempfile, os + +from tests.common import createTestFile, createTestGzip, createTestDbTable, readTestDbTable +from tol_data.gen_mapping_data import \ + genData, readTaxonomyFile, readEolIdsFile, readWikidataDb, readPickedMappings, getEnwikiPageIds + +class TestReadTaxonomyFile(unittest.TestCase): + def test_read(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp taxonomy file + taxonomyFile = os.path.join(tempDir, 'taxonomy.tsv') + SEP = '\t|\t' + createTestFile(taxonomyFile, ''.join([ + SEP.join(['uid', 'parent_uid', 'name', 'rank', 'sourceinfo', 'uniqueName', 'flags', '\n']), + SEP.join(['1', '2', 'one', 'species', 'ncbi:10', '', '', '\n']), 
+ SEP.join(['2', '3', 'two', 'genus', 'ncbi:20,gbif:1', 'bananas', '', '\n']), + SEP.join(['10', '20', 'ten', 'family', 'if:10,if:100', '', '', '\n']), + SEP.join(['11', '100', 'eleven', '', 'igloo:1,ncbi:?', '', '', '\n']) + ])) + # Run + nodeToSrcIds = {} + usedSrcIds = set() + readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds) + # Check + self.assertEqual(nodeToSrcIds, { + 1: {'ncbi': 10}, + 2: {'ncbi': 20, 'gbif': 1}, + 10: {'if': 10}, + }) + self.assertEqual(usedSrcIds, { + ('ncbi', 10), + ('ncbi', 20), + ('gbif', 1), + ('if', 10) + }) +class TestReadEolIdsFile(unittest.TestCase): + def test_read(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp EOL IDs file + eolIdsFile = os.path.join(tempDir, 'ids.csv.gz') + createTestGzip(eolIdsFile, ( + 'node_id,resource_pk,resource_id,page_id,preferred_canonical_for_page\n' + '0,10,676,1,rhubarb\n' # EOL ID 1 with ncbi ID 10 + '0,99,767,2,nothing\n' # EOL ID 2 with worms ID 99 + '0,234,459,100,goat\n' # EOL ID 100 with gbif ID 234 + '0,23,676,101,lemon\n' # EOL ID 101 with ncbi ID 23 + )) + # Create input maps + nodeToSrcIds = { + 10: {'ncbi': 10}, + 20: {'ncbi': 23, 'gbif': 234} + } + # Run + usedSrcIds = {('ncbi', 10), ('gbif', 234), ('ncbi', 23)} + nodeToEolId = {} + readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId) + # Check + self.assertEqual(nodeToEolId, { + 10: 1, + 20: 101, + }) +class TestReadWikidataDb(unittest.TestCase): + def test_read(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp wikidata db + wikidataDb = os.path.join(tempDir, 'taxon_srcs.db') + createTestDbTable( + wikidataDb, + 'CREATE TABLE src_id_to_title (src TEXT, id INT, title TEXT, PRIMARY KEY(src, id))', + 'INSERT INTO src_id_to_title VALUES (?, ?, ?)', + [ + ('ncbi', 1, 'one'), + ('ncbi', 11, 'two'), + ('gbif', 21, 'three'), + ('if', 31, 'three'), + ('ncbi', 2, 'four'), + ('gbif', 1, 'five'), + ('eol', 1, 'one'), + ('eol', 2, 'three'), + ('ncbi', 100, 'six'), + ] + ) + 
createTestDbTable( + wikidataDb, + 'CREATE TABLE title_iucn (title TEXT PRIMARY KEY, status TEXT)', + 'INSERT INTO title_iucn VALUES (?, ?)', + [ + ('one', 'least concern'), + ('three', 'vulnerable'), + ('six', 'extinct in the wild'), + ] + ) + # Create input maps + nodeToSrcIds = { + 10: {'ncbi': 1}, + 20: {'ncbi': 11, 'gbif': 21, 'if': 31}, + 30: {'ncbi': 2, 'gbif': 1}, + 40: {'ncbi': 99}, + } + usedSrcIds = { + ('ncbi', 1), ('ncbi', 2), ('gbif', 1), ('ncbi', 11), ('gbif', 21), ('if', 31), + ('eol', 10), ('ncbi', 99) + } + nodeToEolId = { + 20: 100, + } + # Run + nodeToWikiTitle = {} + titleToIucnStatus = {} + readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId) + # Check + self.assertEqual(nodeToWikiTitle, { + 10: 'one', + 20: 'three', + 30: 'four', + }) + self.assertEqual(titleToIucnStatus, { + 'one': 'least concern', + 'three': 'vulnerable', + }) + self.assertEqual(nodeToEolId, { + 10: 1, + 20: 100, + }) +class TestReadPickedMappings(unittest.TestCase): + def test_read(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp picked-mappings files + pickedMappings = {'eol': ['1.txt'], 'enwiki': ['2.txt', '3.txt']} + pickedMappingsContent = {'eol': [''], 'enwiki': ['', '']} + pickedMappingsContent['eol'][0] = ( + '10|100\n' + '20|202\n' + ) + pickedMappingsContent['enwiki'][0] = ( + '12|abc\n' + '23|def\n' + ) + pickedMappingsContent['enwiki'][1] = ( + '15|ghi\n' + '35|jkl\n' + ) + for src in pickedMappings: + for idx in range(len(pickedMappings[src])): + pickedMappings[src][idx] = os.path.join(tempDir, pickedMappings[src][idx]) + createTestFile(pickedMappings[src][idx], pickedMappingsContent[src][idx]) + # Create input maps + nodeToEolId = { + 1: 1, + 10: 66, + } + nodeToWikiTitle = { + 10: 'one', + 12: 'two', + 35: 'goanna', + } + # Run + readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle) + # Check + self.assertEqual(nodeToEolId, { + 1: 1, + 10: 100, + 20: 202, + }) + 
self.assertEqual(nodeToWikiTitle, { + 10: 'one', + 12: 'abc', + 23: 'def', + 15: 'ghi', + 35: 'jkl', + }) +class TestReadGetEnwikiPageIds(unittest.TestCase): + def test_read(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp dump index + dumpIndexDb = os.path.join(tempDir, 'dump_index.db') + createTestDbTable( + dumpIndexDb, + 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)', + 'INSERT INTO offsets VALUES (?, ?, ?, ?)', + [ + ('one', 1, 10, 100), + ('two', 22, 10, 100), + ('four', 3, 1000, 2000), + ] + ) + # Create input maps + nodeToWikiTitle = { + 10: 'one', + 20: 'two', + 30: 'three', + } + # Run + titleToPageId = {} + getEnwikiPageIds(dumpIndexDb, nodeToWikiTitle, titleToPageId) + # Check + self.assertEqual(titleToPageId, { + 'one': 1, + 'two': 22, + }) +class TestGenData(unittest.TestCase): + def test_mapping(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp taxonomy file + taxonomyFile = os.path.join(tempDir, 'taxonomy.tsv') + SEP = '\t|\t' + createTestFile(taxonomyFile, ''.join([ + SEP.join(['uid', 'parent_uid', 'name', 'rank', 'sourceinfo', 'uniqueName', 'flags', '\n']), + SEP.join(['1', '', '', '', 'ncbi:10', '', '', '\n']), + SEP.join(['2', '', '', '', 'ncbi:20,gbif:1', '', '', '\n']), + SEP.join(['3', '', '', '', 'ncbi:30,if:2', '', '', '\n']), + ])) + # Create temp EOL IDs file + eolIdsFile = os.path.join(tempDir, 'ids.csv.gz') + createTestGzip(eolIdsFile, ( + 'node_id,resource_pk,resource_id,page_id,preferred_canonical_for_page\n' + '0,10,676,1,\n' # EOL ID 1 with ncbi ID 10 + '0,30,676,2,\n' # EOL ID 2 with ncbi ID 30 + )) + # Create temp wikidata db + wikidataDb = os.path.join(tempDir, 'taxon_srcs.db') + createTestDbTable( + wikidataDb, + 'CREATE TABLE src_id_to_title (src TEXT, id INT, title TEXT, PRIMARY KEY(src, id))', + 'INSERT INTO src_id_to_title VALUES (?, ?, ?)', + [ + ('ncbi', 10, 'one'), + ('gbif', 1, 'two'), + ('eol', 100, 'two'), + ('if', 2, 'three'), + 
] + ) + createTestDbTable( + wikidataDb, + 'CREATE TABLE title_iucn (title TEXT PRIMARY KEY, status TEXT)', + 'INSERT INTO title_iucn VALUES (?, ?)', + [ + ('one', 'least concern'), + ('three', 'vulnerable'), + ] + ) + # Create temp picked-mappings files + pickedMappings = {'eol': [], 'enwiki': ['w_ids.txt']} + pickedMappingsContent = {'eol': [], 'enwiki': ['']} + pickedMappingsContent['enwiki'][0] = ( + '3|four\n' + ) + for src in pickedMappings: + for idx in range(len(pickedMappings[src])): + pickedMappings[src][idx] = os.path.join(tempDir, pickedMappings[src][idx]) + createTestFile(pickedMappings[src][idx], pickedMappingsContent[src][idx]) + # Create temp dump index + dumpIndexDb = os.path.join(tempDir, 'dump_index.db') + createTestDbTable( + dumpIndexDb, + 'CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)', + 'INSERT INTO offsets VALUES (?, ?, ?, ?)', + [ + ('one', 1000, 1, 2), + ('two', 2000, 1, 2), + ('three', 3000, 1, 2), + ('four', 4000, 1, 2), + ] + ) + # Create temp tree-of-life db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes VALUES (?, ?, ?)', + [ + ('first', 'ott1', 10), + ('second', 'ott2', 1), + ('third', 'ott3', 2), + ] + ) + # Run + genData(taxonomyFile, eolIdsFile, wikidataDb, pickedMappings, dumpIndexDb, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, id from eol_ids'), + { + ('first', 1), + ('second', 100), + ('third', 2), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, id from wiki_ids'), + { + ('first', 1000), + ('second', 2000), + ('third', 4000), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, iucn from node_iucn'), + { + ('first', 'least concern'), + } + ) diff --git a/backend/tests/test_gen_name_data.py b/backend/tests/test_gen_name_data.py new file mode 100644 index 0000000..85e81d8 --- /dev/null +++ 
b/backend/tests/test_gen_name_data.py @@ -0,0 +1,93 @@ +import unittest +import tempfile, os + +from tests.common import createTestFile, createTestDbTable, readTestDbTable +from tol_data.gen_name_data import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp eol names file + eolNamesFile = os.path.join(tempDir, 'vernacular_names.csv') + createTestFile(eolNamesFile, ( + 'page_id,,vernacular_string,language_code,,,is_preferred_by_eol\n' + '10,,cat,eng,,,preferred\n' + '10,,kitty,eng,,,\n' + '20,,apple,eng,,,preferred\n' + '20,,pomme,fr,,,preferred\n' + '20,,apples,eng,,,\n' + '30,,those things with wings,eng,,,\n' + )) + # Create temp enwiki db + enwikiDb = os.path.join(tempDir, 'desc_data.db') + createTestDbTable( + enwikiDb, + 'CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)', + 'INSERT INTO pages VALUES (?, ?)', + [ + (1, 'abc'), + (2, 'def'), + (3, 'ghi'), + ] + ) + createTestDbTable( + enwikiDb, + 'CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)', + 'INSERT INTO redirects VALUES (?, ?)', + [ + (3, 'abc'), + (4, 'def'), + ] + ) + # Create temp picked-names file + pickedNamesFile = os.path.join(tempDir, 'picked_names.txt') + createTestFile(pickedNamesFile, ( + 'three|xxx|1\n' + 'one|kitty|\n' + 'two|two|\n' + )) + # Create temp db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes VALUES (?, ?, ?)', + [ + ('one', 'ott1', 1), + ('two', 'ott2', 1), + ('three', 'ott3', 1), + ] + ) + createTestDbTable( + dbFile, + 'CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO eol_ids VALUES (?, ?)', + [ + ('one', 10), + ('two', 20), + ('three', 30), + ] + ) + createTestDbTable( + dbFile, + 'CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO wiki_ids VALUES (?, ?)', + [ + ('one', 1), + ('two', 3), + ('three', 2), + ] + ) 
+ # Run + genData(eolNamesFile, enwikiDb, pickedNamesFile, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, alt_name, pref_alt, src FROM names'), + { + ('one', 'cat', 1, 'eol'), + ('one', 'ghi', 0, 'enwiki'), + ('two', 'apple', 0, 'eol'), + ('two', 'apples', 0, 'eol'), + ('three', 'xxx', 1, 'picked'), + } + ) diff --git a/backend/tests/test_gen_otol_data.py b/backend/tests/test_gen_otol_data.py new file mode 100644 index 0000000..25e65e3 --- /dev/null +++ b/backend/tests/test_gen_otol_data.py @@ -0,0 +1,118 @@ +import unittest +import tempfile, os + +from tests.common import createTestFile, readTestDbTable +from tol_data.gen_otol_data import genData + +def runGenData(treeFileContents: str, annFileContents: str, pickedFileContents: str): + """ Sets up files to be read by genData(), runs it, reads the output database, and returns node+edge info """ + with tempfile.TemporaryDirectory() as tempDir: + # Create temp tree file + treeFile = os.path.join(tempDir, 'tree.tre') + createTestFile(treeFile, treeFileContents) + # Create temp annotations file + annFile = os.path.join(tempDir, 'ann.json') + createTestFile(annFile, annFileContents) + # Create temp picked names file + pickedFile = os.path.join(tempDir, 'pn.txt') + createTestFile(pickedFile, pickedFileContents) + # Run genData() + dbFile = os.path.join(tempDir, 'data.db') + genData(treeFile, annFile, pickedFile, dbFile) + # Read database + nodes = readTestDbTable(dbFile, 'SELECT name, id, tips FROM nodes') + edges = readTestDbTable(dbFile, 'SELECT parent, child, p_support FROM edges') + return nodes, edges + +class TestGenData(unittest.TestCase): + def setUp(self): + self.maxDiff = None # Remove output-diff size limit + def test_newick(self): + treeFileContents = """ + ( + 'land plants ott2', + ( + 'TRAVELLER''s tree ott100', + (domestic_banana_ott4, (lemon_ott6, orange_ott7)citrus_ott5)mrcaott4ott5 + ) mrcaott100ott4, + 'Highly Unu2u8| name!! 
ott999', + 'citrus ott230' + )cellular_organisms_ott1;""" + annFileContents = '{"nodes": {}}' + pickedFileContents = '' + nodes, edges = runGenData(treeFileContents, annFileContents, pickedFileContents) + self.assertEqual(nodes, { + ('land plants', 'ott2', 1), + ('traveller\'s tree', 'ott100', 1), + ('domestic banana', 'ott4', 1), + ('lemon', 'ott6', 1), + ('orange', 'ott7', 1), + ('citrus', 'ott5', 2), + ('[citrus + domestic banana]', 'mrcaott4ott5', 3), + ('[citrus + traveller\'s tree]', 'mrcaott100ott4', 4), + ('highly unu2u8| name!! ', 'ott999', 1), + ('citrus [2]', 'ott230', 1), + ('cellular organisms', 'ott1', 7), + }) + self.assertEqual(edges, { + ('cellular organisms', 'land plants', 0), + ('cellular organisms', '[citrus + traveller\'s tree]', 0), + ('cellular organisms', 'highly unu2u8| name!! ', 0), + ('cellular organisms', 'citrus [2]', 0), + ('[citrus + traveller\'s tree]', 'traveller\'s tree', 0), + ('[citrus + traveller\'s tree]', '[citrus + domestic banana]', 0), + ('[citrus + domestic banana]', 'domestic banana', 0), + ('[citrus + domestic banana]', 'citrus', 0), + ('citrus', 'lemon', 0), + ('citrus', 'orange', 0), + }) + def test_newick_invalid(self): + with self.assertRaises(Exception): + runGenData('(A,B,(C,D));', '{"nodes": {}}', '') + def test_annotations(self): + treeFileContents = '(two_ott2, three_ott3, four_ott4)one_ott1;' + annFileContents = """ + { + "date_completed": "xxx", + "nodes": { + "ott3": { + "supported_by": { + "tree1": "node1" + } + }, + "ott4": { + "supported_by": { + "tree1": "node2", + "tree2": "node100" + }, + "conflicts_with": { + "tree3": ["x", "y"] + } + } + } + }""" + nodes, edges = runGenData(treeFileContents, annFileContents, '') + self.assertEqual(nodes, { + ('one', 'ott1', 3), + ('two', 'ott2', 1), + ('three', 'ott3', 1), + ('four', 'ott4', 1), + }) + self.assertEqual(edges, { + ('one', 'two', 0), + ('one', 'three', 1), + ('one', 'four', 0), + }) + def test_picked_names_file(self): + treeFileContents = '(one_ott2, 
two_ott3)one_ott1;' + pickedFileContents = 'one|ott2' + nodes, edges = runGenData(treeFileContents, '{"nodes": {}}', pickedFileContents) + self.assertEqual(nodes, { + ('one [2]', 'ott1', 2), + ('one', 'ott2', 1), + ('two', 'ott3', 1), + }) + self.assertEqual(edges, { + ('one [2]', 'one', 0), + ('one [2]', 'two', 0), + }) diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py new file mode 100644 index 0000000..dd1cb22 --- /dev/null +++ b/backend/tests/test_gen_pop_data.py @@ -0,0 +1,42 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable, readTestDbTable +from tol_data.gen_pop_data import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp pageviews db + pageviewsDb = os.path.join(tempDir, 'pageview_data.db') + createTestDbTable( + pageviewsDb, + 'CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)', + 'INSERT INTO views VALUES (?, ?, ?)', + { + ('one', 1, 10), + ('two', 2, 20), + ('three', 3, 30), + } + ) + # Create temp tree-of-life db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO wiki_ids VALUES (?, ?)', + { + ('node1', 1), + ('node3', 3), + } + ) + # Run + genData(pageviewsDb, dbFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, pop from node_pop'), + { + ('node1', 10), + ('node3', 30) + } + ) diff --git a/backend/tests/test_gen_reduced_trees.py b/backend/tests/test_gen_reduced_trees.py new file mode 100644 index 0000000..2ae4dfd --- /dev/null +++ b/backend/tests/test_gen_reduced_trees.py @@ -0,0 +1,166 @@ +import unittest +import tempfile, os + +from tests.common import createTestFile, createTestDbTable, readTestDbTable +from tol_data.gen_reduced_trees import genData + +class TestGenData(unittest.TestCase): + def test_gen(self): + with tempfile.TemporaryDirectory() as tempDir: + 
# Create temp tree-of-life db + # Test tree (P/I/L/D means picked/image/linked_image/desc): + # one -> two -> threeI -> four + # -> fiveP + # -> [seven + eight] -> sevenD + # -> eightP + # -> nine -> tenI + # -> elevenL + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes VALUES (?, ?, ?)', + { + ('one', 'ott1', 6), + ('two', 'ott2', 2), + ('three', 'ott3', 1), + ('four', 'ott4', 1), + ('five', 'ott5', 1), + ('[seven + eight]', 'ott6', 2), + ('seven', 'ott7', 1), + ('eight', 'ott8', 1), + ('nine', 'ott9', 1), + ('ten', 'ott10', 1), + ('eleven', 'ott11', 1), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))', + 'INSERT INTO edges VALUES (?, ?, ?)', + { + ('one', 'two', 1), + ('two', 'three', 1), + ('three', 'four', 0), + ('two', 'five', 0), + ('one', '[seven + eight]', 1), + ('[seven + eight]', 'seven', 0), + ('[seven + eight]', 'eight', 1), + ('one', 'nine', 1), + ('nine', 'ten', 0), + ('one', 'eleven', 1), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))', + 'INSERT INTO names VALUES (?, ?, ?, ?)', + { + ('eight', 'VIII', 1, 'eol'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO wiki_ids VALUES (?, ?)', + { + ('seven', 10), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)', + 'INSERT INTO descs VALUES (?, ?, ?)', + { + (10, 'Seven prefers orange juice', 1), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)', + 'INSERT INTO node_imgs VALUES (?, ?, ?)', + { + ('three', 1, 'eol'), + ('ten', 10, 'enwiki'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, 
otol_ids TEXT)', + 'INSERT INTO linked_imgs VALUES (?, ?)', + { + ('eleven', 'ott3'), + } + ) + # Create temp picked-nodes file + pickedNodesFile = os.path.join(tempDir, 'picked_nodes.txt') + createTestFile(pickedNodesFile, ( + 'five\n' + 'VIII\n' + )) + # Run + genData(None, dbFile, pickedNodesFile) + # Check + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, id, tips from nodes_p'), + { + ('one', 'ott1', 3), + ('five', 'ott5', 1), + ('eight', 'ott8', 1), + ('eleven', 'ott11', 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT parent, child, p_support from edges_p'), + { + ('one', 'five', 0), + ('one', 'eight', 1), + ('one', 'eleven', 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, id, tips from nodes_i'), + { + ('one', 'ott1', 4), + ('two', 'ott2', 2), + ('three', 'ott3', 1), + ('five', 'ott5', 1), + ('eight', 'ott8', 1), + ('ten', 'ott10', 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT parent, child, p_support from edges_i'), + { + ('one', 'two', 1), + ('two', 'three', 1), + ('two', 'five', 0), + ('one', 'eight', 1), + ('one', 'ten', 0), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT name, id, tips from nodes_t'), + { + ('one', 'ott1', 5), + ('two', 'ott2', 2), + ('three', 'ott3', 1), + ('five', 'ott5', 1), + ('[seven + eight]', 'ott6', 2), + ('seven', 'ott7', 1), + ('eight', 'ott8', 1), + ('ten', 'ott10', 1), + } + ) + self.assertEqual( + readTestDbTable(dbFile, 'SELECT parent, child, p_support from edges_t'), + { + ('one', 'two', 1), + ('two', 'three', 1), + ('two', 'five', 0), + ('one', '[seven + eight]', 1), + ('[seven + eight]', 'seven', 0), + ('[seven + eight]', 'eight', 1), + ('one', 'ten', 0), + } + ) diff --git a/backend/tests/test_review_imgs_to_gen.py b/backend/tests/test_review_imgs_to_gen.py new file mode 100644 index 0000000..d88523b --- /dev/null +++ b/backend/tests/test_review_imgs_to_gen.py @@ -0,0 +1,84 @@ +import unittest +import tempfile, os, shutil + +from 
tests.common import readTestFile, createTestDbTable +from tol_data.review_imgs_to_gen import reviewImgs + +CLICK_IMG = os.path.join(os.path.dirname(__file__), 'green.png') +AVOID_IMG = os.path.join(os.path.dirname(__file__), 'red.png') + +class TestReviewImgs(unittest.TestCase): + def test_review(self): + with tempfile.TemporaryDirectory() as tempDir: + # Create temp eol imgs + eolImgDir = os.path.join(tempDir, 'eol_imgs') + os.mkdir(eolImgDir) + shutil.copy(CLICK_IMG, os.path.join(eolImgDir, '1 10.jpg')) + shutil.copy(AVOID_IMG, os.path.join(eolImgDir, '2 20.gif')) + shutil.copy(AVOID_IMG, os.path.join(eolImgDir, '4 40.jpg')) + # Create temp enwiki imgs + enwikiImgDir = os.path.join(tempDir, 'enwiki_imgs') + os.mkdir(enwikiImgDir) + shutil.copy(AVOID_IMG, os.path.join(enwikiImgDir, '1.jpg')) + shutil.copy(CLICK_IMG, os.path.join(enwikiImgDir, '3.png')) + shutil.copy(CLICK_IMG, os.path.join(enwikiImgDir, '4.png')) + # Create temp tree-of-life db + dbFile = os.path.join(tempDir, 'data.db') + createTestDbTable( + dbFile, + 'CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes VALUES (?, ?, ?)', + { + ('one', 'ott1', 1), + ('two', 'ott2', 10), + ('three', 'ott3', 2), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))', + 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, ?)', + { + ('two', 'II', 1, 'eol'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO eol_ids VALUES (?, ?)', + { + ('one', 1), + ('two', 2), + ('four', 4), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO wiki_ids VALUES (?, ?)', + { + ('one', 1), + ('three', 3), + ('four', 4), + } + ) + # Run + outFile = os.path.join(tempDir, 'imgList.txt') + reviewImgs(eolImgDir, enwikiImgDir, dbFile, outFile, 'all') + # Check + 
self.assertEqual(set(readTestFile(outFile).splitlines()), { + 'ott1 ' + os.path.join(eolImgDir, '1 10.jpg'), + 'ott2', + 'ott3 ' + os.path.join(enwikiImgDir, '3.png'), + }) + # Add extra data + createTestDbTable(dbFile, None, 'INSERT INTO nodes VALUES (?, ?, ?)',{('four', 'ott4', 2)}) + # Run + reviewImgs(eolImgDir, enwikiImgDir, dbFile, outFile, 'all') + # Check + self.assertEqual(set(readTestFile(outFile).splitlines()), { + 'ott1 ' + os.path.join(eolImgDir, '1 10.jpg'), + 'ott2', + 'ott3 ' + os.path.join(enwikiImgDir, '3.png'), + 'ott4 ' + os.path.join(enwikiImgDir, '4.png'), + }) diff --git a/backend/tests/test_tilo.py b/backend/tests/test_tilo.py new file mode 100644 index 0000000..cfc719a --- /dev/null +++ b/backend/tests/test_tilo.py @@ -0,0 +1,160 @@ +import unittest +import tempfile, os + +from tests.common import createTestDbTable +from tilo import handleReq, TolNode, SearchSuggResponse, SearchSugg, InfoResponse, NodeInfo, DescInfo, ImgInfo + +def initTestDb(dbFile: str) -> None: + # Test tree (I/D means image/desc): + # oneI -> twoD -> threeD + # -> fourI + # -> fiveI -> sixID -> seven + createTestDbTable( + dbFile, + 'CREATE TABLE nodes_t (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)', + 'INSERT INTO nodes_t VALUES (?, ?, ?)', + { + ('one', 'ott1', 3), + ('two', 'ott2', 2), + ('three', 'ott3', 1), + ('four', 'ott4', 1), + ('five', 'ott5', 1), + ('six', 'ott6', 1), + ('seven', 'ott7', 1), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE edges_t (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))', + 'INSERT INTO edges_t VALUES (?, ?, ?)', + { + ('one', 'two', 1), + ('two', 'three', 0), + ('two', 'four', 1), + ('one', 'five', 0), + ('five', 'six', 1), + ('six', 'seven', 1), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))', + 'INSERT INTO names VALUES (?, ?, ?, ?)', + { + ('one', 'turtle', 1, 'eol'), + ('two', 'II', 1, 'eol'), + ('five', 
'V', 0, 'enwiki'), + ('six', 'VI', 1, 'enwiki'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)', + 'INSERT INTO node_imgs VALUES (?, ?, ?)', + { + ('one', 1, 'eol'), + ('four', 10, 'enwiki'), + ('five', 10, 'enwiki'), + ('six', 1, 'picked'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)', + 'INSERT INTO linked_imgs VALUES (?, ?)', + { + ('two', 'ott4'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE images (' \ + 'id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))', + 'INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', + { + (1, 'eol', 'url1', 'license1', 'artist1', 'credit1'), + (10, 'enwiki', 'url2', 'license2', 'artist2', 'credit2'), + (1, 'picked', 'url3', 'license3', 'artist3', 'credit3'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)', + 'INSERT INTO node_iucn VALUES (?, ?)', + { + ('one', 'vulnerable'), + ('six', 'endangered'), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)', + 'INSERT INTO node_pop VALUES (?, ?)', + { + ('one', 10), + ('two', 20), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)', + 'INSERT INTO wiki_ids VALUES (?, ?)', + { + ('two', 200), + ('three', 300), + ('six', 600), + } + ) + createTestDbTable( + dbFile, + 'CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)', + 'INSERT INTO descs VALUES (?, ?, ?)', + { + (200, 'two is 2', 1), + (300, 'three is 3', 0), + (600, 'six is 6', 1), + } + ) + +class TestHandleReq(unittest.TestCase): + def setUp(self): + self.maxDiff = None + self.tempDir = tempfile.TemporaryDirectory() + self.dbFile = os.path.join(self.tempDir.name, 'data.db') + initTestDb(self.dbFile) + def tearDown(self): + self.tempDir.cleanup() + def test_node_req(self): + response = 
handleReq(self.dbFile, {'QUERY_STRING': 'name=two&type=node&tree=trimmed'}) + self.assertEqual(response, { + 'two': TolNode('ott2', ['three', 'four'], 'one', 2, True, 'II', 'ott4.jpg', None), + 'three': TolNode('ott3', [], 'two', 1, False, None, None, None), + 'four': TolNode('ott4', [], 'two', 1, True, None, 'ott4.jpg', None), + }) + def test_node_toroot_req(self): + response = handleReq(self.dbFile, {'QUERY_STRING': 'name=seven&type=node&toroot=1&excl=five&tree=trimmed'}) + self.assertEqual(response, { + 'five': TolNode('ott5', ['six'], 'one', 1, 0, None, 'ott5.jpg', None), + 'six': TolNode('ott6', ['seven'], 'five', 1, 1, 'VI', 'ott6.jpg', 'endangered'), + 'seven': TolNode('ott7', [], 'six', 1, 1, None, None, None), + }) + def test_sugg_req(self): + response = handleReq(self.dbFile, {'QUERY_STRING': 'name=t&type=sugg&tree=trimmed'}) + self.assertEqual(response, SearchSuggResponse( + [ + SearchSugg('turtle', 'one', 10), + SearchSugg('two', None, 20), + SearchSugg('three', None, 0), + ], + False + )) + def test_info_req(self): + response = handleReq(self.dbFile, {'QUERY_STRING': 'name=six&type=info&tree=trimmed'}) + self.assertEqual(response, InfoResponse( + NodeInfo( + TolNode('ott6', ['seven'], 'five', 1, True, 'VI', 'ott6.jpg', 'endangered'), + DescInfo('six is 6', 600, True), + ImgInfo(1, 'picked', 'url3', 'license3', 'artist3', 'credit3'), + ), + [] + )) diff --git a/backend/tests/wikidata/__init__.py b/backend/tests/wikidata/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tests/wikidata/__init__.py diff --git a/backend/tests/wikidata/test_gen_taxon_src_data.py b/backend/tests/wikidata/test_gen_taxon_src_data.py new file mode 100644 index 0000000..1f886b3 --- /dev/null +++ b/backend/tests/wikidata/test_gen_taxon_src_data.py @@ -0,0 +1,109 @@ +import unittest +import tempfile, os, json, bz2, pickle, indexed_bzip2 + +from tests.common import readTestDbTable +from tol_data.wikidata.gen_taxon_src_data import genData + +def 
runGenData(wikiItemArray: str, preGenOffsets: bool, nProcs: int): + """ Sets up wikidata file to be read by genData(), runs it, reads the output database, and returns src+iucn info. + If 'preGenOffsets' is True, generates a bz2 offsets file before running genData(). """ + with tempfile.TemporaryDirectory() as tempDir: + # Create temp wikidata file + wikidataFile = os.path.join(tempDir, 'dump.json.bz2') + with bz2.open(wikidataFile, mode='wb') as file: + file.write(b'[\n') + for i in range(len(wikiItemArray)): + file.write(json.dumps(wikiItemArray[i], separators=(',',':')).encode()) + if i < len(wikiItemArray) - 1: + file.write(b',') + file.write(b'\n') + file.write(b']\n') + # Create temp offsets file if requested + offsetsFile = os.path.join(tempDir, 'offsets.dat') + if preGenOffsets: + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'wb') as file2: + pickle.dump(file.block_offsets(), file2) + # Run genData() + dbFile = os.path.join(tempDir, 'data.db') + genData(wikidataFile, offsetsFile, dbFile, nProcs) + # Read db + srcRows = readTestDbTable(dbFile, 'SELECT src, id, title FROM src_id_to_title') + iucnRows = readTestDbTable(dbFile, 'SELECT title, status FROM title_iucn') + return srcRows, iucnRows + +class TestGenData(unittest.TestCase): + def setUp(self): + self.maxDiff = None # Remove output-diff size limit + self.testWikiItems = [ + { + 'id': 'Q1', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # instance-of 'taxon' + 'P830': [{'mainsnak': {'datavalue': {'value': 100}}}], # EOL ID 100 + 'P685': [{'mainsnak': {'datavalue': {'value': 200}}}], # NCBI ID 200 + 'P141': [{'mainsnak': {'datavalue': {'value': {'id': 'Q211005'}}}}], # IUCN 'least concern' + }, + 'sitelinks': {'enwiki': {'title': 'eucalyptus'}}, + }, + { + 'id': 'Q2', + 'claims': { + 'P685': [{'mainsnak': {'datavalue': {'value': 101}}}], # NCBI ID 101 + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q23038290'}}}}], # fossil taxon + }, + 
'sitelinks': {'enwiki': {'title': 'dolphin'}}, + }, + { + 'id': 'Q30', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q502895'}}}, # instance-of common name + 'qualifiers': {'P642': [{'datavalue': {'value': {'numeric-id': 100}}}]}}], # of Q100 + 'P685': [{'mainsnak': {'datavalue': {'value': 333}}}], # NCBI ID 333 + }, + 'sitelinks': {'enwiki': {'title': 'dog'}}, + }, + { + 'id': 'Q100', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # instance-of taxon + 'P5055': [{'mainsnak': {'datavalue': {'value': 9}}}], # IRMNG ID 9 + 'P141': [{'mainsnak': {'datavalue': {'value': {'id': 'Q11394'}}}}], # IUCN endangered + }, + }, + { + 'id': 'Q1', + 'claims': { + 'P31': [{'mainsnak': {'datavalue': {'value': {'id': 'Q16521'}}}}], # instance-of taxon + } + # No title + }, + {'id': 'Q932', 'claims': {}}, + ] + self.expectedSrcRows = { + ('eol', 100, 'eucalyptus'), + ('ncbi', 200, 'eucalyptus'), + ('ncbi', 101, 'dolphin'), + ('ncbi', 333, 'dog'), + ('irmng', 9, 'dog'), + } + self.expectedIucnRows = { + ('eucalyptus', 'least concern'), + ('dog', 'endangered'), + } + def test_wikiItems(self): + srcMap, iucnMap = runGenData(self.testWikiItems, False, 1) + self.assertEqual(srcMap, self.expectedSrcRows) + self.assertEqual(iucnMap, self.expectedIucnRows) + def test_empty_dump(self): + srcMap, iucnMap = runGenData([{}], False, 1) + self.assertEqual(srcMap, set()) + self.assertEqual(iucnMap, set()) + def test_multiprocessing(self): + srcMap, iucnMap = runGenData(self.testWikiItems, False, 4) + self.assertEqual(srcMap, self.expectedSrcRows) + self.assertEqual(iucnMap, self.expectedIucnRows) + def test_existing_offsets(self): + srcMap, iucnMap = runGenData(self.testWikiItems, True, 3) + self.assertEqual(srcMap, self.expectedSrcRows) + self.assertEqual(iucnMap, self.expectedIucnRows) diff --git a/backend/tilo.py b/backend/tilo.py index c1ecc34..dfefab1 100755 --- a/backend/tilo.py +++ b/backend/tilo.py @@ -28,7 +28,7 @@ if __name__ == 
'__main__': parser = argparse.ArgumentParser(description=HELP_INFO, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() -DB_FILE = 'tolData/data.db' +DB_FILE = 'tol_data/data.db' DEFAULT_SUGG_LIM = 5 MAX_SUGG_LIM = 50 ROOT_NAME = 'cellular organisms' @@ -45,7 +45,7 @@ class TolNode: pSupport=False, commonName: str | None = None, imgName: None | str | tuple[str, str] | tuple[None, str] | tuple[str, None] = None, - iucn: str = None): + iucn: str | None = None): self.otolId = otolId self.children = children self.parent = parent @@ -54,23 +54,52 @@ class TolNode: self.commonName = commonName self.imgName = imgName self.iucn = iucn + # Used in unit testing + def __eq__(self, other): + return isinstance(other, TolNode) and \ + (self.otolId, set(self.children), self.parent, self.tips, \ + self.pSupport, self.commonName, self.imgName, self.iucn) == \ + (other.otolId, set(other.children), other.parent, other.tips, \ + other.pSupport, other.commonName, other.imgName, other.iucn) + def __repr__(self): + return str(self.__dict__) class SearchSugg: """ Represents a search suggestion """ def __init__(self, name: str, canonicalName: str | None = None, pop=0): self.name = name self.canonicalName = canonicalName self.pop = pop if pop is not None else 0 + # Used in unit testing + def __eq__(self, other): + return isinstance(other, SearchSugg) and \ + (self.name, self.canonicalName, self.pop) == (other.name, other.canonicalName, other.pop) + def __repr__(self): + return str(self.__dict__) + def __hash__(self): + return (self.name, self.canonicalName, self.pop).__hash__() class SearchSuggResponse: """ Sent as responses to 'sugg' requests """ def __init__(self, searchSuggs: list[SearchSugg], hasMore: bool): self.suggs = searchSuggs self.hasMore = hasMore + # Used in unit testing + def __eq__(self, other): + return isinstance(other, SearchSuggResponse) and \ + (set(self.suggs), self.hasMore) == (set(other.suggs), other.hasMore) + def __repr__(self): + return 
str(self.__dict__) class DescInfo: """ Represents a node's associated description """ def __init__(self, text: str, wikiId: int, fromDbp: bool): self.text = text self.wikiId = wikiId self.fromDbp = fromDbp + # Used in unit testing + def __eq__(self, other): + return isinstance(other, DescInfo) and \ + (self.text, self.wikiId, self.fromDbp) == (other.text, other.wikiId, other.fromDbp) + def __repr__(self): + return str(self.__dict__) class ImgInfo: """ Represents a node's associated image """ def __init__(self, id: int, src: str, url: str, license: str, artist: str, credit: str): @@ -80,17 +109,36 @@ class ImgInfo: self.license = license self.artist = artist self.credit = credit + # Used in unit testing + def __eq__(self, other): + return isinstance(other, ImgInfo) and \ + (self.id, self.src, self.url, self.license, self.artist, self.credit) == \ + (other.id, other.src, other.url, other.license, other.artist, other.credit) + def __repr__(self): + return str(self.__dict__) class NodeInfo: """ Represents info about a node """ def __init__(self, tolNode: TolNode, descInfo: DescInfo | None, imgInfo: ImgInfo | None): self.tolNode = tolNode self.descInfo = descInfo self.imgInfo = imgInfo + # Used in unit testing + def __eq__(self, other): + return isinstance(other, NodeInfo) and \ + (self.tolNode, self.descInfo, self.imgInfo) == (other.tolNode, other.descInfo, other.imgInfo) + def __repr__(self): + return str(self.__dict__) class InfoResponse: """ Sent as responses to 'info' requests """ def __init__(self, nodeInfo: NodeInfo, subNodesInfo: tuple[()] | tuple[NodeInfo | None, NodeInfo | None]): self.nodeInfo = nodeInfo self.subNodesInfo = subNodesInfo + # Used in unit testing + def __eq__(self, other): + return isinstance(other, InfoResponse) and \ + (self.nodeInfo, self.subNodesInfo) == (other.nodeInfo, other.subNodesInfo) + def __repr__(self): + return str(self.__dict__) # For data lookup def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, 
TolNode]: @@ -123,8 +171,9 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, nameToNodes[childName].pSupport = pSupport == 1 # Get image names idsToNames = {nameToNodes[n].otolId: n for n in nameToNodes.keys()} - query = 'SELECT nodes.id from nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name' \ - ' WHERE nodes.id IN ({})'.format(','.join(['?'] * len(idsToNames))) + query = f'SELECT {nodesTable}.id from {nodesTable}' \ + f' INNER JOIN node_imgs ON {nodesTable}.name = node_imgs.name' \ + f' WHERE {nodesTable}.id IN ' '({})'.format(','.join(['?'] * len(idsToNames))) for (otolId,) in dbCur.execute(query, list(idsToNames.keys())): nameToNodes[idsToNames[otolId]].imgName = otolId + '.jpg' # Get 'linked' images for unresolved names @@ -143,11 +192,13 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, # Get preferred-name info query = f'SELECT name, alt_name FROM names WHERE pref_alt = 1 AND name IN ({queryParamStr})' for name, altName in dbCur.execute(query, names): - nameToNodes[name].commonName = altName + if name in nameToNodes: + nameToNodes[name].commonName = altName # Get IUCN status query = f'SELECT name, iucn FROM node_iucn WHERE name IN ({queryParamStr})' for name, iucn in dbCur.execute(query, names): - nameToNodes[name].iucn = iucn + if name in nameToNodes: + nameToNodes[name].iucn = iucn # return nameToNodes def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor) -> SearchSuggResponse: @@ -157,7 +208,7 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor nodesTable = f'nodes_{getTableSuffix(tree)}' nameQuery = f'SELECT {nodesTable}.name, node_pop.pop FROM {nodesTable}' \ f' LEFT JOIN node_pop ON {nodesTable}.name = node_pop.name' \ - f' WHERE node_pop.name LIKE ? AND node_pop.name NOT LIKE "[%"' \ + f' WHERE {nodesTable}.name LIKE ? 
AND {nodesTable}.name NOT LIKE "[%"' \ f' ORDER BY node_pop.pop DESC' altNameQuery = f'SELECT alt_name, names.name, pref_alt, node_pop.pop FROM' \ f' names INNER JOIN {nodesTable} ON names.name = {nodesTable}.name' \ @@ -204,6 +255,7 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor return SearchSuggResponse(suggList[:suggLimit], hasMore) def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | None: """ For a node name, returns a descriptive InfoResponse, or None """ + nodesTable = f'nodes_{getTableSuffix(tree)}' # Get node info nameToNodes = lookupNodes([name], tree, dbCur) tolNode = nameToNodes[name] if name in nameToNodes else None @@ -230,10 +282,10 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No idsToNames = {cast(str, nameToNodes[n].imgName)[:-4]: n for n in namesToLookup if nameToNodes[n].imgName is not None} idsToLookup = list(idsToNames.keys()) # Lookup using IDs avoids having to check linked_imgs - query = 'SELECT nodes.id, images.id, images.src, url, license, artist, credit FROM' \ - ' nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name' \ - ' INNER JOIN images ON node_imgs.img_id = images.id AND node_imgs.src = images.src' \ - ' WHERE nodes.id IN ({})'.format(','.join(['?'] * len(idsToLookup))) + query = f'SELECT {nodesTable}.id, images.id, images.src, url, license, artist, credit FROM' \ + f' {nodesTable} INNER JOIN node_imgs ON {nodesTable}.name = node_imgs.name' \ + f' INNER JOIN images ON node_imgs.img_id = images.id AND node_imgs.src = images.src' \ + f' WHERE {nodesTable}.id IN ' '({})'.format(','.join(['?'] * len(idsToLookup))) for id, imgId, imgSrc, url, license, artist, credit in dbCur.execute(query, idsToLookup): nameToImgInfo[idsToNames[id]] = ImgInfo(imgId, imgSrc, url, license, artist, credit) # Construct response @@ -251,10 +303,11 @@ def getTableSuffix(tree: str) -> str: """ converts a reduced-tree descriptor into a sql-table-suffix """ 
return 't' if tree == 'trimmed' else 'i' if tree == 'images' else 'p' -def handleReq( - dbCur: sqlite3.Cursor, - environ: dict[str, str]) -> None | dict[str, TolNode] | SearchSuggResponse | InfoResponse: +def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] | SearchSuggResponse | InfoResponse: """ Queries the database, and constructs a response object """ + # Open db + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() # Get query params queryStr = environ['QUERY_STRING'] if 'QUERY_STRING' in environ else '' queryDict = urllib.parse.parse_qs(queryStr) @@ -342,11 +395,8 @@ def handleReq( return None def application(environ: dict[str, str], start_response) -> Iterable[bytes]: """ Entry point for the WSGI script """ - # Open db - dbCon = sqlite3.connect(DB_FILE) - dbCur = dbCon.cursor() # Get response object - val = handleReq(dbCur, environ) + val = handleReq(DB_FILE, environ) # Construct response data = jsonpickle.encode(val, unpicklable=False).encode() headers = [('Content-type', 'application/json')] diff --git a/backend/tolData/dbpedia/genDescData.py b/backend/tolData/dbpedia/genDescData.py deleted file mode 100755 index 43ed815..0000000 --- a/backend/tolData/dbpedia/genDescData.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/python3 - -import re -import bz2, sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Adds DBpedia labels/types/abstracts/etc data into a database -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -labelsFile = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries -idsFile = 'page_lang=en_ids.ttl.bz2' -redirectsFile = 'redirects_lang=en_transitive.ttl.bz2' -disambigFile = 'disambiguations_lang=en.ttl.bz2' -typesFile = 'instance-types_lang=en_specific.ttl.bz2' -abstractsFile = 'short-abstracts_lang=en.ttl.bz2' -dbFile = 'descData.db' -# In testing, this script took a few hours to run, and generated about 10GB - -print('Creating database') -dbCon = 
sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print('Reading/storing label data') -dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)') -dbCur.execute('CREATE INDEX labels_idx ON labels(label)') -dbCur.execute('CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)') -labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') -lineNum = 0 -with bz2.open(labelsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - match = labelLineRegex.fullmatch(line) - if match is None: - raise Exception(f'ERROR: Line {lineNum} has unexpected format') - dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2))) - -print('Reading/storing wiki page ids') -dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)') -dbCur.execute('CREATE INDEX ids_idx ON ids(id)') -idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') -lineNum = 0 -with bz2.open(idsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - match = idLineRegex.fullmatch(line) - if match is None: - raise Exception(f'ERROR: Line {lineNum} has unexpected format') - try: - dbCur.execute('INSERT INTO ids VALUES (?, ?)', (match.group(1), int(match.group(2)))) - except sqlite3.IntegrityError as e: - # Accounts for certain lines that have the same IRI - print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}') - -print('Reading/storing redirection data') -dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)') -redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') -lineNum = 0 -with bz2.open(redirectsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - match = redirLineRegex.fullmatch(line) - if match is None: - raise Exception(f'ERROR: Line {lineNum} has unexpected format') - dbCur.execute('INSERT INTO 
redirects VALUES (?, ?)', (match.group(1), match.group(2))) - -print('Reading/storing diambiguation-page data') -dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)') -disambigLineRegex = redirLineRegex -lineNum = 0 -with bz2.open(disambigFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - match = disambigLineRegex.fullmatch(line) - if match is None: - raise Exception(f'ERROR: Line {lineNum} has unexpected format') - dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),)) - -print('Reading/storing instance-type data') -dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)') -dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)') -typeLineRegex = redirLineRegex -lineNum = 0 -with bz2.open(typesFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - match = typeLineRegex.fullmatch(line) - if match is None: - raise Exception(f'ERROR: Line {lineNum} has unexpected format') - dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2))) - -print('Reading/storing abstracts') -dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)') -descLineRegex = labelLineRegex -lineNum = 0 -with bz2.open(abstractsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - if line[0] == '#': - continue - match = descLineRegex.fullmatch(line) - if match is None: - raise Exception(f'ERROR: Line {lineNum} has unexpected format') - dbCur.execute('INSERT INTO abstracts VALUES (?, ?)', - (match.group(1), match.group(2).replace(r'\"', '"'))) - -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py deleted file mode 100755 index ba6317e..0000000 --- 
a/backend/tolData/enwiki/downloadImgLicenseInfo.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/python3 - -import re -import sqlite3, urllib.parse, html -import requests -import time, signal - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads image names from a database, and uses enwiki's online API to obtain -licensing information for them, adding the info to the database. - -SIGINT causes the program to finish an ongoing download and exit. -The program can be re-run to continue downloading, and looks -at already-processed names to decide what to skip. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imgDb = 'imgData.db' -apiUrl = 'https://en.wikipedia.org/w/api.php' -userAgent = 'terryt.dev (terry06890@gmail.com)' -batchSz = 50 # Max 50 -tagRegex = re.compile(r'<[^<]+>') -whitespaceRegex = re.compile(r'\s+') - -print('Opening database') -dbCon = sqlite3.connect(imgDb) -dbCur = dbCon.cursor() -dbCur2 = dbCon.cursor() -print('Checking for table') -if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: - dbCur.execute('CREATE TABLE imgs(' \ - 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') - -print('Reading image names') -imgNames: set[str] = set() -for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): - imgNames.add(imgName) -print(f'Found {len(imgNames)}') - -print('Checking for already-processed images') -oldSz = len(imgNames) -for (imgName,) in dbCur.execute('SELECT name FROM imgs'): - imgNames.discard(imgName) -print(f'Found {oldSz - len(imgNames)}') - -# Set SIGINT handler -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) - -print('Iterating through image names') -imgNameList = list(imgNames) -iterNum = 0 -for 
i in range(0, len(imgNameList), batchSz): - iterNum += 1 - if iterNum % 1 == 0: - print(f'At iteration {iterNum} (after {(iterNum - 1) * batchSz} images)') - if interrupted: - print(f'Exiting loop at iteration {iterNum}') - break - # Get batch - imgBatch = imgNameList[i:i+batchSz] - imgBatch = ['File:' + x for x in imgBatch] - # Make request - headers = { - 'user-agent': userAgent, - 'accept-encoding': 'gzip', - } - params = { - 'action': 'query', - 'format': 'json', - 'prop': 'imageinfo', - 'iiprop': 'extmetadata|url', - 'maxlag': '5', - 'titles': '|'.join(imgBatch), - 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions', - } - responseObj = None - try: - response = requests.get(apiUrl, params=params, headers=headers) - responseObj = response.json() - except Exception as e: - print(f'ERROR: Exception while downloading info: {e}') - print('\tImage batch: ' + '|'.join(imgBatch)) - continue - # Parse response-object - if 'query' not in responseObj or 'pages' not in responseObj['query']: - print('WARNING: Response object for doesn\'t have page data') - print('\tImage batch: ' + '|'.join(imgBatch)) - if 'error' in responseObj: - errorCode = responseObj['error']['code'] - print(f'\tError code: {errorCode}') - if errorCode == 'maxlag': - time.sleep(5) - continue - pages = responseObj['query']['pages'] - normalisedToInput: dict[str, str] = {} - if 'normalized' in responseObj['query']: - for entry in responseObj['query']['normalized']: - normalisedToInput[entry['to']] = entry['from'] - for _, page in pages.items(): - # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data - # LicenseShortName: short human-readable license name, apparently more reliable than 'License', - # Artist: author name (might contain complex html, multiple authors, etc) - # Credit: 'source' - # For image-map-like images, can be quite large/complex html, creditng each sub-image - # May be <a href='text1'>text2</a>, where the text2 might be 
non-indicative - # Restrictions: specifies non-copyright legal restrictions - title: str = page['title'] - if title in normalisedToInput: - title = normalisedToInput[title] - title = title[5:] # Remove 'File:' - if title not in imgNames: - print(f'WARNING: Got title "{title}" not in image-name list') - continue - if 'imageinfo' not in page: - print(f'WARNING: No imageinfo section for page "{title}"') - continue - metadata = page['imageinfo'][0]['extmetadata'] - url: str = page['imageinfo'][0]['url'] - license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None - artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None - credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None - restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None - # Remove markup - if artist is not None: - artist = tagRegex.sub(' ', artist) - artist = whitespaceRegex.sub(' ', artist) - artist = html.unescape(artist) - artist = urllib.parse.unquote(artist) - if credit is not None: - credit = tagRegex.sub(' ', credit) - credit = whitespaceRegex.sub(' ', credit) - credit = html.unescape(credit) - credit = urllib.parse.unquote(credit) - # Add to db - dbCur2.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', - (title, license, artist, credit, restrictions, url)) - -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py deleted file mode 100755 index def4714..0000000 --- a/backend/tolData/enwiki/downloadImgs.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os -import sqlite3 -import urllib.parse, requests -import time, signal - -import argparse -parser = argparse.ArgumentParser(description=""" -Downloads images from URLs in an image database, into an output directory, -with names of the form 'pageId1.ext1'. 
- -SIGINT causes the program to finish an ongoing download and exit. -The program can be re-run to continue downloading, and looks -in the output directory do decide what to skip. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imgDb = 'imgData.db' # About 130k image names -outDir = 'imgs' -licenseRegex = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) -# In testing, this downloaded about 100k images, over several days - -if not os.path.exists(outDir): - os.mkdir(outDir) -print('Checking for already-downloaded images') -fileList = os.listdir(outDir) -pageIdsDone: set[int] = set() -for filename in fileList: - basename, extension = os.path.splitext(filename) - pageIdsDone.add(int(basename)) -print(f'Found {len(pageIdsDone)}') - -# Set SIGINT handler -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) - -print('Opening database') -dbCon = sqlite3.connect(imgDb) -dbCur = dbCon.cursor() -print('Starting downloads') -iterNum = 0 -query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \ - ' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name' -for pageId, license, artist, credit, restrictions, url in dbCur.execute(query): - if pageId in pageIdsDone: - continue - if interrupted: - print('Exiting loop') - break - # Check for problematic attributes - if license is None or licenseRegex.fullmatch(license) is None: - continue - if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. 
)?File:', artist) is not None: - continue - if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None: - continue - if restrictions is not None and restrictions != '': - continue - # Download image - iterNum += 1 - print(f'Iteration {iterNum}: Downloading for page-id {pageId}') - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f'WARNING: No filename extension found in URL {url}') - sys.exit(1) - outFile = f'{outDir}/{pageId}{extension}' - headers = { - 'user-agent': 'terryt.dev (terry06890@gmail.com)', - 'accept-encoding': 'gzip', - } - try: - response = requests.get(url, headers=headers) - with open(outFile, 'wb') as file: - file.write(response.content) - time.sleep(1) - # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle self to 1 cache miss per sec' - # It's unclear how to properly check for cache misses, so this just aims for 1 per sec - except Exception as e: - print(f'Error while downloading to {outFile}: {e}') -print('Closing database') -dbCon.close() diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py deleted file mode 100755 index 1698f5c..0000000 --- a/backend/tolData/enwiki/genDescData.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads through the wiki dump, and attempts to parse short-descriptions, -and add them to a database -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages -enwikiDb = 'descData.db' -# In testing, this script took over 10 hours to run, and generated about 5GB - -descLineRegex = re.compile('^ *[A-Z\'"]') -embeddedHtmlRegex = 
re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') - # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') -def convertTemplateReplace(match): - if match.group(2) is None: - return f'{match.group(1)} {match.group(4)}' - else: - return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' -parensGroupRegex = re.compile(r' \([^()]*\)') -leftoverBraceRegex = re.compile(r'(?:{\||{{).*') - -def parseDesc(text: str) -> str | None: - # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, - # and then accumulate lines until a blank one. - # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, - lines: list[str] = [] - openBraceCount = 0 - openBracketCount = 0 - inComment = False - skip = False - for line in text.splitlines(): - line = line.strip() - if not lines: - if line: - if openBraceCount > 0 or line[0] == '{': - openBraceCount += line.count('{') - openBraceCount -= line.count('}') - skip = True - if openBracketCount > 0 or line[0] == '[': - openBracketCount += line.count('[') - openBracketCount -= line.count(']') - skip = True - if inComment or line.find('<!--') != -1: - if line.find('-->') != -1: - if inComment: - inComment = False - skip = True - else: - inComment = True - skip = True - if skip: - skip = False - continue - if line[-1] == ':': # Seems to help avoid disambiguation pages - return None - if descLineRegex.match(line) is not None: - lines.append(line) - else: - if not line: - return removeMarkup(' '.join(lines)) - lines.append(line) - if lines: - return removeMarkup(' '.join(lines)) - return None -def removeMarkup(content: str) -> str: - content = 
embeddedHtmlRegex.sub('', content) - content = convertTemplateRegex.sub(convertTemplateReplace, content) - content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGroupRegex.sub('', content) - content = leftoverBraceRegex.sub('', content) - return content -def convertTitle(title: str) -> str: - return html.unescape(title).replace('_', ' ') - -print('Creating database') -if os.path.exists(enwikiDb): - raise Exception(f'ERROR: Existing {enwikiDb}') -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)') -dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)') -dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') -dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') -dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') - -print('Iterating through dump file') -with bz2.open(dumpFile, mode='rt') as file: - dump = mwxml.Dump.from_file(file) - pageNum = 0 - for page in dump: - pageNum += 1 - if pageNum % 1e4 == 0: - print(f'At page {pageNum}') - if pageNum > 3e4: - break - # Parse page - if page.namespace == 0: - try: - dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) - except sqlite3.IntegrityError as e: - # Accounts for certain pages that have the same title - print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr) - continue - if page.redirect is not None: - dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect))) - else: - revision = next(page) - desc = parseDesc(revision.text) - if desc is not None: - dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) - -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py deleted file mode 100755 index 3bd129f..0000000 --- 
a/backend/tolData/enwiki/genDumpIndexDb.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import bz2 -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Adds data from the wiki dump index-file into a database -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -indexFile = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines -indexDb = 'dumpIndex.db' - -if os.path.exists(indexDb): - raise Exception(f'ERROR: Existing {indexDb}') -print('Creating database') -dbCon = sqlite3.connect(indexDb) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') - -print('Iterating through index file') -lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') -lastOffset = 0 -lineNum = 0 -entriesToAdd: list[tuple[str, str]] = [] -with bz2.open(indexFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # - match = lineRegex.fullmatch(line.rstrip()) - assert match is not None - offsetStr, pageId, title = match.group(1,2,3) - offset = int(offsetStr) - if offset > lastOffset: - for t, p in entriesToAdd: - try: - dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset)) - except sqlite3.IntegrityError as e: - # Accounts for certain entries in the file that have the same title - print(f'Failed on title "{t}": {e}', file=sys.stderr) - entriesToAdd = [] - lastOffset = offset - entriesToAdd.append((title, pageId)) -for title, pageId in entriesToAdd: - try: - dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) - except sqlite3.IntegrityError as e: - print(f'Failed on title "{t}": {e}', file=sys.stderr) - -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py deleted file mode 100755 
index 00140f6..0000000 --- a/backend/tolData/enwiki/genImgData.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/python3 - -import re -import bz2, html, urllib.parse -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -For some set of page IDs, looks up their content in the wiki dump, -and tries to parse infobox image names, storing them into a database. - -The program can be re-run with an updated set of page IDs, and -will skip already-processed page IDs. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -def getInputPageIds(): - pageIds: set[int] = set() - dbCon = sqlite3.connect('../data.db') - dbCur = dbCon.cursor() - for (pageId,) in dbCur.execute('SELECT id from wiki_ids'): - pageIds.add(pageId) - dbCon.close() - return pageIds -dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2' -indexDb = 'dumpIndex.db' -imgDb = 'imgData.db' # The database to create -idLineRegex = re.compile(r'<id>(.*)</id>') -imageLineRegex = re.compile(r'.*\| *image *= *([^|]*)') -bracketImageRegex = re.compile(r'\[\[(File:[^|]*).*]]') -imageNameRegex = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) -cssImgCropRegex = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) - -print('Getting input page-ids') -pageIds = getInputPageIds() -print(f'Found {len(pageIds)}') - -print('Opening databases') -indexDbCon = sqlite3.connect(indexDb) -indexDbCur = indexDbCon.cursor() -imgDbCon = sqlite3.connect(imgDb) -imgDbCur = imgDbCon.cursor() -print('Checking tables') -if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: - # Create tables if not present - imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL - imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') -else: - # Check for already-processed page IDs - numSkipped = 0 - for (pid,) in 
imgDbCur.execute('SELECT page_id FROM page_imgs'): - if pid in pageIds: - pageIds.remove(pid) - numSkipped += 1 - else: - print(f'WARNING: Found already-processed page ID {pid} which was not in input set') - print(f'Will skip {numSkipped} already-processed page IDs') - -print('Getting dump-file offsets') -offsetToPageids: dict[int, list[int]] = {} -offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets -iterNum = 0 -for pageId in pageIds: - iterNum += 1 - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') - # - query = 'SELECT offset, next_offset FROM offsets WHERE id = ?' - row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone() - if row is None: - print(f'WARNING: Page ID {pageId} not found') - continue - chunkOffset, endOffset = row - offsetToEnd[chunkOffset] = endOffset - if chunkOffset not in offsetToPageids: - offsetToPageids[chunkOffset] = [] - offsetToPageids[chunkOffset].append(pageId) -print(f'Found {len(offsetToEnd)} chunks to check') - -print('Iterating through chunks in dump file') -def getImageName(content: list[str]) -> str | None: - """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ - # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections - for line in content: - match = imageLineRegex.match(line) - if match is not None: - imageName = match.group(1).strip() - if imageName == '': - return None - imageName = html.unescape(imageName) - # Account for {{... 
- if imageName.startswith('{'): - match = cssImgCropRegex.match(imageName) - if match is None: - return None - imageName = match.group(1) - # Account for [[File:...|...]] - if imageName.startswith('['): - match = bracketImageRegex.match(imageName) - if match is None: - return None - imageName = match.group(1) - # Account for <!-- - if imageName.find('<!--') != -1: - return None - # Remove an initial 'File:' - if imageName.startswith('File:'): - imageName = imageName[5:] - # Remove an initial 'Image:' - if imageName.startswith('Image:'): - imageName = imageName[6:] - # Check for extension - match = imageNameRegex.match(imageName) - if match is not None: - imageName = match.group(0) - imageName = urllib.parse.unquote(imageName) - imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases) - imageName = imageName.replace('_', ' ') - return imageName - # Exclude lines like: | image = <imagemap> - return None - return None -with open(dumpFile, mode='rb') as file: - iterNum = 0 - for pageOffset, endOffset in offsetToEnd.items(): - iterNum += 1 - if iterNum % 100 == 0: - print(f'At iteration {iterNum}') - # - pageIds = offsetToPageids[pageOffset] - # Jump to chunk - file.seek(pageOffset) - compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) - data = bz2.BZ2Decompressor().decompress(compressedData).decode() - # Look in chunk for pages - lines = data.splitlines() - lineIdx = 0 - while lineIdx < len(lines): - # Look for <page> - if lines[lineIdx].lstrip() != '<page>': - lineIdx += 1 - continue - # Check page id - lineIdx += 3 - idLine = lines[lineIdx].lstrip() - match = idLineRegex.fullmatch(idLine) - if match is None or int(match.group(1)) not in pageIds: - lineIdx += 1 - continue - pageId = int(match.group(1)) - lineIdx += 1 - # Look for <text> in <page> - foundText = False - while lineIdx < len(lines): - if not lines[lineIdx].lstrip().startswith('<text '): - lineIdx += 1 - continue - foundText = True - # Get 
text content - content: list[str] = [] - line = lines[lineIdx] - content.append(line[line.find('>') + 1:]) - lineIdx += 1 - foundTextEnd = False - while lineIdx < len(lines): - line = lines[lineIdx] - if not line.endswith('</text>'): - content.append(line) - lineIdx += 1 - continue - foundTextEnd = True - content.append(line[:line.rfind('</text>')]) - # Look for image-filename - imageName = getImageName(content) - imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName)) - break - if not foundTextEnd: - print(f'WARNING: Did not find </text> for page id {pageId}') - break - if not foundText: - print(f'WARNING: Did not find <text> for page id {pageId}') - -print('Closing databases') -indexDbCon.close() -imgDbCon.commit() -imgDbCon.close() diff --git a/backend/tolData/enwiki/genPageviewData.py b/backend/tolData/enwiki/genPageviewData.py deleted file mode 100755 index 6a5d79c..0000000 --- a/backend/tolData/enwiki/genPageviewData.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, glob, math, re -from collections import defaultdict -import bz2, sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads through wikimedia files containing pageview counts, -computes average counts, and adds them to a database -""", formatter_class=argparse.RawDescriptionHelpFormatter) -args = parser.parse_args() - -pageviewFiles = glob.glob('./pageviews/pageviews-*-user.bz2') -dbFile = 'pageviewData.db' -dumpIndexDb = 'dumpIndex.db' - -# Took about 15min per file (each about 180e6 lines) - -if os.path.exists(dbFile): - print('ERROR: Database already exists') - sys.exit(1) - -# Each pageview file has lines that seem to hold these space-separated fields: - # wiki code (eg: en.wikipedia), article title, page ID (may be: null), - # platform (eg: mobile-web), monthly view count, - # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2) -namespaceRegex = re.compile(r'[a-zA-Z]+:') -titleToViews: dict[str, int] = 
defaultdict(int) -linePrefix = b'en.wikipedia ' -for filename in pageviewFiles: - print(f'Reading from {filename}') - with bz2.open(filename, 'rb') as file: - for lineNum, line in enumerate(file, 1): - if lineNum % 1e6 == 0: - print(f'At line {lineNum}') - if not line.startswith(linePrefix): - continue - # Get second and second-last fields - line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields - title = line[:line.find(b' ')].decode('utf-8') - viewCount = int(line[line.rfind(b' ')+1:]) - if namespaceRegex.match(title) is not None: - continue - # Update map - titleToViews[title] += viewCount -print(f'Found {len(titleToViews)} titles') - -print('Writing to db') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -idbCon = sqlite3.connect(dumpIndexDb) -idbCur = idbCon.cursor() -dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)') -for title, views in titleToViews.items(): - row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() - if row is not None: - wikiId = int(row[0]) - dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, math.floor(views / len(pageviewFiles)))) -dbCon.commit() -dbCon.close() -idbCon.close() diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py deleted file mode 100755 index 427aa7a..0000000 --- a/backend/tolData/enwiki/lookupPage.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/python3 - -import sys -import bz2 -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Looks up a page with title title1 in the wiki dump, using the dump-index -db, and prints the corresponding <page>. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.add_argument("title", help="The title to look up") -args = parser.parse_args() - -dumpFile = 'enwiki-20220501-pages-articles-multistream.xml.bz2' -indexDb = 'dumpIndex.db' -pageTitle = args.title.replace('_', ' ') - -print('Looking up offset in index db') -dbCon = sqlite3.connect(indexDb) -dbCur = dbCon.cursor() -query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?' -row = dbCur.execute(query, (pageTitle,)).fetchone() -if row is None: - print('Title not found') - sys.exit(0) -_, pageOffset, endOffset = row -dbCon.close() -print(f'Found chunk at offset {pageOffset}') - -print('Reading from wiki dump') -content: list[str] = [] -with open(dumpFile, mode='rb') as file: - # Get uncompressed chunk - file.seek(pageOffset) - compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) - data = bz2.BZ2Decompressor().decompress(compressedData).decode() - # Look in chunk for page - lines = data.splitlines() - lineIdx = 0 - found = False - pageNum = 0 - while not found: - line = lines[lineIdx] - if line.lstrip() == '<page>': - pageNum += 1 - if pageNum > 100: - print('ERROR: Did not find title after 100 pages') - break - lineIdx += 1 - titleLine = lines[lineIdx] - if titleLine.lstrip() == '<title>' + pageTitle + '</title>': - found = True - print(f'Found title in chunk as page {pageNum}') - content.append(line) - content.append(titleLine) - while True: - lineIdx += 1 - line = lines[lineIdx] - content.append(line) - if line.lstrip() == '</page>': - break - lineIdx += 1 - -print('Content: ') -print('\n'.join(content)) diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py deleted file mode 100755 index 5213aaf..0000000 --- a/backend/tolData/eol/downloadImgs.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/python3 - -import sys, re, os, random -import sqlite3 -import urllib.parse, requests -import time -from threading import Thread -import signal 
- -import argparse -parser = argparse.ArgumentParser(description=""" -For some set of EOL IDs, downloads associated images from URLs in -an image-list database. Uses multiple downloading threads. - -May obtain multiple images per ID. The images will get names -with the form 'eolId1 contentId1.ext1'. - -SIGINT causes the program to finish ongoing downloads and exit. -The program can be re-run to continue downloading. It looks for -already-downloaded files, and continues after the one with -highest EOL ID. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imagesListDb = 'imagesList.db' -def getInputEolIds() -> set[int]: - eolIds: set[int] = set() - dbCon = sqlite3.connect('../data.db') - dbCur = dbCon.cursor() - for (id,) in dbCur.execute('SELECT id FROM eol_ids'): - eolIds.add(id) - dbCon.close() - return eolIds -outDir = 'imgsForReview/' -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) -POST_DL_DELAY_MAX = 3 -LICENSE_REGEX = r'cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain' - -print('Getting input EOL IDs') -eolIds = getInputEolIds() -print('Getting EOL IDs to download for') -# Get IDs from images-list db -imgDbCon = sqlite3.connect(imagesListDb) -imgCur = imgDbCon.cursor() -imgListIds: set[int] = set() -for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'): - imgListIds.add(pageId) -# Get set intersection, and sort into list -eolIds = eolIds.intersection(imgListIds) -eolIdList = sorted(eolIds) -print(f'Result: {len(eolIdList)} EOL IDs') - -print('Checking output directory') -if not os.path.exists(outDir): - os.mkdir(outDir) -print('Finding next ID to download for') -nextIdx = 0 -fileList = os.listdir(outDir) -ids = [int(filename.split(' ')[0]) for filename in fileList] -if ids: - ids.sort() - nextIdx = eolIdList.index(ids[-1]) + 1 -if nextIdx == len(eolIdList): - print('No IDs left. 
Exiting...') - sys.exit(0) - -print('Starting download threads') -numThreads = 0 -threadException: Exception | None = None # Used for ending main thread after a non-main thread exception -# Handle SIGINT signals -interrupted = False -oldHandler = None -def onSigint(sig, frame): - global interrupted - interrupted = True - signal.signal(signal.SIGINT, oldHandler) -oldHandler = signal.signal(signal.SIGINT, onSigint) -# Function for threads to execute -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr) - threadException = e - numThreads -= 1 -# Manage downloading -for idx in range(nextIdx, len(eolIdList)): - eolId = eolIdList[idx] - # Get image urls - ownerSet: set[str] = set() # Used to get images from different owners, for variety - exitLoop = False - query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?' 
- for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)): - if url.startswith('data/'): - url = 'https://content.eol.org/' + url - urlParts = urllib.parse.urlparse(url) - extension = os.path.splitext(urlParts.path)[1] - if len(extension) <= 1: - print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr) - continue - # Check image-quantity limit - if len(ownerSet) == MAX_IMGS_PER_ID: - break - # Check for skip conditions - if re.fullmatch(LICENSE_REGEX, license) is None: - continue - if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner in ownerSet: - continue - ownerSet.add(copyrightOwner) - # Determine output filename - outPath = f'{outDir}{eolId} {contentId}{extension}' - if os.path.exists(outPath): - print(f'WARNING: {outPath} already exists. Skipping download.') - continue - # Check thread limit - while numThreads == MAX_THREADS: - time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException is not None: - print('Waiting for existing threads to end') - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - # Perform download - print(f'Downloading image to {outPath}') - numThreads += 1 - thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) - thread.start() - if exitLoop: - break -# Close images-list db -print('Finished downloading') -imgDbCon.close() diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py deleted file mode 100755 index 808292d..0000000 --- a/backend/tolData/eol/genImagesListDb.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/python3 - -import os, re -import csv -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Generates a sqlite db from a directory of CSV files holding EOL image data -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imagesListDir = 
'imagesList/' -dbFile = 'imagesList.db' - -print('Creating database') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE images' \ - ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT)') -dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)') -print('Reading CSV files') -csvFilenames = os.listdir(imagesListDir) -for filename in csvFilenames: - print(f'Processing {imagesListDir}{filename}') - with open(imagesListDir + filename, newline='') as file: - for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file): - if re.match(r'^[a-zA-Z]', contentId): # Skip header line - continue - dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', - (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner)) -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/genDescData.py b/backend/tolData/genDescData.py deleted file mode 100755 index bb1cbc8..0000000 --- a/backend/tolData/genDescData.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/python3 - -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Maps nodes to short descriptions, using data from DBpedia and -Wikipedia, and stores results in the database. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -args = parser.parse_args() - -dbpediaDb = 'dbpedia/descData.db' -enwikiDb = 'enwiki/descData.db' -dbFile = 'data.db' - -print('Creating table') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)') - -print('Getting node mappings') -nodeToWikiId: dict[str, int] = {} -for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'): - nodeToWikiId[name] = wikiId - -print('Reading data from DBpedia') -dbpCon = sqlite3.connect(dbpediaDb) -dbpCur = dbpCon.cursor() -print('Getting node IRIs') -nodeToIri: dict[str, str] = {} -iterNum = 0 -for name, wikiId in nodeToWikiId.items(): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f'At iteration {iterNum}') - # - row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone() - if row is not None: - nodeToIri[name] = row[0] -print('Resolving redirects') -iterNum = 0 -for name, iri in nodeToIri.items(): - iterNum += 1 - if iterNum % 1e5 == 0: - print(f'At iteration {iterNum}') - # - row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone() - if row is not None: - nodeToIri[name] = row[0] -print('Adding descriptions') -iterNum = 0 -for name, iri in nodeToIri.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') - # - row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone() - if row is not None: - dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1)) - del nodeToWikiId[name] -dbpCon.close() - -print('Reading data from Wikipedia') -enwikiCon = sqlite3.connect(enwikiDb) -enwikiCur = enwikiCon.cursor() -print('Resolving redirects') -iterNum = 0 -for name, wikiId in nodeToWikiId.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') - # - query = 'SELECT pages.id FROM redirects INNER JOIN pages ON 
redirects.target = pages.title WHERE redirects.id = ?' - row = enwikiCur.execute(query, (wikiId,)).fetchone() - if row is not None: - nodeToWikiId[name] = row[0] -print('Adding descriptions') -iterNum = 0 -for name, wikiId in nodeToWikiId.items(): - iterNum += 1 - if iterNum % 1e3 == 0: - print(f'At iteration {iterNum}') - # - row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiId,)).fetchone() - if row is not None: - dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0)) - -print('Closing databases') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/genImgs.py b/backend/tolData/genImgs.py deleted file mode 100755 index 6f72b49..0000000 --- a/backend/tolData/genImgs.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, subprocess -import sqlite3, urllib.parse -import signal - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads node IDs and image paths from a file, and possibly from a directory, -and generates cropped/resized versions of those images into a directory, -with names of the form 'nodeId1.jpg'. Also adds image metadata to the -database. - -SIGINT can be used to stop, and the program can be re-run to continue -processing. It uses already-existing database entries to decide what -to skip. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imgListFile = 'imgList.txt' -outDir = 'img/' -eolImgDb = 'eol/imagesList.db' -enwikiImgDb = 'enwiki/imgData.db' -pickedImgsDir = 'pickedImgs/' -pickedImgsFilename = 'imgData.txt' -dbFile = 'data.db' -IMG_OUT_SZ = 200 -genImgFiles = True # Usable for debugging - -class PickedImg: - """ Represents a picked-image from pickedImgsDir """ - def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str): - self.nodeName = nodeName - self.id = id - self.filename = filename - self.url = url - self.license = license - self.artist = artist - self.credit = credit - -if not os.path.exists(outDir): - os.mkdir(outDir) - -print('Opening databases') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -eolCon = sqlite3.connect(eolImgDb) -eolCur = eolCon.cursor() -enwikiCon = sqlite3.connect(enwikiImgDb) -enwikiCur = enwikiCon.cursor() -print('Checking for picked-images') -nodeToPickedImg: dict[str, PickedImg] = {} -if os.path.exists(pickedImgsDir + pickedImgsFilename): - lineNum = 0 - with open(pickedImgsDir + pickedImgsFilename) as file: - for line in file: - lineNum += 1 - filename, url, license, artist, credit = line.rstrip().split('|') - nodeName = os.path.splitext(filename)[0] # Remove extension - (otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone() - nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit) - -print('Checking for image tables') -nodesDone: set[str] = set() -imgsDone: set[tuple[int, str]] = set() -if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="node_imgs"').fetchone() is None: - # Add image tables if not present - dbCur.execute('CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)') - dbCur.execute('CREATE TABLE images' \ - ' (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, 
src))') -else: - # Get existing image-associated nodes - for (otolId,) in dbCur.execute('SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name'): - nodesDone.add(otolId) - # Get existing node-associated images - for imgId, imgSrc in dbCur.execute('SELECT id, src from images'): - imgsDone.add((imgId, imgSrc)) - print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip') - -# Set SIGINT handler -interrupted = False -def onSigint(sig, frame): - global interrupted - interrupted = True -signal.signal(signal.SIGINT, onSigint) - -print('Iterating through input images') -def quit(): - print('Closing databases') - dbCon.commit() - dbCon.close() - eolCon.close() - enwikiCon.close() - sys.exit(0) -def convertImage(imgPath, outPath): - print(f'Converting {imgPath} to {outPath}') - if os.path.exists(outPath): - print('ERROR: Output image already exists') - return False - try: - completedProcess = subprocess.run( - ['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath], - stdout=subprocess.DEVNULL - ) - except Exception as e: - print(f'ERROR: Exception while attempting to run smartcrop: {e}') - return False - if completedProcess.returncode != 0: - print(f'ERROR: smartcrop had exit status {completedProcess.returncode}') - return False - return True -print('Processing picked-images') -for otolId, imgData in nodeToPickedImg.items(): - # Check for SIGINT event - if interrupted: - print('Exiting') - quit() - # Skip if already processed - if otolId in nodesDone: - continue - # Convert image - if genImgFiles: - success = convertImage(pickedImgsDir + imgData.filename, outDir + otolId + '.jpg') - if not success: - quit() - else: - print(f'Processing {imgData.nodeName}: {otolId}.jpg') - # Add entry to db - if (imgData.id, 'picked') not in imgsDone: - dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', - (imgData.id, 'picked', imgData.url, imgData.license, imgData.artist, imgData.credit)) - 
imgsDone.add((imgData.id, 'picked')) - dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked')) - nodesDone.add(otolId) -print('Processing images from eol and enwiki') -iterNum = 0 -with open(imgListFile) as file: - for line in file: - iterNum += 1 - # Check for SIGINT event - if interrupted: - print('Exiting') - break - # Skip lines without an image path - if line.find(' ') == -1: - continue - # Get filenames - otolId, _, imgPath = line.rstrip().partition(' ') - # Skip if already processed - if otolId in nodesDone: - continue - # Convert image - if genImgFiles: - success = convertImage(imgPath, outDir + otolId + '.jpg') - if not success: - break - else: - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') - # Add entry to db - (nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone() - fromEol = imgPath.startswith('eol/') - imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component - imgName = os.path.splitext(imgName)[0] # Remove extension - if fromEol: - eolIdStr, _, contentIdStr = imgName.partition(' ') - eolId, contentId = (int(eolIdStr), int(contentIdStr)) - if (eolId, 'eol') not in imgsDone: - query = 'SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?' - row = eolCur.execute(query, (contentId,)).fetchone() - if row is None: - print(f'ERROR: No image record for EOL ID {eolId}, content ID {contentId}') - break - url, license, owner = row - dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', - (eolId, 'eol', url, license, owner, '')) - imgsDone.add((eolId, 'eol')) - dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, eolId, 'eol')) - else: - enwikiId = int(imgName) - if (enwikiId, 'enwiki') not in imgsDone: - query = 'SELECT name, license, artist, credit FROM' \ - ' page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name' \ - ' WHERE page_imgs.page_id = ?' 
- row = enwikiCur.execute(query, (enwikiId,)).fetchone() - if row is None: - print(f'ERROR: No image record for enwiki ID {enwikiId}') - break - name, license, artist, credit = row - url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name) - dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)', - (enwikiId, 'enwiki', url, license, artist, credit)) - imgsDone.add((enwikiId, 'enwiki')) - dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki')) -# Close dbs -quit() diff --git a/backend/tolData/genLinkedImgs.py b/backend/tolData/genLinkedImgs.py deleted file mode 100755 index 6d2feff..0000000 --- a/backend/tolData/genLinkedImgs.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/python3 - -import re -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Look for nodes without images in the database, and tries to -associate them with images from their children -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -dbFile = 'data.db' -compoundNameRegex = re.compile(r'\[(.+) \+ (.+)]') -upPropagateCompoundImgs = False - -print('Opening databases') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)') - -print('Getting nodes with images') -resolvedNodes: dict[str, str] = {} # Will map node names to otol IDs with a usable image -query = 'SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name' -for name, otolId in dbCur.execute(query): - resolvedNodes[name] = otolId -print(f'Found {len(resolvedNodes)}') - -print('Iterating through nodes, trying to resolve images for ancestors') -nodesToResolve: dict[str, list[dict[str, str | int | None]]] = {} - # Maps a node name to a list of objects that represent possible child images -processedNodes: dict[str, str] = {} # Map a node name to an OTOL ID, representing a child node whose image is to be used 
-parentToChosenTips: dict[str, int] = {} # Used to prefer images from children with more tips -iterNum = 0 -while resolvedNodes: - iterNum += 1 - if iterNum % 1e3 == 0: - print(f'At iteration {iterNum}') - # Get next node - nodeName, otolId = resolvedNodes.popitem() - processedNodes[nodeName] = otolId - # Traverse upwards, resolving ancestors if able - while True: - # Get parent - row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone() - if row is None or row[0] in processedNodes or row[0] in resolvedNodes: - break - parent: str = row[0] - # Get parent data - if parent not in nodesToResolve: - childNames: list[str] = [ - row[0] for row in dbCur.execute('SELECT child FROM edges WHERE parent = ?', (parent,))] - query = 'SELECT name, tips FROM nodes WHERE name IN ({})'.format(','.join(['?'] * len(childNames))) - childObjs = [{'name': row[0], 'tips': row[1], 'otolId': None} for row in dbCur.execute(query, childNames)] - childObjs.sort(key=lambda x: x['tips'], reverse=True) - nodesToResolve[parent] = childObjs - else: - childObjs = nodesToResolve[parent] - # Check if highest-tips child - if childObjs[0]['name'] == nodeName: - # Resolve parent, and continue from it - dbCur.execute('INSERT INTO linked_imgs VALUES (?, ?)', (parent, otolId)) - del nodesToResolve[parent] - processedNodes[parent] = otolId - parentToChosenTips[parent] = childObjs[0]['tips'] - nodeName = parent - continue - else: - # Mark child as a potential choice - childObj = next(c for c in childObjs if c['name'] == nodeName) - childObj['otolId'] = otolId - break - # When out of resolved nodes, resolve nodesToResolve nodes, possibly adding more nodes to resolve - if not resolvedNodes: - for name, childObjs in nodesToResolve.items(): - childObj = next(c for c in childObjs if c['otolId'] is not None) - resolvedNodes[name] = childObj['otolId'] - parentToChosenTips[name] = childObj['tips'] - dbCur.execute('INSERT INTO linked_imgs VALUES (?, ?)', (name, childObj['otolId'])) - 
nodesToResolve.clear() - -print('Replacing linked-images for compound nodes') -iterNum = 0 -for nodeName in processedNodes.keys(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') - # - match = compoundNameRegex.fullmatch(nodeName) - if match is not None: - # Replace associated image with subname images - subName1, subName2 = match.group(1,2) - otolIdPair = ['', ''] - if subName1 in processedNodes: - otolIdPair[0] = processedNodes[subName1] - if subName2 in processedNodes: - otolIdPair[1] = processedNodes[subName2] - # Use no image if both subimages not found - if otolIdPair[0] == '' and otolIdPair[1] == '': - dbCur.execute('DELETE FROM linked_imgs WHERE name = ?', (nodeName,)) - continue - # Add to db - dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', - (otolIdPair[0] + ',' + otolIdPair[1], nodeName)) - # Possibly repeat operation upon parent/ancestors - if upPropagateCompoundImgs: - while True: - # Get parent - row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone() - if row is not None: - parent = row[0] - # Check num tips - (numTips,) = dbCur.execute('SELECT tips from nodes WHERE name = ?', (nodeName,)).fetchone() - if parent in parentToChosenTips and parentToChosenTips[parent] <= numTips: - # Replace associated image - dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', - (otolIdPair[0] + ',' + otolIdPair[1], parent)) - nodeName = parent - continue - break - -print('Closing databases') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/genMappingData.py b/backend/tolData/genMappingData.py deleted file mode 100755 index 5339c4e..0000000 --- a/backend/tolData/genMappingData.py +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/python3 - -import os -from collections import defaultdict -import gzip, csv, sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Maps otol IDs to EOL and enwiki titles, using IDs from various -other sources (like NCBI). 
- -Reads otol taxonomy data to get source IDs for otol IDs, -then looks up those IDs in an EOL provider_ids file, -and in a wikidata dump, and stores results in the database. - -Based on code from https://github.com/OneZoom/OZtree, located in -OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022). -""", formatter_class=argparse.RawDescriptionHelpFormatter) -args = parser.parse_args() - -taxonomyFile = 'otol/taxonomy.tsv' -eolIdsFile = 'eol/provider_ids.csv.gz' -wikidataDb = 'wikidata/taxonSrcs.db' -enwikiDumpIndexDb = 'enwiki/dumpIndex.db' -pickedMappings = { - 'eol': ['pickedEolIds.txt'], - 'enwiki': ['pickedWikiIds.txt', 'pickedWikiIdsRough.txt'] -} -dbFile = 'data.db' - -print('Reading taxonomy file') -# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence): - # uid (otol-id, eg: 93302), parent_uid, name, rank, - # sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags -OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority -nodeToSrcIds: dict[int, dict[str, int]] = defaultdict(dict) # Maps otol ID to {src1: id1, src2: id2, ...} -usedSrcIds: set[tuple[str, int]] = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used) -with open(taxonomyFile) as file: # Had about 4.5e6 lines - lineNum = 0 - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') - # Skip header line - if lineNum == 1: - continue - # Parse line - fields = line.split('\t|\t') - try: - otolId = int(fields[0]) - except ValueError: - print(f'Skipping non-integral ID {fields[0]} on line {lineNum}') - continue - srcsField = fields[4] - # Add source IDs - for srcPair in srcsField.split(','): - src, srcIdStr = srcPair.split(':', 1) - if srcIdStr.isdecimal() and src in OTOL_SRCS and src not in nodeToSrcIds[otolId]: - srcId = int(srcIdStr) - nodeToSrcIds[otolId][src] = srcId - usedSrcIds.add((src, srcId)) 
-print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6 - -print('Reading EOL provider_ids file') -# The CSV file has a header line, then lines that hold these fields: - # node_id, resource_pk (ID from external source), resource_id (int denoting external-source), - # page_id (eol ID), preferred_canonical_for_page -EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps ints to external-source names -srcToEolId: dict[str, dict[int, int]] = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...} -with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines - for lineNum, row in enumerate(csv.reader(file), 1): - if lineNum % 1e6 == 0: - print(f'At line {lineNum}') - # Skip header line - if lineNum == 1: - continue - # Parse line - eolId = int(row[3]) - srcVal = int(row[2]) - srcIdStr = row[1] - if srcIdStr.isdecimal() and srcVal in EOL_SRCS: - srcId = int(srcIdStr) - src = EOL_SRCS[srcVal] - if (src, srcId) not in usedSrcIds: - continue - if srcId in srcToEolId[src]: - print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}') - continue - srcToEolId[src][srcId] = eolId -print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries') - # Was about 3.5e6 (4.2e6 without usedSrcIds) - -print('Resolving candidate EOL IDs') -# For each otol ID, find eol IDs with matching sources, and choose the 'best' one -nodeToEolId: dict[int, int] = {} # Maps otol ID to eol ID -for otolId, srcInfo in nodeToSrcIds.items(): - eolIdToCount: dict[int, int] = defaultdict(int) - for src, srcId in srcInfo.items(): - if src in srcToEolId and srcId in srcToEolId[src]: - eolId = srcToEolId[src][srcId] - eolIdToCount[eolId] += 1 - if len(eolIdToCount) == 1: - nodeToEolId[otolId] = list(eolIdToCount)[0] - elif len(eolIdToCount) > 1: - # For multiple candidates, prefer those with most sources, and break ties by picking the lowest - maxCount = max(eolIdToCount.values()) - eolIds = 
[eolId for eolId, count in eolIdToCount.items() if count == maxCount] - nodeToEolId[otolId] = min(eolIds) -print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6 - -print('Reading from Wikidata db') -srcToWikiTitle: dict[str, dict[int, str]] = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...} -wikiTitles = set() -titleToIucnStatus: dict[str, str] = {} -dbCon = sqlite3.connect(wikidataDb) -dbCur = dbCon.cursor() -for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'): - if (src, srcId) not in usedSrcIds and src != 'eol': # Keep EOL IDs for later use - continue - srcToWikiTitle[src][srcId] = title - wikiTitles.add(title) -for title, status in dbCur.execute('SELECT title, status from title_iucn'): - if title in wikiTitles: - titleToIucnStatus[title] = status -print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries') - # Was about 1.1e6 (1.2e6 without usedSrcIds) -print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds) -dbCon.close() - -print('Resolving candidate Wikidata items') -# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one -nodeToWikiTitle: dict[int, str] = {} -for otolId, srcInfo in nodeToSrcIds.items(): - titleToSrcs: dict[str, list[str]] = defaultdict(list) # Maps candidate titles to list of sources - for src, srcId in srcInfo.items(): - if src in srcToWikiTitle and srcId in srcToWikiTitle[src]: - title = srcToWikiTitle[src][srcId] - titleToSrcs[title].append(src) - # Choose title to use - if len(titleToSrcs) == 1: - nodeToWikiTitle[otolId] = list(titleToSrcs)[0] - elif len(titleToSrcs) > 1: # Test example: otol ID 621052 - # Get titles with most sources - maxSrcCnt = max([len(srcs) for srcs in titleToSrcs.values()]) - titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt} - if len(titleToSrcs) == 1: - nodeToWikiTitle[otolId] = list(titleToSrcs)[0] - else: # Test 
example: otol ID 4235272 - # Get a title with a source with highest priority - srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]} - for src in OTOL_SRCS: - if src in srcToTitle: - nodeToWikiTitle[otolId] = srcToTitle[src] - break -print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5 - -print('Adding extra EOL mappings from Wikidata') -eolIdToNode = {eolId: node for node, eolId in nodeToEolId.items()} -wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()} -addedEntries: dict[int, int] = {} -for eolId, title in srcToWikiTitle['eol'].items(): - if title in wikiTitleToNode: - otolId = wikiTitleToNode[title] - if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID - nodeToEolId[otolId] = eolId - addedEntries[otolId] = eolId -print(f'- Added {len(addedEntries):,} entries') # Was about 3e3 - -print('Reading picked mappings') -for src in pickedMappings: - for filename in pickedMappings[src]: - if not os.path.exists(filename): - continue - with open(filename) as file: - for line in file: - otolIdStr, mappedVal = line.rstrip().split('|') - otolId = int(otolIdStr) - if src == 'eol': - if mappedVal: - nodeToEolId[otolId] = int(mappedVal) - else: - if otolId in nodeToEolId: - del nodeToEolId[otolId] - else: # src == 'enwiki' - if mappedVal: - nodeToWikiTitle[otolId] = mappedVal - else: - if otolId in nodeToWikiTitle: - del nodeToWikiTitle[otolId] - -print('Getting enwiki page IDs') -titleToPageId: dict[str, int] = {} -numNotFound = 0 -dbCon = sqlite3.connect(enwikiDumpIndexDb) -dbCur = dbCon.cursor() -for title in nodeToWikiTitle.values(): - record = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone() - if record != None: - titleToPageId[title] = record[0] - else: - numNotFound += 1 -dbCon.close() -print(f'Unable to find IDs for {numNotFound} titles') # Was 2913 - -print('Writing to db') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -# Get otol id-to-name map -otolIdToName: 
dict[int, str] = {} -for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'): - if nodeId.startswith('ott'): - otolIdToName[int(nodeId[3:])] = nodeName -# Add eol mappings -dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)') -dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)') -for otolId, eolId in nodeToEolId.items(): - if otolId in otolIdToName: - dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId)) -# Add enwiki mappings -dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)') -dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)') -dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)') -for otolId, title in nodeToWikiTitle.items(): - if otolId in otolIdToName and title in titleToPageId: - dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title])) - if title in titleToIucnStatus: - dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title])) -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/genNameData.py b/backend/tolData/genNameData.py deleted file mode 100755 index 2df144d..0000000 --- a/backend/tolData/genNameData.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/python3 - -import re, os -import html, csv, sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Maps nodes to vernacular names, using data from EOL, enwiki, and a -picked-names file, and stores results in the database. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -args = parser.parse_args() - -eolNamesFile = 'eol/vernacularNames.csv' -enwikiDb = 'enwiki/descData.db' -pickedNamesFile = 'pickedNames.txt' -dbFile = 'data.db' - -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print('Getting node mappings') -nodeToTips: dict[str, int] = {} -eolIdToNode: dict[int, str] = {} # Maps eol ID to node name (if there are multiple, choose one with most tips) -wikiIdToNode: dict[int, str] = {} -for name, tips in dbCur.execute('SELECT name, tips from nodes'): - nodeToTips[name] = tips -for name, eolId in dbCur.execute('SELECT name, id from eol_ids'): - if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]: - eolIdToNode[eolId] = name -for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'): - if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]: - wikiIdToNode[wikiId] = name - -print('Creating table') -dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))') -dbCur.execute('CREATE INDEX names_idx ON names(name)') -dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)') -dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)') - -print('Getting names from EOL') -# The CSV file has a header line, then lines with these fields: - # page_id, canonical_form (canonical name, not always unique to page ID), - # vernacular_string (vernacular name), language_code, - # resource_name, is_preferred_by_resource, is_preferred_by_eol -namesToSkip = {'unknown', 'unknown species', 'unidentified species'} -with open(eolNamesFile, newline='') as file: - for lineNum, fields in enumerate(csv.reader(file), 1): - if lineNum % 1e5 == 0: - print(f'At line {lineNum}') # Reached about 2.8e6 - # Skip header line - if lineNum == 1: - continue - # Parse line - eolId = int(fields[0]) - name = html.unescape(fields[2]).lower() - lang = fields[3] - 
isPreferred = 1 if fields[6] == 'preferred' else 0 - # Add to db - if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \ - and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words - cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')' - # The 'OR IGNORE' accounts for duplicate lines - dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred)) - -print('Getting names from Wikipedia') -altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)', -enwikiCon = sqlite3.connect(enwikiDb) -enwikiCur = enwikiCon.cursor() -iterNum = 0 -for wikiId, nodeName in wikiIdToNode.items(): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') # Reached about 3.6e5 - # - query = 'SELECT p1.title FROM pages p1' \ - ' INNER JOIN redirects r1 ON p1.id = r1.id' \ - ' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?' - for (name,) in enwikiCur.execute(query, (wikiId,)): - name = name.lower() - if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips: - dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0)) - -print('Getting picked names') -# File format: - # nodename1|altName1|isPreferred1 -> Add an alt-name - # nodename1|altName1| -> Remove an alt-name - # nodename1|nodeName1| -> Remove any preferred-alt status -if os.path.exists(pickedNamesFile): - with open(pickedNamesFile) as file: - for line in file: - nodeName, altName, isPreferredStr = line.lower().rstrip().split('|') - if nodeName not in nodeToTips: - print(f'Skipping "{nodeName}", as no such node exists') - continue - if isPreferredStr: - isPreferred = 1 if isPreferredStr == '1' else 0 - if isPreferred == 1: - # Remove any existing preferred-alt status - cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? 
AND pref_alt = 1' - dbCur.execute(cmd, (nodeName, altName)) - # Remove any existing record - dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName)) - # Add record - dbCur.execute('INSERT INTO names VALUES (?, ?, ?, "picked")', (nodeName, altName, isPreferred)) - elif nodeName != altName: # Remove any matching record - dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName)) - else: # Remove any preferred-alt status - cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ? AND pref_alt = 1' - dbCur.execute(cmd, (nodeName, altName)) - -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/genOtolData.py b/backend/tolData/genOtolData.py deleted file mode 100755 index d4d6ee8..0000000 --- a/backend/tolData/genOtolData.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/python3 - -import re, os -import json, sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads files describing a tree-of-life from an 'Open Tree of Life' release, -and stores tree info in a database. - -Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: - The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 - The root node is named n6, and has children n1, n2, and n5. - Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', - 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. - The node with ID 'ott770315' will get the name 'homo sapiens'. - A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). - It is possible for multiple nodes to have the same name. - In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. 
-Reads an annotations.json file, which is assumed to have this format: - Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, - such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that - support/conflict with the node's placement. -Reads from a picked-names file, if present, which specifies name and node ID pairs. - These help resolve cases where multiple nodes share the same name. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -class Node: - ' Represents a tree-of-life node ' - def __init__(self, name, childIds, parentId, tips, pSupport): - self.name = name - self.childIds = childIds - self.parentId = parentId - self.tips = tips - self.pSupport = pSupport - -treeFile = 'otol/labelled_supertree_ottnames.tre' # Had about 2.5e9 nodes -annFile = 'otol/annotations.json' -dbFile = 'data.db' -nodeMap: dict[str, Node] = {} # Maps node IDs to node objects -nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs) -dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs -pickedNamesFile = 'pickedOtolNames.txt' - -print('Parsing tree file') -# Read file -data: str -with open(treeFile) as file: - data = file.read() -dataIdx = 0 -# Parse content -iterNum = 0 -def parseNewick() -> str: - """ Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """ - global data, dataIdx, iterNum - iterNum += 1 - if iterNum % 1e5 == 0: - print(f'At iteration {iterNum}') - # Check for EOF - if dataIdx == len(data): - raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') - # Check for node - if data[dataIdx] == '(': # parse inner node - dataIdx += 1 - childIds: list[str] = [] - while True: - # Read child - childId = parseNewick() - childIds.append(childId) - if (dataIdx == len(data)): - raise Exception(f'ERROR: Unexpected EOF at index 
{dataIdx}') - # Check for next child - if (data[dataIdx] == ','): - dataIdx += 1 - continue - else: - # Get node name and id - dataIdx += 1 # Consume an expected ')' - name, id = parseNewickName() - updateNameMaps(name, id) - # Get child num-tips total - tips = 0 - for childId in childIds: - tips += nodeMap[childId].tips - # Add node to nodeMap - nodeMap[id] = Node(name, childIds, None, tips, False) - # Update childrens' parent reference - for childId in childIds: - nodeMap[childId].parentId = id - return id - else: # Parse node name - name, id = parseNewickName() - updateNameMaps(name, id) - nodeMap[id] = Node(name, [], None, 1, False) - return id -def parseNewickName() -> tuple[str, str]: - """ Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair """ - global data, dataIdx - name: str - end = dataIdx - # Get name - if (end < len(data) and data[end] == "'"): # Check for quoted name - end += 1 - inQuote = True - while end < len(data): - if (data[end] == "'"): - if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote - end += 2 - continue - else: - end += 1 - inQuote = False - break - end += 1 - if inQuote: - raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') - name = data[dataIdx:end] - dataIdx = end - else: - while end < len(data) and not re.match(r'[(),]', data[end]): - end += 1 - if (end == dataIdx): - raise Exception(f'ERROR: Unexpected EOF at index {dataIdx}') - name = data[dataIdx:end].rstrip() - if end == len(data): # Ignore trailing input semicolon - name = name[:-1] - dataIdx = end - # Convert to (name, id) - name = name.lower() - if name.startswith('mrca'): - return (name, name) - elif name[0] == "'": - match = re.fullmatch(r"'([^\\\']+) (ott\d+)'", name) - if match is None: - raise Exception(f'ERROR: invalid name \'{name}\'') - name = match.group(1).replace("''", "'") - return (name, match.group(2)) - else: - match = re.fullmatch(r"([^\\\']+)_(ott\d+)", name) - if match is None: - raise 
Exception(f'ERROR: invalid name \'{name}\'') - return (match.group(1).replace('_', ' '), match.group(2)) -def updateNameMaps(name, id): - global nameToFirstId, dupNameToIds - if name not in nameToFirstId: - nameToFirstId[name] = id - else: - if name not in dupNameToIds: - dupNameToIds[name] = [nameToFirstId[name], id] - else: - dupNameToIds[name].append(id) -rootId: str = parseNewick() - -print('Resolving duplicate names') -# Read picked-names file -nameToPickedId: dict[str, str] = {} -if os.path.exists(pickedNamesFile): - with open(pickedNamesFile) as file: - for line in file: - name, _, otolId = line.rstrip().partition('|') - nameToPickedId[name] = otolId -# Resolve duplicates -for dupName, ids in dupNameToIds.items(): - # Check for picked id - if dupName in nameToPickedId: - idToUse = nameToPickedId[dupName] - else: - # Get conflicting node with most tips - tipNums = [nodeMap[id].tips for id in ids] - maxIdx = tipNums.index(max(tipNums)) - idToUse = ids[maxIdx] - # Adjust name of other conflicting nodes - counter = 2 - for id in ids: - if id != idToUse: - nodeMap[id].name += f' [{counter}]' - counter += 1 - -print('Changing mrca* names') -def convertMrcaName(id: str): - node = nodeMap[id] - name = node.name - childIds = node.childIds - if len(childIds) < 2: - print(f'WARNING: MRCA node \'{name}\' has less than 2 children') - return - # Get 2 children with most tips - childTips = [nodeMap[id].tips for id in childIds] - maxIdx1 = childTips.index(max(childTips)) - childTips[maxIdx1] = 0 - maxIdx2 = childTips.index(max(childTips)) - childId1 = childIds[maxIdx1] - childId2 = childIds[maxIdx2] - childName1 = nodeMap[childId1].name - childName2 = nodeMap[childId2].name - # Check for mrca* child names - if childName1.startswith('mrca'): - childName1 = convertMrcaName(childId1) - if childName2.startswith('mrca'): - childName2 = convertMrcaName(childId2) - # Check for composite names - match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1) - if match is not None: - 
childName1 = match.group(1) - match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2) - if match is not None: - childName2 = match.group(1) - # Create composite name - node.name = f'[{childName1} + {childName2}]' - return childName1 -for id, node in nodeMap.items(): - if node.name.startswith('mrca'): - convertMrcaName(id) - -print('Parsing annotations file') -# Read file -with open(annFile) as file: - data = file.read() -obj = json.loads(data) -nodeAnnsMap = obj['nodes'] -# Find relevant annotations -for id, node in nodeMap.items(): - # Set has-support value using annotations - if id in nodeAnnsMap: - nodeAnns = nodeAnnsMap[id] - supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0 - conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0 - node.pSupport = supportQty > 0 and conflictQty == 0 - -print('Creating nodes and edges tables') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)') -dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)') -dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))') -dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)') -for otolId, node in nodeMap.items(): - dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips)) - for childId in node.childIds: - childNode = nodeMap[childId] - dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)', - (node.name, childNode.name, 1 if childNode.pSupport else 0)) -print('Closing database') -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/genPopData.py b/backend/tolData/genPopData.py deleted file mode 100755 index 3bb1325..0000000 --- a/backend/tolData/genPopData.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/python3 - -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" -Reads enwiki page view info from a database, and 
stores it -as node popularity values in the database. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -args = parser.parse_args() - -pageviewsDb = 'enwiki/pageviewData.db' -dbFile = 'data.db' - -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -print('Getting view counts') -pdbCon = sqlite3.connect(pageviewsDb) -pdbCur = pdbCon.cursor() -nodeToViews: dict[str, int] = {} # Maps node names to counts -iterNum = 0 -for wikiId, views in pdbCur.execute('SELECT id, views from views'): - iterNum += 1 - if iterNum % 1e4 == 0: - print(f'At iteration {iterNum}') # Reached 1.6e6 - # - row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone() - if row is not None: - nodeToViews[row[0]] = views -pdbCon.close() - -print(f'Writing {len(nodeToViews)} entries to db') -dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)') -for nodeName, views in nodeToViews.items(): - dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views)) - -dbCon.commit() -dbCon.close() diff --git a/backend/tolData/README.md b/backend/tol_data/README.md index 3b78af8..a21418b 100644 --- a/backend/tolData/README.md +++ b/backend/tol_data/README.md @@ -67,31 +67,31 @@ Some of the scripts use third-party packages: ## Generate Tree Structure Data 1. Obtain 'tree data files' in otol/, as specified in it's README. -2. Run genOtolData.py, which creates data.db, and adds the `nodes` and `edges` tables, +2. Run `gen_otol_data.py`, which creates data.db, and adds the `nodes` and `edges` tables, using data in otol/. It also uses these files, if they exist: - - pickedOtolNames.txt: Has lines of the form `name1|otolId1`. + - `picked_otol_names.txt`: Has lines of the form `name1|otolId1`. Can be used to override numeric suffixes added to same-name nodes. ## Generate Dataset Mappings 1. Obtain 'taxonomy data files' in otol/, 'mapping files' in eol/, files in wikidata/, and 'dump-index files' in enwiki/, as specified in their READMEs. -2. 
Run genMappingData.py, which adds the `eol_ids` and `wiki_ids` tables, +2. Run `gen_mapping_data.py`, which adds the `eol_ids` and `wiki_ids` tables, as well as `node_iucn`. It uses the files obtained above, the `nodes` table, and 'picked mappings' files, if they exist. - - pickedEolIds.txt contains lines like `3785967|405349`, specifying + - `picked_eol_ids.txt` contains lines like `3785967|405349`, specifying an otol ID and an eol ID to map it to. The eol ID can be empty, in which case the otol ID won't be mapped. - - pickedWikiIds.txt and pickedWikiIdsRough.txt contain lines like + - `picked_wiki_ids.txt` and `picked_wiki_ids_rough.txt` contain lines like `5341349|Human`, specifying an otol ID and an enwiki title, which may contain spaces. The title can be empty. ## Generate Node Name Data 1. Obtain 'name data files' in eol/, and 'description database files' in enwiki/, as specified in their READMEs. -2. Run genNameData.py, which adds the `names` table, using data in eol/ and enwiki/, +2. Run `gen_name_data.py`, which adds the `names` table, using data in eol/ and enwiki/, along with the `nodes`, `eol_ids`, and `wiki_ids` tables. <br> - It also uses pickedNames.txt, if it exists. This file can hold lines like + It also uses `picked_names.txt`, if it exists. This file can hold lines like `embryophyta|land plant|1`, specifying a node name, an alt-name to add for it, and a 1 or 0 indicating whether it is a 'preferred' alt-name. The last field can be empty, which indicates that the alt-name should be removed, or, if the @@ -99,32 +99,37 @@ Some of the scripts use third-party packages: ## Generate Node Description Data 1. Obtain files in dbpedia/, as specified in it's README. -2. Run genDescData.py, which adds the `descs` table, using data in dbpedia/ and +2. Run `gen_desc_data.py`, which adds the `descs` table, using data in dbpedia/ and enwiki/, and the `nodes` table. ## Generate Node Images Data ### Get images from EOL 1. 
Obtain 'image metadata files' in eol/, as specified in it's README. -2. In eol/, run downloadImgs.py, which downloads images (possibly multiple per node), - into eol/imgsForReview, using data in eol/, as well as the `eol_ids` table. -3. In eol/, run reviewImgs.py, which interactively displays the downloaded images for - each node, providing the choice of which to use, moving them to eol/imgs/. - Uses `names` and `eol_ids` to display extra info. +2. In eol/, run `download_imgs.py`, which downloads images (possibly multiple per node), + into eol/imgs_for_review, using data in eol/, as well as the `eol_ids` table. + By default, more images than needed are downloaded for review. To skip this, set + the script's MAX_IMGS_PER_ID to 1. +3. In eol/, run `review_imgs.py`, which interactively displays the downloaded images for + each node, providing the choice of which (if any) to use, moving them to eol/imgs/. + Uses `names` and `eol_ids` to display extra info. If MAX_IMGS_PER_ID was set to 1 in + the previous step, you can skip review by renaming the image folder. ### Get Images from Wikipedia -1. In enwiki/, run genImgData.py, which looks for wikipedia image names for each node, +1. In enwiki/, run `gen_img_data.py`, which looks for wikipedia image names for each node, using the `wiki_ids` table, and stores them in a database. -2. In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for +2. In enwiki/, run `download_img_license_info.py`, which downloads licensing information for those images, using wikipedia's online API. -3. In enwiki/, run downloadImgs.py, which downloads 'permissively-licensed' +3. In enwiki/, run `download_imgs.py`, which downloads 'permissively-licensed' images into enwiki/imgs/. ### Merge the Image Sets -1. Run reviewImgsToGen.py, which displays images from eol/imgs/ and enwiki/imgs/, +1. 
Run `review_imgs_to_gen.py`, which displays images from eol/imgs/ and enwiki/imgs/, and enables choosing, for each node, which image should be used, if any, - and outputs choice information into imgList.txt. Uses the `nodes`, + and outputs choice information into `img_list.txt`. Uses the `nodes`, `eol_ids`, and `wiki_ids` tables (as well as `names` to display extra info). -2. Run genImgs.py, which creates cropped/resized images in img/, from files listed in - imgList.txt and located in eol/ and enwiki/, and creates the `node_imgs` and - `images` tables. If pickedImgs/ is present, images within it are also used. <br> + To skip manual review, set REVIEW to 'none' in the script (the script will select any + image, preferring ones from Wikipedia). +2. Run `gen_imgs.py`, which creates cropped/resized images in img/, from files listed in + `img_list.txt` and located in eol/ and enwiki/, and creates the `node_imgs` and + `images` tables. If `picked_imgs/` is present, images within it are also used. <br> The outputs might need to be manually created/adjusted: - An input image might have no output produced, possibly due to data incompatibilities, memory limits, etc. A few input image files @@ -134,16 +139,17 @@ Some of the scripts use third-party packages: This seems to happen when the image is very large, and triggers a decompression bomb warning. ### Add more Image Associations -1. Run genLinkedImgs.py, which tries to associate nodes without images to +1. Run `gen_linked_imgs.py`, which tries to associate nodes without images to images of it's children. Adds the `linked_imgs` table, and uses the `nodes`, `edges`, and `node_imgs` tables. ## Generate Reduced Trees -1. Run genReducedTrees.py, which generates multiple reduced versions of the tree, - adding the `nodes_*` and `edges_*` tables, using `nodes` and `names`. Reads from - pickedNodes.txt, which lists names of nodes that must be included (1 per line). +1. 
Run `gen_reduced_trees.py`, which generates multiple reduced versions of the tree, + adding the `nodes_*` and `edges_*` tables, using `nodes`, `edges`, `wiki_ids`, + `node_imgs`, `linked_imgs`, and `names`. Reads from `picked_nodes.txt`, which lists + names of nodes that must be included (1 per line). ## Generate Node Popularity Data -1. Obtain 'page view files' in enwiki/Run genPopData.py, as specified in it's README. -2. Run genPopData.py, which adds the `node_pop` table, using data in enwiki/, +1. Obtain 'page view files' in enwiki/, as specified in its README. +2. Run `gen_pop_data.py`, which adds the `node_pop` table, using data in enwiki/, and the `wiki_ids` table. diff --git a/backend/tol_data/__init__.py b/backend/tol_data/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tol_data/__init__.py diff --git a/backend/tolData/dbpedia/README.md b/backend/tol_data/dbpedia/README.md index dd9bda7..a708122 100644 --- a/backend/tolData/dbpedia/README.md +++ b/backend/tol_data/dbpedia/README.md @@ -16,10 +16,10 @@ This directory holds files obtained/derived from [Dbpedia](https://www.dbpedia.o Downloaded from <https://databus.dbpedia.org/vehnem/text/short-abstracts/2021.05.01/short-abstracts_lang=en.ttl.bz2>. # Other Files -- genDescData.py <br> +- `gen_desc_data.py` <br> Used to generate a database representing data from the ttl files. -- descData.db <br> - Generated by genDescData.py. <br> +- `desc_data.db` <br> + Generated by `gen_desc_data.py`.
<br> Tables: <br> - `labels`: `iri TEXT PRIMARY KEY, label TEXT ` - `ids`: `iri TEXT PRIMARY KEY, id INT` diff --git a/backend/tol_data/dbpedia/__init__.py b/backend/tol_data/dbpedia/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tol_data/dbpedia/__init__.py diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py new file mode 100755 index 0000000..50418e0 --- /dev/null +++ b/backend/tol_data/dbpedia/gen_desc_data.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 + +""" +Adds DBpedia labels/types/abstracts/etc data into a database +""" + +# In testing, this script took a few hours to run, and generated about 10GB + +import re +import bz2, sqlite3 + +LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries +IDS_FILE = 'page_lang=en_ids.ttl.bz2' +REDIRECTS_FILE = 'redirects_lang=en_transitive.ttl.bz2' +DISAMBIG_FILE = 'disambiguations_lang=en.ttl.bz2' +TYPES_FILE = 'instance-types_lang=en_specific.ttl.bz2' +ABSTRACTS_FILE = 'short-abstracts_lang=en.ttl.bz2' +DB_FILE = 'desc_data.db' + +def genData( + labelsFile: str, idsFile: str, redirectsFile: str, disambigFile: str, + typesFile: str, abstractsFile: str, dbFile: str) -> None: + """ Reads the files and writes to db """ + print('Creating database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Reading/storing label data') + dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)') + dbCur.execute('CREATE INDEX labels_idx ON labels(label)') + dbCur.execute('CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)') + labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') + with bz2.open(labelsFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = labelLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO labels 
VALUES (?, ?)', (match.group(1), match.group(2))) + # + print('Reading/storing wiki page ids') + dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)') + dbCur.execute('CREATE INDEX ids_idx ON ids(id)') + idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') + with bz2.open(idsFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = idLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + try: + dbCur.execute('INSERT INTO ids VALUES (?, ?)', (match.group(1), int(match.group(2)))) + except sqlite3.IntegrityError as e: + # Accounts for certain lines that have the same IRI + print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}') + # + print('Reading/storing redirection data') + dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)') + redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') + with bz2.open(redirectsFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = redirLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2))) + # + print('Reading/storing diambiguation-page data') + dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)') + disambigLineRegex = redirLineRegex + with bz2.open(disambigFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = disambigLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),)) + # + print('Reading/storing instance-type data') + dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)') + 
dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)') + typeLineRegex = redirLineRegex + with bz2.open(typesFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = typeLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2))) + # + print('Reading/storing abstracts') + dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)') + descLineRegex = labelLineRegex + with bz2.open(abstractsFile, mode='rt') as file: + for lineNum, line in enumerate(file): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + if line[0] == '#': + continue + match = descLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO abstracts VALUES (?, ?)', + (match.group(1), match.group(2).replace(r'\"', '"'))) + # + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE) diff --git a/backend/tolData/enwiki/README.md b/backend/tol_data/enwiki/README.md index 76f9ee5..ba1de33 100644 --- a/backend/tolData/enwiki/README.md +++ b/backend/tol_data/enwiki/README.md @@ -1,63 +1,63 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). # Downloaded Files -- enwiki-20220501-pages-articles-multistream.xml.bz2 <br> +- `enwiki-20220501-pages-articles-multistream.xml.bz2` <br> Contains text content and metadata for pages in enwiki. Obtained via <https://dumps.wikimedia.org/backup-index.html> (site suggests downloading from a mirror). 
Some file content and format information was available from <https://meta.wikimedia.org/wiki/Data_dumps/What%27s_available_for_download>. -- enwiki-20220501-pages-articles-multistream-index.txt.bz2 <br> +- `enwiki-20220501-pages-articles-multistream-index.txt.bz2` <br> Obtained like above. Holds lines of the form offset1:pageId1:title1, providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it. # Dump-Index Files -- genDumpIndexDb.py <br> +- `gen_dump_index_db.py` <br> Creates a database version of the enwiki-dump index file. -- dumpIndex.db <br> - Generated by genDumpIndexDb.py. <br> +- `dumpIndex.db` <br> + Generated by `gen_dump_index_db.py`. <br> Tables: <br> - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` # Description Database Files -- genDescData.py <br> +- `gen_desc_data.py` <br> Reads through pages in the dump file, and adds short-description info to a database. -- descData.db <br> - Generated by genDescData.py. <br> +- `desc_data.db` <br> + Generated by `gen_desc_data.py`. <br> Tables: <br> - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` - `redirects`: `id INT PRIMARY KEY, target TEXT` - `descs`: `id INT PRIMARY KEY, desc TEXT` # Image Database Files -- genImgData.py <br> +- `gen_img_data.py` <br> Used to find infobox image names for page IDs, storing them into a database. -- downloadImgLicenseInfo.py <br> +- `download_img_license_info.py` <br> Used to download licensing metadata for image names, via wikipedia's online API, storing them into a database. -- imgData.db <br> +- `img_data.db` <br> Used to hold metadata about infobox images for a set of pageIDs. - Generated using getEnwikiImgData.py and downloadImgLicenseInfo.py. <br> + Generated using `gen_img_data.py` and `download_img_license_info.py`. <br> Tables: <br> - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT` <br> `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
- `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br> Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. -- downloadImgs.py <br> +- `download_imgs.py` <br> Used to download image files into imgs/. # Page View Files -- pageviews/pageviews-*-user.bz2 +- `pageviews/pageviews-*-user.bz2` Each holds wikimedia article page view data for some month. Obtained via <https://dumps.wikimedia.org/other/pageview_complete/monthly/>. Some format info was available from <https://dumps.wikimedia.org/other/pageview_complete/readme.html>. -- genPageviewData.py <br> +- `gen_pageview_data.py` <br> Reads pageviews/*, and creates a database holding average monthly pageview counts. -- pageviewData.db <br> - Generated using genPageviewData.py. <br> +- `pageview_data.db` <br> + Generated using `gen_pageview_data.py`. <br> Tables: <br> - `views`: `title TEXT PRIMARY KEY, id INT, views INT` # Other Files -- lookupPage.py <br> - Running `lookupPage.py title1` looks in the dump for a page with a given title, +- `lookup_page.py` <br> + Running `lookup_page.py title1` looks in the dump for a page with a given title, and prints the contents to stdout. Uses dumpIndex.db. diff --git a/backend/tol_data/enwiki/__init__.py b/backend/tol_data/enwiki/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tol_data/enwiki/__init__.py diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py new file mode 100755 index 0000000..0a809ac --- /dev/null +++ b/backend/tol_data/enwiki/download_img_license_info.py @@ -0,0 +1,154 @@ +#!/usr/bin/python3 + +""" +Reads image names from a database, and uses enwiki's online API to obtain +licensing information for them, adding the info to the database. + +SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks +at already-processed names to decide what to skip. +""" + +import re +import sqlite3, urllib.parse, html +import requests +import time, signal + +IMG_DB = 'img_data.db' +# +API_URL = 'https://en.wikipedia.org/w/api.php' +USER_AGENT = 'terryt.dev (terry06890@gmail.com)' +BATCH_SZ = 50 # Max 50 +TAG_REGEX = re.compile(r'<[^<]+>') +WHITESPACE_REGEX = re.compile(r'\s+') + +def downloadInfo(imgDb: str) -> None: + print('Opening database') + dbCon = sqlite3.connect(imgDb) + dbCur = dbCon.cursor() + print('Checking for table') + if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: + dbCur.execute('CREATE TABLE imgs (' \ + 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') + # + print('Reading image names') + imgNames: set[str] = set() + for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): + imgNames.add(imgName) + print(f'Found {len(imgNames)}') + # + print('Checking for already-processed images') + oldSz = len(imgNames) + for (imgName,) in dbCur.execute('SELECT name FROM imgs'): + imgNames.discard(imgName) + print(f'Found {oldSz - len(imgNames)}') + # + # Set SIGINT handler + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # + print('Iterating through image names') + imgNameList = list(imgNames) + iterNum = 0 + for i in range(0, len(imgNameList), BATCH_SZ): + iterNum += 1 + if iterNum % 1 == 0: + print(f'At iteration {iterNum} (after {(iterNum - 1) * BATCH_SZ} images)') + if interrupted: + print(f'Exiting loop at iteration {iterNum}') + break + # Get batch + imgBatch = imgNameList[i:i+BATCH_SZ] + imgBatch = ['File:' + x for x in imgBatch] + # Make request + headers = { + 'user-agent': USER_AGENT, + 
'accept-encoding': 'gzip', + } + params = { + 'action': 'query', + 'format': 'json', + 'prop': 'imageinfo', + 'iiprop': 'extmetadata|url', + 'maxlag': '5', + 'titles': '|'.join(imgBatch), + 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions', + } + responseObj = None + try: + response = requests.get(API_URL, params=params, headers=headers) + responseObj = response.json() + except Exception as e: + print(f'ERROR: Exception while downloading info: {e}') + print('\tImage batch: ' + '|'.join(imgBatch)) + continue + # Parse response-object + if 'query' not in responseObj or 'pages' not in responseObj['query']: + print('WARNING: Response object for doesn\'t have page data') + print('\tImage batch: ' + '|'.join(imgBatch)) + if 'error' in responseObj: + errorCode = responseObj['error']['code'] + print(f'\tError code: {errorCode}') + if errorCode == 'maxlag': + time.sleep(5) + continue + pages = responseObj['query']['pages'] + normalisedToInput: dict[str, str] = {} + if 'normalized' in responseObj['query']: + for entry in responseObj['query']['normalized']: + normalisedToInput[entry['to']] = entry['from'] + for page in pages.values(): + # Some fields // More info at https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data + # LicenseShortName: short human-readable license name, apparently more reliable than 'License', + # Artist: author name (might contain complex html, multiple authors, etc) + # Credit: 'source' + # For image-map-like images, can be quite large/complex html, creditng each sub-image + # May be <a href='text1'>text2</a>, where the text2 might be non-indicative + # Restrictions: specifies non-copyright legal restrictions + title: str = page['title'] + if title in normalisedToInput: + title = normalisedToInput[title] + title = title[5:] # Remove 'File:' + if title not in imgNames: + print(f'WARNING: Got title "{title}" not in image-name list') + continue + if 'imageinfo' not in page: + print(f'WARNING: No imageinfo section for 
page "{title}"') + continue + metadata = page['imageinfo'][0]['extmetadata'] + url: str = page['imageinfo'][0]['url'] + license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None + artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None + credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None + restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup + if artist is not None: + artist = TAG_REGEX.sub(' ', artist).strip() + artist = WHITESPACE_REGEX.sub(' ', artist) + artist = html.unescape(artist) + artist = urllib.parse.unquote(artist) + if credit is not None: + credit = TAG_REGEX.sub(' ', credit).strip() + credit = WHITESPACE_REGEX.sub(' ', credit) + credit = html.unescape(credit) + credit = urllib.parse.unquote(credit) + # Add to db + print((title, license, artist, credit, restrictions, url)) + dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', + (title, license, artist, credit, restrictions, url)) + # + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + downloadInfo(IMG_DB) diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py new file mode 100755 index 0000000..ba874e1 --- /dev/null +++ b/backend/tol_data/enwiki/download_imgs.py @@ -0,0 +1,99 @@ +#!/usr/bin/python3 + +""" +Downloads images from URLs in an image database, into an output directory, +with names of the form 'pageId1.ext1'. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +in the output directory do decide what to skip. 
+""" + +# In testing, this downloaded about 100k images, over several days + +import re, os +import sqlite3 +import urllib.parse, requests +import time, signal + +IMG_DB = 'img_data.db' # About 130k image names +OUT_DIR = 'imgs' +# +LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) +USER_AGENT = 'terryt.dev (terry06890@gmail.com)' +TIMEOUT = 1 + # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + +def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: + if not os.path.exists(outDir): + os.mkdir(outDir) + print('Checking for already-downloaded images') + fileList = os.listdir(outDir) + pageIdsDone: set[int] = set() + for filename in fileList: + pageIdsDone.add(int(os.path.splitext(filename)[0])) + print(f'Found {len(pageIdsDone)}') + # + # Set SIGINT handler + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # + print('Opening database') + dbCon = sqlite3.connect(imgDb) + dbCur = dbCon.cursor() + print('Starting downloads') + iterNum = 0 + query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \ + ' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name' + for pageId, license, artist, credit, restrictions, url in dbCur.execute(query): + if pageId in pageIdsDone: + continue + if interrupted: + print('Exiting loop') + break + # Check for problematic attributes + if license is None or LICENSE_REGEX.fullmatch(license) is None: + continue + if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. 
)?File:', artist) is not None: + continue + if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None: + continue + if restrictions is not None and restrictions != '': + continue + # Download image + iterNum += 1 + print(f'Iteration {iterNum}: Downloading for page-id {pageId}') + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print(f'WARNING: No filename extension found in URL {url}') + continue + outFile = os.path.join(outDir, f'{pageId}{extension}') + print(outFile) + headers = { + 'user-agent': USER_AGENT, + 'accept-encoding': 'gzip', + } + try: + response = requests.get(url, headers=headers) + with open(outFile, 'wb') as file: + file.write(response.content) + time.sleep(timeout) + except Exception as e: + print(f'Error while downloading to {outFile}: {e}') + return + print('Closing database') + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py new file mode 100755 index 0000000..0dca16b --- /dev/null +++ b/backend/tol_data/enwiki/gen_desc_data.py @@ -0,0 +1,126 @@ +#!/usr/bin/python3 + +""" +Reads through the wiki dump, and attempts to parse short-descriptions, +and add them to a database +""" + +# In testing, this script took over 10 hours to run, and generated about 5GB + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages +DB_FILE = 'desc_data.db' + +DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') +EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') + # Recognises a self-closing HTML tag, a tag with 0 children, 
tag with 1 child with 0 children, or unclosed tag +CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +def convertTemplateReplace(match): + """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ + if match.group(2) is None: + return f'{match.group(1)} {match.group(4)}' + else: + return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +def genData(dumpFile: str, dbFile: str) -> None: + print('Creating database') + if os.path.exists(dbFile): + raise Exception(f'ERROR: Existing {dbFile}') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)') + dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)') + dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') + dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') + dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') + # + print('Iterating through dump file') + with bz2.open(dumpFile, mode='rt') as file: + for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): + if pageNum % 1e4 == 0: + print(f'At page {pageNum}') + # Parse page + if page.namespace == 0: + try: + dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr) + continue + if page.redirect is not None: + dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc is not None: + dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) + # + print('Closing database') + dbCon.commit() + dbCon.close() +def 
parseDesc(text: str) -> str | None: + # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, + # and then accumulate lines until a blank one. + # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines: list[str] = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if not lines: + if line: + if openBraceCount > 0 or line[0] == '{': + openBraceCount += line.count('{') + openBraceCount -= line.count('}') + skip = True + if openBracketCount > 0 or line[0] == '[': + openBracketCount += line.count('[') + openBracketCount -= line.count(']') + skip = True + if inComment or line.find('<!--') != -1: + if line.find('-->') != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ':': # Seems to help avoid disambiguation pages + return None + if DESC_LINE_REGEX.match(line) is not None: + lines.append(line) + else: + if not line: + return removeMarkup(' '.join(lines)) + lines.append(line) + if lines: + return removeMarkup(' '.join(lines)) + return None +def removeMarkup(content: str) -> str: + content = EMBEDDED_HTML_REGEX.sub('', content) + content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) + content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = PARENS_GROUP_REGEX.sub('', content) + content = LEFTOVER_BRACE_REGEX.sub('', content) + return content +def convertTitle(title: str) -> str: + return html.unescape(title).replace('_', ' ') + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(DUMP_FILE, DB_FILE) diff 
--git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py new file mode 100755 index 0000000..5f21c9b --- /dev/null +++ b/backend/tol_data/enwiki/gen_dump_index_db.py @@ -0,0 +1,60 @@ +#!/usr/bin/python3 + +""" +Adds data from the wiki dump index-file into a database +""" +import sys, os, re +import bz2 +import sqlite3 + +INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines +DB_FILE = 'dumpIndex.db' + +def genData(indexFile: str, dbFile: str) -> None: + """ Reads the index file and creates the db """ + if os.path.exists(dbFile): + raise Exception(f'ERROR: Existing {dbFile}') + print('Creating database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') + lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') + lastOffset = 0 + lineNum = 0 + entriesToAdd: list[tuple[str, str]] = [] + with bz2.open(indexFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + # + match = lineRegex.fullmatch(line.rstrip()) + assert match is not None + offsetStr, pageId, title = match.group(1,2,3) + offset = int(offsetStr) + if offset > lastOffset: + for t, p in entriesToAdd: + try: + dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset)) + except sqlite3.IntegrityError as e: + # Accounts for certain entries in the file that have the same title + print(f'Failed on title "{t}": {e}', file=sys.stderr) + entriesToAdd = [] + lastOffset = offset + entriesToAdd.append((title, pageId)) + for title, pageId in entriesToAdd: + try: + dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) + except sqlite3.IntegrityError as e: + print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') + dbCon.commit() + 
dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py new file mode 100755 index 0000000..d4696f0 --- /dev/null +++ b/backend/tol_data/enwiki/gen_img_data.py @@ -0,0 +1,193 @@ +#!/usr/bin/python3 + +""" +For some set of page IDs, looks up their content in the wiki dump, +and tries to parse infobox image names, storing them into a database. + +The program can be re-run with an updated set of page IDs, and +will skip already-processed page IDs. +""" + +import re +import os, bz2, html, urllib.parse +import sqlite3 + +DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' +INDEX_DB = 'dumpIndex.db' +IMG_DB = 'img_data.db' # The database to create +DB_FILE = os.path.join('..', 'data.db') +# +ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') +IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') +BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') +IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) +CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) + +def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: + print('Opening databases') + indexDbCon = sqlite3.connect(indexDb) + indexDbCur = indexDbCon.cursor() + imgDbCon = sqlite3.connect(imgDb) + imgDbCur = imgDbCon.cursor() + print('Checking tables') + if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: + # Create tables if not present + imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL + imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') + else: + # Check for already-processed page IDs + numSkipped = 0 + for 
(pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'): + if pid in pageIds: + pageIds.remove(pid) + numSkipped += 1 + else: + print(f'Found already-processed page ID {pid} which was not in input set') + print(f'Will skip {numSkipped} already-processed page IDs') + # + print('Getting dump-file offsets') + offsetToPageids: dict[int, list[int]] = {} + offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets + iterNum = 0 + for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') + # + query = 'SELECT offset, next_offset FROM offsets WHERE id = ?' + row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone() + if row is None: + print(f'WARNING: Page ID {pageId} not found') + continue + chunkOffset, endOffset = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) + print(f'Found {len(offsetToEnd)} chunks to check') + # + print('Iterating through chunks in dump file') + with open(dumpFile, mode='rb') as file: + iterNum = 0 + for pageOffset, endOffset in offsetToEnd.items(): + iterNum += 1 + if iterNum % 100 == 0: + print(f'At iteration {iterNum}') + # + chunkPageIds = offsetToPageids[pageOffset] + # Jump to chunk + file.seek(pageOffset) + compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) + data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for pages + lines = data.splitlines() + lineIdx = 0 + while lineIdx < len(lines): + # Look for <page> + if lines[lineIdx].lstrip() != '<page>': + lineIdx += 1 + continue + # Check page id + lineIdx += 3 + idLine = lines[lineIdx].lstrip() + match = ID_LINE_REGEX.fullmatch(idLine) + if match is None or int(match.group(1)) not in chunkPageIds: + lineIdx += 1 + continue + pageId = int(match.group(1)) + lineIdx += 1 + # Look for <text> in <page> + foundText = False + while 
def getImageName(content: list[str]) -> str | None:
    """ Given a list of text-content lines, tries to return an infobox image name, or None """
    # Images in outside-infobox [[File:...]] and <imagemap> sections are not searched for
    for textLine in content:
        lineMatch = IMG_LINE_REGEX.match(textLine)
        if lineMatch is None:
            continue
        # Only the first matching line is examined: every path below returns
        name = lineMatch.group(1).strip()
        if not name:
            return None
        name = html.unescape(name)
        # Account for {{... and then for [[File:...|...]]
        for opener, wrapperRegex in (('{', CSS_IMG_CROP_REGEX), ('[', BRACKET_IMG_REGEX)):
            if name.startswith(opener):
                wrapped = wrapperRegex.match(name)
                if wrapped is None:
                    return None
                name = wrapped.group(1)
        # Account for <!--
        if '<!--' in name:
            return None
        # Remove an initial 'File:', then an initial 'Image:'
        for prefix in ('File:', 'Image:'):
            if name.startswith(prefix):
                name = name[len(prefix):]
        # Check for extension
        nameMatch = IMG_NAME_REGEX.match(name)
        if nameMatch is None:
            # Exclude lines like: | image = <imagemap>
            return None
        name = urllib.parse.unquote(nameMatch.group(0))
        name = html.unescape(name) # Intentionally unescaping again (handles some odd cases)
        return name.replace('_', ' ')
    return None
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
    """ Sums English-Wikipedia view counts per title, and stores per-file averages in a new db

    pageviewFiles: bzip2-compressed pageview files to read
    dumpIndexDb: dump-index db, whose 'offsets' table is used to map titles to page IDs
    dbFile: db to create (the program exits with status 1 if it already exists)

    Only titles that are found in the dump-index db get a row in the 'views' table.
    """
    # Each pageview file has lines that seem to hold these space-separated fields:
        # wiki code (eg: en.wikipedia), article title, page ID (may be: null),
        # platform (eg: mobile-web), monthly view count,
        # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
    if os.path.exists(dbFile):
        print('ERROR: Database already exists')
        sys.exit(1)
    #
    namespaceRegex = re.compile(r'[a-zA-Z]+:')
    titleToViews: dict[str, int] = defaultdict(int)
    linePrefix = b'en.wikipedia '
    for filename in pageviewFiles:
        print(f'Reading from {filename}')
        with bz2.open(filename, 'rb') as file:
            for lineNum, line in enumerate(file, 1):
                if lineNum % 1e6 == 0:
                    print(f'At line {lineNum}')
                if not line.startswith(linePrefix):
                    continue
                # Get second and second-last fields
                line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
                title = line[:line.find(b' ')].decode('utf-8')
                viewCount = int(line[line.rfind(b' ')+1:])
                if namespaceRegex.match(title) is not None:
                    continue
                # Update map
                    # Pageview titles use underscores, but the dump index is keyed by titles with spaces
                titleToViews[title.replace('_', ' ')] += viewCount
    print(f'Found {len(titleToViews)} titles')
    #
    print('Writing to db')
    numFiles = len(pageviewFiles)
    dbCon = sqlite3.connect(dbFile)
    idbCon = sqlite3.connect(dumpIndexDb)
    try: # Ensures both connections get closed, even upon an error
        dbCur = dbCon.cursor()
        idbCur = idbCon.cursor()
        dbCur.execute('CREATE TABLE views (title TEXT PRIMARY KEY, id INT, views INT)')
        for title, views in titleToViews.items():
            row = idbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
            if row is not None:
                wikiId = int(row[0])
                # Integer floor-division avoids a round trip through float
                dbCur.execute('INSERT INTO views VALUES (?, ?, ?)', (title, wikiId, views // numFiles))
        dbCon.commit()
    finally:
        dbCon.close()
        idbCon.close()
def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
    """ Prints the <page> element for a title, found via the dump-index db

    dumpFile: the multistream wiki dump to read a compressed chunk from
    indexDb: db with an 'offsets' table that maps titles to chunk offsets
    pageTitle: the title to look for (compared verbatim with the dump's <title> line)

    Exits with status 0 if the title is not in the index db. If the title is in the index
    but not found in its chunk, an error is printed, and the printed content is empty.
    """
    print('Looking up offset in index db')
    dbCon = sqlite3.connect(indexDb)
    try: # Close the connection on every path, including the not-found exit below
        query = 'SELECT title, offset, next_offset FROM offsets WHERE title = ?'
        row = dbCon.execute(query, (pageTitle,)).fetchone()
    finally:
        dbCon.close()
    if row is None:
        print('Title not found')
        sys.exit(0)
    _, pageOffset, endOffset = row
    print(f'Found chunk at offset {pageOffset}')
    #
    print('Reading from wiki dump')
    with open(dumpFile, mode='rb') as file:
        # Get uncompressed chunk (an end offset of -1 means 'read to end of file')
        file.seek(pageOffset)
        compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
    data = bz2.BZ2Decompressor().decompress(compressedData).decode()
    # Look in chunk for page
    lines = data.splitlines()
    content: list[str] = []
    wantedTitleLine = '<title>' + pageTitle + '</title>'
    pageNum = 0
    lineIdx = 0
    while lineIdx < len(lines): # Bounded, so a missing title can't index past the chunk's end
        if lines[lineIdx].lstrip() != '<page>':
            lineIdx += 1
            continue
        pageNum += 1
        if pageNum > 100:
            print('ERROR: Did not find title after 100 pages')
            break
        # The <title> line is expected directly after the <page> line
        if lineIdx + 1 < len(lines) and lines[lineIdx + 1].lstrip() == wantedTitleLine:
            print(f'Found title in chunk as page {pageNum}')
            for line in lines[lineIdx:]:
                content.append(line)
                if line.lstrip() == '</page>':
                    break
            break
        lineIdx += 2 # Skip past the non-matching title line
    else:
        print('ERROR: Did not find title in chunk')
    #
    print('Content: ')
    print('\n'.join(content))
+- `download_imgs.py` <br> + Used to download image files into imgs_for_review/. +- `review_imgs.py` <br> + Used to review images in imgs_for_review/, moving acceptable ones into imgs/. diff --git a/backend/tol_data/eol/__init__.py b/backend/tol_data/eol/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tol_data/eol/__init__.py diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py new file mode 100755 index 0000000..8454a35 --- /dev/null +++ b/backend/tol_data/eol/download_imgs.py @@ -0,0 +1,152 @@ +#!/usr/bin/python3 + +""" +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. 
def downloadImgs(eolIds, imagesListDb, outDir):
    """ Downloads images for EOL IDs that are present in an images-list db, using multiple threads

    eolIds: set of int EOL IDs to consider
    imagesListDb: db with an 'images' table, used to get image URLs and licensing info
    outDir: directory to place images into, with names of the form 'eolId1 contentId1.ext1'

    If outDir already holds such files, downloading resumes after the highest EOL ID among them.
    SIGINT, or an exception in a download thread, stops the starting of new downloads.
    """
    import bisect
    import threading
    #
    print('Getting EOL IDs to download for')
    imgDbCon = sqlite3.connect(imagesListDb)
    try: # Ensures the images-list db gets closed, including upon the early return below
        imgCur = imgDbCon.cursor()
        # Get IDs from images-list db
        imgListIds: set[int] = set()
        for (pageId,) in imgCur.execute('SELECT DISTINCT page_id FROM images'):
            imgListIds.add(pageId)
        # Get set intersection, and sort into list
        eolIdList = sorted(eolIds.intersection(imgListIds))
        nextIdx = 0
        print(f'Result: {len(eolIdList)} EOL IDs')
        #
        print('Checking output directory')
        if not os.path.exists(outDir):
            os.mkdir(outDir)
        else:
            print('Finding next ID to download for')
            doneIds: list[int] = []
            for filename in os.listdir(outDir):
                try:
                    doneIds.append(int(filename.split(' ')[0]))
                except ValueError:
                    continue # Ignore files not named like 'eolId1 contentId1.ext1'
            if doneIds:
                # Unlike list.index(), this also works if the highest ID is no longer in the list
                nextIdx = bisect.bisect_right(eolIdList, max(doneIds))
                if nextIdx == len(eolIdList):
                    print('No IDs left. Exiting...')
                    return
        #
        print('Starting download threads')
        numThreads = 0
        numThreadsLock = threading.Lock() # Guards updates to numThreads, which several threads modify
        threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
        # Handle SIGINT signals
        interrupted = False
        oldHandler = None
        def onSigint(sig, frame):
            nonlocal interrupted
            interrupted = True
            signal.signal(signal.SIGINT, oldHandler)
        oldHandler = signal.signal(signal.SIGINT, onSigint)
        # Function for threads to execute
        def downloadImg(url, outFile):
            nonlocal numThreads, threadException
            try:
                data = requests.get(url)
                with open(outFile, 'wb') as file:
                    file.write(data.content)
                time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
            except Exception as e:
                print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
                threadException = e
            with numThreadsLock:
                numThreads -= 1
        # Manage downloading
        for eolId in eolIdList[nextIdx:]:
            # Get image urls
            ownerSet: set[str] = set() # Used to get images from different owners, for variety
            exitLoop = False
            query = 'SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?'
            for contentId, url, license, copyrightOwner in imgCur.execute(query, (eolId,)):
                if url.startswith('data/'):
                    url = 'https://content.eol.org/' + url
                urlParts = urllib.parse.urlparse(url)
                extension = os.path.splitext(urlParts.path)[1]
                if len(extension) <= 1:
                    print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
                    continue
                # Check image-quantity limit
                if len(ownerSet) == MAX_IMGS_PER_ID:
                    break
                # Check for skip conditions
                if re.fullmatch(LICENSE_REGEX, license) is None:
                    continue
                if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
                    continue
                if copyrightOwner in ownerSet:
                    continue
                ownerSet.add(copyrightOwner)
                # Determine output filename
                outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
                if os.path.exists(outPath):
                    print(f'WARNING: {outPath} already exists. Skipping download.')
                    continue
                # Check thread limit
                while numThreads == MAX_THREADS:
                    time.sleep(1)
                # Wait for threads after an interrupt or thread-exception
                if interrupted or threadException is not None:
                    print('Waiting for existing threads to end')
                    while numThreads > 0:
                        time.sleep(1)
                    exitLoop = True
                    break
                # Perform download
                print(f'Downloading image to {outPath}')
                with numThreadsLock:
                    numThreads += 1
                thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
                thread.start()
            if exitLoop:
                break
        # Wait for remaining threads
        while numThreads > 0:
            time.sleep(1)
        print('Finished downloading')
    finally:
        # Close images-list db
        imgDbCon.close()
def genData(imageListsGlob: str, dbFile: str) -> None:
    """ Creates an 'images' table in a db, and fills it with rows from CSV files

    imageListsGlob: glob pattern that matches the CSV files to import
    dbFile: the sqlite db to create the table in

    Each CSV row is expected to hold six fields: content ID, page ID, source URL,
    copy URL, license, and copyright owner. Header lines are skipped.
    """
    print('Creating database')
    headerRegex = re.compile(r'^[a-zA-Z]')
    dbCon = sqlite3.connect(dbFile)
    try: # Ensures the connection gets closed, even if a CSV file is malformed
        dbCur = dbCon.cursor()
        dbCur.execute('CREATE TABLE images' \
            ' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \
            ' copy_url TEXT, license TEXT, copyright_owner TEXT)')
        dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
        print('Reading CSV files')
        for filename in glob.glob(imageListsGlob):
            print(f'Processing {filename}')
            # An explicit encoding is used so non-ASCII fields don't depend on the platform's default
            with open(filename, newline='', encoding='utf-8') as file:
                for contentId, pageId, sourceUrl, copyUrl, license, owner in csv.reader(file):
                    if headerRegex.match(contentId): # Skip header line (not in all files)
                        continue
                    dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
                        (int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
        print('Closing database')
        dbCon.commit()
    finally:
        dbCon.close()
Looks in a for-review directory for +images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to +choose an image to keep, or reject all. Also provides image rotation. +Chosen images are placed in another directory, and rejected ones are deleted. +""" + import sys, re, os, time import sqlite3 import tkinter as tki @@ -7,47 +14,18 @@ from tkinter import ttk import PIL from PIL import ImageTk, Image, ImageOps -import argparse -parser = argparse.ArgumentParser(description=""" -Provides a GUI for reviewing images. Looks in a for-review directory for -images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to -choose an image to keep, or reject all. Also provides image rotation. -Chosen images are placed in another directory, and rejected ones are deleted. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() - -imgDir = 'imgsForReview/' -outDir = 'imgs/' -extraInfoDbCon = sqlite3.connect('../data.db') -extraInfoDbCur = extraInfoDbCon.cursor() -def getExtraInfo(eolId: int) -> str: - global extraInfoDbCur - query = 'SELECT names.alt_name FROM' \ - ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \ - ' WHERE id = ? 
and pref_alt = 1' - row = extraInfoDbCur.execute(query, (eolId,)).fetchone() - if row is not None: - return f'Reviewing EOL ID {eolId}, aka "{row[0]}"' - else: - return f'Reviewing EOL ID {eolId}' +IMG_DIR = 'imgs_for_review' +OUT_DIR = 'imgs' +EXTRA_INFO_DB = os.path.join('..', 'data.db') +# IMG_DISPLAY_SZ = 400 MAX_IMGS_PER_ID = 3 IMG_BG_COLOR = (88, 28, 135) PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) -print('Checking output directory') -if not os.path.exists(outDir): - os.mkdir(outDir) -print('Getting input image list') -imgList = os.listdir(imgDir) -imgList.sort(key=lambda s: int(s.split(' ')[0])) -if not imgList: - print('No input images found') - sys.exit(0) - class EolImgReviewer: """ Provides the GUI for reviewing images """ - def __init__(self, root, imgList): + def __init__(self, root, imgDir, imgList, extraInfoDb, outDir): self.root = root root.title('EOL Image Reviewer') # Setup main frame @@ -81,16 +59,21 @@ class EolImgReviewer: root.bind('<Key-A>', lambda evt: self.rotate(0, True)) root.bind('<Key-S>', lambda evt: self.rotate(1, True)) root.bind('<Key-D>', lambda evt: self.rotate(2, True)) - # Initialise images to review + # Initialise fields + self.imgDir = imgDir self.imgList = imgList + self.outDir = outDir self.imgListIdx = 0 self.nextEolId = 0 self.nextImgNames: list[str] = [] self.rotations: list[int] = [] - self.getNextImgs() # For displaying extra info + self.extraInfoDbCon = sqlite3.connect(extraInfoDb) + self.extraInfoDbCur = self.extraInfoDbCon.cursor() self.numReviewed = 0 self.startTime = time.time() + # + self.getNextImgs() def getNextImgs(self): """ Updates display with new images to review, or ends program """ # Gather names of next images to review @@ -117,10 +100,10 @@ class EolImgReviewer: while idx < MAX_IMGS_PER_ID: if idx < len(self.nextImgNames): try: - img = Image.open(imgDir + self.nextImgNames[idx]) + img = Image.open(os.path.join(self.imgDir, self.nextImgNames[idx])) img = 
ImageOps.exif_transpose(img) except PIL.UnidentifiedImageError: - os.remove(imgDir + self.nextImgNames[idx]) + os.remove(os.path.join(self.imgDir, self.nextImgNames[idx])) del self.nextImgNames[idx] del self.rotations[idx] continue @@ -137,7 +120,7 @@ class EolImgReviewer: # Update title firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1 lastImgIdx = self.imgListIdx - title = getExtraInfo(self.nextEolId) + title = self.getExtraInfo(self.nextEolId) title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})' self.root.title(title) def accept(self, imgIdx): @@ -146,9 +129,9 @@ class EolImgReviewer: print('Invalid selection') return for i in range(len(self.nextImgNames)): - inFile = imgDir + self.nextImgNames[i] + inFile = os.path.join(self.imgDir, self.nextImgNames[i]) if i == imgIdx: # Move accepted image, rotating if needed - outFile = outDir + self.nextImgNames[i] + outFile = os.path.join(self.outDir, self.nextImgNames[i]) img = Image.open(inFile) img = ImageOps.exif_transpose(img) if self.rotations[i] != 0: @@ -162,7 +145,7 @@ class EolImgReviewer: def reject(self): """ React to a user rejecting all images of a set """ for i in range(len(self.nextImgNames)): - os.remove(imgDir + self.nextImgNames[i]) + os.remove(os.path.join(self.imgDir, self.nextImgNames[i])) self.numReviewed += 1 self.getNextImgs() def rotate(self, imgIdx, anticlockwise = False): @@ -173,14 +156,14 @@ class EolImgReviewer: self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 def quit(self, e = None): - global extraInfoDbCon print(f'Number reviewed: {self.numReviewed}') timeElapsed = time.time() - self.startTime print(f'Time elapsed: {timeElapsed:.2f} seconds') if self.numReviewed > 0: print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') - extraInfoDbCon.close() + self.extraInfoDbCon.close() self.root.destroy() + # def resizeImgForDisplay(self, img): """ Returns a copy of an image, 
shrunk to fit in it's frame (keeps aspect ratio), and with a background """ if max(img.width, img.height) > IMG_DISPLAY_SZ: @@ -195,8 +178,36 @@ class EolImgReviewer: int((IMG_DISPLAY_SZ - img.width) / 2), int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg -# Create GUI and defer control -print('Starting GUI') -root = tki.Tk() -EolImgReviewer(root, imgList) -root.mainloop() + def getExtraInfo(self, eolId: int) -> str: + """ Used to display extra EOL ID info """ + query = 'SELECT names.alt_name FROM' \ + ' names INNER JOIN eol_ids ON eol_ids.name = names.name' \ + ' WHERE id = ? and pref_alt = 1' + row = self.extraInfoDbCur.execute(query, (eolId,)).fetchone() + if row is not None: + return f'Reviewing EOL ID {eolId}, aka "{row[0]}"' + else: + return f'Reviewing EOL ID {eolId}' + +def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str): + print('Checking output directory') + if not os.path.exists(outDir): + os.mkdir(outDir) + print('Getting input image list') + imgList = os.listdir(imgDir) + imgList.sort(key=lambda s: int(s.split(' ')[0])) + if not imgList: + print('No input images found') + sys.exit(0) + # Create GUI and defer control + print('Starting GUI') + root = tki.Tk() + EolImgReviewer(root, imgDir, imgList, extraInfoDb, outDir) + root.mainloop() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB) diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py new file mode 100755 index 0000000..fa08a8c --- /dev/null +++ b/backend/tol_data/gen_desc_data.py @@ -0,0 +1,92 @@ +#!/usr/bin/python3 + +""" +Maps nodes to short descriptions, using data from DBpedia and +Wikipedia, and stores results in the database. 
def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
    """ Creates a 'descs' table in the main db, filled with descriptions from DBpedia and Wikipedia

    dbpediaDb: db with 'ids', 'redirects', and 'abstracts' tables
    enwikiDb: db with 'redirects', 'pages', and 'descs' tables
    dbFile: main db, which has a 'wiki_ids' table, and receives the 'descs' table

    DBpedia is consulted first (rows get from_dbp = 1). Nodes that get no description
    from it are then looked up in the Wikipedia data (rows get from_dbp = 0).
    """
    print('Creating table')
    dbCon = sqlite3.connect(dbFile)
    try: # Ensures the main db gets closed, even upon an error
        dbCur = dbCon.cursor()
        dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
        #
        print('Getting node mappings')
        nodeToWikiId: dict[str, int] = {}
        for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
            nodeToWikiId[name] = wikiId
        #
        print('Reading data from DBpedia')
        dbpCon = sqlite3.connect(dbpediaDb)
        try:
            dbpCur = dbpCon.cursor()
            print('Getting node IRIs')
            nodeToIri: dict[str, str] = {}
            for iterNum, (name, wikiId) in enumerate(nodeToWikiId.items(), 1):
                if iterNum % 1e5 == 0:
                    print(f'At iteration {iterNum}')
                #
                row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
                if row is not None:
                    nodeToIri[name] = row[0]
            print('Resolving redirects')
            for iterNum, (name, iri) in enumerate(nodeToIri.items(), 1):
                if iterNum % 1e5 == 0:
                    print(f'At iteration {iterNum}')
                #
                row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
                if row is not None:
                    nodeToIri[name] = row[0] # Only replaces an existing key's value, so is safe while iterating
            print('Adding descriptions')
            for iterNum, (name, iri) in enumerate(nodeToIri.items(), 1):
                if iterNum % 1e4 == 0:
                    print(f'At iteration {iterNum}')
                #
                row = dbpCur.execute('SELECT abstract FROM abstracts WHERE iri = ?', (iri,)).fetchone()
                if row is not None:
                    dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
                    del nodeToWikiId[name] # Leaves only nodes that still need a description
        finally:
            dbpCon.close()
        #
        print('Reading data from Wikipedia')
        enwikiCon = sqlite3.connect(enwikiDb)
        try:
            enwikiCur = enwikiCon.cursor()
            print('Adding descriptions')
            for iterNum, (name, wikiId) in enumerate(nodeToWikiId.items(), 1):
                if iterNum % 1e3 == 0:
                    print(f'At iteration {iterNum}')
                # Check for redirect
                wikiIdToGet = wikiId
                query = 'SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title' \
                    ' WHERE redirects.id = ?'
                row = enwikiCur.execute(query, (wikiId,)).fetchone()
                if row is not None:
                    wikiIdToGet = row[0]
                #
                row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone()
                if row is not None:
                    dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
        finally:
            enwikiCon.close() # Was previously never closed
        #
        print('Closing databases')
        dbCon.commit()
    finally:
        dbCon.close()
def genImgs(
        imgListFile: str, eolImgDir: str, outDir: str, eolImgDb: str, enwikiImgDb: str,
        pickedImgsDir: str, pickedImgsFile: str, dbFile: str) -> None:
    """ Reads the image-list file, generates images, and updates db

    Creates outDir if needed, creates the 'node_imgs' and 'images' tables if absent
    (otherwise reads them to find what to skip), then processes picked-images, followed
    by EOL and enwiki images if the picked-images were processed successfully.
    Db changes are committed once processing returns, even if it stopped early.
    """
    if not os.path.exists(outDir):
        os.mkdir(outDir)
    #
    dbCon = sqlite3.connect(dbFile)
    try: # Ensures the connection gets closed, even upon an error
        dbCur = dbCon.cursor()
        print('Checking for image tables')
        nodesDone: set[str] = set()
        imgsDone: set[ImgId] = set()
        # Single-quoted SQL string literals are used, as double-quoted ones are a
            # non-standard fallback that some SQLite builds disable
        tableQuery = "SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'"
        if dbCur.execute(tableQuery).fetchone() is None:
            # Add image tables if not present
            dbCur.execute('CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)')
            dbCur.execute('CREATE TABLE images (' \
                'id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))')
        else:
            # Get existing image-associated nodes
            query = 'SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name'
            for (otolId,) in dbCur.execute(query):
                nodesDone.add(otolId)
            # Get existing node-associated images
            for imgId, imgSrc in dbCur.execute('SELECT id, src from images'):
                imgsDone.add((imgId, imgSrc))
            print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')
        #
        print('Processing picked-images')
        success = processPickedImgs(pickedImgsDir, pickedImgsFile, nodesDone, imgsDone, outDir, dbCur)
        if success:
            print('Processing images from eol and enwiki')
            processImgs(imgListFile, eolImgDir, eolImgDb, enwikiImgDb, nodesDone, imgsDone, outDir, dbCur)
        # Commit, which keeps db entries for images that were generated before an interruption
        dbCon.commit()
    finally:
        # Close db
        dbCon.close()
def processImgs(
        imgListFile: str, eolImgDir: str, eolImgDb: str, enwikiImgDb: str,
        nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool:
    """ Converts EOL and enwiki images, and updates db, returning False upon interruption or failure

    imgListFile: file with lines of the form 'otolId1 imgPath1' (lines without a space are skipped)
    eolImgDir: path prefix that marks an image path as being an EOL image
    eolImgDb, enwikiImgDb: dbs that image metadata is read from
    nodesDone: otol IDs to skip
    imgsDone: (id, source) pairs already in the 'images' table, which gets added to
    outDir: directory that converted images are written into, as 'otolId1.jpg'
    dbCur: cursor for the db that holds the 'nodes', 'images', and 'node_imgs' tables
    """
    # Set SIGINT handler
    interrupted = False
    def onSigint(sig, frame):
        nonlocal interrupted
        interrupted = True
    signal.signal(signal.SIGINT, onSigint)
    #
    eolCon = sqlite3.connect(eolImgDb)
    enwikiCon = sqlite3.connect(enwikiImgDb)
    try: # Ensures both metadata dbs get closed on every exit path
        eolCur = eolCon.cursor()
        enwikiCur = enwikiCon.cursor()
        # Convert images
        with open(imgListFile) as file:
            for line in file:
                # Check for SIGINT event
                if interrupted:
                    print('Exiting')
                    return False
                # Skip lines without an image path
                if ' ' not in line:
                    continue
                # Get filenames
                otolId, _, imgPath = line.rstrip().partition(' ')
                # Skip if already processed
                if otolId in nodesDone:
                    continue
                # Look up the node before converting, so an unknown ID can't leave behind
                    # an output image that has no db entry
                nodeRow = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
                if nodeRow is None:
                    print(f'ERROR: No node found with ID {otolId}')
                    return False
                (nodeName,) = nodeRow
                # Convert image
                if not convertImage(imgPath, os.path.join(outDir, otolId + '.jpg')):
                    return False
                # Add entry to db
                fromEol = imgPath.startswith(eolImgDir)
                imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
                imgName = os.path.splitext(imgName)[0] # Remove extension
                if fromEol:
                    # EOL image names have the form 'eolId1 contentId1'
                    eolIdStr, _, contentIdStr = imgName.partition(' ')
                    eolId, contentId = int(eolIdStr), int(contentIdStr)
                    if (eolId, 'eol') not in imgsDone:
                        query = 'SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?'
                        row = eolCur.execute(query, (contentId,)).fetchone()
                        if row is None:
                            print(f'ERROR: No image record for EOL ID {eolId}, content ID {contentId}')
                            return False
                        url, license, owner = row
                        dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
                            (eolId, 'eol', url, license, owner, ''))
                        imgsDone.add((eolId, 'eol'))
                    dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, eolId, 'eol'))
                else:
                    # Other image names are taken to be an enwiki page ID
                    enwikiId = int(imgName)
                    if (enwikiId, 'enwiki') not in imgsDone:
                        query = 'SELECT name, license, artist, credit FROM' \
                            ' page_imgs INNER JOIN imgs ON page_imgs.img_name = imgs.name' \
                            ' WHERE page_imgs.page_id = ?'
                        row = enwikiCur.execute(query, (enwikiId,)).fetchone()
                        if row is None:
                            print(f'ERROR: No image record for enwiki ID {enwikiId}')
                            return False
                        name, license, artist, credit = row
                        url = 'https://en.wikipedia.org/wiki/File:' + urllib.parse.quote(name)
                        dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
                            (enwikiId, 'enwiki', url, license, artist, credit))
                        imgsDone.add((enwikiId, 'enwiki'))
                    dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki'))
        return True
    finally:
        eolCon.close()
        enwikiCon.close()
#!/usr/bin/python3

"""
Looks for nodes without images in the database, and tries to
associate them with images from their children
"""

import re
import sqlite3
from contextlib import closing

DB_FILE = 'data.db'
#
COMPOUND_NAME_REGEX = re.compile(r'\[(.+) \+ (.+)]')
UP_PROPAGATE_COMPOUND_IMGS = False

def genData(dbFile: str) -> None:
	""" Creates and fills the 'linked_imgs' table in the db at 'dbFile'.

		Reads the 'nodes', 'edges', and 'node_imgs' tables. Each ancestor of a node with an image,
		that has no image itself, gets a record holding the otol ID whose image it should use
		(taken from the child with the most tips). For nodes with compound names ('[name1 + name2]'),
		the record instead holds a comma-separated pair of IDs, one per sub-name ('' if none found).
		The connection is closed even if a query fails. """
	print('Opening database')
	with closing(sqlite3.connect(dbFile)) as dbCon:
		dbCur = dbCon.cursor()
		dbCur.execute('CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)')
		#
		print('Getting nodes with images')
		nodeToUsedId: dict[str, str] = {} # Maps name of node to otol ID of node to use image for
		query = 'SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name'
		for name, otolId in dbCur.execute(query):
			nodeToUsedId[name] = otolId
		print(f'Found {len(nodeToUsedId)}')
		#
		print('Getting node depths')
		nodeToDepth: dict[str, int] = {}
		maxDepth = 0
		nodeToParent: dict[str, str | None] = {} # Maps name of node to name of parent
		for imgNodeName in nodeToUsedId:
			nodeName = imgNodeName
			nodeChain = [nodeName]
			lastDepth = 0
			# Add ancestors, stopping at the root or at an ancestor whose depth is known
			while True:
				row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone()
				if row is None:
					nodeToParent[nodeName] = None
					break
				nodeToParent[nodeName] = row[0]
				nodeName = row[0]
				nodeChain.append(nodeName)
				if nodeName in nodeToDepth:
					lastDepth = nodeToDepth[nodeName]
					break
			# Add depths (the chain's last node has depth 'lastDepth')
			for i, chainNode in enumerate(reversed(nodeChain)):
				nodeToDepth[chainNode] = i + lastDepth
			maxDepth = max(maxDepth, lastDepth + len(nodeChain) - 1)
		#
		print('Finding ancestors to give linked images')
		depthToNodes: dict[int, list[str]] = {depth: [] for depth in range(maxDepth + 1)}
		for nodeName, depth in nodeToDepth.items():
			depthToNodes[depth].append(nodeName)
		parentToCandidate: dict[str, tuple[str, int]] = {} # Maps parent node name to candidate child name and tips-val
		iterNum = 0
		# Go from the deepest nodes upward, so a node's candidate is final before the node is visited
		for depth in range(maxDepth, -1, -1):
			for node in depthToNodes[depth]:
				iterNum += 1
				if iterNum % 10000 == 0:
					print(f'At iteration {iterNum}')
				#
				if node in parentToCandidate:
					nodeToUsedId[node] = nodeToUsedId[parentToCandidate[node][0]]
					dbCur.execute('INSERT INTO linked_imgs VALUES (?, ?)', (node, nodeToUsedId[node]))
				parent = nodeToParent[node]
				if parent is not None and parent not in nodeToUsedId:
					(tips,) = dbCur.execute('SELECT tips FROM nodes WHERE name == ?', (node,)).fetchone()
					if parent not in parentToCandidate or parentToCandidate[parent][1] < tips:
						parentToCandidate[parent] = (node, tips)
		#
		print('Replacing linked-images for compound nodes')
		for iterNum, compoundNode in enumerate(parentToCandidate, 1):
			if iterNum % 10000 == 0:
				print(f'At iteration {iterNum}')
			#
			match = COMPOUND_NAME_REGEX.fullmatch(compoundNode)
			if match is None:
				continue
			# Replace associated image with subname images
			subName1, subName2 = match.group(1, 2)
			otolIdPair = [nodeToUsedId.get(subName1, ''), nodeToUsedId.get(subName2, '')]
			# Use no image if both subimages not found
			if otolIdPair[0] == '' and otolIdPair[1] == '':
				dbCur.execute('DELETE FROM linked_imgs WHERE name = ?', (compoundNode,))
				continue
			# Add to db
			dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', (','.join(otolIdPair), compoundNode))
			# Possibly repeat operation upon parent/ancestors
			if UP_PROPAGATE_COMPOUND_IMGS:
				node = compoundNode
				while True:
					parent = nodeToParent[node]
					if parent is not None:
						(tips,) = dbCur.execute('SELECT tips from nodes WHERE name = ?', (node,)).fetchone()
						if parent in parentToCandidate and parentToCandidate[parent][1] <= tips:
							# Replace associated image
							dbCur.execute(
								'UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', (','.join(otolIdPair), parent))
							node = parent
							continue
					break
		#
		print('Closing database')
		dbCon.commit()

if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	genData(DB_FILE)
import csv
import gzip
import os
import sqlite3
from collections import defaultdict
from contextlib import closing

TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
WIKIDATA_DB = os.path.join('wikidata', 'taxon_srcs.db')
ENWIKI_DUMP_INDEX_DB = os.path.join('enwiki', 'dumpIndex.db')
PICKED_MAPPINGS = {
	'eol': ['picked_eol_ids.txt'],
	'enwiki': ['picked_wiki_ids.txt', 'picked_wiki_ids_rough.txt']
}
DB_FILE = 'data.db'

OTOL_SRCS = ['ncbi', 'if', 'worms', 'irmng', 'gbif'] # Earlier sources will get higher priority
EOL_SRCS = {676: 'ncbi', 459: 'worms', 767: 'gbif'} # Maps external-source int-identifiers to names

def genData(
		taxonomyFile: str,
		eolIdsFile: str,
		wikidataDb: str,
		pickedMappings: dict[str, list[str]],
		enwikiDumpIndexDb: str,
		dbFile: str) -> None:
	""" Reads the files and enwiki db and creates the db.
		Adds the 'eol_ids', 'wiki_ids', and 'node_iucn' tables to the db at 'dbFile',
		which must already hold a 'nodes' table. """
	nodeToSrcIds: dict[int, dict[str, int]] = {} # Maps otol ID to {src1: id1, src2: id2, ...}
	usedSrcIds: set[tuple[str, int]] = set() # {(src1, id1), ...} (used to avoid storing IDs that won't be used)
	nodeToEolId: dict[int, int] = {} # Maps otol ID to eol ID
	nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
	titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
	titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
	# Get mappings from data input
	readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
	readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
	readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
	readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
	getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
	#
	print('Writing to db')
	with closing(sqlite3.connect(dbFile)) as dbCon:
		dbCur = dbCon.cursor()
		# Get otol id-to-name map
		otolIdToName: dict[int, str] = {}
		for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
			if nodeId.startswith('ott'):
				otolIdToName[int(nodeId[3:])] = nodeName
		# Add eol mappings
		dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
		dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
		for otolId, eolId in nodeToEolId.items():
			if otolId in otolIdToName:
				dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
		# Add enwiki mappings
		dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
		dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
		dbCur.execute('CREATE TABLE node_iucn (name TEXT PRIMARY KEY, iucn TEXT)')
		for otolId, title in nodeToWikiTitle.items():
			if otolId in otolIdToName and title in titleToPageId:
				dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
				if title in titleToIucnStatus:
					dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
		dbCon.commit()
def readTaxonomyFile(
		taxonomyFile: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]]) -> None:
	""" Reads taxonomy file, and maps OTOL node IDs to external-source IDs.
		Results are added to 'nodeToSrcIds' and 'usedSrcIds'. Only sources in OTOL_SRCS with
		decimal IDs are kept, and for each node the first ID listed for a source is the one used. """
	# The file has a header line, then lines that hold these fields (each is followed by a tab-pipe-tab sequence):
		# uid (otol-id, eg: 93302), parent_uid, name, rank,
		# sourceinfo (comma-separated source specifiers, eg: ncbi:2952,gbif:3207147), uniqueName, flags
	print('Reading taxonomy file')
	with open(taxonomyFile) as file: # Had about 4.5e6 lines
		for lineNum, line in enumerate(file, 1):
			if lineNum % 100000 == 0:
				print(f'At line {lineNum}')
			# Skip header line
			if lineNum == 1:
				continue
			# Parse line
			fields = line.split('\t|\t')
			try:
				otolId = int(fields[0])
			except ValueError:
				print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
				continue
			if len(fields) < 5:
				print(f'Skipping line {lineNum}, which has no sourceinfo field')
				continue
			# Add source IDs
			for srcPair in fields[4].split(','):
				# An empty sourceinfo field, or a specifier without a ':', leaves 'srcIdStr' empty,
				# and so gets skipped by the isdecimal() check instead of raising
				src, _, srcIdStr = srcPair.partition(':')
				if srcIdStr.isdecimal() and src in OTOL_SRCS:
					if otolId not in nodeToSrcIds:
						nodeToSrcIds[otolId] = {}
					elif src in nodeToSrcIds[otolId]:
						continue
					srcId = int(srcIdStr)
					nodeToSrcIds[otolId][src] = srcId
					usedSrcIds.add((src, srcId))
	print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
def readEolIdsFile(
		eolIdsFile: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]],
		nodeToEolId: dict[int, int]) -> None:
	""" Reads EOL provider IDs file, and maps EOL IDs to external-source IDs.
		Then, for each otol ID in 'nodeToSrcIds', picks an EOL ID and adds it to 'nodeToEolId'. """
	# The file is a CSV with a header line, then lines that hold these fields:
		# node_id, resource_pk (ID from external source), resource_id (int denoting external-source),
		# page_id (eol ID), preferred_canonical_for_page
	print('Reading EOL provider IDs file')
	srcToEolId: dict[str, dict[int, int]] = {src: {} for src in EOL_SRCS.values()} # Maps src1 to {id1: eolId1, ...}
	with gzip.open(eolIdsFile, mode='rt') as file: # Had about 13e6 lines
		for lineNum, row in enumerate(csv.reader(file), 1):
			if lineNum % 1000000 == 0:
				print(f'At line {lineNum}')
			# Skip header line
			if lineNum == 1:
				continue
			# Parse line
			eolId = int(row[3])
			srcInt = int(row[2])
			srcIdStr = row[1]
			if srcIdStr.isdecimal() and srcInt in EOL_SRCS:
				srcId = int(srcIdStr)
				src = EOL_SRCS[srcInt]
				if (src, srcId) not in usedSrcIds:
					continue
				if srcId in srcToEolId[src]:
					# Keep the first EOL ID found for a source ID
					print(f'Found {src} ID {srcId} with multiple EOL IDs {srcToEolId[src][srcId]} and {eolId}')
					continue
				srcToEolId[src][srcId] = eolId
	print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
		# Was about 3.5e6 (4.2e6 without usedSrcIds)
	#
	print('Resolving candidate EOL IDs')
	# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
	for otolId, srcInfo in nodeToSrcIds.items():
		eolIdToCount: dict[int, int] = defaultdict(int)
		for src, srcId in srcInfo.items():
			if src in srcToEolId and srcId in srcToEolId[src]:
				eolId = srcToEolId[src][srcId]
				eolIdToCount[eolId] += 1
		if eolIdToCount:
			# Prefer candidates with the most sources, and break ties by picking the lowest
			maxCount = max(eolIdToCount.values())
			nodeToEolId[otolId] = min(eolId for eolId, count in eolIdToCount.items() if count == maxCount)
	print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
def readWikidataDb(
		wikidataDb: str,
		nodeToSrcIds: dict[int, dict[str, int]],
		usedSrcIds: set[tuple[str, int]],
		nodeToWikiTitle: dict[int, str],
		titleToIucnStatus: dict[str, str],
		nodeToEolId: dict[int, int]) -> None:
	""" Reads db holding ID and IUCN mappings from wikidata, and maps otol IDs to Wikipedia titles and EOL IDs.
		Results are added to 'nodeToWikiTitle', 'titleToIucnStatus', and 'nodeToEolId'
		(EOL IDs are only added for otol IDs that don't have one yet). """
	print('Reading from Wikidata db')
	srcToWikiTitle: dict[str, dict[int, str]] = defaultdict(dict) # Maps 'eol'/etc to {srcId1: title1, ...}
	wikiTitles = set()
	with closing(sqlite3.connect(wikidataDb)) as dbCon:
		dbCur = dbCon.cursor()
		for src, srcId, title in dbCur.execute('SELECT src, id, title from src_id_to_title'):
			if (src, srcId) in usedSrcIds or src == 'eol': # Keep EOL IDs for later use
				srcToWikiTitle[src][srcId] = title
				wikiTitles.add(title)
		for title, status in dbCur.execute('SELECT title, status from title_iucn'):
			if title in wikiTitles:
				titleToIucnStatus[title] = status
	print(f'- Source-to-title map has {sum([len(v) for v in srcToWikiTitle.values()]):,} entries')
		# Was about 1.1e6 (1.2e6 without usedSrcIds)
	print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
	#
	print('Resolving candidate Wikidata items')
	# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
	for otolId, srcInfo in nodeToSrcIds.items():
		titleToSrcs: dict[str, list[str]] = defaultdict(list) # Maps candidate titles to list of sources
		for src, srcId in srcInfo.items():
			if src in srcToWikiTitle and srcId in srcToWikiTitle[src]:
				title = srcToWikiTitle[src][srcId]
				titleToSrcs[title].append(src)
		# Choose title to use
		if len(titleToSrcs) == 1:
			nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
		elif len(titleToSrcs) > 1: # Test example: otol ID 621052
			# Get titles with most sources
			maxSrcCnt = max([len(srcs) for srcs in titleToSrcs.values()])
			titleToSrcs = {t: s for t, s in titleToSrcs.items() if len(s) == maxSrcCnt}
			if len(titleToSrcs) == 1:
				nodeToWikiTitle[otolId] = list(titleToSrcs)[0]
			else:
				# Get a title with a source with highest priority
				srcToTitle = {s: t for t in titleToSrcs for s in titleToSrcs[t]}
				for src in OTOL_SRCS:
					if src in srcToTitle:
						nodeToWikiTitle[otolId] = srcToTitle[src]
						break
	print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
	#
	print('Adding extra EOL mappings from Wikidata')
	wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
	addedEntries: dict[int, int] = {}
	for eolId, title in srcToWikiTitle['eol'].items():
		if title in wikiTitleToNode:
			otolId = wikiTitleToNode[title]
			if otolId not in nodeToEolId: # Only add if the otol ID has no EOL ID
				nodeToEolId[otolId] = eolId
				addedEntries[otolId] = eolId
	print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
def readPickedMappings(
		pickedMappings: dict[str, list[str]],
		nodeToEolId: dict[int, int],
		nodeToWikiTitle: dict[int, str]) -> None:
	""" Read mappings from OTOL IDs to EOL IDs and Wikipedia titles.
		'pickedMappings' maps 'eol' and 'enwiki' to lists of filenames (missing files are skipped).
		Each line has the form '<otol-id>|<value>', where an empty value removes any existing mapping. """
	print('Reading picked mappings')
	for src in pickedMappings:
		for filename in pickedMappings[src]:
			if not os.path.exists(filename):
				continue
			with open(filename) as file:
				for line in file:
					line = line.rstrip()
					if not line: # Tolerate blank lines
						continue
					otolIdStr, mappedVal = line.split('|')
					otolId = int(otolIdStr)
					if src == 'eol':
						if mappedVal:
							nodeToEolId[otolId] = int(mappedVal)
						else:
							nodeToEolId.pop(otolId, None)
					else: # src == 'enwiki'
						if mappedVal:
							nodeToWikiTitle[otolId] = mappedVal
						else:
							nodeToWikiTitle.pop(otolId, None)
def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
	""" Read a db for mappings from enwiki titles to page IDs.
		Looks up each title in 'nodeToWikiTitle' in the db's 'offsets' table, and adds matches to 'titleToPageId'. """
	print('Getting enwiki page IDs')
	numNotFound = 0
	with closing(sqlite3.connect(enwikiDumpIndexDb)) as dbCon:
		dbCur = dbCon.cursor()
		for title in nodeToWikiTitle.values():
			record = dbCur.execute('SELECT id FROM offsets WHERE title = ?', (title,)).fetchone()
			if record is not None:
				titleToPageId[title] = record[0]
			else:
				numNotFound += 1
	print(f'Unable to find IDs for {numNotFound} titles') # Was 2913

if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)
import csv
import html
import os
import re
import sqlite3
from contextlib import closing

EOL_NAMES_FILE = os.path.join('eol', 'vernacularNames.csv')
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
PICKED_NAMES_FILE = 'picked_names.txt'
DB_FILE = 'data.db'

def genData(eolNamesFile: str, enwikiDb: str, pickedNamesFile: str, dbFile: str) -> None:
	""" Reads the files and adds to db.
		Creates the 'names' table in the db at 'dbFile' (which must already hold the 'nodes',
		'eol_ids', and 'wiki_ids' tables), then adds EOL, enwiki, and picked names, in that order. """
	with closing(sqlite3.connect(dbFile)) as dbCon:
		dbCur = dbCon.cursor()
		#
		print('Creating table')
		dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
		dbCur.execute('CREATE INDEX names_idx ON names(name)')
		dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
		dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
		#
		print('Getting node mappings')
		nodeToTips: dict[str, int] = {}
		for name, tips in dbCur.execute('SELECT name, tips from nodes'):
			nodeToTips[name] = tips
		#
		addEolNames(eolNamesFile, nodeToTips, dbCur)
		addEnwikiNames(enwikiDb, nodeToTips, dbCur)
		addPickedNames(pickedNamesFile, nodeToTips, dbCur)
		#
		print('Closing database')
		dbCon.commit()
def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
	""" Reads EOL names, associates them with otol nodes, and writes to db """
	# The CSV file has a header line, then lines with these fields:
		# page_id, canonical_form (canonical name, not always unique to page ID),
		# vernacular_string (vernacular name), language_code,
		# resource_name, is_preferred_by_resource, is_preferred_by_eol
	print('Getting EOL mappings')
	eolIdToNode: dict[int, str] = {} # Maps eol ID to node name (if there are multiple, choose one with most tips)
	for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
		if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
			eolIdToNode[eolId] = name
	print('Adding names from EOL')
	namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
	with open(eolNamesFile, newline='') as file:
		for lineNum, fields in enumerate(csv.reader(file), 1):
			if lineNum % 100000 == 0:
				print(f'At line {lineNum}') # Reached about 2.8e6
			# Skip header line
			if lineNum == 1:
				continue
			# Parse line
			eolId = int(fields[0])
			name = html.unescape(fields[2]).lower()
			lang = fields[3]
			isPreferred = 1 if fields[6] == 'preferred' else 0
			# Add to db
			if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
				and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
				cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')'
					# The 'OR IGNORE' accounts for duplicate lines
				dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
	""" Reads enwiki names, associates them with otol nodes, and writes to db.
		The names used are titles of pages that redirect to a node's page. """
	print('Getting enwiki mappings')
	wikiIdToNode: dict[int, str] = {} # Maps wiki ID to node name (if there are multiple, choose one with most tips)
	for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
		if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
			wikiIdToNode[wikiId] = name
	print('Adding names from enwiki')
	altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
	query = 'SELECT p1.title FROM pages p1' \
		' INNER JOIN redirects r1 ON p1.id = r1.id' \
		' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
	with closing(sqlite3.connect(enwikiDb)) as enwikiCon: # Was previously never closed
		enwikiCur = enwikiCon.cursor()
		for iterNum, (wikiId, nodeName) in enumerate(wikiIdToNode.items(), 1):
			if iterNum % 10000 == 0:
				print(f'At iteration {iterNum}') # Reached about 3.6e5
			#
			for (name,) in enwikiCur.execute(query, (wikiId,)):
				name = name.lower()
				if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips:
					dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
	""" Reads picked names (if the file exists), and applies them to the 'names' table.
		Lines are lowercased before use, and lines that name an unknown node are skipped. """
	# File format:
		# nodename1|altName1|isPreferred1 -> Add an alt-name
		# nodename1|altName1| -> Remove an alt-name
		# nodename1|nodeName1| -> Remove any preferred-alt status
	if os.path.exists(pickedNamesFile):
		print('Getting picked names')
		with open(pickedNamesFile) as file:
			for line in file:
				line = line.lower().rstrip()
				if not line: # Tolerate blank lines
					continue
				nodeName, altName, isPreferredStr = line.split('|')
				if nodeName not in nodeToTips:
					print(f'Skipping "{nodeName}", as no such node exists')
					continue
				if isPreferredStr:
					isPreferred = 1 if isPreferredStr == '1' else 0
					if isPreferred == 1:
						# Remove any existing preferred-alt status
							# This must cover all of the node's alt-names. Restricting it to 'altName' would
							# only touch the record deleted below, leaving other preferred names in place.
						cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND pref_alt = 1'
						dbCur.execute(cmd, (nodeName,))
					# Remove any existing record
					dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
					# Add record
					dbCur.execute('INSERT INTO names VALUES (?, ?, ?, \'picked\')', (nodeName, altName, isPreferred))
				elif nodeName != altName: # Remove any matching record
					dbCur.execute('DELETE FROM names WHERE name = ? AND alt_name = ?', (nodeName, altName))
				else: # Remove any preferred-alt status
					cmd = 'UPDATE names SET pref_alt = 0 WHERE name = ? AND pref_alt = 1'
					dbCur.execute(cmd, (nodeName,))

if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	genData(EOL_NAMES_FILE, ENWIKI_DB, PICKED_NAMES_FILE, DB_FILE)
import json
import os
import re
import sqlite3
from contextlib import closing

TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes
ANN_FILE = os.path.join('otol', 'annotations.json')
DB_FILE = 'data.db'
PICKED_NAMES_FILE = 'picked_otol_names.txt'

class Node:
	""" Represents a tree-of-life node """
	def __init__(self, name, childIds, parentId, tips, pSupport):
		self.name = name
		self.childIds = childIds
		self.parentId = parentId
		self.tips = tips
		self.pSupport = pSupport
class BasicStream:
	""" Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """
	def __init__(self, data, idx=0):
		self.data = data
		self.idx = idx
	def hasNext(self) -> bool:
		""" Returns True if there are characters left to read """
		return self.idx < len(self.data)
	def next(self) -> str:
		""" Consumes and returns the next character, or returns '' at the end of the data """
		if self.hasNext():
			char = self.data[self.idx]
			self.idx += 1
			return char
		else:
			return ''
	def peek(self) -> str:
		""" Returns the next character without consuming it, or '' at the end of the data """
		if self.hasNext():
			return self.data[self.idx]
		else:
			return ''
	def skipWhitespace(self) -> None:
		""" Consumes characters up to the next non-whitespace one """
		while self.hasNext() and self.data[self.idx].isspace():
			self.idx += 1
	def progress(self) -> float:
		""" Returns the fraction of the data that has been consumed (1.0 for empty data) """
		if not self.data: # Avoid dividing by zero
			return 1.0
		return (self.idx / len(self.data))

def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None:
	""" Reads the files and stores the tree info.
		Creates the 'nodes' and 'edges' tables in the db at 'dbFile'.
		'pickedNamesFile' is optional, and holds lines of the form '<name>|<otol-id>'. """
	nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
	nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
	dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
	#
	print('Parsing tree file')
	with open(treeFile) as file:
		treeStream = BasicStream(file.read())
	# Parse content
	parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds)
	print('Resolving duplicate names')
	# Read picked-names file
	nameToPickedId: dict[str, str] = {}
	if os.path.exists(pickedNamesFile):
		with open(pickedNamesFile) as file:
			for line in file:
				name, _, otolId = line.strip().partition('|')
				nameToPickedId[name] = otolId
	# Resolve duplicates
	for dupName, dupIds in dupNameToIds.items():
		# Check for picked id
		if dupName in nameToPickedId:
			idToUse = nameToPickedId[dupName]
		else:
			# Get conflicting node with most tips (the first such node, if there are ties)
			idToUse = max(dupIds, key=lambda dupId: nodeMap[dupId].tips)
		# Adjust name of other conflicting nodes
		counter = 2
		for dupId in dupIds:
			if dupId != idToUse:
				nodeMap[dupId].name += f' [{counter}]'
				counter += 1
	print('Changing mrca* names')
	for nodeId, node in nodeMap.items():
		if node.name.startswith('mrca'):
			convertMrcaName(nodeId, nodeMap)
	print('Parsing annotations file')
	# Read file
	with open(annFile) as file:
		nodeAnnsMap = json.load(file)['nodes']
	# Find relevant annotations
	for nodeId, node in nodeMap.items():
		# Set has-support value using annotations
		if nodeId in nodeAnnsMap:
			nodeAnns = nodeAnnsMap[nodeId]
			supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
			conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
			node.pSupport = supportQty > 0 and conflictQty == 0
	print('Creating nodes and edges tables')
	with closing(sqlite3.connect(dbFile)) as dbCon:
		dbCur = dbCon.cursor()
		dbCur.execute('CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)')
		dbCur.execute('CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)')
		dbCur.execute('CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))')
		dbCur.execute('CREATE INDEX edges_child_idx ON edges(child)')
		for otolId, node in nodeMap.items():
			dbCur.execute('INSERT INTO nodes VALUES (?, ?, ?)', (node.name, otolId, node.tips))
			for childId in node.childIds:
				childNode = nodeMap[childId]
				dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
					(node.name, childNode.name, 1 if childNode.pSupport else 0))
		print('Closing database')
		dbCon.commit()
def parseNewick(
		stream: BasicStream,
		nodeMap: dict[str, Node],
		nameToFirstId: dict[str, str],
		dupNameToIds: dict[str, list[str]]) -> str:
	""" Parses a node (and its descendants) from 'stream', updates nodeMap and
		the name maps accordingly, and returns the node's ID.
		Raises an Exception upon unexpected end-of-input, or other malformed input. """
	if stream.idx % 1e5 == 0:
		print(f'Progress: {stream.progress() * 100:.2f}%')
	# Find node
	stream.skipWhitespace()
	if stream.peek() == '':
		raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}')
	elif stream.peek() == '(': # Start of inner node
		stream.next()
		childIds: list[str] = []
		while True:
			# Read child
			childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds)
			childIds.append(childId)
			# Check for next child or end of node
			stream.skipWhitespace()
			if stream.peek() == '':
				raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}')
			elif stream.peek() == ',': # Expect another child
				stream.next()
				continue
			else: # End of child list
				if stream.peek() != ')':
					raise Exception(f'ERROR: Expected \')\' at index {stream.idx}')
				# Get node name and id
				stream.next() # Consume the ')'
				stream.skipWhitespace()
				name, id = parseNewickName(stream)
				updateNameMaps(name, id, nameToFirstId, dupNameToIds)
				# Get child num-tips total
				tips = sum(nodeMap[childId].tips for childId in childIds)
				# Add node to nodeMap
				nodeMap[id] = Node(name, childIds, None, tips, False)
				# Update children's parent reference
				for childId in childIds:
					nodeMap[childId].parentId = id
				return id
	else: # Parse node name
		name, id = parseNewickName(stream)
		updateNameMaps(name, id, nameToFirstId, dupNameToIds)
		nodeMap[id] = Node(name, [], None, 1, False)
		return id
def parseNewickName(stream: BasicStream) -> tuple[str, str]:
	""" Parses a node name from 'stream', and returns a (name, id) pair.
		Names are lowercased. For 'mrca*' names, the name is also used as the ID.
		Raises an Exception upon unexpected end-of-input, or a name without a valid 'ott' ID. """
	nameChars = []
	if stream.peek() == '':
		raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}')
	elif stream.peek() == "'": # Quoted name
		nameChars.append(stream.next())
		while True:
			if stream.peek() == '':
				raise Exception(f'ERROR: Unexpected EOF at index {stream.idx}')
			elif stream.peek() == "'":
				nameChars.append(stream.next())
				if stream.peek() == "'": # '' is escaped-quote
					nameChars.append(stream.next())
					continue
				break
			nameChars.append(stream.next())
	else:
		while stream.hasNext() and stream.peek() not in '(),;':
			nameChars.append(stream.next())
	if stream.peek() == ';': # Ignore trailing input semicolon
		stream.next()
	# Convert to (name, id)
	name = ''.join(nameChars).rstrip().lower()
	if name.startswith('mrca'):
		return (name, name)
	elif name.startswith("'"): # Using startswith() means an empty name gets reported as invalid below
		match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
		if match is None:
			raise Exception(f'ERROR: invalid name \'{name}\'')
		name = match.group(1).replace("''", "'")
		return (name, match.group(2))
	else:
		match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
		if match is None:
			raise Exception(f'ERROR: invalid name \'{name}\'')
		return (match.group(1).replace('_', ' '), match.group(2))
def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None:
	""" Update maps upon a newly parsed name.
		The first ID seen for a name goes into 'nameToFirstId', and once a name has been
		seen more than once, all of its IDs are listed in 'dupNameToIds'. """
	if name not in nameToFirstId:
		nameToFirstId[name] = id
	elif name not in dupNameToIds:
		dupNameToIds[name] = [nameToFirstId[name], id]
	else:
		dupNameToIds[name].append(id)
def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
	""" Update a node in a tree to be named after 2 descendants.
		Returns the name of one such descendant, for use during recursion.
		Raises an Exception if the node has fewer than 2 children. """
	node = nodeMap[id]
	name = node.name
	childIds = node.childIds
	if len(childIds) < 2:
		raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children')
	# Get 2 children with most tips (the sort is stable, so earlier children win ties)
	childId1, childId2 = sorted(childIds, key=lambda childId: nodeMap[childId].tips, reverse=True)[:2]
	childName1 = nodeMap[childId1].name
	childName2 = nodeMap[childId2].name
	# Check for mrca* child names
	if childName1.startswith('mrca'):
		childName1 = convertMrcaName(childId1, nodeMap)
	if childName2.startswith('mrca'):
		childName2 = convertMrcaName(childId2, nodeMap)
	# Check for composite names
	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
	if match is not None:
		childName1 = match.group(1)
	match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
	if match is not None:
		childName2 = match.group(1)
	# Create composite name
	node.name = f'[{childName1} + {childName2}]'
	return childName1

if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	parser.parse_args()
	#
	genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)
+""" + +import os, sqlite3 + +PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db') +DB_FILE = 'data.db' + +def genData(pageviewsDb: str, dbFile: str) -> None: + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Getting view counts') + pdbCon = sqlite3.connect(pageviewsDb) + pdbCur = pdbCon.cursor() + nodeToViews: dict[str, int] = {} # Maps node names to counts + iterNum = 0 + for wikiId, views in pdbCur.execute('SELECT id, views from views'): + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') # Reached 1.6e6 + # + row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone() + if row is not None: + nodeToViews[row[0]] = views + pdbCon.close() + # + print(f'Writing {len(nodeToViews)} entries to db') + dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)') + for nodeName, views in nodeToViews.items(): + dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views)) + # + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # + genData(PAGEVIEWS_DB, DB_FILE) diff --git a/backend/tolData/genReducedTrees.py b/backend/tol_data/gen_reduced_trees.py index 66fef40..3742544 100755 --- a/backend/tolData/genReducedTrees.py +++ b/backend/tol_data/gen_reduced_trees.py @@ -1,10 +1,6 @@ #!/usr/bin/python3 -import sys, re -import sqlite3 - -import argparse -parser = argparse.ArgumentParser(description=""" +""" Creates reduced versions of the tree in the database: - A 'picked nodes' tree: Created from a minimal set of node names read from a file, @@ -16,13 +12,14 @@ Creates reduced versions of the tree in the database: Created by removing nodes that lack an image or description, or presence in the 'picked' tree. And, for nodes with 'many' children, removing some more, despite any node descriptions. 
-""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree') -args = parser.parse_args() +""" + +import sys, re +import sqlite3 -tree = args.tree -dbFile = 'data.db' -pickedNodesFile = 'pickedNodes.txt' +DB_FILE = 'data.db' +PICKED_NODES_FILE = 'picked_nodes.txt' +# COMP_NAME_REGEX = re.compile(r'\[.+ \+ .+]') # Used to recognise composite nodes class Node: @@ -33,12 +30,69 @@ class Node: self.tips = tips self.pSupport = pSupport -print('Opening database') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - +def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None: + print('Opening database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Finding root node') + query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1' + (rootName,) = dbCur.execute(query).fetchone() + print(f'Found \'{rootName}\'') + # + print('=== Getting picked-nodes ===') + pickedNames: set[str] = set() + pickedTreeExists = False + if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="nodes_p"').fetchone() is None: + print(f'Reading from {pickedNodesFile}') + with open(pickedNodesFile) as file: + for line in file: + name = line.rstrip() + row = dbCur.execute('SELECT name from nodes WHERE name = ?', (name,)).fetchone() + if row is None: + row = dbCur.execute('SELECT name from names WHERE alt_name = ?', (name,)).fetchone() + if row is not None: + pickedNames.add(row[0]) + if not pickedNames: + raise Exception('ERROR: No picked names found') + else: + pickedTreeExists = True + print('Picked-node tree already exists') + if tree == 'picked': + sys.exit() + for (name,) in dbCur.execute('SELECT name FROM nodes_p'): + pickedNames.add(name) + print(f'Found {len(pickedNames)} names') + # + if (tree == 'picked' or tree is None) and not pickedTreeExists: + print('=== Generating 
picked-nodes tree ===') + genPickedNodeTree(dbCur, pickedNames, rootName) + if tree != 'picked': + print('=== Finding \'non-low significance\' nodes ===') + nodesWithImgOrPicked: set[str] = set() + nodesWithImgDescOrPicked: set[str] = set() + print('Finding nodes with descs') + for (name,) in dbCur.execute('SELECT name FROM wiki_ids INNER JOIN descs ON wiki_ids.id = descs.wiki_id'): + nodesWithImgDescOrPicked.add(name) + print('Finding nodes with images') + for (name,) in dbCur.execute('SELECT name FROM node_imgs'): + nodesWithImgDescOrPicked.add(name) + nodesWithImgOrPicked.add(name) + print('Adding picked nodes') + for name in pickedNames: + nodesWithImgDescOrPicked.add(name) + nodesWithImgOrPicked.add(name) + if tree == 'images' or tree is None: + print('=== Generating images-only tree ===') + genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName) + if tree == 'trimmed' or tree is None: + print('=== Generating weakly-trimmed tree ===') + genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName) + # + print('Closing database') + dbCon.commit() + dbCon.close() def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: str) -> None: - global COMP_NAME_REGEX PREF_NUM_CHILDREN = 3 # Include extra children up to this limit print('Getting ancestors') nodeMap = genNodeMap(dbCur, pickedNames, 100) @@ -175,7 +229,6 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) - return nodeMap def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]: """ Given a tree, removes composite-name nodes, and returns the removed nodes' names """ - global COMP_NAME_REGEX namesToRemove: set[str] = set() for name, node in nodeMap.items(): parent = node.parent @@ -275,60 +328,10 @@ def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str): pSupport = 1 if nodeMap[childName].pSupport else 0 dbCur.execute(f'INSERT INTO {edgesTbl} VALUES (?, ?, ?)', (name, childName, pSupport)) 
-print('Finding root node') -query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1' -(rootName,) = dbCur.execute(query).fetchone() -print(f'Found \'{rootName}\'') - -print('=== Getting picked-nodes ===') -pickedNames: set[str] = set() -pickedTreeExists = False -if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="nodes_p"').fetchone() is None: - print(f'Reading from {pickedNodesFile}') - with open(pickedNodesFile) as file: - for line in file: - name = line.rstrip() - row = dbCur.execute('SELECT name from nodes WHERE name = ?', (name,)).fetchone() - if row is None: - row = dbCur.execute('SELECT name from names WHERE alt_name = ?', (name,)).fetchone() - if row is not None: - pickedNames.add(row[0]) - if not pickedNames: - raise Exception('ERROR: No picked names found') -else: - pickedTreeExists = True - print('Picked-node tree already exists') - if tree == 'picked': - sys.exit() - for (name,) in dbCur.execute('SELECT name FROM nodes_p'): - pickedNames.add(name) -print(f'Found {len(pickedNames)} names') - -if (tree == 'picked' or tree is None) and not pickedTreeExists: - print('=== Generating picked-nodes tree ===') - genPickedNodeTree(dbCur, pickedNames, rootName) -if tree != 'picked': - print('=== Finding \'non-low significance\' nodes ===') - nodesWithImgOrPicked: set[str] = set() - nodesWithImgDescOrPicked: set[str] = set() - print('Finding nodes with descs') - for (name,) in dbCur.execute('SELECT name FROM wiki_ids'): # Can assume the wiki_id has a desc - nodesWithImgDescOrPicked.add(name) - print('Finding nodes with images') - for (name,) in dbCur.execute('SELECT name FROM node_imgs'): - nodesWithImgDescOrPicked.add(name) - nodesWithImgOrPicked.add(name) - print('Adding picked nodes') - for name in pickedNames: - nodesWithImgDescOrPicked.add(name) - nodesWithImgOrPicked.add(name) - if tree == 'images' or tree is None: - print('=== Generating images-only tree ===') - 
genImagesOnlyTree(dbCur, nodesWithImgOrPicked, pickedNames, rootName) - if tree == 'trimmed' or tree is None: - print('=== Generating weakly-trimmed tree ===') - genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName) - -print('Closing database') -dbCon.commit() -dbCon.close() +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree') + args = parser.parse_args() + # + genData(args.tree, DB_FILE, PICKED_NODES_FILE) diff --git a/backend/tolData/otol/README.md b/backend/tol_data/otol/README.md index e018369..e018369 100644 --- a/backend/tolData/otol/README.md +++ b/backend/tol_data/otol/README.md diff --git a/backend/tolData/pickedImgs/README.md b/backend/tol_data/picked_imgs/README.md index dfe192b..1edd951 100644 --- a/backend/tolData/pickedImgs/README.md +++ b/backend/tol_data/picked_imgs/README.md @@ -4,7 +4,7 @@ on top of those from EOL and Wikipedia. Possible Files ============== - (Image files) -- imgData.txt <br> +- img_data.txt <br> Contains lines with the format `filename|url|license|artist|credit`. The filename should consist of a node name, with an image extension. Other fields correspond to those in the `images` table (see ../README.md). diff --git a/backend/tolData/reviewImgsToGen.py b/backend/tol_data/review_imgs_to_gen.py index f3791bc..2283ed7 100755 --- a/backend/tolData/reviewImgsToGen.py +++ b/backend/tol_data/review_imgs_to_gen.py @@ -1,14 +1,6 @@ #!/usr/bin/python3 -import os, time -import sqlite3 -import tkinter as tki -from tkinter import ttk -import PIL -from PIL import ImageTk, Image, ImageOps - -import argparse -parser = argparse.ArgumentParser(description=""" +""" Provides a GUI that displays, for each node in the database, associated images from EOL and Wikipedia, and allows choosing which to use. 
Writes choice data to a text file with lines of the form 'otolId1 imgPath1', or @@ -17,69 +9,27 @@ choice data to a text file with lines of the form 'otolId1 imgPath1', or The program can be closed, and run again to continue from the last choice. The program looks for an existing output file to determine what choices have already been made. -""", formatter_class=argparse.RawDescriptionHelpFormatter) -parser.parse_args() +""" + +import os, time +import sqlite3 +import tkinter as tki +from tkinter import ttk +import PIL +from PIL import ImageTk, Image, ImageOps -eolImgDir = 'eol/imgs/' -enwikiImgDir = 'enwiki/imgs/' -dbFile = 'data.db' -outFile = 'imgList.txt' +EOL_IMG_DIR = os.path.join('eol', 'imgs') +ENWIKI_IMG_DIR = os.path.join('enwiki', 'imgs') +DB_FILE = 'data.db' +OUT_FILE = 'img_list.txt' +# IMG_DISPLAY_SZ = 400 PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) -onlyReviewPairs = True - -print('Opening database') -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() - -nodeToImgs: dict[str, list[str]] = {} # Maps otol-ids to arrays of image paths -print('Iterating through images from EOL') -if os.path.exists(eolImgDir): - for filename in os.listdir(eolImgDir): - # Get associated EOL ID - eolId, _, _ = filename.partition(' ') - query = 'SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name WHERE eol_ids.id = ?' 
- # Get associated node IDs - found = False - for (otolId,) in dbCur.execute(query, (int(eolId),)): - if otolId not in nodeToImgs: - nodeToImgs[otolId] = [] - nodeToImgs[otolId].append(eolImgDir + filename) - found = True - if not found: - print(f'WARNING: No node found for {eolImgDir}{filename}') -print(f'Result: {len(nodeToImgs)} nodes with images') -print('Iterating through images from Wikipedia') -if os.path.exists(enwikiImgDir): - for filename in os.listdir(enwikiImgDir): - # Get associated page ID - wikiId, _, _ = filename.partition('.') - # Get associated node IDs - query = 'SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id = ?' - found = False - for (otolId,) in dbCur.execute(query, (int(wikiId),)): - if otolId not in nodeToImgs: - nodeToImgs[otolId] = [] - nodeToImgs[otolId].append(enwikiImgDir + filename) - found = True - if not found: - print(f'WARNING: No node found for {enwikiImgDir}{filename}') -print(f'Result: {len(nodeToImgs)} nodes with images') -print('Filtering out already-made image choices') - -oldSz = len(nodeToImgs) -if os.path.exists(outFile): - with open(outFile) as file: - for line in file: - line = line.rstrip() - if ' ' in line: - line = line[:line.find(' ')] - del nodeToImgs[line] -print(f'Filtered out {oldSz - len(nodeToImgs)} entries') +REVIEW = 'only pairs' # Can be: 'all', 'only pairs', 'none' class ImgReviewer: """ Provides the GUI for reviewing images """ - def __init__(self, root, nodeToImgs): + def __init__(self, root, nodeToImgs, eolImgDir, enwikiImgDir, outFile, dbCon, review): self.root = root root.title('Image Reviewer') # Setup main frame @@ -108,6 +58,12 @@ class ImgReviewer: # Set fields self.nodeImgsList = list(nodeToImgs.items()) self.listIdx = -1 + self.eolImgDir = eolImgDir + self.enwikiImgDir = enwikiImgDir + self.outFile = outFile + self.review = review + self.dbCon = dbCon + self.dbCur = dbCon.cursor() self.otolId = None self.eolImgPath = None self.enwikiImgPath = None 
@@ -126,10 +82,14 @@ class ImgReviewer: return self.otolId, imgPaths = self.nodeImgsList[self.listIdx] # Potentially skip user choice - if onlyReviewPairs and len(imgPaths) == 1: - with open(outFile, 'a') as file: + if len(imgPaths) == 1 and (self.review == 'only pairs' or self.review == 'none'): + with open(self.outFile, 'a') as file: file.write(f'{self.otolId} {imgPaths[0]}\n') continue + elif self.review == 'none': + with open(self.outFile, 'a') as file: + file.write(f'{self.otolId} {imgPaths[-1]}\n') # Prefer enwiki image + continue break # Update displayed images self.eolImgPath = self.enwikiImgPath = None @@ -143,10 +103,10 @@ class ImgReviewer: print(f'UnidentifiedImageError for {imgPath}') imageOpenError = True continue - if imgPath.startswith('eol/'): + if imgPath.startswith(self.eolImgDir): self.eolImgPath = imgPath self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(img)) - elif imgPath.startswith('enwiki/'): + elif imgPath.startswith(self.enwikiImgDir): self.enwikiImgPath = imgPath self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(img)) else: @@ -172,7 +132,7 @@ class ImgReviewer: query = 'SELECT names.alt_name FROM' \ ' nodes INNER JOIN names ON nodes.name = names.name' \ ' WHERE nodes.id = ? 
and pref_alt = 1' - row = dbCur.execute(query, (self.otolId,)).fetchone() + row = self.dbCur.execute(query, (self.otolId,)).fetchone() if row is not None: title += f', aka {row[0]}' title += f' ({self.listIdx + 1} out of {len(self.nodeImgsList)})' @@ -183,24 +143,23 @@ class ImgReviewer: if imgPath is None: print('Invalid selection') return - with open(outFile, 'a') as file: + with open(self.outFile, 'a') as file: file.write(f'{self.otolId} {imgPath}\n') self.numReviewed += 1 self.getNextImgs() def reject(self): """"" React to a user rejecting all images of a set """ - with open(outFile, 'a') as file: + with open(self.outFile, 'a') as file: file.write(f'{self.otolId}\n') self.numReviewed += 1 self.getNextImgs() def quit(self, e = None): - global dbCon print(f'Number reviewed: {self.numReviewed}') timeElapsed = time.time() - self.startTime print(f'Time elapsed: {timeElapsed:.2f} seconds') if self.numReviewed > 0: print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds') - dbCon.close() + self.dbCon.close() self.root.destroy() def resizeImgForDisplay(self, img): """ Returns a copy of an image, shrunk to fit it's frame (keeps aspect ratio), and with a background """ @@ -216,8 +175,67 @@ class ImgReviewer: int((IMG_DISPLAY_SZ - img.width) / 2), int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg -# Create GUI and defer control -print('Starting GUI') -root = tki.Tk() -ImgReviewer(root, nodeToImgs) -root.mainloop() + +def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, review: str) -> None: + print('Opening database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + nodeToImgs: dict[str, list[str]] = {} # Maps otol-ids to arrays of image paths + print('Iterating through images from EOL') + if os.path.exists(eolImgDir): + for filename in os.listdir(eolImgDir): + # Get associated EOL ID + eolId, _, _ = filename.partition(' ') + query = 'SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name 
WHERE eol_ids.id = ?' + # Get associated node IDs + found = False + for (otolId,) in dbCur.execute(query, (int(eolId),)): + if otolId not in nodeToImgs: + nodeToImgs[otolId] = [] + nodeToImgs[otolId].append(os.path.join(eolImgDir, filename)) + found = True + if not found: + print(f'WARNING: No node found for {os.path.join(eolImgDir, filename)}') + print(f'Result: {len(nodeToImgs)} nodes with images') + print('Iterating through images from Wikipedia') + if os.path.exists(enwikiImgDir): + for filename in os.listdir(enwikiImgDir): + # Get associated page ID + wikiId, _, _ = filename.partition('.') + # Get associated node IDs + query = 'SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id = ?' + found = False + for (otolId,) in dbCur.execute(query, (int(wikiId),)): + if otolId not in nodeToImgs: + nodeToImgs[otolId] = [] + nodeToImgs[otolId].append(os.path.join(enwikiImgDir, filename)) + found = True + if not found: + print(f'WARNING: No node found for {os.path.join(enwikiImgDir, filename)}') + print(f'Result: {len(nodeToImgs)} nodes with images') + # + print('Filtering out already-made image choices') + oldSz = len(nodeToImgs) + if os.path.exists(outFile): + with open(outFile) as file: + for line in file: + line = line.rstrip() + if ' ' in line: + line = line[:line.find(' ')] + del nodeToImgs[line] + print(f'Filtered out {oldSz - len(nodeToImgs)} entries') + # + # Create GUI and defer control + print('Starting GUI') + root = tki.Tk() + ImgReviewer(root, nodeToImgs, eolImgDir, enwikiImgDir, outFile, dbCon, review) + root.mainloop() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + reviewImgs(EOL_IMG_DIR, ENWIKI_IMG_DIR, DB_FILE, OUT_FILE, REVIEW) diff --git a/backend/tolData/wikidata/README.md b/backend/tol_data/wikidata/README.md index db45b3c..7b3105e 100644 --- 
a/backend/tolData/wikidata/README.md +++ b/backend/tol_data/wikidata/README.md @@ -6,13 +6,13 @@ This directory holds files obtained via [Wikidata](https://www.wikidata.org/). Format info can be found at <https://doc.wikimedia.org/Wikibase/master/php/md_docs_topics_json.html>. # Other Files -- genTaxonSrcData.py <br> +- `gen_taxon_src_data.py` <br> Used to generate a database holding taxon information from the dump. -- offsets.dat <br> +- `offsets.dat` <br> Holds bzip2 block offsets for the dump. Generated and used by genTaxonSrcData.py for parallel processing of the dump. -- taxonSrcs.db <br> - Generated by genTaxonSrcData.py. <br> +- `taxon_srcs.db` <br> + Generated by `gen_taxon_src_data.py`. <br> Tables: <br> - `src_id_to_title`: `src TEXT, id INT, title TEXT, PRIMARY KEY(src, id)` - `title_iucn`: `title TEXT PRIMARY KEY, status TEXT` diff --git a/backend/tol_data/wikidata/__init__.py b/backend/tol_data/wikidata/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/backend/tol_data/wikidata/__init__.py diff --git a/backend/tolData/wikidata/genTaxonSrcData.py b/backend/tol_data/wikidata/gen_taxon_src_data.py index 5d10c71..50ed917 100755 --- a/backend/tolData/wikidata/genTaxonSrcData.py +++ b/backend/tol_data/wikidata/gen_taxon_src_data.py @@ -1,12 +1,6 @@ #!/usr/bin/python3 -import sys, os, re, math, io -from collections import defaultdict -import bz2, json, sqlite3 -import multiprocessing, indexed_bzip2, pickle, tempfile - -import argparse -parser = argparse.ArgumentParser(description=""" +""" Reads a wikidata JSON dump, looking for enwiki taxon items, and associated IDs from sources like GBIF/etc, and IUCN conservation status. Writes results into a database. @@ -14,11 +8,11 @@ into a database. The JSON dump contains an array of objects, each of which describes a Wikidata item item1, and takes up it's own line. 
- Getting item1's Wikidata ID: item1['id'] (eg: "Q144") -- Checking if item1 is a taxon: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['numeric-id'] == id1 +- Checking if item1 is a taxon: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['id'] == id1 'idx1' indexes an array of statements - 'id1' is a Wikidata ID denoting a taxon item type (eg: 310890 means 'monotypic taxon') -- Checking if item1 is a taxon-alt: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['numeric-id'] == id1 - 'id1' denotes a common-name-alternative item type (eg: 55983715 means 'organisms known by a particular common name') + 'id1' is a Wikidata ID denoting a taxon item type (eg: Q310890 means 'monotypic taxon') +- Checking if item1 is a taxon-alt: item1['claims']['P31'][idx1]['mainsnak']['datavalue']['value']['id'] == id1 + 'id1' denotes a common-name-alternative item type (eg: Q55983715 means 'organisms known by a particular common name') Getting the ID of the item that item1 is an alternative for: item1['claims']['P31'][idx1]['qualifiers']['P642'][idx2]['datavalue']['value']['numeric-id'] - Checking for an EOL/NCBI/etc ID: item['claims'][prop1][idx1]['mainsnak']['datavalue']['value'] (eg: "328672") @@ -27,8 +21,7 @@ Wikidata item item1, and takes up it's own line. Based on code from https://github.com/OneZoom/OZtree, located in OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022). -""", formatter_class=argparse.RawDescriptionHelpFormatter) -args = parser.parse_args() +""" # On Linux, running on the full dataset caused the processes to hang after processing. This was resolved by: # - Storing subprocess results in temp files. Apparently passing large objects through pipes can cause deadlock. @@ -37,10 +30,15 @@ args = parser.parse_args() # - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8). 
# Possibly related: https://github.com/python/cpython/issues/72882 -WD_FILE = 'latest-all.json.bz2' +import sys, os, re, math, io +from collections import defaultdict +import bz2, json, sqlite3 +import multiprocessing, indexed_bzip2, pickle, tempfile + +WIKIDATA_FILE = 'latest-all.json.bz2' OFFSETS_FILE = 'offsets.dat' -DB_FILE = 'taxonSrcs.db' -N_PROCS = 6 # Took about 3 hours (probably would've taken 6-12 with N_PROCS=1) +DB_FILE = 'taxon_srcs.db' +N_PROCS = 6 # Took about 3 hours with N_PROCS=6 # Wikidata entity IDs TAXON_IDS = ['Q16521', 'Q310890', 'Q23038290', 'Q713623'] # 'taxon', 'monotypic taxon', 'fossil taxon', 'clade' @@ -52,47 +50,50 @@ IUCN_STATUS_IDS = { 'Q237350': 'extinct species', 'Q3245245': 'data deficient' } # For filtering lines before parsing JSON -LINE_REGEX = re.compile(('"numeric-id":(?:' + '|'.join([s[1:] for s in TAXON_IDS + TAXON_ALT_IDS]) + ')\D').encode()) +LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")\D').encode()) -def main() -> None: +def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None: + """ Reads the dump and writes source/iucn info to db """ # Maps to populate srcIdToId: dict[str, dict[int, int]] = defaultdict(dict) # Maps 'eol'/etc to {srcId1: wikidataId1, ...} idToTitle: dict[int, str] = {} # Maps wikidata ID to enwiki title idToAltId: dict[int, int] = {} # Maps taxon-item wikidata ID to taxon-alt ID (eg: 'canis lupus familiaris' -> 'dog') idToIucnStatus: dict[int, str] = {} # Maps wikidata ID to iucn-status string ('least concern', etc) # Check db - if os.path.exists(DB_FILE): + if os.path.exists(dbFile): print('ERROR: Database already exists') sys.exit(1) # Read dump - if N_PROCS == 1: - with bz2.open(WD_FILE, mode='rb') as file: + if nProcs == 1: + with bz2.open(wikidataFile, mode='rb') as file: for lineNum, line in enumerate(file, 1): if lineNum % 1e4 == 0: print(f'At line {lineNum}') readDumpLine(line, srcIdToId, idToTitle, idToAltId, 
idToIucnStatus) else: - if not os.path.exists(OFFSETS_FILE): + if not os.path.exists(offsetsFile): print('Creating offsets file') # For indexed access for multiprocessing (creation took about 6.7 hours) - with indexed_bzip2.open(WD_FILE) as file: - with open(OFFSETS_FILE, 'wb') as file2: + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'wb') as file2: pickle.dump(file.block_offsets(), file2) print('Allocating file into chunks') fileSz: int # About 1.4 TB - with indexed_bzip2.open(WD_FILE) as file: - with open(OFFSETS_FILE, 'rb') as file2: + with indexed_bzip2.open(wikidataFile) as file: + with open(offsetsFile, 'rb') as file2: file.set_block_offsets(pickle.load(file2)) fileSz = file.seek(0, io.SEEK_END) - chunkSz = math.floor(fileSz / N_PROCS) - chunkIdxs = [-1] + [chunkSz * i for i in range(1, N_PROCS)] + [fileSz-1] + chunkSz = math.floor(fileSz / nProcs) + chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1] # Each adjacent pair specifies a start+end byte index for readDumpChunk() print(f'- Chunk size: {chunkSz:,}') print('Starting processes to read dump') with tempfile.TemporaryDirectory() as tempDirName: # Using maxtasksperchild=1 to free resources on task completion - with multiprocessing.Pool(processes=N_PROCS, maxtasksperchild=1) as pool: - for outFilename in pool.map(readDumpChunkOneParam, - ((i, chunkIdxs[i], chunkIdxs[i+1], f'{tempDirName}/{i}.pickle') for i in range(N_PROCS))): + with multiprocessing.Pool(processes=nProcs, maxtasksperchild=1) as pool: + for outFilename in pool.map( + readDumpChunkOneParam, + ((i, wikidataFile, offsetsFile, chunkIdxs[i], chunkIdxs[i+1], + os.path.join(tempDirName, f'{i}.pickle')) for i in range(nProcs))): # Get map data from subprocess output file with open(outFilename, 'rb') as file: maps = pickle.load(file) @@ -104,7 +105,7 @@ def main() -> None: idToIucnStatus.update(maps[3]) # print('Writing to db') - dbCon = sqlite3.connect(DB_FILE) + dbCon = sqlite3.connect(dbFile) dbCur = 
dbCon.cursor() dbCur.execute('CREATE TABLE src_id_to_title (src TEXT, id INT, title TEXT, PRIMARY KEY(src, id))') for src, submap in srcIdToId.items(): @@ -124,13 +125,8 @@ def main() -> None: continue dbCur.execute('INSERT OR IGNORE INTO title_iucn VALUES (?, ?)', (idToTitle[wId], status)) # The 'OR IGNORE' allows for multiple taxons using the same alt - #dbCur.execute('CREATE TABLE id_to_alt_title (id TEXT PRIMARY KEY, title TEXT, alt TEXT)') - #for wId, altId in idToAltId.items(): - # dbCur.execute('INSERT INTO id_to_alt_title VALUES (?, ?, ?)', - # (wId, idToTitle[wId] if wId in idToTitle else None, idToTitle[altId])) dbCon.commit() dbCon.close() - def readDumpLine( lineBytes: bytes, srcIdToId: dict[str, dict[int, int]], @@ -196,14 +192,13 @@ def readDumpLine( idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId] except KeyError: pass - -def readDumpChunkOneParam(params: tuple[int, int, int, str]) -> str: +def readDumpChunkOneParam(params: tuple[int, str, str, int, int, str]) -> str: """ Forwards to readDumpChunk(), for use with pool.map() """ return readDumpChunk(*params) - -# Reads lines in the dump that begin after a start-byte, and not after an end byte - # If startByte is -1, start at the first line -def readDumpChunk(procId: int, startByte: int, endByte: int, outFilename: str) -> str: +def readDumpChunk( + procId: int, wikidataFile: str, offsetsFile: str, startByte: int, endByte: int, outFilename: str) -> str: + """ Reads lines in the dump that begin after a start-byte, and not after an end byte. + If startByte is -1, start at the first line. 
""" # Maps to populate maps: tuple[ dict[str, dict[int, int]], @@ -211,9 +206,9 @@ def readDumpChunk(procId: int, startByte: int, endByte: int, outFilename: str) - dict[int, int], dict[int, str]] = (defaultdict(dict), {}, {}, {}) # Read dump - with indexed_bzip2.open(WD_FILE) as file: + with indexed_bzip2.open(wikidataFile) as file: # Load offsets file - with open(OFFSETS_FILE, 'rb') as file2: + with open(offsetsFile, 'rb') as file2: offsets = pickle.load(file2) file.set_block_offsets(offsets) # Seek to chunk @@ -236,5 +231,9 @@ def readDumpChunk(procId: int, startByte: int, endByte: int, outFilename: str) - return outFilename if __name__ == '__main__': # Guard needed for multiprocessing + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + args = parser.parse_args() + # multiprocessing.set_start_method('spawn') - main() + genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS) |
