aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-01-29 11:30:47 +1100
committerTerry Truong <terry06890@gmail.com>2023-01-29 11:30:47 +1100
commit8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
treeffd824aa9b945d69b47f012617ee13d98764d078
parentf5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)
Adjust backend coding style
Add line spacing, section comments, and import consistency
-rwxr-xr-xbackend/server.py7
-rw-r--r--backend/tests/common.py4
-rw-r--r--backend/tests/dbpedia/test_gen_desc_data.py5
-rw-r--r--backend/tests/enwiki/test_download_img_license_info.py8
-rw-r--r--backend/tests/enwiki/test_download_imgs.py5
-rw-r--r--backend/tests/enwiki/test_gen_desc_data.py4
-rw-r--r--backend/tests/enwiki/test_gen_dump_index_db.py7
-rw-r--r--backend/tests/enwiki/test_gen_img_data.py9
-rw-r--r--backend/tests/enwiki/test_gen_pageview_data.py6
-rw-r--r--backend/tests/eol/test_download_imgs.py8
-rw-r--r--backend/tests/eol/test_gen_images_list_db.py5
-rw-r--r--backend/tests/eol/test_review_imgs.py7
-rw-r--r--backend/tests/test_gen_desc_data.py7
-rw-r--r--backend/tests/test_gen_imgs.py6
-rw-r--r--backend/tests/test_gen_linked_imgs.py5
-rw-r--r--backend/tests/test_gen_mapping_data.py20
-rw-r--r--backend/tests/test_gen_name_data.py5
-rw-r--r--backend/tests/test_gen_otol_data.py15
-rw-r--r--backend/tests/test_gen_pop_data.py5
-rw-r--r--backend/tests/test_gen_reduced_trees.py5
-rw-r--r--backend/tests/test_review_imgs_to_gen.py9
-rw-r--r--backend/tests/test_tilo.py8
-rwxr-xr-xbackend/tilo.py107
-rwxr-xr-xbackend/tol_data/dbpedia/gen_desc_data.py21
-rwxr-xr-xbackend/tol_data/enwiki/download_img_license_info.py30
-rwxr-xr-xbackend/tol_data/enwiki/download_imgs.py24
-rwxr-xr-xbackend/tol_data/enwiki/gen_desc_data.py45
-rwxr-xr-xbackend/tol_data/enwiki/gen_dump_index_db.py16
-rwxr-xr-xbackend/tol_data/enwiki/gen_img_data.py36
-rwxr-xr-xbackend/tol_data/enwiki/gen_pageview_data.py28
-rwxr-xr-xbackend/tol_data/enwiki/lookup_page.py9
-rwxr-xr-xbackend/tol_data/eol/download_imgs.py28
-rwxr-xr-xbackend/tol_data/eol/gen_images_list_db.py13
-rwxr-xr-xbackend/tol_data/eol/review_imgs.py33
-rwxr-xr-xbackend/tol_data/gen_desc_data.py23
-rwxr-xr-xbackend/tol_data/gen_imgs.py36
-rwxr-xr-xbackend/tol_data/gen_linked_imgs.py23
-rwxr-xr-xbackend/tol_data/gen_mapping_data.py31
-rwxr-xr-xbackend/tol_data/gen_name_data.py29
-rwxr-xr-xbackend/tol_data/gen_otol_data.py45
-rwxr-xr-xbackend/tol_data/gen_pop_data.py15
-rwxr-xr-xbackend/tol_data/gen_reduced_trees.py62
-rwxr-xr-xbackend/tol_data/review_imgs_to_gen.py34
-rwxr-xr-xbackend/tol_data/wikidata/gen_taxon_src_data.py42
44 files changed, 667 insertions, 223 deletions
diff --git a/backend/server.py b/backend/server.py
index c953a9f..d7f6309 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -18,10 +18,8 @@ def wrappingApp(environ: dict[str, str], start_response) -> Iterable[bytes]:
""" WSGI handler that uses 'application', but also serves image files """
urlPath = environ['PATH_INFO']
if urlPath.startswith('/data/'):
- # Run WSGI script
- return application(environ, start_response)
- elif urlPath.startswith('/tol_data/img/'):
- # Serve image file
+ return application(environ, start_response) # Run WSGI script
+ elif urlPath.startswith('/tol_data/img/'): # Serve image file
imgPath = os.path.join(os.getcwd(), urlPath[1:])
if os.path.exists(imgPath):
imgType = mimetypes.guess_type(imgPath)[0]
@@ -33,6 +31,7 @@ def wrappingApp(environ: dict[str, str], start_response) -> Iterable[bytes]:
else:
start_response('404 Not Found', [('Content-type', 'text/plain')])
return [b'Unrecognised path']
+
# Start server
with simple_server.make_server('', 8000, wrappingApp) as httpd:
print('Serving HTTP on port 8000...')
diff --git a/backend/tests/common.py b/backend/tests/common.py
index cb455e4..abfa471 100644
--- a/backend/tests/common.py
+++ b/backend/tests/common.py
@@ -3,7 +3,9 @@ Utilities for testing
"""
from typing import Any
-import bz2, gzip, sqlite3
+import bz2
+import gzip
+import sqlite3
def createTestFile(filename: str, content: str) -> None:
""" Creates a file with the given name and contents """
diff --git a/backend/tests/dbpedia/test_gen_desc_data.py b/backend/tests/dbpedia/test_gen_desc_data.py
index 7d35677..ae56c5e 100644
--- a/backend/tests/dbpedia/test_gen_desc_data.py
+++ b/backend/tests/dbpedia/test_gen_desc_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestBz2, readTestDbTable
from tol_data.dbpedia.gen_desc_data import genData
@@ -57,9 +58,11 @@ class TestGenData(unittest.TestCase):
'<http://dbpedia.org/resource/A_Hat> <http://www.w3.org/2000/01/rdf-schema#comment>'
' "Hats are not parrots, nor are they potatoes."@en .\n'
))
+
# Run
dbFile = os.path.join(tempDir, 'descData.db')
genData(labelsFile, idsFile, redirectsFile, disambigFile, typesFile, abstractsFile, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT iri, label from labels'),
diff --git a/backend/tests/enwiki/test_download_img_license_info.py b/backend/tests/enwiki/test_download_img_license_info.py
index ed6e426..bd91478 100644
--- a/backend/tests/enwiki/test_download_img_license_info.py
+++ b/backend/tests/enwiki/test_download_img_license_info.py
@@ -1,6 +1,7 @@
import unittest
from unittest.mock import Mock, patch
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestDbTable, readTestDbTable
from tol_data.enwiki.download_img_license_info import downloadInfo
@@ -53,6 +54,7 @@ TEST_RESPONSE1 = {
}
}
}
+
TEST_RESPONSE2 = {
'batchcomplete': '',
'query': {
@@ -152,8 +154,10 @@ class TestDownloadInfo(unittest.TestCase):
(1, 'Octopus2.jpg'),
}
)
+
# Run
downloadInfo(imgDb)
+
# Check
self.assertEqual(
readTestDbTable(imgDb, 'SELECT name, license, artist, credit, restrictions, url from imgs'),
@@ -162,6 +166,7 @@ class TestDownloadInfo(unittest.TestCase):
'https://upload.wikimedia.org/wikipedia/commons/5/57/Octopus2.jpg'),
}
)
+
# Run with updated image-data db
createTestDbTable(
imgDb,
@@ -172,6 +177,7 @@ class TestDownloadInfo(unittest.TestCase):
}
)
downloadInfo(imgDb)
+
# Check
self.assertEqual(
readTestDbTable(imgDb, 'SELECT name, license, artist, credit, restrictions, url from imgs'),
diff --git a/backend/tests/enwiki/test_download_imgs.py b/backend/tests/enwiki/test_download_imgs.py
index 2618b8a..aaf27bc 100644
--- a/backend/tests/enwiki/test_download_imgs.py
+++ b/backend/tests/enwiki/test_download_imgs.py
@@ -1,6 +1,7 @@
import unittest
from unittest.mock import Mock, patch
-import tempfile, os
+import tempfile
+import os
from tests.common import readTestFile, createTestDbTable
from tol_data.enwiki.download_imgs import downloadImgs
@@ -40,10 +41,12 @@ class TestDownloadInfo(unittest.TestCase):
('six','cc-by','','fred','','https://upload.wikimedia.org/6.png'),
}
)
+
# Create temp output directory
with tempfile.TemporaryDirectory() as outDir:
# Run
downloadImgs(imgDb, outDir, 0)
+
# Check
expectedImgs = {
'1.jpg': 'img:https://upload.wikimedia.org/1.jpg',
diff --git a/backend/tests/enwiki/test_gen_desc_data.py b/backend/tests/enwiki/test_gen_desc_data.py
index 801aa69..0d1536b 100644
--- a/backend/tests/enwiki/test_gen_desc_data.py
+++ b/backend/tests/enwiki/test_gen_desc_data.py
@@ -1,5 +1,6 @@
import unittest
-import os, tempfile
+import os
+import tempfile
from tests.common import readTestDbTable
from tol_data.enwiki.gen_desc_data import genData
@@ -12,6 +13,7 @@ class TestGenData(unittest.TestCase):
# Run
dbFile = os.path.join(tempDir, 'descData.db')
genData(TEST_DUMP_FILE, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT id, title FROM pages'),
diff --git a/backend/tests/enwiki/test_gen_dump_index_db.py b/backend/tests/enwiki/test_gen_dump_index_db.py
index e0715f3..b918f15 100644
--- a/backend/tests/enwiki/test_gen_dump_index_db.py
+++ b/backend/tests/enwiki/test_gen_dump_index_db.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestBz2, readTestDbTable
from tol_data.enwiki.gen_dump_index_db import genData
@@ -10,15 +11,18 @@ def runGenData(indexFileContents: str):
# Create temp index file
indexFile = os.path.join(tempDir, 'index.txt.bz2')
createTestBz2(indexFile, indexFileContents)
+
# Run
dbFile = os.path.join(tempDir, 'data.db')
genData(indexFile, dbFile)
+
# Read db
return readTestDbTable(dbFile, 'SELECT title, id, offset, next_offset FROM offsets')
class TestGenData(unittest.TestCase):
def setUp(self):
self.maxDiff = None # Remove output-diff size limit
+
def test_index_file(self):
indexFileContents = (
'100:10:apple\n'
@@ -33,6 +37,7 @@ class TestGenData(unittest.TestCase):
('banana ice-cream', 99, 300, 1000),
('Custard!', 2030, 1000, -1),
})
+
def test_emp_index(self):
offsetsMap = runGenData('')
self.assertEqual(offsetsMap, set())
diff --git a/backend/tests/enwiki/test_gen_img_data.py b/backend/tests/enwiki/test_gen_img_data.py
index 1703b78..0a8f79d 100644
--- a/backend/tests/enwiki/test_gen_img_data.py
+++ b/backend/tests/enwiki/test_gen_img_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestDbTable, readTestDbTable
from tol_data.enwiki.gen_img_data import getInputPageIdsFromDb, genData
@@ -20,8 +21,10 @@ class TestGetInputPageIdsFromDb(unittest.TestCase):
('and another', 2),
}
)
+
# Run
pageIds = getInputPageIdsFromDb(dbFile)
+
# Check
self.assertEqual(pageIds, {1, 2})
@@ -40,9 +43,11 @@ class TestGenData(unittest.TestCase):
('Autism',25,0,-1),
}
)
+
# Run
imgDb = os.path.join(tempDir, 'imgData.db')
genData({10, 25}, TEST_DUMP_FILE, indexDb, imgDb)
+
# Check
self.assertEqual(
readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'),
@@ -51,8 +56,10 @@ class TestGenData(unittest.TestCase):
(25, 'Autism-stacking-cans 2nd edit.jpg'),
}
)
+
# Run with updated page-ids set
genData({13, 10}, TEST_DUMP_FILE, indexDb, imgDb)
+
# Check
self.assertEqual(
readTestDbTable(imgDb, 'SELECT page_id, img_name from page_imgs'),
diff --git a/backend/tests/enwiki/test_gen_pageview_data.py b/backend/tests/enwiki/test_gen_pageview_data.py
index 5002eb0..0c4a35e 100644
--- a/backend/tests/enwiki/test_gen_pageview_data.py
+++ b/backend/tests/enwiki/test_gen_pageview_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestBz2, createTestDbTable, readTestDbTable
from tol_data.enwiki.gen_pageview_data import genData
@@ -18,6 +19,7 @@ class TestGenData(unittest.TestCase):
'fr.wikipedia Four null desktop 12 T6U6\n'
'en.wikipedia Three null desktop 10 E4G5Z61\n'
))
+
# Create temp dump-index db
dumpIndexDb = os.path.join(tempDir, 'dump_index.db')
createTestDbTable(
@@ -31,9 +33,11 @@ class TestGenData(unittest.TestCase):
('Four', 4, 0, -1),
}
)
+
# Run
dbFile = os.path.join(tempDir, 'data.db')
genData(pageviewFiles, dumpIndexDb, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT title, id, views from views'),
diff --git a/backend/tests/eol/test_download_imgs.py b/backend/tests/eol/test_download_imgs.py
index 975d1c7..4872ca3 100644
--- a/backend/tests/eol/test_download_imgs.py
+++ b/backend/tests/eol/test_download_imgs.py
@@ -1,6 +1,7 @@
import unittest
from unittest.mock import Mock, patch
-import tempfile, os
+import tempfile
+import os
from tests.common import readTestFile, createTestDbTable
from tol_data.eol.download_imgs import getEolIdsFromDb, downloadImgs
@@ -19,8 +20,10 @@ class TestGetEolIdsFromDb(unittest.TestCase):
('a second', 2),
}
)
+
# Run
eolIds = getEolIdsFromDb(dbFile)
+
# Check
self.assertEqual(eolIds, {1, 2})
@@ -30,6 +33,7 @@ class TestDownloadImgs(unittest.TestCase):
requestsGetMock.side_effect = lambda url: Mock(content=('img:' + url).encode())
with tempfile.TemporaryDirectory() as tempDir:
eolIds = {1, 2, 4}
+
# Create temp images-list db
imagesListDb = os.path.join(tempDir, 'images_list.db')
createTestDbTable(
@@ -48,10 +52,12 @@ class TestDownloadImgs(unittest.TestCase):
(30, 3, '', 'https://content.eol.org/3.png', 'cc-by', 'owner3'),
}
)
+
# Create temp output dir
with tempfile.TemporaryDirectory() as outDir:
# Run
downloadImgs(eolIds, imagesListDb, outDir)
+
# Check
expectedImgs1 = {
'1 10.jpg': 'img:https://content.eol.org/1.jpg',
diff --git a/backend/tests/eol/test_gen_images_list_db.py b/backend/tests/eol/test_gen_images_list_db.py
index ca9b495..c1c81f3 100644
--- a/backend/tests/eol/test_gen_images_list_db.py
+++ b/backend/tests/eol/test_gen_images_list_db.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestFile, readTestDbTable
from tol_data.eol.gen_images_list_db import genData
@@ -17,9 +18,11 @@ class TestGenData(unittest.TestCase):
createTestFile(os.path.join(tempDir, 'imgs-2.csv'), (
'3,30,https://example.com/3/,https://content.eol.org/3.png,public,owner3\n'
))
+
# Run
dbFile = os.path.join(tempDir, 'imagesList.db')
genData(imageListsGlob, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(
diff --git a/backend/tests/eol/test_review_imgs.py b/backend/tests/eol/test_review_imgs.py
index 49c09bb..21d4756 100644
--- a/backend/tests/eol/test_review_imgs.py
+++ b/backend/tests/eol/test_review_imgs.py
@@ -1,5 +1,7 @@
import unittest
-import tempfile, os, shutil
+import tempfile
+import os
+import shutil
from tests.common import createTestDbTable
from tol_data.eol.review_imgs import reviewImgs
@@ -19,6 +21,7 @@ class TestReviewImgs(unittest.TestCase):
shutil.copy(AVOID_IMG, os.path.join(imgDir, '2 22.jpg'))
shutil.copy(AVOID_IMG, os.path.join(imgDir, '3 30.png'))
shutil.copy(AVOID_IMG, os.path.join(imgDir, '3 31.jpg'))
+
# Create temp extra-info db
extraInfoDb = os.path.join(tempDir, 'data.db')
createTestDbTable(
@@ -39,8 +42,10 @@ class TestReviewImgs(unittest.TestCase):
('two','II',1,'eol'),
}
)
+
# Run
outDir = os.path.join(tempDir, 'imgs')
reviewImgs(imgDir, outDir, extraInfoDb)
+
# Check
self.assertEqual(set(os.listdir(outDir)), {'1 10.jpg', '2 20.jpeg'})
diff --git a/backend/tests/test_gen_desc_data.py b/backend/tests/test_gen_desc_data.py
index cc0582d..8d21978 100644
--- a/backend/tests/test_gen_desc_data.py
+++ b/backend/tests/test_gen_desc_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestDbTable, readTestDbTable
from tol_data.gen_desc_data import genData
@@ -37,6 +38,7 @@ class TestGenData(unittest.TestCase):
('<http://dbpedia.org/resource/Three>', 'Three from dbp'),
}
)
+
# Create temp enwiki db
enwikiDb = os.path.join(tempDir, 'enwiki_descs.db')
createTestDbTable(
@@ -70,6 +72,7 @@ class TestGenData(unittest.TestCase):
(5, 'Five from enwiki'),
}
)
+
# Create temp tree-of-life db
dbFile = os.path.join(tempDir, 'data.db')
createTestDbTable(
@@ -86,8 +89,10 @@ class TestGenData(unittest.TestCase):
('seventh', 7),
}
)
+
# Run
genData(dbpediaDb, enwikiDb, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT wiki_id, desc, from_dbp from descs'),
diff --git a/backend/tests/test_gen_imgs.py b/backend/tests/test_gen_imgs.py
index 1ddd438..efab361 100644
--- a/backend/tests/test_gen_imgs.py
+++ b/backend/tests/test_gen_imgs.py
@@ -1,6 +1,8 @@
import unittest
from unittest.mock import patch
-import tempfile, os, shutil
+import tempfile
+import os
+import shutil
from tests.common import createTestFile, createTestDbTable, readTestDbTable
from tol_data.gen_imgs import genImgs
@@ -95,9 +97,11 @@ class TestGenImgs(unittest.TestCase):
('node6', 'ott6', 10),
}
)
+
# Run
outDir = os.path.join(tempDir, 'img')
genImgs(imgListFile, eolImgDir, outDir, eolImgDb, enwikiImgDb, pickedImgDir, pickedImgsFile, dbFile)
+
# Check
self.assertEqual(set(os.listdir(outDir)), {
'ott1.jpg',
diff --git a/backend/tests/test_gen_linked_imgs.py b/backend/tests/test_gen_linked_imgs.py
index b989407..be4b0d1 100644
--- a/backend/tests/test_gen_linked_imgs.py
+++ b/backend/tests/test_gen_linked_imgs.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestDbTable, readTestDbTable
from tol_data.gen_linked_imgs import genData
@@ -70,8 +71,10 @@ class TestGenData(unittest.TestCase):
('thirteen', 12, 'enwiki'),
}
)
+
# Run
genData(dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT name, otol_ids from linked_imgs'),
diff --git a/backend/tests/test_gen_mapping_data.py b/backend/tests/test_gen_mapping_data.py
index 9aa99b7..57c9ef7 100644
--- a/backend/tests/test_gen_mapping_data.py
+++ b/backend/tests/test_gen_mapping_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestFile, createTestGzip, createTestDbTable, readTestDbTable
from tol_data.gen_mapping_data import \
@@ -18,10 +19,12 @@ class TestReadTaxonomyFile(unittest.TestCase):
SEP.join(['10', '20', 'ten', 'family', 'if:10,if:100', '', '', '\n']),
SEP.join(['11', '100', 'eleven', '', 'igloo:1,ncbi:?', '', '', '\n'])
]))
+
# Run
nodeToSrcIds = {}
usedSrcIds = set()
readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
+
# Check
self.assertEqual(nodeToSrcIds, {
1: {'ncbi': 10},
@@ -34,6 +37,7 @@ class TestReadTaxonomyFile(unittest.TestCase):
('gbif', 1),
('if', 10)
})
+
class TestReadEolIdsFile(unittest.TestCase):
def test_read(self):
with tempfile.TemporaryDirectory() as tempDir:
@@ -51,15 +55,18 @@ class TestReadEolIdsFile(unittest.TestCase):
10: {'ncbi': 10},
20: {'ncbi': 23, 'gbif': 234}
}
+
# Run
usedSrcIds = {('ncbi', 10), ('gbif', 234), ('ncbi', 23)}
nodeToEolId = {}
readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
+
# Check
self.assertEqual(nodeToEolId, {
10: 1,
20: 101,
})
+
class TestReadWikidataDb(unittest.TestCase):
def test_read(self):
with tempfile.TemporaryDirectory() as tempDir:
@@ -105,10 +112,12 @@ class TestReadWikidataDb(unittest.TestCase):
nodeToEolId = {
20: 100,
}
+
# Run
nodeToWikiTitle = {}
titleToIucnStatus = {}
readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
+
# Check
self.assertEqual(nodeToWikiTitle, {
10: 'one',
@@ -123,6 +132,7 @@ class TestReadWikidataDb(unittest.TestCase):
10: 1,
20: 100,
})
+
class TestReadPickedMappings(unittest.TestCase):
def test_read(self):
with tempfile.TemporaryDirectory() as tempDir:
@@ -155,8 +165,10 @@ class TestReadPickedMappings(unittest.TestCase):
12: 'two',
35: 'goanna',
}
+
# Run
readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
+
# Check
self.assertEqual(nodeToEolId, {
1: 1,
@@ -170,6 +182,7 @@ class TestReadPickedMappings(unittest.TestCase):
15: 'ghi',
35: 'jkl',
})
+
class TestReadGetEnwikiPageIds(unittest.TestCase):
def test_read(self):
with tempfile.TemporaryDirectory() as tempDir:
@@ -191,14 +204,17 @@ class TestReadGetEnwikiPageIds(unittest.TestCase):
20: 'two',
30: 'three',
}
+
# Run
titleToPageId = {}
getEnwikiPageIds(dumpIndexDb, nodeToWikiTitle, titleToPageId)
+
# Check
self.assertEqual(titleToPageId, {
'one': 1,
'two': 22,
})
+
class TestGenData(unittest.TestCase):
def test_mapping(self):
with tempfile.TemporaryDirectory() as tempDir:
@@ -275,8 +291,10 @@ class TestGenData(unittest.TestCase):
('third', 'ott3', 2),
]
)
+
# Run
genData(taxonomyFile, eolIdsFile, wikidataDb, pickedMappings, dumpIndexDb, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT name, id from eol_ids'),
diff --git a/backend/tests/test_gen_name_data.py b/backend/tests/test_gen_name_data.py
index 85e81d8..0dab23a 100644
--- a/backend/tests/test_gen_name_data.py
+++ b/backend/tests/test_gen_name_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestFile, createTestDbTable, readTestDbTable
from tol_data.gen_name_data import genData
@@ -78,8 +79,10 @@ class TestGenData(unittest.TestCase):
('three', 2),
]
)
+
# Run
genData(eolNamesFile, enwikiDb, pickedNamesFile, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT name, alt_name, pref_alt, src FROM names'),
diff --git a/backend/tests/test_gen_otol_data.py b/backend/tests/test_gen_otol_data.py
index 25e65e3..cc0404a 100644
--- a/backend/tests/test_gen_otol_data.py
+++ b/backend/tests/test_gen_otol_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestFile, readTestDbTable
from tol_data.gen_otol_data import genData
@@ -16,9 +17,11 @@ def runGenData(treeFileContents: str, annFileContents: str, pickedFileContents:
# Create temp picked names file
pickedFile = os.path.join(tempDir, 'pn.txt')
createTestFile(pickedFile, pickedFileContents)
+
# Run genData()
dbFile = os.path.join(tempDir, 'data.db')
genData(treeFile, annFile, pickedFile, dbFile)
+
# Read database
nodes = readTestDbTable(dbFile, 'SELECT name, id, tips FROM nodes')
edges = readTestDbTable(dbFile, 'SELECT parent, child, p_support FROM edges')
@@ -27,6 +30,7 @@ def runGenData(treeFileContents: str, annFileContents: str, pickedFileContents:
class TestGenData(unittest.TestCase):
def setUp(self):
self.maxDiff = None # Remove output-diff size limit
+
def test_newick(self):
treeFileContents = """
(
@@ -40,7 +44,9 @@ class TestGenData(unittest.TestCase):
)cellular_organisms_ott1;"""
annFileContents = '{"nodes": {}}'
pickedFileContents = ''
+
nodes, edges = runGenData(treeFileContents, annFileContents, pickedFileContents)
+
self.assertEqual(nodes, {
('land plants', 'ott2', 1),
('traveller\'s tree', 'ott100', 1),
@@ -66,9 +72,11 @@ class TestGenData(unittest.TestCase):
('citrus', 'lemon', 0),
('citrus', 'orange', 0),
})
+
def test_newick_invalid(self):
with self.assertRaises(Exception):
runGenData('(A,B,(C,D));', '{"nodes": {}}', '')
+
def test_annotations(self):
treeFileContents = '(two_ott2, three_ott3, four_ott4)one_ott1;'
annFileContents = """
@@ -91,7 +99,9 @@ class TestGenData(unittest.TestCase):
}
}
}"""
+
nodes, edges = runGenData(treeFileContents, annFileContents, '')
+
self.assertEqual(nodes, {
('one', 'ott1', 3),
('two', 'ott2', 1),
@@ -103,10 +113,13 @@ class TestGenData(unittest.TestCase):
('one', 'three', 1),
('one', 'four', 0),
})
+
def test_picked_names_file(self):
treeFileContents = '(one_ott2, two_ott3)one_ott1;'
pickedFileContents = 'one|ott2'
+
nodes, edges = runGenData(treeFileContents, '{"nodes": {}}', pickedFileContents)
+
self.assertEqual(nodes, {
('one [2]', 'ott1', 2),
('one', 'ott2', 1),
diff --git a/backend/tests/test_gen_pop_data.py b/backend/tests/test_gen_pop_data.py
index dd1cb22..b71ebc5 100644
--- a/backend/tests/test_gen_pop_data.py
+++ b/backend/tests/test_gen_pop_data.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestDbTable, readTestDbTable
from tol_data.gen_pop_data import genData
@@ -30,8 +31,10 @@ class TestGenData(unittest.TestCase):
('node3', 3),
}
)
+
# Run
genData(pageviewsDb, dbFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT name, pop from node_pop'),
diff --git a/backend/tests/test_gen_reduced_trees.py b/backend/tests/test_gen_reduced_trees.py
index 2ae4dfd..99cbd92 100644
--- a/backend/tests/test_gen_reduced_trees.py
+++ b/backend/tests/test_gen_reduced_trees.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestFile, createTestDbTable, readTestDbTable
from tol_data.gen_reduced_trees import genData
@@ -98,8 +99,10 @@ class TestGenData(unittest.TestCase):
'five\n'
'VIII\n'
))
+
# Run
genData(None, dbFile, pickedNodesFile)
+
# Check
self.assertEqual(
readTestDbTable(dbFile, 'SELECT name, id, tips from nodes_p'),
diff --git a/backend/tests/test_review_imgs_to_gen.py b/backend/tests/test_review_imgs_to_gen.py
index d88523b..e98ab32 100644
--- a/backend/tests/test_review_imgs_to_gen.py
+++ b/backend/tests/test_review_imgs_to_gen.py
@@ -1,5 +1,7 @@
import unittest
-import tempfile, os, shutil
+import tempfile
+import os
+import shutil
from tests.common import readTestFile, createTestDbTable
from tol_data.review_imgs_to_gen import reviewImgs
@@ -62,19 +64,24 @@ class TestReviewImgs(unittest.TestCase):
('four', 4),
}
)
+
# Run
outFile = os.path.join(tempDir, 'imgList.txt')
reviewImgs(eolImgDir, enwikiImgDir, dbFile, outFile, 'all')
+
# Check
self.assertEqual(set(readTestFile(outFile).splitlines()), {
'ott1 ' + os.path.join(eolImgDir, '1 10.jpg'),
'ott2',
'ott3 ' + os.path.join(enwikiImgDir, '3.png'),
})
+
# Add extra data
createTestDbTable(dbFile, None, 'INSERT INTO nodes VALUES (?, ?, ?)',{('four', 'ott4', 2)})
+
# Run
reviewImgs(eolImgDir, enwikiImgDir, dbFile, outFile, 'all')
+
# Check
self.assertEqual(set(readTestFile(outFile).splitlines()), {
'ott1 ' + os.path.join(eolImgDir, '1 10.jpg'),
diff --git a/backend/tests/test_tilo.py b/backend/tests/test_tilo.py
index cfc719a..718fb8b 100644
--- a/backend/tests/test_tilo.py
+++ b/backend/tests/test_tilo.py
@@ -1,5 +1,6 @@
import unittest
-import tempfile, os
+import tempfile
+import os
from tests.common import createTestDbTable
from tilo import handleReq, TolNode, SearchSuggResponse, SearchSugg, InfoResponse, NodeInfo, DescInfo, ImgInfo
@@ -122,8 +123,10 @@ class TestHandleReq(unittest.TestCase):
self.tempDir = tempfile.TemporaryDirectory()
self.dbFile = os.path.join(self.tempDir.name, 'data.db')
initTestDb(self.dbFile)
+
def tearDown(self):
self.tempDir.cleanup()
+
def test_node_req(self):
response = handleReq(self.dbFile, {'QUERY_STRING': 'name=two&type=node&tree=trimmed'})
self.assertEqual(response, {
@@ -131,6 +134,7 @@ class TestHandleReq(unittest.TestCase):
'three': TolNode('ott3', [], 'two', 1, False, None, None, None),
'four': TolNode('ott4', [], 'two', 1, True, None, 'ott4.jpg', None),
})
+
def test_node_toroot_req(self):
response = handleReq(self.dbFile, {'QUERY_STRING': 'name=seven&type=node&toroot=1&excl=five&tree=trimmed'})
self.assertEqual(response, {
@@ -138,6 +142,7 @@ class TestHandleReq(unittest.TestCase):
'six': TolNode('ott6', ['seven'], 'five', 1, 1, 'VI', 'ott6.jpg', 'endangered'),
'seven': TolNode('ott7', [], 'six', 1, 1, None, None, None),
})
+
def test_sugg_req(self):
response = handleReq(self.dbFile, {'QUERY_STRING': 'name=t&type=sugg&tree=trimmed'})
self.assertEqual(response, SearchSuggResponse(
@@ -148,6 +153,7 @@ class TestHandleReq(unittest.TestCase):
],
False
))
+
def test_info_req(self):
response = handleReq(self.dbFile, {'QUERY_STRING': 'name=six&type=info&tree=trimmed'})
self.assertEqual(response, InfoResponse(
diff --git a/backend/tilo.py b/backend/tilo.py
index 21b5a7f..f33449b 100755
--- a/backend/tilo.py
+++ b/backend/tilo.py
@@ -18,16 +18,20 @@ Expected HTTP query parameters:
"""
from typing import Iterable, cast
-import sys, re
-import urllib.parse, sqlite3
-import gzip, jsonpickle
+import sys
+import re
+import urllib.parse
+import sqlite3
+import gzip
+import jsonpickle
DB_FILE = 'tol_data/data.db'
DEFAULT_SUGG_LIM = 5
MAX_SUGG_LIM = 50
ROOT_NAME = 'cellular organisms'
-# Classes for objects sent as responses (matches lib.ts types in client-side code)
+# ========== Classes for values sent as responses ==========
+
class TolNode:
""" Used when responding to 'node' and 'chain' requests """
def __init__(
@@ -48,52 +52,61 @@ class TolNode:
self.commonName = commonName
self.imgName = imgName
self.iucn = iucn
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, TolNode) and \
(self.otolId, set(self.children), self.parent, self.tips, \
self.pSupport, self.commonName, self.imgName, self.iucn) == \
(other.otolId, set(other.children), other.parent, other.tips, \
other.pSupport, other.commonName, other.imgName, other.iucn)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
+
class SearchSugg:
""" Represents a search suggestion """
def __init__(self, name: str, canonicalName: str | None = None, pop=0):
self.name = name
self.canonicalName = canonicalName
self.pop = pop if pop is not None else 0
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, SearchSugg) and \
(self.name, self.canonicalName, self.pop) == (other.name, other.canonicalName, other.pop)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
- def __hash__(self):
+
+ def __hash__(self): # Used in unit testing
return (self.name, self.canonicalName, self.pop).__hash__()
+
class SearchSuggResponse:
""" Sent as responses to 'sugg' requests """
def __init__(self, searchSuggs: list[SearchSugg], hasMore: bool):
self.suggs = searchSuggs
self.hasMore = hasMore
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, SearchSuggResponse) and \
(set(self.suggs), self.hasMore) == (set(other.suggs), other.hasMore)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
+
class DescInfo:
""" Represents a node's associated description """
def __init__(self, text: str, wikiId: int, fromDbp: bool):
self.text = text
self.wikiId = wikiId
self.fromDbp = fromDbp
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, DescInfo) and \
(self.text, self.wikiId, self.fromDbp) == (other.text, other.wikiId, other.fromDbp)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
+
class ImgInfo:
""" Represents a node's associated image """
def __init__(self, id: int, src: str, url: str, license: str, artist: str, credit: str):
@@ -103,38 +116,44 @@ class ImgInfo:
self.license = license
self.artist = artist
self.credit = credit
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, ImgInfo) and \
(self.id, self.src, self.url, self.license, self.artist, self.credit) == \
(other.id, other.src, other.url, other.license, other.artist, other.credit)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
+
class NodeInfo:
""" Represents info about a node """
def __init__(self, tolNode: TolNode, descInfo: DescInfo | None, imgInfo: ImgInfo | None):
self.tolNode = tolNode
self.descInfo = descInfo
self.imgInfo = imgInfo
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, NodeInfo) and \
(self.tolNode, self.descInfo, self.imgInfo) == (other.tolNode, other.descInfo, other.imgInfo)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
+
class InfoResponse:
""" Sent as responses to 'info' requests """
def __init__(self, nodeInfo: NodeInfo, subNodesInfo: tuple[()] | tuple[NodeInfo | None, NodeInfo | None]):
self.nodeInfo = nodeInfo
self.subNodesInfo = subNodesInfo
- # Used in unit testing
- def __eq__(self, other):
+
+ def __eq__(self, other): # Used in unit testing
return isinstance(other, InfoResponse) and \
(self.nodeInfo, self.subNodesInfo) == (other.nodeInfo, other.subNodesInfo)
- def __repr__(self):
+
+ def __repr__(self): # Used in unit testing
return str(self.__dict__)
-# For data lookup
+# ========== For data lookup ==========
+
def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str, TolNode]:
""" For a set of node names, returns a name-to-TolNode map that describes those nodes """
# Get node info
@@ -146,6 +165,7 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str,
query = f'SELECT name, id, tips FROM {nodesTable} WHERE name IN ({queryParamStr})'
for nodeName, otolId, tips in dbCur.execute(query, names):
nameToNodes[nodeName] = TolNode(otolId, [], tips=tips)
+
# Get child info
query = f'SELECT parent, child FROM {edgesTable} WHERE parent IN ({queryParamStr})'
for nodeName, childName in dbCur.execute(query, names):
@@ -158,11 +178,13 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str,
for n, tips in dbCur.execute(query, node.children):
childToTips[n] = tips
node.children.sort(key=lambda n: childToTips[n], reverse=True)
+
# Get parent info
query = f'SELECT parent, child, p_support FROM {edgesTable} WHERE child IN ({queryParamStr})'
for nodeName, childName, pSupport in dbCur.execute(query, names):
nameToNodes[childName].parent = nodeName
nameToNodes[childName].pSupport = pSupport == 1
+
# Get image names
idsToNames = {nameToNodes[n].otolId: n for n in nameToNodes.keys()}
query = f'SELECT {nodesTable}.id from {nodesTable}' \
@@ -170,6 +192,7 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str,
f' WHERE {nodesTable}.id IN ' '({})'.format(','.join(['?'] * len(idsToNames)))
for (otolId,) in dbCur.execute(query, list(idsToNames.keys())):
nameToNodes[idsToNames[otolId]].imgName = otolId + '.jpg'
+
# Get 'linked' images for unresolved names
unresolvedNames = [n for n in nameToNodes if nameToNodes[n].imgName is None]
query = 'SELECT name, otol_ids from linked_imgs WHERE name IN ({})'
@@ -183,21 +206,25 @@ def lookupNodes(names: list[str], tree: str, dbCur: sqlite3.Cursor) -> dict[str,
id1 + '.jpg' if id1 != '' else None,
id2 + '.jpg' if id2 != '' else None,
)
+
# Get preferred-name info
query = f'SELECT name, alt_name FROM names WHERE pref_alt = 1 AND name IN ({queryParamStr})'
for name, altName in dbCur.execute(query, names):
if name in nameToNodes:
nameToNodes[name].commonName = altName
+
# Get IUCN status
query = f'SELECT name, iucn FROM node_iucn WHERE name IN ({queryParamStr})'
for name, iucn in dbCur.execute(query, names):
if name in nameToNodes:
nameToNodes[name].iucn = iucn
- #
+
return nameToNodes
+
def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor) -> SearchSuggResponse:
""" For a search string, returns a SearchSuggResponse describing search suggestions """
hasMore = False
+
# Get node names and alt-names, ordering by popularity
nodesTable = f'nodes_{getTableSuffix(tree)}'
nameQuery = f'SELECT {nodesTable}.name, node_pop.pop FROM {nodesTable}' \
@@ -210,6 +237,7 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor
f' WHERE alt_name LIKE ? ORDER BY node_pop.pop DESC'
suggs: dict[str, SearchSugg] = {}
tempLimit = suggLimit + 1 # For determining if 'more suggestions exist'
+
# Prefix search
for altName, nodeName, prefAlt, pop in dbCur.execute(altNameQuery, (searchStr + '%',)):
if nodeName not in suggs or prefAlt == 1 and suggs[nodeName].canonicalName is not None:
@@ -224,6 +252,7 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor
if len(suggs) == tempLimit:
break
suggList = sorted(suggs.values(), key=lambda x: x.pop, reverse=True)
+
# If insufficient results, try substring-search
if len(suggs) < tempLimit:
newNames: set[str] = set()
@@ -243,18 +272,21 @@ def lookupSuggs(searchStr: str, suggLimit: int, tree: str, dbCur: sqlite3.Cursor
if len(suggs) == tempLimit:
break
suggList.extend(sorted([suggs[n] for n in newNames], key=lambda x: x.pop, reverse=True))
- #
+
if len(suggList) > suggLimit:
hasMore = True
return SearchSuggResponse(suggList[:suggLimit], hasMore)
+
def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | None:
""" For a node name, returns a descriptive InfoResponse, or None """
nodesTable = f'nodes_{getTableSuffix(tree)}'
+
# Get node info
nameToNodes = lookupNodes([name], tree, dbCur)
tolNode = nameToNodes[name] if name in nameToNodes else None
if tolNode is None:
return None
+
# Check for compound node
match = re.fullmatch(r'\[(.+) \+ (.+)]', name)
subNames = [match.group(1), match.group(2)] if match is not None else []
@@ -264,6 +296,7 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No
subNames = [n if n in nameToSubNodes else None for n in subNames]
nameToNodes.update(nameToSubNodes)
namesToLookup = [name] if not subNames else [n for n in subNames if n is not None]
+
# Get desc info
nameToDescInfo: dict[str, DescInfo] = {}
query = 'SELECT name, desc, wiki_id, from_dbp FROM' \
@@ -271,6 +304,7 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No
' WHERE wiki_ids.name IN ({})'.format(','.join(['?'] * len(namesToLookup)))
for nodeName, desc, wikiId, fromDbp in dbCur.execute(query, namesToLookup):
nameToDescInfo[nodeName] = DescInfo(desc, wikiId, fromDbp == 1)
+
# Get image info
nameToImgInfo: dict[str, ImgInfo] = {}
idsToNames = {cast(str, nameToNodes[n].imgName)[:-4]: n
@@ -282,6 +316,7 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No
f' WHERE {nodesTable}.id IN ' '({})'.format(','.join(['?'] * len(idsToLookup)))
for id, imgId, imgSrc, url, license, artist, credit in dbCur.execute(query, idsToLookup):
nameToImgInfo[idsToNames[id]] = ImgInfo(imgId, imgSrc, url, license, artist, credit)
+
# Construct response
nodeInfoObjs = [
NodeInfo(
@@ -293,15 +328,19 @@ def lookupInfo(name: str, tree: str, dbCur: sqlite3.Cursor) -> InfoResponse | No
return InfoResponse(
nodeInfoObjs[0],
cast(tuple[()] | tuple[NodeInfo | None, NodeInfo | None], nodeInfoObjs[1:]))
+
def getTableSuffix(tree: str) -> str:
- """ converts a reduced-tree descriptor into a sql-table-suffix """
+ """ Converts a reduced-tree descriptor into a sql-table-suffix """
return 't' if tree == 'trimmed' else 'i' if tree == 'images' else 'p'
+# ========== Entry point ==========
+
def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode] | SearchSuggResponse | InfoResponse:
""" Queries the database, and constructs a response object """
# Open db
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
# Get query params
queryStr = environ['QUERY_STRING'] if 'QUERY_STRING' in environ else ''
queryDict = urllib.parse.parse_qs(queryStr)
@@ -313,6 +352,7 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode]
#(name,) = dbCur.execute(query).fetchone()
reqType = queryDict['type'][0] if 'type' in queryDict else None
tree = queryDict['tree'][0] if 'tree' in queryDict else 'images'
+
# Check for valid 'tree'
if tree is not None and re.fullmatch(r'trimmed|images|picked', tree) is None:
return None
@@ -339,7 +379,7 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode]
parent = row[0]
nodesToSkip.add(parent)
nodeName = parent
- #
+
results: dict[str, TolNode] = {}
ranOnce = False
while True:
@@ -378,6 +418,7 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode]
except ValueError:
invalidLimit = True
print(f'INFO: Invalid limit {suggLimit}', file=sys.stderr)
+
# Get search suggestions
if not invalidLimit:
return lookupSuggs(name, suggLimit, tree, dbCur)
@@ -385,12 +426,15 @@ def handleReq(dbFile: str, environ: dict[str, str]) -> None | dict[str, TolNode]
infoResponse = lookupInfo(name, tree, dbCur)
if infoResponse is not None:
return infoResponse
+
# On failure, provide empty response
return None
+
def application(environ: dict[str, str], start_response) -> Iterable[bytes]:
""" Entry point for the WSGI script """
# Get response object
val = handleReq(DB_FILE, environ)
+
# Construct response
data = jsonpickle.encode(val, unpicklable=False).encode()
headers = [('Content-type', 'application/json')]
@@ -400,4 +444,5 @@ def application(environ: dict[str, str], start_response) -> Iterable[bytes]:
headers.append(('Content-encoding', 'gzip'))
headers.append(('Content-Length', str(len(data))))
start_response('200 OK', headers)
+
return [data]
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py
index 50418e0..f8a665a 100755
--- a/backend/tol_data/dbpedia/gen_desc_data.py
+++ b/backend/tol_data/dbpedia/gen_desc_data.py
@@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database
# In testing, this script took a few hours to run, and generated about 10GB
+import argparse
import re
-import bz2, sqlite3
+import bz2
+import sqlite3
LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries
IDS_FILE = 'page_lang=en_ids.ttl.bz2'
@@ -24,7 +26,7 @@ def genData(
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Reading/storing label data')
dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)')
dbCur.execute('CREATE INDEX labels_idx ON labels(label)')
@@ -38,7 +40,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing wiki page ids')
dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX ids_idx ON ids(id)')
@@ -55,7 +57,7 @@ def genData(
except sqlite3.IntegrityError as e:
# Accounts for certain lines that have the same IRI
print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}')
- #
+
print('Reading/storing redirection data')
dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)')
redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')
@@ -67,7 +69,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing diambiguation-page data')
dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)')
disambigLineRegex = redirLineRegex
@@ -79,7 +81,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),))
- #
+
print('Reading/storing instance-type data')
dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)')
dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)')
@@ -92,7 +94,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing abstracts')
dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)')
descLineRegex = labelLineRegex
@@ -107,14 +109,13 @@ def genData(
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO abstracts VALUES (?, ?)',
(match.group(1), match.group(2).replace(r'\"', '"')))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py
index 17e15b4..6efc7a4 100755
--- a/backend/tol_data/enwiki/download_img_license_info.py
+++ b/backend/tol_data/enwiki/download_img_license_info.py
@@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""
+import argparse
import re
-import sqlite3, urllib.parse, html
+import sqlite3
+
import requests
-import time, signal
+import urllib.parse
+import html
+
+import time
+import signal
IMG_DB = 'img_data.db'
-#
+
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
BATCH_SZ = 50 # Max 50
@@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None:
if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
dbCur.execute('CREATE TABLE imgs (' \
'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
- #
+
print('Reading image names')
imgNames: set[str] = set()
for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
imgNames.add(imgName)
print(f'Found {len(imgNames)}')
- #
+
print('Checking for already-processed images')
oldSz = len(imgNames)
for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
imgNames.discard(imgName)
print(f'Found {oldSz - len(imgNames)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Iterating through image names')
imgNameList = list(imgNames)
iterNum = 0
@@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None:
if interrupted:
print(f'Exiting loop at iteration {iterNum}')
break
+
# Get batch
imgBatch = imgNameList[i:i+BATCH_SZ]
imgBatch = ['File:' + x for x in imgBatch]
+
# Make request
headers = {
'user-agent': USER_AGENT,
@@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None:
print(f'ERROR: Exception while downloading info: {e}')
print('\tImage batch: ' + '|'.join(imgBatch))
continue
+
# Parse response-object
if 'query' not in responseObj or 'pages' not in responseObj['query']:
print('WARNING: Response object doesn\'t have page data')
@@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None:
artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
# Remove markup
if artist is not None:
artist = TAG_REGEX.sub(' ', artist).strip()
@@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None:
credit = WHITESPACE_REGEX.sub(' ', credit)
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
+
# Add to db
dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
(title, license, artist, credit, restrictions, url))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadInfo(IMG_DB)
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
index c6a1c21..164289d 100755
--- a/backend/tol_data/enwiki/download_imgs.py
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -11,14 +11,20 @@ in the output directory do decide what to skip.
# In testing, this downloaded about 100k images, over several days
-import re, os
+import argparse
+import re
+import os
import sqlite3
-import urllib.parse, requests
-import time, signal
+
+import requests
+import urllib.parse
+
+import time
+import signal
IMG_DB = 'img_data.db' # About 130k image names
OUT_DIR = 'imgs'
-#
+
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
TIMEOUT = 1
@@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
for filename in fileList:
pageIdsDone.add(int(os.path.splitext(filename)[0]))
print(f'Found {len(pageIdsDone)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if interrupted:
print('Exiting loop')
break
+
# Check for problematic attributes
if license is None or LICENSE_REGEX.fullmatch(license) is None:
continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
continue
if restrictions is not None and restrictions != '':
continue
+
# Download image
iterNum += 1
print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
@@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
except Exception as e:
print(f'Error while downloading to {outFile}: {e}')
return
+
print('Closing database')
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py
index b3fde52..44e4d6f 100755
--- a/backend/tol_data/enwiki/gen_desc_data.py
+++ b/backend/tol_data/enwiki/gen_desc_data.py
@@ -7,10 +7,16 @@ and adds them to a database
# In testing, this script took over 10 hours to run, and generated about 5GB
-import sys, os, re
+import argparse
+import sys
+import os
+import re
import bz2
-import html, mwxml, mwparserfromhell
import sqlite3
+import html
+
+import mwxml
+import mwparserfromhell
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'
@@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
def convertTemplateReplace(match):
""" Used in regex-substitution with CONVERT_TEMPLATE_REGEX """
if match.group(2) is None:
return f'{match.group(1)} {match.group(4)}'
else:
return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}'
-PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
-LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+# ========== For data generation ==========
def genData(dumpFile: str, dbFile: str) -> None:
print('Creating database')
@@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None:
dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
- #
+
print('Iterating through dump file')
with bz2.open(dumpFile, mode='rt') as file:
for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
if pageNum % 1e4 == 0:
print(f'At page {pageNum}')
- # Parse page
+
if page.namespace == 0:
try:
dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
@@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None:
desc = parseDesc(revision.text)
if desc is not None:
dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseDesc(text: str) -> str | None:
- # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
- # and then accumulate lines until a blank one.
- # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
- # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
+ """
+ Looks for a description in wikitext content.
+
+ Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+ and then accumulates lines until a blank one.
+
+ Some cases not accounted for include:
+ disambiguation pages, abstracts with sentences split-across-lines,
+ nested embedded html, 'content significant' embedded-html, markup not removable with mwparserfromhell,
+ """
lines: list[str] = []
openBraceCount = 0
openBracketCount = 0
@@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None:
if lines:
return removeMarkup(' '.join(lines))
return None
+
def removeMarkup(content: str) -> str:
content = EMBEDDED_HTML_REGEX.sub('', content)
content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
@@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str:
content = PARENS_GROUP_REGEX.sub('', content)
content = LEFTOVER_BRACE_REGEX.sub('', content)
return content
+
def convertTitle(title: str) -> str:
return html.unescape(title).replace('_', ' ')
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(DUMP_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py
index 5778680..12a8a10 100755
--- a/backend/tol_data/enwiki/gen_dump_index_db.py
+++ b/backend/tol_data/enwiki/gen_dump_index_db.py
@@ -1,9 +1,13 @@
#!/usr/bin/python3
"""
-Adds data from the wiki dump index-file into a database
+Converts data from the wiki-dump index-file into a database
"""
-import sys, os, re
+
+import argparse
+import sys
+import os
+import re
import bz2
import sqlite3
@@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None:
""" Reads the index file and creates the db """
if os.path.exists(dbFile):
raise Exception(f'ERROR: Existing {dbFile}')
+
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
print('Iterating through index file')
lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
lastOffset = 0
@@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None:
lineNum += 1
if lineNum % 1e5 == 0:
print(f'At line {lineNum}')
- #
+
match = lineRegex.fullmatch(line.rstrip())
assert match is not None
offsetStr, pageId, title = match.group(1,2,3)
@@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None:
dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
except sqlite3.IntegrityError as e:
print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(INDEX_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
index 040f223..2c243f3 100755
--- a/backend/tol_data/enwiki/gen_img_data.py
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and
will skip already-processed page IDs.
"""
+import argparse
import re
-import os, bz2, html, urllib.parse
+import os
+import bz2
+import html
+import urllib.parse
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dump_index.db'
IMG_DB = 'img_data.db' # The database to create
DB_FILE = os.path.join('..', 'data.db')
-#
+
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+# ========== For data generation ==========
+
def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
print('Opening databases')
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
+
print('Checking tables')
if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
# Create tables if not present
- imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+ imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)')
+ # 'img_name' values are set to NULL to indicate page IDs where no image was found
imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
else:
# Check for already-processed page IDs
@@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
else:
print(f'Found already-processed page ID {pid} which was not in input set')
print(f'Will skip {numSkipped} already-processed page IDs')
- #
+
print('Getting dump-file offsets')
offsetToPageids: dict[int, list[int]] = {}
offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
@@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
- #
+
query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
if row is None:
@@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
print(f'Found {len(offsetToEnd)} chunks to check')
- #
+
print('Iterating through chunks in dump file')
with open(dumpFile, mode='rb') as file:
iterNum = 0
@@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 100 == 0:
print(f'At iteration {iterNum}')
- #
+
chunkPageIds = offsetToPageids[pageOffset]
# Jump to chunk
file.seek(pageOffset)
@@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
break
if not foundText:
print(f'WARNING: Did not find <text> for page id {pageId}')
- #
+
print('Closing databases')
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()
+
def getImageName(content: list[str]) -> str | None:
""" Given an array of text-content lines, tries to return an infoxbox image name, or None """
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+ # Note: Doesn't try to find images in outside-infobox [[File:...]] and <imagemap> sections
for line in content:
match = IMG_LINE_REGEX.match(line)
if match is not None:
@@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None:
return None
return None
+# ========== For getting input page IDs ==========
+
def getInputPageIdsFromDb(dbFile: str) -> set[int]:
print('Getting input page-ids')
pageIds: set[int] = set()
@@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]:
for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
pageIds.add(pageId)
dbCon.close()
+
print(f'Found {len(pageIds)}')
return pageIds
+
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
pageIds = getInputPageIdsFromDb(DB_FILE)
genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
index 8aee1cc..95b4a60 100755
--- a/backend/tol_data/enwiki/gen_pageview_data.py
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+ wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+ platform (eg: mobile-web), monthly view count,
+ hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
"""
# Took about 15min per file (each had about 180e6 lines)
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
DUMP_INDEX_DB = 'dump_index.db'
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
- # Each pageview file has lines that seem to hold these space-separated fields:
- # wiki code (eg: en.wikipedia), article title, page ID (may be: null),
- # platform (eg: mobile-web), monthly view count,
- # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
- #
+
namespaceRegex = re.compile(r'[a-zA-Z]+:')
titleToViews: dict[str, int] = defaultdict(int)
linePrefix = b'en.wikipedia '
@@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
print(f'At line {lineNum}')
if not line.startswith(linePrefix):
continue
+
# Get second and second-last fields
line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
title = line[:line.find(b' ')].decode('utf-8')
viewCount = int(line[line.rfind(b' ')+1:])
if namespaceRegex.match(title) is not None:
continue
+
# Update map
title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
idbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
index f744818..c4d0932 100755
--- a/backend/tol_data/enwiki/lookup_page.py
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index
db, and prints the corresponding <page>.
"""
+import argparse
import sys
import bz2
import sqlite3
@@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
_, pageOffset, endOffset = row
dbCon.close()
print(f'Found chunk at offset {pageOffset}')
- #
+
print('Reading from wiki dump')
content: list[str] = []
with open(dumpFile, mode='rb') as file:
@@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
file.seek(pageOffset)
compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+
# Look in chunk for page
lines = data.splitlines()
lineIdx = 0
@@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
if line.lstrip() == '</page>':
break
lineIdx += 1
- #
+
print('Content: ')
print('\n'.join(content))
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('title', help='The title to look up')
args = parser.parse_args()
- #
+
lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' '))
diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py
index 8454a35..5757032 100755
--- a/backend/tol_data/eol/download_imgs.py
+++ b/backend/tol_data/eol/download_imgs.py
@@ -13,9 +13,16 @@ already-downloaded files, and continues after the one with
highest EOL ID.
"""
-import sys, re, os, random
+import argparse
+import sys
+import re
+import os
+import random
import sqlite3
-import urllib.parse, requests
+
+import requests
+import urllib.parse
+
import time
from threading import Thread
import signal
@@ -23,7 +30,7 @@ import signal
IMAGES_LIST_DB = 'images_list.db'
OUT_DIR = 'imgs_for_review'
DB_FILE = os.path.join('..', 'data.db')
-#
+
MAX_IMGS_PER_ID = 3
MAX_THREADS = 5
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
@@ -43,7 +50,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
eolIdList = sorted(eolIds)
nextIdx = 0
print(f'Result: {len(eolIdList)} EOL IDs')
- #
+
print('Checking output directory')
if not os.path.exists(outDir):
os.mkdir(outDir)
@@ -57,7 +64,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
if nextIdx == len(eolIdList):
print('No IDs left. Exiting...')
return
- #
+
print('Starting download threads')
numThreads = 0
threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
@@ -81,6 +88,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
threadException = e
numThreads -= 1
+
# Manage downloading
for idx in range(nextIdx, len(eolIdList)):
eolId = eolIdList[idx]
@@ -96,9 +104,11 @@ def downloadImgs(eolIds, imagesListDb, outDir):
if len(extension) <= 1:
print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
continue
+
# Check image-quantity limit
if len(ownerSet) == MAX_IMGS_PER_ID:
break
+
# Check for skip conditions
if re.fullmatch(LICENSE_REGEX, license) is None:
continue
@@ -107,11 +117,13 @@ def downloadImgs(eolIds, imagesListDb, outDir):
if copyrightOwner in ownerSet:
continue
ownerSet.add(copyrightOwner)
+
# Determine output filename
outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
if os.path.exists(outPath):
print(f'WARNING: {outPath} already exists. Skipping download.')
continue
+
# Check thread limit
while numThreads == MAX_THREADS:
time.sleep(1)
@@ -122,6 +134,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
time.sleep(1)
exitLoop = True
break
+
# Perform download
print(f'Downloading image to {outPath}')
numThreads += 1
@@ -129,6 +142,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
thread.start()
if exitLoop:
break
+
# Close images-list db
while numThreads > 0:
time.sleep(1)
@@ -143,10 +157,10 @@ def getEolIdsFromDb(dbFile) -> set[int]:
eolIds.add(id)
dbCon.close()
return eolIds
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
eolIds = getEolIdsFromDb(DB_FILE)
downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR)
diff --git a/backend/tol_data/eol/gen_images_list_db.py b/backend/tol_data/eol/gen_images_list_db.py
index ee57ac6..3e5bea1 100755
--- a/backend/tol_data/eol/gen_images_list_db.py
+++ b/backend/tol_data/eol/gen_images_list_db.py
@@ -4,8 +4,12 @@
Generates a sqlite db from a directory of CSV files holding EOL image data
"""
-import os, glob
-import csv, re, sqlite3
+import argparse
+import os
+import glob
+import csv
+import re
+import sqlite3
IMAGE_LISTS_GLOB = os.path.join('imagesList', '*.csv')
DB_FILE = 'images_list.db'
@@ -18,6 +22,7 @@ def genData(imageListsGlob: str, dbFile: str) -> None:
' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \
' copy_url TEXT, license TEXT, copyright_owner TEXT)')
dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
+
print('Reading CSV files')
for filename in glob.glob(imageListsGlob):
		print(f'Processing {filename}')
@@ -27,13 +32,13 @@ def genData(imageListsGlob: str, dbFile: str) -> None:
continue
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
(int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(IMAGE_LISTS_GLOB, DB_FILE)
diff --git a/backend/tol_data/eol/review_imgs.py b/backend/tol_data/eol/review_imgs.py
index 9fb462c..145f338 100755
--- a/backend/tol_data/eol/review_imgs.py
+++ b/backend/tol_data/eol/review_imgs.py
@@ -7,8 +7,13 @@ choose an image to keep, or reject all. Also provides image rotation.
Chosen images are placed in another directory, and rejected ones are deleted.
"""
-import sys, re, os, time
+import argparse
+import sys
+import re
+import os
+import time
import sqlite3
+
import tkinter as tki
from tkinter import ttk
import PIL
@@ -17,7 +22,7 @@ from PIL import ImageTk, Image, ImageOps
IMG_DIR = 'imgs_for_review'
OUT_DIR = 'imgs'
EXTRA_INFO_DB = os.path.join('..', 'data.db')
-#
+
IMG_DISPLAY_SZ = 400
MAX_IMGS_PER_ID = 3
IMG_BG_COLOR = (88, 28, 135)
@@ -28,11 +33,13 @@ class EolImgReviewer:
def __init__(self, root, imgDir, imgList, extraInfoDb, outDir):
self.root = root
root.title('EOL Image Reviewer')
+
# Setup main frame
mainFrame = ttk.Frame(root, padding='5 5 5 5')
mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
+
# Set up images-to-be-reviewed frames
self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter
@@ -44,9 +51,11 @@ class EolImgReviewer:
label = ttk.Label(frame, image=self.photoImgs[i])
label.grid(column=0, row=0)
self.labels.append(label)
+
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
+
# Add keyboard bindings
root.bind('<q>', self.quit)
root.bind('<Key-j>', lambda evt: self.accept(0))
@@ -59,6 +68,7 @@ class EolImgReviewer:
root.bind('<Key-A>', lambda evt: self.rotate(0, True))
root.bind('<Key-S>', lambda evt: self.rotate(1, True))
root.bind('<Key-D>', lambda evt: self.rotate(2, True))
+
# Initialise fields
self.imgDir = imgDir
self.imgList = imgList
@@ -67,13 +77,15 @@ class EolImgReviewer:
self.nextEolId = 0
self.nextImgNames: list[str] = []
self.rotations: list[int] = []
+
# For displaying extra info
self.extraInfoDbCon = sqlite3.connect(extraInfoDb)
self.extraInfoDbCur = self.extraInfoDbCon.cursor()
self.numReviewed = 0
self.startTime = time.time()
- #
+
self.getNextImgs()
+
def getNextImgs(self):
""" Updates display with new images to review, or ends program """
# Gather names of next images to review
@@ -95,6 +107,7 @@ class EolImgReviewer:
self.nextImgNames.append(imgName)
self.rotations.append(0)
self.imgListIdx += 1
+
# Update displayed images
idx = 0
while idx < MAX_IMGS_PER_ID:
@@ -113,16 +126,19 @@ class EolImgReviewer:
self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
self.labels[idx].config(image=self.photoImgs[idx])
idx += 1
+
# Restart if all image files non-recognisable
if not self.nextImgNames:
self.getNextImgs()
return
+
# Update title
firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
lastImgIdx = self.imgListIdx
title = self.getExtraInfo(self.nextEolId)
title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})'
self.root.title(title)
+
def accept(self, imgIdx):
""" React to a user selecting an image """
if imgIdx >= len(self.nextImgNames):
@@ -142,12 +158,14 @@ class EolImgReviewer:
os.remove(inFile)
self.numReviewed += 1
self.getNextImgs()
+
def reject(self):
""" React to a user rejecting all images of a set """
for i in range(len(self.nextImgNames)):
os.remove(os.path.join(self.imgDir, self.nextImgNames[i]))
self.numReviewed += 1
self.getNextImgs()
+
def rotate(self, imgIdx, anticlockwise = False):
""" Respond to a user rotating an image """
deg = -90 if not anticlockwise else 90
@@ -155,6 +173,7 @@ class EolImgReviewer:
self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
+
def quit(self, e = None):
print(f'Number reviewed: {self.numReviewed}')
timeElapsed = time.time() - self.startTime
@@ -163,7 +182,7 @@ class EolImgReviewer:
print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
self.extraInfoDbCon.close()
self.root.destroy()
- #
+
def resizeImgForDisplay(self, img):
""" Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """
if max(img.width, img.height) > IMG_DISPLAY_SZ:
@@ -178,6 +197,7 @@ class EolImgReviewer:
int((IMG_DISPLAY_SZ - img.width) / 2),
int((IMG_DISPLAY_SZ - img.height) / 2)))
return bgImg
+
def getExtraInfo(self, eolId: int) -> str:
""" Used to display extra EOL ID info """
query = 'SELECT names.alt_name FROM' \
@@ -193,12 +213,14 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
print('Checking output directory')
if not os.path.exists(outDir):
os.mkdir(outDir)
+
print('Getting input image list')
imgList = os.listdir(imgDir)
imgList.sort(key=lambda s: int(s.split(' ')[0]))
if not imgList:
print('No input images found')
sys.exit(0)
+
# Create GUI and defer control
print('Starting GUI')
root = tki.Tk()
@@ -206,8 +228,7 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
root.mainloop()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB)
diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py
index fa08a8c..69efe79 100755
--- a/backend/tol_data/gen_desc_data.py
+++ b/backend/tol_data/gen_desc_data.py
@@ -5,7 +5,9 @@ Maps nodes to short descriptions, using data from DBpedia and
Wikipedia, and stores results in the database.
"""
-import os, sqlite3
+import argparse
+import os
+import sqlite3
DBPEDIA_DB = os.path.join('dbpedia', 'desc_data.db')
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -16,12 +18,12 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
- #
+
print('Getting node mappings')
nodeToWikiId: dict[str, int] = {}
for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
nodeToWikiId[name] = wikiId
- #
+
print('Reading data from DBpedia')
dbpCon = sqlite3.connect(dbpediaDb)
dbpCur = dbpCon.cursor()
@@ -32,20 +34,22 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
iterNum += 1
if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
- #
+
row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
if row is not None:
nodeToIri[name] = row[0]
+
print('Resolving redirects')
iterNum = 0
for name, iri in nodeToIri.items():
iterNum += 1
if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
- #
+
row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
if row is not None:
nodeToIri[name] = row[0]
+
print('Adding descriptions')
iterNum = 0
for name, iri in nodeToIri.items():
@@ -57,11 +61,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
if row is not None:
dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
del nodeToWikiId[name]
+
dbpCon.close()
- #
+
print('Reading data from Wikipedia')
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
+
print('Adding descriptions')
iterNum = 0
for name, wikiId in nodeToWikiId.items():
@@ -79,14 +85,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone()
if row is not None:
dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
- #
+
print('Closing databases')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(DBPEDIA_DB, ENWIKI_DB, DB_FILE)
diff --git a/backend/tol_data/gen_imgs.py b/backend/tol_data/gen_imgs.py
index 0ba75ec..2479742 100755
--- a/backend/tol_data/gen_imgs.py
+++ b/backend/tol_data/gen_imgs.py
@@ -11,8 +11,11 @@ processing. It uses already-existing database entries to decide what
to skip.
"""
-import os, subprocess
-import sqlite3, urllib.parse
+import argparse
+import os
+import subprocess
+import sqlite3
+import urllib.parse
import signal
IMG_LIST_FILE = 'img_list.txt'
@@ -23,10 +26,11 @@ ENWIKI_IMG_DB = os.path.join('enwiki', 'img_data.db')
PICKED_IMGS_DIR = 'picked_imgs'
PICKED_IMGS_FILE = 'img_data.txt'
DB_FILE = 'data.db'
-#
+
IMG_OUT_SZ = 200
ImgId = tuple[int, str] # Holds an int ID and a source string (eg: 'eol')
+
class PickedImg:
""" Represents a picked-image from pickedImgsDir """
def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str):
@@ -44,9 +48,9 @@ def genImgs(
""" Reads the image-list file, generates images, and updates db """
if not os.path.exists(outDir):
os.mkdir(outDir)
- #
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
print('Checking for image tables')
nodesDone: set[str] = set()
imgsDone: set[ImgId] = set()
@@ -63,15 +67,16 @@ def genImgs(
for imgId, imgSrc in dbCur.execute('SELECT id, src from images'):
imgsDone.add((imgId, imgSrc))
print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')
- #
+
print('Processing picked-images')
success = processPickedImgs(pickedImgsDir, pickedImgsFile, nodesDone, imgsDone, outDir, dbCur)
if success:
print('Processing images from eol and enwiki')
processImgs(imgListFile, eolImgDir, eolImgDb, enwikiImgDb, nodesDone, imgsDone, outDir, dbCur)
- # Close db
+
dbCon.commit()
dbCon.close()
+
def processPickedImgs(
pickedImgsDir: str, pickedImgsFile: str, nodesDone: set[str], imgsDone: set[ImgId],
outDir: str, dbCur: sqlite3.Cursor) -> bool:
@@ -85,25 +90,30 @@ def processPickedImgs(
nodeName = os.path.splitext(filename)[0] # Remove extension
(otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone()
nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit)
+
# Set SIGINT handler
interrupted = False
def onSigint(sig, frame):
nonlocal interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
+
# Convert images
for otolId, imgData in nodeToPickedImg.items():
# Check for SIGINT event
if interrupted:
print('Exiting')
return False
+
# Skip if already processed
if otolId in nodesDone:
continue
+
# Convert image
success = convertImage(os.path.join(pickedImgsDir, imgData.filename), os.path.join(outDir, otolId + '.jpg'))
if not success:
return False
+
# Add entry to db
if (imgData.id, 'picked') not in imgsDone:
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
@@ -112,6 +122,7 @@ def processPickedImgs(
dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked'))
nodesDone.add(otolId)
return True
+
def processImgs(
imgListFile: str, eolImgDir: str, eolImgDb: str, enwikiImgDb: str,
nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool:
@@ -120,12 +131,14 @@ def processImgs(
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
+
# Set SIGINT handler
interrupted = False
def onSigint(sig, frame):
nonlocal interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
+
# Convert images
flag = False # Set to True upon interruption or failure
with open(imgListFile) as file:
@@ -135,19 +148,24 @@ def processImgs(
print('Exiting')
flag = True
break
+
# Skip lines without an image path
if line.find(' ') == -1:
continue
+
# Get filenames
otolId, _, imgPath = line.rstrip().partition(' ')
+
# Skip if already processed
if otolId in nodesDone:
continue
+
# Convert image
success = convertImage(imgPath, os.path.join(outDir, otolId + '.jpg'))
if not success:
flag = True
break
+
# Add entry to db
(nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
fromEol = imgPath.startswith(eolImgDir)
@@ -185,14 +203,17 @@ def processImgs(
(enwikiId, 'enwiki', url, license, artist, credit))
imgsDone.add((enwikiId, 'enwiki'))
dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki'))
+
eolCon.close()
enwikiCon.close()
return not flag
+
def convertImage(imgPath: str, outPath: str):
print(f'Converting {imgPath} to {outPath}')
if os.path.exists(outPath):
print('ERROR: Output image already exists')
return False
+
try:
completedProcess = subprocess.run(
['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
@@ -207,8 +228,7 @@ def convertImage(imgPath: str, outPath: str):
return True
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genImgs(IMG_LIST_FILE, EOL_IMG_DIR, OUT_DIR, EOL_IMG_DB, ENWIKI_IMG_DB, PICKED_IMGS_DIR, PICKED_IMGS_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_linked_imgs.py b/backend/tol_data/gen_linked_imgs.py
index 7002e92..c9d7aac 100755
--- a/backend/tol_data/gen_linked_imgs.py
+++ b/backend/tol_data/gen_linked_imgs.py
@@ -5,11 +5,12 @@ Look for nodes without images in the database, and tries to
associate them with images from their children
"""
+import argparse
import re
import sqlite3
DB_FILE = 'data.db'
-#
+
COMPOUND_NAME_REGEX = re.compile(r'\[(.+) \+ (.+)]')
UP_PROPAGATE_COMPOUND_IMGS = False
@@ -18,14 +19,14 @@ def genData(dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)')
- #
+
print('Getting nodes with images')
nodeToUsedId: dict[str, str] = {} # Maps name of node to otol ID of node to use image for
query = 'SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name'
for name, otolId in dbCur.execute(query):
nodeToUsedId[name] = otolId
print(f'Found {len(nodeToUsedId)}')
- #
+
print('Getting node depths')
nodeToDepth: dict[str, int] = {}
maxDepth = 0
@@ -33,6 +34,7 @@ def genData(dbFile: str) -> None:
for nodeName in nodeToUsedId.keys():
nodeChain = [nodeName]
lastDepth = 0
+
# Add ancestors
while True:
row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone()
@@ -45,11 +47,12 @@ def genData(dbFile: str) -> None:
if nodeName in nodeToDepth:
lastDepth = nodeToDepth[nodeName]
break
+
# Add depths
for i in range(len(nodeChain)):
nodeToDepth[nodeChain[-i-1]] = i + lastDepth
maxDepth = max(maxDepth, lastDepth + len(nodeChain) - 1)
- #
+
print('Finding ancestors to give linked images')
depthToNodes: dict[int, list[str]] = {depth: [] for depth in range(maxDepth + 1)}
for nodeName, depth in nodeToDepth.items():
@@ -70,12 +73,12 @@ def genData(dbFile: str) -> None:
(tips,) = dbCur.execute('SELECT tips FROM nodes WHERE name == ?', (node,)).fetchone()
if parent not in parentToCandidate or parentToCandidate[parent][1] < tips:
parentToCandidate[parent] = (node, tips)
- #
+
print('Replacing linked-images for compound nodes')
for iterNum, node in enumerate(parentToCandidate.keys(), 1):
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
- #
+
match = COMPOUND_NAME_REGEX.fullmatch(node)
if match is not None:
# Replace associated image with subname images
@@ -85,12 +88,15 @@ def genData(dbFile: str) -> None:
otolIdPair[0] = nodeToUsedId[subName1]
if subName2 in nodeToUsedId:
otolIdPair[1] = nodeToUsedId[subName2]
+
# Use no image if both subimages not found
if otolIdPair[0] == '' and otolIdPair[1] == '':
dbCur.execute('DELETE FROM linked_imgs WHERE name = ?', (node,))
continue
+
# Add to db
dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', (','.join(otolIdPair), node))
+
# Possibly repeat operation upon parent/ancestors
if UP_PROPAGATE_COMPOUND_IMGS:
while True:
@@ -104,14 +110,13 @@ def genData(dbFile: str) -> None:
node = parent
continue
break
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(DB_FILE)
diff --git a/backend/tol_data/gen_mapping_data.py b/backend/tol_data/gen_mapping_data.py
index 4373d1d..1ab577b 100755
--- a/backend/tol_data/gen_mapping_data.py
+++ b/backend/tol_data/gen_mapping_data.py
@@ -12,9 +12,12 @@ Based on code from https://github.com/OneZoom/OZtree, located in
OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
"""
+import argparse
import os
from collections import defaultdict
-import gzip, csv, sqlite3
+import gzip
+import csv
+import sqlite3
TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
@@ -43,27 +46,31 @@ def genData(
nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
+
# Get mappings from data input
readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
# Get otol id-to-name map
otolIdToName: dict[int, str] = {}
for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
if nodeId.startswith('ott'):
otolIdToName[int(nodeId[3:])] = nodeName
+
# Add eol mappings
dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
for otolId, eolId in nodeToEolId.items():
if otolId in otolIdToName:
dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+
# Add enwiki mappings
dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
@@ -73,8 +80,10 @@ def genData(
dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
if title in titleToIucnStatus:
dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+
dbCon.commit()
dbCon.close()
+
def readTaxonomyFile(
taxonomyFile: str,
nodeToSrcIds: dict[int, dict[str, int]],
@@ -88,9 +97,11 @@ def readTaxonomyFile(
for lineNum, line in enumerate(file, 1):
if lineNum % 1e5 == 0:
print(f'At line {lineNum}')
+
# Skip header line
if lineNum == 1:
continue
+
# Parse line
fields = line.split('\t|\t')
try:
@@ -99,6 +110,7 @@ def readTaxonomyFile(
print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
continue
srcsField = fields[4]
+
# Add source IDs
for srcPair in srcsField.split(','):
src, srcIdStr = srcPair.split(':', 1)
@@ -111,6 +123,7 @@ def readTaxonomyFile(
nodeToSrcIds[otolId][src] = srcId
usedSrcIds.add((src, srcId))
print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
def readEolIdsFile(
eolIdsFile: str,
nodeToSrcIds: dict[int, dict[str, int]],
@@ -126,9 +139,11 @@ def readEolIdsFile(
for lineNum, row in enumerate(csv.reader(file), 1):
if lineNum % 1e6 == 0:
print(f'At line {lineNum}')
+
# Skip header line
if lineNum == 1:
continue
+
# Parse line
eolId = int(row[3])
srcInt = int(row[2])
@@ -144,7 +159,7 @@ def readEolIdsFile(
srcToEolId[src][srcId] = eolId
print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
# Was about 3.5e6 (4.2e6 without usedSrcIds)
- #
+
print('Resolving candidate EOL IDs')
# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
for otolId, srcInfo in nodeToSrcIds.items():
@@ -161,6 +176,7 @@ def readEolIdsFile(
eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
nodeToEolId[otolId] = min(eolIds)
print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
def readWikidataDb(
wikidataDb: str,
nodeToSrcIds: dict[int, dict[str, int]],
@@ -185,7 +201,7 @@ def readWikidataDb(
# Was about 1.1e6 (1.2e6 without usedSrcIds)
print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
dbCon.close()
- #
+
print('Resolving candidate Wikidata items')
# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
for otolId, srcInfo in nodeToSrcIds.items():
@@ -211,7 +227,7 @@ def readWikidataDb(
nodeToWikiTitle[otolId] = srcToTitle[src]
break
print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
- #
+
print('Adding extra EOL mappings from Wikidata')
wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
addedEntries: dict[int, int] = {}
@@ -222,6 +238,7 @@ def readWikidataDb(
nodeToEolId[otolId] = eolId
addedEntries[otolId] = eolId
print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
def readPickedMappings(
pickedMappings: dict[str, list[str]],
nodeToEolId: dict[int, int],
@@ -248,6 +265,7 @@ def readPickedMappings(
else:
if otolId in nodeToWikiTitle:
del nodeToWikiTitle[otolId]
+
def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
""" Read a db for mappings from enwiki titles to page IDs """
print('Getting enwiki page IDs')
@@ -264,8 +282,7 @@ def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], ti
print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/gen_name_data.py b/backend/tol_data/gen_name_data.py
index 2e92c20..5b6e963 100755
--- a/backend/tol_data/gen_name_data.py
+++ b/backend/tol_data/gen_name_data.py
@@ -5,8 +5,12 @@ Maps nodes to vernacular names, using data from EOL, enwiki, and a
picked-names file, and stores results in the database.
"""
-import re, os
-import html, csv, sqlite3
+import argparse
+import re
+import os
+import html
+import csv
+import sqlite3
EOL_NAMES_FILE = os.path.join('eol', 'vernacularNames.csv')
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -17,25 +21,26 @@ def genData(eolNamesFile: str, enwikiDb: str, pickedNamesFile: str, dbFile: str)
""" Reads the files and adds to db """
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Creating table')
dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
dbCur.execute('CREATE INDEX names_idx ON names(name)')
dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
- #
+
print('Getting node mappings')
nodeToTips: dict[str, int] = {}
for name, tips in dbCur.execute('SELECT name, tips from nodes'):
nodeToTips[name] = tips
- #
+
addEolNames(eolNamesFile, nodeToTips, dbCur)
addEnwikiNames(enwikiDb, nodeToTips, dbCur)
addPickedNames(pickedNamesFile, nodeToTips, dbCur)
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
""" Reads EOL names, associates them with otol nodes, and writes to db """
# The CSV file has a header line, then lines with these fields:
@@ -47,26 +52,31 @@ def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cu
for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
eolIdToNode[eolId] = name
+
print('Adding names from EOL')
namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
with open(eolNamesFile, newline='') as file:
for lineNum, fields in enumerate(csv.reader(file), 1):
if lineNum % 1e5 == 0:
print(f'At line {lineNum}') # Reached about 2.8e6
+
# Skip header line
if lineNum == 1:
continue
+
# Parse line
eolId = int(fields[0])
name = html.unescape(fields[2]).lower()
lang = fields[3]
isPreferred = 1 if fields[6] == 'preferred' else 0
+
# Add to db
if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')'
# The 'OR IGNORE' accounts for duplicate lines
dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
+
def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
""" Reads enwiki names, associates them with otol nodes, and writes to db """
print('Getting enwiki mappings')
@@ -74,6 +84,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
wikiIdToNode[wikiId] = name
+
print('Adding names from enwiki')
altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
enwikiCon = sqlite3.connect(enwikiDb)
@@ -83,7 +94,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}') # Reached about 3.6e5
- #
+
query = 'SELECT p1.title FROM pages p1' \
' INNER JOIN redirects r1 ON p1.id = r1.id' \
' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
@@ -91,6 +102,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
name = name.lower()
if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips:
dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
+
def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
# File format:
# nodename1|altName1|isPreferred1 -> Add an alt-name
@@ -121,8 +133,7 @@ def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqli
dbCur.execute(cmd, (nodeName,))
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(EOL_NAMES_FILE, ENWIKI_DB, PICKED_NAMES_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py
index eba8779..a67ea4b 100755
--- a/backend/tol_data/gen_otol_data.py
+++ b/backend/tol_data/gen_otol_data.py
@@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai
These help resolve cases where multiple nodes share the same name.
"""
-import re, os
-import json, sqlite3
+import argparse
+import re
+import os
+import json
+import sqlite3
TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes
ANN_FILE = os.path.join('otol', 'annotations.json')
DB_FILE = 'data.db'
PICKED_NAMES_FILE = 'picked_otol_names.txt'
+# ========== Classes ==========
+
class Node:
""" Represents a tree-of-life node """
def __init__(self, name, childIds, parentId, tips, pSupport):
@@ -37,13 +42,16 @@ class Node:
self.parentId = parentId
self.tips = tips
self.pSupport = pSupport
+
class BasicStream:
""" Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """
def __init__(self, data, idx=0):
self.data = data
self.idx = idx
+
def hasNext(self) -> bool:
return self.idx < len(self.data)
+
def next(self) -> str:
if self.hasNext():
char = self.data[self.idx]
@@ -51,30 +59,37 @@ class BasicStream:
return char;
else:
return '';
+
def peek(self) -> str:
if self.hasNext():
return self.data[self.idx]
else:
return '';
+
def skipWhitespace(self) -> None:
while self.hasNext() and self.data[self.idx].isspace():
self.idx += 1
+
def progress(self) -> float:
return (self.idx / len(self.data))
+# ========== For data generation ==========
+
def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None:
""" Reads the files and stores the tree info """
nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
- #
+
print('Parsing tree file')
treeStream: BasicStream
with open(treeFile) as file:
treeStream = BasicStream(file.read())
+
# Parse content
parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds)
print('Resolving duplicate names')
+
# Read picked-names file
nameToPickedId: dict[str, str] = {}
if os.path.exists(pickedNamesFile):
@@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
for line in file:
name, _, otolId = line.strip().partition('|')
nameToPickedId[name] = otolId
+
# Resolve duplicates
for dupName, ids in dupNameToIds.items():
# Check for picked id
@@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
if id != idToUse:
nodeMap[id].name += f' [{counter}]'
counter += 1
+
print('Changing mrca* names')
for id, node in nodeMap.items():
if node.name.startswith('mrca'):
convertMrcaName(id, nodeMap)
+
print('Parsing annotations file')
# Read file
with open(annFile) as file:
@@ -116,6 +134,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
node.pSupport = supportQty > 0 and conflictQty == 0
+
print('Creating nodes and edges tables')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
childNode = nodeMap[childId]
dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
(node.name, childNode.name, 1 if childNode.pSupport else 0))
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseNewick(
stream: BasicStream,
nodeMap: dict[str, Node],
@@ -140,6 +161,7 @@ def parseNewick(
""" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
if stream.idx % 1e5 == 0:
print(f'Progress: {stream.progress() * 100:.2f}%')
+
# Find node
stream.skipWhitespace()
if stream.peek() == '':
@@ -151,6 +173,7 @@ def parseNewick(
# Read child
childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds)
childIds.append(childId)
+
# Check for next child or end of node
stream.skipWhitespace()
if stream.peek() == '':
@@ -164,12 +187,15 @@ def parseNewick(
stream.skipWhitespace()
name, id = parseNewickName(stream)
updateNameMaps(name, id, nameToFirstId, dupNameToIds)
+
# Get child num-tips total
tips = 0
for childId in childIds:
tips += nodeMap[childId].tips
+
# Add node to nodeMap
nodeMap[id] = Node(name, childIds, None, tips, False)
+
# Update childrens' parent reference
for childId in childIds:
nodeMap[childId].parentId = id
@@ -179,6 +205,7 @@ def parseNewick(
updateNameMaps(name, id, nameToFirstId, dupNameToIds)
nodeMap[id] = Node(name, [], None, 1, False)
return id
+
def parseNewickName(stream: BasicStream) -> tuple[str, str]:
""" Parses a node name from 'stream', and returns a (name, id) pair """
name: str
@@ -202,6 +229,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
nameChars.append(stream.next())
if stream.peek() == ';': # Ignore trailing input semicolon
stream.next()
+
# Convert to (name, id)
name = ''.join(nameChars).rstrip().lower()
if name.startswith('mrca'):
@@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
if match is None:
raise Exception(f'ERROR: invalid name \'{name}\'')
return (match.group(1).replace('_', ' '), match.group(2))
+
def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None:
""" Update maps upon a newly parsed name """
if name not in nameToFirstId:
@@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI
dupNameToIds[name] = [nameToFirstId[name], id]
else:
dupNameToIds[name].append(id)
+
def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
""" Update a node in a tree to be named after 2 descendants.
Returns the name of one such descendant, for use during recursion. """
@@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
childIds = node.childIds
if len(childIds) < 2:
raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children')
+
# Get 2 children with most tips
childTips = [nodeMap[id].tips for id in childIds]
maxIdx1 = childTips.index(max(childTips))
@@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
childId2 = childIds[maxIdx2]
childName1 = nodeMap[childId1].name
childName2 = nodeMap[childId2].name
+
# Check for mrca* child names
if childName1.startswith('mrca'):
childName1 = convertMrcaName(childId1, nodeMap)
if childName2.startswith('mrca'):
childName2 = convertMrcaName(childId2, nodeMap)
+
# Check for composite names
match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
if match is not None:
@@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
if match is not None:
childName2 = match.group(1)
+
# Create composite name
node.name = f'[{childName1} + {childName2}]'
return childName1
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_pop_data.py b/backend/tol_data/gen_pop_data.py
index e6a646e..4280a12 100755
--- a/backend/tol_data/gen_pop_data.py
+++ b/backend/tol_data/gen_pop_data.py
@@ -5,7 +5,9 @@ Reads enwiki page view info from a database, and stores it
as node popularity values in the database.
"""
-import os, sqlite3
+import argparse
+import os
+import sqlite3
PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
DB_FILE = 'data.db'
@@ -13,7 +15,7 @@ DB_FILE = 'data.db'
def genData(pageviewsDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Getting view counts')
pdbCon = sqlite3.connect(pageviewsDb)
pdbCur = pdbCon.cursor()
@@ -23,23 +25,22 @@ def genData(pageviewsDb: str, dbFile: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}') # Reached 1.6e6
- #
+
row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone()
if row is not None:
nodeToViews[row[0]] = views
pdbCon.close()
- #
+
print(f'Writing {len(nodeToViews)} entries to db')
dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)')
for nodeName, views in nodeToViews.items():
dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views))
- #
+
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEWS_DB, DB_FILE)
diff --git a/backend/tol_data/gen_reduced_trees.py b/backend/tol_data/gen_reduced_trees.py
index 3742544..ce628f7 100755
--- a/backend/tol_data/gen_reduced_trees.py
+++ b/backend/tol_data/gen_reduced_trees.py
@@ -14,12 +14,14 @@ Creates reduced versions of the tree in the database:
removing some more, despite any node descriptions.
"""
-import sys, re
+import argparse
+import sys
+import re
import sqlite3
DB_FILE = 'data.db'
PICKED_NODES_FILE = 'picked_nodes.txt'
-#
+
COMP_NAME_REGEX = re.compile(r'\[.+ \+ .+]') # Used to recognise composite nodes
class Node:
@@ -30,16 +32,18 @@ class Node:
self.tips = tips
self.pSupport = pSupport
+# ========== For data generation ==========
+
def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
print('Opening database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Finding root node')
query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1'
(rootName,) = dbCur.execute(query).fetchone()
print(f'Found \'{rootName}\'')
- #
+
print('=== Getting picked-nodes ===')
pickedNames: set[str] = set()
pickedTreeExists = False
@@ -63,7 +67,7 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
for (name,) in dbCur.execute('SELECT name FROM nodes_p'):
pickedNames.add(name)
print(f'Found {len(pickedNames)} names')
- #
+
if (tree == 'picked' or tree is None) and not pickedTreeExists:
print('=== Generating picked-nodes tree ===')
genPickedNodeTree(dbCur, pickedNames, rootName)
@@ -88,22 +92,27 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
if tree == 'trimmed' or tree is None:
print('=== Generating weakly-trimmed tree ===')
genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName)
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: str) -> None:
PREF_NUM_CHILDREN = 3 # Include extra children up to this limit
+
print('Getting ancestors')
nodeMap = genNodeMap(dbCur, pickedNames, 100)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing composite nodes')
removedNames = removeCompositeNodes(nodeMap)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing \'collapsible\' nodes')
temp = removeCollapsibleNodes(nodeMap, pickedNames)
removedNames.update(temp)
print(f'Result has {len(nodeMap)} nodes')
+
print('Adding some additional nearby children')
namesToAdd: list[str] = []
iterNum = 0
@@ -111,7 +120,7 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st
iterNum += 1
if iterNum % 100 == 0:
print(f'At iteration {iterNum}')
- #
+
numChildren = len(node.children)
if numChildren < PREF_NUM_CHILDREN:
children = [row[0] for row in dbCur.execute('SELECT child FROM edges where parent = ?', (name,))]
@@ -134,33 +143,44 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st
parent = None if parent == '' else parent
nodeMap[name] = Node(id, [], parent, 0, pSupport == 1)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values')
updateTips(rootName, nodeMap)
+
print('Creating table')
addTreeTables(nodeMap, dbCur, 'p')
+
def genImagesOnlyTree(
dbCur: sqlite3.Cursor,
nodesWithImgOrPicked: set[str],
pickedNames: set[str],
rootName: str) -> None:
+
print('Getting ancestors')
nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing composite nodes')
removeCompositeNodes(nodeMap)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing \'collapsible\' nodes')
removeCollapsibleNodes(nodeMap, pickedNames)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values') # Needed for next trimming step
updateTips(rootName, nodeMap)
+
print('Trimming from nodes with \'many\' children')
trimIfManyChildren(nodeMap, rootName, 300, pickedNames)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values')
updateTips(rootName, nodeMap)
+
print('Creating table')
addTreeTables(nodeMap, dbCur, 'i')
+
def genWeaklyTrimmedTree(
dbCur: sqlite3.Cursor,
nodesWithImgDescOrPicked: set[str],
@@ -169,6 +189,7 @@ def genWeaklyTrimmedTree(
print('Getting ancestors')
nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5)
print(f'Result has {len(nodeMap)} nodes')
+
print('Getting nodes to \'strongly keep\'')
iterNum = 0
nodesFromImgOrPicked: set[str] = set()
@@ -184,19 +205,26 @@ def genWeaklyTrimmedTree(
else:
break
print(f'Node set has {len(nodesFromImgOrPicked)} nodes')
+
print('Removing \'collapsible\' nodes')
removeCollapsibleNodes(nodeMap, nodesWithImgDescOrPicked)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values') # Needed for next trimming step
updateTips(rootName, nodeMap)
+
print('Trimming from nodes with \'many\' children')
trimIfManyChildren(nodeMap, rootName, 600, nodesFromImgOrPicked)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values')
updateTips(rootName, nodeMap)
+
print('Creating table')
addTreeTables(nodeMap, dbCur, 't')
-# Helper functions
+
+# ========== Helper functions ==========
+
def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -> dict[str, Node]:
""" Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map """
nodeMap: dict[str, Node] = {}
@@ -206,7 +234,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -
iterNum += 1
if iterNum % itersBeforePrint == 0:
print(f'At iteration {iterNum}')
- #
+
prevName: str | None = None
while name is not None:
if name not in nodeMap:
@@ -227,6 +255,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -
nodeMap[name].children.append(prevName)
break
return nodeMap
+
def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]:
""" Given a tree, removes composite-name nodes, and returns the removed nodes' names """
namesToRemove: set[str] = set()
@@ -244,10 +273,12 @@ def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]:
for name in namesToRemove:
del nodeMap[name]
return namesToRemove
+
def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set()) -> set[str]:
""" Given a tree, removes single-child parents, then only-childs,
with given exceptions, and returns the set of removed nodes' names """
namesToRemove: set[str] = set()
+
# Remove single-child parents
for name, node in nodeMap.items():
if len(node.children) == 1 and node.parent is not None and name not in nodesToKeep:
@@ -262,6 +293,7 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set
namesToRemove.add(name)
for name in namesToRemove:
del nodeMap[name]
+
# Remove only-childs (not redundant because 'nodesToKeep' can cause single-child parents to be kept)
namesToRemove.clear()
for name, node in nodeMap.items():
@@ -277,8 +309,9 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set
namesToRemove.add(name)
for name in namesToRemove:
del nodeMap[name]
- #
+
return namesToRemove
+
def trimIfManyChildren(
nodeMap: dict[str, Node], rootName: str, childThreshold: int, nodesToKeep: set[str] = set()) -> None:
namesToRemove: set[str] = set()
@@ -299,14 +332,17 @@ def trimIfManyChildren(
# Recurse on children
for n in node.children:
findTrimmables(n)
+
def markForRemoval(nodeName: str) -> None:
nonlocal nodeMap, namesToRemove
namesToRemove.add(nodeName)
for child in nodeMap[nodeName].children:
markForRemoval(child)
+
findTrimmables(rootName)
for nodeName in namesToRemove:
del nodeMap[nodeName]
+
def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int:
""" Updates the 'tips' values for a node and it's descendants, returning the node's new 'tips' value """
node = nodeMap[nodeName]
@@ -314,6 +350,7 @@ def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int:
tips = max(1, tips)
node.tips = tips
return tips
+
def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str):
""" Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix """
nodesTbl = f'nodes_{suffix}'
@@ -328,10 +365,11 @@ def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str):
pSupport = 1 if nodeMap[childName].pSupport else 0
dbCur.execute(f'INSERT INTO {edgesTbl} VALUES (?, ?, ?)', (name, childName, pSupport))
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree')
args = parser.parse_args()
- #
+
genData(args.tree, DB_FILE, PICKED_NODES_FILE)
diff --git a/backend/tol_data/review_imgs_to_gen.py b/backend/tol_data/review_imgs_to_gen.py
index 2283ed7..f384ddf 100755
--- a/backend/tol_data/review_imgs_to_gen.py
+++ b/backend/tol_data/review_imgs_to_gen.py
@@ -11,8 +11,11 @@ The program looks for an existing output file to determine what choices
have already been made.
"""
-import os, time
+import argparse
+import os
+import time
import sqlite3
+
import tkinter as tki
from tkinter import ttk
import PIL
@@ -22,7 +25,7 @@ EOL_IMG_DIR = os.path.join('eol', 'imgs')
ENWIKI_IMG_DIR = os.path.join('enwiki', 'imgs')
DB_FILE = 'data.db'
OUT_FILE = 'img_list.txt'
-#
+
IMG_DISPLAY_SZ = 400
PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
REVIEW = 'only pairs' # Can be: 'all', 'only pairs', 'none'
@@ -32,11 +35,13 @@ class ImgReviewer:
def __init__(self, root, nodeToImgs, eolImgDir, enwikiImgDir, outFile, dbCon, review):
self.root = root
root.title('Image Reviewer')
+
# Setup main frame
mainFrame = ttk.Frame(root, padding='5 5 5 5')
mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
+
# Set up images-to-be-reviewed frames
self.eolImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
self.enwikiImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
@@ -47,14 +52,17 @@ class ImgReviewer:
label = ttk.Label(frame, image=self.eolImg if i == 0 else self.enwikiImg)
label.grid(column=0, row=0)
self.labels.append(label)
+
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
+
# Add keyboard bindings
root.bind('<q>', self.quit)
root.bind('<Key-j>', lambda evt: self.accept(0))
root.bind('<Key-k>', lambda evt: self.accept(1))
root.bind('<Key-l>', lambda evt: self.reject())
+
# Set fields
self.nodeImgsList = list(nodeToImgs.items())
self.listIdx = -1
@@ -69,8 +77,10 @@ class ImgReviewer:
self.enwikiImgPath = None
self.numReviewed = 0
self.startTime = time.time()
+
# Initialise images to review
self.getNextImgs()
+
def getNextImgs(self):
""" Updates display with new images to review, or ends program """
# Get next image paths
@@ -81,6 +91,7 @@ class ImgReviewer:
self.quit()
return
self.otolId, imgPaths = self.nodeImgsList[self.listIdx]
+
# Potentially skip user choice
if len(imgPaths) == 1 and (self.review == 'only pairs' or self.review == 'none'):
with open(self.outFile, 'a') as file:
@@ -91,6 +102,7 @@ class ImgReviewer:
file.write(f'{self.otolId} {imgPaths[-1]}\n') # Prefer enwiki image
continue
break
+
# Update displayed images
self.eolImgPath = self.enwikiImgPath = None
imageOpenError = False
@@ -113,20 +125,24 @@ class ImgReviewer:
print(f'Unexpected image path {imgPath}')
self.quit()
return
+
# Re-iterate if all image paths invalid
if self.eolImgPath is None and self.enwikiImgPath is None:
if imageOpenError:
self.reject()
self.getNextImgs()
return
+
# Add placeholder images
if self.eolImgPath is None:
self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
elif self.enwikiImgPath is None:
self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
+
# Update image-frames
self.labels[0].config(image=self.eolImg)
self.labels[1].config(image=self.enwikiImg)
+
# Update title
title = f'Images for otol ID {self.otolId}'
query = 'SELECT names.alt_name FROM' \
@@ -137,6 +153,7 @@ class ImgReviewer:
title += f', aka {row[0]}'
title += f' ({self.listIdx + 1} out of {len(self.nodeImgsList)})'
self.root.title(title)
+
def accept(self, imgIdx):
""" React to a user selecting an image """
imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath
@@ -147,12 +164,14 @@ class ImgReviewer:
file.write(f'{self.otolId} {imgPath}\n')
self.numReviewed += 1
self.getNextImgs()
+
def reject(self):
""""" React to a user rejecting all images of a set """
with open(self.outFile, 'a') as file:
file.write(f'{self.otolId}\n')
self.numReviewed += 1
self.getNextImgs()
+
def quit(self, e = None):
print(f'Number reviewed: {self.numReviewed}')
timeElapsed = time.time() - self.startTime
@@ -161,6 +180,7 @@ class ImgReviewer:
print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
self.dbCon.close()
self.root.destroy()
+
def resizeImgForDisplay(self, img):
""" Returns a copy of an image, shrunk to fit it's frame (keeps aspect ratio), and with a background """
if max(img.width, img.height) > IMG_DISPLAY_SZ:
@@ -180,7 +200,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
print('Opening database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
nodeToImgs: dict[str, list[str]] = {} # Maps otol-ids to arrays of image paths
print('Iterating through images from EOL')
if os.path.exists(eolImgDir):
@@ -198,6 +218,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
if not found:
print(f'WARNING: No node found for {os.path.join(eolImgDir, filename)}')
print(f'Result: {len(nodeToImgs)} nodes with images')
+
print('Iterating through images from Wikipedia')
if os.path.exists(enwikiImgDir):
for filename in os.listdir(enwikiImgDir):
@@ -214,7 +235,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
if not found:
print(f'WARNING: No node found for {os.path.join(enwikiImgDir, filename)}')
print(f'Result: {len(nodeToImgs)} nodes with images')
- #
+
print('Filtering out already-made image choices')
oldSz = len(nodeToImgs)
if os.path.exists(outFile):
@@ -225,7 +246,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
line = line[:line.find(' ')]
del nodeToImgs[line]
print(f'Filtered out {oldSz - len(nodeToImgs)} entries')
- #
+
# Create GUI and defer control
print('Starting GUI')
root = tki.Tk()
@@ -234,8 +255,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
reviewImgs(EOL_IMG_DIR, ENWIKI_IMG_DIR, DB_FILE, OUT_FILE, REVIEW)
diff --git a/backend/tol_data/wikidata/gen_taxon_src_data.py b/backend/tol_data/wikidata/gen_taxon_src_data.py
index 1bddb6e..d2a3811 100755
--- a/backend/tol_data/wikidata/gen_taxon_src_data.py
+++ b/backend/tol_data/wikidata/gen_taxon_src_data.py
@@ -30,10 +30,21 @@ OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
# Possibly related: https://github.com/python/cpython/issues/72882
-import sys, os, re, math, io
+import argparse
+import sys
+import os
+import re
+import math
+import io
from collections import defaultdict
-import bz2, json, sqlite3
-import multiprocessing, indexed_bzip2, pickle, tempfile
+import bz2
+import json
+import sqlite3
+
+import multiprocessing
+import indexed_bzip2
+import pickle
+import tempfile
WIKIDATA_FILE = 'latest-all.json.bz2'
OFFSETS_FILE = 'offsets.dat'
@@ -49,9 +60,12 @@ IUCN_STATUS_IDS = {
'Q11394': 'endangered', 'Q219127': 'critically endangered', 'Q239509': 'extinct in the wild',
'Q237350': 'extinct species', 'Q3245245': 'data deficient'
}
+
# For filtering lines before parsing JSON
LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")').encode())
+# ========== For data generation ==========
+
def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
""" Reads the dump and writes source/iucn info to db """
# Maps to populate
@@ -59,10 +73,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
idToTitle: dict[int, str] = {} # Maps wikidata ID to enwiki title
idToAltId: dict[int, int] = {} # Maps taxon-item wikidata ID to taxon-alt ID (eg: 'canis lupus familiaris' -> 'dog')
idToIucnStatus: dict[int, str] = {} # Maps wikidata ID to iucn-status string ('least concern', etc)
+
# Check db
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
+
# Read dump
if nProcs == 1:
with bz2.open(wikidataFile, mode='rb') as file:
@@ -76,6 +92,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
with indexed_bzip2.open(wikidataFile) as file:
with open(offsetsFile, 'wb') as file2:
pickle.dump(file.block_offsets(), file2)
+
print('Allocating file into chunks')
fileSz: int # About 1.4 TB
with indexed_bzip2.open(wikidataFile) as file:
@@ -86,6 +103,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1]
# Each adjacent pair specifies a start+end byte index for readDumpChunk()
print(f'- Chunk size: {chunkSz:,}')
+
print('Starting processes to read dump')
with tempfile.TemporaryDirectory() as tempDirName:
# Using maxtasksperchild=1 to free resources on task completion
@@ -103,7 +121,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
idToTitle.update(maps[1])
idToAltId.update(maps[2])
idToIucnStatus.update(maps[3])
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -127,6 +145,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
# The 'OR IGNORE' allows for multiple taxons using the same alt
dbCon.commit()
dbCon.close()
+
def readDumpLine(
lineBytes: bytes,
srcIdToId: dict[str, dict[int, int]],
@@ -160,6 +179,7 @@ def readDumpLine(
return
if not isTaxon and not altTaxa:
return
+
# Get wikidata ID and enwiki title
itemId: int | None = None
itemTitle: str | None = None
@@ -172,11 +192,13 @@ def readDumpLine(
itemTitle = None
else:
return
+
# Update maps
if itemTitle is not None:
idToTitle[itemId] = itemTitle
for altId in altTaxa:
idToAltId[altId] = itemId
+
# Check for source IDs
for srcPropId, src in SRC_PROP_IDS.items():
if srcPropId in claims:
@@ -185,6 +207,7 @@ def readDumpLine(
srcIdToId[src][srcId] = itemId
except (KeyError, ValueError):
continue
+
# Check for IUCN status
if 'P141' in claims: # Check for 'iucn conservation status' statement
try:
@@ -192,9 +215,11 @@ def readDumpLine(
idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId]
except KeyError:
pass
+
def readDumpChunkOneParam(params: tuple[int, str, str, int, int, str]) -> str:
""" Forwards to readDumpChunk(), for use with pool.map() """
return readDumpChunk(*params)
+
def readDumpChunk(
procId: int, wikidataFile: str, offsetsFile: str, startByte: int, endByte: int, outFilename: str) -> str:
""" Reads lines in the dump that begin after a start-byte, and not after an end byte.
@@ -205,18 +230,21 @@ def readDumpChunk(
dict[int, str],
dict[int, int],
dict[int, str]] = (defaultdict(dict), {}, {}, {})
+
# Read dump
with indexed_bzip2.open(wikidataFile) as file:
# Load offsets file
with open(offsetsFile, 'rb') as file2:
offsets = pickle.load(file2)
file.set_block_offsets(offsets)
+
# Seek to chunk
if startByte != -1:
file.seek(startByte)
file.readline()
else:
startByte = 0 # Used for progress calculation
+
# Read lines
count = 0
while file.tell() <= endByte:
@@ -225,15 +253,17 @@ def readDumpChunk(
perc = (file.tell() - startByte) / (endByte - startByte) * 100
print(f'Thread {procId}: {perc:.2f}%')
readDumpLine(file.readline(), *maps)
+
# Output results into file
with open(outFilename, 'wb') as file:
pickle.dump(maps, file)
return outFilename
+# ========== Main block ==========
+
if __name__ == '__main__': # Guard needed for multiprocessing
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
multiprocessing.set_start_method('spawn')
genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS)