aboutsummaryrefslogtreecommitdiff
path: root/backend/tol_data
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2023-01-29 11:30:47 +1100
committerTerry Truong <terry06890@gmail.com>2023-01-29 11:30:47 +1100
commit8781fdb2b8c530a6c1531ae9e82221eb062e34fb (patch)
treeffd824aa9b945d69b47f012617ee13d98764d078 /backend/tol_data
parentf5e87ae628bab0eef97b3e3e62f6d71cca9c99c0 (diff)
Adjust backend coding style
Add line spacing, section comments, and import consistency
Diffstat (limited to 'backend/tol_data')
-rwxr-xr-xbackend/tol_data/dbpedia/gen_desc_data.py21
-rwxr-xr-xbackend/tol_data/enwiki/download_img_license_info.py30
-rwxr-xr-xbackend/tol_data/enwiki/download_imgs.py24
-rwxr-xr-xbackend/tol_data/enwiki/gen_desc_data.py45
-rwxr-xr-xbackend/tol_data/enwiki/gen_dump_index_db.py16
-rwxr-xr-xbackend/tol_data/enwiki/gen_img_data.py36
-rwxr-xr-xbackend/tol_data/enwiki/gen_pageview_data.py28
-rwxr-xr-xbackend/tol_data/enwiki/lookup_page.py9
-rwxr-xr-xbackend/tol_data/eol/download_imgs.py28
-rwxr-xr-xbackend/tol_data/eol/gen_images_list_db.py13
-rwxr-xr-xbackend/tol_data/eol/review_imgs.py33
-rwxr-xr-xbackend/tol_data/gen_desc_data.py23
-rwxr-xr-xbackend/tol_data/gen_imgs.py36
-rwxr-xr-xbackend/tol_data/gen_linked_imgs.py23
-rwxr-xr-xbackend/tol_data/gen_mapping_data.py31
-rwxr-xr-xbackend/tol_data/gen_name_data.py29
-rwxr-xr-xbackend/tol_data/gen_otol_data.py45
-rwxr-xr-xbackend/tol_data/gen_pop_data.py15
-rwxr-xr-xbackend/tol_data/gen_reduced_trees.py62
-rwxr-xr-xbackend/tol_data/review_imgs_to_gen.py34
-rwxr-xr-xbackend/tol_data/wikidata/gen_taxon_src_data.py42
21 files changed, 456 insertions, 167 deletions
diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py
index 50418e0..f8a665a 100755
--- a/backend/tol_data/dbpedia/gen_desc_data.py
+++ b/backend/tol_data/dbpedia/gen_desc_data.py
@@ -6,8 +6,10 @@ Adds DBpedia labels/types/abstracts/etc data into a database
# In testing, this script took a few hours to run, and generated about 10GB
+import argparse
import re
-import bz2, sqlite3
+import bz2
+import sqlite3
LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries
IDS_FILE = 'page_lang=en_ids.ttl.bz2'
@@ -24,7 +26,7 @@ def genData(
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Reading/storing label data')
dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)')
dbCur.execute('CREATE INDEX labels_idx ON labels(label)')
@@ -38,7 +40,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing wiki page ids')
dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX ids_idx ON ids(id)')
@@ -55,7 +57,7 @@ def genData(
except sqlite3.IntegrityError as e:
# Accounts for certain lines that have the same IRI
print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}')
- #
+
print('Reading/storing redirection data')
dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)')
redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')
@@ -67,7 +69,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing diambiguation-page data')
dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)')
disambigLineRegex = redirLineRegex
@@ -79,7 +81,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', (match.group(1),))
- #
+
print('Reading/storing instance-type data')
dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)')
dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)')
@@ -92,7 +94,7 @@ def genData(
if match is None:
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2)))
- #
+
print('Reading/storing abstracts')
dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)')
descLineRegex = labelLineRegex
@@ -107,14 +109,13 @@ def genData(
raise Exception(f'ERROR: Line {lineNum} has unexpected format')
dbCur.execute('INSERT INTO abstracts VALUES (?, ?)',
(match.group(1), match.group(2).replace(r'\"', '"')))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py
index 17e15b4..6efc7a4 100755
--- a/backend/tol_data/enwiki/download_img_license_info.py
+++ b/backend/tol_data/enwiki/download_img_license_info.py
@@ -9,13 +9,19 @@ The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""
+import argparse
import re
-import sqlite3, urllib.parse, html
+import sqlite3
+
import requests
-import time, signal
+import urllib.parse
+import html
+
+import time
+import signal
IMG_DB = 'img_data.db'
-#
+
API_URL = 'https://en.wikipedia.org/w/api.php'
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
BATCH_SZ = 50 # Max 50
@@ -30,19 +36,19 @@ def downloadInfo(imgDb: str) -> None:
if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None:
dbCur.execute('CREATE TABLE imgs (' \
'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)')
- #
+
print('Reading image names')
imgNames: set[str] = set()
for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'):
imgNames.add(imgName)
print(f'Found {len(imgNames)}')
- #
+
print('Checking for already-processed images')
oldSz = len(imgNames)
for (imgName,) in dbCur.execute('SELECT name FROM imgs'):
imgNames.discard(imgName)
print(f'Found {oldSz - len(imgNames)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -51,7 +57,7 @@ def downloadInfo(imgDb: str) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Iterating through image names')
imgNameList = list(imgNames)
iterNum = 0
@@ -62,9 +68,11 @@ def downloadInfo(imgDb: str) -> None:
if interrupted:
print(f'Exiting loop at iteration {iterNum}')
break
+
# Get batch
imgBatch = imgNameList[i:i+BATCH_SZ]
imgBatch = ['File:' + x for x in imgBatch]
+
# Make request
headers = {
'user-agent': USER_AGENT,
@@ -87,6 +95,7 @@ def downloadInfo(imgDb: str) -> None:
print(f'ERROR: Exception while downloading info: {e}')
print('\tImage batch: ' + '|'.join(imgBatch))
continue
+
# Parse response-object
if 'query' not in responseObj or 'pages' not in responseObj['query']:
print('WARNING: Response object doesn\'t have page data')
@@ -126,6 +135,7 @@ def downloadInfo(imgDb: str) -> None:
artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None
credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
+
# Remove markup
if artist is not None:
artist = TAG_REGEX.sub(' ', artist).strip()
@@ -137,17 +147,17 @@ def downloadInfo(imgDb: str) -> None:
credit = WHITESPACE_REGEX.sub(' ', credit)
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
+
# Add to db
dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)',
(title, license, artist, credit, restrictions, url))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadInfo(IMG_DB)
diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py
index c6a1c21..164289d 100755
--- a/backend/tol_data/enwiki/download_imgs.py
+++ b/backend/tol_data/enwiki/download_imgs.py
@@ -11,14 +11,20 @@ in the output directory do decide what to skip.
# In testing, this downloaded about 100k images, over several days
-import re, os
+import argparse
+import re
+import os
import sqlite3
-import urllib.parse, requests
-import time, signal
+
+import requests
+import urllib.parse
+
+import time
+import signal
IMG_DB = 'img_data.db' # About 130k image names
OUT_DIR = 'imgs'
-#
+
LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE)
USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
TIMEOUT = 1
@@ -34,7 +40,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
for filename in fileList:
pageIdsDone.add(int(os.path.splitext(filename)[0]))
print(f'Found {len(pageIdsDone)}')
- #
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -43,7 +49,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
- #
+
print('Opening database')
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
@@ -57,6 +63,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if interrupted:
print('Exiting loop')
break
+
# Check for problematic attributes
if license is None or LICENSE_REGEX.fullmatch(license) is None:
continue
@@ -66,6 +73,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
continue
if restrictions is not None and restrictions != '':
continue
+
# Download image
iterNum += 1
print(f'Iteration {iterNum}: Downloading for page-id {pageId}')
@@ -87,12 +95,12 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
except Exception as e:
print(f'Error while downloading to {outFile}: {e}')
return
+
print('Closing database')
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
downloadImgs(IMG_DB, OUT_DIR, TIMEOUT)
diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py
index b3fde52..44e4d6f 100755
--- a/backend/tol_data/enwiki/gen_desc_data.py
+++ b/backend/tol_data/enwiki/gen_desc_data.py
@@ -7,10 +7,16 @@ and adds them to a database
# In testing, this script took over 10 hours to run, and generated about 5GB
-import sys, os, re
+import argparse
+import sys
+import os
+import re
import bz2
-import html, mwxml, mwparserfromhell
import sqlite3
+import html
+
+import mwxml
+import mwparserfromhell
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'
@@ -19,14 +25,17 @@ DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}')
+PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
+LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
def convertTemplateReplace(match):
    """
    Replacement callback for regex-substitution with CONVERT_TEMPLATE_REGEX.

    Yields '<value> <unit>' when the optional range part (group 2) did not match,
    and '<value> <joiner> <value2> <unit>' when it did.
    """
    firstVal, joiner, secondVal, unit = match.group(1, 2, 3, 4)
    if joiner is None:
        # No 'to'/'-' range present, so only a single quantity was given
        return f'{firstVal} {unit}'
    return f'{firstVal} {joiner} {secondVal} {unit}'
-PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)')
-LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*')
+
+# ========== For data generation ==========
def genData(dumpFile: str, dbFile: str) -> None:
print('Creating database')
@@ -39,13 +48,13 @@ def genData(dumpFile: str, dbFile: str) -> None:
dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)')
dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)')
dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)')
- #
+
print('Iterating through dump file')
with bz2.open(dumpFile, mode='rt') as file:
for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1):
if pageNum % 1e4 == 0:
print(f'At page {pageNum}')
- # Parse page
+
if page.namespace == 0:
try:
dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title)))
@@ -60,15 +69,22 @@ def genData(dumpFile: str, dbFile: str) -> None:
desc = parseDesc(revision.text)
if desc is not None:
dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc))
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseDesc(text: str) -> str | None:
- # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
- # and then accumulate lines until a blank one.
- # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
- # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
+ """
+ Looks for a description in wikitext content.
+
+ Finds first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+ and then accumulates lines until a blank one.
+
+ Some cases not accounted for include:
+ disambiguation pages, abstracts with sentences split-across-lines,
+ nested embedded html, 'content significant' embedded-html, and markup not removable with mwparserfromhell.
+ """
lines: list[str] = []
openBraceCount = 0
openBracketCount = 0
@@ -108,6 +124,7 @@ def parseDesc(text: str) -> str | None:
if lines:
return removeMarkup(' '.join(lines))
return None
+
def removeMarkup(content: str) -> str:
content = EMBEDDED_HTML_REGEX.sub('', content)
content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content)
@@ -115,12 +132,14 @@ def removeMarkup(content: str) -> str:
content = PARENS_GROUP_REGEX.sub('', content)
content = LEFTOVER_BRACE_REGEX.sub('', content)
return content
+
def convertTitle(title: str) -> str:
    """ Decodes HTML entities in a title, then replaces underscores with spaces """
    unescaped = html.unescape(title)
    return unescaped.replace('_', ' ')
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(DUMP_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py
index 5778680..12a8a10 100755
--- a/backend/tol_data/enwiki/gen_dump_index_db.py
+++ b/backend/tol_data/enwiki/gen_dump_index_db.py
@@ -1,9 +1,13 @@
#!/usr/bin/python3
"""
-Adds data from the wiki dump index-file into a database
+Converts data from the wiki-dump index-file into a database
"""
-import sys, os, re
+
+import argparse
+import sys
+import os
+import re
import bz2
import sqlite3
@@ -14,10 +18,12 @@ def genData(indexFile: str, dbFile: str) -> None:
""" Reads the index file and creates the db """
if os.path.exists(dbFile):
raise Exception(f'ERROR: Existing {dbFile}')
+
print('Creating database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)')
+
print('Iterating through index file')
lineRegex = re.compile(r'([^:]+):([^:]+):(.*)')
lastOffset = 0
@@ -28,7 +34,7 @@ def genData(indexFile: str, dbFile: str) -> None:
lineNum += 1
if lineNum % 1e5 == 0:
print(f'At line {lineNum}')
- #
+
match = lineRegex.fullmatch(line.rstrip())
assert match is not None
offsetStr, pageId, title = match.group(1,2,3)
@@ -48,13 +54,13 @@ def genData(indexFile: str, dbFile: str) -> None:
dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1))
except sqlite3.IntegrityError as e:
print(f'Failed on title "{t}": {e}', file=sys.stderr)
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(INDEX_FILE, DB_FILE)
diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py
index 040f223..2c243f3 100755
--- a/backend/tol_data/enwiki/gen_img_data.py
+++ b/backend/tol_data/enwiki/gen_img_data.py
@@ -8,31 +8,39 @@ The program can be re-run with an updated set of page IDs, and
will skip already-processed page IDs.
"""
+import argparse
import re
-import os, bz2, html, urllib.parse
+import os
+import bz2
+import html
+import urllib.parse
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dump_index.db'
IMG_DB = 'img_data.db' # The database to create
DB_FILE = os.path.join('..', 'data.db')
-#
+
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE)
CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE)
+# ========== For data generation ==========
+
def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
print('Opening databases')
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
+
print('Checking tables')
if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
# Create tables if not present
- imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL
+ imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)')
+ # 'img_name' values are set to NULL to indicate page IDs where no image was found
imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
else:
# Check for already-processed page IDs
@@ -44,7 +52,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
else:
print(f'Found already-processed page ID {pid} which was not in input set')
print(f'Will skip {numSkipped} already-processed page IDs')
- #
+
print('Getting dump-file offsets')
offsetToPageids: dict[int, list[int]] = {}
offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets
@@ -53,7 +61,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
- #
+
query = 'SELECT offset, next_offset FROM offsets WHERE id = ?'
row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone()
if row is None:
@@ -65,7 +73,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
print(f'Found {len(offsetToEnd)} chunks to check')
- #
+
print('Iterating through chunks in dump file')
with open(dumpFile, mode='rb') as file:
iterNum = 0
@@ -73,7 +81,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
iterNum += 1
if iterNum % 100 == 0:
print(f'At iteration {iterNum}')
- #
+
chunkPageIds = offsetToPageids[pageOffset]
# Jump to chunk
file.seek(pageOffset)
@@ -126,14 +134,15 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
break
if not foundText:
print(f'WARNING: Did not find <text> for page id {pageId}')
- #
+
print('Closing databases')
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()
+
def getImageName(content: list[str]) -> str | None:
""" Given an array of text-content lines, tries to return an infobox image name, or None """
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
+ # Note: Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
for line in content:
match = IMG_LINE_REGEX.match(line)
if match is not None:
@@ -174,6 +183,8 @@ def getImageName(content: list[str]) -> str | None:
return None
return None
+# ========== For getting input page IDs ==========
+
def getInputPageIdsFromDb(dbFile: str) -> set[int]:
print('Getting input page-ids')
pageIds: set[int] = set()
@@ -182,12 +193,15 @@ def getInputPageIdsFromDb(dbFile: str) -> set[int]:
for (pageId,) in dbCur.execute('SELECT id from wiki_ids'):
pageIds.add(pageId)
dbCon.close()
+
print(f'Found {len(pageIds)}')
return pageIds
+
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
pageIds = getInputPageIdsFromDb(DB_FILE)
genData(pageIds, DUMP_FILE, INDEX_DB, IMG_DB)
diff --git a/backend/tol_data/enwiki/gen_pageview_data.py b/backend/tol_data/enwiki/gen_pageview_data.py
index 8aee1cc..95b4a60 100755
--- a/backend/tol_data/enwiki/gen_pageview_data.py
+++ b/backend/tol_data/enwiki/gen_pageview_data.py
@@ -3,27 +3,34 @@
"""
Reads through wikimedia files containing pageview counts,
computes average counts, and adds them to a database
+
+Each pageview file has lines that seem to hold these space-separated fields:
+ wiki code (eg: en.wikipedia), article title, page ID (may be: null),
+ platform (eg: mobile-web), monthly view count,
+ daily count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
"""
# Took about 15min per file (each had about 180e6 lines)
-import sys, os, glob, math, re
+import argparse
+import sys
+import os
+import glob
+import math
+import re
from collections import defaultdict
-import bz2, sqlite3
+import bz2
+import sqlite3
PAGEVIEW_FILES = glob.glob('./pageviews/pageviews-*-user.bz2')
DUMP_INDEX_DB = 'dump_index.db'
DB_FILE = 'pageview_data.db'
def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
- # Each pageview file has lines that seem to hold these space-separated fields:
- # wiki code (eg: en.wikipedia), article title, page ID (may be: null),
- # platform (eg: mobile-web), monthly view count,
- # hourly count string (eg: A1B2 means 1 view on day 1 and 2 views on day 2)
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
- #
+
namespaceRegex = re.compile(r'[a-zA-Z]+:')
titleToViews: dict[str, int] = defaultdict(int)
linePrefix = b'en.wikipedia '
@@ -35,17 +42,19 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
print(f'At line {lineNum}')
if not line.startswith(linePrefix):
continue
+
# Get second and second-last fields
line = line[len(linePrefix):line.rfind(b' ')] # Remove first and last fields
title = line[:line.find(b' ')].decode('utf-8')
viewCount = int(line[line.rfind(b' ')+1:])
if namespaceRegex.match(title) is not None:
continue
+
# Update map
title = title.replace('_', ' ')
titleToViews[title] += viewCount
print(f'Found {len(titleToViews)} titles')
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -62,8 +71,7 @@ def genData(pageviewFiles: list[str], dumpIndexDb: str, dbFile: str) -> None:
idbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEW_FILES, DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/enwiki/lookup_page.py b/backend/tol_data/enwiki/lookup_page.py
index f744818..c4d0932 100755
--- a/backend/tol_data/enwiki/lookup_page.py
+++ b/backend/tol_data/enwiki/lookup_page.py
@@ -5,6 +5,7 @@ Looks up a page with title title1 in the wiki dump, using the dump-index
db, and prints the corresponding <page>.
"""
+import argparse
import sys
import bz2
import sqlite3
@@ -24,7 +25,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
_, pageOffset, endOffset = row
dbCon.close()
print(f'Found chunk at offset {pageOffset}')
- #
+
print('Reading from wiki dump')
content: list[str] = []
with open(dumpFile, mode='rb') as file:
@@ -32,6 +33,7 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
file.seek(pageOffset)
compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset)
data = bz2.BZ2Decompressor().decompress(compressedData).decode()
+
# Look in chunk for page
lines = data.splitlines()
lineIdx = 0
@@ -58,14 +60,13 @@ def lookupPage(dumpFile: str, indexDb: str, pageTitle: str) -> None:
if line.lstrip() == '</page>':
break
lineIdx += 1
- #
+
print('Content: ')
print('\n'.join(content))
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('title', help='The title to look up')
args = parser.parse_args()
- #
+
lookupPage(DUMP_FILE, INDEX_DB, args.title.replace('_', ' '))
diff --git a/backend/tol_data/eol/download_imgs.py b/backend/tol_data/eol/download_imgs.py
index 8454a35..5757032 100755
--- a/backend/tol_data/eol/download_imgs.py
+++ b/backend/tol_data/eol/download_imgs.py
@@ -13,9 +13,16 @@ already-downloaded files, and continues after the one with
highest EOL ID.
"""
-import sys, re, os, random
+import argparse
+import sys
+import re
+import os
+import random
import sqlite3
-import urllib.parse, requests
+
+import requests
+import urllib.parse
+
import time
from threading import Thread
import signal
@@ -23,7 +30,7 @@ import signal
IMAGES_LIST_DB = 'images_list.db'
OUT_DIR = 'imgs_for_review'
DB_FILE = os.path.join('..', 'data.db')
-#
+
MAX_IMGS_PER_ID = 3
MAX_THREADS = 5
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
@@ -43,7 +50,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
eolIdList = sorted(eolIds)
nextIdx = 0
print(f'Result: {len(eolIdList)} EOL IDs')
- #
+
print('Checking output directory')
if not os.path.exists(outDir):
os.mkdir(outDir)
@@ -57,7 +64,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
if nextIdx == len(eolIdList):
print('No IDs left. Exiting...')
return
- #
+
print('Starting download threads')
numThreads = 0
threadException: Exception | None = None # Used for ending main thread after a non-main thread exception
@@ -81,6 +88,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
print(f'Error while downloading to {outFile}: {str(e)}', file=sys.stderr)
threadException = e
numThreads -= 1
+
# Manage downloading
for idx in range(nextIdx, len(eolIdList)):
eolId = eolIdList[idx]
@@ -96,9 +104,11 @@ def downloadImgs(eolIds, imagesListDb, outDir):
if len(extension) <= 1:
print(f'WARNING: No filename extension found in URL {url}', file=sys.stderr)
continue
+
# Check image-quantity limit
if len(ownerSet) == MAX_IMGS_PER_ID:
break
+
# Check for skip conditions
if re.fullmatch(LICENSE_REGEX, license) is None:
continue
@@ -107,11 +117,13 @@ def downloadImgs(eolIds, imagesListDb, outDir):
if copyrightOwner in ownerSet:
continue
ownerSet.add(copyrightOwner)
+
# Determine output filename
outPath = os.path.join(outDir, f'{eolId} {contentId}{extension}')
if os.path.exists(outPath):
print(f'WARNING: {outPath} already exists. Skipping download.')
continue
+
# Check thread limit
while numThreads == MAX_THREADS:
time.sleep(1)
@@ -122,6 +134,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
time.sleep(1)
exitLoop = True
break
+
# Perform download
print(f'Downloading image to {outPath}')
numThreads += 1
@@ -129,6 +142,7 @@ def downloadImgs(eolIds, imagesListDb, outDir):
thread.start()
if exitLoop:
break
+
# Close images-list db
while numThreads > 0:
time.sleep(1)
@@ -143,10 +157,10 @@ def getEolIdsFromDb(dbFile) -> set[int]:
eolIds.add(id)
dbCon.close()
return eolIds
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
eolIds = getEolIdsFromDb(DB_FILE)
downloadImgs(eolIds, IMAGES_LIST_DB, OUT_DIR)
diff --git a/backend/tol_data/eol/gen_images_list_db.py b/backend/tol_data/eol/gen_images_list_db.py
index ee57ac6..3e5bea1 100755
--- a/backend/tol_data/eol/gen_images_list_db.py
+++ b/backend/tol_data/eol/gen_images_list_db.py
@@ -4,8 +4,12 @@
Generates a sqlite db from a directory of CSV files holding EOL image data
"""
-import os, glob
-import csv, re, sqlite3
+import argparse
+import os
+import glob
+import csv
+import re
+import sqlite3
IMAGE_LISTS_GLOB = os.path.join('imagesList', '*.csv')
DB_FILE = 'images_list.db'
@@ -18,6 +22,7 @@ def genData(imageListsGlob: str, dbFile: str) -> None:
' (content_id INT PRIMARY KEY, page_id INT, source_url TEXT,' \
' copy_url TEXT, license TEXT, copyright_owner TEXT)')
dbCur.execute('CREATE INDEX images_pid_idx ON images(page_id)')
+
print('Reading CSV files')
for filename in glob.glob(imageListsGlob):
print(f'Processing {filename}')
@@ -27,13 +32,13 @@ def genData(imageListsGlob: str, dbFile: str) -> None:
continue
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
(int(contentId), int(pageId), sourceUrl, copyUrl, license, owner))
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(IMAGE_LISTS_GLOB, DB_FILE)
diff --git a/backend/tol_data/eol/review_imgs.py b/backend/tol_data/eol/review_imgs.py
index 9fb462c..145f338 100755
--- a/backend/tol_data/eol/review_imgs.py
+++ b/backend/tol_data/eol/review_imgs.py
@@ -7,8 +7,13 @@ choose an image to keep, or reject all. Also provides image rotation.
Chosen images are placed in another directory, and rejected ones are deleted.
"""
-import sys, re, os, time
+import argparse
+import sys
+import re
+import os
+import time
import sqlite3
+
import tkinter as tki
from tkinter import ttk
import PIL
@@ -17,7 +22,7 @@ from PIL import ImageTk, Image, ImageOps
IMG_DIR = 'imgs_for_review'
OUT_DIR = 'imgs'
EXTRA_INFO_DB = os.path.join('..', 'data.db')
-#
+
IMG_DISPLAY_SZ = 400
MAX_IMGS_PER_ID = 3
IMG_BG_COLOR = (88, 28, 135)
@@ -28,11 +33,13 @@ class EolImgReviewer:
def __init__(self, root, imgDir, imgList, extraInfoDb, outDir):
self.root = root
root.title('EOL Image Reviewer')
+
# Setup main frame
mainFrame = ttk.Frame(root, padding='5 5 5 5')
mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
+
# Set up images-to-be-reviewed frames
self.imgs = [PLACEHOLDER_IMG] * MAX_IMGS_PER_ID # Stored as fields for use in rotation
self.photoImgs = list(map(lambda img: ImageTk.PhotoImage(img), self.imgs)) # Image objects usable by tkinter
@@ -44,9 +51,11 @@ class EolImgReviewer:
label = ttk.Label(frame, image=self.photoImgs[i])
label.grid(column=0, row=0)
self.labels.append(label)
+
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
+
# Add keyboard bindings
root.bind('<q>', self.quit)
root.bind('<Key-j>', lambda evt: self.accept(0))
@@ -59,6 +68,7 @@ class EolImgReviewer:
root.bind('<Key-A>', lambda evt: self.rotate(0, True))
root.bind('<Key-S>', lambda evt: self.rotate(1, True))
root.bind('<Key-D>', lambda evt: self.rotate(2, True))
+
# Initialise fields
self.imgDir = imgDir
self.imgList = imgList
@@ -67,13 +77,15 @@ class EolImgReviewer:
self.nextEolId = 0
self.nextImgNames: list[str] = []
self.rotations: list[int] = []
+
# For displaying extra info
self.extraInfoDbCon = sqlite3.connect(extraInfoDb)
self.extraInfoDbCur = self.extraInfoDbCon.cursor()
self.numReviewed = 0
self.startTime = time.time()
- #
+
self.getNextImgs()
+
def getNextImgs(self):
""" Updates display with new images to review, or ends program """
# Gather names of next images to review
@@ -95,6 +107,7 @@ class EolImgReviewer:
self.nextImgNames.append(imgName)
self.rotations.append(0)
self.imgListIdx += 1
+
# Update displayed images
idx = 0
while idx < MAX_IMGS_PER_ID:
@@ -113,16 +126,19 @@ class EolImgReviewer:
self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
self.labels[idx].config(image=self.photoImgs[idx])
idx += 1
+
# Restart if all image files non-recognisable
if not self.nextImgNames:
self.getNextImgs()
return
+
# Update title
firstImgIdx = self.imgListIdx - len(self.nextImgNames) + 1
lastImgIdx = self.imgListIdx
title = self.getExtraInfo(self.nextEolId)
title += f' (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})'
self.root.title(title)
+
def accept(self, imgIdx):
""" React to a user selecting an image """
if imgIdx >= len(self.nextImgNames):
@@ -142,12 +158,14 @@ class EolImgReviewer:
os.remove(inFile)
self.numReviewed += 1
self.getNextImgs()
+
def reject(self):
""" React to a user rejecting all images of a set """
for i in range(len(self.nextImgNames)):
os.remove(os.path.join(self.imgDir, self.nextImgNames[i]))
self.numReviewed += 1
self.getNextImgs()
+
def rotate(self, imgIdx, anticlockwise = False):
""" Respond to a user rotating an image """
deg = -90 if not anticlockwise else 90
@@ -155,6 +173,7 @@ class EolImgReviewer:
self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
+
def quit(self, e = None):
print(f'Number reviewed: {self.numReviewed}')
timeElapsed = time.time() - self.startTime
@@ -163,7 +182,7 @@ class EolImgReviewer:
print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
self.extraInfoDbCon.close()
self.root.destroy()
- #
+
def resizeImgForDisplay(self, img):
""" Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background """
if max(img.width, img.height) > IMG_DISPLAY_SZ:
@@ -178,6 +197,7 @@ class EolImgReviewer:
int((IMG_DISPLAY_SZ - img.width) / 2),
int((IMG_DISPLAY_SZ - img.height) / 2)))
return bgImg
+
def getExtraInfo(self, eolId: int) -> str:
""" Used to display extra EOL ID info """
query = 'SELECT names.alt_name FROM' \
@@ -193,12 +213,14 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
print('Checking output directory')
if not os.path.exists(outDir):
os.mkdir(outDir)
+
print('Getting input image list')
imgList = os.listdir(imgDir)
imgList.sort(key=lambda s: int(s.split(' ')[0]))
if not imgList:
print('No input images found')
sys.exit(0)
+
# Create GUI and defer control
print('Starting GUI')
root = tki.Tk()
@@ -206,8 +228,7 @@ def reviewImgs(imgDir: str, outDir: str, extraInfoDb: str):
root.mainloop()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
reviewImgs(IMG_DIR, OUT_DIR, EXTRA_INFO_DB)
diff --git a/backend/tol_data/gen_desc_data.py b/backend/tol_data/gen_desc_data.py
index fa08a8c..69efe79 100755
--- a/backend/tol_data/gen_desc_data.py
+++ b/backend/tol_data/gen_desc_data.py
@@ -5,7 +5,9 @@ Maps nodes to short descriptions, using data from DBpedia and
Wikipedia, and stores results in the database.
"""
-import os, sqlite3
+import argparse
+import os
+import sqlite3
DBPEDIA_DB = os.path.join('dbpedia', 'desc_data.db')
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -16,12 +18,12 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)')
- #
+
print('Getting node mappings')
nodeToWikiId: dict[str, int] = {}
for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
nodeToWikiId[name] = wikiId
- #
+
print('Reading data from DBpedia')
dbpCon = sqlite3.connect(dbpediaDb)
dbpCur = dbpCon.cursor()
@@ -32,20 +34,22 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
iterNum += 1
if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
- #
+
row = dbpCur.execute('SELECT iri FROM ids where id = ?', (wikiId,)).fetchone()
if row is not None:
nodeToIri[name] = row[0]
+
print('Resolving redirects')
iterNum = 0
for name, iri in nodeToIri.items():
iterNum += 1
if iterNum % 1e5 == 0:
print(f'At iteration {iterNum}')
- #
+
row = dbpCur.execute('SELECT target FROM redirects where iri = ?', (iri,)).fetchone()
if row is not None:
nodeToIri[name] = row[0]
+
print('Adding descriptions')
iterNum = 0
for name, iri in nodeToIri.items():
@@ -57,11 +61,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
if row is not None:
dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (nodeToWikiId[name], row[0], 1))
del nodeToWikiId[name]
+
dbpCon.close()
- #
+
print('Reading data from Wikipedia')
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
+
print('Adding descriptions')
iterNum = 0
for name, wikiId in nodeToWikiId.items():
@@ -79,14 +85,13 @@ def genData(dbpediaDb: str, enwikiDb: str, dbFile: str) -> None:
row = enwikiCur.execute('SELECT desc FROM descs where id = ?', (wikiIdToGet,)).fetchone()
if row is not None:
dbCur.execute('INSERT OR IGNORE INTO descs VALUES (?, ?, ?)', (wikiId, row[0], 0))
- #
+
print('Closing databases')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(DBPEDIA_DB, ENWIKI_DB, DB_FILE)
diff --git a/backend/tol_data/gen_imgs.py b/backend/tol_data/gen_imgs.py
index 0ba75ec..2479742 100755
--- a/backend/tol_data/gen_imgs.py
+++ b/backend/tol_data/gen_imgs.py
@@ -11,8 +11,11 @@ processing. It uses already-existing database entries to decide what
to skip.
"""
-import os, subprocess
-import sqlite3, urllib.parse
+import argparse
+import os
+import subprocess
+import sqlite3
+import urllib.parse
import signal
IMG_LIST_FILE = 'img_list.txt'
@@ -23,10 +26,11 @@ ENWIKI_IMG_DB = os.path.join('enwiki', 'img_data.db')
PICKED_IMGS_DIR = 'picked_imgs'
PICKED_IMGS_FILE = 'img_data.txt'
DB_FILE = 'data.db'
-#
+
IMG_OUT_SZ = 200
ImgId = tuple[int, str] # Holds an int ID and a source string (eg: 'eol')
+
class PickedImg:
""" Represents a picked-image from pickedImgsDir """
def __init__(self, nodeName: str, id: int, filename: str, url: str, license: str, artist: str, credit: str):
@@ -44,9 +48,9 @@ def genImgs(
""" Reads the image-list file, generates images, and updates db """
if not os.path.exists(outDir):
os.mkdir(outDir)
- #
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
print('Checking for image tables')
nodesDone: set[str] = set()
imgsDone: set[ImgId] = set()
@@ -63,15 +67,16 @@ def genImgs(
for imgId, imgSrc in dbCur.execute('SELECT id, src from images'):
imgsDone.add((imgId, imgSrc))
print(f'Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip')
- #
+
print('Processing picked-images')
success = processPickedImgs(pickedImgsDir, pickedImgsFile, nodesDone, imgsDone, outDir, dbCur)
if success:
print('Processing images from eol and enwiki')
processImgs(imgListFile, eolImgDir, eolImgDb, enwikiImgDb, nodesDone, imgsDone, outDir, dbCur)
- # Close db
+
dbCon.commit()
dbCon.close()
+
def processPickedImgs(
pickedImgsDir: str, pickedImgsFile: str, nodesDone: set[str], imgsDone: set[ImgId],
outDir: str, dbCur: sqlite3.Cursor) -> bool:
@@ -85,25 +90,30 @@ def processPickedImgs(
nodeName = os.path.splitext(filename)[0] # Remove extension
(otolId,) = dbCur.execute('SELECT id FROM nodes WHERE name = ?', (nodeName,)).fetchone()
nodeToPickedImg[otolId] = PickedImg(nodeName, lineNum, filename, url, license, artist, credit)
+
# Set SIGINT handler
interrupted = False
def onSigint(sig, frame):
nonlocal interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
+
# Convert images
for otolId, imgData in nodeToPickedImg.items():
# Check for SIGINT event
if interrupted:
print('Exiting')
return False
+
# Skip if already processed
if otolId in nodesDone:
continue
+
# Convert image
success = convertImage(os.path.join(pickedImgsDir, imgData.filename), os.path.join(outDir, otolId + '.jpg'))
if not success:
return False
+
# Add entry to db
if (imgData.id, 'picked') not in imgsDone:
dbCur.execute('INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)',
@@ -112,6 +122,7 @@ def processPickedImgs(
dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (imgData.nodeName, imgData.id, 'picked'))
nodesDone.add(otolId)
return True
+
def processImgs(
imgListFile: str, eolImgDir: str, eolImgDb: str, enwikiImgDb: str,
nodesDone: set[str], imgsDone: set[ImgId], outDir: str, dbCur: sqlite3.Cursor) -> bool:
@@ -120,12 +131,14 @@ def processImgs(
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
+
# Set SIGINT handler
interrupted = False
def onSigint(sig, frame):
nonlocal interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
+
# Convert images
flag = False # Set to True upon interruption or failure
with open(imgListFile) as file:
@@ -135,19 +148,24 @@ def processImgs(
print('Exiting')
flag = True
break
+
# Skip lines without an image path
if line.find(' ') == -1:
continue
+
# Get filenames
otolId, _, imgPath = line.rstrip().partition(' ')
+
# Skip if already processed
if otolId in nodesDone:
continue
+
# Convert image
success = convertImage(imgPath, os.path.join(outDir, otolId + '.jpg'))
if not success:
flag = True
break
+
# Add entry to db
(nodeName,) = dbCur.execute('SELECT name FROM nodes WHERE id = ?', (otolId,)).fetchone()
fromEol = imgPath.startswith(eolImgDir)
@@ -185,14 +203,17 @@ def processImgs(
(enwikiId, 'enwiki', url, license, artist, credit))
imgsDone.add((enwikiId, 'enwiki'))
dbCur.execute('INSERT INTO node_imgs VALUES (?, ?, ?)', (nodeName, enwikiId, 'enwiki'))
+
eolCon.close()
enwikiCon.close()
return not flag
+
def convertImage(imgPath: str, outPath: str):
print(f'Converting {imgPath} to {outPath}')
if os.path.exists(outPath):
print('ERROR: Output image already exists')
return False
+
try:
completedProcess = subprocess.run(
['npx', 'smartcrop-cli', '--width', str(IMG_OUT_SZ), '--height', str(IMG_OUT_SZ), imgPath, outPath],
@@ -207,8 +228,7 @@ def convertImage(imgPath: str, outPath: str):
return True
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genImgs(IMG_LIST_FILE, EOL_IMG_DIR, OUT_DIR, EOL_IMG_DB, ENWIKI_IMG_DB, PICKED_IMGS_DIR, PICKED_IMGS_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_linked_imgs.py b/backend/tol_data/gen_linked_imgs.py
index 7002e92..c9d7aac 100755
--- a/backend/tol_data/gen_linked_imgs.py
+++ b/backend/tol_data/gen_linked_imgs.py
@@ -5,11 +5,12 @@ Look for nodes without images in the database, and tries to
associate them with images from their children
"""
+import argparse
import re
import sqlite3
DB_FILE = 'data.db'
-#
+
COMPOUND_NAME_REGEX = re.compile(r'\[(.+) \+ (.+)]')
UP_PROPAGATE_COMPOUND_IMGS = False
@@ -18,14 +19,14 @@ def genData(dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute('CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)')
- #
+
print('Getting nodes with images')
nodeToUsedId: dict[str, str] = {} # Maps name of node to otol ID of node to use image for
query = 'SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name'
for name, otolId in dbCur.execute(query):
nodeToUsedId[name] = otolId
print(f'Found {len(nodeToUsedId)}')
- #
+
print('Getting node depths')
nodeToDepth: dict[str, int] = {}
maxDepth = 0
@@ -33,6 +34,7 @@ def genData(dbFile: str) -> None:
for nodeName in nodeToUsedId.keys():
nodeChain = [nodeName]
lastDepth = 0
+
# Add ancestors
while True:
row = dbCur.execute('SELECT parent FROM edges WHERE child = ?', (nodeName,)).fetchone()
@@ -45,11 +47,12 @@ def genData(dbFile: str) -> None:
if nodeName in nodeToDepth:
lastDepth = nodeToDepth[nodeName]
break
+
# Add depths
for i in range(len(nodeChain)):
nodeToDepth[nodeChain[-i-1]] = i + lastDepth
maxDepth = max(maxDepth, lastDepth + len(nodeChain) - 1)
- #
+
print('Finding ancestors to give linked images')
depthToNodes: dict[int, list[str]] = {depth: [] for depth in range(maxDepth + 1)}
for nodeName, depth in nodeToDepth.items():
@@ -70,12 +73,12 @@ def genData(dbFile: str) -> None:
(tips,) = dbCur.execute('SELECT tips FROM nodes WHERE name == ?', (node,)).fetchone()
if parent not in parentToCandidate or parentToCandidate[parent][1] < tips:
parentToCandidate[parent] = (node, tips)
- #
+
print('Replacing linked-images for compound nodes')
for iterNum, node in enumerate(parentToCandidate.keys(), 1):
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}')
- #
+
match = COMPOUND_NAME_REGEX.fullmatch(node)
if match is not None:
# Replace associated image with subname images
@@ -85,12 +88,15 @@ def genData(dbFile: str) -> None:
otolIdPair[0] = nodeToUsedId[subName1]
if subName2 in nodeToUsedId:
otolIdPair[1] = nodeToUsedId[subName2]
+
# Use no image if both subimages not found
if otolIdPair[0] == '' and otolIdPair[1] == '':
dbCur.execute('DELETE FROM linked_imgs WHERE name = ?', (node,))
continue
+
# Add to db
dbCur.execute('UPDATE linked_imgs SET otol_ids = ? WHERE name = ?', (','.join(otolIdPair), node))
+
# Possibly repeat operation upon parent/ancestors
if UP_PROPAGATE_COMPOUND_IMGS:
while True:
@@ -104,14 +110,13 @@ def genData(dbFile: str) -> None:
node = parent
continue
break
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(DB_FILE)
diff --git a/backend/tol_data/gen_mapping_data.py b/backend/tol_data/gen_mapping_data.py
index 4373d1d..1ab577b 100755
--- a/backend/tol_data/gen_mapping_data.py
+++ b/backend/tol_data/gen_mapping_data.py
@@ -12,9 +12,12 @@ Based on code from https://github.com/OneZoom/OZtree, located in
OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
"""
+import argparse
import os
from collections import defaultdict
-import gzip, csv, sqlite3
+import gzip
+import csv
+import sqlite3
TAXONOMY_FILE = os.path.join('otol', 'taxonomy.tsv')
EOL_IDS_FILE = os.path.join('eol', 'provider_ids.csv.gz')
@@ -43,27 +46,31 @@ def genData(
nodeToWikiTitle: dict[int, str] = {} # Maps otol ID to wikipedia title
titleToIucnStatus: dict[str, str] = {} # Maps wikipedia title to IUCN string
titleToPageId: dict[str, int] = {} # Maps wikipedia title to page ID
+
# Get mappings from data input
readTaxonomyFile(taxonomyFile, nodeToSrcIds, usedSrcIds)
readEolIdsFile(eolIdsFile, nodeToSrcIds, usedSrcIds, nodeToEolId)
readWikidataDb(wikidataDb, nodeToSrcIds, usedSrcIds, nodeToWikiTitle, titleToIucnStatus, nodeToEolId)
readPickedMappings(pickedMappings, nodeToEolId, nodeToWikiTitle)
getEnwikiPageIds(enwikiDumpIndexDb, nodeToWikiTitle, titleToPageId)
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+
# Get otol id-to-name map
otolIdToName: dict[int, str] = {}
for nodeName, nodeId in dbCur.execute('SELECT name, id from nodes'):
if nodeId.startswith('ott'):
otolIdToName[int(nodeId[3:])] = nodeName
+
# Add eol mappings
dbCur.execute('CREATE TABLE eol_ids (name TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX eol_id_idx ON eol_ids(id)')
for otolId, eolId in nodeToEolId.items():
if otolId in otolIdToName:
dbCur.execute('INSERT INTO eol_ids VALUES (?, ?)', (otolIdToName[otolId], eolId))
+
# Add enwiki mappings
dbCur.execute('CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT)')
dbCur.execute('CREATE INDEX wiki_id_idx ON wiki_ids(id)')
@@ -73,8 +80,10 @@ def genData(
dbCur.execute('INSERT INTO wiki_ids VALUES (?, ?)', (otolIdToName[otolId], titleToPageId[title]))
if title in titleToIucnStatus:
dbCur.execute('INSERT INTO node_iucn VALUES (?, ?)', (otolIdToName[otolId], titleToIucnStatus[title]))
+
dbCon.commit()
dbCon.close()
+
def readTaxonomyFile(
taxonomyFile: str,
nodeToSrcIds: dict[int, dict[str, int]],
@@ -88,9 +97,11 @@ def readTaxonomyFile(
for lineNum, line in enumerate(file, 1):
if lineNum % 1e5 == 0:
print(f'At line {lineNum}')
+
# Skip header line
if lineNum == 1:
continue
+
# Parse line
fields = line.split('\t|\t')
try:
@@ -99,6 +110,7 @@ def readTaxonomyFile(
print(f'Skipping non-integral ID {fields[0]} on line {lineNum}')
continue
srcsField = fields[4]
+
# Add source IDs
for srcPair in srcsField.split(','):
src, srcIdStr = srcPair.split(':', 1)
@@ -111,6 +123,7 @@ def readTaxonomyFile(
nodeToSrcIds[otolId][src] = srcId
usedSrcIds.add((src, srcId))
print(f'- Result has {sum([len(v) for v in nodeToSrcIds.values()]):,} entries') # Was about 6.7e6
+
def readEolIdsFile(
eolIdsFile: str,
nodeToSrcIds: dict[int, dict[str, int]],
@@ -126,9 +139,11 @@ def readEolIdsFile(
for lineNum, row in enumerate(csv.reader(file), 1):
if lineNum % 1e6 == 0:
print(f'At line {lineNum}')
+
# Skip header line
if lineNum == 1:
continue
+
# Parse line
eolId = int(row[3])
srcInt = int(row[2])
@@ -144,7 +159,7 @@ def readEolIdsFile(
srcToEolId[src][srcId] = eolId
print(f'- Result has {sum([len(v) for v in srcToEolId.values()]):,} entries')
# Was about 3.5e6 (4.2e6 without usedSrcIds)
- #
+
print('Resolving candidate EOL IDs')
# For each otol ID, find eol IDs with matching sources, and choose the 'best' one
for otolId, srcInfo in nodeToSrcIds.items():
@@ -161,6 +176,7 @@ def readEolIdsFile(
eolIds = [eolId for eolId, count in eolIdToCount.items() if count == maxCount]
nodeToEolId[otolId] = min(eolIds)
print(f'- Result has {len(nodeToEolId):,} entries') # Was about 2.7e6
+
def readWikidataDb(
wikidataDb: str,
nodeToSrcIds: dict[int, dict[str, int]],
@@ -185,7 +201,7 @@ def readWikidataDb(
# Was about 1.1e6 (1.2e6 without usedSrcIds)
print(f'- IUCN map has {len(titleToIucnStatus):,} entries') # Was about 7e4 (7.2e4 without usedSrcIds)
dbCon.close()
- #
+
print('Resolving candidate Wikidata items')
# For each otol ID, find wikidata titles with matching sources, and choose the 'best' one
for otolId, srcInfo in nodeToSrcIds.items():
@@ -211,7 +227,7 @@ def readWikidataDb(
nodeToWikiTitle[otolId] = srcToTitle[src]
break
print(f'- Result has {len(nodeToWikiTitle):,} entries') # Was about 4e5
- #
+
print('Adding extra EOL mappings from Wikidata')
wikiTitleToNode = {title: node for node, title in nodeToWikiTitle.items()}
addedEntries: dict[int, int] = {}
@@ -222,6 +238,7 @@ def readWikidataDb(
nodeToEolId[otolId] = eolId
addedEntries[otolId] = eolId
print(f'- Added {len(addedEntries):,} entries') # Was about 3e3
+
def readPickedMappings(
pickedMappings: dict[str, list[str]],
nodeToEolId: dict[int, int],
@@ -248,6 +265,7 @@ def readPickedMappings(
else:
if otolId in nodeToWikiTitle:
del nodeToWikiTitle[otolId]
+
def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], titleToPageId: dict[str, int]) -> None:
""" Read a db for mappings from enwiki titles to page IDs """
print('Getting enwiki page IDs')
@@ -264,8 +282,7 @@ def getEnwikiPageIds(enwikiDumpIndexDb: str, nodeToWikiTitle: dict[int, str], ti
print(f'Unable to find IDs for {numNotFound} titles') # Was 2913
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(TAXONOMY_FILE, EOL_IDS_FILE, WIKIDATA_DB, PICKED_MAPPINGS, ENWIKI_DUMP_INDEX_DB, DB_FILE)
diff --git a/backend/tol_data/gen_name_data.py b/backend/tol_data/gen_name_data.py
index 2e92c20..5b6e963 100755
--- a/backend/tol_data/gen_name_data.py
+++ b/backend/tol_data/gen_name_data.py
@@ -5,8 +5,12 @@ Maps nodes to vernacular names, using data from EOL, enwiki, and a
picked-names file, and stores results in the database.
"""
-import re, os
-import html, csv, sqlite3
+import argparse
+import re
+import os
+import html
+import csv
+import sqlite3
EOL_NAMES_FILE = os.path.join('eol', 'vernacularNames.csv')
ENWIKI_DB = os.path.join('enwiki', 'desc_data.db')
@@ -17,25 +21,26 @@ def genData(eolNamesFile: str, enwikiDb: str, pickedNamesFile: str, dbFile: str)
""" Reads the files and adds to db """
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Creating table')
dbCur.execute('CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))')
dbCur.execute('CREATE INDEX names_idx ON names(name)')
dbCur.execute('CREATE INDEX names_alt_idx ON names(alt_name)')
dbCur.execute('CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)')
- #
+
print('Getting node mappings')
nodeToTips: dict[str, int] = {}
for name, tips in dbCur.execute('SELECT name, tips from nodes'):
nodeToTips[name] = tips
- #
+
addEolNames(eolNamesFile, nodeToTips, dbCur)
addEnwikiNames(enwikiDb, nodeToTips, dbCur)
addPickedNames(pickedNamesFile, nodeToTips, dbCur)
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
""" Reads EOL names, associates them with otol nodes, and writes to db """
# The CSV file has a header line, then lines with these fields:
@@ -47,26 +52,31 @@ def addEolNames(eolNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cu
for name, eolId in dbCur.execute('SELECT name, id from eol_ids'):
if eolId not in eolIdToNode or nodeToTips[eolIdToNode[eolId]] < nodeToTips[name]:
eolIdToNode[eolId] = name
+
print('Adding names from EOL')
namesToSkip = {'unknown', 'unknown species', 'unidentified species'}
with open(eolNamesFile, newline='') as file:
for lineNum, fields in enumerate(csv.reader(file), 1):
if lineNum % 1e5 == 0:
print(f'At line {lineNum}') # Reached about 2.8e6
+
# Skip header line
if lineNum == 1:
continue
+
# Parse line
eolId = int(fields[0])
name = html.unescape(fields[2]).lower()
lang = fields[3]
isPreferred = 1 if fields[6] == 'preferred' else 0
+
# Add to db
if eolId in eolIdToNode and name not in namesToSkip and name not in nodeToTips \
and lang == 'eng' and len(name.split(' ')) <= 3: # Ignore names with >3 words
cmd = 'INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'eol\')'
# The 'OR IGNORE' accounts for duplicate lines
dbCur.execute(cmd, (eolIdToNode[eolId], name, isPreferred))
+
def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
""" Reads enwiki names, associates them with otol nodes, and writes to db """
print('Getting enwiki mappings')
@@ -74,6 +84,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
for name, wikiId in dbCur.execute('SELECT name, id from wiki_ids'):
if wikiId not in wikiIdToNode or nodeToTips[wikiIdToNode[wikiId]] < nodeToTips[name]:
wikiIdToNode[wikiId] = name
+
print('Adding names from enwiki')
altNameRegex = re.compile(r'[a-z]+') # Avoids names like 'evolution of elephants', 'banana fiber', 'fish (zoology)',
enwikiCon = sqlite3.connect(enwikiDb)
@@ -83,7 +94,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}') # Reached about 3.6e5
- #
+
query = 'SELECT p1.title FROM pages p1' \
' INNER JOIN redirects r1 ON p1.id = r1.id' \
' INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?'
@@ -91,6 +102,7 @@ def addEnwikiNames(enwikiDb: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cur
name = name.lower()
if altNameRegex.fullmatch(name) is not None and name != nodeName and name not in nodeToTips:
dbCur.execute('INSERT OR IGNORE INTO names VALUES (?, ?, ?, \'enwiki\')', (nodeName, name, 0))
+
def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqlite3.Cursor) -> None:
# File format:
# nodename1|altName1|isPreferred1 -> Add an alt-name
@@ -121,8 +133,7 @@ def addPickedNames(pickedNamesFile: str, nodeToTips: dict[str, int], dbCur: sqli
dbCur.execute(cmd, (nodeName,))
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(EOL_NAMES_FILE, ENWIKI_DB, PICKED_NAMES_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_otol_data.py b/backend/tol_data/gen_otol_data.py
index eba8779..a67ea4b 100755
--- a/backend/tol_data/gen_otol_data.py
+++ b/backend/tol_data/gen_otol_data.py
@@ -21,14 +21,19 @@ Reads from a picked-names file, if present, which specifies name and node ID pai
These help resolve cases where multiple nodes share the same name.
"""
-import re, os
-import json, sqlite3
+import argparse
+import re
+import os
+import json
+import sqlite3
TREE_FILE = os.path.join('otol', 'labelled_supertree_ottnames.tre') # Had about 2.5e9 nodes
ANN_FILE = os.path.join('otol', 'annotations.json')
DB_FILE = 'data.db'
PICKED_NAMES_FILE = 'picked_otol_names.txt'
+# ========== Classes ==========
+
class Node:
""" Represents a tree-of-life node """
def __init__(self, name, childIds, parentId, tips, pSupport):
@@ -37,13 +42,16 @@ class Node:
self.parentId = parentId
self.tips = tips
self.pSupport = pSupport
+
class BasicStream:
""" Represents a basic data stream, using a string and index. Used for parsing text with lookahead. """
def __init__(self, data, idx=0):
self.data = data
self.idx = idx
+
def hasNext(self) -> bool:
return self.idx < len(self.data)
+
def next(self) -> str:
if self.hasNext():
char = self.data[self.idx]
@@ -51,30 +59,37 @@ class BasicStream:
return char;
else:
return '';
+
def peek(self) -> str:
if self.hasNext():
return self.data[self.idx]
else:
return '';
+
def skipWhitespace(self) -> None:
while self.hasNext() and self.data[self.idx].isspace():
self.idx += 1
+
def progress(self) -> float:
return (self.idx / len(self.data))
+# ========== For data generation ==========
+
def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> None:
""" Reads the files and stores the tree info """
nodeMap: dict[str, Node] = {} # Maps node IDs to node objects
nameToFirstId: dict[str, str] = {} # Maps node names to first found ID (names might have multiple IDs)
dupNameToIds: dict[str, list[str]] = {} # Maps names of nodes with multiple IDs to those IDs
- #
+
print('Parsing tree file')
treeStream: BasicStream
with open(treeFile) as file:
treeStream = BasicStream(file.read())
+
# Parse content
parseNewick(treeStream, nodeMap, nameToFirstId, dupNameToIds)
print('Resolving duplicate names')
+
# Read picked-names file
nameToPickedId: dict[str, str] = {}
if os.path.exists(pickedNamesFile):
@@ -82,6 +97,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
for line in file:
name, _, otolId = line.strip().partition('|')
nameToPickedId[name] = otolId
+
# Resolve duplicates
for dupName, ids in dupNameToIds.items():
# Check for picked id
@@ -98,10 +114,12 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
if id != idToUse:
nodeMap[id].name += f' [{counter}]'
counter += 1
+
print('Changing mrca* names')
for id, node in nodeMap.items():
if node.name.startswith('mrca'):
convertMrcaName(id, nodeMap)
+
print('Parsing annotations file')
# Read file
with open(annFile) as file:
@@ -116,6 +134,7 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
supportQty = len(nodeAnns['supported_by']) if 'supported_by' in nodeAnns else 0
conflictQty = len(nodeAnns['conflicts_with']) if 'conflicts_with' in nodeAnns else 0
node.pSupport = supportQty > 0 and conflictQty == 0
+
print('Creating nodes and edges tables')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -129,9 +148,11 @@ def genData(treeFile: str, annFile: str, pickedNamesFile: str, dbFile: str) -> N
childNode = nodeMap[childId]
dbCur.execute('INSERT INTO edges VALUES (?, ?, ?)',
(node.name, childNode.name, 1 if childNode.pSupport else 0))
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def parseNewick(
stream: BasicStream,
nodeMap: dict[str, Node],
@@ -140,6 +161,7 @@ def parseNewick(
""" Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID """
if stream.idx % 1e5 == 0:
print(f'Progress: {stream.progress() * 100:.2f}%')
+
# Find node
stream.skipWhitespace()
if stream.peek() == '':
@@ -151,6 +173,7 @@ def parseNewick(
# Read child
childId = parseNewick(stream, nodeMap, nameToFirstId, dupNameToIds)
childIds.append(childId)
+
# Check for next child or end of node
stream.skipWhitespace()
if stream.peek() == '':
@@ -164,12 +187,15 @@ def parseNewick(
stream.skipWhitespace()
name, id = parseNewickName(stream)
updateNameMaps(name, id, nameToFirstId, dupNameToIds)
+
# Get child num-tips total
tips = 0
for childId in childIds:
tips += nodeMap[childId].tips
+
# Add node to nodeMap
nodeMap[id] = Node(name, childIds, None, tips, False)
+
# Update childrens' parent reference
for childId in childIds:
nodeMap[childId].parentId = id
@@ -179,6 +205,7 @@ def parseNewick(
updateNameMaps(name, id, nameToFirstId, dupNameToIds)
nodeMap[id] = Node(name, [], None, 1, False)
return id
+
def parseNewickName(stream: BasicStream) -> tuple[str, str]:
""" Parses a node name from 'stream', and returns a (name, id) pair """
name: str
@@ -202,6 +229,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
nameChars.append(stream.next())
if stream.peek() == ';': # Ignore trailing input semicolon
stream.next()
+
# Convert to (name, id)
name = ''.join(nameChars).rstrip().lower()
if name.startswith('mrca'):
@@ -217,6 +245,7 @@ def parseNewickName(stream: BasicStream) -> tuple[str, str]:
if match is None:
raise Exception(f'ERROR: invalid name \'{name}\'')
return (match.group(1).replace('_', ' '), match.group(2))
+
def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToIds: dict[str, list[str]]) -> None:
""" Update maps upon a newly parsed name """
if name not in nameToFirstId:
@@ -226,6 +255,7 @@ def updateNameMaps(name: str, id: str, nameToFirstId: dict[str, str], dupNameToI
dupNameToIds[name] = [nameToFirstId[name], id]
else:
dupNameToIds[name].append(id)
+
def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
""" Update a node in a tree to be named after 2 descendants.
Returns the name of one such descendant, for use during recursion. """
@@ -234,6 +264,7 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
childIds = node.childIds
if len(childIds) < 2:
raise Exception(f'ERROR: MRCA node \'{name}\' has less than 2 children')
+
# Get 2 children with most tips
childTips = [nodeMap[id].tips for id in childIds]
maxIdx1 = childTips.index(max(childTips))
@@ -243,11 +274,13 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
childId2 = childIds[maxIdx2]
childName1 = nodeMap[childId1].name
childName2 = nodeMap[childId2].name
+
# Check for mrca* child names
if childName1.startswith('mrca'):
childName1 = convertMrcaName(childId1, nodeMap)
if childName2.startswith('mrca'):
childName2 = convertMrcaName(childId2, nodeMap)
+
# Check for composite names
match = re.fullmatch(r'\[(.+) \+ (.+)]', childName1)
if match is not None:
@@ -255,13 +288,15 @@ def convertMrcaName(id: str, nodeMap: dict[str, Node]) -> str:
match = re.fullmatch(r'\[(.+) \+ (.+)]', childName2)
if match is not None:
childName2 = match.group(1)
+
# Create composite name
node.name = f'[{childName1} + {childName2}]'
return childName1
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
genData(TREE_FILE, ANN_FILE, PICKED_NAMES_FILE, DB_FILE)
diff --git a/backend/tol_data/gen_pop_data.py b/backend/tol_data/gen_pop_data.py
index e6a646e..4280a12 100755
--- a/backend/tol_data/gen_pop_data.py
+++ b/backend/tol_data/gen_pop_data.py
@@ -5,7 +5,9 @@ Reads enwiki page view info from a database, and stores it
as node popularity values in the database.
"""
-import os, sqlite3
+import argparse
+import os
+import sqlite3
PAGEVIEWS_DB = os.path.join('enwiki', 'pageview_data.db')
DB_FILE = 'data.db'
@@ -13,7 +15,7 @@ DB_FILE = 'data.db'
def genData(pageviewsDb: str, dbFile: str) -> None:
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Getting view counts')
pdbCon = sqlite3.connect(pageviewsDb)
pdbCur = pdbCon.cursor()
@@ -23,23 +25,22 @@ def genData(pageviewsDb: str, dbFile: str) -> None:
iterNum += 1
if iterNum % 1e4 == 0:
print(f'At iteration {iterNum}') # Reached 1.6e6
- #
+
row = dbCur.execute('SELECT name FROM wiki_ids WHERE id = ?', (wikiId,)).fetchone()
if row is not None:
nodeToViews[row[0]] = views
pdbCon.close()
- #
+
print(f'Writing {len(nodeToViews)} entries to db')
dbCur.execute('CREATE TABLE node_pop (name TEXT PRIMARY KEY, pop INT)')
for nodeName, views in nodeToViews.items():
dbCur.execute('INSERT INTO node_pop VALUES (?, ?)', (nodeName, views))
- #
+
dbCon.commit()
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
genData(PAGEVIEWS_DB, DB_FILE)
diff --git a/backend/tol_data/gen_reduced_trees.py b/backend/tol_data/gen_reduced_trees.py
index 3742544..ce628f7 100755
--- a/backend/tol_data/gen_reduced_trees.py
+++ b/backend/tol_data/gen_reduced_trees.py
@@ -14,12 +14,14 @@ Creates reduced versions of the tree in the database:
removing some more, despite any node descriptions.
"""
-import sys, re
+import argparse
+import sys
+import re
import sqlite3
DB_FILE = 'data.db'
PICKED_NODES_FILE = 'picked_nodes.txt'
-#
+
COMP_NAME_REGEX = re.compile(r'\[.+ \+ .+]') # Used to recognise composite nodes
class Node:
@@ -30,16 +32,18 @@ class Node:
self.tips = tips
self.pSupport = pSupport
+# ========== For data generation ==========
+
def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
print('Opening database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
print('Finding root node')
query = 'SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1'
(rootName,) = dbCur.execute(query).fetchone()
print(f'Found \'{rootName}\'')
- #
+
print('=== Getting picked-nodes ===')
pickedNames: set[str] = set()
pickedTreeExists = False
@@ -63,7 +67,7 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
for (name,) in dbCur.execute('SELECT name FROM nodes_p'):
pickedNames.add(name)
print(f'Found {len(pickedNames)} names')
- #
+
if (tree == 'picked' or tree is None) and not pickedTreeExists:
print('=== Generating picked-nodes tree ===')
genPickedNodeTree(dbCur, pickedNames, rootName)
@@ -88,22 +92,27 @@ def genData(tree: str, dbFile: str, pickedNodesFile: str) -> None:
if tree == 'trimmed' or tree is None:
print('=== Generating weakly-trimmed tree ===')
genWeaklyTrimmedTree(dbCur, nodesWithImgDescOrPicked, nodesWithImgOrPicked, rootName)
- #
+
print('Closing database')
dbCon.commit()
dbCon.close()
+
def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: str) -> None:
PREF_NUM_CHILDREN = 3 # Include extra children up to this limit
+
print('Getting ancestors')
nodeMap = genNodeMap(dbCur, pickedNames, 100)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing composite nodes')
removedNames = removeCompositeNodes(nodeMap)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing \'collapsible\' nodes')
temp = removeCollapsibleNodes(nodeMap, pickedNames)
removedNames.update(temp)
print(f'Result has {len(nodeMap)} nodes')
+
print('Adding some additional nearby children')
namesToAdd: list[str] = []
iterNum = 0
@@ -111,7 +120,7 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st
iterNum += 1
if iterNum % 100 == 0:
print(f'At iteration {iterNum}')
- #
+
numChildren = len(node.children)
if numChildren < PREF_NUM_CHILDREN:
children = [row[0] for row in dbCur.execute('SELECT child FROM edges where parent = ?', (name,))]
@@ -134,33 +143,44 @@ def genPickedNodeTree(dbCur: sqlite3.Cursor, pickedNames: set[str], rootName: st
parent = None if parent == '' else parent
nodeMap[name] = Node(id, [], parent, 0, pSupport == 1)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values')
updateTips(rootName, nodeMap)
+
print('Creating table')
addTreeTables(nodeMap, dbCur, 'p')
+
def genImagesOnlyTree(
dbCur: sqlite3.Cursor,
nodesWithImgOrPicked: set[str],
pickedNames: set[str],
rootName: str) -> None:
+
print('Getting ancestors')
nodeMap = genNodeMap(dbCur, nodesWithImgOrPicked, 1e4)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing composite nodes')
removeCompositeNodes(nodeMap)
print(f'Result has {len(nodeMap)} nodes')
+
print('Removing \'collapsible\' nodes')
removeCollapsibleNodes(nodeMap, pickedNames)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values') # Needed for next trimming step
updateTips(rootName, nodeMap)
+
print('Trimming from nodes with \'many\' children')
trimIfManyChildren(nodeMap, rootName, 300, pickedNames)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values')
updateTips(rootName, nodeMap)
+
print('Creating table')
addTreeTables(nodeMap, dbCur, 'i')
+
def genWeaklyTrimmedTree(
dbCur: sqlite3.Cursor,
nodesWithImgDescOrPicked: set[str],
@@ -169,6 +189,7 @@ def genWeaklyTrimmedTree(
print('Getting ancestors')
nodeMap = genNodeMap(dbCur, nodesWithImgDescOrPicked, 1e5)
print(f'Result has {len(nodeMap)} nodes')
+
print('Getting nodes to \'strongly keep\'')
iterNum = 0
nodesFromImgOrPicked: set[str] = set()
@@ -184,19 +205,26 @@ def genWeaklyTrimmedTree(
else:
break
print(f'Node set has {len(nodesFromImgOrPicked)} nodes')
+
print('Removing \'collapsible\' nodes')
removeCollapsibleNodes(nodeMap, nodesWithImgDescOrPicked)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values') # Needed for next trimming step
updateTips(rootName, nodeMap)
+
print('Trimming from nodes with \'many\' children')
trimIfManyChildren(nodeMap, rootName, 600, nodesFromImgOrPicked)
print(f'Result has {len(nodeMap)} nodes')
+
print('Updating \'tips\' values')
updateTips(rootName, nodeMap)
+
print('Creating table')
addTreeTables(nodeMap, dbCur, 't')
-# Helper functions
+
+# ========== Helper functions ==========
+
def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -> dict[str, Node]:
""" Returns a subtree that includes nodes in 'nameSet', as a name-to-Node map """
nodeMap: dict[str, Node] = {}
@@ -206,7 +234,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -
iterNum += 1
if iterNum % itersBeforePrint == 0:
print(f'At iteration {iterNum}')
- #
+
prevName: str | None = None
while name is not None:
if name not in nodeMap:
@@ -227,6 +255,7 @@ def genNodeMap(dbCur: sqlite3.Cursor, nameSet: set[str], itersBeforePrint = 1) -
nodeMap[name].children.append(prevName)
break
return nodeMap
+
def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]:
""" Given a tree, removes composite-name nodes, and returns the removed nodes' names """
namesToRemove: set[str] = set()
@@ -244,10 +273,12 @@ def removeCompositeNodes(nodeMap: dict[str, Node]) -> set[str]:
for name in namesToRemove:
del nodeMap[name]
return namesToRemove
+
def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set()) -> set[str]:
""" Given a tree, removes single-child parents, then only-childs,
with given exceptions, and returns the set of removed nodes' names """
namesToRemove: set[str] = set()
+
# Remove single-child parents
for name, node in nodeMap.items():
if len(node.children) == 1 and node.parent is not None and name not in nodesToKeep:
@@ -262,6 +293,7 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set
namesToRemove.add(name)
for name in namesToRemove:
del nodeMap[name]
+
# Remove only-childs (not redundant because 'nodesToKeep' can cause single-child parents to be kept)
namesToRemove.clear()
for name, node in nodeMap.items():
@@ -277,8 +309,9 @@ def removeCollapsibleNodes(nodeMap: dict[str, Node], nodesToKeep: set[str] = set
namesToRemove.add(name)
for name in namesToRemove:
del nodeMap[name]
- #
+
return namesToRemove
+
def trimIfManyChildren(
nodeMap: dict[str, Node], rootName: str, childThreshold: int, nodesToKeep: set[str] = set()) -> None:
namesToRemove: set[str] = set()
@@ -299,14 +332,17 @@ def trimIfManyChildren(
# Recurse on children
for n in node.children:
findTrimmables(n)
+
def markForRemoval(nodeName: str) -> None:
nonlocal nodeMap, namesToRemove
namesToRemove.add(nodeName)
for child in nodeMap[nodeName].children:
markForRemoval(child)
+
findTrimmables(rootName)
for nodeName in namesToRemove:
del nodeMap[nodeName]
+
def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int:
""" Updates the 'tips' values for a node and its descendants, returning the node's new 'tips' value """
node = nodeMap[nodeName]
@@ -314,6 +350,7 @@ def updateTips(nodeName: str, nodeMap: dict[str, Node]) -> int:
tips = max(1, tips)
node.tips = tips
return tips
+
def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str):
""" Adds a tree to the database, as tables nodes_X and edges_X, where X is the given suffix """
nodesTbl = f'nodes_{suffix}'
@@ -328,10 +365,11 @@ def addTreeTables(nodeMap: dict[str, Node], dbCur: sqlite3.Cursor, suffix: str):
pSupport = 1 if nodeMap[childName].pSupport else 0
dbCur.execute(f'INSERT INTO {edgesTbl} VALUES (?, ?, ?)', (name, childName, pSupport))
+# ========== Main block ==========
+
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--tree', choices=['picked', 'images', 'trimmed'], help='Only generate the specified tree')
args = parser.parse_args()
- #
+
genData(args.tree, DB_FILE, PICKED_NODES_FILE)
diff --git a/backend/tol_data/review_imgs_to_gen.py b/backend/tol_data/review_imgs_to_gen.py
index 2283ed7..f384ddf 100755
--- a/backend/tol_data/review_imgs_to_gen.py
+++ b/backend/tol_data/review_imgs_to_gen.py
@@ -11,8 +11,11 @@ The program looks for an existing output file to determine what choices
have already been made.
"""
-import os, time
+import argparse
+import os
+import time
import sqlite3
+
import tkinter as tki
from tkinter import ttk
import PIL
@@ -22,7 +25,7 @@ EOL_IMG_DIR = os.path.join('eol', 'imgs')
ENWIKI_IMG_DIR = os.path.join('enwiki', 'imgs')
DB_FILE = 'data.db'
OUT_FILE = 'img_list.txt'
-#
+
IMG_DISPLAY_SZ = 400
PLACEHOLDER_IMG = Image.new('RGB', (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
REVIEW = 'only pairs' # Can be: 'all', 'only pairs', 'none'
@@ -32,11 +35,13 @@ class ImgReviewer:
def __init__(self, root, nodeToImgs, eolImgDir, enwikiImgDir, outFile, dbCon, review):
self.root = root
root.title('Image Reviewer')
+
# Setup main frame
mainFrame = ttk.Frame(root, padding='5 5 5 5')
mainFrame.grid(column=0, row=0, sticky=(tki.N, tki.W, tki.E, tki.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)
+
# Set up images-to-be-reviewed frames
self.eolImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
self.enwikiImg = ImageTk.PhotoImage(PLACEHOLDER_IMG)
@@ -47,14 +52,17 @@ class ImgReviewer:
label = ttk.Label(frame, image=self.eolImg if i == 0 else self.enwikiImg)
label.grid(column=0, row=0)
self.labels.append(label)
+
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
+
# Add keyboard bindings
root.bind('<q>', self.quit)
root.bind('<Key-j>', lambda evt: self.accept(0))
root.bind('<Key-k>', lambda evt: self.accept(1))
root.bind('<Key-l>', lambda evt: self.reject())
+
# Set fields
self.nodeImgsList = list(nodeToImgs.items())
self.listIdx = -1
@@ -69,8 +77,10 @@ class ImgReviewer:
self.enwikiImgPath = None
self.numReviewed = 0
self.startTime = time.time()
+
# Initialise images to review
self.getNextImgs()
+
def getNextImgs(self):
""" Updates display with new images to review, or ends program """
# Get next image paths
@@ -81,6 +91,7 @@ class ImgReviewer:
self.quit()
return
self.otolId, imgPaths = self.nodeImgsList[self.listIdx]
+
# Potentially skip user choice
if len(imgPaths) == 1 and (self.review == 'only pairs' or self.review == 'none'):
with open(self.outFile, 'a') as file:
@@ -91,6 +102,7 @@ class ImgReviewer:
file.write(f'{self.otolId} {imgPaths[-1]}\n') # Prefer enwiki image
continue
break
+
# Update displayed images
self.eolImgPath = self.enwikiImgPath = None
imageOpenError = False
@@ -113,20 +125,24 @@ class ImgReviewer:
print(f'Unexpected image path {imgPath}')
self.quit()
return
+
# Re-iterate if all image paths invalid
if self.eolImgPath is None and self.enwikiImgPath is None:
if imageOpenError:
self.reject()
self.getNextImgs()
return
+
# Add placeholder images
if self.eolImgPath is None:
self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
elif self.enwikiImgPath is None:
self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
+
# Update image-frames
self.labels[0].config(image=self.eolImg)
self.labels[1].config(image=self.enwikiImg)
+
# Update title
title = f'Images for otol ID {self.otolId}'
query = 'SELECT names.alt_name FROM' \
@@ -137,6 +153,7 @@ class ImgReviewer:
title += f', aka {row[0]}'
title += f' ({self.listIdx + 1} out of {len(self.nodeImgsList)})'
self.root.title(title)
+
def accept(self, imgIdx):
""" React to a user selecting an image """
imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath
@@ -147,12 +164,14 @@ class ImgReviewer:
file.write(f'{self.otolId} {imgPath}\n')
self.numReviewed += 1
self.getNextImgs()
+
def reject(self):
""" React to a user rejecting all images of a set """
with open(self.outFile, 'a') as file:
file.write(f'{self.otolId}\n')
self.numReviewed += 1
self.getNextImgs()
+
def quit(self, e = None):
print(f'Number reviewed: {self.numReviewed}')
timeElapsed = time.time() - self.startTime
@@ -161,6 +180,7 @@ class ImgReviewer:
print(f'Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds')
self.dbCon.close()
self.root.destroy()
+
def resizeImgForDisplay(self, img):
""" Returns a copy of an image, shrunk to fit its frame (keeps aspect ratio), and with a background """
if max(img.width, img.height) > IMG_DISPLAY_SZ:
@@ -180,7 +200,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
print('Opening database')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
- #
+
nodeToImgs: dict[str, list[str]] = {} # Maps otol-ids to arrays of image paths
print('Iterating through images from EOL')
if os.path.exists(eolImgDir):
@@ -198,6 +218,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
if not found:
print(f'WARNING: No node found for {os.path.join(eolImgDir, filename)}')
print(f'Result: {len(nodeToImgs)} nodes with images')
+
print('Iterating through images from Wikipedia')
if os.path.exists(enwikiImgDir):
for filename in os.listdir(enwikiImgDir):
@@ -214,7 +235,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
if not found:
print(f'WARNING: No node found for {os.path.join(enwikiImgDir, filename)}')
print(f'Result: {len(nodeToImgs)} nodes with images')
- #
+
print('Filtering out already-made image choices')
oldSz = len(nodeToImgs)
if os.path.exists(outFile):
@@ -225,7 +246,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
line = line[:line.find(' ')]
del nodeToImgs[line]
print(f'Filtered out {oldSz - len(nodeToImgs)} entries')
- #
+
# Create GUI and defer control
print('Starting GUI')
root = tki.Tk()
@@ -234,8 +255,7 @@ def reviewImgs(eolImgDir: str, enwikiImgDir: str, dbFile: str, outFile: str, rev
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
- #
+
reviewImgs(EOL_IMG_DIR, ENWIKI_IMG_DIR, DB_FILE, OUT_FILE, REVIEW)
diff --git a/backend/tol_data/wikidata/gen_taxon_src_data.py b/backend/tol_data/wikidata/gen_taxon_src_data.py
index 1bddb6e..d2a3811 100755
--- a/backend/tol_data/wikidata/gen_taxon_src_data.py
+++ b/backend/tol_data/wikidata/gen_taxon_src_data.py
@@ -30,10 +30,21 @@ OZprivate/ServerScripts/TaxonMappingAndPopularity/ (22 Aug 2022).
# - Using pool.map() instead of pool.imap_unordered(), which seems to hang in some cases (was using python 3.8).
# Possibly related: https://github.com/python/cpython/issues/72882
-import sys, os, re, math, io
+import argparse
+import sys
+import os
+import re
+import math
+import io
from collections import defaultdict
-import bz2, json, sqlite3
-import multiprocessing, indexed_bzip2, pickle, tempfile
+import bz2
+import json
+import sqlite3
+
+import multiprocessing
+import indexed_bzip2
+import pickle
+import tempfile
WIKIDATA_FILE = 'latest-all.json.bz2'
OFFSETS_FILE = 'offsets.dat'
@@ -49,9 +60,12 @@ IUCN_STATUS_IDS = {
'Q11394': 'endangered', 'Q219127': 'critically endangered', 'Q239509': 'extinct in the wild',
'Q237350': 'extinct species', 'Q3245245': 'data deficient'
}
+
# For filtering lines before parsing JSON
LINE_REGEX = re.compile(('"id":(?:"' + '"|"'.join([s for s in TAXON_IDS + TAXON_ALT_IDS]) + '")').encode())
+# ========== For data generation ==========
+
def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> None:
""" Reads the dump and writes source/iucn info to db """
# Maps to populate
@@ -59,10 +73,12 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
idToTitle: dict[int, str] = {} # Maps wikidata ID to enwiki title
idToAltId: dict[int, int] = {} # Maps taxon-item wikidata ID to taxon-alt ID (eg: 'canis lupus familiaris' -> 'dog')
idToIucnStatus: dict[int, str] = {} # Maps wikidata ID to iucn-status string ('least concern', etc)
+
# Check db
if os.path.exists(dbFile):
print('ERROR: Database already exists')
sys.exit(1)
+
# Read dump
if nProcs == 1:
with bz2.open(wikidataFile, mode='rb') as file:
@@ -76,6 +92,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
with indexed_bzip2.open(wikidataFile) as file:
with open(offsetsFile, 'wb') as file2:
pickle.dump(file.block_offsets(), file2)
+
print('Allocating file into chunks')
fileSz: int # About 1.4 TB
with indexed_bzip2.open(wikidataFile) as file:
@@ -86,6 +103,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
chunkIdxs = [-1] + [chunkSz * i for i in range(1, nProcs)] + [fileSz-1]
# Each adjacent pair specifies a start+end byte index for readDumpChunk()
print(f'- Chunk size: {chunkSz:,}')
+
print('Starting processes to read dump')
with tempfile.TemporaryDirectory() as tempDirName:
# Using maxtasksperchild=1 to free resources on task completion
@@ -103,7 +121,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
idToTitle.update(maps[1])
idToAltId.update(maps[2])
idToIucnStatus.update(maps[3])
- #
+
print('Writing to db')
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
@@ -127,6 +145,7 @@ def genData(wikidataFile: str, offsetsFile: str, dbFile: str, nProcs: int) -> No
# The 'OR IGNORE' allows for multiple taxons using the same alt
dbCon.commit()
dbCon.close()
+
def readDumpLine(
lineBytes: bytes,
srcIdToId: dict[str, dict[int, int]],
@@ -160,6 +179,7 @@ def readDumpLine(
return
if not isTaxon and not altTaxa:
return
+
# Get wikidata ID and enwiki title
itemId: int | None = None
itemTitle: str | None = None
@@ -172,11 +192,13 @@ def readDumpLine(
itemTitle = None
else:
return
+
# Update maps
if itemTitle is not None:
idToTitle[itemId] = itemTitle
for altId in altTaxa:
idToAltId[altId] = itemId
+
# Check for source IDs
for srcPropId, src in SRC_PROP_IDS.items():
if srcPropId in claims:
@@ -185,6 +207,7 @@ def readDumpLine(
srcIdToId[src][srcId] = itemId
except (KeyError, ValueError):
continue
+
# Check for IUCN status
if 'P141' in claims: # Check for 'iucn conservation status' statement
try:
@@ -192,9 +215,11 @@ def readDumpLine(
idToIucnStatus[itemId] = IUCN_STATUS_IDS[iucnStatusId]
except KeyError:
pass
+
def readDumpChunkOneParam(params: tuple[int, str, str, int, int, str]) -> str:
""" Forwards to readDumpChunk(), for use with pool.map() """
return readDumpChunk(*params)
+
def readDumpChunk(
procId: int, wikidataFile: str, offsetsFile: str, startByte: int, endByte: int, outFilename: str) -> str:
""" Reads lines in the dump that begin after a start-byte, and not after an end byte.
@@ -205,18 +230,21 @@ def readDumpChunk(
dict[int, str],
dict[int, int],
dict[int, str]] = (defaultdict(dict), {}, {}, {})
+
# Read dump
with indexed_bzip2.open(wikidataFile) as file:
# Load offsets file
with open(offsetsFile, 'rb') as file2:
offsets = pickle.load(file2)
file.set_block_offsets(offsets)
+
# Seek to chunk
if startByte != -1:
file.seek(startByte)
file.readline()
else:
startByte = 0 # Used for progress calculation
+
# Read lines
count = 0
while file.tell() <= endByte:
@@ -225,15 +253,17 @@ def readDumpChunk(
perc = (file.tell() - startByte) / (endByte - startByte) * 100
print(f'Thread {procId}: {perc:.2f}%')
readDumpLine(file.readline(), *maps)
+
# Output results into file
with open(outFilename, 'wb') as file:
pickle.dump(maps, file)
return outFilename
+# ========== Main block ==========
+
if __name__ == '__main__': # Guard needed for multiprocessing
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
args = parser.parse_args()
- #
+
multiprocessing.set_start_method('spawn')
genData(WIKIDATA_FILE, OFFSETS_FILE, DB_FILE, N_PROCS)