diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-22 23:16:42 +1000 |
| commit | abb936f5d76f7fe5cec1e8948d287da86643d504 (patch) | |
| tree | f07b9eaadf5ae91363fdbac9d81b74e1fb0a436f | |
| parent | e78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff) | |
Refactor backend scripts (extended-db)
25 files changed, 876 insertions, 721 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 7d1adad..f5b35f0 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -1,17 +1,50 @@ This directory holds files used to generate data.db, which contains tree-of-life data. # Tables: -- `nodes`: `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` -- `edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` -- `eol_ids`: `id INT PRIMARY KEY, name TEXT` -- `names`: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` -- `wiki_ids`: `name TEXT PRIMARY KEY, id INT, redirected INT` -- `descs`: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` -- `node_imgs`: `name TEXT PRIMARY KEY, img_id INT, src TEXT` -- `images`: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)` -- `linked_imgs`: `name TEXT PRIMARY KEY, otol_ids TEXT` -- `r_nodes`: `name TEXT PRIMARY KEY, tips INT` -- `r_edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` +## Tree Structure data +- `nodes` <br> + Format : `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` <br> + Represents a tree-of-life node. `tips` represents the number of no-child descendants. +- `edges` <br> + Format: `parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child)` <br> + `p_support` is 1 if the edge has 'phylogenetic support', and 0 otherwise +## Node name data +- `eol_ids` <br> + Format: `id INT PRIMARY KEY, name TEXT` <br> + Associates an EOL ID with a node's name. +- `names` <br> + Format: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` <br> + Associates a node with alternative names. + `pref_alt` is 1 if the alt-name is the most 'preferred' one. + `src` indicates the dataset the alt-name was obtained from (can be 'eol', 'enwiki', or 'picked'). +## Node description data +- `wiki_ids` <br> + Format: `name TEXT PRIMARY KEY, id INT, redirected INT` <br> + Associates a node with a wikipedia page ID. 
+ `redirected` is 1 if the node was associated with a different page that redirected to this one. +- `descs` <br> + Format: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` <br> + Associates a wikipedia page ID with a short-description. + `from_dbp` is 1 if the description was obtained from DBpedia, and 0 otherwise. +## Node image data +- `node_imgs` <br> + Format: `name TEXT PRIMARY KEY, img_id INT, src TEXT` <br> + Associates a node with an image. +- `images` <br> + Format: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)` <br> + Represents an image, identified by a source ('eol', 'enwiki', or 'picked'), and a source-specific ID. +- `linked_imgs` <br> + Format: `name TEXT PRIMARY KEY, otol_ids TEXT` <br> + Associates a node with an image from another node. + `otol_ids` can be an otol ID, or two comma-separated otol IDs or empty strings. + The latter is used for compound nodes. +## Reduced-tree data +- `r_nodes` <br> + Format: `name TEXT PRIMARY KEY, tips INT` <br> + Like `nodes`, but for a reduced tree. +- `r_edges` <br> + Format: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` <br> + Like `edges` but for a reduced tree. # Generating the Database @@ -68,7 +101,7 @@ Some of the python scripts require third-party packages: - pickedEnwikiNamesToSkip.txt: Same as with genDbpData.py. - pickedEnwikiLabels.txt: Similar to pickedDbpLabels.txt. -## Generate image data +## Generate node image data ### Get images from EOL 1. Obtain 'image metadata files' in eol/, as specified in it's README. 2. In eol/, run downloadImgs.py, which downloads images (possibly multiple per node), @@ -81,7 +114,7 @@ Some of the python scripts require third-party packages: using the `wiki_ids` table, and stores them in a database. 2. In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for those images, using wikipedia's online API. -3. 
In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' +3. In enwiki/, run downloadImgs.py, which downloads 'permissively-licensed' images into enwiki/imgs/. ### Merge the image sets 1. Run reviewImgsToGen.py, which displays images from eol/imgs/ and enwiki/imgs/, @@ -107,15 +140,16 @@ Some of the python scripts require third-party packages: `nodes`, `edges`, and `node_imgs` tables. ## Do some post-processing -1. Run genReducedTreeData.py, which generates a second, reduced version of the tree, - adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from - pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line). -2. Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, - for the sake of performance and result-relevance. Otherwise, some nodes may have - over 10k children, which can take a while to render (over a minute in my testing). - You might want to backup the untrimmed tree first, as this operation is not easily - reversible. -3. Optionally run genEnwikiNameData.py, which adds more entries to the `names` table, +1. Run genEnwikiNameData.py, which adds more entries to the `names` table, using data in enwiki/, and the `names` and `wiki_ids` tables. -4. Optionally run addPickedNames.py, which allows adding manually-selected name data to +2. Optionally run addPickedNames.py, which allows adding manually-selected name data to the `names` table, as specified in pickedNames.txt. + - pickedNames.txt: Has lines of the form `nodeName1|altName1|prefAlt1`. + These correspond to entries in the `names` table. `prefAlt` should be 1 or 0. + A line like `name1|name1|1` causes a node to have no preferred alt-name. +3. Run genReducedTreeData.py, which generates a second, reduced version of the tree, + adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from + pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line). +4. 
Optionally run trimTree.py, which tries to remove some 'low significance' nodes, + for the sake of performance and content-relevance. Otherwise, some nodes may have + over 10k children, which can take a while to render (took over a minute in testing). diff --git a/backend/data/addPickedNames.py b/backend/data/addPickedNames.py index 3ef099a..d56a0cb 100755 --- a/backend/data/addPickedNames.py +++ b/backend/data/addPickedNames.py @@ -3,12 +3,11 @@ import sys import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads alt-name data from a file, and adds it to the 'names' table.\n" -usageInfo += "The file is expected to have lines of the form: nodeName|altName|prefAlt\n" -usageInfo += " These correspond to entries in the 'names' table. 'prefAlt' should\n" -usageInfo += " be 1 or 0. A line may specify name1|name1|1, which causes the node\n" -usageInfo += " to have no preferred alt-name.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads alt-name data from a file, and adds it to the database's 'names' table. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -16,15 +15,21 @@ if len(sys.argv) > 1: dbFile = "data.db" pickedNamesFile = "pickedNames.txt" -# Open db +print("Opening database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Iterate through picked-names file + +print("Iterating through picked-names file") with open(pickedNamesFile) as file: for line in file: # Get record data - (nodeName, altName, prefAlt) = line.lower().rstrip().split("|") + nodeName, altName, prefAlt = line.lower().rstrip().split("|") prefAlt = int(prefAlt) + # Check whether there exists a node with the name + row = dbCur.execute("SELECT name from nodes where name = ?", (nodeName,)).fetchone() + if row == None: + print(f"ERROR: No node with name \"{nodeName}\" exists") + break # Remove any existing preferred-alt status if prefAlt == 1: query = "SELECT name, alt_name FROM names WHERE name = ? 
AND pref_alt = 1" @@ -46,6 +51,7 @@ with open(pickedNamesFile) as file: print(f"Updating record for alt-name {altName} for {nodeName}") dbCur.execute("UPDATE names SET pref_alt = ?, src = 'picked' WHERE name = ? AND alt_name = ?", (prefAlt, nodeName, altName)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/dbpedia/genDescData.py b/backend/data/dbpedia/genDescData.py index bba3ff5..d9e8a80 100755 --- a/backend/data/dbpedia/genDescData.py +++ b/backend/data/dbpedia/genDescData.py @@ -3,25 +3,28 @@ import sys, re import bz2, sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n" -usageInfo += "and creates a sqlite db containing that data.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Adds DBpedia labels/types/abstracts/etc data into a database. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines +labelsFile = "labels_lang=en.ttl.bz2" # Had about 16e6 entries idsFile = "page_lang=en_ids.ttl.bz2" redirectsFile = "redirects_lang=en_transitive.ttl.bz2" disambigFile = "disambiguations_lang=en.ttl.bz2" typesFile = "instance-types_lang=en_specific.ttl.bz2" abstractsFile = "short-abstracts_lang=en.ttl.bz2" dbFile = "descData.db" +# In testing, this script took a few hours to run, and generated about 10GB -# Open db +print("Creating database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Read/store labels + print("Reading/storing label data") dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)") dbCur.execute("CREATE INDEX labels_idx ON labels(label)") @@ -32,16 +35,13 @@ with bz2.open(labelsFile, mode='rt') as file: for line in file: lineNum += 1 if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") + print(f"At line {lineNum}") # match = labelLineRegex.fullmatch(line) if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", 
file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store wiki page ids + raise Exception(f"ERROR: Line {lineNum} has unexpected format") + dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) + print("Reading/storing wiki page ids") dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') @@ -50,20 +50,17 @@ with bz2.open(idsFile, mode='rt') as file: for line in file: lineNum += 1 if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") + print(f"At line {lineNum}") # match = idLineRegex.fullmatch(line) if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - try: - dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) - except sqlite3.IntegrityError as e: - # Accounts for certain lines that have the same IRI - print(f"Failed to add entry with IRI \"{match.group(1)}\": {e}") -dbCon.commit() -# Read/store redirects + raise Exception(f"ERROR: Line {lineNum} has unexpected format") + try: + dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) + except sqlite3.IntegrityError as e: + # Accounts for certain lines that have the same IRI + print(f"WARNING: Failed to add entry with IRI \"{match.group(1)}\": {e}") + print("Reading/storing redirection data") dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)") redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') @@ -72,37 +69,28 @@ with bz2.open(redirectsFile, mode='rt') as file: for line in file: lineNum += 1 if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") + print(f"At line {lineNum}") # match = redirLineRegex.fullmatch(line) if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT 
INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store diambiguation-page data + raise Exception(f"ERROR: Line {lineNum} has unexpected format") + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) + print("Reading/storing diambiguation-page data") -disambigNames = set() +dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") disambigLineRegex = redirLineRegex lineNum = 0 with bz2.open(disambigFile, mode='rt') as file: for line in file: lineNum += 1 if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") + print(f"At line {lineNum}") # match = disambigLineRegex.fullmatch(line) if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - disambigNames.add(match.group(1)) -dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") -for name in disambigNames: - dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,)) -dbCon.commit() -# Read/store instance-type + raise Exception(f"ERROR: Line {lineNum} has unexpected format") + dbCur.execute("INSERT OR IGNORE INTO disambiguations VALUES (?)", (match.group(1),)) + print("Reading/storing instance-type data") dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)") dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)") @@ -112,16 +100,13 @@ with bz2.open(typesFile, mode='rt') as file: for line in file: lineNum += 1 if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") + print(f"At line {lineNum}") # match = typeLineRegex.fullmatch(line) if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store abstracts + raise Exception(f"ERROR: Line {lineNum} has unexpected format") + dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) + print("Reading/storing 
abstracts") dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)") descLineRegex = labelLineRegex @@ -130,17 +115,16 @@ with bz2.open(abstractsFile, mode='rt') as file: for line in file: lineNum += 1 if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") + print(f"At line {lineNum}") # if line[0] == "#": continue match = descLineRegex.fullmatch(line) if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", - (match.group(1), match.group(2).replace(r'\"', '"'))) -# Close db + raise Exception(f"ERROR: Line {lineNum} has unexpected format") + dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", + (match.group(1), match.group(2).replace(r'\"', '"'))) + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 1c16a2e..90d16c7 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -42,7 +42,7 @@ This directory holds files obtained from/using [English Wikipedia](https://en.wi `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br> Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. -- downloadEnwikiImgs.py <br> +- downloadImgs.py <br> Used to download image files into imgs/. 
# Other Files diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 097304b..399922e 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -5,41 +5,48 @@ import sqlite3, urllib.parse, html import requests import time, signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads image names from a file, and uses enwiki's API to obtain\n" -usageInfo += "licensing information for them, adding the info to a sqlite db.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and looks\n" -usageInfo += "at names added to the db to decide what to skip.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads image names from a database, and uses enwiki's online API to obtain +licensing information for them, adding the info to the database. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +at already-processed names to decide what to skip. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "imgData.db" # About 130k image names +imgDb = "imgData.db" apiUrl = "https://en.wikipedia.org/w/api.php" +userAgent = "terryt.dev (terry06890@gmail.com)" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") whitespaceRegex = re.compile(r"\s+") -# Open db +print("Opening database") dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() dbCur2 = dbCon.cursor() -# Create table if it doesn't exist +print("Checking for table") if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None: dbCur.execute("CREATE TABLE imgs(" \ "name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)") -# Get image names + print("Reading image names") imgNames = set() for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"): imgNames.add(imgName) -print(f"Found {len(imgNames)} images") +print(f"Found {len(imgNames)}") + +print("Checking for already-processed images") oldSz = len(imgNames) for (imgName,) in dbCur.execute("SELECT name FROM imgs"): imgNames.discard(imgName) -print(f"Skipping {oldSz - len(imgNames)} already-done images") +print(f"Found {oldSz - len(imgNames)}") + # Set SIGINT handler interrupted = False oldHandler = None @@ -48,7 +55,8 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) -# Iterate through image names, making API requests + +print("Iterating through image names") imgNames = list(imgNames) iterNum = 0 for i in range(0, len(imgNames), batchSz): @@ -63,7 +71,7 @@ for i in range(0, len(imgNames), batchSz): imgBatch = ["File:" + x for x in imgBatch] # Make request headers = { - "user-agent": "terryt.dev (terry06890@gmail.com)", + "user-agent": userAgent, "accept-encoding": "gzip", } params = { @@ -80,16 +88,16 @@ for i in range(0, len(imgNames), batchSz): response = 
requests.get(apiUrl, params=params, headers=headers) responseObj = response.json() except Exception as e: - print(f"Error while downloading info: {e}", file=sys.stderr) - print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr) + print(f"ERROR: Exception while downloading info: {e}") + print(f"\tImage batch: " + "|".join(imgBatch)) continue # Parse response-object if "query" not in responseObj or "pages" not in responseObj["query"]: - print("WARNING: Response object for doesn't have page data", file=sys.stderr) - print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr) + print("WARNING: Response object for doesn't have page data") + print("\tImage batch: " + "|".join(imgBatch)) if "error" in responseObj: errorCode = responseObj["error"]["code"] - print(f"\tError code: {errorCode}", file=sys.stderr) + print(f"\tError code: {errorCode}") if errorCode == "maxlag": time.sleep(5) continue @@ -111,10 +119,10 @@ for i in range(0, len(imgNames), batchSz): title = normalisedToInput[title] title = title[5:] # Remove 'File:' if title not in imgNames: - print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr) + print(f"WARNING: Got title \"{title}\" not in image-name list") continue if "imageinfo" not in page: - print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr) + print(f"WARNING: No imageinfo section for page \"{title}\"") continue metadata = page["imageinfo"][0]["extmetadata"] url = page["imageinfo"][0]["url"] @@ -122,7 +130,7 @@ for i in range(0, len(imgNames), batchSz): artist = metadata['Artist']['value'] if 'Artist' in metadata else None credit = metadata['Credit']['value'] if 'Credit' in metadata else None restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None - # Remove newlines + # Remove markup if artist != None: artist = tagRegex.sub(" ", artist) artist = whitespaceRegex.sub(" ", artist) @@ -134,7 +142,9 @@ for i in range(0, len(imgNames), batchSz): credit = 
html.unescape(credit) credit = urllib.parse.unquote(credit) # Add to db - dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", (title, license, artist, credit, restrictions, url)) -# Close db + dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", + (title, license, artist, credit, restrictions, url)) + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadImgs.py index 2929a0d..8fb605f 100755 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ b/backend/data/enwiki/downloadImgs.py @@ -5,13 +5,16 @@ import sqlite3 import urllib.parse, requests import time, signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an sqlite db,\n" -usageInfo += "into a specified directory.'\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n" -usageInfo += "The program can be re-run to continue downloading, and looks\n" -usageInfo += "in the output directory do decide what to skip.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Downloads images from URLs in an image database, into an output directory, +with names of the form 'pageId1.ext1'. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +in the output directory do decide what to skip. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -19,18 +22,18 @@ if len(sys.argv) > 1: imgDb = "imgData.db" # About 130k image names outDir = "imgs" licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) +# In testing, this downloaded about 100k images, over several days -# Create output directory if not present if not os.path.exists(outDir): os.mkdir(outDir) -# Get existing image names -print("Gettings already-downloaded images") +print("Checking for already-downloaded images") fileList = os.listdir(outDir) pageIdsDone = set() for filename in fileList: (basename, extension) = os.path.splitext(filename) pageIdsDone.add(int(basename)) -print(f"Found {len(pageIdsDone)} already-downloaded images") +print(f"Found {len(pageIdsDone)}") + # Set SIGINT handler interrupted = False oldHandler = None @@ -39,10 +42,10 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) -# Open db + +print("Opening database") dbCon = sqlite3.connect(imgDb) dbCur = dbCon.cursor() -# Start downloads print("Starting downloads") iterNum = 0 query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \ @@ -68,7 +71,7 @@ for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query) urlParts = urllib.parse.urlparse(url) extension = os.path.splitext(urlParts.path)[1] if len(extension) <= 1: - print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) + print(f"WARNING: No filename extension found in URL {url}") sys.exit(1) outFile = f"{outDir}/{pageId}{extension}" headers = { @@ -81,8 +84,8 @@ for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query) file.write(response.content) time.sleep(1) # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec" - # It's unclear how to properly check for cache misses, so just do 
about <=1 per sec + # It's unclear how to properly check for cache misses, so this just aims for 1 per sec except Exception as e: - print(f"Error while downloading to {outFile}: {e}", file=sys.stderr) -# Close db + print(f"Error while downloading to {outFile}: {e}") +print("Closing database") dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py index 032dbed..b0ca272 100755 --- a/backend/data/enwiki/genDescData.py +++ b/backend/data/enwiki/genDescData.py @@ -5,31 +5,36 @@ import bz2 import html, mwxml, mwparserfromhell import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" -usageInfo += "and short-description info to an sqlite db.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads through the wiki dump, and attempts to +parse short-descriptions, and add them to a database. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages enwikiDb = "descData.db" +# In testing, this script took over 10 hours to run, and generated about 5GB -# Some regexps and functions for parsing wikitext descLineRegex = re.compile("^ *[A-Z'\"]") embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$") # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") -parensGrpRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") def convertTemplateReplace(match): if match.group(2) == None: return f"{match.group(1)} {match.group(4)}" else: return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" +parensGroupRegex = re.compile(r" \([^()]*\)") 
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*") + def parseDesc(text): - # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank - # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, + # and then accumulate lines until a blank one. + # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, lines = [] openBraceCount = 0 @@ -74,18 +79,15 @@ def removeMarkup(content): content = embeddedHtmlRegex.sub("", content) content = convertTemplateRegex.sub(convertTemplateReplace, content) content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGrpRegex.sub("", content) + content = parensGroupRegex.sub("", content) content = leftoverBraceRegex.sub("", content) return content -# Other helper functions def convertTitle(title): return html.unescape(title).replace("_", " ") -# Check for existing db +print("Creating database") if os.path.exists(enwikiDb): - print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) - sys.exit(1) -# Create db + raise Exception(f"ERROR: Existing {enwikiDb}") dbCon = sqlite3.connect(enwikiDb) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") @@ -93,8 +95,8 @@ dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Read through dump file -print("Reading dump file") + +print("Iterating through dump file") with bz2.open(dumpFile, mode='rt') as file: dump = mwxml.Dump.from_file(file) pageNum = 0 @@ 
-102,13 +104,15 @@ with bz2.open(dumpFile, mode='rt') as file: pageNum += 1 if pageNum % 1e4 == 0: print(f"At page {pageNum}") + if pageNum > 3e4: + break # Parse page if page.namespace == 0: try: dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) except sqlite3.IntegrityError as e: # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": {e}") + print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr) continue if page.redirect != None: dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) @@ -117,6 +121,7 @@ with bz2.open(dumpFile, mode='rt') as file: desc = parseDesc(revision.text) if desc != None: dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py index ee3e813..3955885 100755 --- a/backend/data/enwiki/genDumpIndexDb.py +++ b/backend/data/enwiki/genDumpIndexDb.py @@ -4,25 +4,26 @@ import sys, os, re import bz2 import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump index file,\n" -usageInfo += "and stores it's offset and title data to an sqlite db.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Adds data from the wiki dump index-file into a database. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines +indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines indexDb = "dumpIndex.db" -# Check for existing db if os.path.exists(indexDb): - print(f"ERROR: Existing {indexDb}", file=sys.stderr) - sys.exit(1) -# Create db + raise Exception(f"ERROR: Existing {indexDb}") +print("Creating database") dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)") -# Reading index file + +print("Iterating through index file") lineRegex = re.compile(r"([^:]+):([^:]+):(.*)") lastOffset = 0 lineNum = 0 @@ -42,7 +43,7 @@ with bz2.open(indexFile, mode='rt') as file: dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset)) except sqlite3.IntegrityError as e: # Accounts for certain entries in the file that have the same title - print(f"Failed on title \"{t}\": {e}") + print(f"Failed on title \"{t}\": {e}", file=sys.stderr) entriesToAdd = [] lastOffset = offset entriesToAdd.append([title, pageId]) @@ -50,7 +51,8 @@ for (title, pageId) in entriesToAdd: try: dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1)) except sqlite3.IntegrityError as e: - print(f"Failed on title \"{t}\": {e}") -# Close db + print(f"Failed on title \"{t}\": {e}", file=sys.stderr) + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py index 9bd28f4..dedfe14 100755 --- a/backend/data/enwiki/genImgData.py +++ b/backend/data/enwiki/genImgData.py @@ -4,9 +4,15 @@ import sys, re import bz2, html, urllib.parse import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" -usageInfo += "trying to get 
infobox image filenames, adding info to an sqlite db.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of page IDs, looks up their content in the wiki dump, +and tries to parse infobox image names, storing them into a database. + +The program can be re-run with an updated set of page IDs, and +will skip already-processed page IDs. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -21,58 +27,64 @@ def getInputPageIds(): return pageIds dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" indexDb = "dumpIndex.db" -imgDb = "imgData.db" # Output db +imgDb = "imgData.db" # The database to create idLineRegex = re.compile(r"<id>(.*)</id>") imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) +# In testing, got about 360k image names -# Open dbs +print("Getting input page-ids") +pageIds = getInputPageIds() +print(f"Found {len(pageIds)}") + +print("Opening databases") indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() -# Create image-db table -pidsDone = set() +print("Checking tables") if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + # Create tables if not present imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") else: + # Check for already-processed page IDs + numSkipped = 0 for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"): - pidsDone.add(pid) - print(f"Will skip {len(pidsDone)} already-processed page-ids") -# Get input pageIds -print("Getting input page-ids", file=sys.stderr) -pageIds = getInputPageIds() -for pid 
in pidsDone: - pageIds.remove(pid) -print(f"Found {len(pageIds)} page-ids to process") -# Get page-id dump-file offsets -print("Getting dump-file offsets", file=sys.stderr) + if pid in pageIds: + pageIds.remove(pid) + numSkipped += 1 + else: + print(f"WARNING: Found already-processed page ID {pid} which was not in input set") + print(f"Will skip {numSkipped} already-processed page IDs") + +print("Getting dump-file offsets") offsetToPageids = {} -offsetToEnd = {} +offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets iterNum = 0 for pageId in pageIds: iterNum += 1 if iterNum % 1e4 == 0: - print(f"At iteration {iterNum}", file=sys.stderr) + print(f"At iteration {iterNum}") # query = "SELECT offset, next_offset FROM offsets WHERE id = ?" row = indexDbCur.execute(query, (pageId,)).fetchone() if row == None: - print(f"WARNING: Page id {pageId} not found", file=sys.stderr) + print(f"WARNING: Page ID {pageId} not found") continue (chunkOffset, endOffset) = row offsetToEnd[chunkOffset] = endOffset if chunkOffset not in offsetToPageids: offsetToPageids[chunkOffset] = [] offsetToPageids[chunkOffset].append(pageId) -print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr) -# Look through dump file, jumping to chunks containing relevant pages -print("Reading through dump file", file=sys.stderr) +print(f"Found {len(offsetToEnd)} chunks to check") + +print("Iterating through chunks in dump file") def getImageName(content): - """ Given an array of text-content lines, returns an image-filename, or None """ + " Given an array of text-content lines, tries to return an infoxbox image name, or None " + # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections for line in content: match = imageLineRegex.match(line) if match != None: @@ -109,16 +121,15 @@ def getImageName(content): imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases) imageName = imageName.replace("_", " ") return imageName 
- # Skip lines like: | image = <imagemap> + # Exclude lines like: | image = <imagemap> return None - # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections return None with open(dumpFile, mode='rb') as file: iterNum = 0 for (pageOffset, endOffset) in offsetToEnd.items(): iterNum += 1 if iterNum % 100 == 0: - print(f"At iteration {iterNum}", file=sys.stderr) + print(f"At iteration {iterNum}") # pageIds = offsetToPageids[pageOffset] # Jump to chunk @@ -168,11 +179,12 @@ with open(dumpFile, mode='rb') as file: imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName)) break if not foundTextEnd: - print(f"Did not find </text> for page id {pageId}", file=sys.stderr) + print(f"WARNING: Did not find </text> for page id {pageId}") break if not foundText: - print(f"Did not find <text> for page id {pageId}", file=sys.stderr) -# Close dbs + print(f"WARNING: Did not find <text> for page id {pageId}") + +print("Closing databases") indexDbCon.close() imgDbCon.commit() imgDbCon.close() diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py index 76f2f95..1a90851 100755 --- a/backend/data/enwiki/lookupPage.py +++ b/backend/data/enwiki/lookupPage.py @@ -4,9 +4,12 @@ import sys, re import bz2 import sqlite3 -usageInfo = f"usage: {sys.argv[0]} title1\n" -usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n" -usageInfo += "using a dump index db, and prints the corresponding <page>.\n" +usageInfo = f""" +Usage: {sys.argv[0]} title1 + +Looks up a page with title title1 in the wiki dump, using +the dump-index db, and prints the corresponding <page>. 
+""" if len(sys.argv) != 2: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -15,20 +18,19 @@ dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" indexDb = "dumpIndex.db" pageTitle = sys.argv[1].replace("_", " ") -# Searching index file -print("Lookup offset in index db") +print("Looking up offset in index db") dbCon = sqlite3.connect(indexDb) dbCur = dbCon.cursor() query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?" row = dbCur.execute(query, (pageTitle,)).fetchone() if row == None: print("Title not found") - sys.exit(1) -(_, pageOffset, endOffset) = row + sys.exit(0) +_, pageOffset, endOffset = row dbCon.close() print(f"Found chunk at offset {pageOffset}") -# Read dump file -print("Reading dump file") + +print("Reading from wiki dump") content = [] with open(dumpFile, mode='rb') as file: # Get uncompressed chunk @@ -61,6 +63,6 @@ with open(dumpFile, mode='rb') as file: if line.lstrip() == "</page>": break lineIdx += 1 -# Print content + print("Content: ") print("\n".join(content)) diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md index fbb008d..8c527a8 100644 --- a/backend/data/eol/README.md +++ b/backend/data/eol/README.md @@ -11,9 +11,10 @@ This directory holds files obtained from/using the [Encyclopedia of Life](https: Contains metadata for images from EOL. - imagesList/ <br> Extracted from imagesList.tgz. +- genImagesListDb.sh <br> + Creates a database, and imports imagesList/*.csv files into it. - imagesList.db <br> - Contains data from imagesList/. - Created by running genImagesListDb.sh, which simply imports csv files into a database. 
<br> + Created by running genImagesListDb.sh <br> Tables: <br> - `images`: `content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT` diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py index ac72ea1..96bc085 100755 --- a/backend/data/eol/downloadImgs.py +++ b/backend/data/eol/downloadImgs.py @@ -7,18 +7,24 @@ import time from threading import Thread import signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Downloads images from URLs specified in an image-list database,\n" -usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n" -usageInfo += "the form 'eolId1 contentId1.ext1'.\n" -usageInfo += "\n" -usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n" -usageInfo += "The program can be re-run to continue downloading. It looks for\n" -usageInfo += "existing downloaded files, and continues after the one with\n" -usageInfo += "highest EOL ID.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +For some set of EOL IDs, downloads associated images from URLs in +an image-list database. Uses multiple downloading threads. + +May obtain multiple images per ID. The images will get names +with the form 'eolId1 contentId1.ext1'. + +SIGINT causes the program to finish ongoing downloads and exit. +The program can be re-run to continue downloading. It looks for +already-downloaded files, and continues after the one with +highest EOL ID. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) +# In testing, this downloaded about 70k images, over a few days imagesListDb = "imagesList.db" def getInputEolIds(): @@ -30,44 +36,29 @@ def getInputEolIds(): dbCon.close() return eolIds outDir = "imgsForReview/" -LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" +MAX_IMGS_PER_ID = 3 +MAX_THREADS = 5 POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread) POST_DL_DELAY_MAX = 3 +LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain" -# Get eol-ids from data db print("Getting input EOL IDs") eolIds = getInputEolIds() -# Get eol-ids from images db -print("Getting images-list-db EOL IDs") +print("Getting EOL IDs to download for") +# Get IDs from images-list db imgDbCon = sqlite3.connect(imagesListDb) imgCur = imgDbCon.cursor() imgListIds = set() -for row in imgCur.execute("SELECT DISTINCT page_id FROM images"): - imgListIds.add(row[0]) -# Get eol-id intersection, and sort into list +for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"): + imgListIds.add(pageId) +# Get set intersection, and sort into list eolIds = eolIds.intersection(imgListIds) eolIds = sorted(eolIds) -print(f"Resulted in {len(eolIds)} EOL IDs") +print(f"Result: {len(eolIds)} EOL IDs") -MAX_IMGS_PER_ID = 3 -MAX_THREADS = 5 -numThreads = 0 -threadException = None # Used for ending main thread after a non-main thread exception -def downloadImg(url, outFile): - global numThreads, threadException - try: - data = requests.get(url) - with open(outFile, 'wb') as file: - file.write(data.content) - time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) - except Exception as e: - print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) - threadException = e - numThreads -= 1 -# Create output directory if not present +print("Checking 
output directory") if not os.path.exists(outDir): os.mkdir(outDir) -# Find next eol ID to download for print("Finding next ID to download for") nextIdx = 0 fileList = os.listdir(outDir) @@ -78,7 +69,11 @@ if len(ids) > 0: if nextIdx == len(eolIds): print("No IDs left. Exiting...") sys.exit(0) -# Detect SIGINT signals + +print("Starting download threads") +numThreads = 0 +threadException = None # Used for ending main thread after a non-main thread exception +# Handle SIGINT signals interrupted = False oldHandler = None def onSigint(sig, frame): @@ -86,33 +81,27 @@ def onSigint(sig, frame): interrupted = True signal.signal(signal.SIGINT, oldHandler) oldHandler = signal.signal(signal.SIGINT, onSigint) +# Function for threads to execute +def downloadImg(url, outFile): + global numThreads, threadException + try: + data = requests.get(url) + with open(outFile, 'wb') as file: + file.write(data.content) + time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN) + except Exception as e: + print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr) + threadException = e + numThreads -= 1 # Manage downloading for idx in range(nextIdx, len(eolIds)): eolId = eolIds[idx] # Get image urls imgDataList = [] ownerSet = set() # Used to get images from different owners, for variety - for row in imgCur.execute( - "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)): - license = row[3] - copyrightOwner = row[4] - if re.fullmatch(LICENSE_REGEX, license) == None: - continue - if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic - continue - if copyrightOwner not in ownerSet: - ownerSet.add(copyrightOwner) - imgDataList.append(row) - if len(ownerSet) == MAX_IMGS_PER_ID: - break - if len(imgDataList) == 0: - continue - # Determine output filenames - outFiles = [] - urls = [] - for row in imgDataList: - contentId = row[0] - url = row[2] + exitLoop 
= False + query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?" + for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)): if url.startswith("data/"): url = "https://content.eol.org/" + url urlParts = urllib.parse.urlparse(url) @@ -120,28 +109,37 @@ for idx in range(nextIdx, len(eolIds)): if len(extension) <= 1: print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr) continue - outFiles.append(str(eolId) + " " + str(contentId) + extension) - urls.append(url) - # Start downloads - exitLoop = False - for i in range(len(outFiles)): - outPath = outDir + outFiles[i] - if not os.path.exists(outPath): - # Enforce thread limit - while numThreads == MAX_THREADS: + # Check image-quantity limit + if len(ownerSet) == MAX_IMGS_PER_ID: + break + # Check for skip conditions + if re.fullmatch(LICENSE_REGEX, license) == None: + continue + if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic + continue + if copyrightOwner in ownerSet: + continue + ownerSet.add(copyrightOwner) + # Determine output filename + outPath = f"{outDir}{eolId} {contentId}{extension}" + if os.path.exists(outPath): + print(f"WARNING: {outPath} already exists. 
Skipping download.") + continue + # Check thread limit + while numThreads == MAX_THREADS: + time.sleep(1) + # Wait for threads after an interrupt or thread-exception + if interrupted or threadException != None: + print("Waiting for existing threads to end") + while numThreads > 0: time.sleep(1) - # Wait for threads after an interrupt or thread-exception - if interrupted or threadException != None: - print("Waiting for existing threads to end") - while numThreads > 0: - time.sleep(1) - exitLoop = True - break - print(f"Downloading image to {outPath}") - # Perform download - numThreads += 1 - thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True) - thread.start() + exitLoop = True + break + # Perform download + print(f"Downloading image to {outPath}") + numThreads += 1 + thread = Thread(target=downloadImg, args=(url, outPath), daemon=True) + thread.start() if exitLoop: break # Close images-list db diff --git a/backend/data/eol/genImagesListDb.sh b/backend/data/eol/genImagesListDb.sh index 3a8ced7..87dd840 100755 --- a/backend/data/eol/genImagesListDb.sh +++ b/backend/data/eol/genImagesListDb.sh @@ -1,7 +1,9 @@ #!/bin/bash set -e +# Combine CSV files into one, skipping header lines cat imagesList/media_*_{1..58}.csv | tail -n +2 > imagesList.csv +# Create database, and import the CSV file sqlite3 imagesList.db <<END CREATE TABLE images ( content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT); diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py index 5290f9e..ecdf7ab 100755 --- a/backend/data/eol/reviewImgs.py +++ b/backend/data/eol/reviewImgs.py @@ -7,11 +7,14 @@ from tkinter import ttk import PIL from PIL import ImageTk, Image, ImageOps -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Provides a GUI for reviewing images. 
Looks in a for-review directory for\n" -usageInfo += "images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to\n" -usageInfo += "choose an image to keep, or reject all. Also provides image rotation.\n" -usageInfo += "Chosen images are placed in another directory, and rejected ones are deleted.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Provides a GUI for reviewing images. Looks in a for-review directory for +images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to +choose an image to keep, or reject all. Also provides image rotation. +Chosen images are placed in another directory, and rejected ones are deleted. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -21,6 +24,7 @@ outDir = "imgs/" extraInfoDbCon = sqlite3.connect("../data.db") extraInfoDbCur = extraInfoDbCon.cursor() def getExtraInfo(eolId): + global extraInfoDbCur query = "SELECT names.alt_name FROM" \ " names INNER JOIN eol_ids ON eol_ids.name = names.name" \ " WHERE id = ? 
and pref_alt = 1" @@ -31,21 +35,21 @@ def getExtraInfo(eolId): return f"Reviewing EOL ID {eolId}" IMG_DISPLAY_SZ = 400 MAX_IMGS_PER_ID = 3 -PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) +IMG_BG_COLOR = (88, 28, 135) +PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR) -# Create output directory if not present +print("Checking output directory") if not os.path.exists(outDir): os.mkdir(outDir) -# Get images for review -print("Reading input image list") +print("Getting input image list") imgList = os.listdir(imgDir) imgList.sort(key=lambda s: int(s.split(" ")[0])) if len(imgList) == 0: - print("No input images found", file=sys.stderr) - sys.exit(1) + print("No input images found") + sys.exit(0) class EolImgReviewer: - """ Provides the GUI for reviewing images """ + " Provides the GUI for reviewing images " def __init__(self, root, imgList): self.root = root root.title("EOL Image Reviewer") @@ -68,7 +72,7 @@ class EolImgReviewer: # Add padding for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) - # Add bindings + # Add keyboard bindings root.bind("<q>", self.quit) root.bind("<Key-j>", lambda evt: self.accept(0)) root.bind("<Key-k>", lambda evt: self.accept(1)) @@ -87,11 +91,11 @@ class EolImgReviewer: self.nextImgNames = [] self.rotations = [] self.getNextImgs() - # For more info + # For displaying extra info self.numReviewed = 0 self.startTime = time.time() def getNextImgs(self): - """ Updates display with new images to review, or ends program """ + " Updates display with new images to review, or ends program " # Gather names of next images to review for i in range(MAX_IMGS_PER_ID): if self.imgListIdx == len(self.imgList): @@ -123,7 +127,7 @@ class EolImgReviewer: del self.nextImgNames[idx] del self.rotations[idx] continue - self.imgs[idx] = self.resizeForDisplay(img) + self.imgs[idx] = self.resizeImgForDisplay(img) else: self.imgs[idx] = PLACEHOLDER_IMG self.photoImgs[idx] 
= ImageTk.PhotoImage(self.imgs[idx]) @@ -140,7 +144,7 @@ class EolImgReviewer: title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})" self.root.title(title) def accept(self, imgIdx): - """ React to a user selecting an image """ + " React to a user selecting an image " if imgIdx >= len(self.nextImgNames): print("Invalid selection") return @@ -159,19 +163,20 @@ class EolImgReviewer: self.numReviewed += 1 self.getNextImgs() def reject(self): - """ React to a user rejecting all images of a set """ + " React to a user rejecting all images of a set " for i in range(len(self.nextImgNames)): os.remove(imgDir + self.nextImgNames[i]) self.numReviewed += 1 self.getNextImgs() def rotate(self, imgIdx, anticlockwise = False): - """ Respond to a user rotating an image """ + " Respond to a user rotating an image " deg = -90 if not anticlockwise else 90 self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg) self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx]) self.labels[imgIdx].config(image=self.photoImgs[imgIdx]) self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360 def quit(self, e = None): + global extraInfoDbCon print(f"Number reviewed: {self.numReviewed}") timeElapsed = time.time() - self.startTime print(f"Time elapsed: {timeElapsed:.2f} seconds") @@ -179,8 +184,8 @@ class EolImgReviewer: print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds") extraInfoDbCon.close() self.root.destroy() - def resizeForDisplay(self, img): - """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """ + def resizeImgForDisplay(self, img): + " Returns a copy of an image, shrunk to fit in it's frame (keeps aspect ratio), and with a background " if max(img.width, img.height) > IMG_DISPLAY_SZ: if (img.width > img.height): newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) @@ -194,6 +199,7 @@ class EolImgReviewer: int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg # Create GUI and defer control 
+print("Starting GUI") root = tki.Tk() EolImgReviewer(root, imgList) root.mainloop() diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py index afe1e17..df3a6be 100755 --- a/backend/data/genDbpData.py +++ b/backend/data/genDbpData.py @@ -3,11 +3,12 @@ import sys, os, re import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads DBpedia data from dbpedia/*, along with tree-of-life\n" -usageInfo += "node and name data from a sqlite database, associates nodes with\n" -usageInfo += "DBpedia IRIs, and adds alt-name and description information for\n" -usageInfo += "those nodes.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads a database containing data from DBpedia, and tries to associate +DBpedia IRIs with nodes in a database, adding short-descriptions for them. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -16,18 +17,21 @@ dbpediaDb = "dbpedia/descData.db" namesToSkipFile = "pickedEnwikiNamesToSkip.txt" pickedLabelsFile = "pickedDbpLabels.txt" dbFile = "data.db" +rootNodeName = "cellular organisms" +rootLabel = "organism" # Will be associated with root node +# Got about 400k descriptions when testing -# Open dbs +print("Opening databases") dbpCon = sqlite3.connect(dbpediaDb) dbpCur = dbpCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Get node names -print("Reading node names") + +print("Getting node names") nodeNames = set() for (name,) in dbCur.execute("SELECT name from nodes"): nodeNames.add(name) -# Skipping certain names + print("Checking for names to skip") oldSz = len(nodeNames) if os.path.exists(namesToSkipFile): @@ -35,22 +39,22 @@ if os.path.exists(namesToSkipFile): for line in file: nodeNames.remove(line.rstrip()) print(f"Skipping {oldSz - len(nodeNames)} nodes") -# Get disambiguation page labels + print("Reading disambiguation-page labels") disambigLabels = set() query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri" for 
(label,) in dbpCur.execute(query): disambigLabels.add(label) -# Try associating nodes with IRIs, accounting for disambiguation labels -print("Trying to associate nodes with labels") + +print("Trying to associate nodes with DBpedia labels") nodeToLabel = {} -nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") -nameToVariants = {} +nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)' +nameToVariants = {} # Maps node names to lists of matching labels iterNum = 0 for (label,) in dbpCur.execute("SELECT label from labels"): iterNum += 1 if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") + print(f"At iteration {iterNum}") # if label in disambigLabels: continue @@ -69,18 +73,20 @@ for (label,) in dbpCur.execute("SELECT label from labels"): nameToVariants[subName] = [label] elif name not in nameToVariants[subName]: nameToVariants[subName].append(label) +# Associate labels without conflicts for (name, variants) in nameToVariants.items(): if len(variants) == 1: nodeToLabel[name] = variants[0] for name in nodeToLabel: del nameToVariants[name] -nodeToLabel["cellular organisms"] = "organism" # Special case for root node -print(f"Number of conflicts: {len(nameToVariants)}") -# Try resolving conflicts +# Special case for root node +nodeToLabel[rootNodeName] = rootLabel +if rootNodeName in nameToVariants: + del nameToVariants["cellular organisms"] + +print("Trying to resolve {len(nameToVariants)} conflicts") def resolveWithPickedLabels(): - # Attempts conflict resolution using a file with lines of the form 'name1|label1', - # where label1 may be absent, indicating that no label should be associated with the name - print("Resolving conflicts using picked-labels") + " Attempts to resolve conflicts using a picked-names file " with open(pickedLabelsFile) as file: for line in file: (name, _, label) = line.rstrip().partition("|") @@ -94,11 +100,13 @@ def resolveWithPickedLabels(): print(f"INFO: Picked label \"{label}\" for name 
\"{name}\" outside choice set", file=sys.stderr) nodeToLabel[name] = label del nameToVariants[name] - print(f"Remaining number of conflicts: {len(nameToVariants)}") def resolveWithCategoryList(): - # Attempts conflict resolution using category-text in labels of the form 'name1 (category1)' - # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape) - print("Resolving conflicts using category-list") + """ + Attempts to resolve conflicts by looking for labels like 'name1 (category1)', + and choosing those with a category1 that seems 'biological'. + Does two passes, using more generic categories first. This helps avoid stuff like + Pan being classified as a horse instead of an ape. + """ generalCategories = { "species", "genus", "plant", "fungus", "animal", @@ -107,7 +115,7 @@ def resolveWithCategoryList(): } specificCategories = { "protist", "alveolate", "dinoflagellates", - "orchid", "Poaceae", "fern", "moss", "alga", + "orchid", "poaceae", "fern", "moss", "alga", "bryozoan", "hydrozoan", "sponge", "cnidarian", "coral", "polychaete", "echinoderm", "bivalve", "gastropod", "chiton", @@ -139,10 +147,8 @@ def resolveWithCategoryList(): break for name in namesToRemove: del nameToVariants[name] - print(f"Remaining number of conflicts: {len(nameToVariants)}") def resolveWithTypeData(): - # Attempts conflict-resolution using dbpedia's instance-type data - print("Resolving conflicts using instance-type data") + " Attempts to resolve conflicts using DBpedia's type data " taxonTypes = { # Obtained from the DBpedia ontology "http://dbpedia.org/ontology/Species", "http://dbpedia.org/ontology/Archaea", @@ -179,7 +185,7 @@ def resolveWithTypeData(): for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"): iterNum += 1 if iterNum % 1e5 == 0: - print(f"Processing line {iterNum}") + print(f"At iteration {iterNum}") # if type in taxonTypes: name = label.lower() @@ -193,20 +199,17 
@@ def resolveWithTypeData(): if name in nameToVariants: nodeToLabel[name] = label del nameToVariants[name] - print(f"Remaining number of conflicts: {len(nameToVariants)}") +#resolveWithTypeData() +#resolveWithCategoryList() resolveWithPickedLabels() -# Associate nodes with IRIs +print(f"Remaining number of conflicts: {len(nameToVariants)}") + print("Getting node IRIs") nodeToIri = {} -iterNum = 0 for (name, label) in nodeToLabel.items(): - row = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone() - if row == None: - print(f"ERROR: Couldn't find label {label}", file=sys.stderr) - sys.exit(1) - else: - nodeToIri[name] = row[0] -# Resolve redirects + (iri,) = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone() + nodeToIri[name] = iri + print("Resolving redirects") redirectingIriSet = set() iterNum = 0 @@ -219,9 +222,10 @@ for (name, iri) in nodeToIri.items(): if row != None: nodeToIri[name] = row[0] redirectingIriSet.add(name) -# Find descriptions, and add to db -print("Adding node description data") + +print("Adding description tables") dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)") +dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)") dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)") iterNum = 0 for (name, iri) in nodeToIri.items(): @@ -232,10 +236,11 @@ for (name, iri) in nodeToIri.items(): query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?" 
row = dbpCur.execute(query, (iri,)).fetchone() if row != None: - (desc, wikiId) = row + desc, wikiId = row dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0)) dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1)) -# Close dbs + +print("Closing databases") dbCon.commit() dbCon.close() dbpCon.commit() diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py index dbc8d6b..d3f93ed 100755 --- a/backend/data/genEnwikiDescData.py +++ b/backend/data/genEnwikiDescData.py @@ -3,10 +3,13 @@ import sys, re, os import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data" -usageInfo += "from a sqlite database, and adds description data for names that\n" -usageInfo += "don't have them.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads a database containing data from Wikipedia, and tries to associate +wiki pages with nodes in the database, and add descriptions for nodes +that don't have them. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -15,36 +18,39 @@ enwikiDb = "enwiki/descData.db" dbFile = "data.db" namesToSkipFile = "pickedEnwikiNamesToSkip.txt" pickedLabelsFile = "pickedEnwikiLabels.txt" +# Got about 25k descriptions when testing -# Open dbs +print("Opening databases") enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Read name/title files + +print("Checking for names to skip") namesToSkip = set() -nameToPickedTitle = {} # Maps names to titles to be used for them if os.path.exists(namesToSkipFile): with open(namesToSkipFile) as file: for line in file: namesToSkip.add(line.rstrip()) - print(f"Read in {len(namesToSkip)} names to skip") + print(f"Found {len(namesToSkip)}") +print("Checking for picked-titles") +nameToPickedTitle = {} if os.path.exists(pickedLabelsFile): with open(pickedLabelsFile) as file: for line in file: (name, _, title) = line.rstrip().partition("|") nameToPickedTitle[name.lower()] = title -print(f"Read in {len(nameToPickedTitle)} titles to use for certain names") -# Get node names without descriptions -print("Getting node names") +print(f"Found {len(nameToPickedTitle)}") + +print("Getting names of nodes without descriptions") nodeNames = set() query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL" -for row in dbCur.execute(query): - nodeNames.add(row[0]) -print(f"Found {len(nodeNames)} names") +for (name,) in dbCur.execute(query): + nodeNames.add(name) +print(f"Found {len(nodeNames)}") nodeNames.difference_update(namesToSkip) -# Find page id for each node name -print("Getting node page-ids") + +print("Associating nodes with page IDs") nodeToPageId = {} iterNum = 0 for name in nodeNames: @@ -63,34 +69,34 @@ for name in nodeNames: nodeToPageId[name] = row[0] else: print("WARNING: Picked title {title} not found", file=sys.stderr) -# Resolve redirects + print("Resolving 
redirects") redirectingNames = set() iterNum = 0 for (name, pageId) in nodeToPageId.items(): iterNum += 1 - if iterNum % 1000 == 0: + if iterNum % 1e3 == 0: print(f"At iteration {iterNum}") # - row = enwikiCur.execute( - "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?", - (pageId,)).fetchone() + query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?" + row = enwikiCur.execute(query, (pageId,)).fetchone() if row != None: nodeToPageId[name] = row[0] redirectingNames.add(name) -# Add descriptions for each node + print("Adding description data") iterNum = 0 for (name, pageId) in nodeToPageId.items(): iterNum += 1 - if iterNum % 1000 == 0: + if iterNum % 1e3 == 0: print(f"At iteration {iterNum}") # row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone() if row != None: dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0)) dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0)) -# Close dbs + +print("Closing databases") dbCon.commit() dbCon.close() enwikiCon.close() diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py index 8285a40..7ad61d1 100755 --- a/backend/data/genEnwikiNameData.py +++ b/backend/data/genEnwikiNameData.py @@ -3,9 +3,13 @@ import sys, re import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads Wikimedia enwiki redirect data from enwiki/, and node and wiki-id\n" -usageInfo += "data from a sqlite database, and adds supplmenentary alt-name data.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads from a database containing data from Wikipdia, along with +node and wiki-id information from the database, and use wikipedia +page-redirect information to add additional alt-name data. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -15,19 +19,19 @@ dbFile = "data.db" altNameRegex = re.compile(r"[a-zA-Z]+") # Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)', -# Open dbs +print("Opening databases") enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Get nodes with wiki-ids + print("Getting nodes with wiki IDs") nodeToWikiId = {} -for row in dbCur.execute("SELECT name, id from wiki_ids"): - nodeToWikiId[row[0]] = row[1] -print(f"Found {len(nodeToWikiId)} nodes") -# Find wiki-ids that redirect to each node -print("Finding redirecter names") +for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"): + nodeToWikiId[nodeName] = wikiId +print(f"Found {len(nodeToWikiId)}") + +print("Iterating through nodes, finding names that redirect to them") nodeToAltNames = {} numAltNames = 0 iterNum = 0 @@ -45,8 +49,8 @@ for (nodeName, wikiId) in nodeToWikiId.items(): nodeToAltNames[nodeName].add(name.lower()) numAltNames += 1 print(f"Found {numAltNames} alt-names") -# Remove existing alt-names -print("Removing existing alt-names") + +print("Excluding existing alt-names from the set") query = "SELECT alt_name FROM names WHERE alt_name IN ({})" iterNum = 0 for (nodeName, altNames) in nodeToAltNames.items(): @@ -60,12 +64,13 @@ for (nodeName, altNames) in nodeToAltNames.items(): numAltNames -= len(existingNames) altNames.difference_update(existingNames) print(f"Left with {numAltNames} alt-names") -# Add alt-names -print("Adding alt-names") + +print("Adding alt-names to database") for (nodeName, altNames) in nodeToAltNames.items(): for altName in altNames: dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0)) -# Close dbs + +print("Closing databases") dbCon.commit() dbCon.close() enwikiCon.close() diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py index 
d852751..dd33ee0 100755 --- a/backend/data/genEolNameData.py +++ b/backend/data/genEolNameData.py @@ -3,34 +3,39 @@ import sys, re, os import html, csv, sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads vernacular-names CSV data (from the Encyclopedia of Life site),\n" -usageInfo += "makes associations with node data in a sqlite database, and writes\n" -usageInfo += "name data to that database.\n" -usageInfo += "\n" -usageInfo += "Expects a CSV header describing lines with format:\n" -usageInfo += " page_id, canonical_form, vernacular_string, language_code,\n" -usageInfo += " resource_name, is_preferred_by_resource, is_preferred_by_eol\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads files describing name data from the 'Encyclopedia of Life' site, +tries to associate names with nodes in the database, and adds tables +to represent associated names. + +Reads a vernacularNames.csv file: + Starts with a header line containing: + page_id, canonical_form, vernacular_string, language_code, + resource_name, is_preferred_by_resource, is_preferred_by_eol + The canonical_form and vernacular_string fields contain names + associated with the page ID. Names are not always unique to + particular page IDs. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -vnamesFile = "eol/vernacularNames.csv" +vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries dbFile = "data.db" -NAMES_TO_SKIP = {"unknown", "unknown species", "unidentified species"} +namesToSkip = {"unknown", "unknown species", "unidentified species"} pickedIdsFile = "pickedEolIds.txt" -badAltsFile = "pickedEolAltsToSkip.txt" +altsToSkipFile = "pickedEolAltsToSkip.txt" -# Read in vernacular-names data - # Note: Canonical-names may have multiple pids - # Note: A canonical-name's associated pids might all have other associated names print("Reading in vernacular-names data") -nameToPids = {} +nameToPids = {} # 'pid' means 'Page ID' canonicalNameToPids = {} pidToNames = {} -pidToPreferred = {} +pidToPreferred = {} # Maps pids to 'preferred' names def updateMaps(name, pid, canonical, preferredAlt): - if name in NAMES_TO_SKIP: + global namesToSkip, nameToPids, canonicalNameToPids, pidToNames, pidToPreferred + if name in namesToSkip: return if name not in nameToPids: nameToPids[name] = {pid} @@ -52,6 +57,9 @@ with open(vnamesFile, newline="") as csvfile: lineNum = 0 for row in reader: lineNum += 1 + if lineNum % 1e5 == 0: + print(f"At line {lineNum}") + # Skip header line if lineNum == 1: continue # Parse line @@ -64,7 +72,7 @@ with open(vnamesFile, newline="") as csvfile: updateMaps(name1, pid, True, False) if lang == "eng" and name2 != "": updateMaps(name2, pid, False, preferred) -# Check for manually-picked pids + print("Checking for manually-picked pids") nameToPickedPid = {} if os.path.exists(pickedIdsFile): @@ -73,64 +81,77 @@ if os.path.exists(pickedIdsFile): (name, _, eolId) = line.rstrip().partition("|") nameToPickedPid[name] = None if eolId == "" else int(eolId) print(f"Found {len(nameToPickedPid)}") -# Read in node-alt_names to avoid -print("Checking for bad-alt-names") -nameToBadAlts = {} -if os.path.exists(badAltsFile): - with open(badAltsFile) as file: + +print("Checking 
for alt-names to skip") +nameToAltsToSkip = {} +numToSkip = 0 +if os.path.exists(altsToSkipFile): + with open(altsToSkipFile) as file: for line in file: (name, _, altName) = line.rstrip().partition("|") - if name not in nameToBadAlts: - nameToBadAlts[name] = [altName] + if name not in nameToAltsToSkip: + nameToAltsToSkip[name] = [altName] else: - nameToBadAlts[name].append(altName) -print(f"Found bad-alts for {len(nameToBadAlts)} nodes") -# Open db connection + nameToAltsToSkip[name].append(altName) + numToSkip += 1 +print(f"Found {numToSkip} alt-names to skip") + +print("Creating database tables") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Create tables dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))") dbCur.execute("CREATE INDEX names_idx ON names(name)") dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)") dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)") dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)") dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)") -# Iterate through 'nodes' table, resolving to canonical-names + +print("Associating nodes with names") usedPids = set() unresolvedNodeNames = set() dbCur2 = dbCon.cursor() def addToDb(nodeName, pidToUse): - altNames = set() - preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None + " Adds page-ID-associated name data to a node in the database " + global dbCur, pidToPreferred dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName)) + # Get alt-names + altNames = set() for n in pidToNames[pidToUse]: + # Avoid alt-names with >3 words if len(n.split(" ")) > 3: continue + # Avoid alt-names that already name a node in the database if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None: continue - if nodeName in nameToBadAlts and n in nameToBadAlts[nodeName]: - print(f"Excluding bad-alt {n} for node 
{nodeName}") + # Check for picked alt-name-to-skip + if nodeName in nameToAltsToSkip and n in nameToAltsToSkip[nodeName]: + print(f"Excluding alt-name {n} for node {nodeName}") continue + # altNames.add(n) + # Add alt-names to db + preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None for n in altNames: isPreferred = 1 if (n == preferredName) else 0 dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred)) -for name in nameToPickedPid: # Add manually-picked pids - pickedPid = nameToPickedPid[name] - usedPids.add(pickedPid) - if pickedPid != None: - addToDb(name, pickedPid) -iterationNum = 0 -for (name,) in dbCur2.execute("SELECT name FROM nodes"): - iterationNum += 1 - if iterationNum % 10000 == 0: - print(f"Loop 1 iteration {iterationNum}") - if name in nameToPickedPid: +print("Adding picked IDs") +for (name, pid) in nameToPickedPid.items(): + if pid != None: + addToDb(name, pid) + usedPids.add(pid) +print("Associating nodes with canonical names") +iterNum = 0 +for (nodeName,) in dbCur2.execute("SELECT name FROM nodes"): + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"At iteration {iterNum}") + if nodeName in nameToPickedPid: continue - # If name matches a canonical-name, add alt-name entries to 'names' table - if name in canonicalNameToPids: + # Check for matching canonical name + if nodeName in canonicalNameToPids: pidToUse = None - for pid in canonicalNameToPids[name]: + # Pick an associated page ID + for pid in canonicalNameToPids[nodeName]: hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred if hasLowerPrio: @@ -138,24 +159,26 @@ for (name,) in dbCur2.execute("SELECT name FROM nodes"): if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio): pidToUse = pid if pidToUse != None: + addToDb(nodeName, pidToUse) usedPids.add(pidToUse) - addToDb(name, pidToUse) - elif name in nameToPids: - 
unresolvedNodeNames.add(name) -# Iterate through unresolved nodes, resolving to vernacular-names -iterationNum = 0 -for name in unresolvedNodeNames: - iterationNum += 1 - if iterationNum % 100 == 0: - print(f"Loop 2 iteration {iterationNum}") - # Add alt-name entries to 'names' table for first corresponding pid + elif nodeName in nameToPids: + unresolvedNodeNames.add(nodeName) +print("Associating leftover nodes with other names") +iterNum = 0 +for nodeName in unresolvedNodeNames: + iterNum += 1 + if iterNum % 100 == 0: + print(f"At iteration {iterNum}") + # Check for matching name pidToUse = None - for pid in nameToPids[name]: + for pid in nameToPids[nodeName]: + # Pick an associated page ID if pid not in usedPids and (pidToUse == None or pid < pidToUse): pidToUse = pid if pidToUse != None: + addToDb(nodeName, pidToUse) usedPids.add(pidToUse) - addToDb(name, pidToUse) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/genImgs.py b/backend/data/genImgs.py index 097959f..ecca8e0 100755 --- a/backend/data/genImgs.py +++ b/backend/data/genImgs.py @@ -4,13 +4,18 @@ import sys, os, subprocess import sqlite3, urllib.parse import signal -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n" -usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n" -usageInfo += "Also adds image metadata to an sqlite database.\n" -usageInfo += "\n" -usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n" -usageInfo += "continue processing. It uses existing output files to decide where to continue from.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads node IDs and image paths from a file, and possibly from a directory, +and generates cropped/resized versions of those images into a directory, +with names of the form 'nodeId1.jpg'. Also adds image metadata to the +database. 
+ +SIGINT can be used to stop, and the program can be re-run to continue +processing. It uses already-existing database entries to decide what +to skip. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -23,19 +28,19 @@ pickedImgsDir = "pickedImgs/" pickedImgsFilename = "imgData.txt" dbFile = "data.db" IMG_OUT_SZ = 200 -genImgFiles = True +genImgFiles = True # Usable for debugging -# Create output directory if not present if not os.path.exists(outDir): os.mkdir(outDir) -# Open dbs + +print("Opening databases") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() eolCon = sqlite3.connect(eolImgDb) eolCur = eolCon.cursor() enwikiCon = sqlite3.connect(enwikiImgDb) enwikiCur = enwikiCon.cursor() -# Get 'picked images' info +print("Checking for picked-images") nodeToPickedImg = {} if os.path.exists(pickedImgsDir + pickedImgsFilename): lineNum = 0 @@ -49,29 +54,34 @@ if os.path.exists(pickedImgsDir + pickedImgsFilename): "nodeName": nodeName, "id": lineNum, "filename": filename, "url": url, "license": license, "artist": artist, "credit": credit, } -# Create image tables if not present + +print("Checking for image tables") nodesDone = set() imgsDone = set() if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None: + # Add image tables if not present dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)") dbCur.execute("CREATE TABLE images" \ " (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))") else: - # Get existing node-associations + # Get existing image-associated nodes for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"): nodesDone.add(otolId) - # And images + # Get existing node-associated images for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"): imgsDone.add((imgId, imgSrc)) - print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images 
pre-existing") -# Detect SIGINT signals + print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip") + +# Set SIGINT handler interrupted = False def onSigint(sig, frame): global interrupted interrupted = True signal.signal(signal.SIGINT, onSigint) -# Iterate though images to process + +print("Iterating through input images") def quit(): + print("Closing databases") dbCon.commit() dbCon.close() eolCon.close() @@ -94,7 +104,7 @@ def convertImage(imgPath, outPath): print(f"ERROR: smartcrop had exit status {completedProcess.returncode}") return False return True -print("Processing picked images") +print("Processing picked-images") for (otolId, imgData) in nodeToPickedImg.items(): # Check for SIGINT event if interrupted: @@ -105,7 +115,8 @@ for (otolId, imgData) in nodeToPickedImg.items(): continue # Convert image if genImgFiles: - if not convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg"): + success = convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg") + if not success: quit() else: print(f"Processing {imgData['nodeName']}: {otolId}.jpg") @@ -135,7 +146,8 @@ with open(imgListFile) as file: continue # Convert image if genImgFiles: - if not convertImage(imgPath, outDir + otolId + ".jpg"): + success = convertImage(imgPath, outDir + otolId + ".jpg") + if not success: break else: if iterNum % 1e4 == 0: @@ -146,13 +158,13 @@ with open(imgListFile) as file: imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component imgName = os.path.splitext(imgName)[0] # Remove extension if fromEol: - (eolId, _, contentId) = imgName.partition(" ") - (eolId, contentId) = (int(eolId), int(contentId)) + eolId, _, contentId = imgName.partition(" ") + eolId, contentId = (int(eolId), int(contentId)) if (eolId, "eol") not in imgsDone: query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?" 
row = eolCur.execute(query, (contentId,)).fetchone() if row == None: - print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr) + print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}") break (url, license, owner) = row dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)", @@ -167,7 +179,7 @@ with open(imgListFile) as file: " WHERE page_imgs.page_id = ?" row = enwikiCur.execute(query, (enwikiId,)).fetchone() if row == None: - print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr) + print(f"ERROR: No image record for enwiki ID {enwikiId}") break (name, license, artist, credit) = row url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name) diff --git a/backend/data/genLinkedImgs.py b/backend/data/genLinkedImgs.py index 9fe07a2..a8e1322 100755 --- a/backend/data/genLinkedImgs.py +++ b/backend/data/genLinkedImgs.py @@ -3,9 +3,12 @@ import sys, re import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Adds a table to data.db, associating nodes without images to\n" -usageInfo += "usable child images.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Look for nodes without images in the database, and tries to +associate them with images from their children. 
+""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -14,24 +17,22 @@ dbFile = "data.db" compoundNameRegex = re.compile(r"\[(.+) \+ (.+)]") upPropagateCompoundImgs = False -# Open db +print("Opening databases") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)") - # Associates a node with one (or two) otol-ids with usable images, - # encoded as 'otolId1' or 'otolId1,otolId2' -# Get nodes with images + print("Getting nodes with images") resolvedNodes = {} # Will map node names to otol IDs with a usable image query = "SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name" for (name, otolId) in dbCur.execute(query): resolvedNodes[name] = otolId -print(f"Got {len(resolvedNodes)} nodes") -# Iterate through resolved nodes, resolving ancestors where able -print("Resolving ancestor nodes") -nodesToResolve = {} -processedNodes = {} -parentToChosenTips = {} +print(f"Found {len(resolvedNodes)}") + +print("Iterating through nodes, trying to resolve images for ancestors") +nodesToResolve = {} # Maps a node name to a list of objects that represent possible child images +processedNodes = {} # Map a node name to an OTOL ID, representing a child node whose image is to be used +parentToChosenTips = {} # used to prefer images from children with more tips iterNum = 0 while len(resolvedNodes) > 0: iterNum += 1 @@ -43,13 +44,13 @@ while len(resolvedNodes) > 0: # Traverse upwards, resolving ancestors if able while True: # Get parent - row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (nodeName,)).fetchone() + row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (nodeName,)).fetchone() if row == None or row[0] in processedNodes or row[0] in resolvedNodes: break parent = row[0] # Get parent data if parent not in nodesToResolve: - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", 
(parent,))] + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (parent,))] query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(childNames))) childObjs = [{"name": row[0], "tips": row[1], "otolId": None} for row in dbCur.execute(query, childNames)] childObjs.sort(key=lambda x: x["tips"], reverse=True) @@ -66,7 +67,7 @@ while len(resolvedNodes) > 0: nodeName = parent continue else: - # Add potential otol-id + # Mark child as a potential choice childObj = next(c for c in childObjs if c["name"] == nodeName) childObj["otolId"] = otolId break @@ -78,8 +79,8 @@ while len(resolvedNodes) > 0: parentToChosenTips[name] = childObj["tips"] dbCur.execute("INSERT INTO linked_imgs VALUES (?, ?)", (name, childObj["otolId"])) nodesToResolve.clear() -# Iterate through processed nodes with compound names -print("Replacing images for compound-name nodes") + +print("Replacing linked-images for compound nodes") iterNum = 0 for nodeName in processedNodes.keys(): iterNum += 1 @@ -106,7 +107,7 @@ for nodeName in processedNodes.keys(): if upPropagateCompoundImgs: while True: # Get parent - row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (nodeName,)).fetchone() + row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (nodeName,)).fetchone() if row != None: parent = row[0] # Check num tips @@ -118,6 +119,7 @@ for nodeName in processedNodes.keys(): nodeName = parent continue break -# Close db + +print("Closing databases") dbCon.commit() dbCon.close() diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py index 87b35c3..36b6197 100755 --- a/backend/data/genOtolData.py +++ b/backend/data/genOtolData.py @@ -3,29 +3,33 @@ import sys, re, os import json, sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release),\n" -usageInfo += "and creates a sqlite database, which holds entries of the form 
(name text, data text).\n" -usageInfo += "Each row holds a tree-of-life node's name, JSON-encoded child name array, a parent name or '',\n" -usageInfo += "number of descendant 'tips', and a 1 or 0 indicating phylogenetic-support.\n" -usageInfo += "\n" -usageInfo += "Expected labelled_supertree_ottnames.tre format:\n" -usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n" -usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n" -usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n" -usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n" -usageInfo += "Expected annotations.json format:\n" -usageInfo += " JSON object holding information about the tree-of-life release.\n" -usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n" -usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n" -usageInfo += "\n" -usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n" -usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Reads files describing a tree-of-life from an 'Open Tree of Life' release, +and stores tree information in a database. + +Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: + The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 + The root node is named n6, and has children n1, n2, and n5. + Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', + 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. + The node with ID 'ott770315' will get the name 'homo sapiens'. 
+ A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). + It is possible for multiple nodes to have the same name. + In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. +Reads an annotations.json file, which is assumed to have this format: + Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, + such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that + support/conflict with the node's placement. +Reads from a picked-names file, if present, which specifies name and node ID pairs. + These help resolve cases where multiple nodes share the same name. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -treeFile = "otol/labelled_supertree_ottnames.tre" +treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes annFile = "otol/annotations.json" dbFile = "data.db" nodeMap = {} # Maps node IDs to node objects @@ -33,19 +37,32 @@ nameToFirstId = {} # Maps node names to first found ID (names might have multipl dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs pickedNamesFile = "pickedOtolNames.txt" -# Parse treeFile +class Node: + " Represents a tree-of-life node " + def __init__(self, name, childIds, parentId, tips, pSupport): + self.name = name + self.childIds = childIds + self.parentId = parentId + self.tips = tips + self.pSupport = pSupport + print("Parsing tree file") +# Read file data = None with open(treeFile) as file: data = file.read() dataIdx = 0 +# Parse content +iterNum = 0 def parseNewick(): - """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None""" - global dataIdx + " Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID " + global data, dataIdx, iterNum + iterNum += 1 + if iterNum % 1e5 == 0: + print(f"At iteration {iterNum}") # Check for EOF if 
dataIdx == len(data): - print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr) - return None + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") # Check for node if data[dataIdx] == "(": # parse inner node dataIdx += 1 @@ -53,12 +70,9 @@ def parseNewick(): while True: # Read child childId = parseNewick() - if childId == None: - return None childIds.append(childId) if (dataIdx == len(data)): - print("ERROR: Unexpected EOF", file=sys.stderr) - return None + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") # Check for next child if (data[dataIdx] == ","): dataIdx += 1 @@ -66,33 +80,25 @@ def parseNewick(): else: # Get node name and id dataIdx += 1 # Consume an expected ')' - [name, id] = parseNewickName() + name, id = parseNewickName() updateNameMaps(name, id) # Get child num-tips total tips = 0 for childId in childIds: - tips += nodeMap[childId]["tips"] + tips += nodeMap[childId].tips # Add node to nodeMap - nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False} + nodeMap[id] = Node(name, childIds, None, tips, False) # Update childrens' parent reference for childId in childIds: - nodeMap[childId]["parent"] = id + nodeMap[childId].parentId = id return id else: # Parse node name - [name, id] = parseNewickName() + name, id = parseNewickName() updateNameMaps(name, id) - nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False} + nodeMap[id] = Node(name, [], None, 1, False) return id -def updateNameMaps(name, id): - if name not in nameToFirstId: - nameToFirstId[name] = id - else: - if name not in dupNameToIds: - dupNameToIds[name] = [nameToFirstId[name], id] - else: - dupNameToIds[name].append(id) def parseNewickName(): - """Helper that parses an input node name, and returns a [name,id] pair""" + " Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair " global data, dataIdx name = None end = dataIdx @@ -102,7 +108,7 @@ def 
parseNewickName(): inQuote = True while end < len(data): if (data[end] == "'"): - if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote + if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote end += 2 continue else: @@ -111,75 +117,86 @@ def parseNewickName(): break end += 1 if inQuote: - raise Exception("ERROR: Unexpected EOF") + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") name = data[dataIdx:end] dataIdx = end else: while end < len(data) and not re.match(r"[(),]", data[end]): end += 1 if (end == dataIdx): - raise Exception("ERROR: Unexpected EOF") + raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}") name = data[dataIdx:end].rstrip() if end == len(data): # Ignore trailing input semicolon name = name[:-1] dataIdx = end - # Convert to [name, id] + # Convert to (name, id) name = name.lower() if name.startswith("mrca"): - return [name, name] + return (name, name) elif name[0] == "'": match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name) if match == None: raise Exception(f"ERROR: invalid name \"{name}\"") name = match.group(1).replace("''", "'") - return [name, match.group(2)] + return (name, match.group(2)) else: match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name) if match == None: raise Exception(f"ERROR: invalid name \"{name}\"") - return [match.group(1).replace("_", " "), match.group(2)] + return (match.group(1).replace("_", " "), match.group(2)) +def updateNameMaps(name, id): + global nameToFirstId, dupNameToIds + if name not in nameToFirstId: + nameToFirstId[name] = id + else: + if name not in dupNameToIds: + dupNameToIds[name] = [nameToFirstId[name], id] + else: + dupNameToIds[name].append(id) rootId = parseNewick() -# Resolve duplicate names -print("Resolving duplicates") + +print("Resolving duplicate names") +# Read picked-names file nameToPickedId = {} if os.path.exists(pickedNamesFile): with open(pickedNamesFile) as file: for line in file: (name, _, otolId) = 
line.rstrip().partition("|") nameToPickedId[name] = otolId -for [dupName, ids] in dupNameToIds.items(): +# Resolve duplicates +for (dupName, ids) in dupNameToIds.items(): # Check for picked id if dupName in nameToPickedId: idToUse = nameToPickedId[dupName] else: # Get conflicting node with most tips - tipNums = [nodeMap[id]["tips"] for id in ids] + tipNums = [nodeMap[id].tips for id in ids] maxIdx = tipNums.index(max(tipNums)) idToUse = ids[maxIdx] # Adjust name of other conflicting nodes counter = 2 for id in ids: if id != idToUse: - nodeMap[id]["name"] += " [" + str(counter)+ "]" + nodeMap[id].name += f" [{counter}]" counter += 1 -# Change mrca* names + print("Changing mrca* names") def convertMrcaName(id): node = nodeMap[id] - name = node["name"] - childIds = node["children"] + name = node.name + childIds = node.childIds if len(childIds) < 2: - print(f"WARNING: MRCA node \"{name}\" has less than 2 children", file=sys.stderr) + print(f"WARNING: MRCA node \"{name}\" has less than 2 children") return # Get 2 children with most tips - childTips = [nodeMap[id]["tips"] for id in childIds] - maxIdx = childTips.index(max(childTips)) - childTips[maxIdx] = 0 + childTips = [nodeMap[id].tips for id in childIds] + maxIdx1 = childTips.index(max(childTips)) + childTips[maxIdx1] = 0 maxIdx2 = childTips.index(max(childTips)) - childId1 = childIds[maxIdx] + childId1 = childIds[maxIdx1] childId2 = childIds[maxIdx2] - childName1 = nodeMap[childId1]["name"] - childName2 = nodeMap[childId2]["name"] + childName1 = nodeMap[childId1].name + childName2 = nodeMap[childId2].name # Check for mrca* child names if childName1.startswith("mrca"): childName1 = convertMrcaName(childId1) @@ -193,44 +210,44 @@ def convertMrcaName(id): if match != None: childName2 = match.group(1) # Create composite name - node["name"] = f"[{childName1} + {childName2}]" + node.name = f"[{childName1} + {childName2}]" return childName1 -for [id, node] in nodeMap.items(): - if node["name"].startswith("mrca"): +for (id, 
node) in nodeMap.items(): + if node.name.startswith("mrca"): convertMrcaName(id) -# Parse annFile + print("Parsing annotations file") +# Read file data = None with open(annFile) as file: data = file.read() obj = json.loads(data) -nodeAnnsMap = obj['nodes'] -# Add annotations data -print("Adding annotation data") -for [id, node] in nodeMap.items(): +nodeAnnsMap = obj["nodes"] +# Find relevant annotations +for (id, node) in nodeMap.items(): # Set has-support value using annotations if id in nodeAnnsMap: nodeAnns = nodeAnnsMap[id] supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0 conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0 - node["pSupport"] = supportQty > 0 and conflictQty == 0 + node.pSupport = supportQty > 0 and conflictQty == 0 # Root node gets support - if node["parent"] == None: - node["pSupport"] = True -# Create db + if node.parentId == None: + node.pSupport = True + print("Creating nodes and edges tables") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") +dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)") for (otolId, node) in nodeMap.items(): - dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node["name"], otolId, node["tips"])) - childIds = node["children"] - for childId in childIds: + dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips)) + for childId in node.childIds: childNode = nodeMap[childId] dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)", - (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0)) + (node.name, childNode.name, 1 if childNode.pSupport 
else 0)) +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py index b475794..2e56bba 100755 --- a/backend/data/genReducedTreeData.py +++ b/backend/data/genReducedTreeData.py @@ -3,123 +3,131 @@ import sys, os.path, re import json, sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads \n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Creates a reduced version of the tree in the database. +Reads a subset of the node names from a file, and creates a +minimal tree that contains them, possibly with a few extras. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) dbFile = "data.db" -nodeNamesFile = "reducedTreeNodes.txt" +nodeNamesFile = "pickedReducedNodes.txt" minimalNames = set() nodeMap = {} # Maps node names to node objects PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit -compNameRegex = re.compile(r"\[.+ \+ .+]") +compNameRegex = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes + +class Node: + " Represents a node from the database " + def __init__(self, id, children, parent, tips, pSupport): + self.id = id + self.children = children + self.parent = parent + self.tips = tips + self.pSupport = pSupport -# Connect to db +print("Opening database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Read in minimal set of node names + print("Getting minimal name set") iterNum = 0 with open(nodeNamesFile) as file: for line in file: iterNum += 1 if iterNum % 100 == 0: - print(f"Iteration {iterNum}") + print(f"At iteration {iterNum}") # - row = dbCur.execute("SELECT name from nodes WHERE name = ?", (line.rstrip(),)).fetchone() + name = line.rstrip() + row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone() if row == None: - row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone() + row = dbCur.execute("SELECT name from names WHERE alt_name = ?", 
(name,)).fetchone() if row != None: minimalNames.add(row[0]) if len(minimalNames) == 0: - print("ERROR: No names found", file=sys.stderr) - sys.exit(1) -print(f"Name set has {len(minimalNames)} names") -# Add nodes that connect up to root -print("Getting connected nodes set") -iterNum = 0 + print("No names found") + sys.exit(0) +print(f"Result has {len(minimalNames)} names") + +print("Getting ancestor nodes") rootName = None +iterNum = 0 for name in minimalNames: iterNum += 1 if iterNum % 100 == 0: - print(f"Iteration {iterNum}") + print(f"At iteration {iterNum}") # prevName = None while name != None: if name not in nodeMap: (id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone() - row = dbCur.execute("SELECT node, p_support from edges where child = ?", (name,)).fetchone() + row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone() parent = None if row == None or row[0] == "" else row[0] - pSupport = 1 if row == None or row[1] == 1 else 0 - nodeMap[name] = { - "id": id, - "children": [] if prevName == None else [prevName], - "parent": parent, - "tips": 0, - "pSupport": pSupport, - } + pSupport = row == None or row[1] == 1 + children = [] if prevName == None else [prevName] + nodeMap[name] = Node(id, children, parent, 0, pSupport) prevName = name name = parent else: if prevName != None: - nodeMap[name]["children"].append(prevName) + nodeMap[name].children.append(prevName) break if name == None: rootName = prevName -print(f"New node set has {len(nodeMap)} nodes") -# Merge-upward compsite-named nodes -print("Merging-upward composite-named nodes") +print(f"Result has {len(nodeMap)} nodes") + +print("Merging-upward composite nodes") namesToRemove = set() -for (name, nodeObj) in nodeMap.items(): - parent = nodeObj["parent"] +for (name, node) in nodeMap.items(): + parent = node.parent if parent != None and compNameRegex.fullmatch(name) != None: # Connect children to parent - 
nodeMap[parent]["children"].remove(name) - nodeMap[parent]["children"].extend(nodeObj["children"]) - for n in nodeObj["children"]: - nodeMap[n]["parent"] = parent - nodeMap[n]["pSupport"] &= nodeObj["pSupport"] + nodeMap[parent].children.remove(name) + nodeMap[parent].children.extend(node.children) + for n in node.children: + nodeMap[n].parent = parent + nodeMap[n].pSupport &= node.pSupport # Remember for removal namesToRemove.add(name) for name in namesToRemove: del nodeMap[name] -print(f"New node set has {len(nodeMap)} nodes") -# Remove certain 'chain collapsible' nodes +print(f"Result has {len(nodeMap)} nodes") + print("Removing 'chain collapsible' nodes") namesToRemove2 = set() -for (name, nodeObj) in nodeMap.items(): - hasOneChild = len(nodeObj["children"]) == 1 - isOnlyChild = nodeObj["parent"] != None and len(nodeMap[nodeObj["parent"]]["children"]) == 1 +for (name, node) in nodeMap.items(): + hasOneChild = len(node.children) == 1 + isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1 if name not in minimalNames and (hasOneChild or isOnlyChild): - parentName = nodeObj["parent"] - children = nodeObj["children"] + parent = node.parent # Connect parent and children - nodeMap[parentName]["children"].remove(name) - nodeMap[parentName]["children"].extend(children) - for n in children: - nodeMap[n]["parent"] = parentName - # Adjust child pSupport - nodeMap[n]["pSupport"] &= nodeObj["pSupport"] + nodeMap[parent].children.remove(name) + nodeMap[parent].children.extend(node.children) + for n in node.children: + nodeMap[n].parent = parent + nodeMap[n].pSupport &= node.pSupport # Remember for removal namesToRemove2.add(name) for name in namesToRemove2: del nodeMap[name] namesToRemove.add(name) -print(f"New node set has {len(nodeMap)} nodes") -# Add some connected children -print("Adding additional nearby children") +print(f"Result has {len(nodeMap)} nodes") + +print("Adding some additional nearby children") namesToAdd = [] iterNum = 0 -for (name, 
nodeObj) in nodeMap.items(): +for (name, node) in nodeMap.items(): iterNum += 1 if iterNum % 100 == 0: - print(f"Iteration {iterNum}") + print(f"At iteration {iterNum}") # - numChildren = len(nodeObj["children"]) + numChildren = len(node.children) if numChildren < PREF_NUM_CHILDREN: - children = [row[0] for row in dbCur.execute("SELECT child FROM edges where node = ?", (name,))] + children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))] newChildren = [] for n in children: if n in nodeMap or n in namesToRemove: @@ -132,43 +140,38 @@ for (name, nodeObj) in nodeMap.items(): continue newChildren.append(n) newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)] - nodeObj["children"].extend(newChildNames) + node.children.extend(newChildNames) namesToAdd.extend(newChildNames) for name in namesToAdd: - (parent, pSupport) = dbCur.execute("SELECT node, p_support from edges WHERE child = ?", (name,)).fetchone() + parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone() (id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone() parent = None if parent == "" else parent - nodeMap[name] = { - "id": id, - "children": [], - "parent": parent, - "tips": 0, - "pSupport": pSupport == 1, - } -print(f"New node set has {len(nodeMap)} nodes") -# set tips vals -print("Setting tips vals") + nodeMap[name] = Node(id, [], parent, 0, pSupport == 1) +print(f"Result has {len(nodeMap)} nodes") + +print("Setting 'tips' values") def setTips(nodeName): - nodeObj = nodeMap[nodeName] - if len(nodeObj["children"]) == 0: - nodeObj["tips"] = 1 + node = nodeMap[nodeName] + if len(node.children) == 0: + node.tips = 1 return 1 - tips = sum([setTips(childName) for childName in nodeObj["children"]]) - nodeObj["tips"] = tips + tips = sum([setTips(childName) for childName in node.children]) + node.tips = tips return tips setTips(rootName) -# Add new nodes to db -print("Adding to db") + 
+print("Adding reduced tree to database") dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)") dbCur.execute("CREATE INDEX r_nodes_idx_nc ON r_nodes(name COLLATE NOCASE)") -dbCur.execute("CREATE TABLE r_edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))") +dbCur.execute("CREATE TABLE r_edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))") dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)") -for (name, nodeObj) in nodeMap.items(): - parentName = "" if nodeObj["parent"] == None else nodeObj["parent"] - dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, nodeObj["id"], nodeObj["tips"])) - for childName in nodeObj["children"]: - pSupport = 1 if nodeMap[childName]["pSupport"] else 0 +for (name, node) in nodeMap.items(): + parentName = "" if node.parent == None else node.parent + dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, node.id, node.tips)) + for childName in node.children: + pSupport = 1 if nodeMap[childName].pSupport else 0 dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/data/reviewImgsToGen.py b/backend/data/reviewImgsToGen.py index 4d970ba..de592f5 100755 --- a/backend/data/reviewImgsToGen.py +++ b/backend/data/reviewImgsToGen.py @@ -7,15 +7,18 @@ from tkinter import ttk import PIL from PIL import ImageTk, Image, ImageOps -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Provides a GUI that displays, for each tol-node, an associated image from\n" -usageInfo += "eol/* and enwiki/*, and enables the user to choose which to use. 
Writes\n" -usageInfo += "choice data to a text file with lines of the form 'otolId1 imgPath1', or\n" -usageInfo += "'otolId1', where no path indicates a choice of no image.\n" -usageInfo += "\n" -usageInfo += "The program can be closed, and run again to continue from the last choice.\n" -usageInfo += "The program looks for an existing output file to determine what choices\n" -usageInfo += "have already been made.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Provides a GUI that displays, for each node in the database, associated +images from EOL and Wikipedia, and allows choosing which to use. Writes +choice data to a text file with lines of the form 'otolId1 imgPath1', or +'otolId1', where no path indicates a choice of no image. + +The program can be closed, and run again to continue from the last choice. +The program looks for an existing output file to determine what choices +have already been made. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) @@ -28,16 +31,18 @@ IMG_DISPLAY_SZ = 400 PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135)) onlyReviewPairs = True -# Open db +print("Opening database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Associate nodes with images -nodeToImgs = {} # Maps otol-ids to img-path arrays -print("Looking through EOL images") + +nodeToImgs = {} # Maps otol-ids to arrays of image paths +print("Iterating through images from EOL") if os.path.exists(eolImgDir): for filename in os.listdir(eolImgDir): - (eolId, _, _) = filename.partition(" ") + # Get associated EOL ID + eolId, _, _ = filename.partition(" ") query = "SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name WHERE eol_ids.id = ?" 
+ # Get associated node IDs found = False for (otolId,) in dbCur.execute(query, (int(eolId),)): if otolId not in nodeToImgs: @@ -45,13 +50,15 @@ if os.path.exists(eolImgDir): nodeToImgs[otolId].append(eolImgDir + filename) found = True if not found: - print(f"No node found for {eolImgDir}{filename}", file=sys.stderr) -print(f"Result has {len(nodeToImgs)} node entries") -print("Looking through enwiki images") + print(f"WARNING: No node found for {eolImgDir}{filename}") +print(f"Result: {len(nodeToImgs)} nodes with images") +print("Iterating through images from Wikipedia") if os.path.exists(enwikiImgDir): for filename in os.listdir(enwikiImgDir): + # Get associated page ID (wikiId, _, _) = filename.partition(".") - query = "SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids._id = ?" + # Get associated node IDs + query = "SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id = ?" found = False for (otolId,) in dbCur.execute(query, (int(wikiId),)): if otolId not in nodeToImgs: @@ -59,10 +66,9 @@ if os.path.exists(enwikiImgDir): nodeToImgs[otolId].append(enwikiImgDir + filename) found = True if not found: - print(f"No node found for {enwikiImgDir}{filename}", file=sys.stderr) -print(f"Result has {len(nodeToImgs)} node entries") -# Check for already-made choices -print("Filtering out already-chosen IDs") + print(f"WARNING: No node found for {enwikiImgDir}{filename}") +print(f"Result: {len(nodeToImgs)} nodes with images") +print("Filtering out already-made image choices") oldSz = len(nodeToImgs) if os.path.exists(outFile): with open(outFile) as file: @@ -74,7 +80,7 @@ if os.path.exists(outFile): print(f"Filtered out {oldSz - len(nodeToImgs)} entries") class ImgReviewer: - """ Provides the GUI for reviewing images """ + " Provides the GUI for reviewing images " def __init__(self, root, nodeToImgs): self.root = root root.title("Image Reviewer") @@ -96,7 +102,7 @@ class ImgReviewer: # Add 
padding for child in mainFrame.winfo_children(): child.grid_configure(padx=5, pady=5) - # Add bindings + # Add keyboard bindings root.bind("<q>", self.quit) root.bind("<Key-j>", lambda evt: self.accept(0)) root.bind("<Key-k>", lambda evt: self.accept(1)) @@ -112,7 +118,7 @@ class ImgReviewer: # Initialise images to review self.getNextImgs() def getNextImgs(self): - """ Updates display with new images to review, or ends program """ + " Updates display with new images to review, or ends program " # Get next image paths while True: self.listIdx += 1 @@ -120,7 +126,7 @@ class ImgReviewer: print("No more images to review. Exiting program.") self.quit() return - (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx] + self.otolId, imgPaths = self.nodeImgsList[self.listIdx] # Potentially skip user choice if onlyReviewPairs and len(imgPaths) == 1: with open(outFile, 'a') as file: @@ -141,12 +147,12 @@ class ImgReviewer: continue if imgPath.startswith("eol/"): self.eolImgPath = imgPath - self.eolImg = ImageTk.PhotoImage(self.resizeForDisplay(img)) + self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(img)) elif imgPath.startswith("enwiki/"): self.enwikiImgPath = imgPath - self.enwikiImg = ImageTk.PhotoImage(self.resizeForDisplay(img)) + self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(img)) else: - print(f"Unexpected image path {imgPath}", file=sys.stderr) + print(f"Unexpected image path {imgPath}") self.quit() return # Re-iterate if all image paths invalid @@ -157,14 +163,14 @@ class ImgReviewer: return # Add placeholder images if self.eolImgPath == None: - self.eolImg = ImageTk.PhotoImage(self.resizeForDisplay(PLACEHOLDER_IMG)) + self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG)) elif self.enwikiImgPath == None: - self.enwikiImg = ImageTk.PhotoImage(self.resizeForDisplay(PLACEHOLDER_IMG)) + self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG)) # Update image-frames 
self.labels[0].config(image=self.eolImg) self.labels[1].config(image=self.enwikiImg) # Update title - title = f"Imgs for otol ID {self.otolId}" + title = f"Images for otol ID {self.otolId}" query = "SELECT names.alt_name FROM" \ " nodes INNER JOIN names ON nodes.name = names.name" \ " WHERE nodes.id = ? and pref_alt = 1" @@ -174,7 +180,7 @@ class ImgReviewer: title += f" ({self.listIdx + 1} out of {len(self.nodeImgsList)})" self.root.title(title) def accept(self, imgIdx): - """ React to a user selecting an image """ + " React to a user selecting an image " imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath if imgPath == None: print("Invalid selection") @@ -184,12 +190,13 @@ class ImgReviewer: self.numReviewed += 1 self.getNextImgs() def reject(self): - """ React to a user rejecting all images of a set """ + " React to a user rejecting all images of a set " with open(outFile, 'a') as file: file.write(f"{self.otolId}\n") self.numReviewed += 1 self.getNextImgs() def quit(self, e = None): + global dbCon print(f"Number reviewed: {self.numReviewed}") timeElapsed = time.time() - self.startTime print(f"Time elapsed: {timeElapsed:.2f} seconds") @@ -197,8 +204,8 @@ class ImgReviewer: print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds") dbCon.close() self.root.destroy() - def resizeForDisplay(self, img): - """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """ + def resizeImgForDisplay(self, img): + " Returns a copy of an image, shrunk to fit it's frame (keeps aspect ratio), and with a background " if max(img.width, img.height) > IMG_DISPLAY_SZ: if (img.width > img.height): newHeight = int(img.height * IMG_DISPLAY_SZ/img.width) @@ -212,6 +219,7 @@ class ImgReviewer: int((IMG_DISPLAY_SZ - img.height) / 2))) return bgImg # Create GUI and defer control +print("Starting GUI") root = tki.Tk() ImgReviewer(root, nodeToImgs) root.mainloop() diff --git a/backend/data/trimTree.py 
b/backend/data/trimTree.py index 302ea0d..fa269d8 100755 --- a/backend/data/trimTree.py +++ b/backend/data/trimTree.py @@ -3,21 +3,25 @@ import sys import sqlite3 -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Removes certain children from a tol-tree in an sqlite db.\n" -usageInfo += "Looks for nodes with an amount of children above a threshold,\n" -usageInfo += "and removes the excess, excluding those with 'significant'\n" -usageInfo += "associations, like those with descriptions and images.\n" +usageInfo = f""" +Usage: {sys.argv[0]} + +Tries to remove 'low significance' nodes from the database. Currently +removes nodes that don't have an image or description, or a presence in +the reduced tree. Also, for nodes with 'many' children, trims some more, +ignoring the presence of node descriptions. +""" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) dbFile = "data.db" -softChildLimit = 500 +softChildLimit = 500 # Used to determine when a node has 'many' children +print("Opening database") dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() -# Get nodes that shouldn't be deleted, along with their ancestors + print("Finding nodes to keep") nodesToKeep = set() nodesToStronglyKeep = set() @@ -41,25 +45,26 @@ for name in nodesToKeep: print(f"\tAt iteration {iterNum}") # while True: - row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (name,)).fetchone() + row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone() if row != None: parent = row[0] if parent not in nodesToKeep and parent not in ancestors: ancestors.add(parent) - if name in nodesToStronglyKeep: + if name not in nodesToStronglyKeep: nodesToStronglyKeep.add(parent) name = parent continue break nodesToKeep.update(ancestors) -print(f"Total of {len(nodesToKeep)} nodes to keep") +print(f"Result: {len(nodesToKeep)} nodes to keep") + # Find root node -query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.node IS NULL 
LIMIT 1" +query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1" (rootName,) = dbCur.execute(query).fetchone() -print(f"Found root node '{rootName}'") -# Traverse tree, looking for trimmable nodes +print(f"Found root node \"{rootName}\"") + print("Looking for trimmable nodes") -nodeToTipsChg = {} +nodeToTipsChg = {} # Used to update 'tips' values after trimming nodesToDelete = set() iterNum = 0 def findTrimmables(nodeName): @@ -68,15 +73,15 @@ def findTrimmables(nodeName): if iterNum % 1e4 == 0: print(f"At iteration {iterNum}") # - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] childrenToKeep, otherChildren = set(), set() for n in childNames: if n in nodesToKeep: childrenToKeep.add(n) else: otherChildren.add(n) - # Check soft limit tipsRemoved = 0 + # Check soft limit if len(childrenToKeep) > softChildLimit: numToTrim = len(childrenToKeep) - softChildLimit # Try removing weakly-kept nodes, preferring those with less tips @@ -88,7 +93,7 @@ def findTrimmables(nodeName): candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True) otherChildren.update(candidatesToTrim[-numToTrim:]) childrenToKeep.difference_update(candidatesToTrim[-numToTrim:]) - # 'Simulate' deletions + # Mark nodes for deletion for n in otherChildren: tipsRemoved += markForDeletion(n) # Recurse on children @@ -99,7 +104,7 @@ def findTrimmables(nodeName): return tipsRemoved def markForDeletion(nodeName): nodesToDelete.add(nodeName) - childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))] + childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))] if len(childNames) == 0: return 1 else: @@ -108,7 +113,7 @@ def markForDeletion(nodeName): tipsRemoved += markForDeletion(n) return tipsRemoved 
findTrimmables(rootName) -# Delete trimmable nodes + print(f"Deleting {len(nodesToDelete)} nodes") iterNum = 0 for nodeName in nodesToDelete: @@ -117,10 +122,13 @@ for nodeName in nodesToDelete: print(f"At iteration {iterNum}") # dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,)) - dbCur.execute("DELETE FROM edges WHERE node = ?", (nodeName,)) + dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,)) dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,)) dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,)) - dbCur.execute("DELETE FROM eol_ids WHERE name = ?", (nodeName,)) + # Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this + # makes it much harder to restore the original data if needed, and + # the memory savings didn't seem significant. + print(f"Updating num-tips for {len(nodeToTipsChg)} nodes") iterNum = 0 for (nodeName, tipsChg) in nodeToTipsChg.items(): @@ -129,6 +137,7 @@ for (nodeName, tipsChg) in nodeToTipsChg.items(): print(f"At iteration {iterNum}") # dbCur.execute("UPDATE nodes SET tips = tips - ? 
WHERE name = ?", (tipsChg, nodeName)) -# Close db + +print("Closing database") dbCon.commit() dbCon.close() diff --git a/backend/server.py b/backend/server.py index 888f73a..4a364c3 100755 --- a/backend/server.py +++ b/backend/server.py @@ -28,7 +28,7 @@ if len(sys.argv) > 1: # Classes for objects sent as responses (matches lib.ts types in client-side code) class TolNode: - """ Used when responding to 'node' and 'chain' requests """ + " Used when responding to 'node' and 'chain' requests " def __init__(self, otolId, children, parent=None, tips=0, pSupport=False, commonName=None, imgName=None): self.otolId = otolId # string | null self.children = children # string[] @@ -38,24 +38,24 @@ class TolNode: self.commonName = commonName # null | string self.imgName = imgName # null | string | [string,string] | [null, string] | [string, null] class SearchSugg: - """ Represents a search suggestion """ + " Represents a search suggestion " def __init__(self, name, canonicalName=None): self.name = name # string self.canonicalName = canonicalName # string | null class SearchSuggResponse: - """ Sent as responses to 'search' requests """ + " Sent as responses to 'search' requests " def __init__(self, searchSuggs, hasMore): self.suggs = searchSuggs # SearchSugg[] self.hasMore = hasMore # boolean class DescInfo: - """ Represents a tol-node's associated description """ + " Represents a tol-node's associated description " def __init__(self, text, wikiId, fromRedirect, fromDbp): self.text = text # string self.wikiId = wikiId # number self.fromRedirect = fromRedirect # boolean self.fromDbp = fromDbp # boolean class ImgInfo: - """ Represents a tol-node's associated image """ + " Represents a tol-node's associated image " def __init__(self, id, src, url, license, artist, credit): self.id = id # number self.src = src # string @@ -64,7 +64,7 @@ class ImgInfo: self.artist = artist # string self.credit = credit # string class InfoResponse: - """ Sent as responses to 'info' requests """ + " 
Sent as responses to 'info' requests " def __init__(self, tolNode, descData, imgData): self.tolNode = tolNode # null | TolNode self.descData = descData # null | DescInfo | [DescInfo, DescInfo] @@ -84,7 +84,7 @@ def lookupNodes(names, useReducedTree): for (nodeName, otolId, tips) in cur.execute(query, names): nameToNodes[nodeName] = TolNode(otolId, [], tips=tips) # Get child info - query = f"SELECT node, child FROM {edgesTable} WHERE node IN ({queryParamStr})" + query = f"SELECT parent, child FROM {edgesTable} WHERE parent IN ({queryParamStr})" for (nodeName, childName) in cur.execute(query, names): nameToNodes[nodeName].children.append(childName) # Order children by tips @@ -96,7 +96,7 @@ def lookupNodes(names, useReducedTree): childToTips[n] = tips node.children.sort(key=lambda n: childToTips[n], reverse=True) # Get parent info - query = f"SELECT node, child, p_support FROM {edgesTable} WHERE child IN ({queryParamStr})" + query = f"SELECT parent, child, p_support FROM {edgesTable} WHERE child IN ({queryParamStr})" for (nodeName, childName, pSupport) in cur.execute(query, names): nameToNodes[childName].parent = nodeName nameToNodes[childName].pSupport = (pSupport == 1) |
