aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-06-22 23:16:42 +1000
committerTerry Truong <terry06890@gmail.com>2022-06-22 23:16:42 +1000
commitabb936f5d76f7fe5cec1e8948d287da86643d504 (patch)
treef07b9eaadf5ae91363fdbac9d81b74e1fb0a436f
parente78c4df403e5f98afa08f7a0841ff233d5f6d05b (diff)
Refactor backend scriptsextended-db
-rw-r--r--backend/data/README.md80
-rwxr-xr-xbackend/data/addPickedNames.py26
-rwxr-xr-xbackend/data/dbpedia/genDescData.py96
-rw-r--r--backend/data/enwiki/README.md2
-rwxr-xr-xbackend/data/enwiki/downloadImgLicenseInfo.py60
-rwxr-xr-xbackend/data/enwiki/downloadImgs.py (renamed from backend/data/enwiki/downloadEnwikiImgs.py)37
-rwxr-xr-xbackend/data/enwiki/genDescData.py43
-rwxr-xr-xbackend/data/enwiki/genDumpIndexDb.py26
-rwxr-xr-xbackend/data/enwiki/genImgData.py72
-rwxr-xr-xbackend/data/enwiki/lookupPage.py22
-rw-r--r--backend/data/eol/README.md5
-rwxr-xr-xbackend/data/eol/downloadImgs.py152
-rwxr-xr-xbackend/data/eol/genImagesListDb.sh2
-rwxr-xr-xbackend/data/eol/reviewImgs.py48
-rwxr-xr-xbackend/data/genDbpData.py93
-rwxr-xr-xbackend/data/genEnwikiDescData.py54
-rwxr-xr-xbackend/data/genEnwikiNameData.py35
-rwxr-xr-xbackend/data/genEolNameData.py145
-rwxr-xr-xbackend/data/genImgs.py60
-rwxr-xr-xbackend/data/genLinkedImgs.py42
-rwxr-xr-xbackend/data/genOtolData.py181
-rwxr-xr-xbackend/data/genReducedTreeData.py167
-rwxr-xr-xbackend/data/reviewImgsToGen.py80
-rwxr-xr-xbackend/data/trimTree.py53
-rwxr-xr-xbackend/server.py16
25 files changed, 876 insertions, 721 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 7d1adad..f5b35f0 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -1,17 +1,50 @@
This directory holds files used to generate data.db, which contains tree-of-life data.
# Tables:
-- `nodes`: `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT`
-- `edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)`
-- `eol_ids`: `id INT PRIMARY KEY, name TEXT`
-- `names`: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)`
-- `wiki_ids`: `name TEXT PRIMARY KEY, id INT, redirected INT`
-- `descs`: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT`
-- `node_imgs`: `name TEXT PRIMARY KEY, img_id INT, src TEXT`
-- `images`: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)`
-- `linked_imgs`: `name TEXT PRIMARY KEY, otol_ids TEXT`
-- `r_nodes`: `name TEXT PRIMARY KEY, tips INT`
-- `r_edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)`
+## Tree Structure data
+- `nodes` <br>
+ Format: `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` <br>
+ Represents a tree-of-life node. `tips` represents the number of no-child descendants.
+- `edges` <br>
+ Format: `parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child)` <br>
+ `p_support` is 1 if the edge has 'phylogenetic support', and 0 otherwise.
+## Node name data
+- `eol_ids` <br>
+ Format: `id INT PRIMARY KEY, name TEXT` <br>
+ Associates an EOL ID with a node's name.
+- `names` <br>
+ Format: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` <br>
+ Associates a node with alternative names.
+ `pref_alt` is 1 if the alt-name is the most 'preferred' one.
+ `src` indicates the dataset the alt-name was obtained from (can be 'eol', 'enwiki', or 'picked').
+## Node description data
+- `wiki_ids` <br>
+ Format: `name TEXT PRIMARY KEY, id INT, redirected INT` <br>
+ Associates a node with a wikipedia page ID.
+ `redirected` is 1 if the node was associated with a different page that redirected to this one.
+- `descs` <br>
+ Format: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` <br>
+ Associates a wikipedia page ID with a short-description.
+ `from_dbp` is 1 if the description was obtained from DBpedia, and 0 otherwise.
+## Node image data
+- `node_imgs` <br>
+ Format: `name TEXT PRIMARY KEY, img_id INT, src TEXT` <br>
+ Associates a node with an image.
+- `images` <br>
+ Format: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)` <br>
+ Represents an image, identified by a source ('eol', 'enwiki', or 'picked'), and a source-specific ID.
+- `linked_imgs` <br>
+ Format: `name TEXT PRIMARY KEY, otol_ids TEXT` <br>
+ Associates a node with an image from another node.
+ `otol_ids` can be an otol ID, or two comma-separated otol IDs or empty strings.
+ The latter is used for compound nodes.
+## Reduced-tree data
+- `r_nodes` <br>
+ Format: `name TEXT PRIMARY KEY, tips INT` <br>
+ Like `nodes`, but for a reduced tree.
+- `r_edges` <br>
+ Format: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` <br>
+ Like `edges` but for a reduced tree.
# Generating the Database
@@ -68,7 +101,7 @@ Some of the python scripts require third-party packages:
- pickedEnwikiNamesToSkip.txt: Same as with genDbpData.py.
- pickedEnwikiLabels.txt: Similar to pickedDbpLabels.txt.
-## Generate image data
+## Generate node image data
### Get images from EOL
1. Obtain 'image metadata files' in eol/, as specified in its README.
2. In eol/, run downloadImgs.py, which downloads images (possibly multiple per node),
@@ -81,7 +114,7 @@ Some of the python scripts require third-party packages:
using the `wiki_ids` table, and stores them in a database.
2. In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for
those images, using wikipedia's online API.
-3. In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed'
+3. In enwiki/, run downloadImgs.py, which downloads 'permissively-licensed'
images into enwiki/imgs/.
### Merge the image sets
1. Run reviewImgsToGen.py, which displays images from eol/imgs/ and enwiki/imgs/,
@@ -107,15 +140,16 @@ Some of the python scripts require third-party packages:
`nodes`, `edges`, and `node_imgs` tables.
## Do some post-processing
-1. Run genReducedTreeData.py, which generates a second, reduced version of the tree,
- adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from
- pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line).
-2. Optionally run trimTree.py, which tries to remove some 'low-significance' nodes,
- for the sake of performance and result-relevance. Otherwise, some nodes may have
- over 10k children, which can take a while to render (over a minute in my testing).
- You might want to backup the untrimmed tree first, as this operation is not easily
- reversible.
-3. Optionally run genEnwikiNameData.py, which adds more entries to the `names` table,
+1. Run genEnwikiNameData.py, which adds more entries to the `names` table,
using data in enwiki/, and the `names` and `wiki_ids` tables.
-4. Optionally run addPickedNames.py, which allows adding manually-selected name data to
+2. Optionally run addPickedNames.py, which allows adding manually-selected name data to
the `names` table, as specified in pickedNames.txt.
+ - pickedNames.txt: Has lines of the form `nodeName1|altName1|prefAlt1`.
+ These correspond to entries in the `names` table. `prefAlt` should be 1 or 0.
+ A line like `name1|name1|1` causes a node to have no preferred alt-name.
+3. Run genReducedTreeData.py, which generates a second, reduced version of the tree,
+ adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from
+ pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line).
+4. Optionally run trimTree.py, which tries to remove some 'low significance' nodes,
+ for the sake of performance and content-relevance. Otherwise, some nodes may have
+ over 10k children, which can take a while to render (took over a minute in testing).
diff --git a/backend/data/addPickedNames.py b/backend/data/addPickedNames.py
index 3ef099a..d56a0cb 100755
--- a/backend/data/addPickedNames.py
+++ b/backend/data/addPickedNames.py
@@ -3,12 +3,11 @@
import sys
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads alt-name data from a file, and adds it to the 'names' table.\n"
-usageInfo += "The file is expected to have lines of the form: nodeName|altName|prefAlt\n"
-usageInfo += " These correspond to entries in the 'names' table. 'prefAlt' should\n"
-usageInfo += " be 1 or 0. A line may specify name1|name1|1, which causes the node\n"
-usageInfo += " to have no preferred alt-name.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads alt-name data from a file, and adds it to the database's 'names' table.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -16,15 +15,21 @@ if len(sys.argv) > 1:
dbFile = "data.db"
pickedNamesFile = "pickedNames.txt"
-# Open db
+print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Iterate through picked-names file
+
+print("Iterating through picked-names file")
with open(pickedNamesFile) as file:
for line in file:
# Get record data
- (nodeName, altName, prefAlt) = line.lower().rstrip().split("|")
+ nodeName, altName, prefAlt = line.lower().rstrip().split("|")
prefAlt = int(prefAlt)
+ # Check whether there exists a node with the name
+ row = dbCur.execute("SELECT name from nodes where name = ?", (nodeName,)).fetchone()
+ if row == None:
+ print(f"ERROR: No node with name \"{nodeName}\" exists")
+ break
# Remove any existing preferred-alt status
if prefAlt == 1:
query = "SELECT name, alt_name FROM names WHERE name = ? AND pref_alt = 1"
@@ -46,6 +51,7 @@ with open(pickedNamesFile) as file:
print(f"Updating record for alt-name {altName} for {nodeName}")
dbCur.execute("UPDATE names SET pref_alt = ?, src = 'picked' WHERE name = ? AND alt_name = ?",
(prefAlt, nodeName, altName))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/dbpedia/genDescData.py b/backend/data/dbpedia/genDescData.py
index bba3ff5..d9e8a80 100755
--- a/backend/data/dbpedia/genDescData.py
+++ b/backend/data/dbpedia/genDescData.py
@@ -3,25 +3,28 @@
import sys, re
import bz2, sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n"
-usageInfo += "and creates a sqlite db containing that data.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Adds DBpedia labels/types/abstracts/etc data into a database.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines
+labelsFile = "labels_lang=en.ttl.bz2" # Had about 16e6 entries
idsFile = "page_lang=en_ids.ttl.bz2"
redirectsFile = "redirects_lang=en_transitive.ttl.bz2"
disambigFile = "disambiguations_lang=en.ttl.bz2"
typesFile = "instance-types_lang=en_specific.ttl.bz2"
abstractsFile = "short-abstracts_lang=en.ttl.bz2"
dbFile = "descData.db"
+# In testing, this script took a few hours to run, and generated about 10GB
-# Open db
+print("Creating database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Read/store labels
+
print("Reading/storing label data")
dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)")
dbCur.execute("CREATE INDEX labels_idx ON labels(label)")
@@ -32,16 +35,13 @@ with bz2.open(labelsFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"Processing line {lineNum}")
+ print(f"At line {lineNum}")
#
match = labelLineRegex.fullmatch(line)
if match == None:
- print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr)
- sys.exit(1)
- else:
- dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2)))
-dbCon.commit()
-# Read/store wiki page ids
+ raise Exception(f"ERROR: Line {lineNum} has unexpected format")
+ dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2)))
+
print("Reading/storing wiki page ids")
dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)")
idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n')
@@ -50,20 +50,17 @@ with bz2.open(idsFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"Processing line {lineNum}")
+ print(f"At line {lineNum}")
#
match = idLineRegex.fullmatch(line)
if match == None:
- print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr)
- sys.exit(1)
- else:
- try:
- dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2))))
- except sqlite3.IntegrityError as e:
- # Accounts for certain lines that have the same IRI
- print(f"Failed to add entry with IRI \"{match.group(1)}\": {e}")
-dbCon.commit()
-# Read/store redirects
+ raise Exception(f"ERROR: Line {lineNum} has unexpected format")
+ try:
+ dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2))))
+ except sqlite3.IntegrityError as e:
+ # Accounts for certain lines that have the same IRI
+ print(f"WARNING: Failed to add entry with IRI \"{match.group(1)}\": {e}")
+
print("Reading/storing redirection data")
dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)")
redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n')
@@ -72,37 +69,28 @@ with bz2.open(redirectsFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"Processing line {lineNum}")
+ print(f"At line {lineNum}")
#
match = redirLineRegex.fullmatch(line)
if match == None:
- print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr)
- sys.exit(1)
- else:
- dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2)))
-dbCon.commit()
-# Read/store diambiguation-page data
+ raise Exception(f"ERROR: Line {lineNum} has unexpected format")
+ dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2)))
+
print("Reading/storing diambiguation-page data")
-disambigNames = set()
+dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)")
disambigLineRegex = redirLineRegex
lineNum = 0
with bz2.open(disambigFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"Processing line {lineNum}")
+ print(f"At line {lineNum}")
#
match = disambigLineRegex.fullmatch(line)
if match == None:
- print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr)
- sys.exit(1)
- else:
- disambigNames.add(match.group(1))
-dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)")
-for name in disambigNames:
- dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,))
-dbCon.commit()
-# Read/store instance-type
+ raise Exception(f"ERROR: Line {lineNum} has unexpected format")
+ dbCur.execute("INSERT OR IGNORE INTO disambiguations VALUES (?)", (match.group(1),))
+
print("Reading/storing instance-type data")
dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)")
dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)")
@@ -112,16 +100,13 @@ with bz2.open(typesFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"Processing line {lineNum}")
+ print(f"At line {lineNum}")
#
match = typeLineRegex.fullmatch(line)
if match == None:
- print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr)
- sys.exit(1)
- else:
- dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2)))
-dbCon.commit()
-# Read/store abstracts
+ raise Exception(f"ERROR: Line {lineNum} has unexpected format")
+ dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2)))
+
print("Reading/storing abstracts")
dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)")
descLineRegex = labelLineRegex
@@ -130,17 +115,16 @@ with bz2.open(abstractsFile, mode='rt') as file:
for line in file:
lineNum += 1
if lineNum % 1e5 == 0:
- print(f"Processing line {lineNum}")
+ print(f"At line {lineNum}")
#
if line[0] == "#":
continue
match = descLineRegex.fullmatch(line)
if match == None:
- print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr)
- sys.exit(1)
- else:
- dbCur.execute("INSERT INTO abstracts VALUES (?, ?)",
- (match.group(1), match.group(2).replace(r'\"', '"')))
-# Close db
+ raise Exception(f"ERROR: Line {lineNum} has unexpected format")
+ dbCur.execute("INSERT INTO abstracts VALUES (?, ?)",
+ (match.group(1), match.group(2).replace(r'\"', '"')))
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md
index 1c16a2e..90d16c7 100644
--- a/backend/data/enwiki/README.md
+++ b/backend/data/enwiki/README.md
@@ -42,7 +42,7 @@ This directory holds files obtained from/using [English Wikipedia](https://en.wi
`img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids.
- `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT` <br>
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
-- downloadEnwikiImgs.py <br>
+- downloadImgs.py <br>
Used to download image files into imgs/.
# Other Files
diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py
index 097304b..399922e 100755
--- a/backend/data/enwiki/downloadImgLicenseInfo.py
+++ b/backend/data/enwiki/downloadImgLicenseInfo.py
@@ -5,41 +5,48 @@ import sqlite3, urllib.parse, html
import requests
import time, signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads image names from a file, and uses enwiki's API to obtain\n"
-usageInfo += "licensing information for them, adding the info to a sqlite db.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "at names added to the db to decide what to skip.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads image names from a database, and uses enwiki's online API to obtain
+licensing information for them, adding the info to the database.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+at already-processed names to decide what to skip.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-imgDb = "imgData.db" # About 130k image names
+imgDb = "imgData.db"
apiUrl = "https://en.wikipedia.org/w/api.php"
+userAgent = "terryt.dev (terry06890@gmail.com)"
batchSz = 50 # Max 50
tagRegex = re.compile(r"<[^<]+>")
whitespaceRegex = re.compile(r"\s+")
-# Open db
+print("Opening database")
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
dbCur2 = dbCon.cursor()
-# Create table if it doesn't exist
+print("Checking for table")
if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='imgs'").fetchone() == None:
dbCur.execute("CREATE TABLE imgs(" \
"name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)")
-# Get image names
+
print("Reading image names")
imgNames = set()
for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
imgNames.add(imgName)
-print(f"Found {len(imgNames)} images")
+print(f"Found {len(imgNames)}")
+
+print("Checking for already-processed images")
oldSz = len(imgNames)
for (imgName,) in dbCur.execute("SELECT name FROM imgs"):
imgNames.discard(imgName)
-print(f"Skipping {oldSz - len(imgNames)} already-done images")
+print(f"Found {oldSz - len(imgNames)}")
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -48,7 +55,8 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Iterate through image names, making API requests
+
+print("Iterating through image names")
imgNames = list(imgNames)
iterNum = 0
for i in range(0, len(imgNames), batchSz):
@@ -63,7 +71,7 @@ for i in range(0, len(imgNames), batchSz):
imgBatch = ["File:" + x for x in imgBatch]
# Make request
headers = {
- "user-agent": "terryt.dev (terry06890@gmail.com)",
+ "user-agent": userAgent,
"accept-encoding": "gzip",
}
params = {
@@ -80,16 +88,16 @@ for i in range(0, len(imgNames), batchSz):
response = requests.get(apiUrl, params=params, headers=headers)
responseObj = response.json()
except Exception as e:
- print(f"Error while downloading info: {e}", file=sys.stderr)
- print(f"\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
+ print(f"ERROR: Exception while downloading info: {e}")
+ print(f"\tImage batch: " + "|".join(imgBatch))
continue
# Parse response-object
if "query" not in responseObj or "pages" not in responseObj["query"]:
- print("WARNING: Response object for doesn't have page data", file=sys.stderr)
- print("\tImage batch: " + "|".join(imgBatch), file=sys.stderr)
+ print("WARNING: Response object for doesn't have page data")
+ print("\tImage batch: " + "|".join(imgBatch))
if "error" in responseObj:
errorCode = responseObj["error"]["code"]
- print(f"\tError code: {errorCode}", file=sys.stderr)
+ print(f"\tError code: {errorCode}")
if errorCode == "maxlag":
time.sleep(5)
continue
@@ -111,10 +119,10 @@ for i in range(0, len(imgNames), batchSz):
title = normalisedToInput[title]
title = title[5:] # Remove 'File:'
if title not in imgNames:
- print(f"WARNING: Got title \"{title}\" not in image-name list", file=sys.stderr)
+ print(f"WARNING: Got title \"{title}\" not in image-name list")
continue
if "imageinfo" not in page:
- print(f"WARNING: No imageinfo section for page \"{title}\"", file=sys.stderr)
+ print(f"WARNING: No imageinfo section for page \"{title}\"")
continue
metadata = page["imageinfo"][0]["extmetadata"]
url = page["imageinfo"][0]["url"]
@@ -122,7 +130,7 @@ for i in range(0, len(imgNames), batchSz):
artist = metadata['Artist']['value'] if 'Artist' in metadata else None
credit = metadata['Credit']['value'] if 'Credit' in metadata else None
restrictions = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None
- # Remove newlines
+ # Remove markup
if artist != None:
artist = tagRegex.sub(" ", artist)
artist = whitespaceRegex.sub(" ", artist)
@@ -134,7 +142,9 @@ for i in range(0, len(imgNames), batchSz):
credit = html.unescape(credit)
credit = urllib.parse.unquote(credit)
# Add to db
- dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)", (title, license, artist, credit, restrictions, url))
-# Close db
+ dbCur2.execute("INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)",
+ (title, license, artist, credit, restrictions, url))
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadImgs.py
index 2929a0d..8fb605f 100755
--- a/backend/data/enwiki/downloadEnwikiImgs.py
+++ b/backend/data/enwiki/downloadImgs.py
@@ -5,13 +5,16 @@ import sqlite3
import urllib.parse, requests
import time, signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an sqlite db,\n"
-usageInfo += "into a specified directory.'\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish an ongoing download and exit.\n"
-usageInfo += "The program can be re-run to continue downloading, and looks\n"
-usageInfo += "in the output directory do decide what to skip.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Downloads images from URLs in an image database, into an output directory,
+with names of the form 'pageId1.ext1'.
+
+SIGINT causes the program to finish an ongoing download and exit.
+The program can be re-run to continue downloading, and looks
+in the output directory do decide what to skip.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -19,18 +22,18 @@ if len(sys.argv) > 1:
imgDb = "imgData.db" # About 130k image names
outDir = "imgs"
licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE)
+# In testing, this downloaded about 100k images, over several days
-# Create output directory if not present
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Get existing image names
-print("Gettings already-downloaded images")
+print("Checking for already-downloaded images")
fileList = os.listdir(outDir)
pageIdsDone = set()
for filename in fileList:
(basename, extension) = os.path.splitext(filename)
pageIdsDone.add(int(basename))
-print(f"Found {len(pageIdsDone)} already-downloaded images")
+print(f"Found {len(pageIdsDone)}")
+
# Set SIGINT handler
interrupted = False
oldHandler = None
@@ -39,10 +42,10 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
-# Open db
+
+print("Opening database")
dbCon = sqlite3.connect(imgDb)
dbCur = dbCon.cursor()
-# Start downloads
print("Starting downloads")
iterNum = 0
query = "SELECT page_id, license, artist, credit, restrictions, url FROM" \
@@ -68,7 +71,7 @@ for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query)
urlParts = urllib.parse.urlparse(url)
extension = os.path.splitext(urlParts.path)[1]
if len(extension) <= 1:
- print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
+ print(f"WARNING: No filename extension found in URL {url}")
sys.exit(1)
outFile = f"{outDir}/{pageId}{extension}"
headers = {
@@ -81,8 +84,8 @@ for (pageId, license, artist, credit, restrictions, url) in dbCur.execute(query)
file.write(response.content)
time.sleep(1)
# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to "throttle self to 1 cache miss per sec"
- # It's unclear how to properly check for cache misses, so just do about <=1 per sec
+ # It's unclear how to properly check for cache misses, so this just aims for 1 per sec
except Exception as e:
- print(f"Error while downloading to {outFile}: {e}", file=sys.stderr)
-# Close db
+ print(f"Error while downloading to {outFile}: {e}")
+print("Closing database")
dbCon.close()
diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py
index 032dbed..b0ca272 100755
--- a/backend/data/enwiki/genDescData.py
+++ b/backend/data/enwiki/genDescData.py
@@ -5,31 +5,36 @@ import bz2
import html, mwxml, mwparserfromhell
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n"
-usageInfo += "and short-description info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads through the wiki dump, and attempts to
+parse short-descriptions, and add them to a database.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages
+dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages
enwikiDb = "descData.db"
+# In testing, this script took over 10 hours to run, and generated about 5GB
-# Some regexps and functions for parsing wikitext
descLineRegex = re.compile("^ *[A-Z'\"]")
embeddedHtmlRegex = re.compile(r"<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$")
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}")
-parensGrpRegex = re.compile(r" \([^()]*\)")
-leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
def convertTemplateReplace(match):
if match.group(2) == None:
return f"{match.group(1)} {match.group(4)}"
else:
return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}"
+parensGroupRegex = re.compile(r" \([^()]*\)")
+leftoverBraceRegex = re.compile(r"(?:{\||{{).*")
+
def parseDesc(text):
- # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank
- # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines,
+ # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs,
+ # and then accumulate lines until a blank one.
+ # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines,
# nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell,
lines = []
openBraceCount = 0
@@ -74,18 +79,15 @@ def removeMarkup(content):
content = embeddedHtmlRegex.sub("", content)
content = convertTemplateRegex.sub(convertTemplateReplace, content)
content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup
- content = parensGrpRegex.sub("", content)
+ content = parensGroupRegex.sub("", content)
content = leftoverBraceRegex.sub("", content)
return content
-# Other helper functions
def convertTitle(title):
return html.unescape(title).replace("_", " ")
-# Check for existing db
+print("Creating database")
if os.path.exists(enwikiDb):
- print(f"ERROR: Existing {enwikiDb}", file=sys.stderr)
- sys.exit(1)
-# Create db
+ raise Exception(f"ERROR: Existing {enwikiDb}")
dbCon = sqlite3.connect(enwikiDb)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
@@ -93,8 +95,8 @@ dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)")
dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
-# Read through dump file
-print("Reading dump file")
+
+print("Iterating through dump file")
with bz2.open(dumpFile, mode='rt') as file:
dump = mwxml.Dump.from_file(file)
pageNum = 0
@@ -102,13 +104,15 @@ with bz2.open(dumpFile, mode='rt') as file:
pageNum += 1
if pageNum % 1e4 == 0:
print(f"At page {pageNum}")
+ if pageNum > 3e4:
+ break
# Parse page
if page.namespace == 0:
try:
dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title)))
except sqlite3.IntegrityError as e:
# Accounts for certain pages that have the same title
- print(f"Failed to add page with title \"{page.title}\": {e}")
+ print(f"Failed to add page with title \"{page.title}\": {e}", file=sys.stderr)
continue
if page.redirect != None:
dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect)))
@@ -117,6 +121,7 @@ with bz2.open(dumpFile, mode='rt') as file:
desc = parseDesc(revision.text)
if desc != None:
dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/genDumpIndexDb.py b/backend/data/enwiki/genDumpIndexDb.py
index ee3e813..3955885 100755
--- a/backend/data/enwiki/genDumpIndexDb.py
+++ b/backend/data/enwiki/genDumpIndexDb.py
@@ -4,25 +4,26 @@ import sys, os, re
import bz2
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a Wikimedia enwiki dump index file,\n"
-usageInfo += "and stores it's offset and title data to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Adds data from the wiki dump index-file into a database.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # 22,034,540 lines
+indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
indexDb = "dumpIndex.db"
-# Check for existing db
if os.path.exists(indexDb):
- print(f"ERROR: Existing {indexDb}", file=sys.stderr)
- sys.exit(1)
-# Create db
+ raise Exception(f"ERROR: Existing {indexDb}")
+print("Creating database")
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)")
-# Reading index file
+
+print("Iterating through index file")
lineRegex = re.compile(r"([^:]+):([^:]+):(.*)")
lastOffset = 0
lineNum = 0
@@ -42,7 +43,7 @@ with bz2.open(indexFile, mode='rt') as file:
dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (t, p, lastOffset, offset))
except sqlite3.IntegrityError as e:
# Accounts for certain entries in the file that have the same title
- print(f"Failed on title \"{t}\": {e}")
+ print(f"Failed on title \"{t}\": {e}", file=sys.stderr)
entriesToAdd = []
lastOffset = offset
entriesToAdd.append([title, pageId])
@@ -50,7 +51,8 @@ for (title, pageId) in entriesToAdd:
try:
dbCur.execute("INSERT INTO offsets VALUES (?, ?, ?, ?)", (title, pageId, lastOffset, -1))
except sqlite3.IntegrityError as e:
- print(f"Failed on title \"{t}\": {e}")
-# Close db
+ print(f"Failed on title \"{title}\": {e}", file=sys.stderr)
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py
index 9bd28f4..dedfe14 100755
--- a/backend/data/enwiki/genImgData.py
+++ b/backend/data/enwiki/genImgData.py
@@ -4,9 +4,15 @@ import sys, re
import bz2, html, urllib.parse
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n"
-usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of page IDs, looks up their content in the wiki dump,
+and tries to parse infobox image names, storing them into a database.
+
+The program can be re-run with an updated set of page IDs, and
+will skip already-processed page IDs.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -21,58 +27,64 @@ def getInputPageIds():
return pageIds
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
-imgDb = "imgData.db" # Output db
+imgDb = "imgData.db" # The database to create
idLineRegex = re.compile(r"<id>(.*)</id>")
imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
+# In testing, got about 360k image names
-# Open dbs
+print("Getting input page-ids")
+pageIds = getInputPageIds()
+print(f"Found {len(pageIds)}")
+
+print("Opening databases")
indexDbCon = sqlite3.connect(indexDb)
indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
-# Create image-db table
-pidsDone = set()
+print("Checking tables")
if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ # Create tables if not present
imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
else:
+ # Check for already-processed page IDs
+ numSkipped = 0
for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
- pidsDone.add(pid)
- print(f"Will skip {len(pidsDone)} already-processed page-ids")
-# Get input pageIds
-print("Getting input page-ids", file=sys.stderr)
-pageIds = getInputPageIds()
-for pid in pidsDone:
- pageIds.remove(pid)
-print(f"Found {len(pageIds)} page-ids to process")
-# Get page-id dump-file offsets
-print("Getting dump-file offsets", file=sys.stderr)
+ if pid in pageIds:
+ pageIds.remove(pid)
+ numSkipped += 1
+ else:
+ print(f"WARNING: Found already-processed page ID {pid} which was not in input set")
+ print(f"Will skip {numSkipped} already-processed page IDs")
+
+print("Getting dump-file offsets")
offsetToPageids = {}
-offsetToEnd = {}
+offsetToEnd = {} # Maps chunk-start offsets to their chunk-end offsets
iterNum = 0
for pageId in pageIds:
iterNum += 1
if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}", file=sys.stderr)
+ print(f"At iteration {iterNum}")
#
query = "SELECT offset, next_offset FROM offsets WHERE id = ?"
row = indexDbCur.execute(query, (pageId,)).fetchone()
if row == None:
- print(f"WARNING: Page id {pageId} not found", file=sys.stderr)
+ print(f"WARNING: Page ID {pageId} not found")
continue
(chunkOffset, endOffset) = row
offsetToEnd[chunkOffset] = endOffset
if chunkOffset not in offsetToPageids:
offsetToPageids[chunkOffset] = []
offsetToPageids[chunkOffset].append(pageId)
-print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr)
-# Look through dump file, jumping to chunks containing relevant pages
-print("Reading through dump file", file=sys.stderr)
+print(f"Found {len(offsetToEnd)} chunks to check")
+
+print("Iterating through chunks in dump file")
def getImageName(content):
- """ Given an array of text-content lines, returns an image-filename, or None """
+ " Given an array of text-content lines, tries to return an infobox image name, or None "
+ # Doesn't try to find images in [[File:...]] and <imagemap> sections outside the infobox
for line in content:
match = imageLineRegex.match(line)
if match != None:
@@ -109,16 +121,15 @@ def getImageName(content):
imageName = html.unescape(imageName) # Intentionally unescaping again (handles some odd cases)
imageName = imageName.replace("_", " ")
return imageName
- # Skip lines like: | image = &lt;imagemap&gt;
+ # Exclude lines like: | image = &lt;imagemap&gt;
return None
- # Doesn't try and find images in outside-infobox [[File:...]] and <imagemap> sections
return None
with open(dumpFile, mode='rb') as file:
iterNum = 0
for (pageOffset, endOffset) in offsetToEnd.items():
iterNum += 1
if iterNum % 100 == 0:
- print(f"At iteration {iterNum}", file=sys.stderr)
+ print(f"At iteration {iterNum}")
#
pageIds = offsetToPageids[pageOffset]
# Jump to chunk
@@ -168,11 +179,12 @@ with open(dumpFile, mode='rb') as file:
imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
break
if not foundTextEnd:
- print(f"Did not find </text> for page id {pageId}", file=sys.stderr)
+ print(f"WARNING: Did not find </text> for page id {pageId}")
break
if not foundText:
- print(f"Did not find <text> for page id {pageId}", file=sys.stderr)
-# Close dbs
+ print(f"WARNING: Did not find <text> for page id {pageId}")
+
+print("Closing databases")
indexDbCon.close()
imgDbCon.commit()
imgDbCon.close()
diff --git a/backend/data/enwiki/lookupPage.py b/backend/data/enwiki/lookupPage.py
index 76f2f95..1a90851 100755
--- a/backend/data/enwiki/lookupPage.py
+++ b/backend/data/enwiki/lookupPage.py
@@ -4,9 +4,12 @@ import sys, re
import bz2
import sqlite3
-usageInfo = f"usage: {sys.argv[0]} title1\n"
-usageInfo += "Looks up a page with title title1 in a wikipedia dump,\n"
-usageInfo += "using a dump index db, and prints the corresponding <page>.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]} title1
+
+Looks up a page with title title1 in the wiki dump, using
+the dump-index db, and prints the corresponding <page>.
+"""
if len(sys.argv) != 2:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -15,20 +18,19 @@ dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
pageTitle = sys.argv[1].replace("_", " ")
-# Searching index file
-print("Lookup offset in index db")
+print("Looking up offset in index db")
dbCon = sqlite3.connect(indexDb)
dbCur = dbCon.cursor()
query = "SELECT title, offset, next_offset FROM offsets WHERE title = ?"
row = dbCur.execute(query, (pageTitle,)).fetchone()
if row == None:
print("Title not found")
- sys.exit(1)
-(_, pageOffset, endOffset) = row
+ sys.exit(0)
+_, pageOffset, endOffset = row
dbCon.close()
print(f"Found chunk at offset {pageOffset}")
-# Read dump file
-print("Reading dump file")
+
+print("Reading from wiki dump")
content = []
with open(dumpFile, mode='rb') as file:
# Get uncompressed chunk
@@ -61,6 +63,6 @@ with open(dumpFile, mode='rb') as file:
if line.lstrip() == "</page>":
break
lineIdx += 1
-# Print content
+
print("Content: ")
print("\n".join(content))
diff --git a/backend/data/eol/README.md b/backend/data/eol/README.md
index fbb008d..8c527a8 100644
--- a/backend/data/eol/README.md
+++ b/backend/data/eol/README.md
@@ -11,9 +11,10 @@ This directory holds files obtained from/using the [Encyclopedia of Life](https:
Contains metadata for images from EOL.
- imagesList/ <br>
Extracted from imagesList.tgz.
+- genImagesListDb.sh <br>
+ Creates a database, and imports imagesList/*.csv files into it.
- imagesList.db <br>
- Contains data from imagesList/.
- Created by running genImagesListDb.sh, which simply imports csv files into a database. <br>
+ Created by running genImagesListDb.sh <br>
Tables: <br>
- `images`:
`content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT`
diff --git a/backend/data/eol/downloadImgs.py b/backend/data/eol/downloadImgs.py
index ac72ea1..96bc085 100755
--- a/backend/data/eol/downloadImgs.py
+++ b/backend/data/eol/downloadImgs.py
@@ -7,18 +7,24 @@ import time
from threading import Thread
import signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Downloads images from URLs specified in an image-list database,\n"
-usageInfo += "for a specified set of EOL IDs. Downloaded images get names of\n"
-usageInfo += "the form 'eolId1 contentId1.ext1'.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT causes the program to finish ongoing downloads and exit.\n"
-usageInfo += "The program can be re-run to continue downloading. It looks for\n"
-usageInfo += "existing downloaded files, and continues after the one with\n"
-usageInfo += "highest EOL ID.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+For some set of EOL IDs, downloads associated images from URLs in
+an image-list database. Uses multiple downloading threads.
+
+May obtain multiple images per ID. The images will get names
+with the form 'eolId1 contentId1.ext1'.
+
+SIGINT causes the program to finish ongoing downloads and exit.
+The program can be re-run to continue downloading. It looks for
+already-downloaded files, and continues after the one with
+highest EOL ID.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
+# In testing, this downloaded about 70k images, over a few days
imagesListDb = "imagesList.db"
def getInputEolIds():
@@ -30,44 +36,29 @@ def getInputEolIds():
dbCon.close()
return eolIds
outDir = "imgsForReview/"
-LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
+MAX_IMGS_PER_ID = 3
+MAX_THREADS = 5
POST_DL_DELAY_MIN = 2 # Minimum delay in seconds to pause after download before starting another (for each thread)
POST_DL_DELAY_MAX = 3
+LICENSE_REGEX = r"cc-by((-nc)?(-sa)?(-[234]\.[05])?)|cc-publicdomain|cc-0-1\.0|public domain"
-# Get eol-ids from data db
print("Getting input EOL IDs")
eolIds = getInputEolIds()
-# Get eol-ids from images db
-print("Getting images-list-db EOL IDs")
+print("Getting EOL IDs to download for")
+# Get IDs from images-list db
imgDbCon = sqlite3.connect(imagesListDb)
imgCur = imgDbCon.cursor()
imgListIds = set()
-for row in imgCur.execute("SELECT DISTINCT page_id FROM images"):
- imgListIds.add(row[0])
-# Get eol-id intersection, and sort into list
+for (pageId,) in imgCur.execute("SELECT DISTINCT page_id FROM images"):
+ imgListIds.add(pageId)
+# Get set intersection, and sort into list
eolIds = eolIds.intersection(imgListIds)
eolIds = sorted(eolIds)
-print(f"Resulted in {len(eolIds)} EOL IDs")
+print(f"Result: {len(eolIds)} EOL IDs")
-MAX_IMGS_PER_ID = 3
-MAX_THREADS = 5
-numThreads = 0
-threadException = None # Used for ending main thread after a non-main thread exception
-def downloadImg(url, outFile):
- global numThreads, threadException
- try:
- data = requests.get(url)
- with open(outFile, 'wb') as file:
- file.write(data.content)
- time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
- except Exception as e:
- print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
- threadException = e
- numThreads -= 1
-# Create output directory if not present
+print("Checking output directory")
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Find next eol ID to download for
print("Finding next ID to download for")
nextIdx = 0
fileList = os.listdir(outDir)
@@ -78,7 +69,11 @@ if len(ids) > 0:
if nextIdx == len(eolIds):
print("No IDs left. Exiting...")
sys.exit(0)
-# Detect SIGINT signals
+
+print("Starting download threads")
+numThreads = 0
+threadException = None # Used for ending main thread after a non-main thread exception
+# Handle SIGINT signals
interrupted = False
oldHandler = None
def onSigint(sig, frame):
@@ -86,33 +81,27 @@ def onSigint(sig, frame):
interrupted = True
signal.signal(signal.SIGINT, oldHandler)
oldHandler = signal.signal(signal.SIGINT, onSigint)
+# Function for threads to execute
+def downloadImg(url, outFile):
+ global numThreads, threadException
+ try:
+ data = requests.get(url)
+ with open(outFile, 'wb') as file:
+ file.write(data.content)
+ time.sleep(random.random() * (POST_DL_DELAY_MAX - POST_DL_DELAY_MIN) + POST_DL_DELAY_MIN)
+ except Exception as e:
+ print(f"Error while downloading to {outFile}: {str(e)}", file=sys.stderr)
+ threadException = e
+ numThreads -= 1
# Manage downloading
for idx in range(nextIdx, len(eolIds)):
eolId = eolIds[idx]
# Get image urls
imgDataList = []
ownerSet = set() # Used to get images from different owners, for variety
- for row in imgCur.execute(
- "SELECT content_id, page_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?", (eolId,)):
- license = row[3]
- copyrightOwner = row[4]
- if re.fullmatch(LICENSE_REGEX, license) == None:
- continue
- if len(copyrightOwner) > 100: # Ignore certain copyrightOwner fields that seem long and problematic
- continue
- if copyrightOwner not in ownerSet:
- ownerSet.add(copyrightOwner)
- imgDataList.append(row)
- if len(ownerSet) == MAX_IMGS_PER_ID:
- break
- if len(imgDataList) == 0:
- continue
- # Determine output filenames
- outFiles = []
- urls = []
- for row in imgDataList:
- contentId = row[0]
- url = row[2]
+ exitLoop = False
+ query = "SELECT content_id, copy_url, license, copyright_owner FROM images WHERE page_id = ?"
+ for (contentId, url, license, copyrightOwner) in imgCur.execute(query, (eolId,)):
if url.startswith("data/"):
url = "https://content.eol.org/" + url
urlParts = urllib.parse.urlparse(url)
@@ -120,28 +109,37 @@ for idx in range(nextIdx, len(eolIds)):
if len(extension) <= 1:
print(f"WARNING: No filename extension found in URL {url}", file=sys.stderr)
continue
- outFiles.append(str(eolId) + " " + str(contentId) + extension)
- urls.append(url)
- # Start downloads
- exitLoop = False
- for i in range(len(outFiles)):
- outPath = outDir + outFiles[i]
- if not os.path.exists(outPath):
- # Enforce thread limit
- while numThreads == MAX_THREADS:
+ # Check image-quantity limit
+ if len(ownerSet) == MAX_IMGS_PER_ID:
+ break
+ # Check for skip conditions
+ if re.fullmatch(LICENSE_REGEX, license) == None:
+ continue
+ if len(copyrightOwner) > 100: # Avoid certain copyrightOwner fields that seem long and problematic
+ continue
+ if copyrightOwner in ownerSet:
+ continue
+ ownerSet.add(copyrightOwner)
+ # Determine output filename
+ outPath = f"{outDir}{eolId} {contentId}{extension}"
+ if os.path.exists(outPath):
+ print(f"WARNING: {outPath} already exists. Skipping download.")
+ continue
+ # Check thread limit
+ while numThreads == MAX_THREADS:
+ time.sleep(1)
+ # Wait for threads after an interrupt or thread-exception
+ if interrupted or threadException != None:
+ print("Waiting for existing threads to end")
+ while numThreads > 0:
time.sleep(1)
- # Wait for threads after an interrupt or thread-exception
- if interrupted or threadException != None:
- print("Waiting for existing threads to end")
- while numThreads > 0:
- time.sleep(1)
- exitLoop = True
- break
- print(f"Downloading image to {outPath}")
- # Perform download
- numThreads += 1
- thread = Thread(target=downloadImg, args=(urls[i], outPath), daemon=True)
- thread.start()
+ exitLoop = True
+ break
+ # Perform download
+ print(f"Downloading image to {outPath}")
+ numThreads += 1
+ thread = Thread(target=downloadImg, args=(url, outPath), daemon=True)
+ thread.start()
if exitLoop:
break
# Close images-list db
diff --git a/backend/data/eol/genImagesListDb.sh b/backend/data/eol/genImagesListDb.sh
index 3a8ced7..87dd840 100755
--- a/backend/data/eol/genImagesListDb.sh
+++ b/backend/data/eol/genImagesListDb.sh
@@ -1,7 +1,9 @@
#!/bin/bash
set -e
+# Combine CSV files into one, skipping header lines
cat imagesList/media_*_{1..58}.csv | tail -n +2 > imagesList.csv
+# Create database, and import the CSV file
sqlite3 imagesList.db <<END
CREATE TABLE images (
content_id INT PRIMARY KEY, page_id INT, source_url TEXT, copy_url TEXT, license TEXT, copyright_owner TEXT);
diff --git a/backend/data/eol/reviewImgs.py b/backend/data/eol/reviewImgs.py
index 5290f9e..ecdf7ab 100755
--- a/backend/data/eol/reviewImgs.py
+++ b/backend/data/eol/reviewImgs.py
@@ -7,11 +7,14 @@ from tkinter import ttk
import PIL
from PIL import ImageTk, Image, ImageOps
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Provides a GUI for reviewing images. Looks in a for-review directory for\n"
-usageInfo += "images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to\n"
-usageInfo += "choose an image to keep, or reject all. Also provides image rotation.\n"
-usageInfo += "Chosen images are placed in another directory, and rejected ones are deleted.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Provides a GUI for reviewing images. Looks in a for-review directory for
+images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to
+choose an image to keep, or reject all. Also provides image rotation.
+Chosen images are placed in another directory, and rejected ones are deleted.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -21,6 +24,7 @@ outDir = "imgs/"
extraInfoDbCon = sqlite3.connect("../data.db")
extraInfoDbCur = extraInfoDbCon.cursor()
def getExtraInfo(eolId):
+ global extraInfoDbCur
query = "SELECT names.alt_name FROM" \
" names INNER JOIN eol_ids ON eol_ids.name = names.name" \
" WHERE id = ? and pref_alt = 1"
@@ -31,21 +35,21 @@ def getExtraInfo(eolId):
return f"Reviewing EOL ID {eolId}"
IMG_DISPLAY_SZ = 400
MAX_IMGS_PER_ID = 3
-PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
+IMG_BG_COLOR = (88, 28, 135)
+PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), IMG_BG_COLOR)
-# Create output directory if not present
+print("Checking output directory")
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Get images for review
-print("Reading input image list")
+print("Getting input image list")
imgList = os.listdir(imgDir)
imgList.sort(key=lambda s: int(s.split(" ")[0]))
if len(imgList) == 0:
- print("No input images found", file=sys.stderr)
- sys.exit(1)
+ print("No input images found")
+ sys.exit(0)
class EolImgReviewer:
- """ Provides the GUI for reviewing images """
+ " Provides the GUI for reviewing images "
def __init__(self, root, imgList):
self.root = root
root.title("EOL Image Reviewer")
@@ -68,7 +72,7 @@ class EolImgReviewer:
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
- # Add bindings
+ # Add keyboard bindings
root.bind("<q>", self.quit)
root.bind("<Key-j>", lambda evt: self.accept(0))
root.bind("<Key-k>", lambda evt: self.accept(1))
@@ -87,11 +91,11 @@ class EolImgReviewer:
self.nextImgNames = []
self.rotations = []
self.getNextImgs()
- # For more info
+ # For displaying extra info
self.numReviewed = 0
self.startTime = time.time()
def getNextImgs(self):
- """ Updates display with new images to review, or ends program """
+ " Updates display with new images to review, or ends program "
# Gather names of next images to review
for i in range(MAX_IMGS_PER_ID):
if self.imgListIdx == len(self.imgList):
@@ -123,7 +127,7 @@ class EolImgReviewer:
del self.nextImgNames[idx]
del self.rotations[idx]
continue
- self.imgs[idx] = self.resizeForDisplay(img)
+ self.imgs[idx] = self.resizeImgForDisplay(img)
else:
self.imgs[idx] = PLACEHOLDER_IMG
self.photoImgs[idx] = ImageTk.PhotoImage(self.imgs[idx])
@@ -140,7 +144,7 @@ class EolImgReviewer:
title += f" (imgs {firstImgIdx} to {lastImgIdx} out of {len(self.imgList)})"
self.root.title(title)
def accept(self, imgIdx):
- """ React to a user selecting an image """
+ " React to a user selecting an image "
if imgIdx >= len(self.nextImgNames):
print("Invalid selection")
return
@@ -159,19 +163,20 @@ class EolImgReviewer:
self.numReviewed += 1
self.getNextImgs()
def reject(self):
- """ React to a user rejecting all images of a set """
+ " React to a user rejecting all images of a set "
for i in range(len(self.nextImgNames)):
os.remove(imgDir + self.nextImgNames[i])
self.numReviewed += 1
self.getNextImgs()
def rotate(self, imgIdx, anticlockwise = False):
- """ Respond to a user rotating an image """
+ " Respond to a user rotating an image "
deg = -90 if not anticlockwise else 90
self.imgs[imgIdx] = self.imgs[imgIdx].rotate(deg)
self.photoImgs[imgIdx] = ImageTk.PhotoImage(self.imgs[imgIdx])
self.labels[imgIdx].config(image=self.photoImgs[imgIdx])
self.rotations[imgIdx] = (self.rotations[imgIdx] + deg) % 360
def quit(self, e = None):
+ global extraInfoDbCon
print(f"Number reviewed: {self.numReviewed}")
timeElapsed = time.time() - self.startTime
print(f"Time elapsed: {timeElapsed:.2f} seconds")
@@ -179,8 +184,8 @@ class EolImgReviewer:
print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
extraInfoDbCon.close()
self.root.destroy()
- def resizeForDisplay(self, img):
- """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """
+ def resizeImgForDisplay(self, img):
+ " Returns a copy of an image, shrunk to fit in its frame (keeps aspect ratio), and with a background "
if max(img.width, img.height) > IMG_DISPLAY_SZ:
if (img.width > img.height):
newHeight = int(img.height * IMG_DISPLAY_SZ/img.width)
@@ -194,6 +199,7 @@ class EolImgReviewer:
int((IMG_DISPLAY_SZ - img.height) / 2)))
return bgImg
# Create GUI and defer control
+print("Starting GUI")
root = tki.Tk()
EolImgReviewer(root, imgList)
root.mainloop()
diff --git a/backend/data/genDbpData.py b/backend/data/genDbpData.py
index afe1e17..df3a6be 100755
--- a/backend/data/genDbpData.py
+++ b/backend/data/genDbpData.py
@@ -3,11 +3,12 @@
import sys, os, re
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads DBpedia data from dbpedia/*, along with tree-of-life\n"
-usageInfo += "node and name data from a sqlite database, associates nodes with\n"
-usageInfo += "DBpedia IRIs, and adds alt-name and description information for\n"
-usageInfo += "those nodes.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads a database containing data from DBpedia, and tries to associate
+DBpedia IRIs with nodes in a database, adding short-descriptions for them.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -16,18 +17,21 @@ dbpediaDb = "dbpedia/descData.db"
namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
pickedLabelsFile = "pickedDbpLabels.txt"
dbFile = "data.db"
+rootNodeName = "cellular organisms"
+rootLabel = "organism" # Will be associated with root node
+# Got about 400k descriptions when testing
-# Open dbs
+print("Opening databases")
dbpCon = sqlite3.connect(dbpediaDb)
dbpCur = dbpCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Get node names
-print("Reading node names")
+
+print("Getting node names")
nodeNames = set()
for (name,) in dbCur.execute("SELECT name from nodes"):
nodeNames.add(name)
-# Skipping certain names
+
print("Checking for names to skip")
oldSz = len(nodeNames)
if os.path.exists(namesToSkipFile):
@@ -35,22 +39,22 @@ if os.path.exists(namesToSkipFile):
for line in file:
nodeNames.remove(line.rstrip())
print(f"Skipping {oldSz - len(nodeNames)} nodes")
-# Get disambiguation page labels
+
print("Reading disambiguation-page labels")
disambigLabels = set()
query = "SELECT labels.iri from labels INNER JOIN disambiguations ON labels.iri = disambiguations.iri"
for (label,) in dbpCur.execute(query):
disambigLabels.add(label)
-# Try associating nodes with IRIs, accounting for disambiguation labels
-print("Trying to associate nodes with labels")
+
+print("Trying to associate nodes with DBpedia labels")
nodeToLabel = {}
-nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)")
-nameToVariants = {}
+nameVariantRegex = re.compile(r"(.*) \(([^)]+)\)") # Used to recognise labels like 'Thor (shrimp)'
+nameToVariants = {} # Maps node names to lists of matching labels
iterNum = 0
for (label,) in dbpCur.execute("SELECT label from labels"):
iterNum += 1
if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
+ print(f"At iteration {iterNum}")
#
if label in disambigLabels:
continue
@@ -69,18 +73,20 @@ for (label,) in dbpCur.execute("SELECT label from labels"):
nameToVariants[subName] = [label]
elif name not in nameToVariants[subName]:
nameToVariants[subName].append(label)
+# Associate labels without conflicts
for (name, variants) in nameToVariants.items():
if len(variants) == 1:
nodeToLabel[name] = variants[0]
for name in nodeToLabel:
del nameToVariants[name]
-nodeToLabel["cellular organisms"] = "organism" # Special case for root node
-print(f"Number of conflicts: {len(nameToVariants)}")
-# Try resolving conflicts
+# Special case for root node
+nodeToLabel[rootNodeName] = rootLabel
+if rootNodeName in nameToVariants:
+ del nameToVariants[rootNodeName]
+
+print(f"Trying to resolve {len(nameToVariants)} conflicts")
def resolveWithPickedLabels():
- # Attempts conflict resolution using a file with lines of the form 'name1|label1',
- # where label1 may be absent, indicating that no label should be associated with the name
- print("Resolving conflicts using picked-labels")
+ " Attempts to resolve conflicts using a picked-labels file "
with open(pickedLabelsFile) as file:
for line in file:
(name, _, label) = line.rstrip().partition("|")
@@ -94,11 +100,13 @@ def resolveWithPickedLabels():
print(f"INFO: Picked label \"{label}\" for name \"{name}\" outside choice set", file=sys.stderr)
nodeToLabel[name] = label
del nameToVariants[name]
- print(f"Remaining number of conflicts: {len(nameToVariants)}")
def resolveWithCategoryList():
- # Attempts conflict resolution using category-text in labels of the form 'name1 (category1)'
- # Does a generic-category pass first (avoid stuff like Pan being classified as a horse instead of an ape)
- print("Resolving conflicts using category-list")
+ """
+ Attempts to resolve conflicts by looking for labels like 'name1 (category1)',
+ and choosing those with a category1 that seems 'biological'.
+ Does two passes, using more generic categories first. This helps avoid stuff like
+ Pan being classified as a horse instead of an ape.
+ """
generalCategories = {
"species", "genus",
"plant", "fungus", "animal",
@@ -107,7 +115,7 @@ def resolveWithCategoryList():
}
specificCategories = {
"protist", "alveolate", "dinoflagellates",
- "orchid", "Poaceae", "fern", "moss", "alga",
+ "orchid", "poaceae", "fern", "moss", "alga",
"bryozoan", "hydrozoan",
"sponge", "cnidarian", "coral", "polychaete", "echinoderm",
"bivalve", "gastropod", "chiton",
@@ -139,10 +147,8 @@ def resolveWithCategoryList():
break
for name in namesToRemove:
del nameToVariants[name]
- print(f"Remaining number of conflicts: {len(nameToVariants)}")
def resolveWithTypeData():
- # Attempts conflict-resolution using dbpedia's instance-type data
- print("Resolving conflicts using instance-type data")
+ " Attempts to resolve conflicts using DBpedia's type data "
taxonTypes = { # Obtained from the DBpedia ontology
"http://dbpedia.org/ontology/Species",
"http://dbpedia.org/ontology/Archaea",
@@ -179,7 +185,7 @@ def resolveWithTypeData():
for (label, type) in dbpCur.execute("SELECT label, type from labels INNER JOIN types on labels.iri = types.iri"):
iterNum += 1
if iterNum % 1e5 == 0:
- print(f"Processing line {iterNum}")
+ print(f"At iteration {iterNum}")
#
if type in taxonTypes:
name = label.lower()
@@ -193,20 +199,17 @@ def resolveWithTypeData():
if name in nameToVariants:
nodeToLabel[name] = label
del nameToVariants[name]
- print(f"Remaining number of conflicts: {len(nameToVariants)}")
+#resolveWithTypeData()
+#resolveWithCategoryList()
resolveWithPickedLabels()
-# Associate nodes with IRIs
+print(f"Remaining number of conflicts: {len(nameToVariants)}")
+
print("Getting node IRIs")
nodeToIri = {}
-iterNum = 0
for (name, label) in nodeToLabel.items():
- row = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone()
- if row == None:
- print(f"ERROR: Couldn't find label {label}", file=sys.stderr)
- sys.exit(1)
- else:
- nodeToIri[name] = row[0]
-# Resolve redirects
+ (iri,) = dbpCur.execute("SELECT iri FROM labels where label = ? COLLATE NOCASE", (label,)).fetchone()
+ nodeToIri[name] = iri
+
print("Resolving redirects")
redirectingIriSet = set()
iterNum = 0
@@ -219,9 +222,10 @@ for (name, iri) in nodeToIri.items():
if row != None:
nodeToIri[name] = row[0]
redirectingIriSet.add(name)
-# Find descriptions, and add to db
-print("Adding node description data")
+
+print("Adding description tables")
dbCur.execute("CREATE TABLE wiki_ids (name TEXT PRIMARY KEY, id INT, redirected INT)")
+dbCur.execute("CREATE INDEX wiki_id_idx ON wiki_ids(id)")
dbCur.execute("CREATE TABLE descs (wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT)")
iterNum = 0
for (name, iri) in nodeToIri.items():
@@ -232,10 +236,11 @@ for (name, iri) in nodeToIri.items():
query = "SELECT abstract, id FROM abstracts INNER JOIN ids ON abstracts.iri = ids.iri WHERE ids.iri = ?"
row = dbpCur.execute(query, (iri,)).fetchone()
if row != None:
- (desc, wikiId) = row
+ desc, wikiId = row
dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, wikiId, 1 if name in redirectingIriSet else 0))
dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (wikiId, desc, 1))
-# Close dbs
+
+print("Closing databases")
dbCon.commit()
dbCon.close()
dbpCon.commit()
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index dbc8d6b..d3f93ed 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -3,10 +3,13 @@
import sys, re, os
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads Wikimedia enwiki data from enwiki/, and node and name data"
-usageInfo += "from a sqlite database, and adds description data for names that\n"
-usageInfo += "don't have them.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads a database containing data from Wikipedia, and tries to associate
+wiki pages with nodes in the database, and add descriptions for nodes
+that don't have them.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -15,36 +18,39 @@ enwikiDb = "enwiki/descData.db"
dbFile = "data.db"
namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
pickedLabelsFile = "pickedEnwikiLabels.txt"
+# Got about 25k descriptions when testing
-# Open dbs
+print("Opening databases")
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Read name/title files
+
+print("Checking for names to skip")
namesToSkip = set()
-nameToPickedTitle = {} # Maps names to titles to be used for them
if os.path.exists(namesToSkipFile):
with open(namesToSkipFile) as file:
for line in file:
namesToSkip.add(line.rstrip())
- print(f"Read in {len(namesToSkip)} names to skip")
+ print(f"Found {len(namesToSkip)}")
+print("Checking for picked-titles")
+nameToPickedTitle = {}
if os.path.exists(pickedLabelsFile):
with open(pickedLabelsFile) as file:
for line in file:
(name, _, title) = line.rstrip().partition("|")
nameToPickedTitle[name.lower()] = title
-print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
-# Get node names without descriptions
-print("Getting node names")
+print(f"Found {len(nameToPickedTitle)}")
+
+print("Getting names of nodes without descriptions")
nodeNames = set()
query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL"
-for row in dbCur.execute(query):
- nodeNames.add(row[0])
-print(f"Found {len(nodeNames)} names")
+for (name,) in dbCur.execute(query):
+ nodeNames.add(name)
+print(f"Found {len(nodeNames)}")
nodeNames.difference_update(namesToSkip)
-# Find page id for each node name
-print("Getting node page-ids")
+
+print("Associating nodes with page IDs")
nodeToPageId = {}
iterNum = 0
for name in nodeNames:
@@ -63,34 +69,34 @@ for name in nodeNames:
nodeToPageId[name] = row[0]
else:
print("WARNING: Picked title {title} not found", file=sys.stderr)
-# Resolve redirects
+
print("Resolving redirects")
redirectingNames = set()
iterNum = 0
for (name, pageId) in nodeToPageId.items():
iterNum += 1
- if iterNum % 1000 == 0:
+ if iterNum % 1e3 == 0:
print(f"At iteration {iterNum}")
#
- row = enwikiCur.execute(
- "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?",
- (pageId,)).fetchone()
+ query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?"
+ row = enwikiCur.execute(query, (pageId,)).fetchone()
if row != None:
nodeToPageId[name] = row[0]
redirectingNames.add(name)
-# Add descriptions for each node
+
print("Adding description data")
iterNum = 0
for (name, pageId) in nodeToPageId.items():
iterNum += 1
- if iterNum % 1000 == 0:
+ if iterNum % 1e3 == 0:
print(f"At iteration {iterNum}")
#
row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
if row != None:
dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0))
dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0))
-# Close dbs
+
+print("Closing databases")
dbCon.commit()
dbCon.close()
enwikiCon.close()
diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py
index 8285a40..7ad61d1 100755
--- a/backend/data/genEnwikiNameData.py
+++ b/backend/data/genEnwikiNameData.py
@@ -3,9 +3,13 @@
import sys, re
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads Wikimedia enwiki redirect data from enwiki/, and node and wiki-id\n"
-usageInfo += "data from a sqlite database, and adds supplmenentary alt-name data.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads from a database containing data from Wikipedia, along with
+node and wiki-id information from the database, and uses Wikipedia
+page-redirect information to add additional alt-name data.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -15,19 +19,19 @@ dbFile = "data.db"
altNameRegex = re.compile(r"[a-zA-Z]+")
# Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)',
-# Open dbs
+print("Opening databases")
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Get nodes with wiki-ids
+
print("Getting nodes with wiki IDs")
nodeToWikiId = {}
-for row in dbCur.execute("SELECT name, id from wiki_ids"):
- nodeToWikiId[row[0]] = row[1]
-print(f"Found {len(nodeToWikiId)} nodes")
-# Find wiki-ids that redirect to each node
-print("Finding redirecter names")
+for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"):
+ nodeToWikiId[nodeName] = wikiId
+print(f"Found {len(nodeToWikiId)}")
+
+print("Iterating through nodes, finding names that redirect to them")
nodeToAltNames = {}
numAltNames = 0
iterNum = 0
@@ -45,8 +49,8 @@ for (nodeName, wikiId) in nodeToWikiId.items():
nodeToAltNames[nodeName].add(name.lower())
numAltNames += 1
print(f"Found {numAltNames} alt-names")
-# Remove existing alt-names
-print("Removing existing alt-names")
+
+print("Excluding existing alt-names from the set")
query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
iterNum = 0
for (nodeName, altNames) in nodeToAltNames.items():
@@ -60,12 +64,13 @@ for (nodeName, altNames) in nodeToAltNames.items():
numAltNames -= len(existingNames)
altNames.difference_update(existingNames)
print(f"Left with {numAltNames} alt-names")
-# Add alt-names
-print("Adding alt-names")
+
+print("Adding alt-names to database")
for (nodeName, altNames) in nodeToAltNames.items():
for altName in altNames:
dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0))
-# Close dbs
+
+print("Closing databases")
dbCon.commit()
dbCon.close()
enwikiCon.close()
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
index d852751..dd33ee0 100755
--- a/backend/data/genEolNameData.py
+++ b/backend/data/genEolNameData.py
@@ -3,34 +3,39 @@
import sys, re, os
import html, csv, sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads vernacular-names CSV data (from the Encyclopedia of Life site),\n"
-usageInfo += "makes associations with node data in a sqlite database, and writes\n"
-usageInfo += "name data to that database.\n"
-usageInfo += "\n"
-usageInfo += "Expects a CSV header describing lines with format:\n"
-usageInfo += " page_id, canonical_form, vernacular_string, language_code,\n"
-usageInfo += " resource_name, is_preferred_by_resource, is_preferred_by_eol\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads files describing name data from the 'Encyclopedia of Life' site,
+tries to associate names with nodes in the database, and adds tables
+to represent associated names.
+
+Reads a vernacularNames.csv file:
+ Starts with a header line containing:
+ page_id, canonical_form, vernacular_string, language_code,
+ resource_name, is_preferred_by_resource, is_preferred_by_eol
+ The canonical_form and vernacular_string fields contain names
+ associated with the page ID. Names are not always unique to
+ particular page IDs.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-vnamesFile = "eol/vernacularNames.csv"
+vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries
dbFile = "data.db"
-NAMES_TO_SKIP = {"unknown", "unknown species", "unidentified species"}
+namesToSkip = {"unknown", "unknown species", "unidentified species"}
pickedIdsFile = "pickedEolIds.txt"
-badAltsFile = "pickedEolAltsToSkip.txt"
+altsToSkipFile = "pickedEolAltsToSkip.txt"
-# Read in vernacular-names data
- # Note: Canonical-names may have multiple pids
- # Note: A canonical-name's associated pids might all have other associated names
print("Reading in vernacular-names data")
-nameToPids = {}
+nameToPids = {} # 'pid' means 'Page ID'
canonicalNameToPids = {}
pidToNames = {}
-pidToPreferred = {}
+pidToPreferred = {} # Maps pids to 'preferred' names
def updateMaps(name, pid, canonical, preferredAlt):
- if name in NAMES_TO_SKIP:
+ global namesToSkip, nameToPids, canonicalNameToPids, pidToNames, pidToPreferred
+ if name in namesToSkip:
return
if name not in nameToPids:
nameToPids[name] = {pid}
@@ -52,6 +57,9 @@ with open(vnamesFile, newline="") as csvfile:
lineNum = 0
for row in reader:
lineNum += 1
+ if lineNum % 1e5 == 0:
+ print(f"At line {lineNum}")
+ # Skip header line
if lineNum == 1:
continue
# Parse line
@@ -64,7 +72,7 @@ with open(vnamesFile, newline="") as csvfile:
updateMaps(name1, pid, True, False)
if lang == "eng" and name2 != "":
updateMaps(name2, pid, False, preferred)
-# Check for manually-picked pids
+
print("Checking for manually-picked pids")
nameToPickedPid = {}
if os.path.exists(pickedIdsFile):
@@ -73,64 +81,77 @@ if os.path.exists(pickedIdsFile):
(name, _, eolId) = line.rstrip().partition("|")
nameToPickedPid[name] = None if eolId == "" else int(eolId)
print(f"Found {len(nameToPickedPid)}")
-# Read in node-alt_names to avoid
-print("Checking for bad-alt-names")
-nameToBadAlts = {}
-if os.path.exists(badAltsFile):
- with open(badAltsFile) as file:
+
+print("Checking for alt-names to skip")
+nameToAltsToSkip = {}
+numToSkip = 0
+if os.path.exists(altsToSkipFile):
+ with open(altsToSkipFile) as file:
for line in file:
(name, _, altName) = line.rstrip().partition("|")
- if name not in nameToBadAlts:
- nameToBadAlts[name] = [altName]
+ if name not in nameToAltsToSkip:
+ nameToAltsToSkip[name] = [altName]
else:
- nameToBadAlts[name].append(altName)
-print(f"Found bad-alts for {len(nameToBadAlts)} nodes")
-# Open db connection
+ nameToAltsToSkip[name].append(altName)
+ numToSkip += 1
+print(f"Found {numToSkip} alt-names to skip")
+
+print("Creating database tables")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Create tables
dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))")
dbCur.execute("CREATE INDEX names_idx ON names(name)")
dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)")
dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)")
dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)")
-# Iterate through 'nodes' table, resolving to canonical-names
+
+print("Associating nodes with names")
usedPids = set()
unresolvedNodeNames = set()
dbCur2 = dbCon.cursor()
def addToDb(nodeName, pidToUse):
- altNames = set()
- preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
+ " Adds page-ID-associated name data to a node in the database "
+ global dbCur, pidToPreferred
dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
+ # Get alt-names
+ altNames = set()
for n in pidToNames[pidToUse]:
+ # Avoid alt-names with >3 words
if len(n.split(" ")) > 3:
continue
+ # Avoid alt-names that already name a node in the database
if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None:
continue
- if nodeName in nameToBadAlts and n in nameToBadAlts[nodeName]:
- print(f"Excluding bad-alt {n} for node {nodeName}")
+ # Check for picked alt-name-to-skip
+ if nodeName in nameToAltsToSkip and n in nameToAltsToSkip[nodeName]:
+ print(f"Excluding alt-name {n} for node {nodeName}")
continue
+ #
altNames.add(n)
+ # Add alt-names to db
+ preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
for n in altNames:
isPreferred = 1 if (n == preferredName) else 0
dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))
-for name in nameToPickedPid: # Add manually-picked pids
- pickedPid = nameToPickedPid[name]
- usedPids.add(pickedPid)
- if pickedPid != None:
- addToDb(name, pickedPid)
-iterationNum = 0
-for (name,) in dbCur2.execute("SELECT name FROM nodes"):
- iterationNum += 1
- if iterationNum % 10000 == 0:
- print(f"Loop 1 iteration {iterationNum}")
- if name in nameToPickedPid:
+print("Adding picked IDs")
+for (name, pid) in nameToPickedPid.items():
+ if pid != None:
+ addToDb(name, pid)
+ usedPids.add(pid)
+print("Associating nodes with canonical names")
+iterNum = 0
+for (nodeName,) in dbCur2.execute("SELECT name FROM nodes"):
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f"At iteration {iterNum}")
+ if nodeName in nameToPickedPid:
continue
- # If name matches a canonical-name, add alt-name entries to 'names' table
- if name in canonicalNameToPids:
+ # Check for matching canonical name
+ if nodeName in canonicalNameToPids:
pidToUse = None
- for pid in canonicalNameToPids[name]:
+ # Pick an associated page ID
+ for pid in canonicalNameToPids[nodeName]:
hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred
hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred
if hasLowerPrio:
@@ -138,24 +159,26 @@ for (name,) in dbCur2.execute("SELECT name FROM nodes"):
if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio):
pidToUse = pid
if pidToUse != None:
+ addToDb(nodeName, pidToUse)
usedPids.add(pidToUse)
- addToDb(name, pidToUse)
- elif name in nameToPids:
- unresolvedNodeNames.add(name)
-# Iterate through unresolved nodes, resolving to vernacular-names
-iterationNum = 0
-for name in unresolvedNodeNames:
- iterationNum += 1
- if iterationNum % 100 == 0:
- print(f"Loop 2 iteration {iterationNum}")
- # Add alt-name entries to 'names' table for first corresponding pid
+ elif nodeName in nameToPids:
+ unresolvedNodeNames.add(nodeName)
+print("Associating leftover nodes with other names")
+iterNum = 0
+for nodeName in unresolvedNodeNames:
+ iterNum += 1
+ if iterNum % 100 == 0:
+ print(f"At iteration {iterNum}")
+ # Check for matching name
pidToUse = None
- for pid in nameToPids[name]:
+ for pid in nameToPids[nodeName]:
+ # Pick an associated page ID
if pid not in usedPids and (pidToUse == None or pid < pidToUse):
pidToUse = pid
if pidToUse != None:
+ addToDb(nodeName, pidToUse)
usedPids.add(pidToUse)
- addToDb(name, pidToUse)
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genImgs.py b/backend/data/genImgs.py
index 097959f..ecca8e0 100755
--- a/backend/data/genImgs.py
+++ b/backend/data/genImgs.py
@@ -4,13 +4,18 @@ import sys, os, subprocess
import sqlite3, urllib.parse
import signal
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads a list of eol/enwiki images from a file, and generates web-usable versions.\n"
-usageInfo += "Uses smartcrop, and places resulting images in a directory, with name 'otolId1.jpg'.\n"
-usageInfo += "Also adds image metadata to an sqlite database.\n"
-usageInfo += "\n"
-usageInfo += "SIGINT can be used to stop conversion, and the program can be re-run to\n"
-usageInfo += "continue processing. It uses existing output files to decide where to continue from.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads node IDs and image paths from a file, and possibly from a directory,
+and generates cropped/resized versions of those images into a directory,
+with names of the form 'nodeId1.jpg'. Also adds image metadata to the
+database.
+
+SIGINT can be used to stop, and the program can be re-run to continue
+processing. It uses already-existing database entries to decide what
+to skip.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -23,19 +28,19 @@ pickedImgsDir = "pickedImgs/"
pickedImgsFilename = "imgData.txt"
dbFile = "data.db"
IMG_OUT_SZ = 200
-genImgFiles = True
+genImgFiles = True # Usable for debugging
-# Create output directory if not present
if not os.path.exists(outDir):
os.mkdir(outDir)
-# Open dbs
+
+print("Opening databases")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
eolCon = sqlite3.connect(eolImgDb)
eolCur = eolCon.cursor()
enwikiCon = sqlite3.connect(enwikiImgDb)
enwikiCur = enwikiCon.cursor()
-# Get 'picked images' info
+print("Checking for picked-images")
nodeToPickedImg = {}
if os.path.exists(pickedImgsDir + pickedImgsFilename):
lineNum = 0
@@ -49,29 +54,34 @@ if os.path.exists(pickedImgsDir + pickedImgsFilename):
"nodeName": nodeName, "id": lineNum,
"filename": filename, "url": url, "license": license, "artist": artist, "credit": credit,
}
-# Create image tables if not present
+
+print("Checking for image tables")
nodesDone = set()
imgsDone = set()
if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='node_imgs'").fetchone() == None:
+ # Add image tables if not present
dbCur.execute("CREATE TABLE node_imgs (name TEXT PRIMARY KEY, img_id INT, src TEXT)")
dbCur.execute("CREATE TABLE images" \
" (id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src))")
else:
- # Get existing node-associations
+ # Get existing image-associated nodes
for (otolId,) in dbCur.execute("SELECT nodes.id FROM node_imgs INNER JOIN nodes ON node_imgs.name = nodes.name"):
nodesDone.add(otolId)
- # And images
+ # Get existing node-associated images
for (imgId, imgSrc) in dbCur.execute("SELECT id, src from images"):
imgsDone.add((imgId, imgSrc))
- print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images pre-existing")
-# Detect SIGINT signals
+ print(f"Found {len(nodesDone)} nodes and {len(imgsDone)} images to skip")
+
+# Set SIGINT handler
interrupted = False
def onSigint(sig, frame):
global interrupted
interrupted = True
signal.signal(signal.SIGINT, onSigint)
-# Iterate though images to process
+
+print("Iterating through input images")
def quit():
+ print("Closing databases")
dbCon.commit()
dbCon.close()
eolCon.close()
@@ -94,7 +104,7 @@ def convertImage(imgPath, outPath):
print(f"ERROR: smartcrop had exit status {completedProcess.returncode}")
return False
return True
-print("Processing picked images")
+print("Processing picked-images")
for (otolId, imgData) in nodeToPickedImg.items():
# Check for SIGINT event
if interrupted:
@@ -105,7 +115,8 @@ for (otolId, imgData) in nodeToPickedImg.items():
continue
# Convert image
if genImgFiles:
- if not convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg"):
+ success = convertImage(pickedImgsDir + imgData["filename"], outDir + otolId + ".jpg")
+ if not success:
quit()
else:
print(f"Processing {imgData['nodeName']}: {otolId}.jpg")
@@ -135,7 +146,8 @@ with open(imgListFile) as file:
continue
# Convert image
if genImgFiles:
- if not convertImage(imgPath, outDir + otolId + ".jpg"):
+ success = convertImage(imgPath, outDir + otolId + ".jpg")
+ if not success:
break
else:
if iterNum % 1e4 == 0:
@@ -146,13 +158,13 @@ with open(imgListFile) as file:
imgName = os.path.basename(os.path.normpath(imgPath)) # Get last path component
imgName = os.path.splitext(imgName)[0] # Remove extension
if fromEol:
- (eolId, _, contentId) = imgName.partition(" ")
- (eolId, contentId) = (int(eolId), int(contentId))
+ eolId, _, contentId = imgName.partition(" ")
+ eolId, contentId = (int(eolId), int(contentId))
if (eolId, "eol") not in imgsDone:
query = "SELECT source_url, license, copyright_owner FROM images WHERE content_id = ?"
row = eolCur.execute(query, (contentId,)).fetchone()
if row == None:
- print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}", file=sys.stderr)
+ print(f"ERROR: No image record for EOL ID {eolId}, content ID {contentId}")
break
(url, license, owner) = row
dbCur.execute("INSERT INTO images VALUES (?, ?, ?, ?, ?, ?)",
@@ -167,7 +179,7 @@ with open(imgListFile) as file:
" WHERE page_imgs.page_id = ?"
row = enwikiCur.execute(query, (enwikiId,)).fetchone()
if row == None:
- print(f"ERROR: No image record for enwiki ID {enwikiId}", file=sys.stderr)
+ print(f"ERROR: No image record for enwiki ID {enwikiId}")
break
(name, license, artist, credit) = row
url = "https://en.wikipedia.org/wiki/File:" + urllib.parse.quote(name)
diff --git a/backend/data/genLinkedImgs.py b/backend/data/genLinkedImgs.py
index 9fe07a2..a8e1322 100755
--- a/backend/data/genLinkedImgs.py
+++ b/backend/data/genLinkedImgs.py
@@ -3,9 +3,12 @@
import sys, re
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Adds a table to data.db, associating nodes without images to\n"
-usageInfo += "usable child images.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Looks for nodes without images in the database, and tries to
+associate them with images from their children.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -14,24 +17,22 @@ dbFile = "data.db"
compoundNameRegex = re.compile(r"\[(.+) \+ (.+)]")
upPropagateCompoundImgs = False
-# Open db
+print("Opening databases")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE linked_imgs (name TEXT PRIMARY KEY, otol_ids TEXT)")
- # Associates a node with one (or two) otol-ids with usable images,
- # encoded as 'otolId1' or 'otolId1,otolId2'
-# Get nodes with images
+
print("Getting nodes with images")
resolvedNodes = {} # Will map node names to otol IDs with a usable image
query = "SELECT nodes.name, nodes.id FROM nodes INNER JOIN node_imgs ON nodes.name = node_imgs.name"
for (name, otolId) in dbCur.execute(query):
resolvedNodes[name] = otolId
-print(f"Got {len(resolvedNodes)} nodes")
-# Iterate through resolved nodes, resolving ancestors where able
-print("Resolving ancestor nodes")
-nodesToResolve = {}
-processedNodes = {}
-parentToChosenTips = {}
+print(f"Found {len(resolvedNodes)}")
+
+print("Iterating through nodes, trying to resolve images for ancestors")
+nodesToResolve = {} # Maps a node name to a list of objects that represent possible child images
+processedNodes = {} # Maps a node name to an OTOL ID, representing a child node whose image is to be used
+parentToChosenTips = {} # used to prefer images from children with more tips
iterNum = 0
while len(resolvedNodes) > 0:
iterNum += 1
@@ -43,13 +44,13 @@ while len(resolvedNodes) > 0:
# Traverse upwards, resolving ancestors if able
while True:
# Get parent
- row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (nodeName,)).fetchone()
+ row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (nodeName,)).fetchone()
if row == None or row[0] in processedNodes or row[0] in resolvedNodes:
break
parent = row[0]
# Get parent data
if parent not in nodesToResolve:
- childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (parent,))]
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (parent,))]
query = "SELECT name, tips FROM nodes WHERE name IN ({})".format(",".join(["?"] * len(childNames)))
childObjs = [{"name": row[0], "tips": row[1], "otolId": None} for row in dbCur.execute(query, childNames)]
childObjs.sort(key=lambda x: x["tips"], reverse=True)
@@ -66,7 +67,7 @@ while len(resolvedNodes) > 0:
nodeName = parent
continue
else:
- # Add potential otol-id
+ # Mark child as a potential choice
childObj = next(c for c in childObjs if c["name"] == nodeName)
childObj["otolId"] = otolId
break
@@ -78,8 +79,8 @@ while len(resolvedNodes) > 0:
parentToChosenTips[name] = childObj["tips"]
dbCur.execute("INSERT INTO linked_imgs VALUES (?, ?)", (name, childObj["otolId"]))
nodesToResolve.clear()
-# Iterate through processed nodes with compound names
-print("Replacing images for compound-name nodes")
+
+print("Replacing linked-images for compound nodes")
iterNum = 0
for nodeName in processedNodes.keys():
iterNum += 1
@@ -106,7 +107,7 @@ for nodeName in processedNodes.keys():
if upPropagateCompoundImgs:
while True:
# Get parent
- row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (nodeName,)).fetchone()
+ row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (nodeName,)).fetchone()
if row != None:
parent = row[0]
# Check num tips
@@ -118,6 +119,7 @@ for nodeName in processedNodes.keys():
nodeName = parent
continue
break
-# Close db
+
+print("Closing databases")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genOtolData.py b/backend/data/genOtolData.py
index 87b35c3..36b6197 100755
--- a/backend/data/genOtolData.py
+++ b/backend/data/genOtolData.py
@@ -3,29 +3,33 @@
import sys, re, os
import json, sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads labelled_supertree_ottnames.tre & annotations.json (from an Open Tree of Life release),\n"
-usageInfo += "and creates a sqlite database, which holds entries of the form (name text, data text).\n"
-usageInfo += "Each row holds a tree-of-life node's name, JSON-encoded child name array, a parent name or '',\n"
-usageInfo += "number of descendant 'tips', and a 1 or 0 indicating phylogenetic-support.\n"
-usageInfo += "\n"
-usageInfo += "Expected labelled_supertree_ottnames.tre format:\n"
-usageInfo += " Represents a tree-of-life in Newick format, roughly like (n1,n2,(n3,n4)n5)n6,\n"
-usageInfo += " where root node is named n6, and has children n1, n2, and n5.\n"
-usageInfo += " Name forms include Homo_sapiens_ott770315, mrcaott6ott22687, and 'Oxalis san-miguelii ott5748753'\n"
-usageInfo += " Some names can be split up into a 'simple' name (like Homo_sapiens) and an id (like ott770315)\n"
-usageInfo += "Expected annotations.json format:\n"
-usageInfo += " JSON object holding information about the tree-of-life release.\n"
-usageInfo += " The object's 'nodes' field maps node IDs to objects holding information about that node,\n"
-usageInfo += " such as phylogenetic trees that support/conflict with it's placement.\n"
-usageInfo += "\n"
-usageInfo += "Some node trimming is done on the extracted tree, for performance and relevance reasons.\n"
-usageInfo += "The app can get quite laggy when some nodes in the chain have over 10k children.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads files describing a tree-of-life from an 'Open Tree of Life' release,
+and stores tree information in a database.
+
+Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format:
+ The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6
+ The root node is named n6, and has children n1, n2, and n5.
+ Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753'.
+ 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'.
+ The node with ID 'ott770315' will get the name 'homo sapiens'.
+ A compound node will get a name composed from its sub-nodes (eg: [name1 + name2]).
+ It is possible for multiple nodes to have the same name.
+ In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc.
+Reads an annotations.json file, which is assumed to have this format:
+ Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node,
+ such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that
+ support/conflict with the node's placement.
+Reads from a picked-names file, if present, which specifies name and node ID pairs.
+ These help resolve cases where multiple nodes share the same name.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
-treeFile = "otol/labelled_supertree_ottnames.tre"
+treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes
annFile = "otol/annotations.json"
dbFile = "data.db"
nodeMap = {} # Maps node IDs to node objects
@@ -33,19 +37,32 @@ nameToFirstId = {} # Maps node names to first found ID (names might have multipl
dupNameToIds = {} # Maps names of nodes with multiple IDs to those IDs
pickedNamesFile = "pickedOtolNames.txt"
-# Parse treeFile
+class Node:
+ " Represents a tree-of-life node "
+ def __init__(self, name, childIds, parentId, tips, pSupport):
+ self.name = name
+ self.childIds = childIds
+ self.parentId = parentId
+ self.tips = tips
+ self.pSupport = pSupport
+
print("Parsing tree file")
+# Read file
data = None
with open(treeFile) as file:
data = file.read()
dataIdx = 0
+# Parse content
+iterNum = 0
def parseNewick():
- """Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node name or None"""
- global dataIdx
+ " Parses a node using 'data' and 'dataIdx', updates nodeMap accordingly, and returns the node's ID "
+ global data, dataIdx, iterNum
+ iterNum += 1
+ if iterNum % 1e5 == 0:
+ print(f"At iteration {iterNum}")
# Check for EOF
if dataIdx == len(data):
- print("ERROR: Unexpected EOF at index " + str(dataIdx), file=sys.stderr)
- return None
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
# Check for node
if data[dataIdx] == "(": # parse inner node
dataIdx += 1
@@ -53,12 +70,9 @@ def parseNewick():
while True:
# Read child
childId = parseNewick()
- if childId == None:
- return None
childIds.append(childId)
if (dataIdx == len(data)):
- print("ERROR: Unexpected EOF", file=sys.stderr)
- return None
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
# Check for next child
if (data[dataIdx] == ","):
dataIdx += 1
@@ -66,33 +80,25 @@ def parseNewick():
else:
# Get node name and id
dataIdx += 1 # Consume an expected ')'
- [name, id] = parseNewickName()
+ name, id = parseNewickName()
updateNameMaps(name, id)
# Get child num-tips total
tips = 0
for childId in childIds:
- tips += nodeMap[childId]["tips"]
+ tips += nodeMap[childId].tips
# Add node to nodeMap
- nodeMap[id] = {"name": name, "children": childIds, "parent": None, "tips": tips, "pSupport": False}
+ nodeMap[id] = Node(name, childIds, None, tips, False)
# Update childrens' parent reference
for childId in childIds:
- nodeMap[childId]["parent"] = id
+ nodeMap[childId].parentId = id
return id
else: # Parse node name
- [name, id] = parseNewickName()
+ name, id = parseNewickName()
updateNameMaps(name, id)
- nodeMap[id] = {"name": name, "children": [], "parent": None, "tips": 1, "pSupport": False}
+ nodeMap[id] = Node(name, [], None, 1, False)
return id
-def updateNameMaps(name, id):
- if name not in nameToFirstId:
- nameToFirstId[name] = id
- else:
- if name not in dupNameToIds:
- dupNameToIds[name] = [nameToFirstId[name], id]
- else:
- dupNameToIds[name].append(id)
def parseNewickName():
- """Helper that parses an input node name, and returns a [name,id] pair"""
+ " Parses a node name using 'data' and 'dataIdx', and returns a (name, id) pair "
global data, dataIdx
name = None
end = dataIdx
@@ -102,7 +108,7 @@ def parseNewickName():
inQuote = True
while end < len(data):
if (data[end] == "'"):
- if end + 1 < len(data) and data[end+1] == "'": # Account for '' as escaped-quote
+ if end + 1 < len(data) and data[end + 1] == "'": # Account for '' as escaped-quote
end += 2
continue
else:
@@ -111,75 +117,86 @@ def parseNewickName():
break
end += 1
if inQuote:
- raise Exception("ERROR: Unexpected EOF")
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
name = data[dataIdx:end]
dataIdx = end
else:
while end < len(data) and not re.match(r"[(),]", data[end]):
end += 1
if (end == dataIdx):
- raise Exception("ERROR: Unexpected EOF")
+ raise Exception(f"ERROR: Unexpected EOF at index {dataIdx}")
name = data[dataIdx:end].rstrip()
if end == len(data): # Ignore trailing input semicolon
name = name[:-1]
dataIdx = end
- # Convert to [name, id]
+ # Convert to (name, id)
name = name.lower()
if name.startswith("mrca"):
- return [name, name]
+ return (name, name)
elif name[0] == "'":
match = re.fullmatch(r"'([^\\\"]+) (ott\d+)'", name)
if match == None:
raise Exception(f"ERROR: invalid name \"{name}\"")
name = match.group(1).replace("''", "'")
- return [name, match.group(2)]
+ return (name, match.group(2))
else:
match = re.fullmatch(r"([^\\\"]+)_(ott\d+)", name)
if match == None:
raise Exception(f"ERROR: invalid name \"{name}\"")
- return [match.group(1).replace("_", " "), match.group(2)]
+ return (match.group(1).replace("_", " "), match.group(2))
+def updateNameMaps(name, id):
+ global nameToFirstId, dupNameToIds
+ if name not in nameToFirstId:
+ nameToFirstId[name] = id
+ else:
+ if name not in dupNameToIds:
+ dupNameToIds[name] = [nameToFirstId[name], id]
+ else:
+ dupNameToIds[name].append(id)
rootId = parseNewick()
-# Resolve duplicate names
-print("Resolving duplicates")
+
+print("Resolving duplicate names")
+# Read picked-names file
nameToPickedId = {}
if os.path.exists(pickedNamesFile):
with open(pickedNamesFile) as file:
for line in file:
(name, _, otolId) = line.rstrip().partition("|")
nameToPickedId[name] = otolId
-for [dupName, ids] in dupNameToIds.items():
+# Resolve duplicates
+for (dupName, ids) in dupNameToIds.items():
# Check for picked id
if dupName in nameToPickedId:
idToUse = nameToPickedId[dupName]
else:
# Get conflicting node with most tips
- tipNums = [nodeMap[id]["tips"] for id in ids]
+ tipNums = [nodeMap[id].tips for id in ids]
maxIdx = tipNums.index(max(tipNums))
idToUse = ids[maxIdx]
# Adjust name of other conflicting nodes
counter = 2
for id in ids:
if id != idToUse:
- nodeMap[id]["name"] += " [" + str(counter)+ "]"
+ nodeMap[id].name += f" [{counter}]"
counter += 1
-# Change mrca* names
+
print("Changing mrca* names")
def convertMrcaName(id):
node = nodeMap[id]
- name = node["name"]
- childIds = node["children"]
+ name = node.name
+ childIds = node.childIds
if len(childIds) < 2:
- print(f"WARNING: MRCA node \"{name}\" has less than 2 children", file=sys.stderr)
+ print(f"WARNING: MRCA node \"{name}\" has less than 2 children")
return
# Get 2 children with most tips
- childTips = [nodeMap[id]["tips"] for id in childIds]
- maxIdx = childTips.index(max(childTips))
- childTips[maxIdx] = 0
+ childTips = [nodeMap[id].tips for id in childIds]
+ maxIdx1 = childTips.index(max(childTips))
+ childTips[maxIdx1] = 0
maxIdx2 = childTips.index(max(childTips))
- childId1 = childIds[maxIdx]
+ childId1 = childIds[maxIdx1]
childId2 = childIds[maxIdx2]
- childName1 = nodeMap[childId1]["name"]
- childName2 = nodeMap[childId2]["name"]
+ childName1 = nodeMap[childId1].name
+ childName2 = nodeMap[childId2].name
# Check for mrca* child names
if childName1.startswith("mrca"):
childName1 = convertMrcaName(childId1)
@@ -193,44 +210,44 @@ def convertMrcaName(id):
if match != None:
childName2 = match.group(1)
# Create composite name
- node["name"] = f"[{childName1} + {childName2}]"
+ node.name = f"[{childName1} + {childName2}]"
return childName1
-for [id, node] in nodeMap.items():
- if node["name"].startswith("mrca"):
+for (id, node) in nodeMap.items():
+ if node.name.startswith("mrca"):
convertMrcaName(id)
-# Parse annFile
+
print("Parsing annotations file")
+# Read file
data = None
with open(annFile) as file:
data = file.read()
obj = json.loads(data)
-nodeAnnsMap = obj['nodes']
-# Add annotations data
-print("Adding annotation data")
-for [id, node] in nodeMap.items():
+nodeAnnsMap = obj["nodes"]
+# Find relevant annotations
+for (id, node) in nodeMap.items():
# Set has-support value using annotations
if id in nodeAnnsMap:
nodeAnns = nodeAnnsMap[id]
supportQty = len(nodeAnns["supported_by"]) if "supported_by" in nodeAnns else 0
conflictQty = len(nodeAnns["conflicts_with"]) if "conflicts_with" in nodeAnns else 0
- node["pSupport"] = supportQty > 0 and conflictQty == 0
+ node.pSupport = supportQty > 0 and conflictQty == 0
# Root node gets support
- if node["parent"] == None:
- node["pSupport"] = True
-# Create db
+ if node.parentId == None:
+ node.pSupport = True
+
print("Creating nodes and edges tables")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
dbCur.execute("CREATE INDEX nodes_idx_nc ON nodes(name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE TABLE edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))")
dbCur.execute("CREATE INDEX edges_child_idx ON edges(child)")
for (otolId, node) in nodeMap.items():
- dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node["name"], otolId, node["tips"]))
- childIds = node["children"]
- for childId in childIds:
+ dbCur.execute("INSERT INTO nodes VALUES (?, ?, ?)", (node.name, otolId, node.tips))
+ for childId in node.childIds:
childNode = nodeMap[childId]
dbCur.execute("INSERT INTO edges VALUES (?, ?, ?)",
- (node["name"], childNode["name"], 1 if childNode["pSupport"] else 0))
+ (node.name, childNode.name, 1 if childNode.pSupport else 0))
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/genReducedTreeData.py b/backend/data/genReducedTreeData.py
index b475794..2e56bba 100755
--- a/backend/data/genReducedTreeData.py
+++ b/backend/data/genReducedTreeData.py
@@ -3,123 +3,131 @@
import sys, os.path, re
import json, sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Reads \n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Creates a reduced version of the tree in the database.
+Reads a subset of the node names from a file, and creates a
+minimal tree that contains them, possibly with a few extras.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
dbFile = "data.db"
-nodeNamesFile = "reducedTreeNodes.txt"
+nodeNamesFile = "pickedReducedNodes.txt"
minimalNames = set()
nodeMap = {} # Maps node names to node objects
PREF_NUM_CHILDREN = 3 # Attempt inclusion of children up to this limit
-compNameRegex = re.compile(r"\[.+ \+ .+]")
+compNameRegex = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes
+
+class Node:
+ " Represents a node from the database "
+ def __init__(self, id, children, parent, tips, pSupport):
+ self.id = id
+ self.children = children
+ self.parent = parent
+ self.tips = tips
+ self.pSupport = pSupport
-# Connect to db
+print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Read in minimal set of node names
+
print("Getting minimal name set")
iterNum = 0
with open(nodeNamesFile) as file:
for line in file:
iterNum += 1
if iterNum % 100 == 0:
- print(f"Iteration {iterNum}")
+ print(f"At iteration {iterNum}")
#
- row = dbCur.execute("SELECT name from nodes WHERE name = ?", (line.rstrip(),)).fetchone()
+ name = line.rstrip()
+ row = dbCur.execute("SELECT name from nodes WHERE name = ?", (name,)).fetchone()
if row == None:
- row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (line.rstrip(),)).fetchone()
+ row = dbCur.execute("SELECT name from names WHERE alt_name = ?", (name,)).fetchone()
if row != None:
minimalNames.add(row[0])
if len(minimalNames) == 0:
- print("ERROR: No names found", file=sys.stderr)
- sys.exit(1)
-print(f"Name set has {len(minimalNames)} names")
-# Add nodes that connect up to root
-print("Getting connected nodes set")
-iterNum = 0
+ print("No names found")
+ sys.exit(0)
+print(f"Result has {len(minimalNames)} names")
+
+print("Getting ancestor nodes")
rootName = None
+iterNum = 0
for name in minimalNames:
iterNum += 1
if iterNum % 100 == 0:
- print(f"Iteration {iterNum}")
+ print(f"At iteration {iterNum}")
#
prevName = None
while name != None:
if name not in nodeMap:
(id, tips) = dbCur.execute("SELECT id, tips from nodes where name = ?", (name,)).fetchone()
- row = dbCur.execute("SELECT node, p_support from edges where child = ?", (name,)).fetchone()
+ row = dbCur.execute("SELECT parent, p_support from edges where child = ?", (name,)).fetchone()
parent = None if row == None or row[0] == "" else row[0]
- pSupport = 1 if row == None or row[1] == 1 else 0
- nodeMap[name] = {
- "id": id,
- "children": [] if prevName == None else [prevName],
- "parent": parent,
- "tips": 0,
- "pSupport": pSupport,
- }
+ pSupport = row == None or row[1] == 1
+ children = [] if prevName == None else [prevName]
+ nodeMap[name] = Node(id, children, parent, 0, pSupport)
prevName = name
name = parent
else:
if prevName != None:
- nodeMap[name]["children"].append(prevName)
+ nodeMap[name].children.append(prevName)
break
if name == None:
rootName = prevName
-print(f"New node set has {len(nodeMap)} nodes")
-# Merge-upward compsite-named nodes
-print("Merging-upward composite-named nodes")
+print(f"Result has {len(nodeMap)} nodes")
+
+print("Merging-upward composite nodes")
namesToRemove = set()
-for (name, nodeObj) in nodeMap.items():
- parent = nodeObj["parent"]
+for (name, node) in nodeMap.items():
+ parent = node.parent
if parent != None and compNameRegex.fullmatch(name) != None:
# Connect children to parent
- nodeMap[parent]["children"].remove(name)
- nodeMap[parent]["children"].extend(nodeObj["children"])
- for n in nodeObj["children"]:
- nodeMap[n]["parent"] = parent
- nodeMap[n]["pSupport"] &= nodeObj["pSupport"]
+ nodeMap[parent].children.remove(name)
+ nodeMap[parent].children.extend(node.children)
+ for n in node.children:
+ nodeMap[n].parent = parent
+ nodeMap[n].pSupport &= node.pSupport
# Remember for removal
namesToRemove.add(name)
for name in namesToRemove:
del nodeMap[name]
-print(f"New node set has {len(nodeMap)} nodes")
-# Remove certain 'chain collapsible' nodes
+print(f"Result has {len(nodeMap)} nodes")
+
print("Removing 'chain collapsible' nodes")
namesToRemove2 = set()
-for (name, nodeObj) in nodeMap.items():
- hasOneChild = len(nodeObj["children"]) == 1
- isOnlyChild = nodeObj["parent"] != None and len(nodeMap[nodeObj["parent"]]["children"]) == 1
+for (name, node) in nodeMap.items():
+ hasOneChild = len(node.children) == 1
+ isOnlyChild = node.parent != None and len(nodeMap[node.parent].children) == 1
if name not in minimalNames and (hasOneChild or isOnlyChild):
- parentName = nodeObj["parent"]
- children = nodeObj["children"]
+ parent = node.parent
# Connect parent and children
- nodeMap[parentName]["children"].remove(name)
- nodeMap[parentName]["children"].extend(children)
- for n in children:
- nodeMap[n]["parent"] = parentName
- # Adjust child pSupport
- nodeMap[n]["pSupport"] &= nodeObj["pSupport"]
+ nodeMap[parent].children.remove(name)
+ nodeMap[parent].children.extend(node.children)
+ for n in node.children:
+ nodeMap[n].parent = parent
+ nodeMap[n].pSupport &= node.pSupport
# Remember for removal
namesToRemove2.add(name)
for name in namesToRemove2:
del nodeMap[name]
namesToRemove.add(name)
-print(f"New node set has {len(nodeMap)} nodes")
-# Add some connected children
-print("Adding additional nearby children")
+print(f"Result has {len(nodeMap)} nodes")
+
+print("Adding some additional nearby children")
namesToAdd = []
iterNum = 0
-for (name, nodeObj) in nodeMap.items():
+for (name, node) in nodeMap.items():
iterNum += 1
if iterNum % 100 == 0:
- print(f"Iteration {iterNum}")
+ print(f"At iteration {iterNum}")
#
- numChildren = len(nodeObj["children"])
+ numChildren = len(node.children)
if numChildren < PREF_NUM_CHILDREN:
- children = [row[0] for row in dbCur.execute("SELECT child FROM edges where node = ?", (name,))]
+ children = [row[0] for row in dbCur.execute("SELECT child FROM edges where parent = ?", (name,))]
newChildren = []
for n in children:
if n in nodeMap or n in namesToRemove:
@@ -132,43 +140,38 @@ for (name, nodeObj) in nodeMap.items():
continue
newChildren.append(n)
newChildNames = newChildren[:max(0, PREF_NUM_CHILDREN - numChildren)]
- nodeObj["children"].extend(newChildNames)
+ node.children.extend(newChildNames)
namesToAdd.extend(newChildNames)
for name in namesToAdd:
- (parent, pSupport) = dbCur.execute("SELECT node, p_support from edges WHERE child = ?", (name,)).fetchone()
+ parent, pSupport = dbCur.execute("SELECT parent, p_support from edges WHERE child = ?", (name,)).fetchone()
(id,) = dbCur.execute("SELECT id FROM nodes WHERE name = ?", (name,)).fetchone()
parent = None if parent == "" else parent
- nodeMap[name] = {
- "id": id,
- "children": [],
- "parent": parent,
- "tips": 0,
- "pSupport": pSupport == 1,
- }
-print(f"New node set has {len(nodeMap)} nodes")
-# set tips vals
-print("Setting tips vals")
+ nodeMap[name] = Node(id, [], parent, 0, pSupport == 1)
+print(f"Result has {len(nodeMap)} nodes")
+
+print("Setting 'tips' values")
def setTips(nodeName):
- nodeObj = nodeMap[nodeName]
- if len(nodeObj["children"]) == 0:
- nodeObj["tips"] = 1
+ node = nodeMap[nodeName]
+ if len(node.children) == 0:
+ node.tips = 1
return 1
- tips = sum([setTips(childName) for childName in nodeObj["children"]])
- nodeObj["tips"] = tips
+ tips = sum([setTips(childName) for childName in node.children])
+ node.tips = tips
return tips
setTips(rootName)
-# Add new nodes to db
-print("Adding to db")
+
+print("Adding reduced tree to database")
dbCur.execute("CREATE TABLE r_nodes (name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT)")
dbCur.execute("CREATE INDEX r_nodes_idx_nc ON r_nodes(name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE r_edges (node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child))")
+dbCur.execute("CREATE TABLE r_edges (parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child))")
dbCur.execute("CREATE INDEX r_edges_child_idx ON r_edges(child)")
-for (name, nodeObj) in nodeMap.items():
- parentName = "" if nodeObj["parent"] == None else nodeObj["parent"]
- dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, nodeObj["id"], nodeObj["tips"]))
- for childName in nodeObj["children"]:
- pSupport = 1 if nodeMap[childName]["pSupport"] else 0
+for (name, node) in nodeMap.items():
+ parentName = "" if node.parent == None else node.parent
+ dbCur.execute("INSERT INTO r_nodes VALUES (?, ?, ?)", (name, node.id, node.tips))
+ for childName in node.children:
+ pSupport = 1 if nodeMap[childName].pSupport else 0
dbCur.execute("INSERT INTO r_edges VALUES (?, ?, ?)", (name, childName, pSupport))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/data/reviewImgsToGen.py b/backend/data/reviewImgsToGen.py
index 4d970ba..de592f5 100755
--- a/backend/data/reviewImgsToGen.py
+++ b/backend/data/reviewImgsToGen.py
@@ -7,15 +7,18 @@ from tkinter import ttk
import PIL
from PIL import ImageTk, Image, ImageOps
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Provides a GUI that displays, for each tol-node, an associated image from\n"
-usageInfo += "eol/* and enwiki/*, and enables the user to choose which to use. Writes\n"
-usageInfo += "choice data to a text file with lines of the form 'otolId1 imgPath1', or\n"
-usageInfo += "'otolId1', where no path indicates a choice of no image.\n"
-usageInfo += "\n"
-usageInfo += "The program can be closed, and run again to continue from the last choice.\n"
-usageInfo += "The program looks for an existing output file to determine what choices\n"
-usageInfo += "have already been made.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Provides a GUI that displays, for each node in the database, associated
+images from EOL and Wikipedia, and allows choosing which to use. Writes
+choice data to a text file with lines of the form 'otolId1 imgPath1', or
+'otolId1', where no path indicates a choice of no image.
+
+The program can be closed, and run again to continue from the last choice.
+The program looks for an existing output file to determine what choices
+have already been made.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
@@ -28,16 +31,18 @@ IMG_DISPLAY_SZ = 400
PLACEHOLDER_IMG = Image.new("RGB", (IMG_DISPLAY_SZ, IMG_DISPLAY_SZ), (88, 28, 135))
onlyReviewPairs = True
-# Open db
+print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Associate nodes with images
-nodeToImgs = {} # Maps otol-ids to img-path arrays
-print("Looking through EOL images")
+
+nodeToImgs = {} # Maps otol-ids to arrays of image paths
+print("Iterating through images from EOL")
if os.path.exists(eolImgDir):
for filename in os.listdir(eolImgDir):
- (eolId, _, _) = filename.partition(" ")
+ # Get associated EOL ID
+ eolId, _, _ = filename.partition(" ")
query = "SELECT nodes.id FROM nodes INNER JOIN eol_ids ON nodes.name = eol_ids.name WHERE eol_ids.id = ?"
+ # Get associated node IDs
found = False
for (otolId,) in dbCur.execute(query, (int(eolId),)):
if otolId not in nodeToImgs:
@@ -45,13 +50,15 @@ if os.path.exists(eolImgDir):
nodeToImgs[otolId].append(eolImgDir + filename)
found = True
if not found:
- print(f"No node found for {eolImgDir}{filename}", file=sys.stderr)
-print(f"Result has {len(nodeToImgs)} node entries")
-print("Looking through enwiki images")
+ print(f"WARNING: No node found for {eolImgDir}{filename}")
+print(f"Result: {len(nodeToImgs)} nodes with images")
+print("Iterating through images from Wikipedia")
if os.path.exists(enwikiImgDir):
for filename in os.listdir(enwikiImgDir):
+ # Get associated page ID
(wikiId, _, _) = filename.partition(".")
- query = "SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids._id = ?"
+ # Get associated node IDs
+ query = "SELECT nodes.id FROM nodes INNER JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id = ?"
found = False
for (otolId,) in dbCur.execute(query, (int(wikiId),)):
if otolId not in nodeToImgs:
@@ -59,10 +66,9 @@ if os.path.exists(enwikiImgDir):
nodeToImgs[otolId].append(enwikiImgDir + filename)
found = True
if not found:
- print(f"No node found for {enwikiImgDir}{filename}", file=sys.stderr)
-print(f"Result has {len(nodeToImgs)} node entries")
-# Check for already-made choices
-print("Filtering out already-chosen IDs")
+ print(f"WARNING: No node found for {enwikiImgDir}{filename}")
+print(f"Result: {len(nodeToImgs)} nodes with images")
+print("Filtering out already-made image choices")
oldSz = len(nodeToImgs)
if os.path.exists(outFile):
with open(outFile) as file:
@@ -74,7 +80,7 @@ if os.path.exists(outFile):
print(f"Filtered out {oldSz - len(nodeToImgs)} entries")
class ImgReviewer:
- """ Provides the GUI for reviewing images """
+ " Provides the GUI for reviewing images "
def __init__(self, root, nodeToImgs):
self.root = root
root.title("Image Reviewer")
@@ -96,7 +102,7 @@ class ImgReviewer:
# Add padding
for child in mainFrame.winfo_children():
child.grid_configure(padx=5, pady=5)
- # Add bindings
+ # Add keyboard bindings
root.bind("<q>", self.quit)
root.bind("<Key-j>", lambda evt: self.accept(0))
root.bind("<Key-k>", lambda evt: self.accept(1))
@@ -112,7 +118,7 @@ class ImgReviewer:
# Initialise images to review
self.getNextImgs()
def getNextImgs(self):
- """ Updates display with new images to review, or ends program """
+ " Updates display with new images to review, or ends program "
# Get next image paths
while True:
self.listIdx += 1
@@ -120,7 +126,7 @@ class ImgReviewer:
print("No more images to review. Exiting program.")
self.quit()
return
- (self.otolId, imgPaths) = self.nodeImgsList[self.listIdx]
+ self.otolId, imgPaths = self.nodeImgsList[self.listIdx]
# Potentially skip user choice
if onlyReviewPairs and len(imgPaths) == 1:
with open(outFile, 'a') as file:
@@ -141,12 +147,12 @@ class ImgReviewer:
continue
if imgPath.startswith("eol/"):
self.eolImgPath = imgPath
- self.eolImg = ImageTk.PhotoImage(self.resizeForDisplay(img))
+ self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(img))
elif imgPath.startswith("enwiki/"):
self.enwikiImgPath = imgPath
- self.enwikiImg = ImageTk.PhotoImage(self.resizeForDisplay(img))
+ self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(img))
else:
- print(f"Unexpected image path {imgPath}", file=sys.stderr)
+ print(f"Unexpected image path {imgPath}")
self.quit()
return
# Re-iterate if all image paths invalid
@@ -157,14 +163,14 @@ class ImgReviewer:
return
# Add placeholder images
if self.eolImgPath == None:
- self.eolImg = ImageTk.PhotoImage(self.resizeForDisplay(PLACEHOLDER_IMG))
+ self.eolImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
elif self.enwikiImgPath == None:
- self.enwikiImg = ImageTk.PhotoImage(self.resizeForDisplay(PLACEHOLDER_IMG))
+ self.enwikiImg = ImageTk.PhotoImage(self.resizeImgForDisplay(PLACEHOLDER_IMG))
# Update image-frames
self.labels[0].config(image=self.eolImg)
self.labels[1].config(image=self.enwikiImg)
# Update title
- title = f"Imgs for otol ID {self.otolId}"
+ title = f"Images for otol ID {self.otolId}"
query = "SELECT names.alt_name FROM" \
" nodes INNER JOIN names ON nodes.name = names.name" \
" WHERE nodes.id = ? and pref_alt = 1"
@@ -174,7 +180,7 @@ class ImgReviewer:
title += f" ({self.listIdx + 1} out of {len(self.nodeImgsList)})"
self.root.title(title)
def accept(self, imgIdx):
- """ React to a user selecting an image """
+ " React to a user selecting an image "
imgPath = self.eolImgPath if imgIdx == 0 else self.enwikiImgPath
if imgPath == None:
print("Invalid selection")
@@ -184,12 +190,13 @@ class ImgReviewer:
self.numReviewed += 1
self.getNextImgs()
def reject(self):
- """ React to a user rejecting all images of a set """
+ " React to a user rejecting all images of a set "
with open(outFile, 'a') as file:
file.write(f"{self.otolId}\n")
self.numReviewed += 1
self.getNextImgs()
def quit(self, e = None):
+ global dbCon
print(f"Number reviewed: {self.numReviewed}")
timeElapsed = time.time() - self.startTime
print(f"Time elapsed: {timeElapsed:.2f} seconds")
@@ -197,8 +204,8 @@ class ImgReviewer:
print(f"Avg time per review: {timeElapsed/self.numReviewed:.2f} seconds")
dbCon.close()
self.root.destroy()
- def resizeForDisplay(self, img):
- """ Returns a copy of an image, shrunk to fit the display (keeps aspect ratio), and with a background """
+ def resizeImgForDisplay(self, img):
+ " Returns a copy of an image, shrunk to fit its frame (keeps aspect ratio), and with a background "
if max(img.width, img.height) > IMG_DISPLAY_SZ:
if (img.width > img.height):
newHeight = int(img.height * IMG_DISPLAY_SZ/img.width)
@@ -212,6 +219,7 @@ class ImgReviewer:
int((IMG_DISPLAY_SZ - img.height) / 2)))
return bgImg
# Create GUI and defer control
+print("Starting GUI")
root = tki.Tk()
ImgReviewer(root, nodeToImgs)
root.mainloop()
diff --git a/backend/data/trimTree.py b/backend/data/trimTree.py
index 302ea0d..fa269d8 100755
--- a/backend/data/trimTree.py
+++ b/backend/data/trimTree.py
@@ -3,21 +3,25 @@
import sys
import sqlite3
-usageInfo = f"usage: {sys.argv[0]}\n"
-usageInfo += "Removes certain children from a tol-tree in an sqlite db.\n"
-usageInfo += "Looks for nodes with an amount of children above a threshold,\n"
-usageInfo += "and removes the excess, excluding those with 'significant'\n"
-usageInfo += "associations, like those with descriptions and images.\n"
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Tries to remove 'low significance' nodes from the database. Currently
+removes nodes that don't have an image or description, or a presence in
+the reduced tree. Also, for nodes with 'many' children, trims some more,
+ignoring the presence of node descriptions.
+"""
if len(sys.argv) > 1:
print(usageInfo, file=sys.stderr)
sys.exit(1)
dbFile = "data.db"
-softChildLimit = 500
+softChildLimit = 500 # Used to determine when a node has 'many' children
+print("Opening database")
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
-# Get nodes that shouldn't be deleted, along with their ancestors
+
print("Finding nodes to keep")
nodesToKeep = set()
nodesToStronglyKeep = set()
@@ -41,25 +45,26 @@ for name in nodesToKeep:
print(f"\tAt iteration {iterNum}")
#
while True:
- row = dbCur.execute("SELECT node FROM edges WHERE child = ?", (name,)).fetchone()
+ row = dbCur.execute("SELECT parent FROM edges WHERE child = ?", (name,)).fetchone()
if row != None:
parent = row[0]
if parent not in nodesToKeep and parent not in ancestors:
ancestors.add(parent)
- if name in nodesToStronglyKeep:
+ if name not in nodesToStronglyKeep:
nodesToStronglyKeep.add(parent)
name = parent
continue
break
nodesToKeep.update(ancestors)
-print(f"Total of {len(nodesToKeep)} nodes to keep")
+print(f"Result: {len(nodesToKeep)} nodes to keep")
+
# Find root node
-query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.node IS NULL LIMIT 1"
+query = "SELECT name FROM nodes LEFT JOIN edges ON nodes.name = edges.child WHERE edges.parent IS NULL LIMIT 1"
(rootName,) = dbCur.execute(query).fetchone()
-print(f"Found root node '{rootName}'")
-# Traverse tree, looking for trimmable nodes
+print(f"Found root node \"{rootName}\"")
+
print("Looking for trimmable nodes")
-nodeToTipsChg = {}
+nodeToTipsChg = {} # Used to update 'tips' values after trimming
nodesToDelete = set()
iterNum = 0
def findTrimmables(nodeName):
@@ -68,15 +73,15 @@ def findTrimmables(nodeName):
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
- childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))]
childrenToKeep, otherChildren = set(), set()
for n in childNames:
if n in nodesToKeep:
childrenToKeep.add(n)
else:
otherChildren.add(n)
- # Check soft limit
tipsRemoved = 0
+ # Check soft limit
if len(childrenToKeep) > softChildLimit:
numToTrim = len(childrenToKeep) - softChildLimit
# Try removing weakly-kept nodes, preferring those with fewer tips
@@ -88,7 +93,7 @@ def findTrimmables(nodeName):
candidatesToTrim.sort(key=lambda n: childToTips[n], reverse=True)
otherChildren.update(candidatesToTrim[-numToTrim:])
childrenToKeep.difference_update(candidatesToTrim[-numToTrim:])
- # 'Simulate' deletions
+ # Mark nodes for deletion
for n in otherChildren:
tipsRemoved += markForDeletion(n)
# Recurse on children
@@ -99,7 +104,7 @@ def findTrimmables(nodeName):
return tipsRemoved
def markForDeletion(nodeName):
nodesToDelete.add(nodeName)
- childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE node = ?", (nodeName,))]
+ childNames = [row[0] for row in dbCur.execute("SELECT child FROM edges WHERE parent = ?", (nodeName,))]
if len(childNames) == 0:
return 1
else:
@@ -108,7 +113,7 @@ def markForDeletion(nodeName):
tipsRemoved += markForDeletion(n)
return tipsRemoved
findTrimmables(rootName)
-# Delete trimmable nodes
+
print(f"Deleting {len(nodesToDelete)} nodes")
iterNum = 0
for nodeName in nodesToDelete:
@@ -117,10 +122,13 @@ for nodeName in nodesToDelete:
print(f"At iteration {iterNum}")
#
dbCur.execute("DELETE FROM nodes WHERE name = ?", (nodeName,))
- dbCur.execute("DELETE FROM edges WHERE node = ?", (nodeName,))
+ dbCur.execute("DELETE FROM edges WHERE parent = ?", (nodeName,))
dbCur.execute("DELETE FROM edges WHERE child = ?", (nodeName,))
dbCur.execute("DELETE FROM names WHERE name = ?", (nodeName,))
- dbCur.execute("DELETE FROM eol_ids WHERE name = ?", (nodeName,))
+ # Could also delete from 'eol_ids', 'wiki_ids', and 'descs', but this
+ # makes it much harder to restore the original data if needed, and
+ # the memory savings didn't seem significant.
+
print(f"Updating num-tips for {len(nodeToTipsChg)} nodes")
iterNum = 0
for (nodeName, tipsChg) in nodeToTipsChg.items():
@@ -129,6 +137,7 @@ for (nodeName, tipsChg) in nodeToTipsChg.items():
print(f"At iteration {iterNum}")
#
dbCur.execute("UPDATE nodes SET tips = tips - ? WHERE name = ?", (tipsChg, nodeName))
-# Close db
+
+print("Closing database")
dbCon.commit()
dbCon.close()
diff --git a/backend/server.py b/backend/server.py
index 888f73a..4a364c3 100755
--- a/backend/server.py
+++ b/backend/server.py
@@ -28,7 +28,7 @@ if len(sys.argv) > 1:
# Classes for objects sent as responses (matches lib.ts types in client-side code)
class TolNode:
- """ Used when responding to 'node' and 'chain' requests """
+ " Used when responding to 'node' and 'chain' requests "
def __init__(self, otolId, children, parent=None, tips=0, pSupport=False, commonName=None, imgName=None):
self.otolId = otolId # string | null
self.children = children # string[]
@@ -38,24 +38,24 @@ class TolNode:
self.commonName = commonName # null | string
self.imgName = imgName # null | string | [string,string] | [null, string] | [string, null]
class SearchSugg:
- """ Represents a search suggestion """
+ " Represents a search suggestion "
def __init__(self, name, canonicalName=None):
self.name = name # string
self.canonicalName = canonicalName # string | null
class SearchSuggResponse:
- """ Sent as responses to 'search' requests """
+ " Sent as responses to 'search' requests "
def __init__(self, searchSuggs, hasMore):
self.suggs = searchSuggs # SearchSugg[]
self.hasMore = hasMore # boolean
class DescInfo:
- """ Represents a tol-node's associated description """
+ " Represents a tol-node's associated description "
def __init__(self, text, wikiId, fromRedirect, fromDbp):
self.text = text # string
self.wikiId = wikiId # number
self.fromRedirect = fromRedirect # boolean
self.fromDbp = fromDbp # boolean
class ImgInfo:
- """ Represents a tol-node's associated image """
+ " Represents a tol-node's associated image "
def __init__(self, id, src, url, license, artist, credit):
self.id = id # number
self.src = src # string
@@ -64,7 +64,7 @@ class ImgInfo:
self.artist = artist # string
self.credit = credit # string
class InfoResponse:
- """ Sent as responses to 'info' requests """
+ " Sent as responses to 'info' requests "
def __init__(self, tolNode, descData, imgData):
self.tolNode = tolNode # null | TolNode
self.descData = descData # null | DescInfo | [DescInfo, DescInfo]
@@ -84,7 +84,7 @@ def lookupNodes(names, useReducedTree):
for (nodeName, otolId, tips) in cur.execute(query, names):
nameToNodes[nodeName] = TolNode(otolId, [], tips=tips)
# Get child info
- query = f"SELECT node, child FROM {edgesTable} WHERE node IN ({queryParamStr})"
+ query = f"SELECT parent, child FROM {edgesTable} WHERE parent IN ({queryParamStr})"
for (nodeName, childName) in cur.execute(query, names):
nameToNodes[nodeName].children.append(childName)
# Order children by tips
@@ -96,7 +96,7 @@ def lookupNodes(names, useReducedTree):
childToTips[n] = tips
node.children.sort(key=lambda n: childToTips[n], reverse=True)
# Get parent info
- query = f"SELECT node, child, p_support FROM {edgesTable} WHERE child IN ({queryParamStr})"
+ query = f"SELECT parent, child, p_support FROM {edgesTable} WHERE child IN ({queryParamStr})"
for (nodeName, childName, pSupport) in cur.execute(query, names):
nameToNodes[childName].parent = nodeName
nameToNodes[childName].pSupport = (pSupport == 1)