From e78c4df403e5f98afa08f7a0841ff233d5f6d05b Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 01:42:41 +1000 Subject: Update backend READMEs, rename some files for consistency --- .gitignore | 31 ++-- README.md | 8 +- backend/README.md | 4 + backend/data/README.md | 232 +++++++++++++------------- backend/data/dbpedia/README.md | 45 ++--- backend/data/dbpedia/genData.py | 146 ---------------- backend/data/dbpedia/genDescData.py | 146 ++++++++++++++++ backend/data/enwiki/README.md | 73 ++++---- backend/data/enwiki/downloadEnwikiImgs.py | 2 +- backend/data/enwiki/downloadImgLicenseInfo.py | 2 +- backend/data/enwiki/genData.py | 122 -------------- backend/data/enwiki/genDescData.py | 122 ++++++++++++++ backend/data/enwiki/genImgData.py | 178 ++++++++++++++++++++ backend/data/enwiki/getEnwikiImgData.py | 178 -------------------- backend/data/eol/README.md | 33 ++-- backend/data/eol/reviewImgs.py | 2 +- backend/data/genDbpData.py | 6 +- backend/data/genEnwikiDescData.py | 6 +- backend/data/genEnwikiNameData.py | 2 +- backend/data/genEolNameData.py | 4 +- backend/data/genImgs.py | 179 ++++++++++++++++++++ backend/data/genImgsForWeb.py | 179 -------------------- backend/data/genOtolData.py | 15 +- backend/data/genReducedTreeData.py | 2 +- backend/data/otol/README.md | 14 +- backend/data/pickedImgs/README.md | 16 +- backend/data/reducedTol/README.md | 4 - backend/data/reviewImgsToGen.py | 217 ++++++++++++++++++++++++ backend/data/reviewImgsToMerge.py | 217 ------------------------ 29 files changed, 1111 insertions(+), 1074 deletions(-) create mode 100644 backend/README.md delete mode 100755 backend/data/dbpedia/genData.py create mode 100755 backend/data/dbpedia/genDescData.py delete mode 100755 backend/data/enwiki/genData.py create mode 100755 backend/data/enwiki/genDescData.py create mode 100755 backend/data/enwiki/genImgData.py delete mode 100755 backend/data/enwiki/getEnwikiImgData.py create mode 100755 backend/data/genImgs.py delete mode 100755 
backend/data/genImgsForWeb.py delete mode 100644 backend/data/reducedTol/README.md create mode 100755 backend/data/reviewImgsToGen.py delete mode 100755 backend/data/reviewImgsToMerge.py diff --git a/.gitignore b/.gitignore index 96d0644..c35bd98 100644 --- a/.gitignore +++ b/.gitignore @@ -3,31 +3,32 @@ /node_modules/ /dist/ /public/img/ + +# Backend files +/backend/data/data.db /backend/data/otol/*.tgz /backend/data/otol/*.json /backend/data/otol/*.tre -/backend/data/data.db /backend/data/eol/*.tgz /backend/data/eol/*.csv -/backend/data/eol/*.db /backend/data/eol/imagesList/ +/backend/data/eol/*.db /backend/data/eol/imgsForReview/ -/backend/data/eol/imgsReviewed/ -/backend/data/img/ +/backend/data/eol/imgs/ +/backend/data/dbpedia/*.bz2 +/backend/data/dbpedia/*.db /backend/data/enwiki/*.bz2 /backend/data/enwiki/*.db /backend/data/enwiki/imgs/ /backend/data/enwiki/.venv/ -/backend/data/dbpedia/*.bz2 -/backend/data/dbpedia/*.db -/backend/data/genOtolNamesToKeep.txt -/backend/data/genOtolDataPickedDups.txt -/backend/data/genEolNameDataPickedIds.txt -/backend/data/genEolNameDataBadAlts.txt -/backend/data/genDescNamesToSkip.txt -/backend/data/dbpPickedLabels.txt -/backend/data/enwikiPickedLabels.txt -/backend/data/mergedImgList.txt +/backend/data/imgList.txt /backend/data/pickedImgs/ -/backend/data/reducedTol/names.txt +/backend/data/img/ +/backend/data/pickedOtolNames.txt +/backend/data/pickedEolIds.txt +/backend/data/pickedEolAltsToSkip.txt +/backend/data/pickedEnwikiNamesToSkip.txt +/backend/data/pickedDbpLabels.txt +/backend/data/pickedEnwikiLabels.txt +/backend/data/pickedReducedNodes.txt /backend/data/pickedNames.txt diff --git a/README.md b/README.md index a860b3f..46e044d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# Grid of Life +Grid of Life +============ An interactive visualisation of the biological tree of life. @@ -8,3 +9,8 @@ Each tile represents a group of organisms with a common ancestor. 
- Clicking on an expanded tile collapses it back into one tile. - Double-clicking on a tile expands it to fill the whole view. Other tiles will be moved to the side. + +Files +===== +- backend/: Contains code for running the server, and generating tree-of-life data +- diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..331e7f4 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,4 @@ +Files +===== +- server.py: Runs the server +- data/: For generating the server's tree-of-life database diff --git a/backend/data/README.md b/backend/data/README.md index d4a6196..7d1adad 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -1,115 +1,121 @@ -File Generation Process -======================= -1 Tree Structure Data - 1 Obtain data in otol/, as specified in it's README. - 2 Run genOtolData.py, which creates data.db, and adds - 'nodes' and 'edges' tables using data in otol/*, as well as - genOtolNamesToKeep.txt, if present. -2 Name Data for Search - 1 Obtain data in eol/, as specified in it's README. - 2 Run genEolNameData.py, which adds 'names' and 'eol_ids' tables to data.db, - using data in eol/vernacularNames.csv and the 'nodes' table, and possibly - genEolNameDataPickedIds.txt. -3 Node Description Data - 1 Obtain data in dbpedia/ and enwiki/, as specified in their README files. - 2 Run genDbpData.py, which adds 'wiki_ids' and 'descs' tables to data.db, - using data in dbpedia/dbpData.db, the 'nodes' table, and possibly - genDescNamesToSkip.txt and dbpPickedLabels.txt. - 3 Run genEnwikiDescData.py, which adds to the 'wiki_ids' and 'descs' tables, - using data in enwiki/enwikiData.db, and the 'nodes' table. - Also uses genDescNamesToSkip.txt and genEnwikiDescTitlesToUse.txt for - skipping/resolving some name-page associations. -4 Image Data - 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/. - It uses data in eol/imagesList.db, and the 'eol_ids' table. 
- 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique - images in eol/imgsReviewed/ (uses 'names' and 'eol_ids' to display extra info). - 3 In enwiki/, run getEnwikiImgData.py, which generates a list of - tol-node images, and creates enwiki/enwikiImgs.db to store it. - Uses the 'wiki_ids' table to get tol-node wiki-ids. - 4 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing - information for images listed in enwiki/enwikiImgs.db, and stores - it in that db. - 5 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' - images in listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/. - 6 Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/, - and enables choosing, for each tol-node, which image should be used, if any, - and outputs choice information into mergedImgList.txt. Uses the 'nodes', - 'eol_ids', and 'wiki_ids' tables (as well as 'names' for info-display). - 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, - using mergedImgList.txt, and possibly pickedImgs/, and adds 'images' and - 'node_imgs' tables to data.db.
- Smartcrop's outputs might need to be manually created/adjusted:
- - An input image might have no output produced, possibly due to - data incompatibilities, memory limits, etc. A few input image files - might actually be html files, containing a 'file not found' page. - - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. - - An input image might produce output with unexpected dimensions. - This seems to happen when the image is very large, and triggers a - decompression bomb warning. - The result might have as many as 150k images, with about 2/3 of them - being from wikipedia. - 8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, - which uses 'nodes', 'edges', 'eol_ids', and 'node_imgs', to associate - nodes without images to child images. -5 Reduced Tree Structure Data - 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to - data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. -6 Other - - Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table, - using data in enwiki/enwikiData.db, and the 'names' and 'wiki_ids' tables. - - Optionally run addPickedNames.py, which adds manually-picked names to - the 'names' table, as specified in pickedNames.txt. - - Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, - for the sake of performance and result-relevance. Without this, jumping to certain - nodes within the fungi and moths can take over a minute to render. +This directory holds files used to generate data.db, which contains tree-of-life data. 
-data.db Tables -============== -- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT -- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) -- eol\_ids: id INT PRIMARY KEY, name TEXT -- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name) -- wiki\_ids: name TEXT PRIMARY KEY, id INT, redirected INT -- descs: wiki\_id INT PRIMARY KEY, desc TEXT, from\_dbp INT -- node\_imgs: name TEXT PRIMARY KEY, img\_id INT, src TEXT -- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) -- linked\_imgs: name TEXT PRIMARY KEY, otol\_ids TEXT -- r\_nodes: name TEXT PRIMARY KEY, tips INT -- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) +# Tables: +- `nodes`: `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` +- `edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` +- `eol_ids`: `id INT PRIMARY KEY, name TEXT` +- `names`: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` +- `wiki_ids`: `name TEXT PRIMARY KEY, id INT, redirected INT` +- `descs`: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` +- `node_imgs`: `name TEXT PRIMARY KEY, img_id INT, src TEXT` +- `images`: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)` +- `linked_imgs`: `name TEXT PRIMARY KEY, otol_ids TEXT` +- `r_nodes`: `name TEXT PRIMARY KEY, tips INT` +- `r_edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` -Other Files -=========== -- dbpPickedLabels.txt
- Contains DBpedia labels, one per line. Used by genDbpData.py to help - resolve conflicts when associating tree-of-life node names with - DBpedia node labels. -- genOtolNamesToKeep.txt
- Contains names to avoid trimming off the tree data generated by - genOtolData.py. Usage is optional, but, without it, a large amount - of possibly-significant nodes are removed, using a short-sighted - heuristic.
- One way to generate this list is to generate the files as usual, - then get node names that have an associated image, description, or - presence in r_nodes. Then run the genOtolData.py and genEolNameData.py - scripts again (after deleting their created tables). -- genEnwikiDescNamesToSkip.txt
- Contains names for nodes that genEnwikiNameData.py should skip adding - a description for. Usage is optional, but without it, some nodes will - probably get descriptions that don't match (eg: the bee genus Osiris - might be described as an egyptian god).
- This file was generated by running genEnwikiNameData.py, then listing - the names that it added into a file, along with descriptions, and - manually removing those that seemed node-matching (got about 30k lines, - with about 1 in 30 descriptions non-matching). And, after creating - genEnwikiDescTitlesToUse.txt, names shared with that file were removed. -- genEnwikiDescTitlesToUse.txt
- Contains enwiki titles with the form 'name1 (category1)' for - genEnwikiNameData.py to use to resolve nodes matching name name1. - Usage is optional, but it adds some descriptions that would otherwise - be skipped.
- This file was generated by taking the content of genEnwikiNameData.py, - after the manual filtering step, then, for each name name,1 getting - page titles from dbpedia/dbpData.db that match 'name1 (category1)'. - This was followed by manually removing lines, keeping those that - seemed to match the corresponding node (used the app to help with this). +# Generating the Database + +For the most part, these steps should be done in order. + +As a warning, the whole process takes a lot of time and file space. The tree will probably +have about 2.5 billion nodes. Downloading the images will take several days, and occupy over +200 GB. And if you want good data, you'll need to do some manual review, which can take weeks. + +## Environment +The scripts are written in python and bash. +Some of the python scripts require third-party packages: +- jsonpickle: For encoding class objects as JSON. +- requests: For downloading data. +- PIL: For image processing. +- tkinter: For providing a basic GUI to review images. +- mwxml, mwparserfromhell: For parsing Wikipedia dumps. + +## Generate tree structure data +1. Obtain files in otol/, as specified in it's README. +2. Run genOtolData.py, which creates data.db, and adds the `nodes` and `edges` tables, + using data in otol/. It also uses these files, if they exist: + - pickedOtolNames.txt: Has lines of the form `name1|otolId1`. Some nodes in the + tree may have the same name (eg: Pholidota can refer to pangolins or orchids). + Normally, such nodes will get the names 'name1', 'name1 [2]', 'name1 [3], etc. + This file can be used to manually specify which node should be named 'name1'. + +## Generate node name data +1. Obtain 'name data files' in eol/, as specified in it's README. +2. Run genEolNameData.py, which adds the `names` and `eol_ids` tables, using data in + eol/ and the `nodes` table. It also uses these files, if they exist: + - pickedEolIds.txt: Has lines of the form `nodeName1|eolId1` or `nodeName1|`. 
+ Specifies node names that should have a particular EOL ID, or no ID. + Quite a few taxons have ambiguous names, and may need manual correction. + For example, Viola may resolve to a taxon of butterflies or of plants. + - pickedEolAltsToSkip.txt: Has lines of the form `nodeName1|altName1`. + Specifies that a node's alt-name set should exclude altName1. + +## Generate node description data +### Get data from DBpedia +1. Obtain files in dbpedia/, as specified in its README. +2. Run genDbpData.py, which adds the `wiki_ids` and `descs` tables, using data in + dbpedia/ and the `nodes` table. It also uses these files, if they exist: + - pickedEnwikiNamesToSkip.txt: Each line holds the name of a node for which + no description should be obtained. Many node names have a same-name + wikipedia page that describes something different (eg: Osiris). + - pickedDbpLabels.txt: Has lines of the form `nodeName1|label1`. + Specifies node names that should have a particular associated page label. +### Get data from Wikipedia +1. Obtain 'description database files' in enwiki/, as specified in its README. +2. Run genEnwikiDescData.py, which adds to the `wiki_ids` and `descs` tables, + using data in enwiki/ and the `nodes` table. + It also uses these files, if they exist: + - pickedEnwikiNamesToSkip.txt: Same as with genDbpData.py. + - pickedEnwikiLabels.txt: Similar to pickedDbpLabels.txt. + +## Generate image data +### Get images from EOL +1. Obtain 'image metadata files' in eol/, as specified in its README. +2. In eol/, run downloadImgs.py, which downloads images (possibly multiple per node), + into eol/imgsForReview, using data in eol/, as well as the `eol_ids` table. +3. In eol/, run reviewImgs.py, which interactively displays the downloaded images for + each node, providing the choice of which to use, moving them to eol/imgs/. + Uses `names` and `eol_ids` to display extra info. +### Get images from Wikipedia +1. 
In enwiki/, run genImgData.py, which looks for wikipedia image names for each node, + using the `wiki_ids` table, and stores them in a database. +2. In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for + those images, using wikipedia's online API. +3. In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' + images into enwiki/imgs/. +### Merge the image sets +1. Run reviewImgsToGen.py, which displays images from eol/imgs/ and enwiki/imgs/, + and enables choosing, for each node, which image should be used, if any, + and outputs choice information into imgList.txt. Uses the `nodes`, + `eol_ids`, and `wiki_ids` tables (as well as `names` to display extra info). +2. Run genImgs.py, which creates cropped/resized images in img/, from files listed in + imgList.txt and located in eol/ and enwiki/, and creates the `node_imgs` and + `images` tables. If pickedImgs/ is present, images within it are also used.
+ The outputs might need to be manually created/adjusted: + - An input image might have no output produced, possibly due to + data incompatibilities, memory limits, etc. A few input image files + might actually be html files, containing a 'file not found' page. + - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. + - An input image might produce output with unexpected dimensions. + This seems to happen when the image is very large, and triggers a + decompression bomb warning. + The result might have as many as 150k images, with about 2/3 of them + being from wikipedia. +### Add more image associations +1. Run genLinkedImgs.py, which tries to associate nodes without images to + images of its children. Adds the `linked_imgs` table, and uses the + `nodes`, `edges`, and `node_imgs` tables. + +## Do some post-processing +1. Run genReducedTreeData.py, which generates a second, reduced version of the tree, + adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from + pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line). +2. Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, + for the sake of performance and result-relevance. Otherwise, some nodes may have + over 10k children, which can take a while to render (over a minute in my testing). + You might want to back up the untrimmed tree first, as this operation is not easily + reversible. +3. Optionally run genEnwikiNameData.py, which adds more entries to the `names` table, + using data in enwiki/, and the `names` and `wiki_ids` tables. +4. Optionally run addPickedNames.py, which allows adding manually-selected name data to + the `names` table, as specified in pickedNames.txt. 
diff --git a/backend/data/dbpedia/README.md b/backend/data/dbpedia/README.md index 78e2a90..8a08f20 100644 --- a/backend/data/dbpedia/README.md +++ b/backend/data/dbpedia/README.md @@ -1,28 +1,29 @@ -Downloaded Files -================ -- labels\_lang=en.ttl.bz2
- Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core, - using the link . -- page\_lang=en\_ids.ttl.bz2
+This directory holds files obtained from/using [DBpedia](https://www.dbpedia.org). + +# Downloaded Files +- `labels_lang=en.ttl.bz2`
+ Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core. + Downloaded from . +- `page_lang=en_ids.ttl.bz2`
Downloaded from -- redirects\_lang=en\_transitive.ttl.bz2
+- `redirects_lang=en_transitive.ttl.bz2`
Downloaded from . -- disambiguations\_lang=en.ttl.bz2
+- `disambiguations_lang=en.ttl.bz2`
Downloaded from . -- instance-types\_lang=en\_specific.ttl.bz2
+- `instance-types_lang=en_specific.ttl.bz2`
Downloaded from . -- short-abstracts\_lang=en.ttl.bz2
+- `short-abstracts_lang=en.ttl.bz2`
Downloaded from . -Generated Files -=============== -- dbpData.db
- An sqlite database representing data from the ttl files. - Generated by running genData.py. - Tables - - labels: iri TEXT PRIMARY KEY, label TEXT - - ids: iri TEXT PRIMARY KEY, id INT - - redirects: iri TEXT PRIMARY KEY, target TEXT - - disambiguations: iri TEXT PRIMARY KEY - - types: iri TEXT, type TEXT - - abstracts: iri TEXT PRIMARY KEY, abstract TEXT +# Other Files +- genDescData.py
+ Used to generate a database representing data from the ttl files. +- descData.db
+ Generated by genDescData.py.
+ Tables:
+ - `labels`: `iri TEXT PRIMARY KEY, label TEXT ` + - `ids`: `iri TEXT PRIMARY KEY, id INT` + - `redirects`: `iri TEXT PRIMARY KEY, target TEXT` + - `disambiguations`: `iri TEXT PRIMARY KEY` + - `types`: `iri TEXT, type TEXT` + - `abstracts`: `iri TEXT PRIMARY KEY, abstract TEXT` diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py deleted file mode 100755 index 41c48a8..0000000 --- a/backend/data/dbpedia/genData.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/python3 - -import sys, re -import bz2, sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n" -usageInfo += "and creates a sqlite db containing that data.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines -idsFile = "page_lang=en_ids.ttl.bz2" -redirectsFile = "redirects_lang=en_transitive.ttl.bz2" -disambigFile = "disambiguations_lang=en.ttl.bz2" -typesFile = "instance-types_lang=en_specific.ttl.bz2" -abstractsFile = "short-abstracts_lang=en.ttl.bz2" -dbFile = "dbpData.db" - -# Open db -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -# Read/store labels -print("Reading/storing label data") -dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)") -dbCur.execute("CREATE INDEX labels_idx ON labels(label)") -dbCur.execute("CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)") -labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') -lineNum = 0 -with bz2.open(labelsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = labelLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store wiki page ids 
-print("Reading/storing wiki page ids") -dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") -idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') -lineNum = 0 -with bz2.open(idsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = idLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - try: - dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) - except sqlite3.IntegrityError as e: - # Accounts for certain lines that have the same IRI - print(f"Failed to add entry with IRI \"{match.group(1)}\": {e}") -dbCon.commit() -# Read/store redirects -print("Reading/storing redirection data") -dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)") -redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') -lineNum = 0 -with bz2.open(redirectsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = redirLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store diambiguation-page data -print("Reading/storing diambiguation-page data") -disambigNames = set() -disambigLineRegex = redirLineRegex -lineNum = 0 -with bz2.open(disambigFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = disambigLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - disambigNames.add(match.group(1)) -dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") -for name in disambigNames: - 
dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,)) -dbCon.commit() -# Read/store instance-type -print("Reading/storing instance-type data") -dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)") -dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)") -typeLineRegex = redirLineRegex -lineNum = 0 -with bz2.open(typesFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = typeLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store abstracts -print("Reading/storing abstracts") -dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)") -descLineRegex = labelLineRegex -lineNum = 0 -with bz2.open(abstractsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - if line[0] == "#": - continue - match = descLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", - (match.group(1), match.group(2).replace(r'\"', '"'))) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/dbpedia/genDescData.py b/backend/data/dbpedia/genDescData.py new file mode 100755 index 0000000..bba3ff5 --- /dev/null +++ b/backend/data/dbpedia/genDescData.py @@ -0,0 +1,146 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n" +usageInfo += "and creates a sqlite db containing that data.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines +idsFile = 
"page_lang=en_ids.ttl.bz2" +redirectsFile = "redirects_lang=en_transitive.ttl.bz2" +disambigFile = "disambiguations_lang=en.ttl.bz2" +typesFile = "instance-types_lang=en_specific.ttl.bz2" +abstractsFile = "short-abstracts_lang=en.ttl.bz2" +dbFile = "descData.db" + +# Open db +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Read/store labels +print("Reading/storing label data") +dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)") +dbCur.execute("CREATE INDEX labels_idx ON labels(label)") +dbCur.execute("CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)") +labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') +lineNum = 0 +with bz2.open(labelsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = labelLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store wiki page ids +print("Reading/storing wiki page ids") +dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") +idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') +lineNum = 0 +with bz2.open(idsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = idLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + try: + dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) + except sqlite3.IntegrityError as e: + # Accounts for certain lines that have the same IRI + print(f"Failed to add entry with IRI \"{match.group(1)}\": {e}") +dbCon.commit() +# Read/store redirects +print("Reading/storing redirection data") +dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY 
KEY, target TEXT)") +redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') +lineNum = 0 +with bz2.open(redirectsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = redirLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store diambiguation-page data +print("Reading/storing diambiguation-page data") +disambigNames = set() +disambigLineRegex = redirLineRegex +lineNum = 0 +with bz2.open(disambigFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = disambigLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + disambigNames.add(match.group(1)) +dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") +for name in disambigNames: + dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,)) +dbCon.commit() +# Read/store instance-type +print("Reading/storing instance-type data") +dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)") +dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)") +typeLineRegex = redirLineRegex +lineNum = 0 +with bz2.open(typesFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = typeLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store abstracts +print("Reading/storing abstracts") +dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)") +descLineRegex = 
labelLineRegex +lineNum = 0 +with bz2.open(abstractsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + if line[0] == "#": + continue + match = descLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", + (match.group(1), match.group(2).replace(r'\"', '"'))) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 6462d7d..1c16a2e 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -1,39 +1,52 @@ -Downloaded Files -================ +This directory holds files obtained from/using [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). + +# Downloaded Files - enwiki-20220501-pages-articles-multistream.xml.bz2
- Obtained via - (site suggests downloading from a mirror). Contains text - content and metadata for pages in English Wikipedia - (current revision only, excludes talk pages). Some file - content and format information was available from - . + Obtained via (site suggests downloading from a mirror). + Contains text content and metadata for pages in enwiki. + Some file content and format information was available from + . - enwiki-20220501-pages-articles-multistream-index.txt.bz2
Obtained like above. Holds lines of the form offset1:pageId1:title1, - providing offsets, for each page, into the dump file, of a chunk of + providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it. -Generated Files -=============== +# Generated Dump-Index Files +- genDumpIndexDb.py
+ Creates an sqlite-database version of the enwiki-dump index file. - dumpIndex.db
- Holds data from the enwiki dump index file. Generated by - genDumpIndexDb.py, and used by lookupPage.py to get content for a - given page title.
+ Generated by genDumpIndexDb.py.
Tables:
- - offsets: title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next\_offset INT -- enwikiData.db
- Holds data obtained from the enwiki dump file, in 'pages', - 'redirects', and 'descs' tables. Generated by genData.py, which uses - python packages mwxml and mwparserfromhell.
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` + +# Description Database Files +- genDescData.py
+ Reads through pages in the dump file, and adds short-description info to a database. +- descData.db
+ Generated by genDescData.py.
Tables:
- - pages: id INT PRIMARY KEY, title TEXT UNIQUE - - redirects: id INT PRIMARY KEY, target TEXT - - descs: id INT PRIMARY KEY, desc TEXT -- enwikiImgs.db
- Holds infobox-images obtained for some set of wiki page-ids. - Generated by running getEnwikiImgData.py, which uses the enwiki dump - file and dumpIndex.db.
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` + +# Image Database Files +- genImgData.py
+ Used to find infobox image names for page IDs, storing them into a database. +- downloadImgLicenseInfo.py
+ Used to download licensing metadata for image names, via Wikipedia's online API, storing them into a database. +- imgData.db
+ Used to hold metadata about infobox images for a set of pageIDs. + Generated using genImgData.py and downloadImgLicenseInfo.py.
Tables:
- - page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT - (img\_name may be null, which is used to avoid re-processing the page-id on a second pass) - - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT - (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info) + - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT`
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. + - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. +- downloadEnwikiImgs.py
+ Used to download image files into imgs/. + +# Other Files +- lookupPage.py
+ Running `lookupPage.py title1` looks in the dump for a page with a given title, + and prints the contents to stdout. Uses dumpIndex.db. + diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py index de9b862..2929a0d 100755 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ b/backend/data/enwiki/downloadEnwikiImgs.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names outDir = "imgs" licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 8231fbb..097304b 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names apiUrl = "https://en.wikipedia.org/w/api.php" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py deleted file mode 100755 index 3e60bb5..0000000 --- a/backend/data/enwiki/genData.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" -usageInfo += "and short-description info to an sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages -enwikiDb = "enwikiData.db" - -# Some regexps and functions for parsing wikitext -descLineRegex = re.compile("^ *[A-Z'\"]") -embeddedHtmlRegex = 
re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") - # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") -parensGrpRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") -def convertTemplateReplace(match): - if match.group(2) == None: - return f"{match.group(1)} {match.group(4)}" - else: - return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" -def parseDesc(text): - # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank - # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, - lines = [] - openBraceCount = 0 - openBracketCount = 0 - inComment = False - skip = False - for line in text.splitlines(): - line = line.strip() - if len(lines) == 0: - if len(line) > 0: - if openBraceCount > 0 or line[0] == "{": - openBraceCount += line.count("{") - openBraceCount -= line.count("}") - skip = True - if openBracketCount > 0 or line[0] == "[": - openBracketCount += line.count("[") - openBracketCount -= line.count("]") - skip = True - if inComment or line.find("") != -1: - if inComment: - inComment = False - skip = True - else: - inComment = True - skip = True - if skip: - skip = False - continue - if line[-1] == ":": # Seems to help avoid disambiguation pages - return None - if descLineRegex.match(line) != None: - lines.append(line) - else: - if len(line) == 0: - return removeMarkup(" ".join(lines)) - lines.append(line) - if len(lines) > 0: - return removeMarkup(" ".join(lines)) - return None -def removeMarkup(content): - content = embeddedHtmlRegex.sub("", content) - content = convertTemplateRegex.sub(convertTemplateReplace, content) - content = 
mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGrpRegex.sub("", content) - content = leftoverBraceRegex.sub("", content) - return content -# Other helper functions -def convertTitle(title): - return html.unescape(title).replace("_", " ") - -# Check for existing db -if os.path.exists(enwikiDb): - print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") -dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Read through dump file -print("Reading dump file") -with bz2.open(dumpFile, mode='rt') as file: - dump = mwxml.Dump.from_file(file) - pageNum = 0 - for page in dump: - pageNum += 1 - if pageNum % 1e4 == 0: - print(f"At page {pageNum}") - # Parse page - if page.namespace == 0: - try: - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) - except sqlite3.IntegrityError as e: - # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": {e}") - continue - if page.redirect != None: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) - else: - revision = next(page) - desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py new file mode 100755 index 0000000..032dbed --- /dev/null +++ b/backend/data/enwiki/genDescData.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell 
+import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" +usageInfo += "and short-description info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +enwikiDb = "descData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = re.compile("^ *[A-Z'\"]") +embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +parensGrpRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") +def convertTemplateReplace(match): + if match.group(2) == None: + return f"{match.group(1)} {match.group(4)}" + else: + return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" +def parseDesc(text): + # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank + # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if len(lines) == 0: + if len(line) > 0: + if openBraceCount > 0 or line[0] == "{": + openBraceCount += line.count("{") + openBraceCount -= line.count("}") + skip = True + if openBracketCount > 0 or line[0] == "[": + openBracketCount += line.count("[") + openBracketCount -= line.count("]") + skip = True + if inComment or line.find("") != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + 
if skip: + skip = False + continue + if line[-1] == ":": # Seems to help avoid disambiguation pages + return None + if descLineRegex.match(line) != None: + lines.append(line) + else: + if len(line) == 0: + return removeMarkup(" ".join(lines)) + lines.append(line) + if len(lines) > 0: + return removeMarkup(" ".join(lines)) + return None +def removeMarkup(content): + content = embeddedHtmlRegex.sub("", content) + content = convertTemplateRegex.sub(convertTemplateReplace, content) + content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = parensGrpRegex.sub("", content) + content = leftoverBraceRegex.sub("", content) + return content +# Other helper functions +def convertTitle(title): + return html.unescape(title).replace("_", " ") + +# Check for existing db +if os.path.exists(enwikiDb): + print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Read through dump file +print("Reading dump file") +with bz2.open(dumpFile, mode='rt') as file: + dump = mwxml.Dump.from_file(file) + pageNum = 0 + for page in dump: + pageNum += 1 + if pageNum % 1e4 == 0: + print(f"At page {pageNum}") + # Parse page + if page.namespace == 0: + try: + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print(f"Failed to add page with title \"{page.title}\": {e}") + continue + if page.redirect != None: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, 
convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py new file mode 100755 index 0000000..9bd28f4 --- /dev/null +++ b/backend/data/enwiki/genImgData.py @@ -0,0 +1,178 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, html, urllib.parse +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" +usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +def getInputPageIds(): + pageIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (pageId,) in dbCur.execute("SELECT id from wiki_ids"): + pageIds.add(pageId) + dbCon.close() + return pageIds +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" +indexDb = "dumpIndex.db" +imgDb = "imgData.db" # Output db +idLineRegex = re.compile(r"(.*)") +imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") +bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") +imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) +cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) + +# Open dbs +indexDbCon = sqlite3.connect(indexDb) +indexDbCur = indexDbCon.cursor() +imgDbCon = sqlite3.connect(imgDb) +imgDbCur = imgDbCon.cursor() +# Create image-db table +pidsDone = set() +if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL + imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +else: + for (pid,) in 
imgDbCur.execute("SELECT page_id FROM page_imgs"): + pidsDone.add(pid) + print(f"Will skip {len(pidsDone)} already-processed page-ids") +# Get input pageIds +print("Getting input page-ids", file=sys.stderr) +pageIds = getInputPageIds() +for pid in pidsDone: + pageIds.remove(pid) +print(f"Found {len(pageIds)} page-ids to process") +# Get page-id dump-file offsets +print("Getting dump-file offsets", file=sys.stderr) +offsetToPageids = {} +offsetToEnd = {} +iterNum = 0 +for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}", file=sys.stderr) + # + query = "SELECT offset, next_offset FROM offsets WHERE id = ?" + row = indexDbCur.execute(query, (pageId,)).fetchone() + if row == None: + print(f"WARNING: Page id {pageId} not found", file=sys.stderr) + continue + (chunkOffset, endOffset) = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) +print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr) +# Look through dump file, jumping to chunks containing relevant pages +print("Reading through dump file", file=sys.stderr) +def getImageName(content): + """ Given an array of text-content lines, returns an image-filename, or None """ + for line in content: + match = imageLineRegex.match(line) + if match != None: + imageName = match.group(1).strip() + if imageName == "": + return None + imageName = html.unescape(imageName) + # Account for {{... + if imageName.startswith("{"): + match = cssImgCropRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for [[File:...|...]] + if imageName.startswith("["): + match = bracketImageRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for