diff options
22 files changed, 123 insertions, 188 deletions
diff --git a/backend/server.py b/backend/server.py index b2fffdc..48d6c3f 100755 --- a/backend/server.py +++ b/backend/server.py @@ -5,14 +5,11 @@ from wsgiref import simple_server, util import mimetypes from tilo import application -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Runs a basic dev server that serves a WSGI script and image files -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""") +parser.parse_args() # WSGI handler that uses 'application', but also serves image files def wrappingApp(environ, start_response): diff --git a/backend/tilo.py b/backend/tilo.py index 8bbe528..d86e94c 100755 --- a/backend/tilo.py +++ b/backend/tilo.py @@ -5,14 +5,7 @@ import urllib.parse import sqlite3 import gzip, jsonpickle -dbFile = "tolData/data.db" -DEFAULT_SUGG_LIM = 5 -MAX_SUGG_LIM = 50 -ROOT_NAME = "cellular organisms" - -usageInfo = f""" -Usage: {sys.argv[0]} - +HELP_INFO = """ WSGI script that serves tree-of-life data, in JSON form. Expected HTTP query parameters: @@ -30,9 +23,15 @@ Expected HTTP query parameters: weakly-trimmed, images-only, and picked-nodes trees. The default is 'images'. """ -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=HELP_INFO, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + +DB_FILE = "tolData/data.db" +DEFAULT_SUGG_LIM = 5 +MAX_SUGG_LIM = 50 +ROOT_NAME = "cellular organisms" # Classes for objects sent as responses (matches lib.ts types in client-side code) class TolNode: @@ -316,9 +315,8 @@ def handleReq(dbCur, environ): return None # Entry point for the WSGI script def application(environ, start_response): - global dbFile # Open db - dbCon = sqlite3.connect(dbFile) + dbCon = sqlite3.connect(DB_FILE) dbCur = dbCon.cursor() # Get response object val = handleReq(dbCur, environ) diff --git a/backend/tolData/addPickedNames.py b/backend/tolData/addPickedNames.py index d56a0cb..9b56422 100755 --- a/backend/tolData/addPickedNames.py +++ b/backend/tolData/addPickedNames.py @@ -3,14 +3,11 @@ import sys import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - -Reads alt-name data from a file, and adds it to the database's 'names' table. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Reads alt-name data from a file, and adds it to the database's 'names' table +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() dbFile = "data.db" pickedNamesFile = "pickedNames.txt" diff --git a/backend/tolData/dbpedia/genDescData.py b/backend/tolData/dbpedia/genDescData.py index d9e8a80..a23199d 100755 --- a/backend/tolData/dbpedia/genDescData.py +++ b/backend/tolData/dbpedia/genDescData.py @@ -3,14 +3,11 @@ import sys, re import bz2, sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - -Adds DBpedia labels/types/abstracts/etc data into a database. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Adds DBpedia labels/types/abstracts/etc data into a database +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() labelsFile = "labels_lang=en.ttl.bz2" # Had about 16e6 entries idsFile = "page_lang=en_ids.ttl.bz2" diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py index 399922e..dd39d54 100755 --- a/backend/tolData/enwiki/downloadImgLicenseInfo.py +++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py @@ -5,19 +5,16 @@ import sqlite3, urllib.parse, html import requests import time, signal -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads image names from a database, and uses enwiki's online API to obtain licensing information for them, adding the info to the database. SIGINT causes the program to finish an ongoing download and exit. The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imgDb = "imgData.db" apiUrl = "https://en.wikipedia.org/w/api.php" diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py index 8fb605f..520677f 100755 --- a/backend/tolData/enwiki/downloadImgs.py +++ b/backend/tolData/enwiki/downloadImgs.py @@ -5,19 +5,16 @@ import sqlite3 import urllib.parse, requests import time, signal -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Downloads images from URLs in an image database, into an output directory, with names of the form 'pageId1.ext1'. SIGINT causes the program to finish an ongoing download and exit. The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imgDb = "imgData.db" # About 130k image names outDir = "imgs" diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py index b0ca272..0085d70 100755 --- a/backend/tolData/enwiki/genDescData.py +++ b/backend/tolData/enwiki/genDescData.py @@ -5,15 +5,12 @@ import bz2 import html, mwxml, mwparserfromhell import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - -Reads through the wiki dump, and attempts to -parse short-descriptions, and add them to a database. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Reads through the wiki dump, and attempts to parse short-descriptions, +and add them to a database +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages enwikiDb = "descData.db" diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py index 3955885..1bffb27 100755 --- a/backend/tolData/enwiki/genDumpIndexDb.py +++ b/backend/tolData/enwiki/genDumpIndexDb.py @@ -4,14 +4,11 @@ import sys, os, re import bz2 import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - -Adds data from the wiki dump index-file into a database. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Adds data from the wiki dump index-file into a database +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines indexDb = "dumpIndex.db" diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py index dedfe14..97e696f 100755 --- a/backend/tolData/enwiki/genImgData.py +++ b/backend/tolData/enwiki/genImgData.py @@ -4,18 +4,15 @@ import sys, re import bz2, html, urllib.parse import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" For some set of page IDs, looks up their content in the wiki dump, and tries to parse infobox image names, storing them into a database. The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() def getInputPageIds(): pageIds = set() diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py index 1a90851..e7b95f0 100755 --- a/backend/tolData/enwiki/lookupPage.py +++ b/backend/tolData/enwiki/lookupPage.py @@ -4,19 +4,17 @@ import sys, re import bz2 import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} title1 - -Looks up a page with title title1 in the wiki dump, using -the dump-index db, and prints the corresponding <page>. -""" -if len(sys.argv) != 2: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Looks up a page with title title1 in the wiki dump, using the dump-index +db, and prints the corresponding <page>. +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument("title", help="The title to look up") +args = parser.parse_args() dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" indexDb = "dumpIndex.db" -pageTitle = sys.argv[1].replace("_", " ") +pageTitle = args.title.replace("_", " ") print("Looking up offset in index db") dbCon = sqlite3.connect(indexDb) diff --git a/backend/tolData/eol/downloadImgs.py b/backend/tolData/eol/downloadImgs.py index 96bc085..7ca4e79 100755 --- a/backend/tolData/eol/downloadImgs.py +++ b/backend/tolData/eol/downloadImgs.py @@ -7,9 +7,8 @@ import time from threading import Thread import signal -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" For some set of EOL IDs, downloads associated images from URLs in an image-list database. Uses multiple downloading threads. @@ -20,11 +19,10 @@ SIGINT causes the program to finish ongoing downloads and exit. The program can be re-run to continue downloading. It looks for already-downloaded files, and continues after the one with highest EOL ID. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) -# In testing, this downloaded about 70k images, over a few days +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() + +# In testing, this script downloaded about 70k images, over a few days imagesListDb = "imagesList.db" def getInputEolIds(): diff --git a/backend/tolData/eol/genImagesListDb.py b/backend/tolData/eol/genImagesListDb.py index 32df10a..0c45887 100755 --- a/backend/tolData/eol/genImagesListDb.py +++ b/backend/tolData/eol/genImagesListDb.py @@ -4,14 +4,11 @@ import sys, os, re import csv import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Generates a sqlite db from a directory of CSV files holding EOL image data -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imagesListDir = "imagesList/" dbFile = "imagesList.db" diff --git a/backend/tolData/eol/reviewImgs.py b/backend/tolData/eol/reviewImgs.py index ecdf7ab..979ed0e 100755 --- a/backend/tolData/eol/reviewImgs.py +++ b/backend/tolData/eol/reviewImgs.py @@ -7,17 +7,14 @@ from tkinter import ttk import PIL from PIL import ImageTk, Image, ImageOps -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Provides a GUI for reviewing images. Looks in a for-review directory for images named 'eolId1 contentId1.ext1', and, for each EOL ID, enables the user to choose an image to keep, or reject all. Also provides image rotation. Chosen images are placed in another directory, and rejected ones are deleted. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imgDir = "imgsForReview/" outDir = "imgs/" diff --git a/backend/tolData/genDbpData.py b/backend/tolData/genDbpData.py index 68b135e..9d52e1d 100755 --- a/backend/tolData/genDbpData.py +++ b/backend/tolData/genDbpData.py @@ -3,16 +3,13 @@ import sys, os, re import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads a database containing data from DBpedia, and tries to associate DBpedia IRIs with nodes in the tree-of-life database, adding short-descriptions for them. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() dbpediaDb = "dbpedia/descData.db" namesToSkipFile = "pickedEnwikiNamesToSkip.txt" diff --git a/backend/tolData/genEnwikiDescData.py b/backend/tolData/genEnwikiDescData.py index 0e86fd5..e8a69ba 100755 --- a/backend/tolData/genEnwikiDescData.py +++ b/backend/tolData/genEnwikiDescData.py @@ -3,16 +3,13 @@ import sys, re, os import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads a database containing data from Wikipedia, and tries to associate wiki pages with nodes in the tree-of-life database, and add descriptions for nodes that don't have them. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() enwikiDb = "enwiki/descData.db" dbFile = "data.db" diff --git a/backend/tolData/genEnwikiNameData.py b/backend/tolData/genEnwikiNameData.py index 7ad61d1..ec76cca 100755 --- a/backend/tolData/genEnwikiNameData.py +++ b/backend/tolData/genEnwikiNameData.py @@ -3,16 +3,13 @@ import sys, re import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads from a database containing data from Wikipdia, along with node and wiki-id information from the database, and use wikipedia page-redirect information to add additional alt-name data. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() enwikiDb = "enwiki/descData.db" dbFile = "data.db" diff --git a/backend/tolData/genEolNameData.py b/backend/tolData/genEolNameData.py index 1b19a47..2c5414b 100755 --- a/backend/tolData/genEolNameData.py +++ b/backend/tolData/genEolNameData.py @@ -3,24 +3,21 @@ import sys, re, os import html, csv, sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads files describing name data from the 'Encyclopedia of Life' site, tries to associate names with nodes in the tree-of-life database, and adds tables to represent associated names. Reads a vernacularNames.csv file: - Starts with a header line containing: - page_id, canonical_form, vernacular_string, language_code, - resource_name, is_preferred_by_resource, is_preferred_by_eol - The canonical_form and vernacular_string fields contain names - associated with the page ID. Names are not always unique to - particular page IDs. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) + Starts with a header line containing: + page_id, canonical_form, vernacular_string, language_code, + resource_name, is_preferred_by_resource, is_preferred_by_eol + The canonical_form and vernacular_string fields contain names + associated with the page ID. Names are not always unique to + particular page IDs. +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries dbFile = "data.db" diff --git a/backend/tolData/genImgs.py b/backend/tolData/genImgs.py index ecca8e0..930990b 100755 --- a/backend/tolData/genImgs.py +++ b/backend/tolData/genImgs.py @@ -4,9 +4,8 @@ import sys, os, subprocess import sqlite3, urllib.parse import signal -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads node IDs and image paths from a file, and possibly from a directory, and generates cropped/resized versions of those images into a directory, with names of the form 'nodeId1.jpg'. Also adds image metadata to the @@ -15,10 +14,8 @@ database. SIGINT can be used to stop, and the program can be re-run to continue processing. It uses already-existing database entries to decide what to skip. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imgListFile = "imgList.txt" outDir = "img/" diff --git a/backend/tolData/genLinkedImgs.py b/backend/tolData/genLinkedImgs.py index c9cc622..eb991b9 100755 --- a/backend/tolData/genLinkedImgs.py +++ b/backend/tolData/genLinkedImgs.py @@ -3,15 +3,12 @@ import sys, re import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Look for nodes without images in the database, and tries to -associate them with images from their children. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +associate them with images from their children +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() dbFile = "data.db" compoundNameRegex = re.compile(r"\[(.+) \+ (.+)]") diff --git a/backend/tolData/genOtolData.py b/backend/tolData/genOtolData.py index 4236999..6310cc9 100755 --- a/backend/tolData/genOtolData.py +++ b/backend/tolData/genOtolData.py @@ -3,31 +3,28 @@ import sys, re, os import json, sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads files describing a tree-of-life from an 'Open Tree of Life' release, and stores tree info in a database. Reads a labelled_supertree_ottnames.tre file, which is assumed to have this format: The tree-of-life is represented in Newick format, which looks like: (n1,n2,(n3,n4)n5)n6 - The root node is named n6, and has children n1, n2, and n5. + The root node is named n6, and has children n1, n2, and n5. Name examples include: Homo_sapiens_ott770315, mrcaott6ott22687, 'Oxalis san-miguelii ott5748753', - 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. - The node with ID 'ott770315' will get the name 'homo sapiens'. - A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). - It is possible for multiple nodes to have the same name. - In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. + 'ott770315' and 'mrcaott6ott22687' are node IDs. The latter is for a 'compound node'. + The node with ID 'ott770315' will get the name 'homo sapiens'. + A compound node will get a name composed from it's sub-nodes (eg: [name1 + name2]). + It is possible for multiple nodes to have the same name. + In these cases, extra nodes will be named sequentially, as 'name1 [2]', 'name1 [3]', etc. Reads an annotations.json file, which is assumed to have this format: Holds a JSON object, whose 'nodes' property maps node IDs to objects holding information about that node, such as the properties 'supported_by' and 'conflicts_with', which list phylogenetic trees that - support/conflict with the node's placement. + support/conflict with the node's placement. Reads from a picked-names file, if present, which specifies name and node ID pairs. - These help resolve cases where multiple nodes share the same name. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) + These help resolve cases where multiple nodes share the same name. +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() treeFile = "otol/labelled_supertree_ottnames.tre" # Had about 2.5e9 nodes annFile = "otol/annotations.json" diff --git a/backend/tolData/genReducedTrees.py b/backend/tolData/genReducedTrees.py index 907aad9..a954fd3 100755 --- a/backend/tolData/genReducedTrees.py +++ b/backend/tolData/genReducedTrees.py @@ -3,9 +3,8 @@ import sys, os.path, re import json, sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} [tree1] - +import argparse +parser = argparse.ArgumentParser(description=""" Creates reduced versions of the tree in the database: - A 'picked nodes' tree: Created from a minimal set of node names read from a file, @@ -17,15 +16,11 @@ Creates reduced versions of the tree in the database: Created by removing nodes that lack an image or description, or presence in the 'picked' tree. And, for nodes with 'many' children, removing some more, despite any node descriptions. +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument("--tree", choices=["picked", "images", "trimmed"], help="Only generate the specified tree") +args = parser.parse_args() -If tree1 is specified, as 'picked', 'images', or 'trimmed', only that -tree is generated. -""" -if len(sys.argv) > 2 or len(sys.argv) == 2 and re.fullmatch(r"picked|images|trimmed", sys.argv[1]) == None: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -tree = sys.argv[1] if len(sys.argv) > 1 else None +tree = args.tree dbFile = "data.db" pickedNodesFile = "pickedNodes.txt" COMP_NAME_REGEX = re.compile(r"\[.+ \+ .+]") # Used to recognise composite nodes diff --git a/backend/tolData/reviewImgsToGen.py b/backend/tolData/reviewImgsToGen.py index de592f5..88822c5 100755 --- a/backend/tolData/reviewImgsToGen.py +++ b/backend/tolData/reviewImgsToGen.py @@ -7,9 +7,8 @@ from tkinter import ttk import PIL from PIL import ImageTk, Image, ImageOps -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Provides a GUI that displays, for each node in the database, associated images from EOL and Wikipedia, and allows choosing which to use. Writes choice data to a text file with lines of the form 'otolId1 imgPath1', or @@ -18,10 +17,8 @@ choice data to a text file with lines of the form 'otolId1 imgPath1', or The program can be closed, and run again to continue from the last choice. The program looks for an existing output file to determine what choices have already been made. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() eolImgDir = "eol/imgs/" enwikiImgDir = "enwiki/imgs/" |
