diff options
Diffstat (limited to 'backend/tolData/enwiki')
| -rwxr-xr-x | backend/tolData/enwiki/downloadImgLicenseInfo.py | 11 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/downloadImgs.py | 11 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/genDescData.py | 15 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/genDumpIndexDb.py | 13 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/genImgData.py | 11 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/lookupPage.py | 18 |
6 files changed, 31 insertions, 48 deletions
diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py index 399922e..dd39d54 100755 --- a/backend/tolData/enwiki/downloadImgLicenseInfo.py +++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py @@ -5,19 +5,16 @@ import sqlite3, urllib.parse, html import requests import time, signal -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Reads image names from a database, and uses enwiki's online API to obtain licensing information for them, adding the info to the database. SIGINT causes the program to finish an ongoing download and exit. The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imgDb = "imgData.db" apiUrl = "https://en.wikipedia.org/w/api.php" diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py index 8fb605f..520677f 100755 --- a/backend/tolData/enwiki/downloadImgs.py +++ b/backend/tolData/enwiki/downloadImgs.py @@ -5,19 +5,16 @@ import sqlite3 import urllib.parse, requests import time, signal -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" Downloads images from URLs in an image database, into an output directory, with names of the form 'pageId1.ext1'. SIGINT causes the program to finish an ongoing download and exit. The program can be re-run to continue downloading, and looks in the output directory do decide what to skip. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() imgDb = "imgData.db" # About 130k image names outDir = "imgs" diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py index b0ca272..0085d70 100755 --- a/backend/tolData/enwiki/genDescData.py +++ b/backend/tolData/enwiki/genDescData.py @@ -5,15 +5,12 @@ import bz2 import html, mwxml, mwparserfromhell import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - -Reads through the wiki dump, and attempts to -parse short-descriptions, and add them to a database. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Reads through the wiki dump, and attempts to parse short-descriptions, +and add them to a database +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages enwikiDb = "descData.db" diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py index 3955885..1bffb27 100755 --- a/backend/tolData/enwiki/genDumpIndexDb.py +++ b/backend/tolData/enwiki/genDumpIndexDb.py @@ -4,14 +4,11 @@ import sys, os, re import bz2 import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - -Adds data from the wiki dump index-file into a database. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Adds data from the wiki dump index-file into a database +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines indexDb = "dumpIndex.db" diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py index dedfe14..97e696f 100755 --- a/backend/tolData/enwiki/genImgData.py +++ b/backend/tolData/enwiki/genImgData.py @@ -4,18 +4,15 @@ import sys, re import bz2, html, urllib.parse import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} - +import argparse +parser = argparse.ArgumentParser(description=""" For some set of page IDs, looks up their content in the wiki dump, and tries to parse infobox image names, storing them into a database. The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. -""" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.parse_args() def getInputPageIds(): pageIds = set() diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py index 1a90851..e7b95f0 100755 --- a/backend/tolData/enwiki/lookupPage.py +++ b/backend/tolData/enwiki/lookupPage.py @@ -4,19 +4,17 @@ import sys, re import bz2 import sqlite3 -usageInfo = f""" -Usage: {sys.argv[0]} title1 - -Looks up a page with title title1 in the wiki dump, using -the dump-index db, and prints the corresponding <page>. -""" -if len(sys.argv) != 2: - print(usageInfo, file=sys.stderr) - sys.exit(1) +import argparse +parser = argparse.ArgumentParser(description=""" +Looks up a page with title title1 in the wiki dump, using the dump-index +db, and prints the corresponding <page>. +""", formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument("title", help="The title to look up") +args = parser.parse_args() dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" indexDb = "dumpIndex.db" -pageTitle = sys.argv[1].replace("_", " ") +pageTitle = args.title.replace("_", " ") print("Looking up offset in index db") dbCon = sqlite3.connect(indexDb) |
