about summary refs log tree commit diff
path: root/backend/tolData/enwiki
diff options
context:
space:
mode:
author    Terry Truong <terry06890@gmail.com>  2022-08-20 13:16:21 +1000
committer Terry Truong <terry06890@gmail.com>  2022-08-20 13:16:21 +1000
commit 930c12d33e1093f874a4beb4d6376621e464e8c0 (patch)
tree   381722fc3ab9ebda482cb18d29e1091458aa93da /backend/tolData/enwiki
parent 8144003565797f0d18645a416b95d4365bba5fdd (diff)
Use argparse in python scripts
Diffstat (limited to 'backend/tolData/enwiki')
-rwxr-xr-x  backend/tolData/enwiki/downloadImgLicenseInfo.py | 11
-rwxr-xr-x  backend/tolData/enwiki/downloadImgs.py           | 11
-rwxr-xr-x  backend/tolData/enwiki/genDescData.py            | 15
-rwxr-xr-x  backend/tolData/enwiki/genDumpIndexDb.py         | 13
-rwxr-xr-x  backend/tolData/enwiki/genImgData.py             | 11
-rwxr-xr-x  backend/tolData/enwiki/lookupPage.py             | 18
6 files changed, 31 insertions, 48 deletions
diff --git a/backend/tolData/enwiki/downloadImgLicenseInfo.py b/backend/tolData/enwiki/downloadImgLicenseInfo.py
index 399922e..dd39d54 100755
--- a/backend/tolData/enwiki/downloadImgLicenseInfo.py
+++ b/backend/tolData/enwiki/downloadImgLicenseInfo.py
@@ -5,19 +5,16 @@ import sqlite3, urllib.parse, html
import requests
import time, signal
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
+import argparse
+parser = argparse.ArgumentParser(description="""
Reads image names from a database, and uses enwiki's online API to obtain
licensing information for them, adding the info to the database.
SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
+""", formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.parse_args()
imgDb = "imgData.db"
apiUrl = "https://en.wikipedia.org/w/api.php"
diff --git a/backend/tolData/enwiki/downloadImgs.py b/backend/tolData/enwiki/downloadImgs.py
index 8fb605f..520677f 100755
--- a/backend/tolData/enwiki/downloadImgs.py
+++ b/backend/tolData/enwiki/downloadImgs.py
@@ -5,19 +5,16 @@ import sqlite3
import urllib.parse, requests
import time, signal
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
+import argparse
+parser = argparse.ArgumentParser(description="""
Downloads images from URLs in an image database, into an output directory,
with names of the form 'pageId1.ext1'.
SIGINT causes the program to finish an ongoing download and exit.
The program can be re-run to continue downloading, and looks
in the output directory do decide what to skip.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
+""", formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.parse_args()
imgDb = "imgData.db" # About 130k image names
outDir = "imgs"
diff --git a/backend/tolData/enwiki/genDescData.py b/backend/tolData/enwiki/genDescData.py
index b0ca272..0085d70 100755
--- a/backend/tolData/enwiki/genDescData.py
+++ b/backend/tolData/enwiki/genDescData.py
@@ -5,15 +5,12 @@ import bz2
import html, mwxml, mwparserfromhell
import sqlite3
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Reads through the wiki dump, and attempts to
-parse short-descriptions, and add them to a database.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
+import argparse
+parser = argparse.ArgumentParser(description="""
+Reads through the wiki dump, and attempts to parse short-descriptions,
+and add them to a database
+""", formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.parse_args()
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # Had about 22e6 pages
enwikiDb = "descData.db"
diff --git a/backend/tolData/enwiki/genDumpIndexDb.py b/backend/tolData/enwiki/genDumpIndexDb.py
index 3955885..1bffb27 100755
--- a/backend/tolData/enwiki/genDumpIndexDb.py
+++ b/backend/tolData/enwiki/genDumpIndexDb.py
@@ -4,14 +4,11 @@ import sys, os, re
import bz2
import sqlite3
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Adds data from the wiki dump index-file into a database.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
+import argparse
+parser = argparse.ArgumentParser(description="""
+Adds data from the wiki dump index-file into a database
+""", formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.parse_args()
indexFile = "enwiki-20220501-pages-articles-multistream-index.txt.bz2" # Had about 22e6 lines
indexDb = "dumpIndex.db"
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
index dedfe14..97e696f 100755
--- a/backend/tolData/enwiki/genImgData.py
+++ b/backend/tolData/enwiki/genImgData.py
@@ -4,18 +4,15 @@ import sys, re
import bz2, html, urllib.parse
import sqlite3
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
+import argparse
+parser = argparse.ArgumentParser(description="""
For some set of page IDs, looks up their content in the wiki dump,
and tries to parse infobox image names, storing them into a database.
The program can be re-run with an updated set of page IDs, and
will skip already-processed page IDs.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
+""", formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.parse_args()
def getInputPageIds():
pageIds = set()
diff --git a/backend/tolData/enwiki/lookupPage.py b/backend/tolData/enwiki/lookupPage.py
index 1a90851..e7b95f0 100755
--- a/backend/tolData/enwiki/lookupPage.py
+++ b/backend/tolData/enwiki/lookupPage.py
@@ -4,19 +4,17 @@ import sys, re
import bz2
import sqlite3
-usageInfo = f"""
-Usage: {sys.argv[0]} title1
-
-Looks up a page with title title1 in the wiki dump, using
-the dump-index db, and prints the corresponding <page>.
-"""
-if len(sys.argv) != 2:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
+import argparse
+parser = argparse.ArgumentParser(description="""
+Looks up a page with title title1 in the wiki dump, using the dump-index
+db, and prints the corresponding <page>.
+""", formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("title", help="The title to look up")
+args = parser.parse_args()
dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2"
indexDb = "dumpIndex.db"
-pageTitle = sys.argv[1].replace("_", " ")
+pageTitle = args.title.replace("_", " ")
print("Looking up offset in index db")
dbCon = sqlite3.connect(indexDb)