aboutsummaryrefslogtreecommitdiff
path: root/backend/tolData/enwiki
diff options
context:
space:
mode:
Diffstat (limited to 'backend/tolData/enwiki')
-rw-r--r--backend/tolData/enwiki/README.md2
-rwxr-xr-xbackend/tolData/enwiki/genImgData.py1
2 files changed, 1 insertions, 2 deletions
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md
index dfced94..7df21c9 100644
--- a/backend/tolData/enwiki/README.md
+++ b/backend/tolData/enwiki/README.md
@@ -11,7 +11,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
providing, for each page, an offset into the dump file of a chunk of
100 pages that includes it.
-# Generated Dump-Index Files
+# Dump-Index Files
- genDumpIndexDb.py <br>
Creates an sqlite-database version of the enwiki-dump index file.
- dumpIndex.db <br>
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
index 97e696f..b5d546d 100755
--- a/backend/tolData/enwiki/genImgData.py
+++ b/backend/tolData/enwiki/genImgData.py
@@ -30,7 +30,6 @@ imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
-# In testing, got about 360k image names
print("Getting input page-ids")
pageIds = getInputPageIds()