diff options
Diffstat (limited to 'backend/tolData/enwiki')
| -rw-r--r-- | backend/tolData/enwiki/README.md | 2 | ||||
| -rwxr-xr-x | backend/tolData/enwiki/genImgData.py | 1 |
2 files changed, 1 insertions, 2 deletions
diff --git a/backend/tolData/enwiki/README.md b/backend/tolData/enwiki/README.md
index dfced94..7df21c9 100644
--- a/backend/tolData/enwiki/README.md
+++ b/backend/tolData/enwiki/README.md
@@ -11,7 +11,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
 providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it.
-# Generated Dump-Index Files
+# Dump-Index Files
 - genDumpIndexDb.py <br>
 Creates an sqlite-database version of the enwiki-dump index file.
 - dumpIndex.db <br>
diff --git a/backend/tolData/enwiki/genImgData.py b/backend/tolData/enwiki/genImgData.py
index 97e696f..b5d546d 100755
--- a/backend/tolData/enwiki/genImgData.py
+++ b/backend/tolData/enwiki/genImgData.py
@@ -30,7 +30,6 @@ imageLineRegex = re.compile(r".*\| *image *= *([^|]*)")
 bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]")
 imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE)
 cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE)
-# In testing, got about 360k image names
 print("Getting input page-ids")
 pageIds = getInputPageIds()
