diff options
Diffstat (limited to 'backend/hist_data/enwiki')
| -rw-r--r-- | backend/hist_data/enwiki/README.md | 8 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_img_license_info.py | 5 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/download_imgs.py | 9 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_desc_data.py | 7 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_dump_index_db.py | 8 | ||||
| -rwxr-xr-x | backend/hist_data/enwiki/gen_img_data.py | 8 |
6 files changed, 21 insertions, 24 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md index 262ebdb..76d33e5 100644 --- a/backend/hist_data/enwiki/README.md +++ b/backend/hist_data/enwiki/README.md @@ -33,11 +33,11 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. # Image Files - `gen_img_data.py` <br> - Used to find infobox image names for page IDs, and store them into a database. + Finds infobox image names for page IDs, and stores them into a database. - `download_img_license_info.py` <br> - Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database. + Downloads licensing metadata for image names, via Wikipedia's online API, and stores them into a database. - `img_data.db` <br> - Used to hold metadata about infobox images for a set of page IDs. + Holds metadata about infobox images for a set of page IDs. Generated using `gen_img_data.py` and `download_img_license_info.py`. <br> Tables: <br> - `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br> @@ -47,7 +47,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en. <br> Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. - `download_imgs.py` <br> - Used to download image files into imgs/. + Downloads image files into imgs/. # Description Files - `gen_desc_data.py` <br> diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py index 1217caf..43f2c43 100755 --- a/backend/hist_data/enwiki/download_img_license_info.py +++ b/backend/hist_data/enwiki/download_img_license_info.py @@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks at already-processed names to decide what to skip. 
""" -import re +import argparse +import re, time, signal import sqlite3, urllib.parse, html import requests -import time, signal IMG_DB = 'img_data.db' # @@ -150,7 +150,6 @@ def downloadInfo(imgDb: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py index bbd2cda..7dd0771 100755 --- a/backend/hist_data/enwiki/download_imgs.py +++ b/backend/hist_data/enwiki/download_imgs.py @@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks in the output directory to decide what to skip. """ -import re, os +import argparse +import re, os, time, signal import sqlite3 import urllib.parse, requests -import time, signal IMG_DB = 'img_data.db' # About 130k image names OUT_DIR = 'imgs' @@ -22,7 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)' TIMEOUT = 1 # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' # It's unclear how to properly check for cache misses, so we just aim for 1 per sec -BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) +EXP_BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit) def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: if not os.path.exists(outDir): @@ -84,7 +84,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: time.sleep(timeout) except Exception as e: print(f'Error while downloading to {outFile}: {e}') - if not BACKOFF: + if not EXP_BACKOFF: return else: timeout *= 2 @@ -94,7 +94,6 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, 
formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py index b3fde52..bb2b845 100755 --- a/backend/hist_data/enwiki/gen_desc_data.py +++ b/backend/hist_data/enwiki/gen_desc_data.py @@ -7,14 +7,14 @@ and adds them to a database # In testing, this script took over 10 hours to run, and generated about 5GB +import argparse import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell +import bz2, html, mwxml, mwparserfromhell import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages DB_FILE = 'desc_data.db' - +# Regexps DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$') # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag @@ -119,7 +119,6 @@ def convertTitle(title: str) -> str: return html.unescape(title).replace('_', ' ') if __name__ == '__main__': - import argparse parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py index 5778680..6be8bc5 100755 --- a/backend/hist_data/enwiki/gen_dump_index_db.py +++ b/backend/hist_data/enwiki/gen_dump_index_db.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 """ -Adds data from the wiki dump index-file into a database +Adds data from the wiki-dump index-file into a database """ + +import argparse import sys, os, re -import bz2 -import sqlite3 +import bz2, sqlite3 INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines DB_FILE = 'dump_index.db' @@ -53,7 +54,6 @@ def genData(indexFile: str, dbFile: str) -> None: dbCon.close() if __name__ == '__main__': - import argparse parser = 
argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.parse_args() # diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py index 922b893..9aa3863 100755 --- a/backend/hist_data/enwiki/gen_img_data.py +++ b/backend/hist_data/enwiki/gen_img_data.py @@ -8,15 +8,15 @@ The program can be re-run with an updated set of page IDs, and will skip already-processed page IDs. """ -import re -import os, bz2, html, urllib.parse +import os, re +import bz2, html, urllib.parse import sqlite3 DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' INDEX_DB = 'dump_index.db' IMG_DB = 'img_data.db' # The database to create DB_FILE = os.path.join('..', 'data.db') -# +# Regexps ID_LINE_REGEX = re.compile(r'<id>(.*)</id>') IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') @@ -33,7 +33,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: # Create tables if not present imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)') - # 'img_name' may be NULL + # 'img_name' values are set to NULL to indicate page IDs where no image was found imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') else: # Check for already-processed page IDs |
