aboutsummaryrefslogtreecommitdiff
path: root/backend/hist_data/enwiki
diff options
context:
space:
mode:
Diffstat (limited to 'backend/hist_data/enwiki')
-rw-r--r--backend/hist_data/enwiki/README.md8
-rwxr-xr-xbackend/hist_data/enwiki/download_img_license_info.py5
-rwxr-xr-xbackend/hist_data/enwiki/download_imgs.py9
-rwxr-xr-xbackend/hist_data/enwiki/gen_desc_data.py7
-rwxr-xr-xbackend/hist_data/enwiki/gen_dump_index_db.py8
-rwxr-xr-xbackend/hist_data/enwiki/gen_img_data.py8
6 files changed, 21 insertions, 24 deletions
diff --git a/backend/hist_data/enwiki/README.md b/backend/hist_data/enwiki/README.md
index 262ebdb..76d33e5 100644
--- a/backend/hist_data/enwiki/README.md
+++ b/backend/hist_data/enwiki/README.md
@@ -33,11 +33,11 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
# Image Files
- `gen_img_data.py` <br>
- Used to find infobox image names for page IDs, and store them into a database.
+ Finds infobox image names for page IDs, and stores them into a database.
- `download_img_license_info.py` <br>
- Used to download licensing metadata for image names, via wikipedia's online API, and store them into a database.
+	Downloads licensing metadata for image names, via Wikipedia's online API, and stores them into a database.
- `img_data.db` <br>
- Used to hold metadata about infobox images for a set of page IDs.
+ Holds metadata about infobox images for a set of page IDs.
Generated using `gen_img_data.py` and `download_img_license_info.py`. <br>
Tables: <br>
- `page_imgs`: `page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT` <br>
@@ -47,7 +47,7 @@ This directory holds files obtained/derived from [English Wikipedia](https://en.
<br>
Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability.
- `download_imgs.py` <br>
- Used to download image files into imgs/.
+ Downloads image files into imgs/.
# Description Files
- `gen_desc_data.py` <br>
diff --git a/backend/hist_data/enwiki/download_img_license_info.py b/backend/hist_data/enwiki/download_img_license_info.py
index 1217caf..43f2c43 100755
--- a/backend/hist_data/enwiki/download_img_license_info.py
+++ b/backend/hist_data/enwiki/download_img_license_info.py
@@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks
at already-processed names to decide what to skip.
"""
-import re
+import argparse
+import re, time, signal
import sqlite3, urllib.parse, html
import requests
-import time, signal
IMG_DB = 'img_data.db'
#
@@ -150,7 +150,6 @@ def downloadInfo(imgDb: str) -> None:
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
#
diff --git a/backend/hist_data/enwiki/download_imgs.py b/backend/hist_data/enwiki/download_imgs.py
index bbd2cda..7dd0771 100755
--- a/backend/hist_data/enwiki/download_imgs.py
+++ b/backend/hist_data/enwiki/download_imgs.py
@@ -9,10 +9,10 @@ The program can be re-run to continue downloading, and looks
in the output directory to decide what to skip.
"""
-import re, os
+import argparse
+import re, os, time, signal
import sqlite3
import urllib.parse, requests
-import time, signal
IMG_DB = 'img_data.db' # About 130k image names
OUT_DIR = 'imgs'
@@ -22,7 +22,7 @@ USER_AGENT = 'terryt.dev (terry06890@gmail.com)'
TIMEOUT = 1
# https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec'
# It's unclear how to properly check for cache misses, so we just aim for 1 per sec
-BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit)
+EXP_BACKOFF = False # If True, double the timeout each time a download error occurs (otherwise just exit)
def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
if not os.path.exists(outDir):
@@ -84,7 +84,7 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
time.sleep(timeout)
except Exception as e:
print(f'Error while downloading to {outFile}: {e}')
- if not BACKOFF:
+ if not EXP_BACKOFF:
return
else:
timeout *= 2
@@ -94,7 +94,6 @@ def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None:
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
#
diff --git a/backend/hist_data/enwiki/gen_desc_data.py b/backend/hist_data/enwiki/gen_desc_data.py
index b3fde52..bb2b845 100755
--- a/backend/hist_data/enwiki/gen_desc_data.py
+++ b/backend/hist_data/enwiki/gen_desc_data.py
@@ -7,14 +7,14 @@ and adds them to a database
# In testing, this script took over 10 hours to run, and generated about 5GB
+import argparse
import sys, os, re
-import bz2
-import html, mwxml, mwparserfromhell
+import bz2, html, mwxml, mwparserfromhell
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages
DB_FILE = 'desc_data.db'
-
+# Regexps
DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]')
EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>|<!--[^<]+-->|<[^</]+>([^<]*|[^<]*<[^<]+>[^<]*)</[^<]+>|<[^<]+$')
# Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag
@@ -119,7 +119,6 @@ def convertTitle(title: str) -> str:
return html.unescape(title).replace('_', ' ')
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
#
diff --git a/backend/hist_data/enwiki/gen_dump_index_db.py b/backend/hist_data/enwiki/gen_dump_index_db.py
index 5778680..6be8bc5 100755
--- a/backend/hist_data/enwiki/gen_dump_index_db.py
+++ b/backend/hist_data/enwiki/gen_dump_index_db.py
@@ -1,11 +1,12 @@
#!/usr/bin/python3
"""
-Adds data from the wiki dump index-file into a database
+Adds data from the wiki-dump index-file into a database
"""
+
+import argparse
import sys, os, re
-import bz2
-import sqlite3
+import bz2, sqlite3
INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines
DB_FILE = 'dump_index.db'
@@ -53,7 +54,6 @@ def genData(indexFile: str, dbFile: str) -> None:
dbCon.close()
if __name__ == '__main__':
- import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.parse_args()
#
diff --git a/backend/hist_data/enwiki/gen_img_data.py b/backend/hist_data/enwiki/gen_img_data.py
index 922b893..9aa3863 100755
--- a/backend/hist_data/enwiki/gen_img_data.py
+++ b/backend/hist_data/enwiki/gen_img_data.py
@@ -8,15 +8,15 @@ The program can be re-run with an updated set of page IDs, and
will skip already-processed page IDs.
"""
-import re
-import os, bz2, html, urllib.parse
+import os, re
+import bz2, html, urllib.parse
import sqlite3
DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2'
INDEX_DB = 'dump_index.db'
IMG_DB = 'img_data.db' # The database to create
DB_FILE = os.path.join('..', 'data.db')
-#
+# Regexps
ID_LINE_REGEX = re.compile(r'<id>(.*)</id>')
IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)')
BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]')
@@ -33,7 +33,7 @@ def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None:
if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None:
# Create tables if not present
imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, title TEXT UNIQUE, img_name TEXT)')
- # 'img_name' may be NULL
+ # 'img_name' values are set to NULL to indicate page IDs where no image was found
imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)')
else:
# Check for already-processed page IDs