From 5de5fb93e50fe9006221b30ac4a66f1be0db82e7 Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Sun, 11 Sep 2022 14:55:42 +1000 Subject: Add backend unit tests - Add unit testing code in backend/tests/ - Change to snake-case for script/file/directory names - Use os.path.join() instead of '/' - Refactor script code into function defs and a main-guard - Make global vars all-caps Some fixes: - For getting descriptions, some wiki redirects weren't properly resolved - Linked images were sub-optimally propagated - Generation of reduced trees assumed a wiki-id association implied a description - Tilo.py had potential null dereferences by not always using a reduced node set - EOL image downloading didn't properly wait for all threads to end when finishing --- backend/tol_data/README.md | 155 ++++++++++ backend/tol_data/__init__.py | 0 backend/tol_data/dbpedia/README.md | 29 ++ backend/tol_data/dbpedia/__init__.py | 0 backend/tol_data/dbpedia/gen_desc_data.py | 120 ++++++++ backend/tol_data/enwiki/README.md | 63 ++++ backend/tol_data/enwiki/__init__.py | 0 .../tol_data/enwiki/download_img_license_info.py | 154 ++++++++++ backend/tol_data/enwiki/download_imgs.py | 99 ++++++ backend/tol_data/enwiki/gen_desc_data.py | 126 ++++++++ backend/tol_data/enwiki/gen_dump_index_db.py | 60 ++++ backend/tol_data/enwiki/gen_img_data.py | 193 ++++++++++++ backend/tol_data/enwiki/gen_pageview_data.py | 68 +++++ backend/tol_data/enwiki/lookup_page.py | 71 +++++ backend/tol_data/eol/README.md | 31 ++ backend/tol_data/eol/__init__.py | 0 backend/tol_data/eol/download_imgs.py | 152 ++++++++++ backend/tol_data/eol/gen_images_list_db.py | 39 +++ backend/tol_data/eol/review_imgs.py | 213 +++++++++++++ backend/tol_data/gen_desc_data.py | 92 ++++++ backend/tol_data/gen_imgs.py | 214 +++++++++++++ backend/tol_data/gen_linked_imgs.py | 117 +++++++ backend/tol_data/gen_mapping_data.py | 271 +++++++++++++++++ backend/tol_data/gen_name_data.py | 128 ++++++++ backend/tol_data/gen_otol_data.py | 267 
++++++++++++++++ backend/tol_data/gen_pop_data.py | 45 +++ backend/tol_data/gen_reduced_trees.py | 337 +++++++++++++++++++++ backend/tol_data/otol/README.md | 19 ++ backend/tol_data/picked_imgs/README.md | 10 + backend/tol_data/review_imgs_to_gen.py | 241 +++++++++++++++ backend/tol_data/wikidata/README.md | 18 ++ backend/tol_data/wikidata/__init__.py | 0 backend/tol_data/wikidata/gen_taxon_src_data.py | 239 +++++++++++++++ 33 files changed, 3571 insertions(+) create mode 100644 backend/tol_data/README.md create mode 100644 backend/tol_data/__init__.py create mode 100644 backend/tol_data/dbpedia/README.md create mode 100644 backend/tol_data/dbpedia/__init__.py create mode 100755 backend/tol_data/dbpedia/gen_desc_data.py create mode 100644 backend/tol_data/enwiki/README.md create mode 100644 backend/tol_data/enwiki/__init__.py create mode 100755 backend/tol_data/enwiki/download_img_license_info.py create mode 100755 backend/tol_data/enwiki/download_imgs.py create mode 100755 backend/tol_data/enwiki/gen_desc_data.py create mode 100755 backend/tol_data/enwiki/gen_dump_index_db.py create mode 100755 backend/tol_data/enwiki/gen_img_data.py create mode 100755 backend/tol_data/enwiki/gen_pageview_data.py create mode 100755 backend/tol_data/enwiki/lookup_page.py create mode 100644 backend/tol_data/eol/README.md create mode 100644 backend/tol_data/eol/__init__.py create mode 100755 backend/tol_data/eol/download_imgs.py create mode 100755 backend/tol_data/eol/gen_images_list_db.py create mode 100755 backend/tol_data/eol/review_imgs.py create mode 100755 backend/tol_data/gen_desc_data.py create mode 100755 backend/tol_data/gen_imgs.py create mode 100755 backend/tol_data/gen_linked_imgs.py create mode 100755 backend/tol_data/gen_mapping_data.py create mode 100755 backend/tol_data/gen_name_data.py create mode 100755 backend/tol_data/gen_otol_data.py create mode 100755 backend/tol_data/gen_pop_data.py create mode 100755 backend/tol_data/gen_reduced_trees.py create mode 100644 
backend/tol_data/otol/README.md create mode 100644 backend/tol_data/picked_imgs/README.md create mode 100755 backend/tol_data/review_imgs_to_gen.py create mode 100644 backend/tol_data/wikidata/README.md create mode 100644 backend/tol_data/wikidata/__init__.py create mode 100755 backend/tol_data/wikidata/gen_taxon_src_data.py (limited to 'backend/tol_data') diff --git a/backend/tol_data/README.md b/backend/tol_data/README.md new file mode 100644 index 0000000..a21418b --- /dev/null +++ b/backend/tol_data/README.md @@ -0,0 +1,155 @@ +This directory holds files used to generate the tree-of-life database data.db. + +# Database Tables +## Tree Structure +- `nodes`
+ Format : `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT`
+ Represents a tree-of-life node. `tips` holds the number of no-child descendants +- `edges`
+ Format: `parent TEXT, child TEXT, p_support INT, PRIMARY KEY (parent, child)`
+ `p_support` is 1 if the edge has 'phylogenetic support', and 0 otherwise +## Node Mappings +- `eol_ids`
+ Format: `name TEXT PRIMARY KEY, id INT`
+ Associates nodes with EOL IDs +- `wiki_ids`
+ Format: `name TEXT PRIMARY KEY, id INT`
+ Associates nodes with wikipedia page IDs +## Node Vernacular Names +- `names`
+ Format: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)`
+ Associates a node with alternative names. + `pref_alt` is 1 if the alt-name is the most 'preferred' one. + `src` indicates the dataset the alt-name was obtained from (can be 'eol', 'enwiki', or 'picked'). +## Node Descriptions +- `descs`
+ Format: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT`
+ Associates a wikipedia page ID with a short-description. + `from_dbp` is 1 if the description was obtained from DBpedia, and 0 otherwise. +## Node Images +- `node_imgs`
+ Format: `name TEXT PRIMARY KEY, img_id INT, src TEXT`
+ Associates a node with an image. +- `images`
+ Format: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)`
+ Represents an image, identified by a source ('eol', 'enwiki', or 'picked'), and a source-specific ID. +- `linked_imgs`
+ Format: `name TEXT PRIMARY KEY, otol_ids TEXT`
+ Associates a node with an image from another node. + `otol_ids` can be an otol ID, or (for compound nodes) two comma-separated strings that may be otol IDs or empty. +## Reduced Trees +- `nodes_t`, `nodes_i`, `nodes_p`
+ These are like `nodes`, but describe nodes of reduced trees. +- `edges_t`, `edges_i`, `edges_p`
+ Like `edges` but for reduced trees. +## Other +- `node_iucn`
+ Format: `name TEXT PRIMARY KEY, iucn TEXT`
+ Associates nodes with IUCN conservation status strings (eg: 'endangered') +- `node_pop`
+ Format: `name TEXT PRIMARY KEY, pop INT`
+ Associates nodes with popularity values (higher means more popular) + +# Generating the Database + +As a warning, the whole process takes a lot of time and file space. The +tree will probably have about 2.6 million nodes. Downloading the images +takes several days, and occupies over 200 GB. + +## Environment +Some of the scripts use third-party packages: +- `indexed_bzip2`: For parallelised bzip2 processing. +- `jsonpickle`: For encoding class objects as JSON. +- `requests`: For downloading data. +- `PIL`: For image processing. +- `tkinter`: For providing a basic GUI to review images. +- `mwxml`, `mwparserfromhell`: For parsing Wikipedia dumps. + +## Generate Tree Structure Data +1. Obtain 'tree data files' in otol/, as specified in it's README. +2. Run `gen_otol_data.py`, which creates data.db, and adds the `nodes` and `edges` tables, + using data in otol/. It also uses these files, if they exist: + - `picked_otol_names.txt`: Has lines of the form `name1|otolId1`. + Can be used to override numeric suffixes added to same-name nodes. + +## Generate Dataset Mappings +1. Obtain 'taxonomy data files' in otol/, 'mapping files' in eol/, + files in wikidata/, and 'dump-index files' in enwiki/, as specified + in their READMEs. +2. Run `gen_mapping_data.py`, which adds the `eol_ids` and `wiki_ids` tables, + as well as `node_iucn`. It uses the files obtained above, the `nodes` table, + and 'picked mappings' files, if they exist. + - `picked_eol_ids.txt` contains lines like `3785967|405349`, specifying + an otol ID and an eol ID to map it to. The eol ID can be empty, + in which case the otol ID won't be mapped. + - `picked_wiki_ids.txt` and `picked_wiki_ids_rough.txt` contain lines like + `5341349|Human`, specifying an otol ID and an enwiki title, + which may contain spaces. The title can be empty. + +## Generate Node Name Data +1. Obtain 'name data files' in eol/, and 'description database files' in enwiki/, + as specified in their READMEs. +2. 
Run `gen_name_data.py`, which adds the `names` table, using data in eol/ and enwiki/, + along with the `nodes`, `eol_ids`, and `wiki_ids` tables.
+ It also uses `picked_names.txt`, if it exists. This file can hold lines like + `embryophyta|land plant|1`, specifying a node name, an alt-name to add for it, + and a 1 or 0 indicating whether it is a 'preferred' alt-name. The last field + can be empty, which indicates that the alt-name should be removed, or, if the + alt-name is the same as the node name, that no alt-name should be preferred. + +## Generate Node Description Data +1. Obtain files in dbpedia/, as specified in it's README. +2. Run `gen_desc_data.py`, which adds the `descs` table, using data in dbpedia/ and + enwiki/, and the `nodes` table. + +## Generate Node Images Data +### Get images from EOL +1. Obtain 'image metadata files' in eol/, as specified in it's README. +2. In eol/, run `download_imgs.py`, which downloads images (possibly multiple per node), + into eol/imgs_for_review, using data in eol/, as well as the `eol_ids` table. + By default, more images than needed are downloaded for review. To skip this, set + the script's MAX_IMGS_PER_ID to 1. +3. In eol/, run `review_imgs.py`, which interactively displays the downloaded images for + each node, providing the choice of which (if any) to use, moving them to eol/imgs/. + Uses `names` and `eol_ids` to display extra info. If MAX_IMGS_PER_ID was set to 1 in + the previous step, you can skip review by renaming the image folder. +### Get Images from Wikipedia +1. In enwiki/, run `gen_img_data.py`, which looks for wikipedia image names for each node, + using the `wiki_ids` table, and stores them in a database. +2. In enwiki/, run `download_img_license_info.py`, which downloads licensing information for + those images, using wikipedia's online API. +3. In enwiki/, run `download_imgs.py`, which downloads 'permissively-licensed' + images into enwiki/imgs/. +### Merge the Image Sets +1. 
Run `review_imgs_to_gen.py`, which displays images from eol/imgs/ and enwiki/imgs/, + and enables choosing, for each node, which image should be used, if any, + and outputs choice information into `img_list.txt`. Uses the `nodes`, + `eol_ids`, and `wiki_ids` tables (as well as `names` to display extra info). + To skip manual review, set REVIEW to 'none' in the script (the script will select any + image, preferring ones from Wikipedia). +2. Run `gen_imgs.py`, which creates cropped/resized images in img/, from files listed in + `img_list.txt` and located in eol/ and enwiki/, and creates the `node_imgs` and + `images` tables. If `picked_imgs/` is present, images within it are also used.
+ The outputs might need to be manually created/adjusted: + - An input image might have no output produced, possibly due to + data incompatibilities, memory limits, etc. A few input image files + might actually be html files, containing a 'file not found' page. + - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. + - An input image might produce output with unexpected dimensions. + This seems to happen when the image is very large, and triggers a + decompression bomb warning. +### Add more Image Associations +1. Run `gen_linked_imgs.py`, which tries to associate nodes without images to + images of it's children. Adds the `linked_imgs` table, and uses the + `nodes`, `edges`, and `node_imgs` tables. + +## Generate Reduced Trees +1. Run `gen_reduced_trees.py`, which generates multiple reduced versions of the tree, + adding the `nodes_*` and `edges_*` tables, using `nodes`, `edges`, `wiki_ids`, + `node_imgs`, `linked_imgs`, and `names`. Reads from `picked_nodes.txt`, which lists + names of nodes that must be included (1 per line). + +## Generate Node Popularity Data +1. Obtain 'page view files' in enwiki/, as specified in it's README. +2. Run `gen_pop_data.py`, which adds the `node_pop` table, using data in enwiki/, + and the `wiki_ids` table. diff --git a/backend/tol_data/__init__.py b/backend/tol_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tol_data/dbpedia/README.md b/backend/tol_data/dbpedia/README.md new file mode 100644 index 0000000..a708122 --- /dev/null +++ b/backend/tol_data/dbpedia/README.md @@ -0,0 +1,29 @@ +This directory holds files obtained/derived from [Dbpedia](https://www.dbpedia.org). + +# Downloaded Files +- `labels_lang=en.ttl.bz2`
+ Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core. + Downloaded from . +- `page_lang=en_ids.ttl.bz2`
+ Downloaded from +- `redirects_lang=en_transitive.ttl.bz2`
+ Downloaded from . +- `disambiguations_lang=en.ttl.bz2`
+ Downloaded from . +- `instance-types_lang=en_specific.ttl.bz2`
+ Downloaded from . +- `short-abstracts_lang=en.ttl.bz2`
+ Downloaded from . + +# Other Files +- `gen_desc_data.py`
+ Used to generate a database representing data from the ttl files. +- `desc_data.db`
+ Generated by `gen_desc_data.py`.
+ Tables:
+ - `labels`: `iri TEXT PRIMARY KEY, label TEXT ` + - `ids`: `iri TEXT PRIMARY KEY, id INT` + - `redirects`: `iri TEXT PRIMARY KEY, target TEXT` + - `disambiguations`: `iri TEXT PRIMARY KEY` + - `types`: `iri TEXT, type TEXT` + - `abstracts`: `iri TEXT PRIMARY KEY, abstract TEXT` diff --git a/backend/tol_data/dbpedia/__init__.py b/backend/tol_data/dbpedia/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tol_data/dbpedia/gen_desc_data.py b/backend/tol_data/dbpedia/gen_desc_data.py new file mode 100755 index 0000000..50418e0 --- /dev/null +++ b/backend/tol_data/dbpedia/gen_desc_data.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 + +""" +Adds DBpedia labels/types/abstracts/etc data into a database +""" + +# In testing, this script took a few hours to run, and generated about 10GB + +import re +import bz2, sqlite3 + +LABELS_FILE = 'labels_lang=en.ttl.bz2' # Had about 16e6 entries +IDS_FILE = 'page_lang=en_ids.ttl.bz2' +REDIRECTS_FILE = 'redirects_lang=en_transitive.ttl.bz2' +DISAMBIG_FILE = 'disambiguations_lang=en.ttl.bz2' +TYPES_FILE = 'instance-types_lang=en_specific.ttl.bz2' +ABSTRACTS_FILE = 'short-abstracts_lang=en.ttl.bz2' +DB_FILE = 'desc_data.db' + +def genData( + labelsFile: str, idsFile: str, redirectsFile: str, disambigFile: str, + typesFile: str, abstractsFile: str, dbFile: str) -> None: + """ Reads the files and writes to db """ + print('Creating database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + # + print('Reading/storing label data') + dbCur.execute('CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)') + dbCur.execute('CREATE INDEX labels_idx ON labels(label)') + dbCur.execute('CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)') + labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') + with bz2.open(labelsFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = labelLineRegex.fullmatch(line) + if 
match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO labels VALUES (?, ?)', (match.group(1), match.group(2))) + # + print('Reading/storing wiki page ids') + dbCur.execute('CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)') + dbCur.execute('CREATE INDEX ids_idx ON ids(id)') + idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') + with bz2.open(idsFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = idLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + try: + dbCur.execute('INSERT INTO ids VALUES (?, ?)', (match.group(1), int(match.group(2)))) + except sqlite3.IntegrityError as e: + # Accounts for certain lines that have the same IRI + print(f'WARNING: Failed to add entry with IRI "{match.group(1)}": {e}') + # + print('Reading/storing redirection data') + dbCur.execute('CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)') + redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') + with bz2.open(redirectsFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = redirLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (match.group(1), match.group(2))) + # + print('Reading/storing diambiguation-page data') + dbCur.execute('CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)') + disambigLineRegex = redirLineRegex + with bz2.open(disambigFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = disambigLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT OR IGNORE INTO disambiguations VALUES (?)', 
(match.group(1),)) + # + print('Reading/storing instance-type data') + dbCur.execute('CREATE TABLE types (iri TEXT, type TEXT)') + dbCur.execute('CREATE INDEX types_iri_idx ON types(iri)') + typeLineRegex = redirLineRegex + with bz2.open(typesFile, mode='rt') as file: + for lineNum, line in enumerate(file, 1): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + match = typeLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO types VALUES (?, ?)', (match.group(1), match.group(2))) + # + print('Reading/storing abstracts') + dbCur.execute('CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)') + descLineRegex = labelLineRegex + with bz2.open(abstractsFile, mode='rt') as file: + for lineNum, line in enumerate(file): + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + if line[0] == '#': + continue + match = descLineRegex.fullmatch(line) + if match is None: + raise Exception(f'ERROR: Line {lineNum} has unexpected format') + dbCur.execute('INSERT INTO abstracts VALUES (?, ?)', + (match.group(1), match.group(2).replace(r'\"', '"'))) + # + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(LABELS_FILE, IDS_FILE, REDIRECTS_FILE, DISAMBIG_FILE, TYPES_FILE, ABSTRACTS_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/README.md b/backend/tol_data/enwiki/README.md new file mode 100644 index 0000000..ba1de33 --- /dev/null +++ b/backend/tol_data/enwiki/README.md @@ -0,0 +1,63 @@ +This directory holds files obtained/derived from [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). + +# Downloaded Files +- `enwiki-20220501-pages-articles-multistream.xml.bz2`
+ Contains text content and metadata for pages in enwiki. + Obtained via (site suggests downloading from a mirror). + Some file content and format information was available from + . +- `enwiki-20220501-pages-articles-multistream-index.txt.bz2`
+ Obtained like above. Holds lines of the form offset1:pageId1:title1, + providing, for each page, an offset into the dump file of a chunk of + 100 pages that includes it. + +# Dump-Index Files +- `gen_dump_index_db.py`
+ Creates a database version of the enwiki-dump index file. +- `dumpIndex.db`
+ Generated by `gen_dump_index_db.py`.
+ Tables:
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` + +# Description Database Files +- `gen_desc_data.py`
+ Reads through pages in the dump file, and adds short-description info to a database. +- `desc_data.db`
+ Generated by `gen_desc_data.py`.
+ Tables:
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` + +# Image Database Files +- `gen_img_data.py`
+ Used to find infobox image names for page IDs, storing them into a database. +- `download_img_license_info.py`
+ Used to download licensing metadata for image names, via wikipedia's online API, storing them into a database. +- `img_data.db`
+ Used to hold metadata about infobox images for a set of pageIDs. + Generated using `gen_img_data.py` and `download_img_license_info.py`.
+ Tables:
+ - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT`
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. + - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. +- `download_imgs.py`
+ Used to download image files into imgs/. + +# Page View Files +- `pageviews/pageviews-*-user.bz2` + Each holds wikimedia article page view data for some month. + Obtained via . + Some format info was available from . +- `gen_pageview_data.py`
+ Reads pageviews/*, and creates a database holding average monthly pageview counts. +- `pageview_data.db`
+ Generated using `gen_pageview_data.py`.
+ Tables:
+ - `views`: `title TEXT PRIMARY KEY, id INT, views INT` + +# Other Files +- `lookup_page.py`
+ Running `lookup_page.py title1` looks in the dump for a page with a given title, + and prints the contents to stdout. Uses dumpIndex.db. diff --git a/backend/tol_data/enwiki/__init__.py b/backend/tol_data/enwiki/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tol_data/enwiki/download_img_license_info.py b/backend/tol_data/enwiki/download_img_license_info.py new file mode 100755 index 0000000..0a809ac --- /dev/null +++ b/backend/tol_data/enwiki/download_img_license_info.py @@ -0,0 +1,154 @@ +#!/usr/bin/python3 + +""" +Reads image names from a database, and uses enwiki's online API to obtain +licensing information for them, adding the info to the database. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +at already-processed names to decide what to skip. +""" + +import re +import sqlite3, urllib.parse, html +import requests +import time, signal + +IMG_DB = 'img_data.db' +# +API_URL = 'https://en.wikipedia.org/w/api.php' +USER_AGENT = 'terryt.dev (terry06890@gmail.com)' +BATCH_SZ = 50 # Max 50 +TAG_REGEX = re.compile(r'<[^<]+>') +WHITESPACE_REGEX = re.compile(r'\s+') + +def downloadInfo(imgDb: str) -> None: + print('Opening database') + dbCon = sqlite3.connect(imgDb) + dbCur = dbCon.cursor() + print('Checking for table') + if dbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="imgs"').fetchone() is None: + dbCur.execute('CREATE TABLE imgs (' \ + 'name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT)') + # + print('Reading image names') + imgNames: set[str] = set() + for (imgName,) in dbCur.execute('SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL'): + imgNames.add(imgName) + print(f'Found {len(imgNames)}') + # + print('Checking for already-processed images') + oldSz = len(imgNames) + for (imgName,) in dbCur.execute('SELECT name FROM imgs'): + imgNames.discard(imgName) + 
print(f'Found {oldSz - len(imgNames)}') + # + # Set SIGINT handler + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # + print('Iterating through image names') + imgNameList = list(imgNames) + iterNum = 0 + for i in range(0, len(imgNameList), BATCH_SZ): + iterNum += 1 + if iterNum % 1 == 0: + print(f'At iteration {iterNum} (after {(iterNum - 1) * BATCH_SZ} images)') + if interrupted: + print(f'Exiting loop at iteration {iterNum}') + break + # Get batch + imgBatch = imgNameList[i:i+BATCH_SZ] + imgBatch = ['File:' + x for x in imgBatch] + # Make request + headers = { + 'user-agent': USER_AGENT, + 'accept-encoding': 'gzip', + } + params = { + 'action': 'query', + 'format': 'json', + 'prop': 'imageinfo', + 'iiprop': 'extmetadata|url', + 'maxlag': '5', + 'titles': '|'.join(imgBatch), + 'iiextmetadatafilter': 'Artist|Credit|LicenseShortName|Restrictions', + } + responseObj = None + try: + response = requests.get(API_URL, params=params, headers=headers) + responseObj = response.json() + except Exception as e: + print(f'ERROR: Exception while downloading info: {e}') + print('\tImage batch: ' + '|'.join(imgBatch)) + continue + # Parse response-object + if 'query' not in responseObj or 'pages' not in responseObj['query']: + print('WARNING: Response object for doesn\'t have page data') + print('\tImage batch: ' + '|'.join(imgBatch)) + if 'error' in responseObj: + errorCode = responseObj['error']['code'] + print(f'\tError code: {errorCode}') + if errorCode == 'maxlag': + time.sleep(5) + continue + pages = responseObj['query']['pages'] + normalisedToInput: dict[str, str] = {} + if 'normalized' in responseObj['query']: + for entry in responseObj['query']['normalized']: + normalisedToInput[entry['to']] = entry['from'] + for page in pages.values(): + # Some fields // More info at 
https://www.mediawiki.org/wiki/Extension:CommonsMetadata#Returned_data + # LicenseShortName: short human-readable license name, apparently more reliable than 'License', + # Artist: author name (might contain complex html, multiple authors, etc) + # Credit: 'source' + # For image-map-like images, can be quite large/complex html, creditng each sub-image + # May be text2, where the text2 might be non-indicative + # Restrictions: specifies non-copyright legal restrictions + title: str = page['title'] + if title in normalisedToInput: + title = normalisedToInput[title] + title = title[5:] # Remove 'File:' + if title not in imgNames: + print(f'WARNING: Got title "{title}" not in image-name list') + continue + if 'imageinfo' not in page: + print(f'WARNING: No imageinfo section for page "{title}"') + continue + metadata = page['imageinfo'][0]['extmetadata'] + url: str = page['imageinfo'][0]['url'] + license: str | None = metadata['LicenseShortName']['value'] if 'LicenseShortName' in metadata else None + artist: str | None = metadata['Artist']['value'] if 'Artist' in metadata else None + credit: str | None = metadata['Credit']['value'] if 'Credit' in metadata else None + restrictions: str | None = metadata['Restrictions']['value'] if 'Restrictions' in metadata else None + # Remove markup + if artist is not None: + artist = TAG_REGEX.sub(' ', artist).strip() + artist = WHITESPACE_REGEX.sub(' ', artist) + artist = html.unescape(artist) + artist = urllib.parse.unquote(artist) + if credit is not None: + credit = TAG_REGEX.sub(' ', credit).strip() + credit = WHITESPACE_REGEX.sub(' ', credit) + credit = html.unescape(credit) + credit = urllib.parse.unquote(credit) + # Add to db + print((title, license, artist, credit, restrictions, url)) + dbCur.execute('INSERT INTO imgs VALUES (?, ?, ?, ?, ?, ?)', + (title, license, artist, credit, restrictions, url)) + # + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = 
argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + downloadInfo(IMG_DB) diff --git a/backend/tol_data/enwiki/download_imgs.py b/backend/tol_data/enwiki/download_imgs.py new file mode 100755 index 0000000..ba874e1 --- /dev/null +++ b/backend/tol_data/enwiki/download_imgs.py @@ -0,0 +1,99 @@ +#!/usr/bin/python3 + +""" +Downloads images from URLs in an image database, into an output directory, +with names of the form 'pageId1.ext1'. + +SIGINT causes the program to finish an ongoing download and exit. +The program can be re-run to continue downloading, and looks +in the output directory do decide what to skip. +""" + +# In testing, this downloaded about 100k images, over several days + +import re, os +import sqlite3 +import urllib.parse, requests +import time, signal + +IMG_DB = 'img_data.db' # About 130k image names +OUT_DIR = 'imgs' +# +LICENSE_REGEX = re.compile(r'cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?', flags=re.IGNORECASE) +USER_AGENT = 'terryt.dev (terry06890@gmail.com)' +TIMEOUT = 1 + # https://en.wikipedia.org/wiki/Wikipedia:Database_download says to 'throttle to 1 cache miss per sec' + # It's unclear how to properly check for cache misses, so we just aim for 1 per sec + +def downloadImgs(imgDb: str, outDir: str, timeout: int) -> None: + if not os.path.exists(outDir): + os.mkdir(outDir) + print('Checking for already-downloaded images') + fileList = os.listdir(outDir) + pageIdsDone: set[int] = set() + for filename in fileList: + pageIdsDone.add(int(os.path.splitext(filename)[0])) + print(f'Found {len(pageIdsDone)}') + # + # Set SIGINT handler + interrupted = False + oldHandler = None + def onSigint(sig, frame): + nonlocal interrupted + interrupted = True + signal.signal(signal.SIGINT, oldHandler) + oldHandler = signal.signal(signal.SIGINT, onSigint) + # + print('Opening database') + dbCon = sqlite3.connect(imgDb) + dbCur = dbCon.cursor() + print('Starting downloads') + 
iterNum = 0 + query = 'SELECT page_id, license, artist, credit, restrictions, url FROM' \ + ' imgs INNER JOIN page_imgs ON imgs.name = page_imgs.img_name' + for pageId, license, artist, credit, restrictions, url in dbCur.execute(query): + if pageId in pageIdsDone: + continue + if interrupted: + print('Exiting loop') + break + # Check for problematic attributes + if license is None or LICENSE_REGEX.fullmatch(license) is None: + continue + if artist is None or artist == '' or len(artist) > 100 or re.match(r'(\d\. )?File:', artist) is not None: + continue + if credit is None or len(credit) > 300 or re.match(r'File:', credit) is not None: + continue + if restrictions is not None and restrictions != '': + continue + # Download image + iterNum += 1 + print(f'Iteration {iterNum}: Downloading for page-id {pageId}') + urlParts = urllib.parse.urlparse(url) + extension = os.path.splitext(urlParts.path)[1] + if len(extension) <= 1: + print(f'WARNING: No filename extension found in URL {url}') + continue + outFile = os.path.join(outDir, f'{pageId}{extension}') + print(outFile) + headers = { + 'user-agent': USER_AGENT, + 'accept-encoding': 'gzip', + } + try: + response = requests.get(url, headers=headers) + with open(outFile, 'wb') as file: + file.write(response.content) + time.sleep(timeout) + except Exception as e: + print(f'Error while downloading to {outFile}: {e}') + return + print('Closing database') + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + downloadImgs(IMG_DB, OUT_DIR, TIMEOUT) diff --git a/backend/tol_data/enwiki/gen_desc_data.py b/backend/tol_data/enwiki/gen_desc_data.py new file mode 100755 index 0000000..0dca16b --- /dev/null +++ b/backend/tol_data/enwiki/gen_desc_data.py @@ -0,0 +1,126 @@ +#!/usr/bin/python3 + +""" +Reads through the wiki dump, and attempts to parse short-descriptions, +and add them to a 
database +""" + +# In testing, this script took over 10 hours to run, and generated about 5GB + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell +import sqlite3 + +DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' # Had about 22e6 pages +DB_FILE = 'desc_data.db' + +DESC_LINE_REGEX = re.compile('^ *[A-Z\'"]') +EMBEDDED_HTML_REGEX = re.compile(r'<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$') + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag +CONVERT_TEMPLATE_REGEX = re.compile(r'{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}') +def convertTemplateReplace(match): + """ Used in regex-substitution with CONVERT_TEMPLATE_REGEX """ + if match.group(2) is None: + return f'{match.group(1)} {match.group(4)}' + else: + return f'{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}' +PARENS_GROUP_REGEX = re.compile(r' \([^()]*\)') +LEFTOVER_BRACE_REGEX = re.compile(r'(?:{\||{{).*') + +def genData(dumpFile: str, dbFile: str) -> None: + print('Creating database') + if os.path.exists(dbFile): + raise Exception(f'ERROR: Existing {dbFile}') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)') + dbCur.execute('CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)') + dbCur.execute('CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)') + dbCur.execute('CREATE INDEX redirects_idx ON redirects(target)') + dbCur.execute('CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)') + # + print('Iterating through dump file') + with bz2.open(dumpFile, mode='rt') as file: + for pageNum, page in enumerate(mwxml.Dump.from_file(file), 1): + if pageNum % 1e4 == 0: + print(f'At page {pageNum}') + # Parse page + if page.namespace == 0: + try: + dbCur.execute('INSERT INTO pages VALUES (?, ?)', (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts 
for certain pages that have the same title + print(f'Failed to add page with title "{page.title}": {e}', file=sys.stderr) + continue + if page.redirect is not None: + dbCur.execute('INSERT INTO redirects VALUES (?, ?)', (page.id, convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc is not None: + dbCur.execute('INSERT INTO descs VALUES (?, ?)', (page.id, desc)) + # + print('Closing database') + dbCon.commit() + dbCon.close() +def parseDesc(text: str) -> str | None: + # Find first matching line outside {{...}}, [[...]], and block-html-comment constructs, + # and then accumulate lines until a blank one. + # Some cases not accounted for include: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines: list[str] = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if not lines: + if line: + if openBraceCount > 0 or line[0] == '{': + openBraceCount += line.count('{') + openBraceCount -= line.count('}') + skip = True + if openBracketCount > 0 or line[0] == '[': + openBracketCount += line.count('[') + openBracketCount -= line.count(']') + skip = True + if inComment or line.find('') != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + if skip: + skip = False + continue + if line[-1] == ':': # Seems to help avoid disambiguation pages + return None + if DESC_LINE_REGEX.match(line) is not None: + lines.append(line) + else: + if not line: + return removeMarkup(' '.join(lines)) + lines.append(line) + if lines: + return removeMarkup(' '.join(lines)) + return None +def removeMarkup(content: str) -> str: + content = EMBEDDED_HTML_REGEX.sub('', content) + content = CONVERT_TEMPLATE_REGEX.sub(convertTemplateReplace, content) + content = 
mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = PARENS_GROUP_REGEX.sub('', content) + content = LEFTOVER_BRACE_REGEX.sub('', content) + return content +def convertTitle(title: str) -> str: + return html.unescape(title).replace('_', ' ') + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(DUMP_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_dump_index_db.py b/backend/tol_data/enwiki/gen_dump_index_db.py new file mode 100755 index 0000000..5f21c9b --- /dev/null +++ b/backend/tol_data/enwiki/gen_dump_index_db.py @@ -0,0 +1,60 @@ +#!/usr/bin/python3 + +""" +Adds data from the wiki dump index-file into a database +""" +import sys, os, re +import bz2 +import sqlite3 + +INDEX_FILE = 'enwiki-20220501-pages-articles-multistream-index.txt.bz2' # Had about 22e6 lines +DB_FILE = 'dumpIndex.db' + +def genData(indexFile: str, dbFile: str) -> None: + """ Reads the index file and creates the db """ + if os.path.exists(dbFile): + raise Exception(f'ERROR: Existing {dbFile}') + print('Creating database') + dbCon = sqlite3.connect(dbFile) + dbCur = dbCon.cursor() + dbCur.execute('CREATE TABLE offsets (title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT)') + print('Iterating through index file') + lineRegex = re.compile(r'([^:]+):([^:]+):(.*)') + lastOffset = 0 + lineNum = 0 + entriesToAdd: list[tuple[str, str]] = [] + with bz2.open(indexFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f'At line {lineNum}') + # + match = lineRegex.fullmatch(line.rstrip()) + assert match is not None + offsetStr, pageId, title = match.group(1,2,3) + offset = int(offsetStr) + if offset > lastOffset: + for t, p in entriesToAdd: + try: + dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (t, int(p), lastOffset, offset)) + except sqlite3.IntegrityError as e: + # 
Accounts for certain entries in the file that have the same title + print(f'Failed on title "{t}": {e}', file=sys.stderr) + entriesToAdd = [] + lastOffset = offset + entriesToAdd.append((title, pageId)) + for title, pageId in entriesToAdd: + try: + dbCur.execute('INSERT INTO offsets VALUES (?, ?, ?, ?)', (title, int(pageId), lastOffset, -1)) + except sqlite3.IntegrityError as e: + print(f'Failed on title "{t}": {e}', file=sys.stderr) + print('Closing database') + dbCon.commit() + dbCon.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.parse_args() + # + genData(INDEX_FILE, DB_FILE) diff --git a/backend/tol_data/enwiki/gen_img_data.py b/backend/tol_data/enwiki/gen_img_data.py new file mode 100755 index 0000000..d4696f0 --- /dev/null +++ b/backend/tol_data/enwiki/gen_img_data.py @@ -0,0 +1,193 @@ +#!/usr/bin/python3 + +""" +For some set of page IDs, looks up their content in the wiki dump, +and tries to parse infobox image names, storing them into a database. + +The program can be re-run with an updated set of page IDs, and +will skip already-processed page IDs. 
+""" + +import re +import os, bz2, html, urllib.parse +import sqlite3 + +DUMP_FILE = 'enwiki-20220501-pages-articles-multistream.xml.bz2' +INDEX_DB = 'dumpIndex.db' +IMG_DB = 'img_data.db' # The database to create +DB_FILE = os.path.join('..', 'data.db') +# +ID_LINE_REGEX = re.compile(r'(.*)') +IMG_LINE_REGEX = re.compile(r'.*\| *image *= *([^|]*)') +BRACKET_IMG_REGEX = re.compile(r'\[\[(File:[^|]*).*]]') +IMG_NAME_REGEX = re.compile(r'.*\.(jpg|jpeg|png|gif|tiff|tif)', flags=re.IGNORECASE) +CSS_IMG_CROP_REGEX = re.compile(r'{{css image crop\|image *= *(.*)', flags=re.IGNORECASE) + +def genData(pageIds: set[int], dumpFile: str, indexDb: str, imgDb: str) -> None: + print('Opening databases') + indexDbCon = sqlite3.connect(indexDb) + indexDbCur = indexDbCon.cursor() + imgDbCon = sqlite3.connect(imgDb) + imgDbCur = imgDbCon.cursor() + print('Checking tables') + if imgDbCur.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="page_imgs"').fetchone() is None: + # Create tables if not present + imgDbCur.execute('CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)') # img_name may be NULL + imgDbCur.execute('CREATE INDEX page_imgs_idx ON page_imgs(img_name)') + else: + # Check for already-processed page IDs + numSkipped = 0 + for (pid,) in imgDbCur.execute('SELECT page_id FROM page_imgs'): + if pid in pageIds: + pageIds.remove(pid) + numSkipped += 1 + else: + print(f'Found already-processed page ID {pid} which was not in input set') + print(f'Will skip {numSkipped} already-processed page IDs') + # + print('Getting dump-file offsets') + offsetToPageids: dict[int, list[int]] = {} + offsetToEnd: dict[int, int] = {} # Maps chunk-start offsets to their chunk-end offsets + iterNum = 0 + for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f'At iteration {iterNum}') + # + query = 'SELECT offset, next_offset FROM offsets WHERE id = ?' 
+ row: tuple[int, int] | None = indexDbCur.execute(query, (pageId,)).fetchone() + if row is None: + print(f'WARNING: Page ID {pageId} not found') + continue + chunkOffset, endOffset = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) + print(f'Found {len(offsetToEnd)} chunks to check') + # + print('Iterating through chunks in dump file') + with open(dumpFile, mode='rb') as file: + iterNum = 0 + for pageOffset, endOffset in offsetToEnd.items(): + iterNum += 1 + if iterNum % 100 == 0: + print(f'At iteration {iterNum}') + # + chunkPageIds = offsetToPageids[pageOffset] + # Jump to chunk + file.seek(pageOffset) + compressedData = file.read(None if endOffset == -1 else endOffset - pageOffset) + data = bz2.BZ2Decompressor().decompress(compressedData).decode() + # Look in chunk for pages + lines = data.splitlines() + lineIdx = 0 + while lineIdx < len(lines): + # Look for + if lines[lineIdx].lstrip() != '': + lineIdx += 1 + continue + # Check page id + lineIdx += 3 + idLine = lines[lineIdx].lstrip() + match = ID_LINE_REGEX.fullmatch(idLine) + if match is None or int(match.group(1)) not in chunkPageIds: + lineIdx += 1 + continue + pageId = int(match.group(1)) + lineIdx += 1 + # Look for in + foundText = False + while lineIdx < len(lines): + if not lines[lineIdx].lstrip().startswith('') + 1:]) + lineIdx += 1 + foundTextEnd = False + while lineIdx < len(lines): + line = lines[lineIdx] + if not line.endswith(''): + content.append(line) + lineIdx += 1 + continue + foundTextEnd = True + content.append(line[:line.rfind('')]) + # Look for image-filename + imageName = getImageName(content) + imgDbCur.execute('INSERT into page_imgs VALUES (?, ?)', (pageId, imageName)) + break + if not foundTextEnd: + print(f'WARNING: Did not find for page id {pageId}') + break + if not foundText: + print(f'WARNING: Did not find for page id {pageId}') + # + print('Closing databases') + 
indexDbCon.close() + imgDbCon.commit() + imgDbCon.close() +def getImageName(content: list[str]) -> str | None: + """ Given an array of text-content lines, tries to return an infoxbox image name, or None """ + # Doesn't try and find images in outside-infobox [[File:...]] and sections + for line in content: + match = IMG_LINE_REGEX.match(line) + if match is not None: + imageName = match.group(1).strip() + if imageName == '': + return None + imageName = html.unescape(imageName) + # Account for {{... + if imageName.startswith('{'): + match = CSS_IMG_CROP_REGEX.match(imageName) + if match is None: + return None + imageName = match.group(1) + # Account for [[File:...|...]] + if imageName.startswith('['): + match = BRACKET_IMG_REGEX.match(imageName) + if match is None: + return None + imageName = match.group(1) + # Account for