From e78c4df403e5f98afa08f7a0841ff233d5f6d05b Mon Sep 17 00:00:00 2001 From: Terry Truong Date: Wed, 22 Jun 2022 01:42:41 +1000 Subject: Update backend READMEs, rename some files for consistency --- .gitignore | 31 ++-- README.md | 8 +- backend/README.md | 4 + backend/data/README.md | 232 +++++++++++++------------- backend/data/dbpedia/README.md | 45 ++--- backend/data/dbpedia/genData.py | 146 ---------------- backend/data/dbpedia/genDescData.py | 146 ++++++++++++++++ backend/data/enwiki/README.md | 73 ++++---- backend/data/enwiki/downloadEnwikiImgs.py | 2 +- backend/data/enwiki/downloadImgLicenseInfo.py | 2 +- backend/data/enwiki/genData.py | 122 -------------- backend/data/enwiki/genDescData.py | 122 ++++++++++++++ backend/data/enwiki/genImgData.py | 178 ++++++++++++++++++++ backend/data/enwiki/getEnwikiImgData.py | 178 -------------------- backend/data/eol/README.md | 33 ++-- backend/data/eol/reviewImgs.py | 2 +- backend/data/genDbpData.py | 6 +- backend/data/genEnwikiDescData.py | 6 +- backend/data/genEnwikiNameData.py | 2 +- backend/data/genEolNameData.py | 4 +- backend/data/genImgs.py | 179 ++++++++++++++++++++ backend/data/genImgsForWeb.py | 179 -------------------- backend/data/genOtolData.py | 15 +- backend/data/genReducedTreeData.py | 2 +- backend/data/otol/README.md | 14 +- backend/data/pickedImgs/README.md | 16 +- backend/data/reducedTol/README.md | 4 - backend/data/reviewImgsToGen.py | 217 ++++++++++++++++++++++++ backend/data/reviewImgsToMerge.py | 217 ------------------------ 29 files changed, 1111 insertions(+), 1074 deletions(-) create mode 100644 backend/README.md delete mode 100755 backend/data/dbpedia/genData.py create mode 100755 backend/data/dbpedia/genDescData.py delete mode 100755 backend/data/enwiki/genData.py create mode 100755 backend/data/enwiki/genDescData.py create mode 100755 backend/data/enwiki/genImgData.py delete mode 100755 backend/data/enwiki/getEnwikiImgData.py create mode 100755 backend/data/genImgs.py delete mode 100755 
backend/data/genImgsForWeb.py delete mode 100644 backend/data/reducedTol/README.md create mode 100755 backend/data/reviewImgsToGen.py delete mode 100755 backend/data/reviewImgsToMerge.py diff --git a/.gitignore b/.gitignore index 96d0644..c35bd98 100644 --- a/.gitignore +++ b/.gitignore @@ -3,31 +3,32 @@ /node_modules/ /dist/ /public/img/ + +# Backend files +/backend/data/data.db /backend/data/otol/*.tgz /backend/data/otol/*.json /backend/data/otol/*.tre -/backend/data/data.db /backend/data/eol/*.tgz /backend/data/eol/*.csv -/backend/data/eol/*.db /backend/data/eol/imagesList/ +/backend/data/eol/*.db /backend/data/eol/imgsForReview/ -/backend/data/eol/imgsReviewed/ -/backend/data/img/ +/backend/data/eol/imgs/ +/backend/data/dbpedia/*.bz2 +/backend/data/dbpedia/*.db /backend/data/enwiki/*.bz2 /backend/data/enwiki/*.db /backend/data/enwiki/imgs/ /backend/data/enwiki/.venv/ -/backend/data/dbpedia/*.bz2 -/backend/data/dbpedia/*.db -/backend/data/genOtolNamesToKeep.txt -/backend/data/genOtolDataPickedDups.txt -/backend/data/genEolNameDataPickedIds.txt -/backend/data/genEolNameDataBadAlts.txt -/backend/data/genDescNamesToSkip.txt -/backend/data/dbpPickedLabels.txt -/backend/data/enwikiPickedLabels.txt -/backend/data/mergedImgList.txt +/backend/data/imgList.txt /backend/data/pickedImgs/ -/backend/data/reducedTol/names.txt +/backend/data/img/ +/backend/data/pickedOtolNames.txt +/backend/data/pickedEolIds.txt +/backend/data/pickedEolAltsToSkip.txt +/backend/data/pickedEnwikiNamesToSkip.txt +/backend/data/pickedDbpLabels.txt +/backend/data/pickedEnwikiLabels.txt +/backend/data/pickedReducedNodes.txt /backend/data/pickedNames.txt diff --git a/README.md b/README.md index a860b3f..46e044d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# Grid of Life +Grid of Life +============ An interactive visualisation of the biological tree of life. @@ -8,3 +9,8 @@ Each tile represents a group of organisms with a common ancestor. 
- Clicking on an expanded tile collapses it back into one tile. - Double-clicking on a tile expands it to fill the whole view. Other tiles will be moved to the side. + +Files +===== +- backend/: Contains code for running the server, and generating tree-of-life data +- diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..331e7f4 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,4 @@ +Files +===== +- server.py: Runs the server +- data/: For generating the server's tree-of-life database diff --git a/backend/data/README.md b/backend/data/README.md index d4a6196..7d1adad 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -1,115 +1,121 @@ -File Generation Process -======================= -1 Tree Structure Data - 1 Obtain data in otol/, as specified in it's README. - 2 Run genOtolData.py, which creates data.db, and adds - 'nodes' and 'edges' tables using data in otol/*, as well as - genOtolNamesToKeep.txt, if present. -2 Name Data for Search - 1 Obtain data in eol/, as specified in it's README. - 2 Run genEolNameData.py, which adds 'names' and 'eol_ids' tables to data.db, - using data in eol/vernacularNames.csv and the 'nodes' table, and possibly - genEolNameDataPickedIds.txt. -3 Node Description Data - 1 Obtain data in dbpedia/ and enwiki/, as specified in their README files. - 2 Run genDbpData.py, which adds 'wiki_ids' and 'descs' tables to data.db, - using data in dbpedia/dbpData.db, the 'nodes' table, and possibly - genDescNamesToSkip.txt and dbpPickedLabels.txt. - 3 Run genEnwikiDescData.py, which adds to the 'wiki_ids' and 'descs' tables, - using data in enwiki/enwikiData.db, and the 'nodes' table. - Also uses genDescNamesToSkip.txt and genEnwikiDescTitlesToUse.txt for - skipping/resolving some name-page associations. -4 Image Data - 1 In eol/, run downloadImgs.py to download EOL images into eol/imgsForReview/. - It uses data in eol/imagesList.db, and the 'eol_ids' table. 
- 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique - images in eol/imgsReviewed/ (uses 'names' and 'eol_ids' to display extra info). - 3 In enwiki/, run getEnwikiImgData.py, which generates a list of - tol-node images, and creates enwiki/enwikiImgs.db to store it. - Uses the 'wiki_ids' table to get tol-node wiki-ids. - 4 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing - information for images listed in enwiki/enwikiImgs.db, and stores - it in that db. - 5 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' - images in listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/. - 6 Run reviewImgsToMerge.py, which displays images from eol/ and enwiki/, - and enables choosing, for each tol-node, which image should be used, if any, - and outputs choice information into mergedImgList.txt. Uses the 'nodes', - 'eol_ids', and 'wiki_ids' tables (as well as 'names' for info-display). - 7 Run genImgsForWeb.py, which creates cropped/resized images in img/, - using mergedImgList.txt, and possibly pickedImgs/, and adds 'images' and - 'node_imgs' tables to data.db.
- Smartcrop's outputs might need to be manually created/adjusted:
- - An input image might have no output produced, possibly due to - data incompatibilities, memory limits, etc. A few input image files - might actually be html files, containing a 'file not found' page. - - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. - - An input image might produce output with unexpected dimensions. - This seems to happen when the image is very large, and triggers a - decompression bomb warning. - The result might have as many as 150k images, with about 2/3 of them - being from wikipedia. - 8 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, - which uses 'nodes', 'edges', 'eol_ids', and 'node_imgs', to associate - nodes without images to child images. -5 Reduced Tree Structure Data - 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to - data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. -6 Other - - Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table, - using data in enwiki/enwikiData.db, and the 'names' and 'wiki_ids' tables. - - Optionally run addPickedNames.py, which adds manually-picked names to - the 'names' table, as specified in pickedNames.txt. - - Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, - for the sake of performance and result-relevance. Without this, jumping to certain - nodes within the fungi and moths can take over a minute to render. +This directory holds files used to generate data.db, which contains tree-of-life data. 
-data.db Tables -============== -- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT -- edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) -- eol\_ids: id INT PRIMARY KEY, name TEXT -- names: name TEXT, alt\_name TEXT, pref\_alt INT, src TEXT, PRIMARY KEY(name, alt\_name) -- wiki\_ids: name TEXT PRIMARY KEY, id INT, redirected INT -- descs: wiki\_id INT PRIMARY KEY, desc TEXT, from\_dbp INT -- node\_imgs: name TEXT PRIMARY KEY, img\_id INT, src TEXT -- images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) -- linked\_imgs: name TEXT PRIMARY KEY, otol\_ids TEXT -- r\_nodes: name TEXT PRIMARY KEY, tips INT -- r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) +# Tables: +- `nodes`: `name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT` +- `edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` +- `eol_ids`: `id INT PRIMARY KEY, name TEXT` +- `names`: `name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name)` +- `wiki_ids`: `name TEXT PRIMARY KEY, id INT, redirected INT` +- `descs`: `wiki_id INT PRIMARY KEY, desc TEXT, from_dbp INT` +- `node_imgs`: `name TEXT PRIMARY KEY, img_id INT, src TEXT` +- `images`: `id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src)` +- `linked_imgs`: `name TEXT PRIMARY KEY, otol_ids TEXT` +- `r_nodes`: `name TEXT PRIMARY KEY, tips INT` +- `r_edges`: `node TEXT, child TEXT, p_support INT, PRIMARY KEY (node, child)` -Other Files -=========== -- dbpPickedLabels.txt
- Contains DBpedia labels, one per line. Used by genDbpData.py to help - resolve conflicts when associating tree-of-life node names with - DBpedia node labels. -- genOtolNamesToKeep.txt
- Contains names to avoid trimming off the tree data generated by - genOtolData.py. Usage is optional, but, without it, a large amount - of possibly-significant nodes are removed, using a short-sighted - heuristic.
- One way to generate this list is to generate the files as usual, - then get node names that have an associated image, description, or - presence in r_nodes. Then run the genOtolData.py and genEolNameData.py - scripts again (after deleting their created tables). -- genEnwikiDescNamesToSkip.txt
- Contains names for nodes that genEnwikiNameData.py should skip adding - a description for. Usage is optional, but without it, some nodes will - probably get descriptions that don't match (eg: the bee genus Osiris - might be described as an egyptian god).
- This file was generated by running genEnwikiNameData.py, then listing - the names that it added into a file, along with descriptions, and - manually removing those that seemed node-matching (got about 30k lines, - with about 1 in 30 descriptions non-matching). And, after creating - genEnwikiDescTitlesToUse.txt, names shared with that file were removed. -- genEnwikiDescTitlesToUse.txt
- Contains enwiki titles with the form 'name1 (category1)' for - genEnwikiNameData.py to use to resolve nodes matching name name1. - Usage is optional, but it adds some descriptions that would otherwise - be skipped.
- This file was generated by taking the content of genEnwikiNameData.py, - after the manual filtering step, then, for each name name,1 getting - page titles from dbpedia/dbpData.db that match 'name1 (category1)'. - This was followed by manually removing lines, keeping those that - seemed to match the corresponding node (used the app to help with this). +# Generating the Database + +For the most part, these steps should be done in order. + +As a warning, the whole process takes a lot of time and file space. The tree will probably +have about 2.5 billion nodes. Downloading the images will take several days, and occupy over +200 GB. And if you want good data, you'll need to do some manual review, which can take weeks. + +## Environment +The scripts are written in python and bash. +Some of the python scripts require third-party packages: +- jsonpickle: For encoding class objects as JSON. +- requests: For downloading data. +- PIL: For image processing. +- tkinter: For providing a basic GUI to review images. +- mwxml, mwparserfromhell: For parsing Wikipedia dumps. + +## Generate tree structure data +1. Obtain files in otol/, as specified in it's README. +2. Run genOtolData.py, which creates data.db, and adds the `nodes` and `edges` tables, + using data in otol/. It also uses these files, if they exist: + - pickedOtolNames.txt: Has lines of the form `name1|otolId1`. Some nodes in the + tree may have the same name (eg: Pholidota can refer to pangolins or orchids). + Normally, such nodes will get the names 'name1', 'name1 [2]', 'name1 [3], etc. + This file can be used to manually specify which node should be named 'name1'. + +## Generate node name data +1. Obtain 'name data files' in eol/, as specified in it's README. +2. Run genEolNameData.py, which adds the `names` and `eol_ids` tables, using data in + eol/ and the `nodes` table. It also uses these files, if they exist: + - pickedEolIds.txt: Has lines of the form `nodeName1|eolId1` or `nodeName1|`. 
+ Specifies node names that should have a particular EOL ID, or no ID. + Quite a few taxons have ambiguous names, and may need manual correction. + For example, Viola may resolve to a taxon of butterflies or of plants. + - pickedEolAltsToSkip.txt: Has lines of the form `nodeName1|altName1`. + Specifies that a node's alt-name set should exclude altName1. + +## Generate node description data +### Get data from DBpedia +1. Obtain files in dbpedia/, as specified in its README. +2. Run genDbpData.py, which adds the `wiki_ids` and `descs` tables, using data in + dbpedia/ and the `nodes` table. It also uses these files, if they exist: + - pickedEnwikiNamesToSkip.txt: Each line holds the name of a node for which + no description should be obtained. Many node names have a same-name + wikipedia page that describes something different (eg: Osiris). + - pickedDbpLabels.txt: Has lines of the form `nodeName1|label1`. + Specifies node names that should have a particular associated page label. +### Get data from Wikipedia +1. Obtain 'description database files' in enwiki/, as specified in its README. +2. Run genEnwikiDescData.py, which adds to the `wiki_ids` and `descs` tables, + using data in enwiki/ and the `nodes` table. + It also uses these files, if they exist: + - pickedEnwikiNamesToSkip.txt: Same as with genDbpData.py. + - pickedEnwikiLabels.txt: Similar to pickedDbpLabels.txt. + +## Generate image data +### Get images from EOL +1. Obtain 'image metadata files' in eol/, as specified in its README. +2. In eol/, run downloadImgs.py, which downloads images (possibly multiple per node), + into eol/imgsForReview, using data in eol/, as well as the `eol_ids` table. +3. In eol/, run reviewImgs.py, which interactively displays the downloaded images for + each node, providing the choice of which to use, moving them to eol/imgs/. + Uses `names` and `eol_ids` to display extra info. +### Get images from Wikipedia +1. 
In enwiki/, run genImgData.py, which looks for wikipedia image names for each node, + using the `wiki_ids` table, and stores them in a database. +2. In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing information for + those images, using wikipedia's online API. +3. In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' + images into enwiki/imgs/. +### Merge the image sets +1. Run reviewImgsToGen.py, which displays images from eol/imgs/ and enwiki/imgs/, + and enables choosing, for each node, which image should be used, if any, + and outputs choice information into imgList.txt. Uses the `nodes`, + `eol_ids`, and `wiki_ids` tables (as well as `names` to display extra info). +2. Run genImgs.py, which creates cropped/resized images in img/, from files listed in + imgList.txt and located in eol/ and enwiki/, and creates the `node_imgs` and + `images` tables. If pickedImgs/ is present, images within it are also used.
+ The outputs might need to be manually created/adjusted: + - An input image might have no output produced, possibly due to + data incompatibilities, memory limits, etc. A few input image files + might actually be html files, containing a 'file not found' page. + - An input x.gif might produce x-1.jpg, x-2.jpg, etc, instead of x.jpg. + - An input image might produce output with unexpected dimensions. + This seems to happen when the image is very large, and triggers a + decompression bomb warning. + The result might have as many as 150k images, with about 2/3 of them + being from wikipedia. +### Add more image associations +1. Run genLinkedImgs.py, which tries to associate nodes without images to + images of its children. Adds the `linked_imgs` table, and uses the + `nodes`, `edges`, and `node_imgs` tables. + +## Do some post-processing +1. Run genReducedTreeData.py, which generates a second, reduced version of the tree, + adding the `r_nodes` and `r_edges` tables, using `nodes` and `names`. Reads from + pickedReducedNodes.txt, which lists names of nodes that must be included (1 per line). +2. Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, + for the sake of performance and result-relevance. Otherwise, some nodes may have + over 10k children, which can take a while to render (over a minute in my testing). + You might want to back up the untrimmed tree first, as this operation is not easily + reversible. +3. Optionally run genEnwikiNameData.py, which adds more entries to the `names` table, + using data in enwiki/, and the `names` and `wiki_ids` tables. +4. Optionally run addPickedNames.py, which allows adding manually-selected name data to + the `names` table, as specified in pickedNames.txt. 
diff --git a/backend/data/dbpedia/README.md b/backend/data/dbpedia/README.md index 78e2a90..8a08f20 100644 --- a/backend/data/dbpedia/README.md +++ b/backend/data/dbpedia/README.md @@ -1,28 +1,29 @@ -Downloaded Files -================ -- labels\_lang=en.ttl.bz2
- Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core, - using the link . -- page\_lang=en\_ids.ttl.bz2
+This directory holds files obtained from/using [DBpedia](https://www.dbpedia.org). + +# Downloaded Files +- `labels_lang=en.ttl.bz2`
+ Obtained via https://databus.dbpedia.org/dbpedia/collections/latest-core. + Downloaded from . +- `page_lang=en_ids.ttl.bz2`
Downloaded from -- redirects\_lang=en\_transitive.ttl.bz2
+- `redirects_lang=en_transitive.ttl.bz2`
Downloaded from . -- disambiguations\_lang=en.ttl.bz2
+- `disambiguations_lang=en.ttl.bz2`
Downloaded from . -- instance-types\_lang=en\_specific.ttl.bz2
+- `instance-types_lang=en_specific.ttl.bz2`
Downloaded from . -- short-abstracts\_lang=en.ttl.bz2
+- `short-abstracts_lang=en.ttl.bz2`
Downloaded from . -Generated Files -=============== -- dbpData.db
- An sqlite database representing data from the ttl files. - Generated by running genData.py. - Tables - - labels: iri TEXT PRIMARY KEY, label TEXT - - ids: iri TEXT PRIMARY KEY, id INT - - redirects: iri TEXT PRIMARY KEY, target TEXT - - disambiguations: iri TEXT PRIMARY KEY - - types: iri TEXT, type TEXT - - abstracts: iri TEXT PRIMARY KEY, abstract TEXT +# Other Files +- genDescData.py
+ Used to generate a database representing data from the ttl files. +- descData.db
+ Generated by genDescData.py.
+ Tables:
+ - `labels`: `iri TEXT PRIMARY KEY, label TEXT ` + - `ids`: `iri TEXT PRIMARY KEY, id INT` + - `redirects`: `iri TEXT PRIMARY KEY, target TEXT` + - `disambiguations`: `iri TEXT PRIMARY KEY` + - `types`: `iri TEXT, type TEXT` + - `abstracts`: `iri TEXT PRIMARY KEY, abstract TEXT` diff --git a/backend/data/dbpedia/genData.py b/backend/data/dbpedia/genData.py deleted file mode 100755 index 41c48a8..0000000 --- a/backend/data/dbpedia/genData.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/python3 - -import sys, re -import bz2, sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n" -usageInfo += "and creates a sqlite db containing that data.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines -idsFile = "page_lang=en_ids.ttl.bz2" -redirectsFile = "redirects_lang=en_transitive.ttl.bz2" -disambigFile = "disambiguations_lang=en.ttl.bz2" -typesFile = "instance-types_lang=en_specific.ttl.bz2" -abstractsFile = "short-abstracts_lang=en.ttl.bz2" -dbFile = "dbpData.db" - -# Open db -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() -# Read/store labels -print("Reading/storing label data") -dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)") -dbCur.execute("CREATE INDEX labels_idx ON labels(label)") -dbCur.execute("CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)") -labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') -lineNum = 0 -with bz2.open(labelsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = labelLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store wiki page ids 
-print("Reading/storing wiki page ids") -dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") -idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') -lineNum = 0 -with bz2.open(idsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = idLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - try: - dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) - except sqlite3.IntegrityError as e: - # Accounts for certain lines that have the same IRI - print(f"Failed to add entry with IRI \"{match.group(1)}\": {e}") -dbCon.commit() -# Read/store redirects -print("Reading/storing redirection data") -dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY KEY, target TEXT)") -redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') -lineNum = 0 -with bz2.open(redirectsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = redirLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store diambiguation-page data -print("Reading/storing diambiguation-page data") -disambigNames = set() -disambigLineRegex = redirLineRegex -lineNum = 0 -with bz2.open(disambigFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = disambigLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - disambigNames.add(match.group(1)) -dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") -for name in disambigNames: - 
dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,)) -dbCon.commit() -# Read/store instance-type -print("Reading/storing instance-type data") -dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)") -dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)") -typeLineRegex = redirLineRegex -lineNum = 0 -with bz2.open(typesFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - match = typeLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) -dbCon.commit() -# Read/store abstracts -print("Reading/storing abstracts") -dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)") -descLineRegex = labelLineRegex -lineNum = 0 -with bz2.open(abstractsFile, mode='rt') as file: - for line in file: - lineNum += 1 - if lineNum % 1e5 == 0: - print(f"Processing line {lineNum}") - # - if line[0] == "#": - continue - match = descLineRegex.fullmatch(line) - if match == None: - print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) - sys.exit(1) - else: - dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", - (match.group(1), match.group(2).replace(r'\"', '"'))) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/dbpedia/genDescData.py b/backend/data/dbpedia/genDescData.py new file mode 100755 index 0000000..bba3ff5 --- /dev/null +++ b/backend/data/dbpedia/genDescData.py @@ -0,0 +1,146 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads DBpedia labels/types/abstracts/etc data,\n" +usageInfo += "and creates a sqlite db containing that data.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +labelsFile = "labels_lang=en.ttl.bz2" # Has about 16e6 lines +idsFile = 
"page_lang=en_ids.ttl.bz2" +redirectsFile = "redirects_lang=en_transitive.ttl.bz2" +disambigFile = "disambiguations_lang=en.ttl.bz2" +typesFile = "instance-types_lang=en_specific.ttl.bz2" +abstractsFile = "short-abstracts_lang=en.ttl.bz2" +dbFile = "descData.db" + +# Open db +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Read/store labels +print("Reading/storing label data") +dbCur.execute("CREATE TABLE labels (iri TEXT PRIMARY KEY, label TEXT)") +dbCur.execute("CREATE INDEX labels_idx ON labels(label)") +dbCur.execute("CREATE INDEX labels_idx_nc ON labels(label COLLATE NOCASE)") +labelLineRegex = re.compile(r'<([^>]+)> <[^>]+> "((?:[^"]|\\")+)"@en \.\n') +lineNum = 0 +with bz2.open(labelsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = labelLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO labels VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store wiki page ids +print("Reading/storing wiki page ids") +dbCur.execute("CREATE TABLE ids (iri TEXT PRIMARY KEY, id INT)") +idLineRegex = re.compile(r'<([^>]+)> <[^>]+> "(\d+)".*\n') +lineNum = 0 +with bz2.open(idsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = idLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + try: + dbCur.execute("INSERT INTO ids VALUES (?, ?)", (match.group(1), int(match.group(2)))) + except sqlite3.IntegrityError as e: + # Accounts for certain lines that have the same IRI + print(f"Failed to add entry with IRI \"{match.group(1)}\": {e}") +dbCon.commit() +# Read/store redirects +print("Reading/storing redirection data") +dbCur.execute("CREATE TABLE redirects (iri TEXT PRIMARY 
KEY, target TEXT)") +redirLineRegex = re.compile(r'<([^>]+)> <[^>]+> <([^>]+)> \.\n') +lineNum = 0 +with bz2.open(redirectsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = redirLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store diambiguation-page data +print("Reading/storing diambiguation-page data") +disambigNames = set() +disambigLineRegex = redirLineRegex +lineNum = 0 +with bz2.open(disambigFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = disambigLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + disambigNames.add(match.group(1)) +dbCur.execute("CREATE TABLE disambiguations (iri TEXT PRIMARY KEY)") +for name in disambigNames: + dbCur.execute("INSERT INTO disambiguations VALUES (?)", (name,)) +dbCon.commit() +# Read/store instance-type +print("Reading/storing instance-type data") +dbCur.execute("CREATE TABLE types (iri TEXT, type TEXT)") +dbCur.execute("CREATE INDEX types_iri_idx ON types(iri)") +typeLineRegex = redirLineRegex +lineNum = 0 +with bz2.open(typesFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + match = typeLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO types VALUES (?, ?)", (match.group(1), match.group(2))) +dbCon.commit() +# Read/store abstracts +print("Reading/storing abstracts") +dbCur.execute("CREATE TABLE abstracts (iri TEXT PRIMARY KEY, abstract TEXT)") +descLineRegex = 
labelLineRegex +lineNum = 0 +with bz2.open(abstractsFile, mode='rt') as file: + for line in file: + lineNum += 1 + if lineNum % 1e5 == 0: + print(f"Processing line {lineNum}") + # + if line[0] == "#": + continue + match = descLineRegex.fullmatch(line) + if match == None: + print(f"ERROR: Line {lineNum} has unexpected format", file=sys.stderr) + sys.exit(1) + else: + dbCur.execute("INSERT INTO abstracts VALUES (?, ?)", + (match.group(1), match.group(2).replace(r'\"', '"'))) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 6462d7d..1c16a2e 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -1,39 +1,52 @@ -Downloaded Files -================ +This directory holds files obtained from/using [English Wikipedia](https://en.wikipedia.org/wiki/Main_Page). + +# Downloaded Files - enwiki-20220501-pages-articles-multistream.xml.bz2
- Obtained via - (site suggests downloading from a mirror). Contains text - content and metadata for pages in English Wikipedia - (current revision only, excludes talk pages). Some file - content and format information was available from - . + Obtained via (site suggests downloading from a mirror). + Contains text content and metadata for pages in enwiki. + Some file content and format information was available from + . - enwiki-20220501-pages-articles-multistream-index.txt.bz2
Obtained like above. Holds lines of the form offset1:pageId1:title1, - providing offsets, for each page, into the dump file, of a chunk of + providing, for each page, an offset into the dump file of a chunk of 100 pages that includes it. -Generated Files -=============== +# Generated Dump-Index Files +- genDumpIndexDb.py
+ Creates an sqlite-database version of the enwiki-dump index file. - dumpIndex.db
- Holds data from the enwiki dump index file. Generated by - genDumpIndexDb.py, and used by lookupPage.py to get content for a - given page title.
+ Generated by genDumpIndexDb.py.
Tables:
- - offsets: title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next\_offset INT -- enwikiData.db
- Holds data obtained from the enwiki dump file, in 'pages', - 'redirects', and 'descs' tables. Generated by genData.py, which uses - python packages mwxml and mwparserfromhell.
+ - `offsets`: `title TEXT PRIMARY KEY, id INT UNIQUE, offset INT, next_offset INT` + +# Description Database Files +- genDescData.py
+ Reads through pages in the dump file, and adds short-description info to a database. +- descData.db
+ Generated by genDescData.py.
Tables:
- - pages: id INT PRIMARY KEY, title TEXT UNIQUE - - redirects: id INT PRIMARY KEY, target TEXT - - descs: id INT PRIMARY KEY, desc TEXT -- enwikiImgs.db
- Holds infobox-images obtained for some set of wiki page-ids. - Generated by running getEnwikiImgData.py, which uses the enwiki dump - file and dumpIndex.db.
+ - `pages`: `id INT PRIMARY KEY, title TEXT UNIQUE` + - `redirects`: `id INT PRIMARY KEY, target TEXT` + - `descs`: `id INT PRIMARY KEY, desc TEXT` + +# Image Database Files +- genImgData.py
+ Used to find infobox image names for page IDs, storing them into a database. +- downloadImgLicenseInfo.py
+ Used to download licensing metadata for image names, via Wikipedia's online API, storing them into a database. +- imgData.db
+ Used to hold metadata about infobox images for a set of pageIDs. + Generated using genImgData.py and downloadImgLicenseInfo.py.
Tables:
- - page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT - (img\_name may be null, which is used to avoid re-processing the page-id on a second pass) - - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT - (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info) + - `page_imgs`: `page_id INT PRIMARY KEY, img_name TEXT`
+ `img_name` may be null, which means 'none found', and is used to avoid re-processing page-ids. + - `imgs`: `name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT`
+ Might lack some matches for `img_name` in `page_imgs`, due to licensing info unavailability. +- downloadEnwikiImgs.py
+ Used to download image files into imgs/. + +# Other Files +- lookupPage.py
+ Running `lookupPage.py title1` looks in the dump for a page with a given title, + and prints the contents to stdout. Uses dumpIndex.db. + diff --git a/backend/data/enwiki/downloadEnwikiImgs.py b/backend/data/enwiki/downloadEnwikiImgs.py index de9b862..2929a0d 100755 --- a/backend/data/enwiki/downloadEnwikiImgs.py +++ b/backend/data/enwiki/downloadEnwikiImgs.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names outDir = "imgs" licenseRegex = re.compile(r"cc0|cc([ -]by)?([ -]sa)?([ -][1234]\.[05])?( \w\w\w?)?", flags=re.IGNORECASE) diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 8231fbb..097304b 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -16,7 +16,7 @@ if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -imgDb = "enwikiImgs.db" # About 130k image names +imgDb = "imgData.db" # About 130k image names apiUrl = "https://en.wikipedia.org/w/api.php" batchSz = 50 # Max 50 tagRegex = re.compile(r"<[^<]+>") diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py deleted file mode 100755 index 3e60bb5..0000000 --- a/backend/data/enwiki/genData.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/python3 - -import sys, os, re -import bz2 -import html, mwxml, mwparserfromhell -import sqlite3 - -usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" -usageInfo += "and short-description info to an sqlite db.\n" -if len(sys.argv) > 1: - print(usageInfo, file=sys.stderr) - sys.exit(1) - -dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages -enwikiDb = "enwikiData.db" - -# Some regexps and functions for parsing wikitext -descLineRegex = re.compile("^ *[A-Z'\"]") -embeddedHtmlRegex = 
re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") - # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag -convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") -parensGrpRegex = re.compile(r" \([^()]*\)") -leftoverBraceRegex = re.compile(r"(?:{\||{{).*") -def convertTemplateReplace(match): - if match.group(2) == None: - return f"{match.group(1)} {match.group(4)}" - else: - return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" -def parseDesc(text): - # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank - # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, - # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, - lines = [] - openBraceCount = 0 - openBracketCount = 0 - inComment = False - skip = False - for line in text.splitlines(): - line = line.strip() - if len(lines) == 0: - if len(line) > 0: - if openBraceCount > 0 or line[0] == "{": - openBraceCount += line.count("{") - openBraceCount -= line.count("}") - skip = True - if openBracketCount > 0 or line[0] == "[": - openBracketCount += line.count("[") - openBracketCount -= line.count("]") - skip = True - if inComment or line.find("") != -1: - if inComment: - inComment = False - skip = True - else: - inComment = True - skip = True - if skip: - skip = False - continue - if line[-1] == ":": # Seems to help avoid disambiguation pages - return None - if descLineRegex.match(line) != None: - lines.append(line) - else: - if len(line) == 0: - return removeMarkup(" ".join(lines)) - lines.append(line) - if len(lines) > 0: - return removeMarkup(" ".join(lines)) - return None -def removeMarkup(content): - content = embeddedHtmlRegex.sub("", content) - content = convertTemplateRegex.sub(convertTemplateReplace, content) - content = 
mwparserfromhell.parse(content).strip_code() # Remove wikitext markup - content = parensGrpRegex.sub("", content) - content = leftoverBraceRegex.sub("", content) - return content -# Other helper functions -def convertTitle(title): - return html.unescape(title).replace("_", " ") - -# Check for existing db -if os.path.exists(enwikiDb): - print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) - sys.exit(1) -# Create db -dbCon = sqlite3.connect(enwikiDb) -dbCur = dbCon.cursor() -dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") -dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") -dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") -dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") -dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") -# Read through dump file -print("Reading dump file") -with bz2.open(dumpFile, mode='rt') as file: - dump = mwxml.Dump.from_file(file) - pageNum = 0 - for page in dump: - pageNum += 1 - if pageNum % 1e4 == 0: - print(f"At page {pageNum}") - # Parse page - if page.namespace == 0: - try: - dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) - except sqlite3.IntegrityError as e: - # Accounts for certain pages that have the same title - print(f"Failed to add page with title \"{page.title}\": {e}") - continue - if page.redirect != None: - dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, convertTitle(page.redirect))) - else: - revision = next(page) - desc = parseDesc(revision.text) - if desc != None: - dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) -# Close db -dbCon.commit() -dbCon.close() diff --git a/backend/data/enwiki/genDescData.py b/backend/data/enwiki/genDescData.py new file mode 100755 index 0000000..032dbed --- /dev/null +++ b/backend/data/enwiki/genDescData.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 + +import sys, os, re +import bz2 +import html, mwxml, mwparserfromhell 
+import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads a Wikimedia enwiki dump, and adds page, redirect,\n" +usageInfo += "and short-description info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" # 22,034,540 pages +enwikiDb = "descData.db" + +# Some regexps and functions for parsing wikitext +descLineRegex = re.compile("^ *[A-Z'\"]") +embeddedHtmlRegex = re.compile(r"<[^<]+/>||<[^([^<]*|[^<]*<[^<]+>[^<]*)|<[^<]+$") + # Recognises a self-closing HTML tag, a tag with 0 children, tag with 1 child with 0 children, or unclosed tag +convertTemplateRegex = re.compile(r"{{convert\|(\d[^|]*)\|(?:(to|-)\|(\d[^|]*)\|)?([a-z][^|}]*)[^}]*}}") +parensGrpRegex = re.compile(r" \([^()]*\)") +leftoverBraceRegex = re.compile(r"(?:{\||{{).*") +def convertTemplateReplace(match): + if match.group(2) == None: + return f"{match.group(1)} {match.group(4)}" + else: + return f"{match.group(1)} {match.group(2)} {match.group(3)} {match.group(4)}" +def parseDesc(text): + # Find first matching line outside a {{...}} and [[...]] block-html-comments, then accumulate lines until a blank + # Some cases not accounted for: disambiguation pages, abstracts with sentences split-across-lines, + # nested embedded html, 'content significant' embedded-html, markup not removable with mwparsefromhell, + lines = [] + openBraceCount = 0 + openBracketCount = 0 + inComment = False + skip = False + for line in text.splitlines(): + line = line.strip() + if len(lines) == 0: + if len(line) > 0: + if openBraceCount > 0 or line[0] == "{": + openBraceCount += line.count("{") + openBraceCount -= line.count("}") + skip = True + if openBracketCount > 0 or line[0] == "[": + openBracketCount += line.count("[") + openBracketCount -= line.count("]") + skip = True + if inComment or line.find("") != -1: + if inComment: + inComment = False + skip = True + else: + inComment = True + skip = True + 
if skip: + skip = False + continue + if line[-1] == ":": # Seems to help avoid disambiguation pages + return None + if descLineRegex.match(line) != None: + lines.append(line) + else: + if len(line) == 0: + return removeMarkup(" ".join(lines)) + lines.append(line) + if len(lines) > 0: + return removeMarkup(" ".join(lines)) + return None +def removeMarkup(content): + content = embeddedHtmlRegex.sub("", content) + content = convertTemplateRegex.sub(convertTemplateReplace, content) + content = mwparserfromhell.parse(content).strip_code() # Remove wikitext markup + content = parensGrpRegex.sub("", content) + content = leftoverBraceRegex.sub("", content) + return content +# Other helper functions +def convertTitle(title): + return html.unescape(title).replace("_", " ") + +# Check for existing db +if os.path.exists(enwikiDb): + print(f"ERROR: Existing {enwikiDb}", file=sys.stderr) + sys.exit(1) +# Create db +dbCon = sqlite3.connect(enwikiDb) +dbCur = dbCon.cursor() +dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)") +dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)") +dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)") +dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)") +dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)") +# Read through dump file +print("Reading dump file") +with bz2.open(dumpFile, mode='rt') as file: + dump = mwxml.Dump.from_file(file) + pageNum = 0 + for page in dump: + pageNum += 1 + if pageNum % 1e4 == 0: + print(f"At page {pageNum}") + # Parse page + if page.namespace == 0: + try: + dbCur.execute("INSERT INTO pages VALUES (?, ?)", (page.id, convertTitle(page.title))) + except sqlite3.IntegrityError as e: + # Accounts for certain pages that have the same title + print(f"Failed to add page with title \"{page.title}\": {e}") + continue + if page.redirect != None: + dbCur.execute("INSERT INTO redirects VALUES (?, ?)", (page.id, 
convertTitle(page.redirect))) + else: + revision = next(page) + desc = parseDesc(revision.text) + if desc != None: + dbCur.execute("INSERT INTO descs VALUES (?, ?)", (page.id, desc)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/genImgData.py b/backend/data/enwiki/genImgData.py new file mode 100755 index 0000000..9bd28f4 --- /dev/null +++ b/backend/data/enwiki/genImgData.py @@ -0,0 +1,178 @@ +#!/usr/bin/python3 + +import sys, re +import bz2, html, urllib.parse +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" +usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +def getInputPageIds(): + pageIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (pageId,) in dbCur.execute("SELECT id from wiki_ids"): + pageIds.add(pageId) + dbCon.close() + return pageIds +dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" +indexDb = "dumpIndex.db" +imgDb = "imgData.db" # Output db +idLineRegex = re.compile(r"(.*)") +imageLineRegex = re.compile(r".*\| *image *= *([^|]*)") +bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") +imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNORECASE) +cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) + +# Open dbs +indexDbCon = sqlite3.connect(indexDb) +indexDbCur = indexDbCon.cursor() +imgDbCon = sqlite3.connect(imgDb) +imgDbCur = imgDbCon.cursor() +# Create image-db table +pidsDone = set() +if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL + imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +else: + for (pid,) in 
imgDbCur.execute("SELECT page_id FROM page_imgs"): + pidsDone.add(pid) + print(f"Will skip {len(pidsDone)} already-processed page-ids") +# Get input pageIds +print("Getting input page-ids", file=sys.stderr) +pageIds = getInputPageIds() +for pid in pidsDone: + pageIds.remove(pid) +print(f"Found {len(pageIds)} page-ids to process") +# Get page-id dump-file offsets +print("Getting dump-file offsets", file=sys.stderr) +offsetToPageids = {} +offsetToEnd = {} +iterNum = 0 +for pageId in pageIds: + iterNum += 1 + if iterNum % 1e4 == 0: + print(f"At iteration {iterNum}", file=sys.stderr) + # + query = "SELECT offset, next_offset FROM offsets WHERE id = ?" + row = indexDbCur.execute(query, (pageId,)).fetchone() + if row == None: + print(f"WARNING: Page id {pageId} not found", file=sys.stderr) + continue + (chunkOffset, endOffset) = row + offsetToEnd[chunkOffset] = endOffset + if chunkOffset not in offsetToPageids: + offsetToPageids[chunkOffset] = [] + offsetToPageids[chunkOffset].append(pageId) +print(f"Found {len(offsetToEnd)} chunks to check", file=sys.stderr) +# Look through dump file, jumping to chunks containing relevant pages +print("Reading through dump file", file=sys.stderr) +def getImageName(content): + """ Given an array of text-content lines, returns an image-filename, or None """ + for line in content: + match = imageLineRegex.match(line) + if match != None: + imageName = match.group(1).strip() + if imageName == "": + return None + imageName = html.unescape(imageName) + # Account for {{... + if imageName.startswith("{"): + match = cssImgCropRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for [[File:...|...]] + if imageName.startswith("["): + match = bracketImageRegex.match(imageName) + if match == None: + return None + imageName = match.group(1) + # Account for