diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 32 | ||||
| -rw-r--r-- | backend/data/enwiki/README.md | 7 | ||||
| -rwxr-xr-x | backend/data/enwiki/getEnwikiImgData.py | 33 |
3 files changed, 42 insertions, 30 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 17484f4..18daa99 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -14,7 +14,8 @@ File Generation Process It uses data in eol/imagesList.db, and the 'eol\_ids' table. 2 In eol/, run reviewImgs.py to filter images in eol/imgsForReview/ into EOL-id-unique images in eol/imgsReviewed/ (uses 'names' and 'eol\_ids' to display extra info). - 3 Run genImgsForWeb.py to create cropped/resized images in img/, using + 3 // UPDATE + Run genImgsForWeb.py to create cropped/resized images in img/, using images in eol/imgsReviewed/, and also to add an 'images' table to data.db. 4 Run genLinkedImgs.py to add a 'linked_imgs' table to data.db, which uses 'nodes', 'edges', 'eol\_ids', and 'images', to associate @@ -22,21 +23,31 @@ File Generation Process 4 Node Description Data 1 Obtain data in dbpedia/, as specified in it's README. 2 Run genDbpData.py, which adds a 'descs' table to data.db, using - data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table. -5 Supplementary Name/Description Data + data in dbpedia/dbpData.db, the 'nodes' table, and possibly + dbpNamesToSkip.txt and dbpPickedLabels.txt. +5 Supplementary Name/Description/Image Data 1 Obtain data in enwiki/, as specified in it's README. 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in enwiki/enwikiData.db, and the 'nodes' table. Also uses genEnwikiDesc*.txt files for skipping/resolving some name-page associations. - 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in - enwiki/enwikiData.db, and the 'names' and 'descs' tables. + 3 Optionally run genEnwikiNameData.py, which adds to the 'names' table, + using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. + 4 In enwiki/, run getEnwikiImgData.py, which generates a list of + tol-node images, and creates enwiki/enwikiImgs.db to store it. + Uses the 'descs' table to get tol-node wiki-ids. 
+ 5 In enwiki/, run downloadImgLicenseInfo.py, which downloads licensing + information for images listed in enwiki/enwikiImgs.db, and stores + it in that db. + 6 In enwiki/, run downloadEnwikiImgs.py, which downloads 'permissively-licensed' + images listed in enwiki/enwikiImgs.db, storing them in enwiki/imgs/. + 7 // ADD 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. data.db Tables ============== -- nodes: name TEXT PRIMARY KEY, tips INT +- nodes: name TEXT PRIMARY KEY, id TEXT UNIQUE, tips INT - edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) - names: name TEXT, alt\_name TEXT, pref\_alt INT, PRIMARY KEY(name, alt\_name) - eol\_ids: id INT PRIMARY KEY, name TEXT @@ -51,14 +62,7 @@ Other Files - dbpPickedLabels.txt <br> Contains DBpedia labels, one per line. Used by genDbpData.py to help resolve conflicts when associating tree-of-life node names with - DBpedia node labels. Was generated by manually editing the output - of genDbpConflicts.py. -- genDbpConflicts.py <br> - Reads data from dbpedia/dbpData.db, and the 'nodes' table of data.db, - and looks for potential conflicts that would arise when genDbpData.db - tries to associate tree-of-life node names wth DBpedia node labels. It - writes data about them to conflicts.txt, which can be manually edited - to resolve them. + DBpedia node labels. - genOtolNamesToKeep.txt <br> Contains names to avoid trimming off the tree data generated by genOtolData.py. 
Usage is optional, but, without it, a large amount diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index c9615ef..ea97c9a 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -28,3 +28,10 @@ Generated Files - pages: id INT PRIMARY KEY, title TEXT UNIQUE - redirects: id INT PRIMARY KEY, target TEXT - descs: id INT PRIMARY KEY, desc TEXT +- enwikiImgs.db <br> + Holds infobox-images obtained for some set of wiki page-ids. + Generated by running getEnwikiImgData.py, which uses the enwiki dump + file and dumpIndex.db. <br> + Tables: <br> + - page\_imgs: page\_id INT PRIMARY KEY, img\_name TEXT + - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py index 1992279..1eca0bd 100755 --- a/backend/data/enwiki/getEnwikiImgData.py +++ b/backend/data/enwiki/getEnwikiImgData.py @@ -5,16 +5,23 @@ import bz2, html, urllib.parse import sqlite3 usageInfo = f"usage: {sys.argv[0]}\n" -usageInfo += "Gets nodes with enwiki page-ids, and looks up their content in enwiki/,\n" -usageInfo += "trying to get infobox image filenames, and prints lines like 'pageId1 filename1'\n" +usageInfo += "For a set of page-ids, looks up their content in an enwiki dump,\n" +usageInfo += "trying to get infobox image filenames, adding info to an sqlite db.\n" if len(sys.argv) > 1: print(usageInfo, file=sys.stderr) sys.exit(1) -dbFile = "../data.db" -indexDb = "dumpIndex.db" +def getInputPageIds(): + pageIds = set() + dbCon = sqlite3.connect("../data.db") + dbCur = dbCon.cursor() + for (pageId,) in dbCur.execute("SELECT wiki_id from descs"): + pageIds.add(pageId) + dbCon.close() + return pageIds dumpFile = "enwiki-20220501-pages-articles-multistream.xml.bz2" -imgDb = "enwikiImgs.db" +indexDb = "dumpIndex.db" +imgDb = "enwikiImgs.db" # Output db idLineRegex = re.compile(r"<id>(.*)</id>") imageLineRegex = 
re.compile(r".*\| *image *= *([^|]*)") bracketImageRegex = re.compile(r"\[\[(File:[^|]*).*]]") @@ -22,19 +29,16 @@ imageNameRegex = re.compile(r".*\.(jpg|jpeg|png|gif|tiff|tif)", flags=re.IGNOREC cssImgCropRegex = re.compile(r"{{css image crop\|image *= *(.*)", flags=re.IGNORECASE) # Open dbs -dbCon = sqlite3.connect(dbFile) -dbCur = dbCon.cursor() indexDbCon = sqlite3.connect(indexDb) indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() # Create image-db table -imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMAY KEY, img_name TEXT)") -# Get nodes with enwiki page-ids -print("Getting nodes with wiki-ids", file=sys.stderr) -pageIds = set() -for (pageId,) in dbCur.execute("SELECT wiki_id from descs"): - pageIds.add(pageId) +imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") +imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +# Get input pageIds +print("Getting input page-ids", file=sys.stderr) +pageIds = getInputPageIds() # Get page-id dump-file offsets print("Getting dump-file offsets", file=sys.stderr) offsetToPageids = {} @@ -106,8 +110,6 @@ with open(dumpFile, mode='rb') as file: iterNum += 1 if iterNum % 100 == 0: print(f"At iteration {iterNum}", file=sys.stderr) - if iterNum == 300: - break # pageIds = offsetToPageids[pageOffset] # Jump to chunk @@ -163,7 +165,6 @@ with open(dumpFile, mode='rb') as file: if not foundText: print(f"Did not find <text> for page id {pageId}", file=sys.stderr) # Close dbs -dbCon.close() indexDbCon.close() imgDbCon.commit() imgDbCon.close() |
