diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-08-30 12:27:42 +1000 |
| commit | e8e58a3bb9dc233dacf573973457c5b48d369503 (patch) | |
| tree | 242500ca304c5afbb7e6506e61da4c4dfff0b175 /backend/tolData/genEnwikiDescData.py | |
| parent | 930c12d33e1093f874a4beb4d6376621e464e8c0 (diff) | |
Add scripts for generating eol/enwiki mappings
- New data sources: OTOL taxonomy, EOL provider-ids, Wikidata dump
- Add 'node_iucn' table
- Remove 'redirected' field from 'wiki_ids' table
- Make 'eol_ids' table have 'name' as the primary key
- Combine name-generation scripts into genNameData.py
- Combine description-generation scripts into genDescData.py
Diffstat (limited to 'backend/tolData/genEnwikiDescData.py')
| -rwxr-xr-x | backend/tolData/genEnwikiDescData.py | 99 |
1 file changed, 0 insertions, 99 deletions
```diff
diff --git a/backend/tolData/genEnwikiDescData.py b/backend/tolData/genEnwikiDescData.py
deleted file mode 100755
index e8a69ba..0000000
--- a/backend/tolData/genEnwikiDescData.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import sqlite3
-
-import argparse
-parser = argparse.ArgumentParser(description="""
-Reads a database containing data from Wikipedia, and tries to associate
-wiki pages with nodes in the tree-of-life database, and add descriptions for
-nodes that don't have them.
-""", formatter_class=argparse.RawDescriptionHelpFormatter)
-parser.parse_args()
-
-enwikiDb = "enwiki/descData.db"
-dbFile = "data.db"
-namesToSkipFile = "pickedEnwikiNamesToSkip.txt"
-pickedLabelsFile = "pickedEnwikiLabels.txt"
-# Got about 25k descriptions when testing
-
-print("Opening databases")
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Checking for names to skip")
-namesToSkip = set()
-if os.path.exists(namesToSkipFile):
-	with open(namesToSkipFile) as file:
-		for line in file:
-			namesToSkip.add(line.rstrip())
-	print(f"Found {len(namesToSkip)}")
-print("Checking for picked-titles")
-nameToPickedTitle = {}
-if os.path.exists(pickedLabelsFile):
-	with open(pickedLabelsFile) as file:
-		for line in file:
-			(name, _, title) = line.rstrip().partition("|")
-			nameToPickedTitle[name.lower()] = title
-print(f"Found {len(nameToPickedTitle)}")
-
-print("Getting names of nodes without descriptions")
-nodeNames = set()
-query = "SELECT nodes.name FROM nodes LEFT JOIN wiki_ids ON nodes.name = wiki_ids.name WHERE wiki_ids.id IS NULL"
-for (name,) in dbCur.execute(query):
-	nodeNames.add(name)
-print(f"Found {len(nodeNames)}")
-nodeNames.difference_update(namesToSkip)
-
-print("Associating nodes with page IDs")
-nodeToPageId = {}
-iterNum = 0
-for name in nodeNames:
-	iterNum += 1
-	if iterNum % 1e4 == 0:
-		print(f"At iteration {iterNum}")
-	#
-	if name not in nameToPickedTitle:
-		row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
-		if row != None:
-			nodeToPageId[name] = row[0]
-	else:
-		title = nameToPickedTitle[name]
-		row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone()
-		if row != None:
-			nodeToPageId[name] = row[0]
-		else:
-			print("WARNING: Picked title {title} not found", file=sys.stderr)
-
-print("Resolving redirects")
-redirectingNames = set()
-iterNum = 0
-for (name, pageId) in nodeToPageId.items():
-	iterNum += 1
-	if iterNum % 1e3 == 0:
-		print(f"At iteration {iterNum}")
-	#
-	query = "SELECT pages.id FROM redirects INNER JOIN pages ON redirects.target = pages.title WHERE redirects.id = ?"
-	row = enwikiCur.execute(query, (pageId,)).fetchone()
-	if row != None:
-		nodeToPageId[name] = row[0]
-		redirectingNames.add(name)
-
-print("Adding description data")
-iterNum = 0
-for (name, pageId) in nodeToPageId.items():
-	iterNum += 1
-	if iterNum % 1e3 == 0:
-		print(f"At iteration {iterNum}")
-	#
-	row = enwikiCur.execute("SELECT desc FROM descs where descs.id = ?", (pageId,)).fetchone()
-	if row != None:
-		dbCur.execute("INSERT INTO wiki_ids VALUES (?, ?, ?)", (name, pageId, 1 if name in redirectingNames else 0))
-		dbCur.execute("INSERT OR IGNORE INTO descs VALUES (?, ?, ?)", (pageId, row[0], 0))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-enwikiCon.close()
```
