diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-06-19 17:04:40 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-19 17:04:40 +1000 |
| commit | cd611fc89535357b227dbc21727534c6275a27f0 (patch) | |
| tree | 925c68a3e35b229d4f77ad198a205ad5ba9a4b37 | |
| parent | abf0bb37f633a7c509fd8ae78628ff1e8bc97854 (diff) | |
Add code/file for adding manually-picked alt-names
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | backend/data/README.md | 4 | ||||
| -rwxr-xr-x | backend/data/addPickedNames.py | 48 | ||||
| -rw-r--r-- | backend/data/enwiki/README.md | 1 |
4 files changed, 53 insertions, 1 deletion
@@ -30,3 +30,4 @@ /backend/data/mergedImgList.txt /backend/data/pickedImgs/ /backend/data/reducedTol/names.txt +/backend/data/pickedNames.txt diff --git a/backend/data/README.md b/backend/data/README.md index 18e5da3..d4a6196 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -58,6 +58,8 @@ File Generation Process 6 Other - Optionally run genEnwikiNameData.py, which adds more entries to the 'names' table, using data in enwiki/enwikiData.db, and the 'names' and 'wiki_ids' tables. + - Optionally run addPickedNames.py, which adds manually-picked names to + the 'names' table, as specified in pickedNames.txt. - Optionally run trimTree.py, which tries to remove some 'low-significance' nodes, for the sake of performance and result-relevance. Without this, jumping to certain nodes within the fungi and moths can take over a minute to render. @@ -72,7 +74,7 @@ data.db Tables - descs: wiki\_id INT PRIMARY KEY, desc TEXT, from\_dbp INT - node\_imgs: name TEXT PRIMARY KEY, img\_id INT, src TEXT - images: id INT, src TEXT, url TEXT, license TEXT, artist TEXT, credit TEXT, PRIMARY KEY (id, src) -- linked\_imgs: name TEXT PRIMARY KEY, otol\_id TEXT, otol\_id2 TEXT +- linked\_imgs: name TEXT PRIMARY KEY, otol\_ids TEXT - r\_nodes: name TEXT PRIMARY KEY, tips INT - r\_edges: node TEXT, child TEXT, p\_support INT, PRIMARY KEY (node, child) diff --git a/backend/data/addPickedNames.py b/backend/data/addPickedNames.py new file mode 100755 index 0000000..b231ab7 --- /dev/null +++ b/backend/data/addPickedNames.py @@ -0,0 +1,48 @@ +#!/usr/bin/python3 + +import sys +import sqlite3 + +usageInfo = f"usage: {sys.argv[0]}\n" +usageInfo += "Reads alt-name data from a file, and adds it to the 'names' table.\n" +usageInfo += "The file is expected to have lines of the form: nodeName|altName|prefAlt\n" +usageInfo += " These correspond to entries in the 'names' table. 'prefAlt' should\n" +usageInfo += " be 1 or 0. 
A line may specify name1|name1|1, which causes the node\n" +usageInfo += " to have no preferred alt-name.\n" +if len(sys.argv) > 1: + print(usageInfo, file=sys.stderr) + sys.exit(1) + +dbFile = "data.db" +pickedNamesFile = "pickedNames.txt" + +# Open db +dbCon = sqlite3.connect(dbFile) +dbCur = dbCon.cursor() +# Iterate through picked-names file +with open(pickedNamesFile) as file: + for line in file: + # Get record data + (nodeName, altName, prefAlt) = line.lower().rstrip().split("|") + prefAlt = int(prefAlt) + # Remove any existing preferred-alt status + if prefAlt == 1: + query = "SELECT name, alt_name FROM names WHERE name = ? AND pref_alt = 1" + row = dbCur.execute(query, (nodeName,)).fetchone() + if row != None: + dbCur.execute("UPDATE names SET pref_alt = 0 WHERE name = ? AND alt_name = ?", row) + # Check for an existing record + if nodeName == altName: + continue + query = "SELECT name, alt_name, pref_alt FROM names WHERE name = ? AND alt_name = ?" + row = dbCur.execute(query, (nodeName, altName)).fetchone() + if row == None: + dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'picked')", (nodeName, altName, prefAlt)) + else: + # Update existing record + if row[2] != prefAlt: + dbCur.execute("UPDATE names SET pref_alt = ?, src = 'picked' WHERE name = ? AND alt_name = ?", + (prefAlt, nodeName, altName)) +# Close db +dbCon.commit() +dbCon.close() diff --git a/backend/data/enwiki/README.md b/backend/data/enwiki/README.md index 22af5ba..6462d7d 100644 --- a/backend/data/enwiki/README.md +++ b/backend/data/enwiki/README.md @@ -34,5 +34,6 @@ Generated Files file and dumpIndex.db. <br> Tables: <br> - page\_imgs: page\_id INT PRIMAY KEY, img\_name TEXT + (img\_name may be null, which is used to avoid re-processing the page-id on a second pass) - imgs: name TEXT PRIMARY KEY, license TEXT, artist TEXT, credit TEXT, restrictions TEXT, url TEXT (might lack some matches for 'img_name' in 'page_imgs', due to inability to get license info) |
