diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 27 | ||||
| -rwxr-xr-x | backend/data/genEnwikiDescData.py | 33 |
2 files changed, 54 insertions, 6 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index f090898..8ee6e41 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -4,7 +4,7 @@ File Generation Process 1 Obtain data in otol/, as specified in it's README. 2 Run genOtolData.py, which creates data.db, and adds 'nodes' and 'edges' tables using data in otol/*, as well as - namesToKeep.txt, if present. + genOtolNamesToKeep.txt, if present. 2 Name Data for Search 1 Obtain data in eol/, as specified in it's README. 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, @@ -26,7 +26,8 @@ File Generation Process 5 Supplementary Name/Description Data 1 Obtain data in enwiki/, as specified in it's README. 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, and the 'nodes' table. + enwiki/enwikiData.db, and the 'nodes' table. Also uses genEnwikiDesc*.txt + files for skipping/resolving some name-page associations. 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. 5 Reduced Tree Structure Data @@ -58,7 +59,7 @@ Other Files tries to associate tree-of-life node names wth DBpedia node labels. It writes data about them to conflicts.txt, which can be manually edited to resolve them. -- namesToKeep.txt <br> +- genOtolNamesToKeep.txt <br> Contains names to avoid trimming off the tree data generated by genOtolData.py. Usage is optional, but, without it, a large amount of possibly-significant nodes are removed, using a short-sighted @@ -67,3 +68,23 @@ Other Files then get node names that have an associated image, description, or presence in r_nodes. Then run the genOtolData.py and genEolNameData.py scripts again (after deleting their created tables). +- genEnwikiDescNamesToSkip.txt <br> + Contains names for nodes that genEnwikiDescData.py should skip adding + a description for. 
Usage is optional, but without it, some nodes will + probably get descriptions that don't match (eg: the bee genus Osiris + might be described as an Egyptian god). <br> + This file was generated by running genEnwikiDescData.py, then listing + the names that it added into a file, along with descriptions, and + manually removing those that seemed node-matching (got about 30k lines, + with about 1 in 30 descriptions non-matching). And, after creating + genEnwikiDescTitlesToUse.txt, names shared with that file were removed. +- genEnwikiDescTitlesToUse.txt <br> + Contains enwiki titles with the form 'name1 (category1)' for + genEnwikiDescData.py to use to resolve nodes matching name name1. + Usage is optional, but it adds some descriptions that would otherwise + be skipped. <br> + This file was generated by taking the content of genEnwikiDescNamesToSkip.txt, + after the manual filtering step, then, for each name name1, getting + page titles from dbpedia/dbpData.db that match 'name1 (category1)'. + This was followed by manually removing lines, keeping those that + seemed to match the corresponding node (used the app to help with this). 
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py index 57e4194..3e11871 100755 --- a/backend/data/genEnwikiDescData.py +++ b/backend/data/genEnwikiDescData.py @@ -13,12 +13,30 @@ if len(sys.argv) > 1: enwikiDb = "enwiki/enwikiData.db" dbFile = "data.db" +namesToSkipFile = "genEnwikiDescNamesToSkip.txt" +titlesToUseFile = "genEnwikiDescTitlesToUse.txt" +titleToUseRegex = re.compile(r"(.*) \(.*\)") # Open dbs enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() +# Read name/title files +namesToSkip = set() +nameToPickedTitle = {} # Maps names to titles to be used for them +if os.path.exists(namesToSkipFile): + with open(namesToSkipFile) as file: + for line in file: + namesToSkip.add(line.rstrip()) + print(f"Read in {len(namesToSkip)} names to skip") +if os.path.exists(titlesToUseFile): + with open(titlesToUseFile) as file: + for line in file: + title = line.rstrip() + name = titleToUseRegex.sub(r"\1", title) # Remove parens + nameToPickedTitle[name.lower()] = title +print(f"Read in {len(nameToPickedTitle)} titles to use for certain names") # Get node names without descriptions print("Getting node names") nodeNames = set() @@ -26,6 +44,7 @@ query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name for row in dbCur.execute(query): nodeNames.add(row[0]) print(f"Found {len(nodeNames)} names") +nodeNames.difference_update(namesToSkip) # Find page id for each node name print("Getting node page-ids") nodeToPageId = {} @@ -35,9 +54,17 @@ for name in nodeNames: if iterNum % 1e4 == 0: print(f"At iteration {iterNum}") # - row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone() - if row != None: - nodeToPageId[name] = row[0] + if name not in nameToPickedTitle: + row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? 
COLLATE NOCASE", (name,)).fetchone() + if row != None: + nodeToPageId[name] = row[0] + else: + title = nameToPickedTitle[name] + row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone() + if row != None: + nodeToPageId[name] = row[0] + else: + print(f"WARNING: Picked title {title} not found", file=sys.stderr) # Resolve redirects print("Resolving redirects") redirectingNames = set() |
