diff options
Diffstat (limited to 'backend/data')
| -rw-r--r-- | backend/data/README.md | 27 | ||||
| -rwxr-xr-x | backend/data/genEnwikiDescData.py | 33 |
2 files changed, 54 insertions, 6 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index f090898..8ee6e41 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -4,7 +4,7 @@ File Generation Process 1 Obtain data in otol/, as specified in it's README. 2 Run genOtolData.py, which creates data.db, and adds 'nodes' and 'edges' tables using data in otol/*, as well as - namesToKeep.txt, if present. + genOtolNamesToKeep.txt, if present. 2 Name Data for Search 1 Obtain data in eol/, as specified in it's README. 2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db, @@ -26,7 +26,8 @@ File Generation Process 5 Supplementary Name/Description Data 1 Obtain data in enwiki/, as specified in it's README. 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, and the 'nodes' table. + enwiki/enwikiData.db, and the 'nodes' table. Also uses genEnwikiDesc*.txt + files for skipping/resolving some name-page associations. 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in enwiki/enwikiData.db, and the 'names' and 'descs' tables. 5 Reduced Tree Structure Data @@ -58,7 +59,7 @@ Other Files tries to associate tree-of-life node names wth DBpedia node labels. It writes data about them to conflicts.txt, which can be manually edited to resolve them. -- namesToKeep.txt <br> +- genOtolNamesToKeep.txt <br> Contains names to avoid trimming off the tree data generated by genOtolData.py. Usage is optional, but, without it, a large amount of possibly-significant nodes are removed, using a short-sighted @@ -67,3 +68,23 @@ Other Files then get node names that have an associated image, description, or presence in r_nodes. Then run the genOtolData.py and genEolNameData.py scripts again (after deleting their created tables). +- genEnwikiDescNamesToSkip.txt <br> + Contains names for nodes that genEnwikiDescData.py should skip adding + a description for. 
Usage is optional, but without it, some nodes will + probably get descriptions that don't match (eg: the bee genus Osiris + might be described as an Egyptian god). <br> + This file was generated by running genEnwikiDescData.py, then listing + the names that it added into a file, along with descriptions, and + manually removing those that seemed node-matching (got about 30k lines, + with about 1 in 30 descriptions non-matching). And, after creating + genEnwikiDescTitlesToUse.txt, names shared with that file were removed. +- genEnwikiDescTitlesToUse.txt <br> + Contains enwiki titles with the form 'name1 (category1)' for + genEnwikiDescData.py to use to resolve nodes matching name name1. + Usage is optional, but it adds some descriptions that would otherwise + be skipped. <br> + This file was generated by taking the content of genEnwikiDescNamesToSkip.txt, + after the manual filtering step, then, for each name name1, getting + page titles from dbpedia/dbpData.db that match 'name1 (category1)'. + This was followed by manually removing lines, keeping those that + seemed to match the corresponding node (used the app to help with this). 
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py index 57e4194..3e11871 100755 --- a/backend/data/genEnwikiDescData.py +++ b/backend/data/genEnwikiDescData.py @@ -13,12 +13,30 @@ if len(sys.argv) > 1: enwikiDb = "enwiki/enwikiData.db" dbFile = "data.db" +namesToSkipFile = "genEnwikiDescNamesToSkip.txt" +titlesToUseFile = "genEnwikiDescTitlesToUse.txt" +titleToUseRegex = re.compile(r"(.*) \(.*\)") # Open dbs enwikiCon = sqlite3.connect(enwikiDb) enwikiCur = enwikiCon.cursor() dbCon = sqlite3.connect(dbFile) dbCur = dbCon.cursor() +# Read name/title files +namesToSkip = set() +nameToPickedTitle = {} # Maps names to titles to be used for them +if os.path.exists(namesToSkipFile): + with open(namesToSkipFile) as file: + for line in file: + namesToSkip.add(line.rstrip()) + print(f"Read in {len(namesToSkip)} names to skip") +if os.path.exists(titlesToUseFile): + with open(titlesToUseFile) as file: + for line in file: + title = line.rstrip() + name = titleToUseRegex.sub(r"\1", title) # Remove parens + nameToPickedTitle[name.lower()] = title +print(f"Read in {len(nameToPickedTitle)} titles to use for certain names") # Get node names without descriptions print("Getting node names") nodeNames = set() @@ -26,6 +44,7 @@ query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name for row in dbCur.execute(query): nodeNames.add(row[0]) print(f"Found {len(nodeNames)} names") +nodeNames.difference_update(namesToSkip) # Find page id for each node name print("Getting node page-ids") nodeToPageId = {} @@ -35,9 +54,17 @@ for name in nodeNames: if iterNum % 1e4 == 0: print(f"At iteration {iterNum}") # - row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone() - if row != None: - nodeToPageId[name] = row[0] + if name not in nameToPickedTitle: + row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? 
COLLATE NOCASE", (name,)).fetchone() + if row != None: + nodeToPageId[name] = row[0] + else: + title = nameToPickedTitle[name] + row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone() + if row != None: + nodeToPageId[name] = row[0] + else: + print(f"WARNING: Picked title {title} not found", file=sys.stderr) # Resolve redirects print("Resolving redirects") redirectingNames = set() |
