aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md27
-rwxr-xr-xbackend/data/genEnwikiDescData.py33
2 files changed, 54 insertions, 6 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index f090898..8ee6e41 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -4,7 +4,7 @@ File Generation Process
1 Obtain data in otol/, as specified in its README.
2 Run genOtolData.py, which creates data.db, and adds
'nodes' and 'edges' tables using data in otol/*, as well as
- namesToKeep.txt, if present.
+ genOtolNamesToKeep.txt, if present.
2 Name Data for Search
1 Obtain data in eol/, as specified in its README.
2 Run genEolNameData.py, which adds 'names' and 'eol\_ids' tables to data.db,
@@ -26,7 +26,8 @@ File Generation Process
5 Supplementary Name/Description Data
1 Obtain data in enwiki/, as specified in its README.
2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
- enwiki/enwikiData.db, and the 'nodes' table.
+ enwiki/enwikiData.db, and the 'nodes' table. Also uses genEnwikiDesc*.txt
+ files for skipping/resolving some name-page associations.
3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in
enwiki/enwikiData.db, and the 'names' and 'descs' tables.
5 Reduced Tree Structure Data
@@ -58,7 +59,7 @@ Other Files
tries to associate tree-of-life node names with DBpedia node labels. It
writes data about them to conflicts.txt, which can be manually edited
to resolve them.
-- namesToKeep.txt <br>
+- genOtolNamesToKeep.txt <br>
Contains names to avoid trimming off the tree data generated by
genOtolData.py. Usage is optional, but, without it, a large amount
of possibly-significant nodes are removed, using a short-sighted
@@ -67,3 +68,23 @@ Other Files
then get node names that have an associated image, description, or
presence in r_nodes. Then run the genOtolData.py and genEolNameData.py
scripts again (after deleting their created tables).
+- genEnwikiDescNamesToSkip.txt <br>
+ Contains names for nodes that genEnwikiDescData.py should skip adding
+ a description for. Usage is optional, but without it, some nodes will
+ probably get descriptions that don't match (eg: the bee genus Osiris
+ might be described as an Egyptian god). <br>
+ This file was generated by running genEnwikiDescData.py, then listing
+ the names that it added into a file, along with descriptions, and
+ manually removing those that seemed node-matching (got about 30k lines,
+ with about 1 in 30 descriptions non-matching). And, after creating
+ genEnwikiDescTitlesToUse.txt, names shared with that file were removed.
+- genEnwikiDescTitlesToUse.txt <br>
+ Contains enwiki titles with the form 'name1 (category1)' for
+ genEnwikiDescData.py to use to resolve nodes matching name name1.
+ Usage is optional, but it adds some descriptions that would otherwise
+ be skipped. <br>
+ This file was generated by taking the content of genEnwikiDescNamesToSkip.txt,
+ after the manual filtering step, then, for each name name1, getting
+ page titles from dbpedia/dbpData.db that match 'name1 (category1)'.
+ This was followed by manually removing lines, keeping those that
+ seemed to match the corresponding node (used the app to help with this).
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index 57e4194..3e11871 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -13,12 +13,30 @@ if len(sys.argv) > 1:
enwikiDb = "enwiki/enwikiData.db"
dbFile = "data.db"
+namesToSkipFile = "genEnwikiDescNamesToSkip.txt" # Optional file, one node name per line
+titlesToUseFile = "genEnwikiDescTitlesToUse.txt" # Optional file, one enwiki title per line
+titleToUseRegex = re.compile(r"(.*) \(.*\)") # Group 1 is the text before a trailing ' (...)'
# Open dbs
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+# Read name/title files (each is optional, and is skipped if it does not exist)
+namesToSkip = set()
+nameToPickedTitle = {} # Maps names to titles to be used for them
+if os.path.exists(namesToSkipFile):
+ with open(namesToSkipFile) as file:
+ for line in file:
+ namesToSkip.add(line.rstrip()) # One name per line, trailing whitespace dropped
+ print(f"Read in {len(namesToSkip)} names to skip")
+if os.path.exists(titlesToUseFile):
+ with open(titlesToUseFile) as file:
+ for line in file:
+ title = line.rstrip()
+ name = titleToUseRegex.sub(r"\1", title) # Remove parens
+ nameToPickedTitle[name.lower()] = title # Keys are stored lowercased
+print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
# Get node names without descriptions
print("Getting node names")
nodeNames = set()
@@ -26,6 +44,7 @@ query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name
for row in dbCur.execute(query):
nodeNames.add(row[0])
print(f"Found {len(nodeNames)} names")
+nodeNames.difference_update(namesToSkip) # Drop names listed in the skip file (the count printed above still includes them)
# Find page id for each node name
print("Getting node page-ids")
nodeToPageId = {}
@@ -35,9 +54,17 @@ for name in nodeNames:
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
- row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
- if row != None:
- nodeToPageId[name] = row[0]
+ if name not in nameToPickedTitle: # No hand-picked title, so match the page title against the name itself
+ row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
+ if row is not None:
+ nodeToPageId[name] = row[0]
+ else: # A title was picked for this name, so look that title up instead
+ title = nameToPickedTitle[name]
+ row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone() # No COLLATE NOCASE here
+ if row is not None:
+ nodeToPageId[name] = row[0]
+ else:
+ print(f"WARNING: Picked title {title} not found", file=sys.stderr)
# Resolve redirects
print("Resolving redirects")
redirectingNames = set()