Make backend dev server script serve the image files

Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images.
author: Terry Truong <terry06890@gmail.com> 2022-07-11 01:54:08 +1000
committer: Terry Truong <terry06890@gmail.com> 2022-07-11 01:54:08 +1000
commit: 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree: 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/tolData/genEolNameData.py
parent: a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)
1 files changed, 184 insertions, 0 deletions
diff --git a/backend/tolData/genEolNameData.py b/backend/tolData/genEolNameData.py
new file mode 100755
index 0000000..dd33ee0
--- /dev/null
+++ b/backend/tolData/genEolNameData.py
@@ -0,0 +1,184 @@
+#!/usr/bin/python3
+
+import sys, re, os
+import html, csv, sqlite3
+
+usageInfo = f"""
+Usage: {sys.argv[0]}
+
+Reads files describing name data from the 'Encyclopedia of Life' site,
+tries to associate names with nodes in the database, and adds tables
+to represent associated names.
+
+Reads a vernacularNames.csv file:
+	Starts with a header line containing:
+		page_id, canonical_form, vernacular_string, language_code,
+		resource_name, is_preferred_by_resource, is_preferred_by_eol
+	The canonical_form and vernacular_string fields contain names
+		associated with the page ID. Names are not always unique to
+		particular page IDs.
+"""
+if len(sys.argv) > 1:
+	print(usageInfo, file=sys.stderr)
+	sys.exit(1)
+
+vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries
+dbFile = "data.db"
+namesToSkip = {"unknown", "unknown species", "unidentified species"}
+pickedIdsFile = "pickedEolIds.txt"
+altsToSkipFile = "pickedEolAltsToSkip.txt"
+
+print("Reading in vernacular-names data")
+nameToPids = {} # 'pid' means 'Page ID'
+canonicalNameToPids = {}
+pidToNames = {}
+pidToPreferred = {} # Maps pids to 'preferred' names
+def updateMaps(name, pid, canonical, preferredAlt):
+	global namesToSkip, nameToPids, canonicalNameToPids, pidToNames, pidToPreferred
+	if name in namesToSkip:
+		return
+	if name not in nameToPids:
+		nameToPids[name] = {pid}
+	else:
+		nameToPids[name].add(pid)
+	if canonical:
+		if name not in canonicalNameToPids:
+			canonicalNameToPids[name] = {pid}
+		else:
+			canonicalNameToPids[name].add(pid)
+	if pid not in pidToNames:
+		pidToNames[pid] = {name}
+	else:
+		pidToNames[pid].add(name)
+	if preferredAlt:
+		pidToPreferred[pid] = name
+with open(vnamesFile, newline="") as csvfile:
+	reader = csv.reader(csvfile)
+	lineNum = 0
+	for row in reader:
+		lineNum += 1
+		if lineNum % 1e5 == 0:
+			print(f"At line {lineNum}")
+		# Skip header line
+		if lineNum == 1:
+			continue
+		# Parse line
+		pid = int(row[0])
+		name1 = re.sub(r"<[^>]+>", "", row[1].lower()) # Remove tags
+		name2 = html.unescape(row[2]).lower()
+		lang = row[3]
+		preferred = row[6] == "preferred"
+		# Add to maps
+		updateMaps(name1, pid, True, False)
+		if lang == "eng" and name2 != "":
+			updateMaps(name2, pid, False, preferred)
+
+print("Checking for manually-picked pids")
+nameToPickedPid = {}
+if os.path.exists(pickedIdsFile):
+	with open(pickedIdsFile) as file:
+		for line in file:
+			(name, _, eolId) = line.rstrip().partition("|")
+			nameToPickedPid[name] = None if eolId == "" else int(eolId)
+print(f"Found {len(nameToPickedPid)}")
+
+print("Checking for alt-names to skip")
+nameToAltsToSkip = {}
+numToSkip = 0
+if os.path.exists(altsToSkipFile):
+	with open(altsToSkipFile) as file:
+		for line in file:
+			(name, _, altName) = line.rstrip().partition("|")
+			if name not in nameToAltsToSkip:
+				nameToAltsToSkip[name] = [altName]
+			else:
+				nameToAltsToSkip[name].append(altName)
+			numToSkip += 1
+print(f"Found {numToSkip} alt-names to skip")
+
+print("Creating database tables")
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))")
+dbCur.execute("CREATE INDEX names_idx ON names(name)")
+dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)")
+dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)")
+dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
+dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)")
+
+print("Associating nodes with names")
+usedPids = set()
+unresolvedNodeNames = set()
+dbCur2 = dbCon.cursor()
+def addToDb(nodeName, pidToUse):
+	" Adds page-ID-associated name data to a node in the database "
+	global dbCur, pidToPreferred
+	dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
+	# Get alt-names
+	altNames = set()
+	for n in pidToNames[pidToUse]:
+		# Avoid alt-names with >3 words
+		if len(n.split(" ")) > 3:
+			continue
+		# Avoid alt-names that already name a node in the database
+		if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None:
+			continue
+		# Check for picked alt-name-to-skip
+		if nodeName in nameToAltsToSkip and n in nameToAltsToSkip[nodeName]:
+			print(f"Excluding alt-name {n} for node {nodeName}")
+			continue
+		#
+		altNames.add(n)
+	# Add alt-names to db
+	preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
+	for n in altNames:
+		isPreferred = 1 if (n == preferredName) else 0
+		dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))
+print("Adding picked IDs")
+for (name, pid) in nameToPickedPid.items():
+	if pid != None:
+		addToDb(name, pid)
+		usedPids.add(pid)
+print("Associating nodes with canonical names")
+iterNum = 0
+for (nodeName,) in dbCur2.execute("SELECT name FROM nodes"):
+	iterNum += 1
+	if iterNum % 1e5 == 0:
+		print(f"At iteration {iterNum}")
+	if nodeName in nameToPickedPid:
+		continue
+	# Check for matching canonical name
+	if nodeName in canonicalNameToPids:
+		pidToUse = None
+		# Pick an associated page ID
+		for pid in canonicalNameToPids[nodeName]:
+			hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred
+			hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred
+			if hasLowerPrio:
+				continue
+			if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio):
+				pidToUse = pid
+		if pidToUse != None:
+			addToDb(nodeName, pidToUse)
+			usedPids.add(pidToUse)
+	elif nodeName in nameToPids:
+		unresolvedNodeNames.add(nodeName)
+print("Associating leftover nodes with other names")
+iterNum = 0
+for nodeName in unresolvedNodeNames:
+	iterNum += 1
+	if iterNum % 100 == 0:
+		print(f"At iteration {iterNum}")
+	# Check for matching name
+	pidToUse = None
+	for pid in nameToPids[nodeName]:
+		# Pick an associated page ID
+		if pid not in usedPids and (pidToUse == None or pid < pidToUse):
+			pidToUse = pid
+	if pidToUse != None:
+		addToDb(nodeName, pidToUse)
+		usedPids.add(pidToUse)
+
+print("Closing database")
+dbCon.commit()
+dbCon.close()
author	Terry Truong <terry06890@gmail.com>	2022-07-11 01:54:08 +1000
committer	Terry Truong <terry06890@gmail.com>	2022-07-11 01:54:08 +1000
commit	5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree	3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/tolData/genEolNameData.py
parent	a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)