aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genEolNameData.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-07-11 01:54:08 +1000
committer Terry Truong <terry06890@gmail.com> 2022-07-11 01:54:08 +1000
commit 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/data/genEolNameData.py
parent a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/data/genEolNameData.py')
-rwxr-xr-x backend/data/genEolNameData.py 184
1 files changed, 0 insertions, 184 deletions
diff --git a/backend/data/genEolNameData.py b/backend/data/genEolNameData.py
deleted file mode 100755
index dd33ee0..0000000
--- a/backend/data/genEolNameData.py
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re, os
-import html, csv, sqlite3
-
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Reads files describing name data from the 'Encyclopedia of Life' site,
-tries to associate names with nodes in the database, and adds tables
-to represent associated names.
-
-Reads a vernacularNames.csv file:
- Starts with a header line containing:
- page_id, canonical_form, vernacular_string, language_code,
- resource_name, is_preferred_by_resource, is_preferred_by_eol
- The canonical_form and vernacular_string fields contain names
- associated with the page ID. Names are not always unique to
- particular page IDs.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-vnamesFile = "eol/vernacularNames.csv" # Had about 2.8e6 entries
-dbFile = "data.db"
-namesToSkip = {"unknown", "unknown species", "unidentified species"}
-pickedIdsFile = "pickedEolIds.txt"
-altsToSkipFile = "pickedEolAltsToSkip.txt"
-
-print("Reading in vernacular-names data")
-nameToPids = {} # 'pid' means 'Page ID'
-canonicalNameToPids = {}
-pidToNames = {}
-pidToPreferred = {} # Maps pids to 'preferred' names
-def updateMaps(name, pid, canonical, preferredAlt):
- global namesToSkip, nameToPids, canonicalNameToPids, pidToNames, pidToPreferred
- if name in namesToSkip:
- return
- if name not in nameToPids:
- nameToPids[name] = {pid}
- else:
- nameToPids[name].add(pid)
- if canonical:
- if name not in canonicalNameToPids:
- canonicalNameToPids[name] = {pid}
- else:
- canonicalNameToPids[name].add(pid)
- if pid not in pidToNames:
- pidToNames[pid] = {name}
- else:
- pidToNames[pid].add(name)
- if preferredAlt:
- pidToPreferred[pid] = name
-with open(vnamesFile, newline="") as csvfile:
- reader = csv.reader(csvfile)
- lineNum = 0
- for row in reader:
- lineNum += 1
- if lineNum % 1e5 == 0:
- print(f"At line {lineNum}")
- # Skip header line
- if lineNum == 1:
- continue
- # Parse line
- pid = int(row[0])
- name1 = re.sub(r"<[^>]+>", "", row[1].lower()) # Remove tags
- name2 = html.unescape(row[2]).lower()
- lang = row[3]
- preferred = row[6] == "preferred"
- # Add to maps
- updateMaps(name1, pid, True, False)
- if lang == "eng" and name2 != "":
- updateMaps(name2, pid, False, preferred)
-
-print("Checking for manually-picked pids")
-nameToPickedPid = {}
-if os.path.exists(pickedIdsFile):
- with open(pickedIdsFile) as file:
- for line in file:
- (name, _, eolId) = line.rstrip().partition("|")
- nameToPickedPid[name] = None if eolId == "" else int(eolId)
-print(f"Found {len(nameToPickedPid)}")
-
-print("Checking for alt-names to skip")
-nameToAltsToSkip = {}
-numToSkip = 0
-if os.path.exists(altsToSkipFile):
- with open(altsToSkipFile) as file:
- for line in file:
- (name, _, altName) = line.rstrip().partition("|")
- if name not in nameToAltsToSkip:
- nameToAltsToSkip[name] = [altName]
- else:
- nameToAltsToSkip[name].append(altName)
- numToSkip += 1
-print(f"Found {numToSkip} alt-names to skip")
-
-print("Creating database tables")
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-dbCur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, pref_alt INT, src TEXT, PRIMARY KEY(name, alt_name))")
-dbCur.execute("CREATE INDEX names_idx ON names(name)")
-dbCur.execute("CREATE INDEX names_alt_idx ON names(alt_name)")
-dbCur.execute("CREATE INDEX names_alt_idx_nc ON names(alt_name COLLATE NOCASE)")
-dbCur.execute("CREATE TABLE eol_ids(id INT PRIMARY KEY, name TEXT)")
-dbCur.execute("CREATE INDEX eol_name_idx ON eol_ids(name)")
-
-print("Associating nodes with names")
-usedPids = set()
-unresolvedNodeNames = set()
-dbCur2 = dbCon.cursor()
-def addToDb(nodeName, pidToUse):
- " Adds page-ID-associated name data to a node in the database "
- global dbCur, pidToPreferred
- dbCur.execute("INSERT INTO eol_ids VALUES (?, ?)", (pidToUse, nodeName))
- # Get alt-names
- altNames = set()
- for n in pidToNames[pidToUse]:
- # Avoid alt-names with >3 words
- if len(n.split(" ")) > 3:
- continue
- # Avoid alt-names that already name a node in the database
- if dbCur.execute("SELECT name FROM nodes WHERE name = ?", (n,)).fetchone() != None:
- continue
- # Check for picked alt-name-to-skip
- if nodeName in nameToAltsToSkip and n in nameToAltsToSkip[nodeName]:
- print(f"Excluding alt-name {n} for node {nodeName}")
- continue
- #
- altNames.add(n)
- # Add alt-names to db
- preferredName = pidToPreferred[pidToUse] if (pidToUse in pidToPreferred) else None
- for n in altNames:
- isPreferred = 1 if (n == preferredName) else 0
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'eol')", (nodeName, n, isPreferred))
-print("Adding picked IDs")
-for (name, pid) in nameToPickedPid.items():
- if pid != None:
- addToDb(name, pid)
- usedPids.add(pid)
-print("Associating nodes with canonical names")
-iterNum = 0
-for (nodeName,) in dbCur2.execute("SELECT name FROM nodes"):
- iterNum += 1
- if iterNum % 1e5 == 0:
- print(f"At iteration {iterNum}")
- if nodeName in nameToPickedPid:
- continue
- # Check for matching canonical name
- if nodeName in canonicalNameToPids:
- pidToUse = None
- # Pick an associated page ID
- for pid in canonicalNameToPids[nodeName]:
- hasLowerPrio = pid not in pidToPreferred and pidToUse in pidToPreferred
- hasHigherPrio = pid in pidToPreferred and pidToUse not in pidToPreferred
- if hasLowerPrio:
- continue
- if pid not in usedPids and (pidToUse == None or pid < pidToUse or hasHigherPrio):
- pidToUse = pid
- if pidToUse != None:
- addToDb(nodeName, pidToUse)
- usedPids.add(pidToUse)
- elif nodeName in nameToPids:
- unresolvedNodeNames.add(nodeName)
-print("Associating leftover nodes with other names")
-iterNum = 0
-for nodeName in unresolvedNodeNames:
- iterNum += 1
- if iterNum % 100 == 0:
- print(f"At iteration {iterNum}")
- # Check for matching name
- pidToUse = None
- for pid in nameToPids[nodeName]:
- # Pick an associated page ID
- if pid not in usedPids and (pidToUse == None or pid < pidToUse):
- pidToUse = pid
- if pidToUse != None:
- addToDb(nodeName, pidToUse)
- usedPids.add(pidToUse)
-
-print("Closing database")
-dbCon.commit()
-dbCon.close()