diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-04-27 11:28:44 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-04-27 11:28:44 +1000 |
| commit | 565495b1153c87cbf907de31d116c5f89bcffc2a (patch) | |
| tree | d94f95249ee0375bfe2bfe0e064f24924d5ce1fe /backend | |
| parent | 6f52cd6b07970010c40270003d63aa74f84f6ae9 (diff) | |
Make generated EOL-names-data have unique eol_id for each tolnode name
Diffstat (limited to 'backend')
| -rwxr-xr-x | backend/data/eolNamesToSqlite.py | 63 |
1 files changed, 50 insertions, 13 deletions
diff --git a/backend/data/eolNamesToSqlite.py b/backend/data/eolNamesToSqlite.py
index 4013a4c..1df1c23 100755
--- a/backend/data/eolNamesToSqlite.py
+++ b/backend/data/eolNamesToSqlite.py
@@ -7,16 +7,24 @@ vnamesFile = "eol/vernacular_names.csv"
 dbFile = "data.db"
 
 # Read in vernacular-names data
+	# Note: Canonical-names may have multiple pids
+	# Note: A canonical-name's associated pids might all have other associated names
 nameToPids = {}
 pidToNames = {}
-def updateMaps(name, pid):
+canonicalNameToPids = {}
+def updateMaps(name, pid, canonical):
 	if name not in nameToPids:
 		nameToPids[name] = {pid}
-	elif pid not in nameToPids[name]:
+	else:
 		nameToPids[name].add(pid)
+	if canonical:
+		if name not in canonicalNameToPids:
+			canonicalNameToPids[name] = {pid}
+		else:
+			canonicalNameToPids[name].add(pid)
 	if pid not in pidToNames:
 		pidToNames[pid] = {name}
-	elif name not in pidToNames[pid]:
+	else:
 		pidToNames[pid].add(name)
 with open(vnamesFile, newline="") as csvfile:
 	reader = csv.reader(csvfile)
@@ -29,28 +37,57 @@ with open(vnamesFile, newline="") as csvfile:
 		name1 = re.sub(r"<[^>]+>", "", row[1].lower())
 		name2 = row[2].lower()
 		# Add to maps
-		updateMaps(name1, pid)
-		updateMaps(name2, pid)
+		updateMaps(name1, pid, True)
+		updateMaps(name2, pid, False)
 # Open db connection
 dbCon = sqlite3.connect(dbFile)
 cur = dbCon.cursor()
 # Create 'names' table
 cur.execute("CREATE TABLE names(name TEXT, alt_name TEXT, eol_id INT, PRIMARY KEY(name, alt_name))")
-# Iterate through 'nodes' table
+# Iterate through 'nodes' table, resolving to canonical-names
+usedPids = set()
+unresolvedNodeNames = set()
 cur2 = dbCon.cursor()
 iterationNum = 0
 for row in cur2.execute("SELECT name FROM nodes"):
 	name = row[0]
 	iterationNum += 1
 	if iterationNum % 10000 == 0:
-		print("Iteration {}".format(iterationNum))
-	# If name matches a vernacular-names name, add alt-name entries to the 'names' table
-	if name in nameToPids:
-		altNames = {name}
-		for pid in nameToPids[name]:
-			for n in pidToNames[pid]:
+		print("Loop 1 iteration {}".format(iterationNum))
+	# If name matches a canonical-name, add alt-name entries to 'names' table
+	if name in canonicalNameToPids:
+		pidToUse = 0
+		for pid in canonicalNameToPids[name]:
+			if pid not in usedPids:
+				pidToUse = pid
+				break
+		if pidToUse > 0:
+			usedPids.add(pidToUse)
+			altNames = {name}
+			for n in pidToNames[pidToUse]:
 				altNames.add(n)
+			for n in altNames:
+				cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pidToUse))
+	elif name in nameToPids:
+		unresolvedNodeNames.add(name)
+# Iterate through unresolved nodes, resolving to vernacular-names
+iterationNum = 0
+for name in unresolvedNodeNames:
+	iterationNum += 1
+	if iterationNum % 10000 == 0:
+		print("Loop 2 iteration {}".format(iterationNum))
+	# Add alt-name entries to 'names' table for first corresponding pid
+	pidToUse = 0
+	for pid in nameToPids[name]:
+		if pid not in usedPids:
+			pidToUse = pid
+			break
+	if pidToUse > 0:
+		usedPids.add(pidToUse)
+		altNames = {name}
+		for n in pidToNames[pidToUse]:
+			altNames.add(n)
 		for n in altNames:
-			cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pid))
+			cur.execute("INSERT INTO names VALUES (?, ?, ?)", (name, n, pidToUse))
 dbCon.commit()
 dbCon.close()
