diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-07-11 01:54:08 +1000 |
| commit | 5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch) | |
| tree | 3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/data/genEnwikiNameData.py | |
| parent | a8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff) | |
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or
symlinked from, public/. This needed to be changed before each
build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/data/genEnwikiNameData.py')
| -rwxr-xr-x | backend/data/genEnwikiNameData.py | 76 |
1 files changed, 0 insertions, 76 deletions
```diff
diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py
deleted file mode 100755
index 7ad61d1..0000000
--- a/backend/data/genEnwikiNameData.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re
-import sqlite3
-
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Reads from a database containing data from Wikipdia, along with
-node and wiki-id information from the database, and use wikipedia
-page-redirect information to add additional alt-name data.
-"""
-if len(sys.argv) > 1:
-    print(usageInfo, file=sys.stderr)
-    sys.exit(1)
-
-enwikiDb = "enwiki/descData.db"
-dbFile = "data.db"
-altNameRegex = re.compile(r"[a-zA-Z]+")
-    # Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)',
-
-print("Opening databases")
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Getting nodes with wiki IDs")
-nodeToWikiId = {}
-for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"):
-    nodeToWikiId[nodeName] = wikiId
-print(f"Found {len(nodeToWikiId)}")
-
-print("Iterating through nodes, finding names that redirect to them")
-nodeToAltNames = {}
-numAltNames = 0
-iterNum = 0
-for (nodeName, wikiId) in nodeToWikiId.items():
-    iterNum += 1
-    if iterNum % 1e4 == 0:
-        print(f"At iteration {iterNum}")
-    #
-    nodeToAltNames[nodeName] = set()
-    query = "SELECT p1.title FROM pages p1" \
-        " INNER JOIN redirects r1 ON p1.id = r1.id" \
-        " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
-    for (name,) in enwikiCur.execute(query, (wikiId,)):
-        if altNameRegex.fullmatch(name) != None and name.lower() != nodeName:
-            nodeToAltNames[nodeName].add(name.lower())
-            numAltNames += 1
-print(f"Found {numAltNames} alt-names")
-
-print("Excluding existing alt-names from the set")
-query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
-iterNum = 0
-for (nodeName, altNames) in nodeToAltNames.items():
-    iterNum += 1
-    if iterNum % 1e4 == 0:
-        print(f"At iteration {iterNum}")
-    #
-    existingNames = set()
-    for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
-        existingNames.add(name)
-    numAltNames -= len(existingNames)
-    altNames.difference_update(existingNames)
-print(f"Left with {numAltNames} alt-names")
-
-print("Adding alt-names to database")
-for (nodeName, altNames) in nodeToAltNames.items():
-    for altName in altNames:
-        dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-enwikiCon.close()
```
