aboutsummaryrefslogtreecommitdiff
path: root/backend/data/genEnwikiNameData.py
diff options
context:
space:
mode:
authorTerry Truong <terry06890@gmail.com>2022-07-11 01:54:08 +1000
committerTerry Truong <terry06890@gmail.com>2022-07-11 01:54:08 +1000
commit5fe71ea7b9d9a5d2dc6e8e5ce5b9193629eed74d (patch)
tree3b8b9d7299540a812ec93e224f8fc71249a98860 /backend/data/genEnwikiNameData.py
parenta8f80a02b88055cfcb45664ce3a3d24c2b2da98c (diff)
Make backend dev server script serve the image files
Previously, image files in backend/data/img were moved to, or symlinked from, public/. This needed to be changed before each build, otherwise vite would end up copying gigabytes of images.
Diffstat (limited to 'backend/data/genEnwikiNameData.py')
-rwxr-xr-xbackend/data/genEnwikiNameData.py76
1 files changed, 0 insertions, 76 deletions
diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py
deleted file mode 100755
index 7ad61d1..0000000
--- a/backend/data/genEnwikiNameData.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/python3
-
-import sys, re
-import sqlite3
-
-usageInfo = f"""
-Usage: {sys.argv[0]}
-
-Reads from a database containing data from Wikipdia, along with
-node and wiki-id information from the database, and use wikipedia
-page-redirect information to add additional alt-name data.
-"""
-if len(sys.argv) > 1:
- print(usageInfo, file=sys.stderr)
- sys.exit(1)
-
-enwikiDb = "enwiki/descData.db"
-dbFile = "data.db"
-altNameRegex = re.compile(r"[a-zA-Z]+")
- # Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)',
-
-print("Opening databases")
-enwikiCon = sqlite3.connect(enwikiDb)
-enwikiCur = enwikiCon.cursor()
-dbCon = sqlite3.connect(dbFile)
-dbCur = dbCon.cursor()
-
-print("Getting nodes with wiki IDs")
-nodeToWikiId = {}
-for (nodeName, wikiId) in dbCur.execute("SELECT name, id from wiki_ids"):
- nodeToWikiId[nodeName] = wikiId
-print(f"Found {len(nodeToWikiId)}")
-
-print("Iterating through nodes, finding names that redirect to them")
-nodeToAltNames = {}
-numAltNames = 0
-iterNum = 0
-for (nodeName, wikiId) in nodeToWikiId.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- nodeToAltNames[nodeName] = set()
- query = "SELECT p1.title FROM pages p1" \
- " INNER JOIN redirects r1 ON p1.id = r1.id" \
- " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
- for (name,) in enwikiCur.execute(query, (wikiId,)):
- if altNameRegex.fullmatch(name) != None and name.lower() != nodeName:
- nodeToAltNames[nodeName].add(name.lower())
- numAltNames += 1
-print(f"Found {numAltNames} alt-names")
-
-print("Excluding existing alt-names from the set")
-query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
-iterNum = 0
-for (nodeName, altNames) in nodeToAltNames.items():
- iterNum += 1
- if iterNum % 1e4 == 0:
- print(f"At iteration {iterNum}")
- #
- existingNames = set()
- for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
- existingNames.add(name)
- numAltNames -= len(existingNames)
- altNames.difference_update(existingNames)
-print(f"Left with {numAltNames} alt-names")
-
-print("Adding alt-names to database")
-for (nodeName, altNames) in nodeToAltNames.items():
- for altName in altNames:
- dbCur.execute("INSERT INTO names VALUES (?, ?, ?, 'enwiki')", (nodeName, altName, 0))
-
-print("Closing databases")
-dbCon.commit()
-dbCon.close()
-enwikiCon.close()