diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-26 13:22:36 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-05-26 13:22:36 +1000 |
| commit | 07397961bfb113bd9c03883f2b24e6d287f989ca (patch) | |
| tree | a5a4fc18b54689497eae85f269e9467e1a0068aa /backend | |
| parent | 2d67e54dc91708eaf89eca9dca27cec126f7f465 (diff) | |
Add some enwiki redirect data as alt-names
Diffstat (limited to 'backend')
| -rw-r--r-- | backend/data/README.md | 17 | ||||
| -rwxr-xr-x | backend/data/enwiki/genData.py | 1 | ||||
| -rwxr-xr-x | backend/data/genEnwikiDescData.py (renamed from backend/data/genEnwikiData.py) | 1 | ||||
| -rwxr-xr-x | backend/data/genEnwikiNameData.py | 71 |
4 files changed, 81 insertions, 9 deletions
diff --git a/backend/data/README.md b/backend/data/README.md index 8cfa960..d444e4f 100644 --- a/backend/data/README.md +++ b/backend/data/README.md @@ -21,14 +21,15 @@ File Generation Process which uses 'nodes', 'edges', 'eol_ids', and 'images', to associate nodes without images to child images. 4 Node Description Data - - Using DBpedia - 1 Obtain data in dbpedia/, as specified in it's README. - 2 Run genDbpData.py, which adds a 'descs' table to data.db, using - data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table. - - Supplementing with Wikipedia dump - 1 Obtain data in enwiki/, as specified in it's README. - 2 Run genEnwikiData.py, which adds to the 'descs' table, using data in - enwiki/enwikiData.db, and the 'nodes' table. + 1 Obtain data in dbpedia/, as specified in it's README. + 2 Run genDbpData.py, which adds a 'descs' table to data.db, using + data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table. +5 Supplementary Name/Description Data + 1 Obtain data in enwiki/, as specified in it's README. + 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in + enwiki/enwikiData.db, and the 'nodes' table. + 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in + enwiki/enwikiData.db, and the 'names' and 'descs' tables. 5 Reduced Tree Structure Data 1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables. 
#!/usr/bin/python3

import sys, re
import sqlite3

usageInfo = f"usage: {sys.argv[0]}\n"
usageInfo += "Reads Wikimedia enwiki redirect data from enwiki/, and node and wiki-id\n"
usageInfo += "data from a sqlite database, and adds supplementary alt-name data.\n"
if len(sys.argv) > 1:
    print(usageInfo, file=sys.stderr)
    sys.exit(1)

enwikiDb = "enwiki/enwikiData.db"  # source db: 'pages' and 'redirects' tables from the enwiki dump
dbFile = "data.db"                 # target db: has 'descs' and 'names' tables
# Only purely-alphabetic redirect titles are accepted as alt-names.
# Avoids names like 'Evolution of Elephants', 'Banana fiber', 'Fish (zoology)'.
altNameRegex = re.compile(r"[a-zA-Z]+")

# Open dbs
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()

# Map each node name to its wikipedia page id (from the 'descs' table)
print("Getting nodes with wiki IDs")
nodeToWikiId = {}
for (name, wikiId) in dbCur.execute("SELECT name, wiki_id from descs"):
    nodeToWikiId[name] = wikiId
print("Found {} nodes".format(len(nodeToWikiId)))

# For each node, collect titles of enwiki pages that redirect to its page
print("Finding redirecter names")
nodeToAltNames = {}
numAltNames = 0
iterNum = 0
query = "SELECT p1.title FROM pages p1" \
    " INNER JOIN redirects r1 ON p1.id = r1.id" \
    " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
for (nodeName, wikiId) in nodeToWikiId.items():
    iterNum += 1
    if iterNum % 10000 == 0:
        print("At iteration {}".format(iterNum))
    #
    nodeToAltNames[nodeName] = set()
    for (name,) in enwikiCur.execute(query, (wikiId,)):
        if altNameRegex.fullmatch(name) is not None:
            nodeToAltNames[nodeName].add(name.lower())
            numAltNames += 1
print("Found {} alt-names".format(numAltNames))

# Drop candidate alt-names that already exist in the 'names' table
print("Removing existing alt-names")
query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
iterNum = 0
for (nodeName, altNames) in nodeToAltNames.items():
    iterNum += 1
    if iterNum % 10000 == 0:
        print("At iteration {}".format(iterNum))
    if not altNames:
        # An empty set would produce 'alt_name IN ()', which is an SQLite syntax error
        continue
    existingNames = set()
    for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
        existingNames.add(name)
    numAltNames -= len(existingNames)
    altNames.difference_update(existingNames)
print("Left with {} alt-names".format(numAltNames))

# Insert remaining alt-names (third column presumably marks them as non-preferred — TODO confirm schema)
print("Adding alt-names")
for (nodeName, altNames) in nodeToAltNames.items():
    for altName in altNames:
        dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (nodeName, altName, 0))

# Close dbs
dbCon.commit()
dbCon.close()
enwikiCon.close()
