aboutsummaryrefslogtreecommitdiff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rw-r--r--backend/data/README.md17
-rwxr-xr-xbackend/data/enwiki/genData.py1
-rwxr-xr-xbackend/data/genEnwikiDescData.py (renamed from backend/data/genEnwikiData.py)1
-rwxr-xr-xbackend/data/genEnwikiNameData.py71
4 files changed, 81 insertions, 9 deletions
diff --git a/backend/data/README.md b/backend/data/README.md
index 8cfa960..d444e4f 100644
--- a/backend/data/README.md
+++ b/backend/data/README.md
@@ -21,14 +21,15 @@ File Generation Process
which uses 'nodes', 'edges', 'eol_ids', and 'images', to associate
nodes without images to child images.
4 Node Description Data
- - Using DBpedia
- 1 Obtain data in dbpedia/, as specified in it's README.
- 2 Run genDbpData.py, which adds a 'descs' table to data.db, using
- data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table.
- - Supplementing with Wikipedia dump
- 1 Obtain data in enwiki/, as specified in it's README.
- 2 Run genEnwikiData.py, which adds to the 'descs' table, using data in
- enwiki/enwikiData.db, and the 'nodes' table.
+ 1 Obtain data in dbpedia/, as specified in its README.
+ 2 Run genDbpData.py, which adds a 'descs' table to data.db, using
+ data in dbpedia/dbpData.db, dbpPickedLabels.txt, and the 'nodes' table.
+5 Supplementary Name/Description Data
+ 1 Obtain data in enwiki/, as specified in its README.
+ 2 Run genEnwikiDescData.py, which adds to the 'descs' table, using data in
+ enwiki/enwikiData.db, and the 'nodes' table.
+ 3 Run genEnwikiNameData.py, which adds to the 'names' table, using data in
+ enwiki/enwikiData.db, and the 'names' and 'descs' tables.
5 Reduced Tree Structure Data
1 Run genReducedTreeData.py, which adds 'r_nodes' and 'r_edges' tables to
data.db, using reducedTol/names.txt, and the 'nodes' and 'names' tables.
diff --git a/backend/data/enwiki/genData.py b/backend/data/enwiki/genData.py
index 4f0d62e..646292c 100755
--- a/backend/data/enwiki/genData.py
+++ b/backend/data/enwiki/genData.py
@@ -91,6 +91,7 @@ dbCur = dbCon.cursor()
dbCur.execute("CREATE TABLE pages (id INT PRIMARY KEY, title TEXT UNIQUE)")
dbCur.execute("CREATE INDEX pages_title_idx ON pages(title COLLATE NOCASE)")
dbCur.execute("CREATE TABLE redirects (id INT PRIMARY KEY, target TEXT)")
+dbCur.execute("CREATE INDEX redirects_idx ON redirects(target)")
dbCur.execute("CREATE TABLE descs (id INT PRIMARY KEY, desc TEXT)")
# Read through dump file
print("Reading dump file")
diff --git a/backend/data/genEnwikiData.py b/backend/data/genEnwikiDescData.py
index d33fd5d..40a6c92 100755
--- a/backend/data/genEnwikiData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -68,5 +68,4 @@ for (name, pageId) in nodeToPageId.items():
# Close dbs
dbCon.commit()
dbCon.close()
-enwikiCon.commit()
enwikiCon.close()
diff --git a/backend/data/genEnwikiNameData.py b/backend/data/genEnwikiNameData.py
new file mode 100755
index 0000000..dfed46c
--- /dev/null
+++ b/backend/data/genEnwikiNameData.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+
+import sys, re
+import sqlite3
+
+usageInfo = f"usage: {sys.argv[0]}\n"
+usageInfo += "Reads Wikimedia enwiki redirect data from enwiki/, and node and wiki-id\n"
+usageInfo += "data from a sqlite database, and adds supplementary alt-name data.\n"
+if len(sys.argv) > 1:
+ print(usageInfo, file=sys.stderr)
+ sys.exit(1)
+
+enwikiDb = "enwiki/enwikiData.db"
+dbFile = "data.db"
+altNameRegex = re.compile(r"[a-zA-Z]+")
+ # Only keeps single-word, purely alphabetic names; avoids names like
+ # 'Evolution of Elephants', 'Banana fiber', and 'Fish (zoology)'.
+
+# Open dbs
+enwikiCon = sqlite3.connect(enwikiDb)
+enwikiCur = enwikiCon.cursor()
+dbCon = sqlite3.connect(dbFile)
+dbCur = dbCon.cursor()
+# Get nodes with wiki-ids
+print("Getting nodes with wiki IDs")
+nodeToWikiId = {}
+for row in dbCur.execute("SELECT name, wiki_id from descs"):
+ nodeToWikiId[row[0]] = row[1]
+print("Found {} nodes".format(len(nodeToWikiId)))
+# Find wiki-ids that redirect to each node
+print("Finding redirecter names")
+nodeToAltNames = {}
+numAltNames = 0
+iterNum = 0
+for (nodeName, wikiId) in nodeToWikiId.items():
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ nodeToAltNames[nodeName] = set()
+ query = "SELECT p1.title FROM pages p1" \
+ " INNER JOIN redirects r1 ON p1.id = r1.id" \
+ " INNER JOIN pages p2 ON r1.target = p2.title WHERE p2.id = ?"
+ for (name,) in enwikiCur.execute(query, (wikiId,)):
+ if altNameRegex.fullmatch(name) != None:
+ nodeToAltNames[nodeName].add(name.lower())
+ numAltNames += 1
+print("Found {} alt-names".format(numAltNames))
+# Remove existing alt-names
+print("Removing existing alt-names")
+query = "SELECT alt_name FROM names WHERE alt_name IN ({})"
+iterNum = 0
+for (nodeName, altNames) in nodeToAltNames.items():
+ iterNum += 1
+ if iterNum % 1e4 == 0:
+ print("At iteration {}".format(iterNum))
+ #
+ existingNames = set()
+ for (name,) in dbCur.execute(query.format(",".join(["?"] * len(altNames))), list(altNames)):
+ existingNames.add(name)
+ numAltNames -= len(existingNames)
+ altNames.difference_update(existingNames)
+print("Left with {} alt-names".format(numAltNames))
+# Add alt-names
+print("Adding alt-names")
+for (nodeName, altNames) in nodeToAltNames.items():
+ for altName in altNames:
+ dbCur.execute("INSERT INTO names VALUES (?, ?, ?)", (nodeName, altName, 0))
+# Close dbs
+dbCon.commit()
+dbCon.close()
+enwikiCon.close()