about summary refs log tree commit diff
path: root/backend/data/genEnwikiDescData.py
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data/genEnwikiDescData.py')
-rwxr-xr-x backend/data/genEnwikiDescData.py 33
1 files changed, 30 insertions, 3 deletions
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index 57e4194..3e11871 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -13,12 +13,30 @@ if len(sys.argv) > 1:
enwikiDb = "enwiki/enwikiData.db"
dbFile = "data.db"
+namesToSkipFile = "genEnwikiDescNamesToSkip.txt" # Optional file: one node name per line to exclude
+titlesToUseFile = "genEnwikiDescTitlesToUse.txt" # Optional file: one enwiki title per line to use
+titleToUseRegex = re.compile(r"(.*) \(.*\)") # Group 1 is the text before a " (...)" part
# Open dbs
enwikiCon = sqlite3.connect(enwikiDb)
enwikiCur = enwikiCon.cursor()
dbCon = sqlite3.connect(dbFile)
dbCur = dbCon.cursor()
+# Read name/title files (each is skipped silently if it does not exist)
+namesToSkip = set()
+nameToPickedTitle = {} # Maps lower-cased names to titles to be used for them
+if os.path.exists(namesToSkipFile):
+ with open(namesToSkipFile) as file:
+ for line in file:
+ namesToSkip.add(line.rstrip())
+ print(f"Read in {len(namesToSkip)} names to skip")
+if os.path.exists(titlesToUseFile):
+ with open(titlesToUseFile) as file:
+ for line in file:
+ title = line.rstrip()
+ name = titleToUseRegex.sub(r"\1", title) # Remove parens
+ nameToPickedTitle[name.lower()] = title
+print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
# Get node names without descriptions
print("Getting node names")
nodeNames = set()
@@ -26,6 +44,7 @@ query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name
for row in dbCur.execute(query):
nodeNames.add(row[0])
print(f"Found {len(nodeNames)} names")
+nodeNames.difference_update(namesToSkip) # Drop skip-listed names; the count printed above still includes them
# Find page id for each node name
print("Getting node page-ids")
nodeToPageId = {}
@@ -35,9 +54,17 @@ for name in nodeNames:
if iterNum % 1e4 == 0:
print(f"At iteration {iterNum}")
#
- row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
- if row != None:
- nodeToPageId[name] = row[0]
+ if name not in nameToPickedTitle: # NOTE(review): dict keys are lower-cased; assumes node names are too - confirm
+ row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
+ if row is not None:
+ nodeToPageId[name] = row[0]
+ else: # A title was picked for this name: look it up as written, without COLLATE NOCASE
+ title = nameToPickedTitle[name]
+ row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone()
+ if row is not None:
+ nodeToPageId[name] = row[0]
+ else:
+ print(f"WARNING: Picked title {title} not found", file=sys.stderr)
# Resolve redirects
print("Resolving redirects")
redirectingNames = set()