diff options
| author | Terry Truong <terry06890@gmail.com> | 2022-05-31 23:20:44 +1000 |
|---|---|---|
| committer | Terry Truong <terry06890@gmail.com> | 2022-06-01 00:37:33 +1000 |
| commit | 115a5cfd201a15477323f207585ae12e81f070fb (patch) | |
| tree | 9087edd0d956e201518993033b87aa701b54d9df /backend/data/genEnwikiDescData.py | |
| parent | 4927ca46897ce8983f92572df12f4d6ea86ca376 (diff) | |
Add script/instructions for filtering out mismatching node descs
Diffstat (limited to 'backend/data/genEnwikiDescData.py')
| -rwxr-xr-x | backend/data/genEnwikiDescData.py | 33 |
1 file changed, 30 insertions, 3 deletions
```diff
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index 57e4194..3e11871 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -13,12 +13,30 @@ if len(sys.argv) > 1:
 
 enwikiDb = "enwiki/enwikiData.db"
 dbFile = "data.db"
+namesToSkipFile = "genEnwikiDescNamesToSkip.txt"
+titlesToUseFile = "genEnwikiDescTitlesToUse.txt"
+titleToUseRegex = re.compile(r"(.*) \(.*\)")
 
 # Open dbs
 enwikiCon = sqlite3.connect(enwikiDb)
 enwikiCur = enwikiCon.cursor()
 dbCon = sqlite3.connect(dbFile)
 dbCur = dbCon.cursor()
+# Read name/title files
+namesToSkip = set()
+nameToPickedTitle = {} # Maps names to titles to be used for them
+if os.path.exists(namesToSkipFile):
+    with open(namesToSkipFile) as file:
+        for line in file:
+            namesToSkip.add(line.rstrip())
+    print("Read in {len(namesToSkip)} names to skip")
+if os.path.exists(titlesToUseFile):
+    with open(titlesToUseFile) as file:
+        for line in file:
+            title = line.rstrip()
+            name = titleToUseRegex.sub(r"\1", title) # Remove parens
+            nameToPickedTitle[name.lower()] = title
+print("Read in {len(titlesToUse)} titles to use for certain names")
 # Get node names without descriptions
 print("Getting node names")
 nodeNames = set()
@@ -26,6 +44,7 @@ query = "SELECT nodes.name FROM nodes LEFT JOIN descs ON nodes.name = descs.name
 for row in dbCur.execute(query):
     nodeNames.add(row[0])
 print(f"Found {len(nodeNames)} names")
+nodeNames.difference_update(namesToSkip)
 # Find page id for each node name
 print("Getting node page-ids")
 nodeToPageId = {}
@@ -35,9 +54,17 @@ for name in nodeNames:
     if iterNum % 1e4 == 0:
         print(f"At iteration {iterNum}")
     #
-    row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
-    if row != None:
-        nodeToPageId[name] = row[0]
+    if name not in nameToPickedTitle:
+        row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ? COLLATE NOCASE", (name,)).fetchone()
+        if row != None:
+            nodeToPageId[name] = row[0]
+    else:
+        title = nameToPickedTitle[name]
+        row = enwikiCur.execute("SELECT id FROM pages WHERE pages.title = ?", (title,)).fetchone()
+        if row != None:
+            nodeToPageId[name] = row[0]
+        else:
+            print("WARNING: Picked title {title} not found", file=sys.stderr)
 # Resolve redirects
 print("Resolving redirects")
 redirectingNames = set()
```
