From 19c5a2b6c57b8c4245bb07773caa8df598b3feec Mon Sep 17 00:00:00 2001
From: Terry Truong
Date: Sun, 19 Jun 2022 14:50:38 +1000
Subject: Make manual enwiki-desc-associations more flexible

Also enable getEnwikiImgData.py to skip already-processed wiki-ids
---
 backend/data/genEnwikiDescData.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'backend/data/genEnwikiDescData.py')

diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index debd3fd..2396540 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -14,8 +14,7 @@ if len(sys.argv) > 1:
 enwikiDb = "enwiki/enwikiData.db"
 dbFile = "data.db"
 namesToSkipFile = "genDescNamesToSkip.txt"
-titlesToUseFile = "genEnwikiDescTitlesToUse.txt"
-titleToUseRegex = re.compile(r"(.*) \(.*\)")
+pickedLabelsFile = "enwikiPickedLabels.txt"
 
 # Open dbs
 enwikiCon = sqlite3.connect(enwikiDb)
@@ -30,11 +29,10 @@ if os.path.exists(namesToSkipFile):
         for line in file:
             namesToSkip.add(line.rstrip())
     print(f"Read in {len(namesToSkip)} names to skip")
-if os.path.exists(titlesToUseFile):
-    with open(titlesToUseFile) as file:
+if os.path.exists(pickedLabelsFile):
+    with open(pickedLabelsFile) as file:
         for line in file:
-            title = line.rstrip()
-            name = titleToUseRegex.sub(r"\1", title) # Remove parens
+            (name, _, title) = line.rstrip().partition("|")
             nameToPickedTitle[name.lower()] = title
     print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
 # Get node names without descriptions
-- 
cgit v1.2.3