about | summary | refs | log | tree | commit | diff
path: root/backend/data/genEnwikiDescData.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-06-19 14:50:38 +1000
committer Terry Truong <terry06890@gmail.com> 2022-06-19 14:50:38 +1000
commit 19c5a2b6c57b8c4245bb07773caa8df598b3feec (patch)
tree e944f66fff51448e75677982c3b3118856aeeb18 /backend/data/genEnwikiDescData.py
parent 8c8ed28c87e649f163aaa54899f4b00c9fc31224 (diff)
Make manual enwiki-desc-associations more flexible
Also enable getEnwikiImgData.py to skip already-processed wiki-ids
Diffstat (limited to 'backend/data/genEnwikiDescData.py')
-rwxr-xr-x backend/data/genEnwikiDescData.py | 10
1 files changed, 4 insertions, 6 deletions
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index debd3fd..2396540 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -14,8 +14,7 @@ if len(sys.argv) > 1:
enwikiDb = "enwiki/enwikiData.db"
dbFile = "data.db"
namesToSkipFile = "genDescNamesToSkip.txt"
-titlesToUseFile = "genEnwikiDescTitlesToUse.txt"
-titleToUseRegex = re.compile(r"(.*) \(.*\)")
+pickedLabelsFile = "enwikiPickedLabels.txt"
# Open dbs
enwikiCon = sqlite3.connect(enwikiDb)
@@ -30,11 +29,10 @@ if os.path.exists(namesToSkipFile):
for line in file:
namesToSkip.add(line.rstrip())
print(f"Read in {len(namesToSkip)} names to skip")
-if os.path.exists(titlesToUseFile):
- with open(titlesToUseFile) as file:
+if os.path.exists(pickedLabelsFile):
+ with open(pickedLabelsFile) as file:
for line in file:
- title = line.rstrip()
- name = titleToUseRegex.sub(r"\1", title) # Remove parens
+ (name, _, title) = line.rstrip().partition("|")
nameToPickedTitle[name.lower()] = title
print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
# Get node names without descriptions