about | summary | refs | log | tree | commit | diff
path: root/backend/data/enwiki/getEnwikiImgData.py
diff options
context:
space:
mode:
author Terry Truong <terry06890@gmail.com> 2022-06-19 14:50:38 +1000
committer Terry Truong <terry06890@gmail.com> 2022-06-19 14:50:38 +1000
commit 19c5a2b6c57b8c4245bb07773caa8df598b3feec (patch)
tree e944f66fff51448e75677982c3b3118856aeeb18 /backend/data/enwiki/getEnwikiImgData.py
parent 8c8ed28c87e649f163aaa54899f4b00c9fc31224 (diff)
Make manual enwiki-desc-associations more flexible
Also enable getEnwikiImgData.py to skip already-processed wiki-ids
Diffstat (limited to 'backend/data/enwiki/getEnwikiImgData.py')
-rwxr-xr-x backend/data/enwiki/getEnwikiImgData.py 16
1 files changed, 12 insertions, 4 deletions
diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py
index f9680ff..f8bb2ee 100755
--- a/backend/data/enwiki/getEnwikiImgData.py
+++ b/backend/data/enwiki/getEnwikiImgData.py
@@ -34,11 +34,20 @@ indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
# Create image-db table
-imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)")
-imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+pidsDone = set()
+if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
+ imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+else:
+ for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
+ pidsDone.add(pid)
+ print(f"Will skip {len(pidsDone)} already-processed page-ids")
# Get input pageIds
print("Getting input page-ids", file=sys.stderr)
pageIds = getInputPageIds()
+for pid in pidsDone:
+ pageIds.remove(pid)
+print(f"Found {len(pageIds)} page-ids to process")
# Get page-id dump-file offsets
print("Getting dump-file offsets", file=sys.stderr)
offsetToPageids = {}
@@ -156,8 +165,7 @@ with open(dumpFile, mode='rb') as file:
content.append(line[:line.rfind("</text>")])
# Look for image-filename
imageName = getImageName(content)
- if imageName != None:
- imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
+ imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
break
if not foundTextEnd:
print(f"Did not find </text> for page id {pageId}", file=sys.stderr)