diff options
Diffstat (limited to 'backend/data')
| -rwxr-xr-x | backend/data/enwiki/downloadImgLicenseInfo.py | 2 | ||||
| -rwxr-xr-x | backend/data/enwiki/getEnwikiImgData.py | 16 | ||||
| -rwxr-xr-x | backend/data/genEnwikiDescData.py | 10 |
3 files changed, 17 insertions, 11 deletions
diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py index 5d99573..8231fbb 100755 --- a/backend/data/enwiki/downloadImgLicenseInfo.py +++ b/backend/data/enwiki/downloadImgLicenseInfo.py @@ -33,7 +33,7 @@ if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='im # Get image names print("Reading image names") imgNames = set() -for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs"): +for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"): imgNames.add(imgName) print(f"Found {len(imgNames)} images") oldSz = len(imgNames) diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py index f9680ff..f8bb2ee 100755 --- a/backend/data/enwiki/getEnwikiImgData.py +++ b/backend/data/enwiki/getEnwikiImgData.py @@ -34,11 +34,20 @@ indexDbCur = indexDbCon.cursor() imgDbCon = sqlite3.connect(imgDb) imgDbCur = imgDbCon.cursor() # Create image-db table -imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") -imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +pidsDone = set() +if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None: + imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL + imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)") +else: + for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"): + pidsDone.add(pid) + print(f"Will skip {len(pidsDone)} already-processed page-ids") # Get input pageIds print("Getting input page-ids", file=sys.stderr) pageIds = getInputPageIds() +for pid in pidsDone: + pageIds.remove(pid) +print(f"Found {len(pageIds)} page-ids to process") # Get page-id dump-file offsets print("Getting dump-file offsets", file=sys.stderr) offsetToPageids = {} @@ -156,8 +165,7 @@ with open(dumpFile, mode='rb') as file: content.append(line[:line.rfind("</text>")]) # Look for image-filename imageName = getImageName(content) - if imageName != None: - imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName)) + imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName)) break if not foundTextEnd: print(f"Did not find </text> for page id {pageId}", file=sys.stderr) diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py index debd3fd..2396540 100755 --- a/backend/data/genEnwikiDescData.py +++ b/backend/data/genEnwikiDescData.py @@ -14,8 +14,7 @@ if len(sys.argv) > 1: enwikiDb = "enwiki/enwikiData.db" dbFile = "data.db" namesToSkipFile = "genDescNamesToSkip.txt" -titlesToUseFile = "genEnwikiDescTitlesToUse.txt" -titleToUseRegex = re.compile(r"(.*) \(.*\)") +pickedLabelsFile = "enwikiPickedLabels.txt" # Open dbs enwikiCon = sqlite3.connect(enwikiDb) @@ -30,11 +29,10 @@ if os.path.exists(namesToSkipFile): for line in file: namesToSkip.add(line.rstrip()) print(f"Read in {len(namesToSkip)} names to skip") -if os.path.exists(titlesToUseFile): - with open(titlesToUseFile) as file: +if os.path.exists(pickedLabelsFile): + with open(pickedLabelsFile) as file: for line in file: - title = line.rstrip() - name = titleToUseRegex.sub(r"\1", title) # Remove parens + (name, _, title) = line.rstrip().partition("|") nameToPickedTitle[name.lower()] = title print(f"Read in {len(nameToPickedTitle)} titles to use for certain names") # Get node names without descriptions |
