about | summary | refs | log | tree | commit | diff
path: root/backend/data
diff options
context:
space:
mode:
Diffstat (limited to 'backend/data')
-rwxr-xr-x backend/data/enwiki/downloadImgLicenseInfo.py 2
-rwxr-xr-x backend/data/enwiki/getEnwikiImgData.py 16
-rwxr-xr-x backend/data/genEnwikiDescData.py 10
3 files changed, 17 insertions, 11 deletions
diff --git a/backend/data/enwiki/downloadImgLicenseInfo.py b/backend/data/enwiki/downloadImgLicenseInfo.py
index 5d99573..8231fbb 100755
--- a/backend/data/enwiki/downloadImgLicenseInfo.py
+++ b/backend/data/enwiki/downloadImgLicenseInfo.py
@@ -33,7 +33,7 @@ if dbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='im
# Get image names
print("Reading image names")
imgNames = set()
-for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs"):
+for (imgName,) in dbCur.execute("SELECT DISTINCT img_name FROM page_imgs WHERE img_name NOT NULL"):
imgNames.add(imgName)
print(f"Found {len(imgNames)} images")
oldSz = len(imgNames)
diff --git a/backend/data/enwiki/getEnwikiImgData.py b/backend/data/enwiki/getEnwikiImgData.py
index f9680ff..f8bb2ee 100755
--- a/backend/data/enwiki/getEnwikiImgData.py
+++ b/backend/data/enwiki/getEnwikiImgData.py
@@ -34,11 +34,20 @@ indexDbCur = indexDbCon.cursor()
imgDbCon = sqlite3.connect(imgDb)
imgDbCur = imgDbCon.cursor()
# Create image-db table
-imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)")
-imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+pidsDone = set()
+if imgDbCur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='page_imgs'").fetchone() == None:
+ imgDbCur.execute("CREATE TABLE page_imgs (page_id INT PRIMARY KEY, img_name TEXT)") # img_name may be NULL
+ imgDbCur.execute("CREATE INDEX page_imgs_idx ON page_imgs(img_name)")
+else:
+ for (pid,) in imgDbCur.execute("SELECT page_id FROM page_imgs"):
+ pidsDone.add(pid)
+ print(f"Will skip {len(pidsDone)} already-processed page-ids")
# Get input pageIds
print("Getting input page-ids", file=sys.stderr)
pageIds = getInputPageIds()
+for pid in pidsDone:
+ pageIds.remove(pid)
+print(f"Found {len(pageIds)} page-ids to process")
# Get page-id dump-file offsets
print("Getting dump-file offsets", file=sys.stderr)
offsetToPageids = {}
@@ -156,8 +165,7 @@ with open(dumpFile, mode='rb') as file:
content.append(line[:line.rfind("</text>")])
# Look for image-filename
imageName = getImageName(content)
- if imageName != None:
- imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
+ imgDbCur.execute("INSERT into page_imgs VALUES (?, ?)", (pageId, imageName))
break
if not foundTextEnd:
print(f"Did not find </text> for page id {pageId}", file=sys.stderr)
diff --git a/backend/data/genEnwikiDescData.py b/backend/data/genEnwikiDescData.py
index debd3fd..2396540 100755
--- a/backend/data/genEnwikiDescData.py
+++ b/backend/data/genEnwikiDescData.py
@@ -14,8 +14,7 @@ if len(sys.argv) > 1:
enwikiDb = "enwiki/enwikiData.db"
dbFile = "data.db"
namesToSkipFile = "genDescNamesToSkip.txt"
-titlesToUseFile = "genEnwikiDescTitlesToUse.txt"
-titleToUseRegex = re.compile(r"(.*) \(.*\)")
+pickedLabelsFile = "enwikiPickedLabels.txt"
# Open dbs
enwikiCon = sqlite3.connect(enwikiDb)
@@ -30,11 +29,10 @@ if os.path.exists(namesToSkipFile):
for line in file:
namesToSkip.add(line.rstrip())
print(f"Read in {len(namesToSkip)} names to skip")
-if os.path.exists(titlesToUseFile):
- with open(titlesToUseFile) as file:
+if os.path.exists(pickedLabelsFile):
+ with open(pickedLabelsFile) as file:
for line in file:
- title = line.rstrip()
- name = titleToUseRegex.sub(r"\1", title) # Remove parens
+ (name, _, title) = line.rstrip().partition("|")
nameToPickedTitle[name.lower()] = title
print(f"Read in {len(nameToPickedTitle)} titles to use for certain names")
# Get node names without descriptions