From 43d033d48067142f2baffce30fe3c80067711503 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 14:52:01 +0200 Subject: [PATCH] fix: apply skip_empty after pagination in MemoryDatasetClient.get_data The skip_empty filter was applied before the offset/limit slice, so the same query returned different items than FileSystemDatasetClient.get_data and even MemoryDatasetClient.iterate_items. Paginate first and filter the sliced page to keep all storage clients consistent. --- src/crawlee/storage_clients/_memory/_dataset_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 0028292e63..85206d4186 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -171,10 +171,6 @@ async def get_data( total = len(self._records) items = self._records.copy() - # Apply skip_empty filter if requested - if skip_empty: - items = [item for item in items if item] - # Apply sorting if desc: items = list(reversed(items)) @@ -182,6 +178,10 @@ async def get_data( # Apply pagination sliced_items = items[offset : (offset + limit) if limit is not None else total] + # Apply skip_empty filter if requested + if skip_empty: + sliced_items = [item for item in sliced_items if item] + await self._update_metadata(update_accessed_at=True) return DatasetItemsListPage(