From 385a6b43e45cbcf7d334a1e0fa830ce958d53af5 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 15:00:32 +0200 Subject: [PATCH] fix: retry sitemap fetching on error and raise when retries are exhausted The try/except in _fetch_and_process_sitemap wrapped the whole retry loop instead of its body, so any fetch error exited the loop after a single attempt and was silently swallowed, leaving callers with empty or partial results. Each attempt is now independently retried and the error is raised once all retries are exhausted. --- src/crawlee/_utils/sitemap.py | 19 ++++++----- tests/unit/_utils/test_sitemap.py | 55 ++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 2632ed98e5..74377f24d2 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -321,9 +321,9 @@ async def _fetch_and_process_sitemap( sitemap_url = source['url'] - try: - while retries_left > 0: - retries_left -= 1 + while retries_left > 0: + retries_left -= 1 + try: async with http_client.stream( sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout ) as response: @@ -372,12 +372,15 @@ async def _fetch_and_process_sitemap( yield result finally: parser.close() - break + break - except Exception as e: - if retries_left > 0: - logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}') - await asyncio.sleep(1) # Brief pause before retry + except Exception as e: + if retries_left > 0: + logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}') + await asyncio.sleep(1) # Brief pause before retry + else: + logger.exception(f'Failed to fetch sitemap {sitemap_url}, no retries left.') + raise class Sitemap: diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index 5f2005ca16..20fb761751 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -1,14 +1,19 @@ import base64 import gzip +from contextlib import asynccontextmanager from datetime import datetime -from typing import Any +from typing import TYPE_CHECKING, Any, cast from unittest.mock import AsyncMock, MagicMock +import pytest from yarl import URL from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap from crawlee.http_clients._base import HttpClient, HttpResponse +if TYPE_CHECKING: + from collections.abc import AsyncIterator + BASIC_SITEMAP = """ @@ -65,6 +70,30 @@ async def send_request(url: str, **_kwargs: Any) -> HttpResponse: return client +def _make_flaky_stream_client(body: bytes, *, fail_times: int) -> tuple[AsyncMock, list[int]]: + """Create a mock client whose `stream` fails with a network error for the first `fail_times` calls.""" + attempts: list[int] = [] + + @asynccontextmanager + async def stream(_url: str, **_kwargs: Any) -> 'AsyncIterator[HttpResponse]': + attempt = len(attempts) + 1 + attempts.append(attempt) + if attempt <= fail_times: + raise ConnectionError(f'Network error on attempt {attempt}') + + async def read_stream() -> 'AsyncIterator[bytes]': + yield body + + response = MagicMock(spec=HttpResponse) + response.headers = {'content-type': 'application/xml; charset=utf-8'} + response.read_stream = read_stream + yield cast('HttpResponse', response) + + client = AsyncMock(spec=HttpClient) + client.stream = stream + return client, attempts + + def compress_gzip(data: str) -> bytes: """Compress a string using gzip.""" return gzip.compress(data.encode()) @@ -267,6 +296,30 @@ async def test_sitemap_from_string() -> None: assert set(sitemap.urls) == BASIC_RESULTS +async def test_sitemap_fetch_retries_on_transient_error() -> None: + """Transient fetch errors are retried up to `sitemap_retries` times before giving up.""" + client, attempts = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=2) + + items = [ + item async for item in parse_sitemap([{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}], client) + ] + + assert len(attempts) == 3 + assert {item.loc for item in items} == BASIC_RESULTS + + +async def test_sitemap_fetch_raises_after_retries_exhausted() -> None: + """A persistent fetch error is raised to the caller once all retries are exhausted.""" + client, attempts = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=10) + + with pytest.raises(ConnectionError): + _ = [ + item async for item in parse_sitemap([{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}], client) + ] + + assert len(attempts) == 3 + + async def test_discover_sitemap_from_robots_txt() -> None: """Sitemap URL found in robots.txt is yielded.""" robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'