diff --git a/CHANGELOG.md b/CHANGELOG.md index b2590b227e5..6d6c5d82762 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -88,6 +88,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828) * Ensured device aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835) * Fixed `dpnp.tensor.round` to use device-aware output dtype for boolean input [#2851](https://github.com/IntelPython/dpnp/pull/2851) +* Resolved a deadlock in `dpnp.linalg.qr` by releasing the GIL before the OneMKL `orgqr` call to prevent host task contention [#2850](https://github.com/IntelPython/dpnp/pull/2850) ### Security diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 2297d759ea8..09c2523fd48 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -87,8 +87,17 @@ static sycl::event orgqr_impl(sycl::queue &exec_q, sycl::event orgqr_event; try { + // Release GIL to avoid serialization of host task submissions + // to the same queue in OneMKL + py::gil_scoped_release lock{}; + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + // mkl_lapack::orgqr() is done through GPU-to-Host reverse offload: + // exec_q.submit([&](sycl::handler& cgh) { + // cgh.depends_on(depends); + // cgh.host_task([=]() { orgqr_host(...); }); + // }).wait(); orgqr_event = mkl_lapack::orgqr( exec_q, m, // The number of rows in the matrix; (0 ≤ m).