diff --git a/CHANGELOG.md b/CHANGELOG.md index f8aaae542ec5..bf659a351a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.20.0] - MM/DD/2026 +This release introduces a major architectural change: the Array API-compliant tensor implementation has been migrated from `dpctl.tensor` into `dpnp.tensor`, simplifying maintenance, reducing cross-project dependencies, and allows the tensor implementation to evolve within `dpnp`. This release changes the license from `BSD-2-Clause` to `BSD-3-Clause`. This release achieves `dpnp` compatibility with Python 3.14 and enables distributing `dpnp` packages with the latest Python version. Also, that release drops support for Python 3.9, making Python 3.10 the minimum required version. @@ -28,6 +29,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595) * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787) * Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815) +* Migrated tensor implementation from `dpctl.tensor` into `dpnp.tensor`, making `dpnp` the primary owner of the Array API-compliant tensor layer [#2856](https://github.com/IntelPython/dpnp/pull/2856) ### Changed @@ -84,6 +86,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814) * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828) * Ensured device aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835) +* Fixed `dpnp.tensor.round` to use device-aware output dtype for boolean input [#2851](https://github.com/IntelPython/dpnp/pull/2851) ### Security diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh index 9d280274c784..907ebb68088f 100755 --- a/conda-recipe/build.sh +++ b/conda-recipe/build.sh @@ -1,5 +1,39 @@ #!/bin/bash +# Test reproducer: +echo "building ..." +icpx -fsycl --gcc-install-dir=$BUILD_PREFIX/lib/gcc/x86_64-conda-linux-gnu/14.3.0 --sysroot=$BUILD_PREFIX/x86_64-conda-linux-gnu/sysroot test_minimal.cpp -o test_minimal +echo "build is completed, run now ..." +./test_minimal +echo "run is done" + +echo "create tmp folder: $SRC_DIR/tmp" +mkdir -p $SRC_DIR/tmp +echo "run with dump enabled ..." +export SYCL_CACHE_DISABLE=1 +export IGC_ShaderDumpEnable=1 +export IGC_ShaderDumpEnableAll=1 +export IGC_DumpToCustomDir=$SRC_DIR/tmp/ +./test_minimal + +echo "waiting for .asm files..." +timeout=5 +while [ $timeout -gt 0 ]; do + if find $SRC_DIR/tmp -name "*.asm" -print -quit | grep -q .; then + echo "found .asm files" + break + fi + sleep 1 + ((timeout--)) +done + +echo "list files..." +ls -la $SRC_DIR/tmp +echo "print dump:" +find $SRC_DIR/tmp -name "*.asm" +find $SRC_DIR/tmp -name "*.asm" | head -n 1 | xargs -r cat +echo "test is complete" + # This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix export LIBRARY_PATH="$LIBRARY_PATH:${BUILD_PREFIX}/lib" diff --git a/test_minimal.cpp b/test_minimal.cpp new file mode 100644 index 000000000000..efd41b54c5d2 --- /dev/null +++ b/test_minimal.cpp @@ -0,0 +1,321 @@ +// Minimal reproducer +// +// Build: icpx -fsycl --gcc-install-dir=$CONDA_PREFIX/lib/gcc/x86_64-conda-linux-gnu/14.3.0 --sysroot=$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot test_minimal.cpp -o test_minimal +// Run: ./test_minimal + +#include +#include +#include +#include + +using namespace sycl; + +// Print detailed device information +void print_device_info(const device& dev) { + std::cout << "========================================" << std::endl; + std::cout << "DEVICE INFORMATION" << std::endl; + std::cout << "========================================" << std::endl; + std::cout << std::endl; + + std::cout << "Device name: " << dev.get_info() << std::endl; + std::cout << "Vendor: " << dev.get_info() << std::endl; + std::cout << "Driver version: " << dev.get_info() << std::endl; + std::cout << "Device version: " << dev.get_info() << std::endl; + + std::cout << std::endl; + std::cout << "Device type: "; + if (dev.is_cpu()) std::cout << "CPU"; + else if (dev.is_gpu()) std::cout << "GPU"; + else if (dev.is_accelerator()) std::cout << "Accelerator"; + else std::cout << "Unknown"; + std::cout << std::endl; + + std::cout << std::endl; + std::cout << "Max compute units: " << dev.get_info() << std::endl; + std::cout << "Max work group size: " << dev.get_info() << std::endl; + std::cout << "Max work item dimensions: " << dev.get_info() << std::endl; + + auto max_work_item_sizes = dev.get_info>(); + std::cout << "Max work item sizes: [" + << max_work_item_sizes[0] << ", " + << max_work_item_sizes[1] << ", " + << max_work_item_sizes[2] << "]" << std::endl; + + std::cout << std::endl; + std::cout << "Global mem size: " + << (dev.get_info() / (1024*1024)) << " MB" << std::endl; + std::cout << "Local mem size: " + << (dev.get_info() / 1024) << " KB" << std::endl; + std::cout << "Max mem alloc size: " + << (dev.get_info() / (1024*1024)) << " MB" << std::endl; + + std::cout << std::endl; + std::cout << "Supports USM device: " + << (dev.has(aspect::usm_device_allocations) ? "YES" : "NO") << std::endl; + std::cout << "Supports USM host: " + << (dev.has(aspect::usm_host_allocations) ? "YES" : "NO") << std::endl; + std::cout << "Supports USM shared: " + << (dev.has(aspect::usm_shared_allocations) ? "YES" : "NO") << std::endl; + + std::cout << std::endl; + std::cout << "========================================" << std::endl; + std::cout << std::endl; +} + +// Kernel with backward dimension writes +template +class NonzeroIndexKernel; + +template +sycl::event extract_nonzero_indices( + queue &q, + size_t n_elems, + size_t nz_count, + int ndim, + const cumsumT* cumsum_data, + indexT* indices_data, + const size_t* shape +) +{ + constexpr size_t lws = 256; + const size_t n_groups = (n_elems + lws - 1) / lws; + + return q.submit([&](handler &cgh) { + local_accessor local_cumsum(lws + 1, cgh); + + cgh.parallel_for>( + nd_range<1>(n_groups * lws, lws), + [=](nd_item<1> ndit) { + const size_t gid = ndit.get_global_id(0); + const size_t lid = ndit.get_local_id(0); + const size_t group_id = ndit.get_group(0); + const size_t group_start = group_id * lws; + + // Load cumsum with halo + if (lid == 0) { + local_cumsum[0] = (group_start == 0) ? 0 : cumsum_data[group_start - 1]; + } + if (group_start + lid < n_elems) { + local_cumsum[lid + 1] = cumsum_data[group_start + lid]; + } + + group_barrier(ndit.get_group()); + + if (gid < n_elems) { + bool is_nonzero = (local_cumsum[lid + 1] != local_cumsum[lid]); + + if (is_nonzero) { + cumsumT output_pos = local_cumsum[lid + 1] - 1; + size_t flat_idx = gid; + + for (int dim = ndim - 1; dim >= 0; dim--) { + indices_data[output_pos * ndim + dim] = flat_idx % shape[dim]; + flat_idx /= shape[dim]; + } + } + } + } + ); + }); +} + +int main() { + queue q; + int64_t *cumsum_device = nullptr; + size_t *indices_device = nullptr; + size_t *shape_device = nullptr; + size_t *indices_host = nullptr; + + try { + q = queue(default_selector_v); + + auto device = q.get_device(); + print_device_info(device); + + std::cout << "========================================" << std::endl; + std::cout << "TEST CONFIGURATION" << std::endl; + std::cout << "========================================" << std::endl; + std::cout << std::endl; + + // Test parameters + const size_t n_elems = 6; + const int ndim = 2; + const size_t nz_count = 3; + const std::vector shape = {2, 3}; + + std::cout << "Input array (flat): [1, 0, 0, 4, 0, 6]" << std::endl; + std::cout << "Input array (2D): [[1, 0, 0]," << std::endl; + std::cout << " [4, 0, 6]]" << std::endl; + std::cout << "Shape: [" << shape[0] << ", " << shape[1] << "]" << std::endl; + std::cout << std::endl; + + std::cout << "Cumsum (precomputed): [1, 1, 1, 2, 2, 3]" << std::endl; + std::cout << "Nonzero elements: 3" << std::endl; + std::cout << "Nonzero positions:" << std::endl; + std::cout << " gid=0 → output[0] → row=0, col=0" << std::endl; + std::cout << " gid=3 → output[1] → row=1, col=0" << std::endl; + std::cout << " gid=5 → output[2] → row=1, col=2" << std::endl; + std::cout << std::endl; + + std::cout << "Kernel configuration:" << std::endl; + std::cout << " Work group size: 256" << std::endl; + std::cout << " Number of groups: 1" << std::endl; + std::cout << " Total work items: 256" << std::endl; + std::cout << " Active work items: 6 (processing 6 elements)" << std::endl; + std::cout << " Local memory: (256 + 1) * 8 bytes = 2056 bytes" << std::endl; + std::cout << std::endl; + + std::cout << "========================================" << std::endl; + std::cout << std::endl; + + // Hardcoded cumsum values for input [[1, 0, 0], [4, 0, 6]] + int64_t cumsum_values[] = {1, 1, 1, 2, 2, 3}; + + // Allocate device memory + cumsum_device = malloc_device(n_elems, q); + indices_device = malloc_device(nz_count * ndim, q); + shape_device = malloc_device(ndim, q); + + if (!cumsum_device || !indices_device || !shape_device) { + throw std::runtime_error("Device allocation failed"); + } + + // Copy data to device + q.copy(cumsum_values, cumsum_device, n_elems).wait(); + q.copy(shape.data(), shape_device, ndim).wait(); + + std::cout << "Running kernel..." << std::endl; + std::cout << "(writes dim 1 first, then dim 0)" << std::endl; + std::cout << std::endl; + + // Run the kernel + auto kernel_ev = extract_nonzero_indices( + q, n_elems, nz_count, ndim, + cumsum_device, indices_device, shape_device + ); + kernel_ev.wait(); + + // Read results + indices_host = malloc_host(nz_count * ndim, q); + if (!indices_host) { + throw std::runtime_error("Host allocation failed"); + } + q.copy(indices_device, indices_host, nz_count * ndim).wait(); + + std::cout << "========================================" << std::endl; + std::cout << "RESULTS" << std::endl; + std::cout << "========================================" << std::endl; + std::cout << std::endl; + + // Print raw packed output + std::cout << "Raw packed output: ["; + for (size_t i = 0; i < nz_count * ndim; i++) { + std::cout << indices_host[i]; + if (i < nz_count * ndim - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + std::cout << "Expected output: [0, 0, 1, 0, 1, 2]" << std::endl; + std::cout << "Format: [row0, col0, row1, col1, row2, col2]" << std::endl; + std::cout << std::endl; + + // Unpack + std::vector rows(nz_count), cols(nz_count); + for (size_t i = 0; i < nz_count; i++) { + rows[i] = indices_host[i * ndim + 0]; + cols[i] = indices_host[i * ndim + 1]; + } + + std::cout << "Row indices: ["; + for (auto v : rows) std::cout << v << " "; + std::cout << "]" << std::endl; + std::cout << "Expected rows: [0 1 1]" << std::endl; + std::cout << std::endl; + + std::cout << "Col indices: ["; + for (auto v : cols) std::cout << v << " "; + std::cout << "]" << std::endl; + std::cout << "Expected cols: [0 0 2]" << std::endl; + std::cout << std::endl; + + // Verify + std::vector expected_rows = {0, 1, 1}; + std::vector expected_cols = {0, 0, 2}; + bool correct = (rows == expected_rows) && (cols == expected_cols); + + std::cout << "========================================" << std::endl; + if (correct) { + std::cout << "✓ Test PASSED!" << std::endl; + return 0; + } else { + std::cout << "✗ Test FAILED!" << std::endl; + std::cout << std::endl; + std::cout << "Analysis:" << std::endl; + + // Detailed analysis + bool rows_match = (rows == expected_rows); + bool cols_match = (cols == expected_cols); + + if (!rows_match) { + std::cout << " - Row indices are WRONG" << std::endl; + std::cout << " Expected: [0 1 1]" << std::endl; + std::cout << " Got: ["; + for (auto v : rows) std::cout << v << " "; + std::cout << "]" << std::endl; + } else { + std::cout << " - Row indices are correct" << std::endl; + } + + if (!cols_match) { + std::cout << " - Column indices are WRONG" << std::endl; + std::cout << " Expected: [0 0 2]" << std::endl; + std::cout << " Got: ["; + for (auto v : cols) std::cout << v << " "; + std::cout << "]" << std::endl; + } else { + std::cout << " - Column indices are correct" << std::endl; + } + + std::cout << std::endl; + + // Cleanup + if (cumsum_device) free(cumsum_device, q); + if (indices_device) free(indices_device, q); + if (shape_device) free(shape_device, q); + if (indices_host) free(indices_host, q); + + return 1; + } + + // Cleanup + if (cumsum_device) free(cumsum_device, q); + if (indices_device) free(indices_device, q); + if (shape_device) free(shape_device, q); + if (indices_host) free(indices_host, q); + + return 0; + + } catch (exception const& e) { + std::cerr << std::endl; + std::cerr << "========================================" << std::endl; + std::cerr << "SYCL EXCEPTION" << std::endl; + std::cerr << "========================================" << std::endl; + std::cerr << e.what() << std::endl; + + // Cleanup on error + if (cumsum_device) free(cumsum_device, q); + if (indices_device) free(indices_device, q); + if (shape_device) free(shape_device, q); + if (indices_host) free(indices_host, q); + + return 1; + } catch (std::exception const& e) { + std::cerr << std::endl; + std::cerr << "========================================" << std::endl; + std::cerr << "STANDARD EXCEPTION" << std::endl; + std::cerr << "========================================" << std::endl; + std::cerr << e.what() << std::endl; + + // Note: Can't cleanup here as we don't have queue reference + return 1; + } +}