diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8aaae542ec5..bf659a351a57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [0.20.0] - MM/DD/2026
 
+This release introduces a major architectural change: the Array API-compliant tensor implementation has been migrated from `dpctl.tensor` into `dpnp.tensor`, simplifying maintenance, reducing cross-project dependencies, and allows the tensor implementation to evolve within `dpnp`.
 This release changes the license from `BSD-2-Clause` to `BSD-3-Clause`.
 This release achieves `dpnp` compatibility with Python 3.14 and enables distributing `dpnp` packages with the latest Python version.
 Also, that release drops support for Python 3.9, making Python 3.10 the minimum required version.
@@ -28,6 +29,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
 * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
 * Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815)
+* Migrated tensor implementation from `dpctl.tensor` into `dpnp.tensor`, making `dpnp` the primary owner of the Array API-compliant tensor layer [#2856](https://github.com/IntelPython/dpnp/pull/2856)
 
 ### Changed
 
@@ -84,6 +86,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814)
 * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828)
 * Ensured device aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835)
+* Fixed `dpnp.tensor.round` to use device-aware output dtype for boolean input [#2851](https://github.com/IntelPython/dpnp/pull/2851)
 
 ### Security
 
diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh
index 9d280274c784..907ebb68088f 100755
--- a/conda-recipe/build.sh
+++ b/conda-recipe/build.sh
@@ -1,5 +1,39 @@
 #!/bin/bash
 
+# Test reproducer:
+echo "building ..."
+icpx -fsycl --gcc-install-dir=$BUILD_PREFIX/lib/gcc/x86_64-conda-linux-gnu/14.3.0 --sysroot=$BUILD_PREFIX/x86_64-conda-linux-gnu/sysroot test_minimal.cpp -o test_minimal
+echo "build is completed, run now ..."
+./test_minimal
+echo "run is done"
+
+echo "create tmp folder: $SRC_DIR/tmp"
+mkdir -p $SRC_DIR/tmp
+echo "run with dump enabled ..."
+export SYCL_CACHE_DISABLE=1
+export IGC_ShaderDumpEnable=1
+export IGC_ShaderDumpEnableAll=1
+export IGC_DumpToCustomDir=$SRC_DIR/tmp/
+./test_minimal
+
+echo "waiting for .asm files..."
+timeout=5
+while [ $timeout -gt 0 ]; do
+    if find $SRC_DIR/tmp -name "*.asm" -print -quit | grep -q .; then
+        echo "found .asm files"
+        break
+    fi
+    sleep 1
+    ((timeout--))
+done
+
+echo "list files..."
+ls -la $SRC_DIR/tmp
+echo "print dump:"
+find $SRC_DIR/tmp -name "*.asm"
+find $SRC_DIR/tmp -name "*.asm" | head -n 1 | xargs -r cat
+echo "test is complete"
+
 # This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix
 export LIBRARY_PATH="$LIBRARY_PATH:${BUILD_PREFIX}/lib"
 
diff --git a/test_minimal.cpp b/test_minimal.cpp
new file mode 100644
index 000000000000..efd41b54c5d2
--- /dev/null
+++ b/test_minimal.cpp
@@ -0,0 +1,321 @@
+// Minimal reproducer
+//
+// Build: icpx -fsycl --gcc-install-dir=$CONDA_PREFIX/lib/gcc/x86_64-conda-linux-gnu/14.3.0 --sysroot=$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot test_minimal.cpp -o test_minimal
+// Run: ./test_minimal
+
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+using namespace sycl;
+
+// Print detailed device information
+void print_device_info(const device& dev) {
+    std::cout << "========================================" << std::endl;
+    std::cout << "DEVICE INFORMATION" << std::endl;
+    std::cout << "========================================" << std::endl;
+    std::cout << std::endl;
+
+    std::cout << "Device name:              " << dev.get_info<info::device::name>() << std::endl;
+    std::cout << "Vendor:                   " << dev.get_info<info::device::vendor>() << std::endl;
+    std::cout << "Driver version:           " << dev.get_info<info::device::driver_version>() << std::endl;
+    std::cout << "Device version:           " << dev.get_info<info::device::version>() << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "Device type:              ";
+    if (dev.is_cpu()) std::cout << "CPU";
+    else if (dev.is_gpu()) std::cout << "GPU";
+    else if (dev.is_accelerator()) std::cout << "Accelerator";
+    else std::cout << "Unknown";
+    std::cout << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "Max compute units:        " << dev.get_info<info::device::max_compute_units>() << std::endl;
+    std::cout << "Max work group size:      " << dev.get_info<info::device::max_work_group_size>() << std::endl;
+    std::cout << "Max work item dimensions: " << dev.get_info<info::device::max_work_item_dimensions>() << std::endl;
+
+    auto max_work_item_sizes = dev.get_info<info::device::max_work_item_sizes<3>>();
+    std::cout << "Max work item sizes:      ["
+              << max_work_item_sizes[0] << ", "
+              << max_work_item_sizes[1] << ", "
+              << max_work_item_sizes[2] << "]" << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "Global mem size:          "
+              << (dev.get_info<info::device::global_mem_size>() / (1024*1024)) << " MB" << std::endl;
+    std::cout << "Local mem size:           "
+              << (dev.get_info<info::device::local_mem_size>() / 1024) << " KB" << std::endl;
+    std::cout << "Max mem alloc size:       "
+              << (dev.get_info<info::device::max_mem_alloc_size>() / (1024*1024)) << " MB" << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "Supports USM device:      "
+              << (dev.has(aspect::usm_device_allocations) ? "YES" : "NO") << std::endl;
+    std::cout << "Supports USM host:        "
+              << (dev.has(aspect::usm_host_allocations) ? "YES" : "NO") << std::endl;
+    std::cout << "Supports USM shared:      "
+              << (dev.has(aspect::usm_shared_allocations) ? "YES" : "NO") << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "========================================" << std::endl;
+    std::cout << std::endl;
+}
+
+// Kernel with backward dimension writes
+template <typename cumsumT, typename indexT>
+class NonzeroIndexKernel;
+
+template <typename cumsumT, typename indexT>
+sycl::event extract_nonzero_indices(
+    queue &q,
+    size_t n_elems,
+    size_t nz_count,
+    int ndim,
+    const cumsumT* cumsum_data,
+    indexT* indices_data,
+    const size_t* shape
+)
+{
+    constexpr size_t lws = 256;
+    const size_t n_groups = (n_elems + lws - 1) / lws;
+
+    return q.submit([&](handler &cgh) {
+        local_accessor<cumsumT, 1> local_cumsum(lws + 1, cgh);
+
+        cgh.parallel_for<NonzeroIndexKernel<cumsumT, indexT>>(
+            nd_range<1>(n_groups * lws, lws),
+            [=](nd_item<1> ndit) {
+                const size_t gid = ndit.get_global_id(0);
+                const size_t lid = ndit.get_local_id(0);
+                const size_t group_id = ndit.get_group(0);
+                const size_t group_start = group_id * lws;
+
+                // Load cumsum with halo
+                if (lid == 0) {
+                    local_cumsum[0] = (group_start == 0) ? 0 : cumsum_data[group_start - 1];
+                }
+                if (group_start + lid < n_elems) {
+                    local_cumsum[lid + 1] = cumsum_data[group_start + lid];
+                }
+
+                group_barrier(ndit.get_group());
+
+                if (gid < n_elems) {
+                    bool is_nonzero = (local_cumsum[lid + 1] != local_cumsum[lid]);
+
+                    if (is_nonzero) {
+                        cumsumT output_pos = local_cumsum[lid + 1] - 1;
+                        size_t flat_idx = gid;
+
+                        for (int dim = ndim - 1; dim >= 0; dim--) {
+                            indices_data[output_pos * ndim + dim] = flat_idx % shape[dim];
+                            flat_idx /= shape[dim];
+                        }
+                    }
+                }
+            }
+        );
+    });
+}
+
+int main() {
+    queue q;
+    int64_t *cumsum_device = nullptr;
+    size_t *indices_device = nullptr;
+    size_t *shape_device = nullptr;
+    size_t *indices_host = nullptr;
+
+    try {
+        q = queue(default_selector_v);
+
+        auto device = q.get_device();
+        print_device_info(device);
+
+        std::cout << "========================================" << std::endl;
+        std::cout << "TEST CONFIGURATION" << std::endl;
+        std::cout << "========================================" << std::endl;
+        std::cout << std::endl;
+
+        // Test parameters
+        const size_t n_elems = 6;
+        const int ndim = 2;
+        const size_t nz_count = 3;
+        const std::vector<size_t> shape = {2, 3};
+
+        std::cout << "Input array (flat):  [1, 0, 0, 4, 0, 6]" << std::endl;
+        std::cout << "Input array (2D):    [[1, 0, 0]," << std::endl;
+        std::cout << "                      [4, 0, 6]]" << std::endl;
+        std::cout << "Shape:               [" << shape[0] << ", " << shape[1] << "]" << std::endl;
+        std::cout << std::endl;
+
+        std::cout << "Cumsum (precomputed): [1, 1, 1, 2, 2, 3]" << std::endl;
+        std::cout << "Nonzero elements:     3" << std::endl;
+        std::cout << "Nonzero positions:" << std::endl;
+        std::cout << "  gid=0 → output[0] → row=0, col=0" << std::endl;
+        std::cout << "  gid=3 → output[1] → row=1, col=0" << std::endl;
+        std::cout << "  gid=5 → output[2] → row=1, col=2" << std::endl;
+        std::cout << std::endl;
+
+        std::cout << "Kernel configuration:" << std::endl;
+        std::cout << "  Work group size:     256" << std::endl;
+        std::cout << "  Number of groups:    1" << std::endl;
+        std::cout << "  Total work items:    256" << std::endl;
+        std::cout << "  Active work items:   6 (processing 6 elements)" << std::endl;
+        std::cout << "  Local memory:        (256 + 1) * 8 bytes = 2056 bytes" << std::endl;
+        std::cout << std::endl;
+
+        std::cout << "========================================" << std::endl;
+        std::cout << std::endl;
+
+        // Hardcoded cumsum values for input [[1, 0, 0], [4, 0, 6]]
+        int64_t cumsum_values[] = {1, 1, 1, 2, 2, 3};
+
+        // Allocate device memory
+        cumsum_device = malloc_device<int64_t>(n_elems, q);
+        indices_device = malloc_device<size_t>(nz_count * ndim, q);
+        shape_device = malloc_device<size_t>(ndim, q);
+
+        if (!cumsum_device || !indices_device || !shape_device) {
+            throw std::runtime_error("Device allocation failed");
+        }
+
+        // Copy data to device
+        q.copy<int64_t>(cumsum_values, cumsum_device, n_elems).wait();
+        q.copy<size_t>(shape.data(), shape_device, ndim).wait();
+
+        std::cout << "Running kernel..." << std::endl;
+        std::cout << "(writes dim 1 first, then dim 0)" << std::endl;
+        std::cout << std::endl;
+
+        // Run the kernel
+        auto kernel_ev = extract_nonzero_indices<int64_t, size_t>(
+            q, n_elems, nz_count, ndim,
+            cumsum_device, indices_device, shape_device
+        );
+        kernel_ev.wait();
+
+        // Read results
+        indices_host = malloc_host<size_t>(nz_count * ndim, q);
+        if (!indices_host) {
+            throw std::runtime_error("Host allocation failed");
+        }
+        q.copy<size_t>(indices_device, indices_host, nz_count * ndim).wait();
+
+        std::cout << "========================================" << std::endl;
+        std::cout << "RESULTS" << std::endl;
+        std::cout << "========================================" << std::endl;
+        std::cout << std::endl;
+
+        // Print raw packed output
+        std::cout << "Raw packed output: [";
+        for (size_t i = 0; i < nz_count * ndim; i++) {
+            std::cout << indices_host[i];
+            if (i < nz_count * ndim - 1) std::cout << ", ";
+        }
+        std::cout << "]" << std::endl;
+        std::cout << "Expected output:   [0, 0, 1, 0, 1, 2]" << std::endl;
+        std::cout << "Format:            [row0, col0, row1, col1, row2, col2]" << std::endl;
+        std::cout << std::endl;
+
+        // Unpack
+        std::vector<size_t> rows(nz_count), cols(nz_count);
+        for (size_t i = 0; i < nz_count; i++) {
+            rows[i] = indices_host[i * ndim + 0];
+            cols[i] = indices_host[i * ndim + 1];
+        }
+
+        std::cout << "Row indices:       [";
+        for (auto v : rows) std::cout << v << " ";
+        std::cout << "]" << std::endl;
+        std::cout << "Expected rows:     [0 1 1]" << std::endl;
+        std::cout << std::endl;
+
+        std::cout << "Col indices:       [";
+        for (auto v : cols) std::cout << v << " ";
+        std::cout << "]" << std::endl;
+        std::cout << "Expected cols:     [0 0 2]" << std::endl;
+        std::cout << std::endl;
+
+        // Verify
+        std::vector<size_t> expected_rows = {0, 1, 1};
+        std::vector<size_t> expected_cols = {0, 0, 2};
+        bool correct = (rows == expected_rows) && (cols == expected_cols);
+
+        std::cout << "========================================" << std::endl;
+        if (correct) {
+            std::cout << "✓ Test PASSED!" << std::endl;
+            return 0;
+        } else {
+            std::cout << "✗ Test FAILED!" << std::endl;
+            std::cout << std::endl;
+            std::cout << "Analysis:" << std::endl;
+
+            // Detailed analysis
+            bool rows_match = (rows == expected_rows);
+            bool cols_match = (cols == expected_cols);
+
+            if (!rows_match) {
+                std::cout << "  - Row indices are WRONG" << std::endl;
+                std::cout << "    Expected: [0 1 1]" << std::endl;
+                std::cout << "    Got:      [";
+                for (auto v : rows) std::cout << v << " ";
+                std::cout << "]" << std::endl;
+            } else {
+                std::cout << "  - Row indices are correct" << std::endl;
+            }
+
+            if (!cols_match) {
+                std::cout << "  - Column indices are WRONG" << std::endl;
+                std::cout << "    Expected: [0 0 2]" << std::endl;
+                std::cout << "    Got:      [";
+                for (auto v : cols) std::cout << v << " ";
+                std::cout << "]" << std::endl;
+            } else {
+                std::cout << "  - Column indices are correct" << std::endl;
+            }
+
+            std::cout << std::endl;
+
+            // Cleanup
+            if (cumsum_device) free(cumsum_device, q);
+            if (indices_device) free(indices_device, q);
+            if (shape_device) free(shape_device, q);
+            if (indices_host) free(indices_host, q);
+
+            return 1;
+        }
+
+        // Cleanup
+        if (cumsum_device) free(cumsum_device, q);
+        if (indices_device) free(indices_device, q);
+        if (shape_device) free(shape_device, q);
+        if (indices_host) free(indices_host, q);
+
+        return 0;
+
+    } catch (exception const& e) {
+        std::cerr << std::endl;
+        std::cerr << "========================================" << std::endl;
+        std::cerr << "SYCL EXCEPTION" << std::endl;
+        std::cerr << "========================================" << std::endl;
+        std::cerr << e.what() << std::endl;
+
+        // Cleanup on error
+        if (cumsum_device) free(cumsum_device, q);
+        if (indices_device) free(indices_device, q);
+        if (shape_device) free(shape_device, q);
+        if (indices_host) free(indices_host, q);
+
+        return 1;
+    } catch (std::exception const& e) {
+        std::cerr << std::endl;
+        std::cerr << "========================================" << std::endl;
+        std::cerr << "STANDARD EXCEPTION" << std::endl;
+        std::cerr << "========================================" << std::endl;
+        std::cerr << e.what() << std::endl;
+
+        // Note: Can't cleanup here as we don't have queue reference
+        return 1;
+    }
+}