diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c8a024c15..9ca0eda18f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC)
   set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
-set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)")
-set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch)
+set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch, Spatz)")
+set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch Spatz)
 
 if(platform STREQUAL MemPool)
   message(STATUS "Building for platform 'MemPool'")
@@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier)
   message(STATUS "Building for platform 'SoftHier'")
 elseif(platform STREQUAL Chimera)
   message(STATUS "Building for platform 'Chimera'")
+elseif(platform STREQUAL Spatz)
+  message(STATUS "Building for platform 'Spatz'")
 else()
   message(FATAL_ERROR "Invalid platform '${platform}' specified!")
 endif()
@@ -299,5 +301,33 @@ if(platform STREQUAL Chimera)
 
 endif()
 
+# Spatz: configure the LLVM toolchain, then build the Generic and Spatz kernel libraries.
+if(platform STREQUAL Spatz)
+  # An empty SPATZ_HOME is as unusable as an unset one: set() with an empty
+  # unquoted expansion would unset the variable instead of defining it.
+  if("$ENV{SPATZ_HOME}" STREQUAL "")
+    message(FATAL_ERROR "Environment variable SPATZ_HOME not set.")
+  endif()
+  set(SPATZ_HOME "$ENV{SPATZ_HOME}")
+
+  # The toolchain file must be selected before project() enables the C/ASM languages.
+  set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/toolchain_llvm.cmake")
+  include("${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/spatz.cmake")
+
+  project(deeploy LANGUAGES C ASM)
+
+  message(STATUS "============================= ${platform} Configuration ============================")
+  message(STATUS "[cMake  ]   ISA                      = " ${ISA})
+  message(STATUS "================================================================================")
+  message(STATUS "")
+
+  add_subdirectory(TargetLibraries/Generic)
+  add_subdirectory(TargetLibraries/Spatz)
+  target_include_directories(deeployspatz PUBLIC TargetLibraries/Generic/inc)
+
+  add_subdirectory(DeeployTest)
+  target_link_libraries(deeploylib INTERFACE deeploybasic deeployspatz)
+endif()
+
 
 print_simulation_config()
diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py
index 308b179aef..4b0ecfc258 100644
--- a/Deeploy/Targets/Generic/Bindings.py
+++ b/Deeploy/Targets/Generic/Bindings.py
@@ -19,12 +19,12 @@
     GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \
     MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \
     RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \
-    iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate
+    iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate, TopKTemplate
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \
     DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \
     LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \
     ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \
-    SoftmaxChecker, TransposeChecker
+    SoftmaxChecker, TransposeChecker, TopKChecker
 
 BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()])
 
@@ -327,3 +327,14 @@
         ConvTransposeTemplate.referenceTemplate,
         BasicTransformer) for type in FloatDataTypes
 ]
+
+BasicTopKBindings = [
+    NodeBinding(
+        TopKChecker(
+            [PointerClass(float32_t), PointerClass(int8_t)], # inputs: data, K
+            [PointerClass(float32_t), PointerClass(int8_t)] # outputs: values, indices; NOTE(review): int8_t indices cannot hold positions above 127 - confirm intended
+        ),
+        TopKTemplate.referenceTemplate,
+        BasicTransformer,
+    )
+]
diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py
index cc733937cc..21e22992e6 100644
--- a/Deeploy/Targets/Generic/Layers.py
+++ b/Deeploy/Targets/Generic/Layers.py
@@ -709,3 +709,9 @@ def computeOps(self):
             numPx = opRep['dim_im_out_x']
 
         return numPx * opsPerPx
+
+
+class TopKLayer(ONNXLayer):
+    """ONNX layer wrapper for TopK nodes; adds no behavior on top of ONNXLayer."""
+    def __init__(self, maps: List[NodeMapper]):
+        super().__init__(maps)
diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py
index ad787d9e4b..b58f875c17 100644
--- a/Deeploy/Targets/Generic/Parsers.py
+++ b/Deeploy/Targets/Generic/Parsers.py
@@ -982,7 +982,7 @@ def parseNode(self, node: gs.Node) -> (bool):
             return False
 
         indices_shape = node.inputs[1].shape
-        assert np.prod(indices_shape) == 1, f"Only indices of size 1 supported. Got indices of shape {indices_shape}"
+        self.operatorRepresentation['num_indices'] = int(np.prod(indices_shape))
 
         self.operatorRepresentation['axis'] = node.attrs['axis'] if 'axis' in node.attrs else 0
         return True
@@ -1002,10 +1002,17 @@ def parseNodeCtxt(self,
 
         axis = self.operatorRepresentation['axis']
         shape = ctxt.lookup(node.inputs[0].name).shape
-        self.operatorRepresentation['batch'] = np.prod(shape[:axis])
-        self.operatorRepresentation['batch_length'] = np.prod(shape[axis:])
-        self.operatorRepresentation['axis_length'] = np.prod(shape[axis + 1:])
-        self.operatorRepresentation['index'] = int(node.inputs[1].values.item())
+        self.operatorRepresentation['batch'] = int(np.prod(shape[:axis])) if axis > 0 else 1
+        self.operatorRepresentation['batch_length'] = int(np.prod(shape[axis:]))
+        self.operatorRepresentation['axis_length'] = int(np.prod(shape[axis + 1:])) if axis + 1 < len(shape) else 1
+
+        if self.operatorRepresentation['num_indices'] == 1:
+            try:
+                self.operatorRepresentation['index'] = int(node.inputs[1].values.item())
+            except AttributeError:
+                self.operatorRepresentation['index'] = f"{self.operatorRepresentation['indices']}[0]"
+        else:
+            self.operatorRepresentation['index'] = 0 # in this case is not used but is needed for mako template
 
         return ctxt, True
 
@@ -2886,3 +2893,28 @@ def parseNodeCtxt(self,
         self.operatorRepresentation['size'] = int(np.prod(data_in.shape))
 
         return ctxt, True
+
+class TopKParser(NodeParser):
+    """Parses ONNX TopK nodes (inputs: data, K; outputs: values, indices); largest-k only."""
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        # This parser never stores 'largest' in operatorRepresentation, so a smallest-k
+        # request (largest=0) is rejected here instead of silently yielding the k largest.
+        return (node.op == 'TopK' and len(node.inputs) == 2 and len(node.outputs) == 2
+                and node.attrs.get('largest', 1) == 1)
+
+    def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        """Store buffer names, the flattened input size and k in operatorRepresentation."""
+        data_in = ctxt.lookup(node.inputs[0].name)
+        k_in = ctxt.lookup(node.inputs[1].name)
+        values_out = ctxt.lookup(node.outputs[0].name)
+        indices_out = ctxt.lookup(node.outputs[1].name)
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_in_size'] = int(np.prod(data_in.shape))
+        self.operatorRepresentation['k_value'] = int(k_in.values[0])
+        self.operatorRepresentation['values_out'] = values_out.name
+        self.operatorRepresentation['indices_out'] = indices_out.name
+        return ctxt, True
diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py
index e05e897270..2e4601bdd4 100644
--- a/Deeploy/Targets/Generic/Platform.py
+++ b/Deeploy/Targets/Generic/Platform.py
@@ -14,19 +14,19 @@
     BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \
     BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \
     BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \
-    DummyBinding
+    DummyBinding, BasicTopKBindings
 from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \
     ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \
     LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \
     ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \
-    SoftmaxLayer, SqrtLayer, TransposeLayer
+    SoftmaxLayer, SqrtLayer, TransposeLayer, TopKLayer
 from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \
     DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \
     GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \
     IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \
     Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \
     RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \
-    TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser
+    TransposeParser, TopKParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \
     ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \
@@ -67,6 +67,7 @@
 SoftmaxMapper = NodeMapper(SoftmaxParser(), BasicSoftmaxBindings)
 iSoftmaxMapper = NodeMapper(iSoftmaxParser(), BasicSoftmaxBindings)
 TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings)
+TopKMapper = NodeMapper(TopKParser(), BasicTopKBindings)
 UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings)
 QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings)
 DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings)
@@ -113,6 +114,7 @@
     'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]),
     'Squeeze': ReshapeLayer([UnsqueezeMapper]),
     'Transpose': TransposeLayer([TransposeMapper]),
+    'TopK': TopKLayer([TopKMapper]),
     'Unsqueeze': ReshapeLayer([UnsqueezeMapper]),
     'Slice': SliceLayer([SliceMapper]),
     'Quant': QuantLayer([QuantMapper]),
diff --git a/Deeploy/Targets/Generic/Templates/GatherTemplate.py b/Deeploy/Targets/Generic/Templates/GatherTemplate.py
index dd5e534fa4..171fbab779 100644
--- a/Deeploy/Targets/Generic/Templates/GatherTemplate.py
+++ b/Deeploy/Targets/Generic/Templates/GatherTemplate.py
@@ -10,8 +10,18 @@
 width = int(data_in_type.referencedType.typeWidth/8)
 %>
 BEGIN_SINGLE_CORE
+% if num_indices == 1:
 for (uint32_t i=0; i<${batch}; ++i) {
     memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width});
 }
+% else:
+for (uint32_t i=0; i<${batch}; ++i) {
+    for (uint32_t j=0; j<${num_indices}; ++j) {
+        memcpy(${data_out} + i * (${num_indices} * ${axis_length}) + j * ${axis_length},
+               ${data_in} + i * ${batch_length} + ${indices}[j] * ${axis_length},
+               ${axis_length} * ${width});
+    }
+}
+% endif
 END_SINGLE_CORE
 """)
diff --git a/Deeploy/Targets/Generic/Templates/TopKTemplate.py b/Deeploy/Targets/Generic/Templates/TopKTemplate.py
new file mode 100644
index 0000000000..3f9b6474fa
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/TopKTemplate.py
@@ -0,0 +1,40 @@
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+referenceTemplate = NodeTemplate("""
+// TopK (Name: ${nodeName}, Op: ${nodeOp})
+BEGIN_SINGLE_CORE
+// Find the top ${k_value} values and their indices
+// Assumes 1D input for simplicity
+typedef struct {
+	${data_in_type.referencedType.typeName} value;
+	uint32_t index;
+} topk_pair_t;
+
+topk_pair_t pairs[${data_in_size}];
+for (uint32_t i = 0; i < ${data_in_size}; ++i) {
+	pairs[i].value = ((${data_in_type.referencedType.typeName}*)${data_in})[i];
+	pairs[i].index = i;
+}
+// Simple selection sort for top-k
+for (uint32_t i = 0; i < ${k_value}; ++i) {
+	uint32_t max_idx = i;
+	for (uint32_t j = i + 1; j < ${data_in_size}; ++j) {
+		if (pairs[j].value > pairs[max_idx].value) {
+			max_idx = j;
+		}
+	}
+	// Swap
+	if (max_idx != i) {
+		topk_pair_t tmp = pairs[i];
+		pairs[i] = pairs[max_idx];
+		pairs[max_idx] = tmp;
+	}
+	// Write output
+	((${values_out_type.referencedType.typeName}*)${values_out})[i] = pairs[i].value;
+	((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = pairs[i].index;
+}
+END_SINGLE_CORE
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py
index c2c8d436f8..5d363206f8 100644
--- a/Deeploy/Targets/Generic/TypeCheckers.py
+++ b/Deeploy/Targets/Generic/TypeCheckers.py
@@ -610,3 +610,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer],
     def _inferSignedness(self, inputs: List[VariableBuffer],
                          operatorRepresentation: OperatorRepresentation) -> List[bool]:
         return [True]
+
+# TopKChecker: sign-propagation rules for the two TopK outputs (values, indices)
+class TopKChecker(SignPropTypeChecker):
+    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
+        super().__init__(input_types, output_types)
+
+    def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]:
+        """Values keep the data input's level count; the indices output is reported with 1 level."""
+        data_in = inputs[0]
+        return [data_in.nLevels, 1]
+
+    def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]:
+        data_in = inputs[0]  # values inherit its signedness; indices are reported as unsigned
+        return [data_in._signed, False]
\ No newline at end of file
diff --git a/Deeploy/Targets/Spatz/Bindings.py b/Deeploy/Targets/Spatz/Bindings.py
new file mode 100644
index 0000000000..b1456486d0
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Bindings.py
@@ -0,0 +1,97 @@
+from functools import partial
+
+from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
+from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
+    MemoryManagementGeneration
+from Deeploy.Targets.Spatz.CodeTransformationPasses.Benchmarking import SpatzBenchmarkInnerPass, SpatzBenchmarkOuterPass
+
+from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t
+from Deeploy.Targets.Generic.TypeCheckers import GatherChecker, MatMulChecker, TopKChecker, SoftmaxChecker
+
+from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration
+from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterTiling import SnitchClusterTiling
+from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import SnitchSynchCoresPass
+from Deeploy.Targets.Spatz.DMA.SpatzDma import SpatzDma
+from Deeploy.Targets.Spatz.Templates import GatherTemplate, MatMulTemplate as SpatzMatMulTemplate, TopKTemplate, SoftmaxTemplate
+from Deeploy.Targets.Generic.Templates import MatMulTemplate, FloatMatMulTemplate
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \
+    TilingVariableReplacementUpdate
+
+TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure")
+MemoryAwareFunctionCallClosure = partial(MemoryAwareClosureGeneration,
+                                         closureSuffix = "_closure",
+                                         startRegion = "L3",
+                                         endRegion = "L1")
+
+BasicTransformer = CodeTransformation(
+    [ArgumentStructGeneration(),
+    MemoryManagementGeneration(),
+    FutureGeneration()])
+
+TiledTransformer = CodeTransformation([
+    TilingVariableReplacement("L1"),
+    TilingCallClosure(writeback = False),
+    SnitchSynchCoresPass(), # snrt_cluster_hw_barrier()
+    # SpatzBenchmarkInnerPass(), # <- attention: increases runtime and benchmarks only when tiling loop has one iteration
+    TilingVariableReplacementUpdate("L1"),
+    SnitchClusterTiling("L3", "L1", SpatzDma()),
+    # SpatzBenchmarkOuterPass(), # <- attention: increases runtime and benchmarks only when tiling loop has one iteration
+    ArgumentStructGeneration(),
+    MemoryManagementGeneration("L1"),
+    MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
+    MemoryManagementGeneration("L3"),
+    MemoryManagementGeneration(),
+])
+
+# Gather: float32 data with any integer index type, bound to the tiled transformer.
+SpatzGatherBindings = [
+    NodeBinding(
+        GatherChecker(
+            [PointerClass(float32_t), PointerClass(index_type)],
+            [PointerClass(float32_t)],
+        ),
+        GatherTemplate.dynamicDMAtemplate,
+        TiledTransformer,
+    ) for index_type in IntegerDataTypes]
+
+# with tiled transformer
+SpatzMatMulBindings = [
+    NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
+                SpatzMatMulTemplate.spatzSIMatMulTemplate, TiledTransformer),
+    NodeBinding(
+        MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+        SpatzMatMulTemplate.spatzFloatMatMulTemplate, TiledTransformer)
+]
+
+# without tiled transformer
+'''
+SpatzMatMulBindings = [
+    NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
+                SpatzMatMulTemplate.spatzSIMatMulTemplate, BasicTransformer),
+    NodeBinding(
+        MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+        SpatzMatMulTemplate.spatzFloatMatMulTemplate, BasicTransformer)
+]
+'''
+
+SpatzTopKBindings = [
+    NodeBinding(
+        TopKChecker(
+            [PointerClass(float32_t), PointerClass(int32_t)], # inputs
+            [PointerClass(float32_t), PointerClass(int32_t)] # outputs
+        ),
+        TopKTemplate.minHeapTemplate,
+        TiledTransformer,
+    )
+]
+
+
+SpatzSoftmaxBindings = [
+    NodeBinding(
+        SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+        SoftmaxTemplate.floatTilingTemplate,
+        TiledTransformer
+    )
+]
\ No newline at end of file
diff --git a/Deeploy/Targets/Spatz/CodeTransformationPasses/Benchmarking.py b/Deeploy/Targets/Spatz/CodeTransformationPasses/Benchmarking.py
new file mode 100644
index 0000000000..0caa24f1b5
--- /dev/null
+++ b/Deeploy/Targets/Spatz/CodeTransformationPasses/Benchmarking.py
@@ -0,0 +1,23 @@
+from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, NodeTemplate, CodeSnippet, _NoVerbosity
+
+
+class SpatzBenchmarkInnerPass(CodeTransformationPass):
+    """Inserts a ``tsop`` cycle stamp at snippet index 1 and appends a ``teop`` stamp to the block."""
+    def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity):
+        for key, directive in (("include_benchmark", "#include <benchmark.h>\n"), ("include_printf", "#include \"printf.h\"\n")):
+            if key not in ctxt.globalObjects:
+                ctxt.hoistGlobalDefinition(key, directive)
+        startStamp = CodeSnippet(NodeTemplate("""  tsop = benchmark_get_cycle();\n"""), {})
+        stopStamp = CodeSnippet(NodeTemplate("""  teop = benchmark_get_cycle();\n"""), {})
+        executionBlock.codeSnippets.insert(1, startStamp)
+        executionBlock.codeSnippets.append(stopStamp)
+        return ctxt, executionBlock
+
+class SpatzBenchmarkOuterPass(CodeTransformationPass):
+    def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity):
+        """Declare the cycle counters left of the block; print the data_in/op/data_out/total deltas right of it."""
+        prologue = NodeTemplate("""  uint32_t t0, tsop, teop, te;\n  t0 = benchmark_get_cycle();\n""")
+        epilogue = NodeTemplate(f"""te = benchmark_get_cycle();if (snrt_is_dm_core()) {{printf(\"Benchmark of {name}:\\n\");\nprintf(\"data_in=%d; op=%d; data_out=%d; total=%d\\n\\n\", tsop-t0, teop-tsop, te-teop, te-t0); }}\nsnrt_cluster_hw_barrier();""")
+        executionBlock.addLeft(prologue, {})
+        executionBlock.addRight(epilogue, {})
+        return ctxt, executionBlock
diff --git a/Deeploy/Targets/Spatz/DMA/SpatzDma.py b/Deeploy/Targets/Spatz/DMA/SpatzDma.py
new file mode 100644
index 0000000000..e13df3aaa1
--- /dev/null
+++ b/Deeploy/Targets/Spatz/DMA/SpatzDma.py
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
+from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy
+
+
+class SnitchBarrierFuture(Future):
+    _initTemplate = NodeTemplate("")
+    _deinitTemplate = NodeTemplate("")
+    _allocTemplate = NodeTemplate("")
+    _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait_all();")
+
+
+class SnitchFuture(Future):
+    _initTemplate = NodeTemplate("snrt_dma_txid_t ${name} = (snrt_dma_txid_t) -1;")
+
+    _deinitTemplate = NodeTemplate("")
+
+    _allocTemplate = NodeTemplate("")
+
+    _waitTemplate = NodeTemplate(#remove if condition -1
+        "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait_all();")
+        # "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait(${name});")
+
+
+class SpatzDma(AsyncDma):
+    """AsyncDma whose only transfer template (key 2) calls snrt_dma_start_2d under snrt_is_dm_core()."""
+    _transferTemplates = {
+        2:
+            NodeTemplate("""
+            if (snrt_is_dm_core()) {
+                ${future} = snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat});
+            }
+            """),
+    }
+    _waitingStrategy = PerTensorWaitingStrategy(SnitchFuture)
+
+    def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None:
+        super().__init__(transferTemplates)
+
+    def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer,
+                      shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...],
+                      direction: DmaDirection) -> None:
+        super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction)
+        innermostContiguous = strideLoc[1] == 1 and strideExt[1] == 1
+        assert innermostContiguous, "Supports only contigous transfers in the innermost dimension"
+
+    def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...],
+                       strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection,
+                       future: Future) -> OperatorRepresentation:
+        # ExternalToLocal transfers write into the local buffer; any other direction swaps the roles.
+        if direction == "ExternalToLocal":
+            dst, dstStride, src, srcStride = localBuffer, strideLoc[0], externalBuffer, strideExt[0]
+        else:
+            dst, dstStride, src, srcStride = externalBuffer, strideExt[0], localBuffer, strideLoc[0]
+        return {
+            "dest": dst.name, "src": src.name, "repeat": shape[0], "size": shape[1],
+            "stride_dest": dstStride, "stride_src": srcStride, "future": future.name,
+        }
diff --git a/Deeploy/Targets/Spatz/Deployer.py b/Deeploy/Targets/Spatz/Deployer.py
new file mode 100644
index 0000000000..4d99b61f54
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Deployer.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Dict, Type
+
+import onnx_graphsurgeon as gs
+
+from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
+from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass
+from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
+    NCHWtoNHWCPass, TransposeMatmulInputsPass
+from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass
+
+
+class SpatzDeployer(SignPropDeployer):
+    """Network deployer for the Spatz platform; adds nothing on top of SignPropDeployer."""
+
+    def __init__(self,
+                 graph: gs.Graph,
+                 deploymentPlatform: DeploymentPlatform,
+                 inputTypes: Dict[str, Type[Pointer]],
+                 loweringOptimizer: TopologyOptimizer,
+                 scheduler: Callable = lambda x: x,
+                 name: str = 'DeeployNetwork',
+                 default_channels_first = False,
+                 deeployStateDir: str = "DeeployStateDir",
+                 inputOffsets: Dict[str, int] = {}):
+        """Forward every argument, including inputOffsets, to SignPropDeployer."""
+        # inputOffsets used to be accepted here but silently dropped. It is passed on as
+        # a copy so that neither the shared default dict nor the caller's dict can be
+        # mutated by the base class (assumes the base takes an inputOffsets keyword).
+        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
+                         default_channels_first = default_channels_first,
+                         deeployStateDir = deeployStateDir,
+                         inputOffsets = dict(inputOffsets))
diff --git a/Deeploy/Targets/Spatz/Platform.py b/Deeploy/Targets/Spatz/Platform.py
new file mode 100644
index 0000000000..5150f0928a
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Platform.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+import numpy as np
+
+from Deeploy.DeeployTypes import VariableBuffer, TransientBuffer, ConstantBuffer, StructBuffer, \
+    NodeMapper, NodeTemplate, TopologyOptimizer, DeploymentEngine, DeploymentPlatform
+
+from Deeploy.Targets.Spatz.Templates import AllocateTemplate as SpatzAllocateTemplate
+from Deeploy.Targets.Spatz.Templates import FreeTemplate as SpatzFreeTemplate
+
+from Deeploy.Targets.Spatz.Tiler import SpatzMatMulTilingBindings, SpatzGatherTilingBindings, SpatzTopKTilingBindings, SpatzSoftmaxTilingBindings
+from Deeploy.Targets.Generic.Layers import GEMMLayer, SoftmaxLayer, TopKLayer, GatherLayer
+from Deeploy.Targets.Generic.Parsers import MatMulParser, SoftmaxParser, TopKParser, GatherParser
+
+MatMulMapper = NodeMapper(MatMulParser(), SpatzMatMulTilingBindings)
+SoftmaxMapper = NodeMapper(SoftmaxParser(), SpatzSoftmaxTilingBindings)
+TopKMapper = NodeMapper(TopKParser(), SpatzTopKTilingBindings)
+GatherMapper = NodeMapper(GatherParser(), SpatzGatherTilingBindings)
+
+SpatzMapping = {
+    'MatMul': GEMMLayer([MatMulMapper]),
+    'Softmax': SoftmaxLayer([SoftmaxMapper]),
+    'TopK': TopKLayer([TopKMapper]),
+    'Gather': GatherLayer([GatherMapper]),
+}
+
+
+class SpatzVariableBuffer(VariableBuffer):
+    """Variable buffer rendered with the Spatz init/allocate/free templates."""
+
+    initTemplate = SpatzAllocateTemplate.spatzInitTemplate
+    allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate
+    deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate
+
+    def _bufferRepresentation(self):
+        """Build the template arguments: type, name, element count and memory level.
+
+        ``_memoryLevel`` is None when the attribute has not been set on this buffer.
+        """
+        representation = {}
+        representation["type"] = self._instance
+        representation["name"] = self.name
+        representation["size"] = int(np.prod(self.shape))
+        representation["_memoryLevel"] = getattr(self, "_memoryLevel", None)
+        return representation
+
+class SpatzTransientBuffer(TransientBuffer):
+    """Transient buffer rendered with the Spatz init/allocate/free templates."""
+
+    initTemplate = SpatzAllocateTemplate.spatzInitTemplate
+    allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate
+    deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate
+
+    def _bufferRepresentation(self):
+        """Template arguments for this buffer: type, name, size and memory level.
+        The memory level is None when no ``_memoryLevel`` attribute is set.
+        """
+        memoryLevel = getattr(self, "_memoryLevel", None)
+        return {
+            "type": self._type,
+            "name": self.name,
+            "size": self.size,
+            "_memoryLevel": memoryLevel,
+        }
+
+
+class SpatzConstantBuffer(ConstantBuffer):
+    """Constant buffer: Spatz global init template, empty allocate and free templates."""
+
+    initTemplate = SpatzAllocateTemplate.spatzGlobalInitTemplate
+    allocTemplate = NodeTemplate("")
+    deallocTemplate = NodeTemplate("")
+
+    def _bufferRepresentation(self):
+        """Extend the base class representation with the buffer's memory level.
+
+        ``_memoryLevel`` is None when the attribute has not been set on this buffer.
+        """
+        representation = super()._bufferRepresentation()
+        representation["_memoryLevel"] = getattr(self, "_memoryLevel", None)
+
+        return representation
+
+
+class SpatzStructBuffer(StructBuffer):
+    initTemplate = SpatzAllocateTemplate.spatzStructInitTemplate
+    allocTemplate = SpatzAllocateTemplate.spatzStructAllocateTemplate
+    deallocTemplate = NodeTemplate("")
+
+
+SpatzOptimizer = TopologyOptimizer([
+], name = "SpatzOptimizer")
+
+includeList = [
+    "snrt.h",
+    "DeeploySpatzMath.h",
+]
+
+
+class SpatzEngine(DeploymentEngine):
+    def __init__(self, name: str, Mapping = SpatzMapping, initCode = "", includeList = includeList) -> None:
+        super().__init__(name, Mapping, initCode, includeList)
+
+
+class SpatzPlatform(DeploymentPlatform):
+    """Deployment platform for Spatz; defaults to one SpatzEngine and the Spatz buffer classes."""
+    def __init__( self,
+        engines = [SpatzEngine("SpatzVectorProcessor")],
+        variableBuffer = SpatzVariableBuffer,
+        transientBuffer = SpatzTransientBuffer,
+        constantBuffer = SpatzConstantBuffer,
+        structBuffer = SpatzStructBuffer,
+        includeList: List[str] = includeList  # NOTE(review): never used in the body below - confirm it can be ignored
+    ):
+        super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)
+
diff --git a/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py
new file mode 100644
index 0000000000..0834283947
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+# Declaration of a runtime-allocated buffer (just a pointer; the memory is
+# obtained at runtime by the allocate template below).
+spatzInitTemplate = NodeTemplate("${type.typeName} ${name}; // variable buffer of size ${size}\n")
+
+# Runtime allocation: L1 -> TCDM (snrt_l1alloc), L3/None -> DRAM (snrt_l3alloc).
+spatzGenericAllocate = NodeTemplate("""
+% if _memoryLevel == "L1":
+${name} = (${type.typeName}) snrt_l1alloc(sizeof(${type.referencedType.typeName}) * ${size});\n
+% elif _memoryLevel == "L3" or _memoryLevel is None:
+${name} = (${type.typeName}) snrt_l3alloc(sizeof(${type.referencedType.typeName}) * ${size});\n
+% else:
+// COMPILER WARNING — unsupported memory level ${_memoryLevel}, defaulting to L3
+${name} = (${type.typeName}) snrt_l3alloc(${type.referencedType.typeWidth//8} * ${size});
+% endif
+""")
+
+# Constant buffers: emitted as static initialized arrays.
+spatzGlobalInitTemplate = NodeTemplate("static ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n")
+
+# Struct buffers.
+spatzStructInitTemplate = NodeTemplate("""
+static ${type.typeName} ${name};
+""")
+
+spatzStructAllocateTemplate = NodeTemplate("""
+    ${name} = (${structDict.typeName}) ${str(structDict)};
+""")
diff --git a/Deeploy/Targets/Spatz/Templates/FreeTemplate.py b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py
new file mode 100644
index 0000000000..f67cb3de38
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py
@@ -0,0 +1,5 @@
+from Deeploy.DeeployTypes import NodeTemplate
+
+# Both free templates are empty, so no deallocation code is emitted: snrt_l1alloc currently does not support freeing memory (spatz/sw/snRuntime/src/alloc.c).
+spatzLocalTemplate = NodeTemplate("")
+spatzGlobalTemplate = NodeTemplate("")
\ No newline at end of file
diff --git a/Deeploy/Targets/Spatz/Templates/GatherTemplate.py b/Deeploy/Targets/Spatz/Templates/GatherTemplate.py
new file mode 100644
index 0000000000..b8b2ec20d1
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Templates/GatherTemplate.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+dynamicDMAtemplate = NodeTemplate("""
+// Gather (Name: ${nodeName}, Op: ${nodeOp})
+// Dynamic DMA strategy (Spatz):
+// - indices already transferred to local memory by the tiling pass
+// - fetch selected rows directly from external data_in into local data_out
+<%
+width = int(data_in_type.referencedType.typeWidth/8)
+%>
+
+// Currently supported configuration: axis=0 and batch=1 (matches existing Spatz Gather tests)
+if ((${axis} != 0) || (${batch} != 1)) {
+    error();
+} else {
+    if (snrt_is_dm_core()) {
+        const size_t bytes_per_row = (size_t)${axis_length} * (size_t)${width}; // sizeof(${data_in_type.referencedType.typeName}) = ${width}
+        char *dst_base = (char *)${data_out};
+        const char *src_base = (const char *)${data_in};
+
+        for (size_t j = 0; j < (size_t)${num_indices}; ++j) {
+            const size_t dst_off = j * bytes_per_row;
+            const size_t src_off = (size_t)${indices}[j] * bytes_per_row;
+            snrt_dma_start_1d((void *)(dst_base + dst_off), (const void *)(src_base + src_off), bytes_per_row);
+        }
+
+        // Ensure all row DMAs complete before the tiling pass starts the output transfer.
+        snrt_dma_wait_all();
+    }
+}
+""")  # Gather kernel: one 1D DMA per index, issued only when snrt_is_dm_core() is true
\ No newline at end of file
diff --git a/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py
new file mode 100644
index 0000000000..b528ee3fb4
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _MatMulTemplate(NodeTemplate):
+    """MatMul template that adds zero-point offsets to the operator representation."""
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        """Set A_offset/B_offset/C_offset: 0 unless the buffer has `_signed` and `nLevels` and is unsigned."""
+        # Resolve all three buffers first, then start every offset at 0.
+        lhs, rhs, out = (ctxt.lookup(operatorRepresentation[key]) for key in ('A', 'B', 'data_out'))
+        for offsetKey in ('A_offset', 'B_offset', 'C_offset'):
+            operatorRepresentation[offsetKey] = 0
+
+        # Unsigned inputs are offset by +int(nLevels / 2), an unsigned output by -int(nLevels / 2).
+        for offsetKey, buf, negate in (('A_offset', lhs, False), ('B_offset', rhs, False), ('C_offset', out, True)):
+            if hasattr(buf, "_signed") and hasattr(buf, "nLevels"):
+                unsigned = buf._signed == 0
+                half = int(buf.nLevels / 2)
+                operatorRepresentation[offsetKey] = -unsigned * half if negate else unsigned * half
+
+        return ctxt, operatorRepresentation, []
+
+
+# Signed-integer MatMul: one kernel call per batch entry, advancing the A, B and output pointers after each call.
+spatzSIMatMulTemplate = _MatMulTemplate("""
+// MatMul (Name: ${nodeName}, Op: ${nodeOp})
+${A_type.typeName} ref_${data_out}_${A} = ${A};
+${B_type.typeName} ref_${data_out}_${B} = ${B};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for(uint32_t i=0;i<${batch};i++){
+    MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(
+        ref_${data_out}_${A},
+        ref_${data_out}_${B},
+        ref_${data_out}_${data_out},
+        ${M},
+        ${N},
+        ${O},
+        ${A_offset}, ${B_offset}, ${C_offset}
+    );
+
+    ref_${data_out}_${A} += ${M} * ${N};
+    ref_${data_out}_${B} += ${N} * ${O};
+    ref_${data_out}_${data_out} += ${M} * ${O};
+}
+""")
+
+# Float MatMul (currently used for single precision, fp32): a single kernel call when batch == 1,
+# otherwise one call per batch entry. It is also possible to add half and double precision.
+spatzFloatMatMulTemplate = NodeTemplate("""
+// Matmul (Name: ${nodeName}, Op: ${nodeOp})
+${A_type.typeName} ref_${data_out}_${A} = ${A};
+${B_type.typeName} ref_${data_out}_${B} = ${B};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+% if batch==1:
+Spatz_MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
+    ref_${data_out}_${A},
+    ref_${data_out}_${B},
+    ref_${data_out}_${data_out},
+    ${M},
+    ${N},
+    ${O}
+);
+
+% else:
+for(uint32_t i=0; i<${batch}; i++){
+    Spatz_MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
+        ref_${data_out}_${A},
+        ref_${data_out}_${B},
+        ref_${data_out}_${data_out},
+        ${M},
+        ${N},
+        ${O}
+    );
+
+    ref_${data_out}_${A} += ${M} * ${N};
+    ref_${data_out}_${B} += ${N} * ${O};
+    ref_${data_out}_${data_out} += ${M} * ${O};
+}
+% endif
+""")
diff --git a/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py b/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py
new file mode 100644
index 0000000000..2ddcc2c9b0
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Templates/SoftmaxTemplate.py
@@ -0,0 +1,8 @@
+from Deeploy.DeeployTypes import NodeTemplate
+
+# Only the floating-point Softmax template is defined in this file (no integerTilingTemplate yet).
+
+floatTilingTemplate = NodeTemplate("""
+// Softmax (Name: ${nodeName}, Op: ${nodeOp})
+Spatz_Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength});
+""")
diff --git a/Deeploy/Targets/Spatz/Templates/TopKTemplate.py b/Deeploy/Targets/Spatz/Templates/TopKTemplate.py
new file mode 100644
index 0000000000..7e7c836e62
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Templates/TopKTemplate.py
@@ -0,0 +1,49 @@
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+# Selection-sort TopK on snrt_l1alloc scratch copies; swap temporaries use the templated element types.
+selectionSortTemplate = NodeTemplate("""
+// TopK node: finds the top ${k_value} values and their indices
+// Assumes 1D input
+${data_in_type.referencedType.typeName} *values_tmp = snrt_l1alloc(sizeof(${data_in_type.referencedType.typeName})*${data_in_size});
+${indices_out_type.referencedType.typeName} *indices_tmp = snrt_l1alloc(sizeof(${indices_out_type.referencedType.typeName})*${data_in_size});
+
+for (uint32_t i = 0; i < ${data_in_size}; ++i) {
+	values_tmp[i] = ((${data_in_type.referencedType.typeName}*)${data_in})[i];
+	indices_tmp[i] = i;
+}
+// Simple selection sort for top-k
+for (uint32_t i = 0; i < ${k_value}; ++i) {
+	uint32_t max_idx = i;
+	for (uint32_t j = i + 1; j < ${data_in_size}; ++j) {
+        if (values_tmp[j] > values_tmp[max_idx]) {
+          max_idx = j;
+        }
+	}
+	// Swap
+	if (max_idx != i) {
+		${data_in_type.referencedType.typeName} tmp_val = values_tmp[i];
+		${indices_out_type.referencedType.typeName} tmp_idx = indices_tmp[i];
+		values_tmp[i] = values_tmp[max_idx];
+		indices_tmp[i] = indices_tmp[max_idx];
+		values_tmp[max_idx] = tmp_val;
+		indices_tmp[max_idx] = tmp_idx;
+	}
+	// Write output
+	((${values_out_type.referencedType.typeName}*)${values_out})[i] = values_tmp[i];
+	((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = indices_tmp[i];
+}
+""")
+
+# Min-heap variant: a single call to compute_topk_min_heap (original note: compute_topk_vector_instructions)
+minHeapTemplate = NodeTemplate("""
+compute_topk_min_heap(
+    ${k_value},
+    ${data_in_size},
+    ${data_in},
+    ${values_out},
+    ${indices_out}
+);
+
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py
new file mode 100644
index 0000000000..27a3c7ec3e
--- /dev/null
+++ b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+class GatherTileConstraint(TileConstraint):
+    """Tile constraint for the dynamic-DMA Gather: only `indices` and `data_out` are moved by the tiler."""
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Register the referenced tensors and pin all of them except `data_in` to their full shape.
+
+        Only string entries of `parseDict` that name a global or local buffer are
+        considered; transient buffers are skipped. `data_in` is added to the model
+        but gets no tile constraint because the tiling engine does not move it.
+        """
+        for key, value in parseDict.items():
+            if not isinstance(value, str):
+                continue
+            if not (ctxt.is_global(value) or ctxt.is_local(value)):
+                continue
+
+            _buffer = ctxt.lookup(value)
+            if isinstance(_buffer, TransientBuffer):
+                continue
+
+            tilerModel.addTensorDimToModel(ctxt, value)
+
+            # No tile constraint for data_in, because it is not moved by the tiling engine.
+            if key == 'data_in':
+                continue
+
+            for idx, shapeDim in enumerate(_buffer.shape):
+                tileDimVar = tilerModel.getTensorDimVar(tensorName = value, dimIdx = idx)
+                tilerModel.addConstraint(tileDimVar == shapeDim)
+
+        return tilerModel
+
+    @staticmethod
+    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+        """Return a copy of `parseDict`; no entry is replaced by a solver variable."""
+        return parseDict.copy()
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        """Build the tiling schedule and an empty variable replacement scheme.
+
+        Dynamic-DMA Gather policy:
+        - DMA only the indices into local memory
+        - do NOT DMA data_in into local memory; it stays in external memory and
+          the selected rows are fetched directly into the local output buffer
+        - DMA the output tile back to external memory
+        """
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        # data_in is deliberately absent here: only indices and data_out get base addresses.
+        addrNames = ['indices', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        # Every schedule step loads the complete indices tensor.
+        indicesShape = ctxt.lookup(operatorRepresentation['indices']).shape
+        indicesCube = HyperRectangle(offset = (0,) * len(indicesShape), dims = tuple(indicesShape))
+
+        # One step per output cube: load indices, run the kernel, store the output tile.
+        inputLoadSchedule = [{'indices': indicesCube} for _ in outputCubes]
+        outputLoadSchedule = [{'data_out': out} for out in outputCubes]
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        repScheme = VariableReplacementScheme({}, {})
+
+        return repScheme, schedule
diff --git a/Deeploy/Targets/Spatz/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/MatMulTileConstraint.py
new file mode 100644
index 0000000000..4211fed17d
--- /dev/null
+++ b/Deeploy/Targets/Spatz/TileConstraints/MatMulTileConstraint.py
@@ -0,0 +1,240 @@
+import math
+from typing import Dict, List, Tuple
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import int8_t, uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class MatMulTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        # ===== GET NECESSARY INFORMATION =====
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+        outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+
+        tensorsShapeLenA = len(bufferA.shape)
+        tensorsShapeLenB = len(bufferB.shape)
+        tensorsShapeLenOutput = len(outputBuffer.shape)
+
+        # ===== ADD I/O DIMS TO MODEL AS VARS =====
+        for _buffer in [bufferA, bufferB, outputBuffer]:
+            tilerModel.addTensorDimToModel(ctxt, _buffer.name)
+
+        # ===== EXTRACT TENSOR DIMS AS VARS =====
+        # *The transA and transB flags select which of the last two dims is the first/second matrix dim
+        #   A dims
+        AMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                        dimIdx = (tensorsShapeLenA - 2) + parseDict['transA'])
+        AMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                         dimIdx = (tensorsShapeLenA - 1) - parseDict['transA'])
+
+        #   B dims
+        BMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                        dimIdx = (tensorsShapeLenB - 2) + parseDict['transB'])
+        BMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                         dimIdx = (tensorsShapeLenB - 1) - parseDict['transB'])
+
+        #   Output dims
+        outputMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name,
+                                                             dimIdx = (tensorsShapeLenOutput - 2))
+        outputMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name,
+                                                              dimIdx = (tensorsShapeLenOutput - 1))
+
+        # ===== ADD CONSTRAINTS =====
+        #   Add batch constraints (only when A and B have identical batch shapes): tie output batch dims to A's and B's
+        if (bufferA.shape[:-2] == bufferB.shape[:-2]):
+            for idx in range(tensorsShapeLenA - 2):
+                tilerModel.addConstraint(
+                    tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx)
+                    == tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = tensorsShapeLenA - 3 - idx))
+
+            for idx in range(tensorsShapeLenB - 2):
+                tilerModel.addConstraint(
+                    tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx)
+                    == tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = tensorsShapeLenB - 3 - idx))
+
+        #   Add GEMM geometrical constraints
+        tilerModel.addConstraint(outputMatrixFirstDimVar == AMatrixFirstDimVar)
+        tilerModel.addConstraint(outputMatrixSecondDimVar == BMatrixSecondDimVar)
+
+        tilerModel.addConstraint(AMatrixSecondDimVar == BMatrixFirstDimVar)
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        # ===== GET NECESSARY INFORMATION =====
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+
+        # ===== EXTRACT TENSOR DIMS AS VARS =====
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = (len(bufferA.shape) - 1) - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                  dimIdx = (len(bufferB.shape) - 2) + parseDict['transB'])
+
+        # ===== ADD CONSTRAINTS =====
+        # VIC: Keep the shared dim N untiled; we don't want to deal with intermediate results between kernel calls
+        tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
+        tilerModel.addConstraint(BFirstDimVar == parseDict['N'])
+
+        # Spatz row-stride alignment: the kernel loads B rows / stores output rows at
+        # a stride of O elements. On this Spatz config every row base must be 64-bit
+        # aligned, otherwise a chained vector load corrupts the upper lanes. So force
+        # the O tile size to be a multiple of (8 / elemBytes) (fp32 -> 2). With an even
+        # original O the remainder tile is even too (even - even*k = even), so every
+        # tile's row stride stays 8-byte aligned. (Odd original O is unsupported: its
+        # remainder tile is odd and would be misaligned.)
+        outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+        elemBytes = outputBuffer._type.referencedType.typeWidth // 8
+        modulo = 8 // elemBytes
+        if modulo > 1:
+            outputODimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name,
+                                                       dimIdx = len(outputBuffer.shape) - 1)
+            tilerModel.addTileSizeDivisibleConstraint(parseDict, "O", outputODimVar, modulo)
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        # Get output cubes
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        # Get names, optimizer variables, buffers, and other information for elements of interest
+        addrNames = ['A', 'B', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        buffA = ctxt.lookup(operatorRepresentation['A'])
+        buffB = ctxt.lookup(operatorRepresentation['B'])
+        buffOut = ctxt.lookup(operatorRepresentation['data_out'])
+
+        transA = operatorRepresentation['transA']
+        transB = operatorRepresentation['transB']
+
+        tensorsShapeLenA = len(buffA.shape)
+        tensorsShapeLenB = len(buffB.shape)
+        tensorsShapeOutput = len(buffOut.shape)
+
+        # NSize depends on transA: if transA=0, N is last dim; if transA=1, N is second-to-last
+        NSize = buffA.shape[-1] if transA == 0 else buffA.shape[-2]
+        NOffset = 0
+
+        # Prepare input cubes lists
+        inputACubes = []
+        inputBCubes = []
+
+        # Prepare replacements lists
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output tile is constructed by a pair of input tiles. Reconstruct this pair.
+        for cube in outputCubes:
+            # Get output dimensions
+            MOffset, OOffset = cube.offset[-2:]
+            MSize, OSize = cube.dims[-2:]
+
+            # Collapse all leading (batch) dims of the cube into one batch count and check the batch tiling
+            if len(cube.offset) > 2:
+                BatchSize = math.prod(cube.dims[:-2])
+
+                if len(cube.offset) > 3:
+                    assert all(off == 0 for off in cube.offset[:-3]), (
+                        f"Unsupported tiling across leading batch dims: offsets={cube.offset}. "
+                        "Only the last batch dim (besides M/O) may be tiled.")
+            else:
+                BatchSize = 1
+
+            # Prepare cube dimensions replacements
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BatchSize)
+
+            # ===== Compute A cube information =====
+            #   Matrix offsets and shape (swap based on transA)
+            if transA == 0:
+                AMatrixOffsets = (MOffset, NOffset)
+                AMatrixShape = (MSize, NSize)
+            else:
+                AMatrixOffsets = (NOffset, MOffset)
+                AMatrixShape = (NSize, MSize)
+
+            #   Batch offset and shape: follow the output cube where A's batch dim matches the output's, else 0 / 1
+            ABatchOffsets = list()
+            ABatchShape = list()
+
+            for idx in range(tensorsShapeLenA - 2):
+                if buffA.shape[tensorsShapeLenA - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]:
+                    ABatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx])
+                    ABatchShape.append(cube.dims[len(cube.dims) - 3 - idx])
+                else:
+                    ABatchOffsets.append(0)
+                    ABatchShape.append(1)
+
+            ACube = HyperRectangle(
+                tuple(reversed(ABatchOffsets)) + tuple(AMatrixOffsets),
+                tuple(reversed(ABatchShape)) + tuple(AMatrixShape))
+            inputACubes.append(ACube)
+
+            # ===== Compute B cube information =====
+            #   Matrix offsets and shape (swap based on transB)
+            if transB == 0:
+                BMatrixOffsets = (NOffset, OOffset)
+                BMatrixShape = (NSize, OSize)
+            else:
+                BMatrixOffsets = (OOffset, NOffset)
+                BMatrixShape = (OSize, NSize)
+
+            #   Batch offset and shape: follow the output cube where B's batch dim matches the output's, else 0 / 1
+            BBatchOffsets = list()
+            BBatchShape = list()
+
+            for idx in range(tensorsShapeLenB - 2):
+                if buffB.shape[tensorsShapeLenB - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]:
+                    BBatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx])
+                    BBatchShape.append(cube.dims[len(cube.dims) - 3 - idx])
+                else:
+                    BBatchOffsets.append(0)
+                    BBatchShape.append(1)
+
+            BCube = HyperRectangle(
+                tuple(reversed(BBatchOffsets)) + tuple(BMatrixOffsets),
+                tuple(reversed(BBatchShape)) + tuple(BMatrixShape))
+            inputBCubes.append(BCube)
+
+        # Prepare load schedule lists for computed cubes
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        # Prepare replacements
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint16_t)
+        }
+
+        # Update load schedule lists
+        # *With strict=True to fail fast if the lists have different lengths
+        for a, b in zip(inputACubes, inputBCubes, strict = True):
+            inputLoadSchedule.append({"A": a, "B": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        # Prepare tiling schedule object
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py
new file mode 100644
index 0000000000..c34b84890f
--- /dev/null
+++ b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class SoftmaxTileConstraint(TileConstraint):
+    """Tile constraint for Softmax that fixes every tensor to its full shape, so nothing is split."""
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Constrain each tile dimension of every referenced tensor to the full tensor dimension.
+
+        Only string entries of `parseDict` that name a global or local buffer are
+        considered, which also covers constant inputs referenced by name. Transient
+        buffers are left out of the model.
+        """
+        referenced: List[str] = [
+            entry for entry in parseDict.values()
+            if isinstance(entry, str) and (ctxt.is_global(entry) or ctxt.is_local(entry))
+        ]
+
+        # Pin every dimension of each non-transient tensor to its full extent.
+        for name in referenced:
+            buf = ctxt.lookup(name)
+            if not isinstance(buf, TransientBuffer):
+                tilerModel.addTensorDimToModel(ctxt, name)
+                for dimIdx, fullExtent in enumerate(buf.shape):
+                    dimVar = tilerModel.getTensorDimVar(tensorName = name, dimIdx = dimIdx)
+                    tilerModel.addConstraint(dimVar == fullExtent)
+
+        return tilerModel
+
+    @staticmethod
+    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+        """Return a copy of `parseDict`; no entry is replaced by a solver variable."""
+        return parseDict.copy()
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        """Build the tiling schedule and an empty variable replacement scheme.
+
+        Every schedule step loads a cube covering all of `data_in` and stores one
+        output cube into `data_out`.
+        """
+        storeCubes = [absCube.rectangle for absCube in absoluteOutputCubes]
+
+        baseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation,
+                                          ['data_in', 'data_out'])
+        inputBaseOffsets, outputBaseOffsets = baseOffsets
+
+        # The input cube starts at the origin and spans the complete data_in shape.
+        fullShape = ctxt.lookup(operatorRepresentation['data_in']).shape
+        wholeInput = HyperRectangle(offset = (0,) * len(fullShape), dims = tuple(fullShape))
+
+        # One load/store pair per output cube; all loads share the same input cube.
+        loads = [{'data_in': wholeInput} for _ in storeCubes]
+        stores = [{'data_out': cube} for cube in storeCubes]
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, loads, stores)
+        emptyReplacements = VariableReplacementScheme({}, {})
+
+        return emptyReplacements, schedule
diff --git a/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py
new file mode 100644
index 0000000000..30572d5819
--- /dev/null
+++ b/Deeploy/Targets/Spatz/TileConstraints/TopKTileConstraint.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class TopKTileConstraint(TileConstraint):
+    """Tile constraint for TopK that fixes every tensor to its full shape, so nothing is split."""
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Constrain each tile dimension of every referenced tensor to the full tensor dimension.
+
+        Only string entries of `parseDict` that name a global or local buffer are
+        considered, which also covers constant inputs referenced by name. Transient
+        buffers are left out of the model.
+        """
+        referenced: List[str] = [
+            entry for entry in parseDict.values()
+            if isinstance(entry, str) and (ctxt.is_global(entry) or ctxt.is_local(entry))
+        ]
+
+        # Pin every dimension of each non-transient tensor to its full extent.
+        for name in referenced:
+            buf = ctxt.lookup(name)
+            if not isinstance(buf, TransientBuffer):
+                tilerModel.addTensorDimToModel(ctxt, name)
+                for dimIdx, fullExtent in enumerate(buf.shape):
+                    dimVar = tilerModel.getTensorDimVar(tensorName = name, dimIdx = dimIdx)
+                    tilerModel.addConstraint(dimVar == fullExtent)
+
+        return tilerModel
+
+    @staticmethod
+    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+        """Return a copy of `parseDict`; no entry is replaced by a solver variable."""
+        return parseDict.copy()
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        """Build the tiling schedule and an empty variable replacement scheme.
+
+        Every schedule step loads a cube covering all of `data_in` and stores the
+        same output cube into both `values_out` and `indices_out`.
+        """
+        storeCubes = [absCube.rectangle for absCube in absoluteOutputCubes]
+
+        # k_value is a scalar parsed in operatorRepresentation, not a tensor to transfer.
+        baseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation,
+                                          ['data_in', 'values_out', 'indices_out'])
+        inputBaseOffsets, outputBaseOffsets = baseOffsets
+
+        # The input cube starts at the origin and spans the complete data_in shape.
+        fullShape = ctxt.lookup(operatorRepresentation['data_in']).shape
+        wholeInput = HyperRectangle(offset = (0,) * len(fullShape), dims = tuple(fullShape))
+
+        # TopK execution policy: load the full input, execute once, then store both outputs.
+        # All loads share the same input cube; both outputs use the same output cube.
+        loads = [{'data_in': wholeInput} for _ in storeCubes]
+        stores = [{'values_out': cube, 'indices_out': cube} for cube in storeCubes]
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, loads, stores)
+        emptyReplacements = VariableReplacementScheme({}, {})
+
+        return emptyReplacements, schedule
diff --git a/Deeploy/Targets/Spatz/Tiler.py b/Deeploy/Targets/Spatz/Tiler.py
new file mode 100644
index 0000000000..f3f67102bc
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Tiler.py
@@ -0,0 +1,18 @@
+from Deeploy.Targets.Spatz.Bindings import SpatzMatMulBindings, SpatzGatherBindings, SpatzTopKBindings, SpatzSoftmaxBindings
+from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings
+from Deeploy.Targets.Spatz.TileConstraints.MatMulTileConstraint import MatMulTileConstraint
+from Deeploy.Targets.Spatz.TileConstraints.GatherTileConstraint import GatherTileConstraint
+from Deeploy.Targets.Spatz.TileConstraints.TopKTileConstraint import TopKTileConstraint
+from Deeploy.Targets.Spatz.TileConstraints.SoftmaxTileConstraint import SoftmaxTileConstraint
+
+SpatzMatMulTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzMatMulBindings,
+                                                     tileConstraint = MatMulTileConstraint())
+
+SpatzGatherTilingBindings  = TilingReadyNodeBindings(nodeBindings = SpatzGatherBindings,
+                                                     tileConstraint = GatherTileConstraint())
+
+SpatzTopKTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzTopKBindings,
+                                                     tileConstraint = TopKTileConstraint())
+
+SpatzSoftmaxTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzSoftmaxBindings,
+                                                     tileConstraint = SoftmaxTileConstraint())
diff --git a/Deeploy/TilingExtension/MemoryScheduler.py b/Deeploy/TilingExtension/MemoryScheduler.py
index e46f50e6f7..c5fb2445f0 100644
--- a/Deeploy/TilingExtension/MemoryScheduler.py
+++ b/Deeploy/TilingExtension/MemoryScheduler.py
@@ -83,7 +83,10 @@ class MemoryScheduler():
     _COSTVARIABLENAME = "H"
     _COSTPRODUCTNAME = "costProduct"
 
-    byteAlignment = 4
+    # 8-byte (64-bit) alignment: required on Spatz so vector loads land on 64-bit
+    # boundaries (a misaligned vle that gets chained corrupts upper lanes). Every
+    # 8-byte-aligned address is also 4-byte aligned, so other targets stay correct.
+    byteAlignment = 8
 
     @staticmethod
     def overlap(lifetimeA: Tuple[int, int], lifetimeB: Tuple[int, int]) -> bool:
diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py
index 5b067b2ce9..9a2aa6b9d9 100644
--- a/Deeploy/TilingExtension/TileConstraint.py
+++ b/Deeploy/TilingExtension/TileConstraint.py
@@ -131,7 +131,9 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List
 
             return solution, solutionLengths
 
-        assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!"
+        # Support multi-output nodes: use first output tensor to determine tiling structure.
+        # For operators like TopK with multiple outputs, all outputs share the same tiling pattern.
+        assert len(tilingSolution.outputTensorMemoryConstraints) >= 1, "Expected node to have at least one output!"
 
         outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items()))
         memoryPath = list(outTensorConstraint.memoryConstraints.keys())
diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py
index 9b48d9456c..fd866de066 100644
--- a/Deeploy/TilingExtension/TilerExtension.py
+++ b/Deeploy/TilingExtension/TilerExtension.py
@@ -44,6 +44,51 @@
 
 
 class Tiler():
+    """Tiler for computation graphs with memory-aware optimization.
+
+    The Tiler class provides functionality for tiling operations to fit within
+    memory constraints of target hardware platforms. It performs memory allocation, constraint
+    solving, and scheduling to optimize execution within hierarchical memory systems.
+
+    Parameters
+    ----------
+    memoryHierarchy : MemoryHierarchy
+        The memory hierarchy specification defining available memory levels and their capacities.
+
+    Attributes
+    ----------
+    arenaName : str
+        Name prefix for memory arena buffers.
+    memorySchedulerClass : Type[MemoryScheduler]
+        Class type for memory scheduler instances.
+    memoryHierarchy : MemoryHierarchy
+        The memory hierarchy configuration.
+    tilerModel : Optional[TilerModel]
+        The constraint solver model for tiling optimization.
+    innerMemoryScheduler : MemoryScheduler
+        Scheduler for inner memory level allocation.
+    outerMemoryScheduler : MemoryScheduler
+        Scheduler for outer memory level allocation.
+    symbolicMemoryConstraints : Optional[List[PatternMemoryConstraints]]
+        Symbolic memory constraints for the tiling problem.
+    visualizeMemoryAlloc : bool
+        Flag to enable memory allocation visualization.
+    memoryAllocStrategy : {"TetrisRandom", "TetrisCo-Opt", "MiniMalloc"}
+        Strategy for memory allocation.
+    searchStrategy : {"min", "max", "random-max"}
+        Search strategy for constraint solving.
+
+    Examples
+    --------
+    >>> L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 1024000)
+    >>> L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 512000)
+    >>> L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000)
+    >>> memoryHierarchy = MemoryHierarchy([L3, L2, L1])
+    >>> memoryHierarchy.setDefaultMemoryLevel("L3")
+    >>> tiler = Tiler(memoryHierarchy)
+    >>> tiler.memoryAllocStrategy = "MiniMalloc"
+    >>> solution = tiler.computeTilingSchedule(context)
+    """
 
     arenaName = "MEMORYARENA"
     memorySchedulerClass: Type[MemoryScheduler] = MemoryScheduler
@@ -53,6 +98,17 @@ class Tiler():
 
     # Initialize with the list of TemplateTCFbinding
     def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = None, workDir: Optional[str] = None):
+        """Initialize the Tiler with a memory hierarchy.
+
+        Parameters
+        ----------
+        memoryHierarchy : MemoryHierarchy
+            The memory hierarchy specification defining available memory levels.
+        testName : Optional[str], optional
+            Optional name for the test case, used for file naming. Defaults to None.
+        workDir : Optional[str], optional
+            Optional working directory for temporary files. Defaults to None.
+        """
 
         self.memoryHierarchy = memoryHierarchy
         self.tilerModel: Optional[TilerModel] = None
@@ -85,10 +141,39 @@ def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = N
 
     @property
     def worstCaseBufferSize(self):
+        """Get the worst-case buffer sizes for each memory level.
+
+        Returns
+        -------
+        Dict[str, int]
+            Dictionary mapping memory level names to their worst-case buffer sizes in bytes.
+        """
         return self._worstCaseBufferSize
 
     def plotMemoryAlloc(self, memoryMap: Dict[str, List[List[MemoryBlock]]], ctxt: NetworkContext, deeployStateDir: str,
                         memoryHierarchy: MemoryHierarchy):
+        """Generate interactive visualization of memory allocation patterns.
+
+        Creates an HTML file with Plotly visualizations showing memory allocation
+        over time for each memory level in the hierarchy.
+
+        Parameters
+        ----------
+        memoryMap : Dict[str, List[List[MemoryBlock]]]
+            Memory allocation map containing blocks for each memory level and time step.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        deeployStateDir : str
+            Directory path where the visualization HTML file will be saved.
+        memoryHierarchy : MemoryHierarchy
+            Memory hierarchy configuration for the visualization.
+
+        Notes
+        -----
+        Generates a file named 'memory_alloc.html' in the specified directory.
+        Each memory level is visualized as a separate subplot showing buffer
+        lifetimes and address space usage.
+        """
 
         os.makedirs(os.path.abspath(deeployStateDir), exist_ok = True)
         memoryAllocPlotPath = os.path.abspath(os.path.join(deeployStateDir, f"memory_alloc.html"))
@@ -177,6 +262,29 @@ def plotSingleMemoryLevel(memoryLevel: MemoryLevel):
 
     def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext,
                                      memoryMap: Dict[str, List[List[MemoryBlock]]]) -> NetworkContext:
+        """Convert network context to use static memory allocation.
+
+        Transforms the network context to use statically allocated memory arenas
+        based on the computed memory map. Updates buffer allocation templates
+        to reference specific offsets within memory arenas.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            The network context to be updated.
+        memoryMap : Dict[str, List[List[MemoryBlock]]]
+            Memory allocation map containing blocks for each memory level.
+
+        Returns
+        -------
+        NetworkContext
+            Updated network context with static memory allocation.
+
+        Notes
+        -----
+        Creates memory arena buffers for each memory level and updates
+        individual buffer allocation templates to use offsets within these arenas.
+        """
 
         maxAddr: Dict[str, int] = {}
 
@@ -254,6 +362,47 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext,
         return ctxt
 
     def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memoryLevel: str):
+        """Perform memory allocation using the MiniMalloc external tool.
+
+        Interfaces with the external MiniMalloc memory allocator to compute
+        optimal memory allocation for the given memory blocks and constraints.
+
+        Parameters
+        ----------
+        memoryMap : List[MemoryBlock]
+            List of memory blocks to be allocated.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        nodeMemoryConstraint : Optional[NodeMemoryConstraint]
+            Memory constraints for the current node, if available.
+        capacity : int
+            Total memory capacity available for allocation.
+        memoryLevel : str
+            Name of the memory level being allocated.
+
+        Returns
+        -------
+        List[MemoryBlock]
+            Updated memory blocks with assigned address spaces.
+
+        Raises
+        ------
+        KeyError
+            If MINIMALLOC_INSTALL_DIR environment variable is not set.
+        subprocess.CalledProcessError
+            If the MiniMalloc tool fails to execute successfully.
+
+        Notes
+        -----
+        Requires the MiniMalloc tool to be installed and the MINIMALLOC_INSTALL_DIR
+        environment variable to be set to the installation directory.
+        """
+
+        # MiniMalloc has no alignment flag, so allocate in units of byteAlignment:
+        # round sizes up and capacity down to whole alignment units, then scale offsets
+        # back to bytes. This guarantees every returned buffer offset is byteAlignment-aligned
+        # (required on Spatz so vector loads land on 64-bit boundaries).
+        alignment = MemoryScheduler.byteAlignment
 
         with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file:
             writer = csv.writer(file, lineterminator = "\n")
@@ -276,11 +425,14 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor
                                 8) * nodeMemoryConstraint.tensorMemoryConstraints[
                                     memoryBlock.name].memoryConstraints[memoryLevel].multiBufferCoefficient
 
+                # Size in alignment units (rounded up) so offsets come back aligned.
+                _bufferSizeAligned = (int(_bufferSize) + alignment - 1) // alignment
+
                 writer.writerow([
                     memoryBlock.name,
                     str(memoryBlock.lifetime[0]),
                     str(memoryBlock.lifetime[1] + 1),
-                    str(int(_bufferSize))
+                    str(_bufferSizeAligned)
                 ])
 
         try:
@@ -289,8 +441,8 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor
             raise KeyError("MINIMALLOC_INSTALL_DIR symbol not found!")
 
         minimallocOutput = subprocess.run([
-            f"{minimallocInstallDir}/minimalloc", f"--capacity={capacity}", f"--input={self._minimalloc_input}.csv",
-            f"--output={self._minimalloc_output}.csv"
+            f"{minimallocInstallDir}/minimalloc", f"--capacity={capacity // alignment}",
+            f"--input={self._minimalloc_input}.csv", f"--output={self._minimalloc_output}.csv"
         ],
                                           capture_output = True,
                                           text = True)
@@ -307,11 +459,39 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor
             for row in reader:
                 for memoryBlock in memoryMap:
                     if memoryBlock.name == row[0]:
-                        memoryBlock._addrSpace = (int(row[-1]), int(row[-1]) + int(row[-2]))
+                        # Scale offset/size back from alignment units to bytes (offsets are
+                        # therefore multiples of `alignment`).
+                        memoryBlock._addrSpace = (int(row[-1]) * alignment,
+                                                  (int(row[-1]) + int(row[-2])) * alignment)
 
         return memoryMap
 
     def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution:
+        """Compute the optimal tiling schedule for the network.
+
+        Solves the constraint optimization problem to find the best tiling
+        solution that satisfies memory and computational constraints.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            Network context containing the computational graph and constraints.
+
+        Returns
+        -------
+        TilingSolution
+            The computed tiling solution with memory constraints for each pattern.
+
+        Raises
+        ------
+        AssertionError
+            If the tiler model or symbolic memory constraints are not initialized.
+
+        Notes
+        -----
+        This method requires that setupModel() has been called previously to
+        initialize the constraint model and symbolic memory constraints.
+        """
         assert self.tilerModel is not None and self.symbolicMemoryConstraints is not None, "Set up the model before trying to compute a schedule!"
         collector = self.tilerModel.trySolveModel()
         tilingSolution = self._getTilingSolution(self.tilerModel, ctxt, collector, self.symbolicMemoryConstraints)
@@ -323,6 +503,29 @@ def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution:
         return tilingSolution
 
     def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution) -> MemoryMap:
+        """Compute memory allocation map from the tiling solution.
+
+        Generates a concrete memory allocation map that assigns specific
+        memory addresses to each buffer based on the tiling solution.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        tilingSolution : TilingSolution
+            The computed tiling solution.
+
+        Returns
+        -------
+        MemoryMap
+            Dictionary mapping memory level names to lists of memory blocks
+            for each time step.
+
+        Notes
+        -----
+        The memory allocation strategy (TetrisRandom, TetrisCo-Opt, or MiniMalloc)
+        determines how the actual memory addresses are assigned.
+        """
         memoryMap = {}
 
         for key in self.innerMemoryScheduler.memoryMap.keys():
@@ -348,6 +551,30 @@ def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution)
 
     def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSolution,
                             memoryMap: Dict) -> NetworkContext:
+        """Annotate memory constraints with actual address space allocations.
+
+        Updates the memory constraints in the tiling solution with the actual
+        address spaces computed during memory allocation.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        tilingSolution : TilingSolution
+            The tiling solution to be annotated.
+        memoryMap : Dict[str, List[List[MemoryBlock]]]
+            Memory allocation map with assigned address spaces.
+
+        Returns
+        -------
+        NetworkContext
+            Updated network context (returned for consistency).
+
+        Notes
+        -----
+        This method modifies the tiling solution in-place by adding address
+        space information to memory constraints.
+        """
         for idx, pattern in enumerate(tilingSolution):
             for nodeIdx, nodeConstraint in enumerate(pattern.nodeConstraints):
                 for tensorConstraint in nodeConstraint.tensorMemoryConstraints.values():
@@ -373,6 +600,32 @@ def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSoluti
 
     def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: OrderedDict[str, ONNXLayer],
                    targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NetworkContext:
+        """Set up the constraint optimization model for tiling.
+
+        Initializes the tiler model with geometric constraints, memory constraints,
+        and optimization objectives based on the network schedule and layer bindings.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            Network context containing the computational graph.
+        schedule : Schedule
+            Execution schedule defining the order of operations.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+        targetMemoryLevelMapping : TargetMemoryLevelMapping
+            Mapping defining which memory levels to use for each tensor.
+
+        Returns
+        -------
+        NetworkContext
+            The network context (returned for consistency).
+
+        Notes
+        -----
+        This method must be called before computeTilingSchedule() to initialize
+        the constraint model and symbolic memory constraints.
+        """
 
         wrapSchedule: List[SubGraph] = []
         for entry in schedule:
@@ -396,6 +649,37 @@ def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: Ord
     # SCHEREMO: Return a integer factor or IntVar variable for the multi Buffer coefficient given the tiling path, hop and tensorName.
     def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                             hop: str, tensorName: str) -> Union[int, IntVar]:
+        """Determine multi-buffering coefficient for a tensor in the tiling strategy.
+
+        Computes the buffering factor (e.g., double buffering = 2) for a given tensor
+        based on its type and usage pattern in the computation graph. This coefficient
+        is used to determine how many copies of the tensor should be kept in memory.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel, (unused)
+            The constraint solver model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        pattern : SubGraph, (unused)
+            The computation pattern being analyzed.
+        path : List[str], (unused)
+            Memory hierarchy path for the tensor.
+        hop : str, (unused)
+            Current memory level in the path.
+        tensorName : str
+            Name of the tensor to analyze.
+
+        Returns
+        -------
+        Union[int, IntVar]
+            Buffering coefficient (typically 1 for transient buffers, 2 for others).
+
+        Notes
+        -----
+        The multi-buffering strategy helps overlap computation with data movement
+        by maintaining multiple copies of buffers at different memory levels.
+        """
 
         varBuffer = ctxt.lookup(tensorName)
 
@@ -426,6 +710,30 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt
 
     def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstraints, pattern: SubGraph,
                                   ctxt: NetworkContext) -> PatternMemoryConstraints:
+        """Propagate I/O buffer strategy across the tiling pattern.
+
+        Implements static n-tuple buffering strategy by propagating border tensor
+        constraints across all steps in the tiling pattern.
+
+        Parameters
+        ----------
+        tileConstraintPattern : PatternMemoryConstraints
+            Memory constraints for the tiling pattern.
+        pattern : SubGraph
+            The computation subgraph being tiled.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+
+        Returns
+        -------
+        PatternMemoryConstraints
+            Updated pattern memory constraints with propagated I/O buffer strategy.
+
+        Notes
+        -----
+        This method ensures that border tensors (inputs/outputs of the pattern)
+        maintain consistent memory allocation across all computation steps.
+        """
 
         borderTensorStep = NodeMemoryConstraint()
         for patternStep in tileConstraintPattern.nodeConstraints:
@@ -438,6 +746,37 @@ def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstrai
 
     def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector,
                                        tensorConstraint: TensorMemoryConstraint) -> TensorMemoryConstraint:
+        """Resolve symbolic tensor memory constraints to concrete values.
+
+        Converts symbolic variables in tensor memory constraints to their
+        concrete values from the solver solution.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint solver model with the solution.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        collector : SolutionCollector
+            Solution collector from the constraint solver.
+        tensorConstraint : TensorMemoryConstraint
+            Symbolic tensor memory constraint to resolve.
+
+        Returns
+        -------
+        TensorMemoryConstraint
+            Tensor memory constraint with resolved concrete values.
+
+        Raises
+        ------
+        AssertionError
+            If the tiler model is not initialized.
+
+        Notes
+        -----
+        This method extracts the actual buffer sizes and shapes from the
+        solved constraint model and creates concrete memory constraints.
+        """
         assert self.tilerModel is not None, "Can't resolve tensor memory constraints, tilerModel is None!"
 
         tensorName = tensorConstraint.tensorName
@@ -472,6 +811,32 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo
 
     def _getTilingSolution(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector,
                            allConstraints: List[PatternMemoryConstraints]) -> List[PatternMemoryConstraints]:
+        """Extract tiling solution from the solved constraint model.
+
+        Processes all pattern memory constraints and resolves symbolic variables
+        to create a concrete tiling solution.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The solved constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        collector : SolutionCollector
+            Solution collector from the constraint solver.
+        allConstraints : List[PatternMemoryConstraints]
+            List of all symbolic pattern memory constraints.
+
+        Returns
+        -------
+        List[PatternMemoryConstraints]
+            Resolved tiling solution with concrete memory constraints.
+
+        Notes
+        -----
+        Only constraints that require resolution (multi-level or transient buffers)
+        are processed. Global single-level buffers are skipped.
+        """
 
         retList = []
 
@@ -502,6 +867,29 @@ def _checkResolve(ctxt, tensorName, tensorConstraint):
 
     def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkContext,
                                       schedule: List[SubGraph]) -> TilerModel:
+        """Set up tensor dimension product variables in the tiler model.
+
+        Adds variables representing the number of elements in each tensor
+        to the constraint model for each pattern in the schedule.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model to update.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+
+        Returns
+        -------
+        TilerModel
+            Updated tiler model with tensor dimension variables.
+
+        Notes
+        -----
+        Only processes tensors that are marked for deployment in the context.
+        """
 
         for idx, pattern in enumerate(schedule):
             subGraph = gs.Graph(nodes = pattern)
@@ -517,6 +905,33 @@ def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkCon
 
     def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph],
                                    layerBinding: OrderedDict[str, ONNXLayer]) -> TilerModel:
+        """Set up geometric constraints for each layer in the schedule.
+
+        Adds geometric and policy constraints from each layer's tile constraint
+        specification to the tiler model.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model to update.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+
+        Returns
+        -------
+        TilerModel
+            Updated tiler model with geometric constraints.
+
+        Notes
+        -----
+        Each pattern is treated as a decoupled sub-problem with respect to
+        geometric constraints. Dimension variables are regenerated for each
+        tensor using the copyIdx mechanism.
+        """
 
         # SCHEREMO: Each pattern is a decoupled sub-problem w.r.t the geometric constraints.
         # We need to regenerate dimension variables for each tensor
@@ -542,6 +957,30 @@ def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContex
         return tilerModel
 
     def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph]) -> TilerModel:
+        """Set up optimization heuristics for the tiler model.
+
+        Adds optimization objectives to maximize memory usage efficiency
+        for each pattern in the schedule.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model to update.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+
+        Returns
+        -------
+        TilerModel
+            Updated tiler model with optimization objectives.
+
+        Notes
+        -----
+        Creates pattern-level memory size variables and adds maximization
+        objectives to encourage efficient memory utilization.
+        """
 
         for idx, pattern in enumerate(schedule):
 
@@ -556,7 +995,7 @@ def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedul
 
             patternMemSizeExpr: IntVar = 0
             for tensor in patternTensorList:
-                if not ctxt.lookup(tensor.name)._deploy:
+                if not ctxt.lookup(tensor.name)._deploy or isinstance(ctxt.lookup(tensor.name), ConstantBuffer):
                     continue
 
                 patternMemSizeExpr += tilerModel.getTensorNumberOfEltVar(
@@ -581,6 +1020,34 @@ def _setupMemoryConstraints(
             self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph],
             layerBinding: OrderedDict[str, ONNXLayer],
             targetMemoryLevelMapping: TargetMemoryLevelMapping) -> Tuple[TilerModel, List[PatternMemoryConstraints]]:
+        """Set up memory constraints for the tiling optimization.
+
+        Generates memory constraints for both inner and outer memory levels,
+        considering the memory hierarchy and scheduling requirements.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model to update.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+        targetMemoryLevelMapping : TargetMemoryLevelMapping
+            Mapping defining which memory levels to use for each tensor.
+
+        Returns
+        -------
+        Tuple[TilerModel, List[PatternMemoryConstraints]]
+            Updated tiler model and list of all memory constraints.
+
+        Notes
+        -----
+        Sets up both outer (inter-pattern) and inner (intra-pattern) memory
+        constraints, considering the chosen memory allocation strategy.
+        """
 
         allMemoryConstraints = self._generateAllMemoryConstraints(tilerModel, ctxt, schedule, layerBinding,
                                                                   targetMemoryLevelMapping)
@@ -621,6 +1088,34 @@ def _generateAllMemoryConstraints(
             self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph],
             layerBinding: OrderedDict[str, ONNXLayer],
             targetMemoryLevelMapping: TargetMemoryLevelMapping) -> List[PatternMemoryConstraints]:
+        """Generate all memory constraints combining dynamic and constant tensors.
+
+        Creates comprehensive memory constraints by combining dynamic tensor
+        constraints with constant tensor constraints for each pattern.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+        targetMemoryLevelMapping : TargetMemoryLevelMapping
+            Mapping defining which memory levels to use for each tensor.
+
+        Returns
+        -------
+        List[PatternMemoryConstraints]
+            Complete list of memory constraints for all patterns.
+
+        Notes
+        -----
+        Combines results from _generateMemoryConstraints to create the complete
+        constraint set including both variable and constant buffers.
+        """
 
         dynamicTensorConstraints, constantTensorConstraints = self._generateMemoryConstraints(
             tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping)
@@ -641,6 +1136,39 @@ def _generateMemoryConstraints(
         self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph],
         layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping
     ) -> Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint]:
+        """Generate memory constraints for variable and constant buffers.
+
+        Creates detailed memory constraints including outer/inner variable
+        buffer constraints, tiled tensor constraints, and constant buffer
+        constraints.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+        targetMemoryLevelMapping : TargetMemoryLevelMapping
+            Mapping defining which memory levels to use for each tensor.
+
+        Returns
+        -------
+        Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint]
+            Tuple containing:
+            - List of pattern memory constraints for dynamic tensors
+            - Node memory constraint for constant buffers
+
+        Notes
+        -----
+        Generates three levels of constraints:
+        1. First-level: global buffers + higher-level tensors
+        2. Tiled tensor constraints with double buffering
+        3. In-place tensor constraints for unkilled tensors
+        """
 
         # SCHEREMO: Construct non-double-buffered constraints of local variable buffers
 
@@ -703,6 +1231,38 @@ def _generateMemoryConstraints(
 
     def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext,
                           tensorMemoryConstraint: TensorMemoryConstraint, pattern: SubGraph) -> TensorMemoryConstraint:
+        """Generate tiling path for a tensor across memory hierarchy levels.
+
+        Creates memory constraints for a tensor that needs to move between
+        different levels of the memory hierarchy, including multi-buffering.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        tensorMemoryConstraint : TensorMemoryConstraint
+            Original tensor memory constraint with multiple levels.
+        pattern : SubGraph
+            The computation pattern using this tensor.
+
+        Returns
+        -------
+        TensorMemoryConstraint
+            Updated tensor memory constraint with complete tiling path.
+
+        Raises
+        ------
+        AssertionError
+            If the tensor constraint doesn't have exactly 2 memory levels,
+            or if the multi-buffer factor is invalid.
+
+        Notes
+        -----
+        Uses breadth-first search to find the path between memory levels
+        and applies multi-buffering strategy at each intermediate level.
+        """
 
         assert len(tensorMemoryConstraint.memoryConstraints.keys()
                   ) == 2, "Can't generate a tile path for more than one hierarchy level!"
@@ -736,6 +1296,34 @@ def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext,
     def _generateIntermediateTilingSteps(self, tilerModel: TilerModel, ctxt: NetworkContext,
                                          sourceStep: NodeMemoryConstraint, destinationStep: NodeMemoryConstraint,
                                          pattern: SubGraph) -> NodeMemoryConstraint:
+        """Generate intermediate tiling steps between source and destination constraints.
+
+        Creates tiling constraints for tensors that need to move between different
+        memory levels within a computation pattern.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        sourceStep : NodeMemoryConstraint
+            Memory constraints for the source step.
+        destinationStep : NodeMemoryConstraint
+            Memory constraints for the destination step.
+        pattern : SubGraph
+            The computation pattern being analyzed.
+
+        Returns
+        -------
+        NodeMemoryConstraint
+            Memory constraints for intermediate tiling steps.
+
+        Notes
+        -----
+        Identifies tensors that require tiling (those with multiple memory
+        constraints) and generates appropriate tiling paths for them.
+        """
         tileConstraintStep = NodeMemoryConstraint()
 
         mergedStep = sourceStep + destinationStep
@@ -755,6 +1343,39 @@ def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkCont
                                      sourceConstraints: List[PatternMemoryConstraints],
                                      destinationConstraints: List[PatternMemoryConstraints],
                                      schedule: List[SubGraph]) -> List[PatternMemoryConstraints]:
+        """Generate tiling path constraints for all patterns in the schedule.
+
+        Creates comprehensive tiling constraints by combining source and destination
+        constraints for each pattern and applying I/O buffer strategies.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        sourceConstraints : List[PatternMemoryConstraints]
+            Source memory constraints for each pattern.
+        destinationConstraints : List[PatternMemoryConstraints]
+            Destination memory constraints for each pattern.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+
+        Returns
+        -------
+        List[PatternMemoryConstraints]
+            Complete tiling path constraints for all patterns.
+
+        Raises
+        ------
+        AssertionError
+            If source pattern constraints are not single-step.
+
+        Notes
+        -----
+        Assumes source patterns are constant and single-step since they
+        represent tensors that are live throughout the pattern execution.
+        """
 
         tileConstraints = []
 
@@ -781,6 +1402,26 @@ def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkCont
         return tileConstraints
 
     def _generateBufferConstraints(self, ctxt: NetworkContext) -> NodeMemoryConstraint:
+        """Generate memory constraints for constant global buffers.
+
+        Creates memory constraints for all constant buffers that are marked
+        for deployment in the network context.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            Network context containing buffer information.
+
+        Returns
+        -------
+        NodeMemoryConstraint
+            Memory constraints for all constant global buffers.
+
+        Notes
+        -----
+        Only processes constant buffers with _deploy flag set to True.
+        Each buffer is treated as an input tensor in the constraints.
+        """
 
         constantGlobalConstraint: NodeMemoryConstraint = NodeMemoryConstraint()
         constantGlobalBuffers = [
@@ -805,6 +1446,37 @@ def _generateVariableBufferConstraints(
         self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph],
         layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping
     ) -> Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]]:
+        """Generate memory constraints for variable buffers using flow analysis.
+
+        Performs liveness analysis on the computation graph to determine
+        memory requirements for variable buffers at different points in execution.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        schedule : List[SubGraph]
+            List of computation patterns in the schedule.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+        targetMemoryLevelMapping : TargetMemoryLevelMapping
+            Mapping defining which memory levels to use for each tensor.
+
+        Returns
+        -------
+        Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]]
+            Tuple containing:
+            - Outer memory constraints (inter-pattern)
+            - Inner memory constraints (intra-pattern)
+
+        Notes
+        -----
+        Uses graph flow analysis to compute liveness information and generates
+        both outer (pattern-level) and inner (step-level) memory constraints.
+        Includes transient buffer constraints for each computation step.
+        """
 
         def deltaFlow(
                 patternFlow: List[GenericFlowState[TensorMemLevelTuple]]) -> GenericFlowState[TensorMemLevelTuple]:
@@ -877,6 +1549,35 @@ def deltaFlow(
     def _generatePatternStepTransientBufferConstraints(
             self, tilerModel: TilerModel, ctxt: NetworkContext, layerBinding: OrderedDict[str, ONNXLayer],
             step: gs.Node, targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NodeMemoryConstraint:
+        """Generate memory constraints for transient buffers in a pattern step.
+
+        Computes memory requirements for temporary buffers needed during
+        the execution of a single computation step.
+
+        Parameters
+        ----------
+        tilerModel : TilerModel
+            The constraint model.
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        layerBinding : OrderedDict[str, ONNXLayer]
+            Mapping from node names to their layer implementations.
+        step : gs.Node
+            The computation node being analyzed.
+        targetMemoryLevelMapping : TargetMemoryLevelMapping
+            Mapping defining which memory levels to use for each tensor.
+
+        Returns
+        -------
+        NodeMemoryConstraint
+            Memory constraints for transient buffers in this step.
+
+        Notes
+        -----
+        Transient buffers are assumed to be allocated in the same memory
+        level as the main input of the computation step. Buffer sizes are
+        computed using the layer template's transient buffer size calculation.
+        """
 
         patternStepTransientBufferSizes = NodeMemoryConstraint()
 
@@ -907,6 +1608,26 @@ def _generatePatternStepTransientBufferConstraints(
         return patternStepTransientBufferSizes
 
     def assertLayerWiseTiling(self, schedule: List[List[gs.Node]]) -> bool:
+        """Assert that the schedule uses layer-wise tiling (one node per pattern).
+
+        Verifies that no pattern in the schedule contains more than one node,
+        which is required for certain memory allocation strategies.
+
+        Parameters
+        ----------
+        schedule : List[List[gs.Node]]
+            The execution schedule to validate.
+
+        Returns
+        -------
+        bool
+            True if no pattern contains more than one node, False otherwise.
+
+        Notes
+        -----
+        Layer-wise tiling is required when using the MiniMalloc memory
+        allocation strategy.
+        """
         for pattern in schedule:
             if len(pattern) > 1:
                 return False
@@ -914,12 +1635,55 @@ def assertLayerWiseTiling(self, schedule: List[List[gs.Node]]) -> bool:
         return True
 
     def assertUniformMemoryLevelAllocation(self, ctxt: NetworkContext, defaultMemoryLevel: str) -> bool:
+        """Assert that all local buffers are allocated to the default memory level.
+
+        Verifies that all local buffers in the network context are assigned
+        to the specified default memory level.
+
+        Parameters
+        ----------
+        ctxt : NetworkContext
+            Network context containing buffer information.
+        defaultMemoryLevel : str
+            Name of the default memory level to check against.
+
+        Returns
+        -------
+        bool
+            True if all local buffers use the default memory level, False otherwise.
+
+        Notes
+        -----
+        Uniform memory level allocation is required when using the MiniMalloc
+        memory allocation strategy.
+        """
         for buffer in ctxt.localObjects.values():
             if buffer._memoryLevel != defaultMemoryLevel:
                 return False
         return True
 
     def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None:
+        """Test the correctness of a computed tiling solution.
+
+        Validates that buffer sizes in the tiling solution are properly
+        aligned according to memory alignment requirements.
+
+        Parameters
+        ----------
+        tilingSolution : TilingSolution
+            The tiling solution to validate.
+
+        Raises
+        ------
+        AssertionError
+            If any buffer is not properly aligned or if multi-buffer
+            coefficients are not integers.
+
+        Notes
+        -----
+        Checks that all allocated buffers meet the byte alignment requirements
+        specified in MemoryScheduler.byteAlignment.
+        """
         # LMACAN: Assert buffer sizes are word aligned as per comment in MemoryScheduler.py:MemoryScheduler._buildCostVector()
         byteAlignment = MemoryScheduler.byteAlignment
         for patternMemoryConstraint in tilingSolution:
@@ -934,6 +1698,32 @@ def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None:
 
     def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]], graph: gs.Graph,
                                  schedule: Schedule) -> None:
+        """Test the correctness of a computed memory map.
+
+        Validates that the memory map correctly represents buffer lifetimes
+        and ensures all required buffers are alive when needed.
+
+        Parameters
+        ----------
+        memoryMap : Dict[str, List[List[MemoryBlock]]]
+            The memory map to validate.
+        graph : gs.Graph
+            The computation graph.
+        schedule : Schedule
+            The execution schedule.
+
+        Raises
+        ------
+        AssertionError
+            If output buffers are not alive until the end, input buffers
+            are not alive at the beginning, or required buffers are not
+            alive during computation steps.
+
+        Notes
+        -----
+        Performs comprehensive validation of buffer lifetimes to ensure
+        the memory map is consistent with the computation requirements.
+        """
 
         memoryBlockMap = {
             memoryBlock.name: memoryBlock for levelMemoryMap in memoryMap.values() for memoryBlock in levelMemoryMap[-1]
@@ -960,12 +1750,52 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]]
 
 
 class TilerDeployerWrapper(NetworkDeployerWrapper):
+    """Wrapper for network deployers that adds tiling capabilities.
+
+    Extends NetworkDeployerWrapper to provide automatic tiling and memory
+    management for neural network deployment on memory-constrained hardware.
+
+    Parameters
+    ----------
+    deployer : Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper]
+        The base deployer to wrap with tiling capabilities.
+    tilerCls : Type[Tiler], optional
+        The tiler class to use, by default Tiler.
+
+    Attributes
+    ----------
+    tiler : Tiler
+        The tiler instance used for memory optimization.
+
+    Raises
+    ------
+    AssertionError
+        If the platform is not a MemoryPlatform or MemoryPlatformWrapper.
+
+    Notes
+    -----
+    The wrapper automatically handles tiling setup, constraint solving,
+    and memory allocation during the binding process.
+    """
 
     def __init__(self,
                  deployer: Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper],
                  tilerCls: Type[Tiler] = Tiler,
                  testName: Optional[str] = None,
                  workDir: Optional[str] = None):
+        """Initialize the tiler deployer wrapper.
+
+        Parameters
+        ----------
+        deployer : Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper]
+            The base deployer to wrap.
+        tilerCls : Type[Tiler], optional
+            The tiler class to instantiate, by default Tiler.
+        testName : Optional[str], optional
+            Optional name for the test case, used for file naming. Defaults to None.
+        workDir : Optional[str], optional
+            Optional working directory for temporary files. Defaults to None.
+        """
         super().__init__(deployer)
         assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \
             f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}"
@@ -973,9 +1803,56 @@ def __init__(self,
 
     @property
     def worstCaseBufferSize(self):
+        """Get the worst-case buffer sizes including inputs and outputs.
+
+        Returns the value reported by the wrapped tiler's
+        ``worstCaseBufferSize`` property without modifying it.
+
+        Returns
+        -------
+        Dict[str, int]
+            Dictionary mapping memory level names to their total worst-case
+            buffer sizes in bytes.
+
+        Notes
+        -----
+        No input/output buffer sizes are added here; the result is exactly
+        what ``self.tiler.worstCaseBufferSize`` reports.
+        """
         return self.tiler.worstCaseBufferSize
 
     def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optional[MemoryMap] = None):
+        """Perform tiling and memory allocation for the network.
+
+        Executes the complete tiling process including constraint setup,
+        optimization, memory allocation, and code generation updates.
+
+        Parameters
+        ----------
+        tilingSolution : Optional[TilingSolution], optional
+            Pre-computed tiling solution to use instead of computing one.
+            If None, the solution will be computed automatically.
+        memoryMap : Optional[MemoryMap], optional
+            Pre-computed memory map to use instead of computing one.
+            If None, the memory map will be computed automatically.
+
+        Raises
+        ------
+        AssertionError
+            If only one of tilingSolution or memoryMap is provided,
+            if MiniMalloc is used with non-layer-wise tiling,
+            or if tensors are not uniformly allocated when using MiniMalloc.
+
+        Notes
+        -----
+        When using MiniMalloc memory allocation strategy, additional
+        constraints apply:
+        - Only layer-wise execution is supported
+        - All tensors must be in the default memory level
+
+        The method performs validation of the computed solutions and
+        updates the execution blocks with tiling information.
+        """
         assert (tilingSolution is None and memoryMap is None) or (tilingSolution is not None and memoryMap is not None), \
             "You need to provide both the manual tilingSolution and the memoryMap to override tiling."
 
@@ -1022,6 +1899,21 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optio
         # SCHEREMO: Code generation STUB
 
     def bind(self):
+        """Bind the network with automatic tiling.
+
+        Performs the complete binding process including layer binding
+        and automatic tiling optimization.
+
+        Returns
+        -------
+        bool
+            True if binding was successful, False otherwise.
+
+        Notes
+        -----
+        Calls the parent bind() method first, then performs tiling
+        if the initial binding was successful.
+        """
         if not super().bind():
             return False
 
@@ -1046,9 +1938,35 @@ def _printMemorySummary(self):
 
 
 def TilingReadyNodeBindings(nodeBindings: List[NodeBinding], tileConstraint: TileConstraint) -> List[NodeBinding]:
-    '''
-    Apply the TillingReadyNodeTemplate to the template of each NodeBinding.
-    '''
+    """Apply tiling constraints to a list of node bindings.
+
+    Creates deep copies of the provided node bindings and attaches the
+    specified tile constraint to each binding's template.
+
+    Parameters
+    ----------
+    nodeBindings : List[NodeBinding]
+        List of node bindings to make tiling-ready.
+    tileConstraint : TileConstraint
+        The tile constraint to attach to each binding.
+
+    Returns
+    -------
+    List[NodeBinding]
+        List of node bindings with tiling constraints attached.
+
+    Notes
+    -----
+    The function creates deep copies to avoid modifying the original
+    node bindings. Each template in the copied bindings gets the
+    tileConstraint attribute set.
+
+    Examples
+    --------
+    >>> bindings = [binding1, binding2, binding3]
+    >>> constraint = MyTileConstraint()
+    >>> tiling_bindings = TilingReadyNodeBindings(bindings, constraint)
+    """
     nodeBindingsCopy = copy.deepcopy(nodeBindings)  #.copy()
     for binding in nodeBindingsCopy:
         binding.template.tileConstraint = tileConstraint
diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py
index db83974f0c..080211270b 100644
--- a/Deeploy/TilingExtension/TilerModel.py
+++ b/Deeploy/TilingExtension/TilerModel.py
@@ -10,6 +10,7 @@
 import numpy as np
 from ortools.constraint_solver.pywrapcp import IntExpr, IntVar, SolutionCollector, Solver
 
+from Deeploy.DeeployTypes import ConstantBuffer
 from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
 from Deeploy.Logging import DEFAULT_LOGGER as log
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel
@@ -170,6 +171,10 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId
 
         tensor = ctxt.lookup(tensorName)
 
+        # Skip constant buffers: they don't participate in tiling and don't need num_elements variables
+        if isinstance(tensor, ConstantBuffer):
+            return
+
         tensorDimProductExpr = 1
 
         for idx, _ in enumerate([
diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py
index 0974fa337b..945aaa971d 100644
--- a/Deeploy/TilingExtension/TilingCodegen.py
+++ b/Deeploy/TilingExtension/TilingCodegen.py
@@ -16,18 +16,94 @@
 
 @dataclass
 class MemoryTransfer():
+    """
+    Represents a memory transfer operation between two memory levels.
+
+    This dataclass encapsulates the source and destination memory constraints
+    for a memory transfer operation in the tiling system, defining where data
+    is transferred from and to in the memory hierarchy.
+
+    Attributes
+    ----------
+    source : MemoryConstraint
+        The source memory constraint defining the memory level data is
+        transferred from.
+    destination : MemoryConstraint
+        The destination memory constraint defining the memory level data is
+        transferred to.
+
+    Notes
+    -----
+    This class is used in conjunction with memory hierarchies to define
+    data movement patterns during tiled neural network execution.
+    """
     source: MemoryConstraint
     destination: MemoryConstraint
 
 
 @dataclass
 class HyperRectangle():
+    """
+    Represents a multi-dimensional rectangular region in tensor space.
+
+    A HyperRectangle defines a rectangular tile or region within a
+    multi-dimensional tensor, specified by its position (offset) and
+    dimensions (size) in each axis. This is fundamental for tiled
+    processing of tensors where operations are performed on smaller
+    rectangular chunks.
+
+    Attributes
+    ----------
+    offset : Tuple[int, ...]
+        Position of the hyperrectangle in feature map space. Each element
+        represents the starting index along the corresponding dimension.
+    dims : Tuple[int, ...]
+        Size of the hyperrectangle along each dimension. Each element
+        represents the extent of the rectangle in the corresponding dimension.
+
+    Parameters
+    ----------
+    offset : Tuple[int, ...]
+        Starting position of the rectangle in multi-dimensional space.
+    dims : Tuple[int, ...]
+        Dimensions/size of the rectangle in multi-dimensional space.
+
+    Raises
+    ------
+    AssertionError
+        If the offset and dims tuples have different lengths.
+
+    Notes
+    -----
+    The offset and dims must have the same rank (number of dimensions).
+    This ensures the hyperrectangle is well-defined in the tensor space.
+
+    Examples
+    --------
+    >>> rect = HyperRectangle((0, 5), (10, 15))
+    >>> # Creates a 2D rectangle starting at (0,5) with size 10x15
+    """
     # position of the hyperrectangle in feature map space
     offset: Tuple[int, ...]
     # size of the hyperrectangle
     dims: Tuple[int, ...]
 
     def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]):
+        """
+        Initialize a HyperRectangle with given offset and dimensions.
+
+        Parameters
+        ----------
+        offset : Tuple[int, ...]
+            Starting position of the rectangle in multi-dimensional space.
+        dims : Tuple[int, ...]
+            Dimensions/size of the rectangle in multi-dimensional space.
+
+        Raises
+        ------
+        AssertionError
+            If offset and dims have mismatching dimensions.
+        """
         assert len(offset) == len(
             dims), f"HyperRectangle offset and dims for mismatching dimensions {offset} and {dims}"
 
@@ -37,10 +113,58 @@ def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]):
 
 @dataclass
 class AbsoluteHyperRectangle:
+    """
+    Represents a HyperRectangle with an absolute offset in memory space.
+
+    This class combines a HyperRectangle with an absolute memory offset,
+    providing both the logical tensor coordinates and the physical memory
+    location. This is useful for tracking tiles that have been positioned
+    in specific memory locations during tiling operations.
+
+    Attributes
+    ----------
+    rectangle : HyperRectangle
+        The hyperrectangle defining the logical tensor region.
+    absoluteOffset : Tuple[int, ...]
+        The absolute offset in memory space where this rectangle is located.
+
+    Parameters
+    ----------
+    rectangle : HyperRectangle
+        The hyperrectangle to associate with the absolute offset.
+    absoluteOffset : Tuple[int, ...]
+        The absolute position in memory space.
+
+    Raises
+    ------
+    AssertionError
+        If the absoluteOffset and rectangle.offset have mismatching dimensions.
+
+    Notes
+    -----
+    The absoluteOffset must have the same dimensionality as the rectangle's
+    offset to ensure consistent coordinate mapping between logical and physical
+    memory spaces.
+    """
     rectangle: HyperRectangle
     absoluteOffset: Tuple[int, ...]
 
     def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]):
+        """
+        Initialize an AbsoluteHyperRectangle with rectangle and absolute offset.
+
+        Parameters
+        ----------
+        rectangle : HyperRectangle
+            The hyperrectangle defining the logical tensor region.
+        absoluteOffset : Tuple[int, ...]
+            The absolute position in memory space.
+
+        Raises
+        ------
+        AssertionError
+            If absoluteOffset and rectangle.offset have mismatching dimensions.
+        """
         assert len(absoluteOffset) == len(
             rectangle.offset
         ), f"AsoluteHyperRectangle's absoluteOffset and rectangle's offset for mismatching dimensions {absoluteOffset} and {rectangle.offset}"
@@ -51,6 +175,46 @@ def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]):
 
 @dataclass
 class TilingSchedule():
+    """
+    Represents a complete schedule for tiled execution of neural network operations.
+
+    A TilingSchedule defines how data should be loaded, processed, and stored
+    during tiled execution. It specifies the memory offsets for input and output
+    tensors, as well as the hyperrectangles that define which regions of data
+    are processed in each tiling step.
+
+    Attributes
+    ----------
+    inputBaseOffsets : Dict[str, List[int]]
+        Dictionary mapping tensor names to lists of base memory offsets for
+        input tiles. Each list should have length equal to the number of tiles.
+    outputBaseOffsets : Dict[str, List[int]]
+        Dictionary mapping tensor names to lists of base memory offsets for
+        output tiles. Each list should have length equal to the number of tiles.
+    inputLoadSchedule : List[Dict[str, HyperRectangle]]
+        List of dictionaries, one per tile, mapping tensor names to the
+        hyperrectangles that should be loaded as input for that tile.
+    outputLoadSchedule : List[Dict[str, HyperRectangle]]
+        List of dictionaries, one per tile, mapping tensor names to the
+        hyperrectangles that should be stored as output for that tile.
+
+    Parameters
+    ----------
+    inputBaseOffsets : Dict[str, List[int]]
+        Input tensor base offsets for each tile.
+    outputBaseOffsets : Dict[str, List[int]]
+        Output tensor base offsets for each tile.
+    inputLoadSchedule : List[Dict[str, HyperRectangle]]
+        Input loading schedule for each tile.
+    outputLoadSchedule : List[Dict[str, HyperRectangle]]
+        Output storing schedule for each tile.
+
+    Notes
+    -----
+    The lengths of inputLoadSchedule and outputLoadSchedule should typically
+    be equal, representing the same number of tiles. Each schedule step
+    corresponds to processing one tile of the operation.
+    """
     # the places to store input tiles
     # Should have length numTiles
     inputBaseOffsets: Dict[str, List[int]]
@@ -70,6 +234,27 @@ class TilingSchedule():
     def __init__(self, inputBaseOffsets: Dict[str, List[int]], outputBaseOffsets: Dict[str, List[int]],
                  inputLoadSchedule: List[Dict[str, HyperRectangle]], outputLoadSchedule: List[Dict[str,
                                                                                                    HyperRectangle]]):
+        """
+        Initialize a TilingSchedule with specified offsets and load schedules.
+
+        Parameters
+        ----------
+        inputBaseOffsets : Dict[str, List[int]]
+            Input tensor base offsets for each tile.
+        outputBaseOffsets : Dict[str, List[int]]
+            Output tensor base offsets for each tile.
+        inputLoadSchedule : List[Dict[str, HyperRectangle]]
+            Input loading schedule for each tile.
+        outputLoadSchedule : List[Dict[str, HyperRectangle]]
+            Output storing schedule for each tile.
+
+        Raises
+        ------
+        AssertionError
+            If any key from inputBaseOffsets is missing from a schedule step
+            in inputLoadSchedule, or if any key from outputBaseOffsets is
+            missing from a schedule step in outputLoadSchedule.
+        """
 
         # assert len(inputLoadSchedule) == len(outputLoadSchedule), "Didn't get equal amount of input and output tiles!"
 
@@ -100,6 +285,30 @@ def __repr__(self) -> str:
         return outStr
 
     def __add__(self, other: TilingSchedule) -> TilingSchedule:
+        """
+        Concatenate two TilingSchedule objects.
+
+        Combines this tiling schedule with another by concatenating their
+        load schedules while maintaining the same base offsets. This is
+        useful for creating composite tiling schedules from multiple stages.
+
+        Parameters
+        ----------
+        other : TilingSchedule
+            The other TilingSchedule to concatenate with this one.
+
+        Returns
+        -------
+        TilingSchedule
+            A new TilingSchedule containing the concatenated load schedules
+            from both input schedules.
+
+        Raises
+        ------
+        AssertionError
+            If the other object is not a TilingSchedule, or if the tensor
+            keys don't match between the two schedules.
+        """
 
         assert isinstance(other, TilingSchedule), f"Other {other} is not a TilingSchedule"
 
@@ -124,10 +333,60 @@ def __add__(self, other: TilingSchedule) -> TilingSchedule:
 
 @dataclass
 class VariableReplacementScheme():
+    """
+    Defines how variables should be replaced with tile-specific values.
+
+    This class manages the replacement of scalar variables with arrays of
+    tile-specific values during tiled execution. It tracks both the per-tile
+    replacement values and the corresponding data types for each variable.
+
+    Attributes
+    ----------
+    perTileReplacements : Dict[str, List]
+        Dictionary mapping variable names to lists of replacement values,
+        one value per tile. Each list should have length equal to the
+        number of tiles.
+    replacementTypes : Dict[str, Type[Pointer]]
+        Dictionary mapping variable names to their corresponding pointer
+        types for the replacement arrays.
+
+    Parameters
+    ----------
+    perTileReplacements : Dict[str, List]
+        Per-tile replacement values for each variable.
+    replacementTypes : Dict[str, Type[Pointer]]
+        Type information for each replacement variable.
+
+    Raises
+    ------
+    AssertionError
+        If the keys in perTileReplacements and replacementTypes don't match
+        exactly, or if they have different numbers of entries.
+
+    Notes
+    -----
+    This scheme is used to replace compile-time constants with runtime
+    arrays during tiled execution, enabling different values for each tile.
+    """
     perTileReplacements: Dict[str, List]
     replacementTypes: Dict[str, Type[Pointer]]
 
     def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[str, Type[Pointer]]):
+        """
+        Initialize a VariableReplacementScheme with replacements and types.
+
+        Parameters
+        ----------
+        perTileReplacements : Dict[str, List]
+            Per-tile replacement values for each variable.
+        replacementTypes : Dict[str, Type[Pointer]]
+            Type information for each replacement variable.
+
+        Raises
+        ------
+        AssertionError
+            If the keys don't match exactly or have different counts.
+        """
         assert len(perTileReplacements.keys()) == len(
             replacementTypes.keys()), "Exactly all replacements must have one type"
 
@@ -138,6 +397,29 @@ def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[
         self.replacementTypes = replacementTypes
 
     def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme:
+        """
+        Concatenate two VariableReplacementScheme objects.
+
+        Combines this replacement scheme with another by concatenating their
+        per-tile replacement lists. This is useful for merging replacement
+        schemes from multiple tiling stages.
+
+        Parameters
+        ----------
+        other : VariableReplacementScheme
+            The other VariableReplacementScheme to concatenate with this one.
+
+        Returns
+        -------
+        VariableReplacementScheme
+            A new VariableReplacementScheme with concatenated replacement lists.
+
+        Raises
+        ------
+        AssertionError
+            If the other object is not a VariableReplacementScheme, or if
+            the variable keys don't match between the two schemes.
+        """
 
         assert isinstance(other, VariableReplacementScheme), f"Other {other} is not a VariableReplacementScheme"
 
@@ -161,6 +443,33 @@ def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme
 def minimizeVariableReplacement(
         scheme: VariableReplacementScheme,
         operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, Dict]:
+    """
+    Optimize a variable replacement scheme by eliminating constant replacements.
+
+    Analyzes the replacement scheme and removes variables that have the same
+    value across all tiles, directly setting them in the operator representation
+    instead. This optimization reduces memory usage and improves performance.
+
+    Parameters
+    ----------
+    scheme : VariableReplacementScheme
+        The original variable replacement scheme to optimize.
+    operatorRepresentation : OperatorRepresentation
+        The operator representation that will be updated with constant values.
+
+    Returns
+    -------
+    Tuple[VariableReplacementScheme, Dict]
+        A tuple containing:
+        - The minimized VariableReplacementScheme with only non-constant variables
+        - A dictionary of updates to apply to the operator representation
+
+    Notes
+    -----
+    Variables with identical values across all tiles are considered constants
+    and are removed from the replacement scheme. Their single value is set
+    directly in the operator representation.
+    """
     newPerTileRep = {}
     newRepTypes = {}
 
@@ -175,6 +484,48 @@ def minimizeVariableReplacement(
 
 
 def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tuple[HyperRectangle, Tuple[int, ...]]:
+    """
+    Minimize a hyperrectangle by collapsing dimensions where possible.
+
+    Reduces the dimensionality of a hyperrectangle by merging consecutive
+    dimensions where the rectangle spans the entire reference shape. This
+    optimization is useful for memory transfers and reduces complexity.
+
+    Parameters
+    ----------
+    rect : HyperRectangle
+        The hyperrectangle to minimize.
+    referenceShape : Sequence[int]
+        The shape of the reference tensor that the rectangle is within.
+
+    Returns
+    -------
+    Tuple[HyperRectangle, Tuple[int, ...]]
+        A tuple containing:
+        - The minimized HyperRectangle with collapsed dimensions
+        - The corresponding minimized reference shape
+
+    Raises
+    ------
+    AssertionError
+        If the rectangle offset is non-zero when dimensions match the
+        reference shape (indicating the rectangle spans the full dimension).
+
+    Notes
+    -----
+    Dimensions are collapsed from right to left. When a rectangle dimension
+    equals the reference dimension and has zero offset, it can be merged
+    with adjacent dimensions to reduce the overall rank.
+
+    Example
+    -------
+    >>> rect = HyperRectangle((0, 0), (2, 2))
+    >>> minimizeRectangle(rect, (4, 4))
+        (HyperRectangle(offset=(0, 0), dims=(2, 2)), (4, 4))
+    >>> rect = HyperRectangle((0, 0), (2, 2))
+    >>> minimizeRectangle(rect, (4, 2))
+        (HyperRectangle(offset=(0,), dims=(4,)), (8,))
+    """
     minRectShape: List[int] = []
     minRectOffset: List[int] = []
     minReferenceShape: List[int] = []
@@ -200,6 +551,37 @@ def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tu
 
 
 def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]:
+    """
+    Pad a shape tuple to a target rank by prepending ones.
+
+    Extends a shape tuple to a higher dimensionality by adding leading
+    dimensions of size 1. This is useful for broadcasting operations
+    and ensuring consistent tensor ranks.
+
+    Parameters
+    ----------
+    shape : Tuple[int, ...]
+        The original shape tuple to pad.
+    rank : int
+        The target rank (number of dimensions) for the padded shape.
+
+    Returns
+    -------
+    Tuple[int, ...]
+        The padded shape tuple with leading dimensions of size 1.
+
+    Raises
+    ------
+    AssertionError
+        If the target rank is smaller than the current shape's rank.
+
+    Examples
+    --------
+    >>> padShape((3, 4), 4)
+    (1, 1, 3, 4)
+    >>> padShape((5,), 3)
+    (1, 1, 5)
+    """
     assert rank >= len(
         shape), f"Cannot pad to rank smaller then shape's. Received rank: {rank}, shape rank: {len(shape)}"
     ret = tuple([1] * (rank - len(shape))) + shape
@@ -208,6 +590,37 @@ def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]:
 
 
 def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]:
+    """
+    Pad an offset tuple to a target rank by prepending zeros.
+
+    Extends an offset tuple to a higher dimensionality by adding leading
+    offset values of 0. This ensures offset tuples match the rank of
+    their corresponding shapes.
+
+    Parameters
+    ----------
+    offset : Tuple[int, ...]
+        The original offset tuple to pad.
+    rank : int
+        The target rank (number of dimensions) for the padded offset.
+
+    Returns
+    -------
+    Tuple[int, ...]
+        The padded offset tuple with leading zeros.
+
+    Raises
+    ------
+    AssertionError
+        If the target rank is smaller than the current offset's rank.
+
+    Examples
+    --------
+    >>> padOffset((2, 3), 4)
+    (0, 0, 2, 3)
+    >>> padOffset((5,), 3)
+    (0, 0, 5)
+    """
     assert rank >= len(
         offset), f"Cannot pad to rank smaller then offset's. Received rank: {rank}, offset rank: {len(offset)}"
     ret = tuple([0] * (rank - len(offset))) + offset
@@ -216,6 +629,39 @@ def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]:
 
 
 def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[int, ...]:
+    """
+    Pad a stride tuple to a target rank by prepending a specified stride value.
+
+    Extends a stride tuple to a higher dimensionality by adding leading
+    stride values. This is useful for maintaining consistent stride
+    calculations across different tensor ranks.
+
+    Parameters
+    ----------
+    stride : Tuple[int, ...]
+        The original stride tuple to pad.
+    rank : int
+        The target rank (number of dimensions) for the padded stride.
+    paddingStride : int
+        The stride value to use for padding (prepended dimensions).
+
+    Returns
+    -------
+    Tuple[int, ...]
+        The padded stride tuple with leading padding stride values.
+
+    Raises
+    ------
+    AssertionError
+        If the target rank is smaller than the current stride's rank.
+
+    Examples
+    --------
+    >>> padStride((4, 1), 4, 16)
+    (16, 16, 4, 1)
+    >>> padStride((1,), 3, 8)
+    (8, 8, 1)
+    """
     assert rank >= len(
         stride), f"Cannot pad to rank smaller then stride's. Received rank: {rank}, stride rank: {len(stride)}"
     ret = tuple([paddingStride] * (rank - len(stride))) + stride
@@ -224,6 +670,36 @@ def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[i
 
 
 def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]:
+    """
+    Calculate memory strides from a tensor shape.
+
+    Computes the stride values for each dimension of a tensor based on its
+    shape. Strides represent the number of elements to skip in memory when
+    moving one position along each dimension.
+
+    Parameters
+    ----------
+    shape : Sequence[int]
+        The shape of the tensor as a sequence of dimension sizes.
+
+    Returns
+    -------
+    Tuple[int, ...]
+        The stride values for each dimension, where the last dimension
+        has stride 1 and earlier dimensions have progressively larger strides.
+
+    Notes
+    -----
+    Strides are computed assuming row-major (C-style) memory layout.
+    The stride for dimension i is the product of all dimensions after i.
+
+    Examples
+    --------
+    >>> stridesFromShape([2, 3, 4])
+    (12, 4, 1)
+    >>> stridesFromShape([5, 6])
+    (6, 1)
+    """
     strides = [1] * len(shape)
     for idx, dim in enumerate(reversed(shape[1:])):
         strides[idx + 1] = strides[idx] * dim
@@ -231,18 +707,114 @@ def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]:
 
 
 def calculateFlatOffset(offsets: Sequence[int], strides: Sequence[int]) -> int:
+    """
+    Calculate the flat memory offset from multi-dimensional coordinates.
+
+    Converts multi-dimensional tensor coordinates (offsets) to a single
+    flat memory offset using the provided stride information. This is
+    essential for translating tensor indices to memory addresses.
+
+    Parameters
+    ----------
+    offsets : Sequence[int]
+        The multi-dimensional coordinates/offsets in each dimension.
+    strides : Sequence[int]
+        The stride values for each dimension.
+
+    Returns
+    -------
+    int
+        The flat memory offset corresponding to the multi-dimensional position.
+
+    Raises
+    ------
+    AssertionError
+        If offsets and strides have different numbers of dimensions.
+
+    Notes
+    -----
+    The flat offset is computed as the sum of (offset[i] * stride[i])
+    for all dimensions i.
+
+    Examples
+    --------
+    >>> calculateFlatOffset([1, 2, 3], [12, 4, 1])
+    23
+    >>> calculateFlatOffset([0, 1], [6, 1])
+    1
+    """
     assert len(offsets) == len(strides), \
         f"Offsets and strides have to have the same number of dimensions. Length offsets: {len(offsets)}, strides: {len(strides)}"
     return sum(offset * stride for offset, stride in zip(offsets, strides))
 
 
 def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBuffer) -> int:
+    """
+    Calculate the flat memory offset in bytes for a hyperrectangle tile.
+
+    Computes the byte offset in memory for the starting position of a
+    hyperrectangle tile within a reference buffer. This accounts for
+    both the multi-dimensional positioning and the data type size.
+
+    Parameters
+    ----------
+    tile : HyperRectangle
+        The hyperrectangle tile whose offset should be calculated.
+    referenceBuffer : VariableBuffer
+        The reference buffer containing the tile, used for shape and type info.
+
+    Returns
+    -------
+    int
+        The flat memory offset in bytes from the buffer start to the tile start.
+
+    Notes
+    -----
+    The calculation combines multi-dimensional offset computation with
+    data type width to produce a byte-level memory offset.
+    """
     return int(
         calculateFlatOffset(tile.offset, stridesFromShape(referenceBuffer.shape)) *
         (referenceBuffer._type.referencedType.typeWidth // 8))
 
 
 def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]:
+    """
+    Compute hyperrectangle tiles for a memory transfer operation.
+
+    Generates a list of hyperrectangle tiles that partition the source tensor
+    into smaller chunks that fit within the destination memory constraints.
+    This is fundamental for tiled execution where large tensors are processed
+    in smaller, memory-efficient pieces.
+
+    Parameters
+    ----------
+    memoryTransfer : MemoryTransfer
+        The memory transfer operation defining source and destination constraints.
+
+    Returns
+    -------
+    List[HyperRectangle]
+        A list of hyperrectangle tiles that cover the entire source tensor,
+        each fitting within the destination memory constraints.
+
+    Raises
+    ------
+    AssertionError
+        If source or destination shapes are undefined, if they have different
+        numbers of dimensions, or if any destination dimension is larger than
+        the corresponding source dimension.
+
+    Notes
+    -----
+    The tiling algorithm generates non-overlapping tiles that completely
+    cover the source tensor. Each tile is sized to fit within the destination
+    memory constraints, with edge tiles potentially being smaller to fit
+    exactly within the source tensor boundaries.
+
+    The tiles are generated in row-major order, iterating through dimensions
+    from outermost to innermost.
+    """
     assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!"
     assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!"
 
@@ -256,6 +828,19 @@ def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRect
         assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. ({dimSizeSmall} > {dimSizeLarge})"
 
     def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]:
+        """
+        Generate tile indices in row-major order.
+
+        Parameters
+        ----------
+        tileIndexEnd : List[int]
+            The end index for each dimension (exclusive).
+
+        Yields
+        ------
+        List[int]
+            Successive tile indices covering the entire index space.
+        """
         tileCount = np.prod(tileIndexEnd)
         tileIndex = [0] * len(tileIndexEnd)
         for _ in range(tileCount):
diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt
index b7f3535790..71f632cbd2 100644
--- a/DeeployTest/CMakeLists.txt
+++ b/DeeployTest/CMakeLists.txt
@@ -50,6 +50,8 @@ elseif(DEEPLOY_ARCH STREQUAL SNITCH)
   add_subdirectory(Platforms/Snitch)
 elseif(DEEPLOY_ARCH STREQUAL CHIMERA)
   add_subdirectory(Platforms/Chimera)
+elseif(DEEPLOY_ARCH STREQUAL SPATZ)
+  add_subdirectory(Platforms/Spatz)
 elseif(platform STREQUAL GAP9)
 
   # Search for hex files generated by Python code generator
diff --git a/DeeployTest/Platforms/Spatz/CMakeLists.txt b/DeeployTest/Platforms/Spatz/CMakeLists.txt
new file mode 100644
index 0000000000..6af333af2b
--- /dev/null
+++ b/DeeployTest/Platforms/Spatz/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Spatz test application: compiles main.c with the Spatz benchmark runtime source,
+# links it against `network` and `deeploylib`, and calls the GVSoC / vsim helpers.
+set(ProjectId ${TESTNAME})
+
+# Sources are listed explicitly (no globbing) so that changes are visible in
+# review and do not depend on a re-configure.
+set(SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.c
+    ${SPATZ_HOME}/sw/spatzBenchmarks/benchmark/benchmark.c
+)
+
+add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES})
+
+# main.c includes <benchmark.h>; the directory is also exported through `network`.
+set(SPATZ_BENCHMARK_INCLUDE_DIR
+    ${SPATZ_HOME}/sw/spatzBenchmarks/include
+)
+target_include_directories(${ProjectId} PRIVATE ${SPATZ_BENCHMARK_INCLUDE_DIR})
+target_include_directories(network PUBLIC ${SPATZ_BENCHMARK_INCLUDE_DIR})
+
+target_link_libraries(${ProjectId} PRIVATE network deeploylib)
+
+add_spatz_gvsoc_emulation(${ProjectId} "spatz_v2")
+add_spatz_vsim_simulation(${ProjectId})
diff --git a/DeeployTest/Platforms/Spatz/main.c b/DeeployTest/Platforms/Spatz/main.c
new file mode 100644
index 0000000000..4a413b48ed
--- /dev/null
+++ b/DeeployTest/Platforms/Spatz/main.c
@@ -0,0 +1,135 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <benchmark.h>
+#include "printf.h"
+
+#include "Network.h"
+#include "testinputs.h"
+#include "testoutputs.h"
+
+// Zero-copy input binding is attempted by default; define as 0 to always copy.
+#ifndef DEEPLOY_ZERO_COPY_TEST_INPUTS
+#define DEEPLOY_ZERO_COPY_TEST_INPUTS 1
+#endif
+
+// Optional: some generated networks provide this helper to avoid copying
+// test inputs into Deeploy-owned buffers. The weak declaration makes the
+// symbol resolve to NULL when the network does not define it.
+#ifndef DEEPLOYNETWORK_HAS_BIND_EXTERNAL_INPUTS
+void DeeployNetwork_BindExternalInputs(void **external_inputs) __attribute__((weak));
+#endif
+
+// Copy every test input vector into the matching network input buffer.
+static void copyTestInputs(void) {
+  for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
+    memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]);
+  }
+}
+
+#if ISOUTPUTFLOAT == 1
+// Raw bit pattern of a float output value, read via memcpy so that no
+// strict-aliasing rule is violated.
+// NOTE(review): assumes OUTPUTTYPE is a 32-bit IEEE-754 float - confirm.
+static uint32_t outputBits(OUTPUTTYPE value) {
+  uint32_t bits;
+  memcpy(&bits, &value, sizeof(bits));
+  return bits;
+}
+
+// Non-zero for every NaN encoding (exponent all ones, mantissa non-zero),
+// not only for the single pattern 0x7FC00000.
+static int outputIsNan(OUTPUTTYPE value) {
+  const uint32_t bits = outputBits(value);
+  return ((bits & 0x7F800000u) == 0x7F800000u) && ((bits & 0x007FFFFFu) != 0u);
+}
+#endif
+
+// Test harness: initializes the generated network, feeds it the test inputs,
+// runs it and compares the outputs to the expected vectors.
+// Always returns 0; the result is reported through the printed error count.
+int main(void) {
+  const unsigned int core_id = snrt_cluster_core_idx();
+  unsigned int timer_start = 0, timer_end, timer;
+
+  if (core_id == 0) {
+    printf("[INFO] Running on %u cores\n", (unsigned int)snrt_cluster_core_num());
+  }
+  if (snrt_is_dm_core()) {
+    printf("[INFO] DM core is core number %u\n", core_id);
+  }
+  snrt_cluster_hw_barrier();
+
+  // Network setup and input binding are done by the DM core only.
+  if (snrt_is_dm_core()) {
+    printf("Initializing network...\r\n");
+    InitNetwork(0, 1);
+
+#if DEEPLOY_ZERO_COPY_TEST_INPUTS
+    // Use the bind hook when the network provides it, otherwise copy.
+    if (DeeployNetwork_BindExternalInputs) {
+      DeeployNetwork_BindExternalInputs(testInputVector);
+    } else {
+      copyTestInputs();
+    }
+#else
+    copyTestInputs();
+#endif
+
+    printf("Running network...\r\n");
+  }
+  snrt_cluster_hw_barrier();
+
+  if (snrt_is_dm_core()) {
+    timer_start = benchmark_get_cycle();
+  }
+  // All cores call into the network; the second argument is hard-coded to 2
+  // (presumably the number of cores - TODO confirm).
+  RunNetwork(core_id, 2);
+
+  snrt_cluster_hw_barrier();
+
+  if (snrt_is_dm_core()) {
+    timer_end = benchmark_get_cycle();
+    timer = timer_end - timer_start;
+
+    printf("Network ran in %u cycles.\r\nChecking Outputs...\r\n", timer);
+    int32_t tot_err = 0;
+    uint32_t tot = 0;
+
+    for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
+      const uint32_t num_elements = DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE);
+      tot += num_elements;
+      for (uint32_t i = 0; i < num_elements; i++) {
+        const OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i];
+        const OUTPUTTYPE actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i];
+        const OUTPUTTYPE diff = expected - actual;
+
+#if ISOUTPUTFLOAT == 1
+        // Allow a small margin of error for float outputs. NaN compares false
+        // against both bounds, so it is detected separately.
+        if ((diff < -1e-4f) || (diff > 1e-4f) || outputIsNan(diff)) {
+          tot_err += 1;
+          // Values are printed as raw bit patterns rather than with %f.
+          printf("Expected: 0x%08x  Actual: 0x%08x  Diff: 0x%08x at Index %4u in Output %u\r\n", outputBits(expected), outputBits(actual), outputBits(diff), i, buf);
+        }
+#else
+        // No margin for integer comparison.
+        if (diff != 0) {
+          tot_err += 1;
+          printf("Expected: %4d  ", expected);
+          printf("Actual: %4d  ", actual);
+          printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
+        }
+#endif
+      }
+    }
+
+    printf("Errors: %d out of %u \r\n", (int)tot_err, (unsigned int)tot);
+  }
+
+  printf("core %u arrived at the end\r\n", core_id);
+  snrt_cluster_hw_barrier();
+  printf("We are after hw barrier\r\n");
+
+  return 0;
+}
diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz
new file mode 100644
index 0000000000..eb073685c7
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx
new file mode 100644
index 0000000000..c20c89bd05
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz
new file mode 100644
index 0000000000..ed786d2e1d
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz
new file mode 100644
index 0000000000..afc11e34d7
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx
new file mode 100644
index 0000000000..94e265be97
--- /dev/null
+++ b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx
@@ -0,0 +1,13 @@
+pytorch2.7.0:^
+&
+VA/Softmax"Softmax*
+axis�
+main_graphZ
+V
+
+
+b
+A
+
+
+B
\ No newline at end of file
diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz
new file mode 100644
index 0000000000..f5f6daea15
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/inputs.npz
new file mode 100644
index 0000000000..cf71086f0a
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/network.onnx
new file mode 100644
index 0000000000..25a9df5ce5
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/outputs.npz
new file mode 100644
index 0000000000..6fb79e45c5
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/network.onnx
new file mode 100644
index 0000000000..d50f25b9a9
--- /dev/null
+++ b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/network.onnx
@@ -0,0 +1,35 @@
+pytorch2.7.0:�
+)
+Q
+K/MatMul_output_0/MatMul"MatMul
+E
+/MatMul_output_0/Softmax_output_0/Softmax"Softmax*
+axis�
+,
+/Softmax_output_0
+VA	/MatMul_1"MatMul
+main_graphZ
+Q
+
+
+@Z
+K
+	
+@
+�Z
+V
+	
+�
+@b
+A
+
+
+@j#
+/MatMul_output_0
+	
+
+�j$
+/Softmax_output_0
+	
+
+�B
\ No newline at end of file
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/outputs.npz
new file mode 100644
index 0000000000..15750ef660
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/network.onnx
new file mode 100644
index 0000000000..3d8856b104
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/outputs.npz
new file mode 100644
index 0000000000..f2d40944bd
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/network.onnx
new file mode 100644
index 0000000000..1cfad6347b
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/outputs.npz
new file mode 100644
index 0000000000..7ff1584247
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/network.onnx
new file mode 100644
index 0000000000..10882f4c35
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/outputs.npz
new file mode 100644
index 0000000000..7ff1584247
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/network.onnx
new file mode 100644
index 0000000000..e0a9db3b16
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/outputs.npz
new file mode 100644
index 0000000000..2a25c8d109
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/network.onnx
new file mode 100644
index 0000000000..5ecf54f7e7
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/outputs.npz
new file mode 100644
index 0000000000..7ff1584247
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/inputs.npz
new file mode 100644
index 0000000000..9c1eb8c9be
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/network.onnx
new file mode 100644
index 0000000000..ce16e85054
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/outputs.npz
new file mode 100644
index 0000000000..76aacbb6e0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/outputs.npz differ
diff --git a/DeeployTest/deeployRunner_spatz.py b/DeeployTest/deeployRunner_spatz.py
new file mode 100644
index 0000000000..5404defc13
--- /dev/null
+++ b/DeeployTest/deeployRunner_spatz.py
@@ -0,0 +1,12 @@
+import sys
+
+from testUtils.deeployRunner import main
+
+if __name__ == "__main__":
+    sys.exit(
+        main(
+            default_platform = "Spatz",
+            default_simulator = "gvsoc",
+            tiling_enabled = False,
+        )
+    )
diff --git a/DeeployTest/deeployRunner_tiled_spatz.py b/DeeployTest/deeployRunner_tiled_spatz.py
new file mode 100644
index 0000000000..6900d7010e
--- /dev/null
+++ b/DeeployTest/deeployRunner_tiled_spatz.py
@@ -0,0 +1,12 @@
+import sys
+
+from testUtils.deeployRunner import main
+
+if __name__ == "__main__":
+    sys.exit(
+        main(
+            default_platform = "Spatz",
+            default_simulator = "gvsoc",
+            tiling_enabled = True,
+        )
+    )
diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py
index 01216984af..ec9fb3a7ce 100644
--- a/DeeployTest/testMVP.py
+++ b/DeeployTest/testMVP.py
@@ -123,6 +123,7 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
     if args.doublebuffer:
         assert args.defaultMemLevel in ["L3", "L2"]
         if args.defaultMemLevel == "L3":
+            # for double buffering on spatz set this to DBTiler and pass --doublebuffer to deeployRunner_spatz
             deployer = TilerDeployerWrapper(deployer, DBOnlyL3Tiler, testName = testIdentifier, workDir = args.dumpdir)
         else:
             deployer = TilerDeployerWrapper(deployer, DBTiler, testName = testIdentifier, workDir = args.dumpdir)
@@ -250,11 +251,20 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
             test_inputs = [test_inputs[0]]
             test_outputs = [test_outputs[-2]]
 
-    # Instantiate Classes Requried for Memory Level Annotation Extension
-    L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000)
-    L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2)
-    L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1)
-    memoryLevels = [L3, L2, L1]
+    # Instantiate Classes Required for Memory Level Annotation Extension
+    if args.platform == "Spatz":
+        # Spatz cluster has only TCDM (L1) + external DRAM (L3). No on-chip L2.
+        # Declare L1 and L3 as direct neighbours so BFS-based tile-path
+        # generation does not insert a phantom L2 staging buffer.
+        L3 = MemoryLevel(name = "L3", neighbourNames = ["L1"], size = 64000000)
+        L1 = MemoryLevel(name = "L1", neighbourNames = ["L3"], size = args.l1)
+        memoryLevels = [L3, L1]
+    else:
+        L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"],        size = 64000000)
+        L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"],  size = args.l2)
+        L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"],        size = args.l1)
+        memoryLevels = [L3, L2, L1]
+
 
     if args.neureka_wmem:
         memoryLevels.append(MemoryLevel(name = "WeightMemory_SRAM", neighbourNames = [], size = 4 * 1024 * 1024))
diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py
index 39a44d9442..d34ee25e74 100644
--- a/DeeployTest/testUtils/codeGenerate.py
+++ b/DeeployTest/testUtils/codeGenerate.py
@@ -145,6 +145,25 @@ def generateTestNetworkHeader(deployer: NetworkDeployer) -> str:
     return retStr
 
 
+def _generateBindExternalInputs(deployer: NetworkDeployer) -> str:
+    """Generate a bind function for all global network input buffers."""
+    inputs = deployer.inputs()
+
+    retStr = "void DeeployNetwork_BindExternalInputs(void **external_inputs) {\n"
+    retStr += "  // NOTE: This is a hack to avoid the memcpy in main.c from \n"
+    retStr += "  // testInputVector to DeeployNetwork_inputs, since they are both in L3\n"
+
+    for index, node in enumerate(inputs):
+        typeName = node._type.referencedType.typeName
+        retStr += f"  DeeployNetwork_input_{index} = ({typeName} *)external_inputs[{index}];\n"
+
+    for index in range(len(inputs)):
+        retStr += f"  DeeployNetwork_inputs[{index}] = (void *)DeeployNetwork_input_{index};\n"
+
+    retStr += "}\n"
+    return retStr
+
+
 def generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: CodeGenVerbosity) -> str:
     retStr = ""
 
@@ -198,6 +217,9 @@ def generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: C
     }
     """
 
+    # TODO: emit this only for Spatz, and with the correct number of inputs every time
+    retStr += _generateBindExternalInputs(deployer)
+
     return retStr
 
 
diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py
index 1dcddeea62..cdbd0af3db 100644
--- a/DeeployTest/testUtils/core/execution.py
+++ b/DeeployTest/testUtils/core/execution.py
@@ -6,6 +6,7 @@
 import shutil
 import subprocess
 import sys
+import threading
 from pathlib import Path
 
 from Deeploy.Logging import DEFAULT_LOGGER as log
@@ -191,15 +192,43 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult:
 
     log.debug(f"[Execution] Simulation command: {' '.join(cmd)}")
 
-    result = subprocess.run(cmd, capture_output = True, text = True, env = env)
-
-    if result.stdout:
-        print(result.stdout, end = '')
-    if result.stderr:
-        print(result.stderr, end = '', file = sys.stderr)
+    process = subprocess.Popen(
+        cmd,
+        stdout = subprocess.PIPE,
+        stderr = subprocess.PIPE,
+        text = True,
+        env = env,
+        bufsize = 1,
+    )
+
+    stdout_chunks = []
+    stderr_chunks = []
+
+    def _stream_reader(pipe, chunks, is_stderr: bool = False) -> None:
+        assert pipe is not None
+        for line in iter(pipe.readline, ''):
+            chunks.append(line)
+            if is_stderr:
+                print(line, end = '', file = sys.stderr, flush = True)
+            else:
+                print(line, end = '', flush = True)
+        pipe.close()
+
+    stdout_thread = threading.Thread(target = _stream_reader, args = (process.stdout, stdout_chunks), daemon = True)
+    stderr_thread = threading.Thread(target = _stream_reader, args = (process.stderr, stderr_chunks, True), daemon = True)
+
+    stdout_thread.start()
+    stderr_thread.start()
+
+    returncode = process.wait()
+    stdout_thread.join()
+    stderr_thread.join()
+
+    stdout = ''.join(stdout_chunks)
+    stderr = ''.join(stderr_chunks)
 
     # Parse output for error count and cycles
-    test_result = parse_test_output(result.stdout, result.stderr)
+    test_result = parse_test_output(stdout, stderr)
 
     if not test_result.success and test_result.error_count == -1:
         log.warning(f"Could not parse error count from output")
diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py
index a5a8d70ef3..0c98e254aa 100644
--- a/DeeployTest/testUtils/deeployRunner.py
+++ b/DeeployTest/testUtils/deeployRunner.py
@@ -348,6 +348,7 @@ def main(default_platform: Optional[str] = None,
         "snitch": "Snitch",
         "chimera": "Chimera",
         "softhier": "SoftHier",
+        "spatz": "Spatz",
     }
 
     if args.platform:
@@ -388,6 +389,7 @@ def main(default_platform: Optional[str] = None,
             "Snitch": "gvsoc",
             "Chimera": "gvsoc",
             "SoftHier": "gvsoc",
+            "Spatz": "vsim",
         }
         simulator = simulator_map.get(platform, "host")
         log.info(f"No simulator specified, using default for {platform}: {simulator}")
diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py
index 9d526906f9..69a83f1e8d 100644
--- a/DeeployTest/testUtils/platformMapping.py
+++ b/DeeployTest/testUtils/platformMapping.py
@@ -10,6 +10,8 @@
 from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
 from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
+from Deeploy.Targets.Spatz.Deployer import SpatzDeployer
+from Deeploy.Targets.Spatz.Platform import SpatzOptimizer, SpatzPlatform
 from Deeploy.Targets.Chimera.Deployer import ChimeraDeployer
 from Deeploy.Targets.Chimera.Platform import ChimeraOptimizer, ChimeraPlatform
 from Deeploy.Targets.CortexM.Deployer import CMSISDeployer
@@ -31,7 +33,7 @@
 from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform
 
 _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"]
-_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"]
+_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "Spatz"]
 _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS
 
 
@@ -76,6 +78,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]:
     elif platformName == "Chimera":
         Platform = ChimeraPlatform()
 
+    elif platformName == "Spatz":
+        Platform = SpatzPlatform()
+
     else:
         raise RuntimeError(f"Deployment platform {platformName} is not implemented")
 
@@ -272,6 +277,18 @@ def mapDeployer(platform: DeploymentPlatform,
                                    name = name,
                                    default_channels_first = default_channels_first,
                                    deeployStateDir = deeployStateDir)
+    
+    elif isinstance(platform, (SpatzPlatform)):
+        deployer = SpatzDeployer(
+            graph,
+            platform,
+            inputTypes,
+            SpatzOptimizer,
+            scheduler,
+            name = name,
+            default_channels_first = default_channels_first,
+            deeployStateDir = deeployStateDir
+        )
 
     else:
         raise RuntimeError(f"Deployer for platform {platform} is not implemented")
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 6d9f3cfcd7..6be4bef197 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -110,6 +110,14 @@ def param_id(param):
         "model_tests": SNITCH_MODEL_TESTS,
         "default_num_cores": SNITCH_DEFAULT_NUM_CORES,
     },
+    "spatz": {
+        "platform": "Spatz",
+        "simulator": "vsim",
+        # TODO: Define KERNEL_TESTS and MODEL_TESTS for Spatz
+        "kernel_tests": [],
+        "model_tests": [],
+        # "default_num_cores": <set if known>,
+    },
     "gap9": {
         "platform": "GAP9",
         "simulator": "gvsoc",
diff --git a/Makefile b/Makefile
index d40a49da11..49b04baeb2 100644
--- a/Makefile
+++ b/Makefile
@@ -27,10 +27,12 @@ PICOLIBC_RV32IMF_INSTALL_DIR      ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imf
 CHIMERA_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/chimera-sdk
 PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk
 SNITCH_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/snitch_cluster
+SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/spatz
 QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu
 BANSHEE_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/banshee
 MEMPOOL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/mempool
 GVSOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc
+GVSOC_SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc_spatz
 SOFTHIER_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/softhier
 MINIMALLOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/minimalloc
 XTL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/xtl
@@ -44,8 +46,10 @@ PICOLIBC_COMMIT_HASH ?= 31ff1b3601b379e4cab63837f253f59729ce1fef
 PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4
 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6
 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2
+SPATZ_COMMIT_HASH ?= 6bd9f3094e237dab392983edb827105bce8e3e86
 SOFTHIER_COMMIT_HASH ?= 0       # bowwang: to be updated
-GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0
+# GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 # old
+GVSOC_COMMIT_HASH ?= 209c147cbd293d5c1590694e68c489122c777acc # new
 MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d
 CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea
 XTL_VERSION ?= 0.7.5
@@ -69,7 +73,7 @@ else
 	$(error unsupported platform $(OS))
 endif
 
-all: toolchain emulators docs echo-bash
+all: toolchain emulators # docs echo-bash
 
 echo-bash:
 
@@ -79,8 +83,10 @@ echo-bash:
 	@echo "export PULP_SDK_HOME=${PULP_SDK_INSTALL_DIR}"
 	@echo "export CHIMERA_SDK_HOME=${CHIMERA_SDK_INSTALL_DIR}"
 	@echo "export SNITCH_HOME=${SNITCH_INSTALL_DIR}"
+	@echo "export SPATZ_HOME=${SPATZ_INSTALL_DIR}"
 	@echo "export GVSOC_INSTALL_DIR=${GVSOC_INSTALL_DIR}"
 	@echo "export SOFTHIER_INSTALL_DIR=${SOFTHIER_INSTALL_DIR}"
+	@echo "export BANSHEE_INSTALL_DIR=${BANSHEE_INSTALL_DIR}"
 	@echo "export LLVM_INSTALL_DIR=${LLVM_INSTALL_DIR}"
 	@echo "export MEMPOOL_HOME=${MEMPOOL_INSTALL_DIR}"
 	@echo "export CMAKE=$$(which cmake)"
@@ -91,9 +97,9 @@ echo-bash:
 	@echo "source ${PULP_SDK_INSTALL_DIR}/configs/siracusa.sh"
 
 
-toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv
+toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv xtensor minimalloc # xtensor needed for gvsoc, minimalloc for tiling
 
-emulators: snitch_runtime pulp-sdk qemu banshee mempool
+emulators: snitch_runtime spatz_runtime pulp-sdk qemu banshee mempool gvsoc
 
 ${TOOLCHAIN_DIR}/llvm-project:
 	cd ${TOOLCHAIN_DIR} && \
@@ -124,6 +130,7 @@ ${LLVM_INSTALL_DIR}: ${TOOLCHAIN_DIR}/llvm-project
 llvm: ${LLVM_INSTALL_DIR}
 
 
+# runtimes for different architectures
 ${LLVM_CLANG_RT_RISCV_RV32IM}: ${TOOLCHAIN_DIR}/llvm-project
 	cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-riscv-rv32im \
 	&& cd build-compiler-rt-riscv-rv32im; \
@@ -429,16 +436,55 @@ ${SNITCH_INSTALL_DIR}: ${TOOLCHAIN_DIR}/snitch_cluster
 
 snitch_runtime: ${SNITCH_INSTALL_DIR}
 
+${TOOLCHAIN_DIR}/spatz:
+	cd ${TOOLCHAIN_DIR} && \
+	git clone https://github.com/pulp-platform/spatz.git && \
+	cd ${TOOLCHAIN_DIR}/spatz && git checkout ${SPATZ_COMMIT_HASH} && \
+ 	git submodule update --init --recursive
+
+${SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/spatz
+	mkdir -p ${SPATZ_INSTALL_DIR}
+	cp -r ${TOOLCHAIN_DIR}/spatz/ ${SPATZ_INSTALL_DIR}/../
+	cd ${SPATZ_INSTALL_DIR} && \
+	make all -j8 && \
+	python3.6 -m venv .venv && \
+	.venv/bin/pip install jsonref jsonschema jstyleson dataclasses hjson mako && \
+	source .venv/bin/activate && \
+	source util/iis-env.sh && \
+	make init && \
+	cd hw/system/spatz_cluster/ && \
+	make sw
+
+spatz_runtime: ${SPATZ_INSTALL_DIR}
+
+# ${TOOLCHAIN_DIR}/gvsoc_spatz:
+# 	cd ${TOOLCHAIN_DIR} && \
+# 	git clone https://github.com/gvsoc/gvsoc.git gvsoc_spatz && \
+# 	cd ${TOOLCHAIN_DIR}/gvsoc_spatz && git checkout ${GVSOC_SPATZ_COMMIT_HASH} && \
+# 	git submodule update --init --recursive && \
+# 	python3 -m venv venv && source venv/bin/activate &&\
+# 	pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil && \
+# 	cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch
+# 
+# 
+# ${GVSOC_SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc_spatz
+# 	cd ${TOOLCHAIN_DIR}/gvsoc_spatz && \
+# 	source venv/bin/activate &&\
+# 	CXX=g++-11.2.0 CC=gcc-11.2.0 CMAKE=cmake-3.18.1 make all TARGETS=spatz_v2 INSTALLDIR=${GVSOC_SPATZ_INSTALL_DIR}
+# 
+# gvsoc_spatz: ${GVSOC_SPATZ_INSTALL_DIR}
+
 ${TOOLCHAIN_DIR}/gvsoc:
 	cd ${TOOLCHAIN_DIR} && \
 	git clone https://github.com/gvsoc/gvsoc.git && \
 	cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \
 	git submodule update --init --recursive && \
-	pip install -r core/requirements.txt && pip install -r gapy/requirements.txt
+	pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil &&\
+	cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch
 
 ${GVSOC_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc
 	cd ${TOOLCHAIN_DIR}/gvsoc && \
-	 XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera" build INSTALLDIR=${GVSOC_INSTALL_DIR}
+	XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera spatz_v2" build INSTALLDIR=${GVSOC_INSTALL_DIR}
 
 gvsoc: ${GVSOC_INSTALL_DIR}
 
@@ -504,7 +550,7 @@ ${QEMU_INSTALL_DIR}: ${TOOLCHAIN_DIR}/qemu
 	cd ${TOOLCHAIN_DIR}/qemu/ && \
 	mkdir -p build && cd build && \
 	../configure --target-list=arm-softmmu,arm-linux-user,riscv32-softmmu,riscv32-linux-user \
-	--prefix=${QEMU_INSTALL_DIR} && \
+	--prefix=${QEMU_INSTALL_DIR} --disable-werror && \
 	make -j && \
 	make install
 
@@ -543,7 +589,7 @@ ${TOOLCHAIN_DIR}/minimalloc:
 	cd ${TOOLCHAIN_DIR} && \
 	git clone --recursive https://github.com/google/minimalloc.git && \
 	cd ${TOOLCHAIN_DIR}/minimalloc && git checkout ${MINIMALLOC_COMMMIT_HASH} && \
-	cmake -DCMAKE_BUILD_TYPE=Release && make -j && \
+	cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=17 && make -j && \
 	mkdir -p ${MINIMALLOC_INSTALL_DIR} && cp minimalloc ${MINIMALLOC_INSTALL_DIR}
 
 ${CHIMERA_SDK_INSTALL_DIR}:
diff --git a/TargetLibraries/Spatz/CMakeLists.txt b/TargetLibraries/Spatz/CMakeLists.txt
new file mode 100644
index 0000000000..ef0fd63ab8
--- /dev/null
+++ b/TargetLibraries/Spatz/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB_RECURSE SOURCES
+	"src/**"
+)
+
+list(APPEND SOURCES
+	${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c
+)
+
+include(cmake/spatz-runtime-precompiled.cmake)
+
+add_deeploy_library(deeployspatz STATIC ${SOURCES})
+target_include_directories(deeployspatz
+	PUBLIC
+	${CMAKE_CURRENT_LIST_DIR}/inc
+)
+target_include_directories(deeployspatz PRIVATE ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel)
+target_include_directories(deeployspatz SYSTEM PUBLIC ${SPATZ_RUNTIME_INCLUDE})
+target_link_libraries(deeployspatz INTERFACE spatz-runtime)
diff --git a/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake
new file mode 100644
index 0000000000..42e15e1b31
--- /dev/null
+++ b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake
@@ -0,0 +1,27 @@
+
+set(SPATZ_RUNTIME_BASE_INCLUDE
+	${SPATZ_HOME}/sw/snRuntime/include
+	${SPATZ_HOME}/sw/snRuntime/vendor
+	${SPATZ_HOME}/sw/toolchain/riscv-opcodes
+)
+
+set(SPATZ_CLUSTER_LINK_INCLUDE
+	${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime
+)
+
+set(SPATZ_LINKER_SCRIPT ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime/common.ld)
+# set(SPATZ_LINKER_SCRIPT ${SNITCH_RUNTIME_HOME}/base.ld)
+if(NOT EXISTS ${SPATZ_LINKER_SCRIPT})
+	message(FATAL_ERROR "Spatz linker script not found: ${SPATZ_LINKER_SCRIPT}")
+endif()
+
+set(SPATZ_CLUSTER_LINK_OPTIONS
+	-Wl,--gc-sections
+	-T ${SPATZ_LINKER_SCRIPT}
+)
+
+set(SPATZ_RUNTIME_INCLUDE ${SPATZ_RUNTIME_BASE_INCLUDE})
+
+add_library(spatz-runtime INTERFACE)
+target_link_directories(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_INCLUDE})
+target_link_libraries(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_OPTIONS} libsnRuntime-cluster.a)
diff --git a/TargetLibraries/Spatz/inc/DeeploySpatzMath.h b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h
new file mode 100644
index 0000000000..0157d8d966
--- /dev/null
+++ b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_SPATZ_MATH_HEADER_
+#define __DEEPLOY_SPATZ_MATH_HEADER_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "DeeployBasicMath.h"
+
+void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
+								 const float32_t *__restrict__ pSrcB,
+								 float32_t *__restrict__ pDstY, uint32_t M,
+								 uint32_t N, uint32_t O);
+
+void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size,
+                       int32_t last_dim_length);
+
+
+void compute_topk_min_heap( uint32_t k, uint32_t n, float32_t *data_in, float32_t *heap_values, int32_t *heap_indices);
+
+
+#define BEGIN_SINGLE_CORE if (core_id == 0) {
+#define END_SINGLE_CORE }
+#define SINGLE_CORE if (core_id == 0)
+
+#endif // __DEEPLOY_SPATZ_MATH_HEADER_
diff --git a/TargetLibraries/Spatz/inc/Util.h b/TargetLibraries/Spatz/inc/Util.h
new file mode 100644
index 0000000000..893d687fa1
--- /dev/null
+++ b/TargetLibraries/Spatz/inc/Util.h
@@ -0,0 +1,9 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SPATZ_UTIL_H
+#define SPATZ_UTIL_H
+
+void spatz_util_dummy(void);
+
+#endif // SPATZ_UTIL_H
diff --git a/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c
new file mode 100644
index 0000000000..1caf2c3004
--- /dev/null
+++ b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c
@@ -0,0 +1,154 @@
+#include "DeeploySpatzMath.h"
+#include <snrt.h>
+#include <stdlib.h>
+
+// functions defined in ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c
+// they calculate matrix matrix multiplication
+void matmul_2xVL(float *c, const float *a, const float *b,
+                 const unsigned int m_start, const unsigned int m_end,
+                 const unsigned int N, const unsigned int P,
+                 const unsigned int p_start, const unsigned int p_end);
+
+
+void matmul_4xVL(float *c, const float *a, const float *b,
+                 const unsigned int m_start, const unsigned int m_end,
+                 const unsigned int N, const unsigned int P,
+                 const unsigned int p_start, const unsigned int p_end);
+
+
+void matmul_8xVL(float *c, const float *a, const float *b,
+                 const unsigned int m_start, const unsigned int m_end,
+                 const unsigned int N, const unsigned int P,
+                 const unsigned int p_start, const unsigned int p_end);
+
+
+// calculates vector matrix multiplication with Gustavson algorithm
+void gemv_v32b_m4(float *a, float *b, float *c, int N, int local_P, int total_P) {
+  unsigned int p = 0;
+  // Loop only up to the local number of columns assigned to this core
+  while (p < (unsigned int)local_P) {
+    size_t gvl;
+    asm volatile("vsetvli %0, %1, e32, m8, ta, ma"
+                 : "=r"(gvl) : "r"((unsigned int)local_P - p));
+
+    const float *b_ = b + p;                        
+    asm volatile("vle32.v v8, (%0)" ::"r"(b_));
+    asm volatile("vfmul.vf v16, v8, %0" ::"f"(a[0]));   
+
+    for (int row = 1; row < N; row++) {
+      // CRITICAL: Must skip the TOTAL width of the matrix to reach the next row
+      b_ += total_P;
+      asm volatile("vle32.v v8, (%0)" ::"r"(b_));
+      asm volatile("vfmacc.vf v16, %0, v8" ::"f"(a[row]));
+    }
+
+    asm volatile("vse32.v v16, (%0)" ::"r"(c + p));
+    p += gvl;
+  }
+}
+
+// Vector-matrix product, one output at a time: c[col] = sum_row a[row] * b[row * total_P + col] for col < local_P.
+void gemv_col_reduction(const float *a, const float *b, float *c, int N, int local_P, int total_P) {
+  // b is indexed as b[row * total_P + col], so the byte stride between rows must use the TOTAL width of matrix B
+  ptrdiff_t b_stride = total_P * sizeof(float);
+
+  // Loop only through the columns assigned to this core
+  for (int col = 0; col < local_P; col++) {
+    unsigned int row = 0;
+
+    // Clear vector register v0 (takes v0-v7) at vl = VLMAX (rs1 = zero, rd != zero) to accumulate partial products
+    size_t init_gvl;
+    asm volatile("vsetvli %0, zero, e32, m8, ta, ma" : "=r"(init_gvl));
+    asm volatile("vmv.v.i v0, 0");
+
+    // Walk the N rows in chunks; tail-undisturbed (tu) keeps accumulator lanes >= gvl intact on a short final chunk
+    while (row < (unsigned int)N) {
+      size_t gvl;
+      asm volatile("vsetvli %0, %1, e32, m8, tu, ma"
+                   : "=r"(gvl) : "r"((unsigned int)N - row));
+
+      // Pointer uses total_P to correctly jump down the rows
+      const float *b_ptr = b + (row * total_P) + col;
+      const float *a_ptr = a + row;
+
+      // Load strided elements from B into v8 (takes v8-v15)
+      asm volatile("vlse32.v v8, (%0), %1" ::"r"(b_ptr), "r"(b_stride));
+
+      // Load contiguous elements from A into v16 (takes v16-v23)
+      asm volatile("vle32.v v16, (%0)" ::"r"(a_ptr));
+
+      // Multiply and accumulate: v0 = v0 + (v8 * v16)
+      asm volatile("vfmacc.vv v0, v8, v16");
+
+      row += gvl;
+    }
+
+    // --- Reduction Phase --- (needs vl = VLMAX; "vsetvli zero, zero" would keep the last chunk's vl)
+    asm volatile("vsetvli %0, zero, e32, m8, ta, ma" : "=r"(init_gvl));
+    asm volatile("vmv.v.i v24, 0");
+
+    // Reduce the accumulated vector v0 into the first element of v24
+    asm volatile("vfredosum.vs v24, v0, v24");
+
+    // Store only the single scalar result (1 element) into c[col]
+    size_t one = 1;
+    asm volatile("vsetvli zero, %0, e32, m1, ta, ma" :: "r"(one));
+    asm volatile("vse32.v v24, (%0)" ::"r"(c + col));
+  }
+}
+
+void matmul_vanila(float *a, float *b, float *c, int M, int N, int P) {
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < P; j++) {
+      float sum = 0.0f;
+      for (int k = 0; k < N; k++) {
+        sum += a[i * N + k] * b[k * P + j];
+      }
+      
+      c[i * P + j] = sum;
+    }
+  }
+}
+
+
+
+void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ a,
+                                 const float32_t *__restrict__ b,
+                                 float32_t *__restrict__ c, uint32_t M,
+                                 uint32_t N, uint32_t P) {
+  // const unsigned int num_cores = snrt_cluster_core_num(); = 2 for spatz
+  const unsigned int cid = snrt_cluster_core_idx();
+
+  if (M == 1) {
+    // TODO: make this more specific; the threshold probably needs to be N > 5*P or some other constant
+    int cols_core0 = P / 2;
+    int cols_core1 = P - cols_core0; // Safely gets the remainder if P is odd
+    if (N>4*P){
+      if (cid == 0) {
+          gemv_col_reduction(a, b, c, N, cols_core0, P);
+      } else {
+          float *b_offset = b + cols_core0; float *c_offset = c + cols_core0;
+          gemv_col_reduction(a, b_offset, c_offset, N, cols_core1, P);
+      }
+    } else {
+      if (cid == 0) {
+        gemv_v32b_m4(a, b, c, N, cols_core0, P);
+      } else {
+        float *b_offset = b + cols_core0; float *c_offset = c + cols_core0;
+        gemv_v32b_m4(a, b_offset, c_offset, N, cols_core1, P);
+      }
+    }
+  } else {
+    unsigned int p_start, p_end;
+    if (cid == 0){ p_start = 0; p_end = (P/2);
+    } else { p_start = (P/2); p_end = P; }
+
+    if (M <= 4) {
+      matmul_2xVL(c, a, b, 0, M, N, P, p_start, p_end);
+    } else if (M <= 8) {
+      matmul_4xVL(c, a, b, 0, M, N, P, p_start, p_end);
+    } else {
+      matmul_8xVL(c, a, b, 0, M, N, P, p_start, p_end);
+    }
+  }
+}
diff --git a/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c
new file mode 100644
index 0000000000..bf4d24b221
--- /dev/null
+++ b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c
@@ -0,0 +1,163 @@
+#include "DeeployBasicMath.h"
+#include <math.h>
+#include <snrt.h>
+float32_t myexpf(float32_t x){
+  const float32_t inv_ln2 = 1.4426950409f;
+  const float32_t ln2 = 0.6931471806f;
+
+  // Range reduction: x = k * ln(2) + r, with r kept small so the polynomial is accurate.
+  float32_t scaled = x * inv_ln2;
+  int32_t k = (int32_t)(scaled + (scaled >= 0.0f ? 0.5f : -0.5f));
+  float32_t r = x - ((float32_t)k * ln2);
+
+  float32_t r2 = r * r;
+  float32_t r3 = r2 * r;
+  float32_t r4 = r3 * r;
+  float32_t r5 = r4 * r;
+  float32_t r6 = r5 * r;
+  float32_t r7 = r6 * r;
+
+  float32_t poly = 1.0f + r + (r2 * 0.5f) + (r3 * 0.1666666667f) + (r4 * 0.0416666667f) + (r5 * 0.0083333333f) + (r6 * 0.0013888889f) + (r7 * 0.0001984127f);
+
+  return ldexpf(poly, k);
+}
+
+// Type-punning union to safely manipulate IEEE 754 float bits without breaking strict aliasing rules
+union float_bits {
+  float f;
+  uint32_t i;
+};
+
+float expf_nodiv_reduced(float x) {
+  // Mathematical constants
+  const float LN2_HI = 0.69314575195f;     // High bits of ln(2)
+  const float LN2_LO = 1.4286068203e-6f;   // Low bits of ln(2) for quasi-double precision reduction
+  const float INV_LN2 = 1.4426950408f;     // log2(e) = 1/ln(2)
+
+  // Bound limits to prevent float overflow/underflow
+  if (x > 88.722839f)  x = 88.722839f;
+  if (x < -87.336544f) return 0.0f;
+
+  // 1. Argument Reduction: Find integer k closest to x / ln(2)
+  // We cast to integer to perform a fast round-to-nearest operation
+  // int32_t k = (int32_t)(x * INV_LN2 + (x >= 0.0f ? 0.5f : -0.5f));
+  float sign_offset = __builtin_copysignf(0.5f, x);
+  int32_t k = (int32_t)(x * INV_LN2 + sign_offset);
+
+  // Compute residual r = x - k * ln(2) using Cody-Waite reduction to minimize loss of significance
+  float r = x - ((float)k * LN2_HI) - ((float)k * LN2_LO);
+
+  // 2. Taylor Polynomial Approximation of e^r (Horner's Method)
+  // Range of r is strictly bounded within [-0.34657, 0.34657]
+  // Coefficients are 1, 1, 1/2, 1/6, 1/24, 1/120
+  float poly = 1.0f + r * (1.0f + r * (0.5f + r * (0.166666671f + r * (0.041666664f + r * 0.008333333f))));
+
+  // 3. Reconstruction: Generate 2^k via IEEE 754 bit-manipulation
+  // The exponent field is bits [30:23] with a bias of 127
+  int32_t biased_exp = k + 127;
+  
+  union float_bits two_to_k;
+  two_to_k.i = ((uint32_t)biased_exp << 23); // Shift biased integer into the float exponent slot
+
+  // e^x = e^r * 2^k
+  return poly * two_to_k.f;
+}
+
+// Approximate reciprocal 1/x that does not use fdiv.s: bit-level initial guess refined by Newton-Raphson.
+float32_t myinv(float32_t x){
+    // Reinterpret the float bits through the union (pointer casts would break strict aliasing)
+    union float_bits seed = {.f = x};
+    seed.i = 0x7EEEEEEE - seed.i;
+    float y = seed.f;
+    // Newton-Raphson steps (Multiplication only!)
+    y = y * (2.0f - x * y);
+    y = y * (2.0f - x * y);
+    y = y * (2.0f - x * y);
+
+    return y;
+}
+
+void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size, int32_t last_dim_length) {
+  const unsigned int cid = snrt_cluster_core_idx();
+  // single vector: the two cores split the work along the vector length
+  if (size == last_dim_length){
+    static float32_t maxval[1];
+    if (cid==0){
+      float32_t max_val = -inf;
+
+      for (int i = 0; i < last_dim_length; i++) {
+        if (input[i] > max_val) { max_val = input[i]; }
+      }
+      maxval[0] = max_val;
+    }
+
+    snrt_cluster_hw_barrier();
+
+    static float32_t partial_sum[2];
+    float32_t exp_val = 0.0f;
+
+    if (cid==0){
+      float32_t sum_core0 = 0.0f;
+      for (int i = 0; i < last_dim_length/2; i++) {
+        exp_val = expf_nodiv_reduced(input[i] - maxval[0]);
+        output[i] = exp_val;
+        sum_core0 += exp_val;
+      }
+      partial_sum[0] = sum_core0;
+    } else {
+      float32_t sum_core1 = 0.0f;
+      for (int i = last_dim_length/2; i < last_dim_length; i++) {
+        exp_val = expf_nodiv_reduced(input[i] - maxval[0]);
+        output[i] = exp_val;
+        sum_core1 += exp_val;
+      }
+      partial_sum[1] = sum_core1;
+    }
+
+    snrt_cluster_hw_barrier();
+    float32_t one_over_sum= 0.0f;
+
+    if (cid == 0){ one_over_sum = myinv(partial_sum[0] + partial_sum[1]); }
+    snrt_cluster_hw_barrier();
+    if (cid == 0){ for (int i = 0; i < last_dim_length; i++) { output[i] *= one_over_sum; } }
+    snrt_cluster_hw_barrier();
+    return;
+
+  } else {
+    // divide the workload between the cores in batches
+    int32_t batch_size = size / last_dim_length;
+    unsigned int items_per_core = (batch_size + 1) / 2;
+
+    unsigned int b_start, b_end;
+
+    if (cid == 0) {
+        b_start = 0;
+        b_end   = items_per_core;
+    } else {
+        b_start = items_per_core;
+        // Core 1 always ends at the total batch size
+        b_end   = batch_size;
+    }
+    for (int b = b_start; b < b_end; b++) {
+      float32_t max_val = -inf;
+      float sum = 0.0f;
+
+      for (int i = 0; i < last_dim_length; i++) {
+        if (input[b * last_dim_length + i] > max_val) {
+          max_val = input[b * last_dim_length + i];
+        }
+      }
+
+      for (int i = 0; i < last_dim_length; i++) {
+        float32_t exp_val = input[b * last_dim_length + i] - max_val;
+        output[b * last_dim_length + i] = expf_nodiv_reduced(exp_val);
+        sum += output[b * last_dim_length + i];
+      }
+
+      float32_t sum_1 = myinv(sum);
+      for (int i = 0; i < last_dim_length; i++) {
+        output[b * last_dim_length + i] = output[b * last_dim_length + i] * sum_1;
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/Spatz/src/TopK_fp32_int32_spatz.c b/TargetLibraries/Spatz/src/TopK_fp32_int32_spatz.c
new file mode 100644
index 0000000000..3802f330b3
--- /dev/null
+++ b/TargetLibraries/Spatz/src/TopK_fp32_int32_spatz.c
@@ -0,0 +1,126 @@
+#include "DeeployBasicMath.h"
+#include <math.h>
+#include <float.h>
+
+/*
+ * Min-heap helper. The heap lives in two parallel arrays (values and their
+ * source indices); the smallest value sits at position 0 and a node is never
+ * meant to be smaller than its parent. Children of node i are 2*i+1 and 2*i+2.
+ */
+static inline __attribute__((always_inline)) void reorder_heap(uint32_t idx, uint32_t size, float32_t *heap_values, int32_t *heap_indices){
+  // Sift the entry at idx down until neither child is smaller than it.
+  uint32_t child = 2 * idx + 1;
+  while (child < size) {
+    // Select the smaller child; the left one is kept when both compare equal.
+    if (child + 1 < size && heap_values[child + 1] < heap_values[child]) {
+      child = child + 1;
+    }
+    if (!(heap_values[child] < heap_values[idx])) {
+      return; // nothing below idx is smaller: done
+    }
+
+    // Exchange parent and child in both arrays so values and indices stay paired.
+    const float32_t parent_val = heap_values[idx];
+    const int32_t parent_idx = heap_indices[idx];
+    heap_values[idx] = heap_values[child];
+    heap_indices[idx] = heap_indices[child];
+    heap_values[child] = parent_val;
+    heap_indices[child] = parent_idx;
+
+    idx = child;
+    child = 2 * idx + 1;
+  }
+}
+
+// Selects the k largest of the n values in data_in using a size-k min-heap.
+// On return heap_values holds them in descending order and heap_indices the matching
+// positions in data_in; both arrays must have room for k entries (they double as scratch).
+// NOTE(review): k <= n is assumed and not checked -- confirm against callers.
+void compute_topk_min_heap( uint32_t k, uint32_t n, float32_t *data_in, float32_t *heap_values, int32_t *heap_indices) {
+  // Nothing to select: without this guard the code below reads heap_values[0] and k-1 wraps around.
+  if (k == 0) { return; }
+  // Seed the heap with the first k elements
+  for (uint32_t i = 0; i < k; ++i) { heap_values[i] = data_in[i]; heap_indices[i] = (int32_t)i; }
+
+  // Build the min-heap bottom-up: sift down every internal node, from k/2-1 up to the root
+  for (int32_t root = (int32_t)k / 2 - 1; root >= 0; --root) {
+    reorder_heap(root, k, heap_values, heap_indices);
+  }
+
+  // Scan the rest; the root is the smallest kept value, so only strictly larger inputs replace it
+  for (uint32_t i = k; i < n; ++i) {
+    float32_t value = data_in[i];
+    if (value > heap_values[0]) {
+      heap_values[0] = value;
+      heap_indices[0] = (int32_t)i;
+      reorder_heap(0, k, heap_values, heap_indices);
+    }
+  }
+
+  // In-place heap sort: move the current minimum behind the shrinking heap -> descending order
+  for (uint32_t i = k-1; i > 0; i--) {
+    float32_t root_val = heap_values[0];
+    int32_t root_idx = heap_indices[0]; // integer temp: a float would round indices above 2^24
+    heap_values[0] = heap_values[i]; heap_indices[0] = heap_indices[i];
+    heap_values[i] = root_val; heap_indices[i] = root_idx;
+    reorder_heap(0, i, heap_values, heap_indices); // restore the heap over the first i entries
+  }
+}
+
+// Finds the k largest of the n values in data_in by k successive maximum searches; the values go
+// to data_out and their positions to indices_out, largest first. data_in is modified: every
+// selected element is overwritten with -FLT_MAX. If no remaining element is greater than -FLT_MAX
+// the slot receives value -FLT_MAX and index -1.
+void compute_topk_vector_instructions(uint32_t k, uint32_t n, float32_t *data_in, float32_t *data_out, int32_t *indices_out) {
+    for (uint32_t i = 0; i < k; i++) {
+        float32_t global_max = -FLT_MAX;
+        int32_t global_max_idx = -1;
+
+        uint32_t avl = n;   // elements still to be scanned in this pass
+        uint32_t vl;        // elements handled by the current vector iteration
+        float32_t *ptr = data_in;
+        uint32_t current_idx_offset = 0;
+        // --- Pass 1: Find the maximum value and its index in the current array ---
+        while (avl > 0) {
+            asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl));
+
+            float32_t block_max_scalar = -FLT_MAX;
+            // Load one strip into v24..v27 (LMUL=4) and reduce it to its maximum in v0[0].
+            // The asm reads *ptr, which C code below overwrites between passes, so it carries a
+            // "memory" clobber; all four registers of the v24 group are declared as clobbered.
+            asm volatile (
+                "vle32.v v24, (%1)\n\t"
+                "vfmv.s.f v0, %2\n\t"              // Init scalar reduction register with -FLT_MAX
+                "vfredmax.vs v0, v24, v0\n\t"       // Find max in this vector block
+                "vfmv.f.s %0, v0\n\t"               // Move block max back to C variable
+                : "=f"(block_max_scalar)
+                : "r"(ptr), "f"(-FLT_MAX)
+                : "v0", "v24", "v25", "v26", "v27", "memory"
+            );
+
+            // Only when this strip beats the running maximum do we pay for a scalar sweep
+            // to locate the element; the first occurrence wins because the compare is strict.
+            if (block_max_scalar > global_max) {
+                for (uint32_t j = 0; j < vl; j++) {
+                    if (ptr[j] > global_max) {
+                        global_max = ptr[j];
+                        global_max_idx = current_idx_offset + j;
+                    }
+                }
+            }
+
+            ptr += vl;
+            current_idx_offset += vl;
+            avl -= vl;
+        }
+
+        // Save the found top element metadata to output arrays
+        data_out[i] = global_max;
+        indices_out[i] = global_max_idx;
+
+        // --- Pass 2: Mask out the found maximum to prevent re-discovery ---
+        if (global_max_idx != -1) {
+            data_in[global_max_idx] = -FLT_MAX;
+        }
+    }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Spatz/src/Util.c b/TargetLibraries/Spatz/src/Util.c
new file mode 100644
index 0000000000..9c30c11f49
--- /dev/null
+++ b/TargetLibraries/Spatz/src/Util.c
@@ -0,0 +1,5 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+// SPDX-License-Identifier: Apache-2.0
+
+// Minimal stub for Spatz runtime linkage: takes no arguments, returns nothing and has an empty body.
+void spatz_util_dummy(void) {} // NOTE(review): presumably only here so this file defines a symbol -- confirm
diff --git a/cmake/simulation.cmake b/cmake/simulation.cmake
index 55525feedd..983dc0e4ee 100644
--- a/cmake/simulation.cmake
+++ b/cmake/simulation.cmake
@@ -102,3 +102,19 @@ macro(add_gvsoc_emulation name target)
 		USES_TERMINAL
 	)
 endmacro()
+
+# add_spatz_gvsoc_emulation(<name> <target>): adds target gvsoc_<name>, which runs binary <name> with gvrun for GVSoC target <target>.
+macro(add_spatz_gvsoc_emulation name target)
+	set(GVSOC_WORKDIR ${CMAKE_BINARY_DIR}/gvsoc_workdir)
+	file(MAKE_DIRECTORY ${GVSOC_WORKDIR})
+	set(GVSOC_BINARY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}")
+	add_custom_target(gvsoc_${name}
+		DEPENDS ${name}
+		WORKING_DIRECTORY ${GVSOC_WORKDIR}
+		COMMAND bash -c "${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/*.bin ${GVSOC_WORKDIR}/ || true" # via bash: VERBATIM would pass the glob and "|| true" literally
+		COMMAND bash -c "${GVSOC_INSTALL_DIR}/bin/gvrun --target ${target} --param chip/soc/binary=${GVSOC_BINARY} run"
+		COMMENT "Simulating deeploytest ${name} with gvsoc for the target ${target}"
+		USES_TERMINAL
+		VERBATIM
+	)
+endmacro()
diff --git a/cmake/spatz/spatz.cmake b/cmake/spatz/spatz.cmake
new file mode 100644
index 0000000000..b715f625c9
--- /dev/null
+++ b/cmake/spatz/spatz.cmake
@@ -0,0 +1,30 @@
+# Spatz platform configuration: preprocessor define, architecture tag and thread count.
+add_compile_definitions(DEEPLOY_SPATZ_PLATFORM)
+
+set(DEEPLOY_ARCH SPATZ)
+
+# num_threads takes whatever NUM_CORES holds at include time.
+set(num_threads ${NUM_CORES})
+
+# add_spatz_vsim_simulation(<name>): adds target vsim_<name>, which runs binary <name> through
+# bin/spatz_cluster.vsim inside ${SPATZ_HOME}/hw/system/spatz_cluster; a failing run is ignored.
+macro(add_spatz_vsim_simulation name)
+	add_custom_target(vsim_${name}
+		WORKING_DIRECTORY ${SPATZ_HOME}/hw/system/spatz_cluster
+		DEPENDS ${name}
+		COMMAND bash -c "${QUESTA} bin/spatz_cluster.vsim ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name} || true" # via bash so "|| true" stays a shell operator under VERBATIM
+		COMMENT "Simulating deeploytest with vsim (Spatz cluster)"
+		USES_TERMINAL
+		VERBATIM
+	)
+endmacro()
+
+# -ffast-math for every compilation in this directory scope and below.
+add_compile_options(-ffast-math)
+
+# The same flag is handed to the link step, which also discards unreferenced sections.
+add_link_options(
+	-ffast-math
+	-Wl,--gc-sections
+)
+
diff --git a/cmake/spatz/toolchain_llvm.cmake b/cmake/spatz/toolchain_llvm.cmake
new file mode 100644
index 0000000000..89f10f1954
--- /dev/null
+++ b/cmake/spatz/toolchain_llvm.cmake
@@ -0,0 +1,74 @@
+
+# Spatz LLVM toolchain: tool locations plus default compile/link flags. Needs SPATZ_HOME.
+set(CMAKE_SYSTEM_NAME Generic)
+
+# This file is also processed where the caller's SPATZ_HOME variable is not visible
+# (e.g. CMake's internal try_compile projects), so fall back to the environment.
+if(NOT DEFINED SPATZ_HOME)
+    set(SPATZ_HOME $ENV{SPATZ_HOME})
+endif()
+
+set(SPATZ_GCC_TOOLCHAIN /usr/pack/riscv-1.0-kgf/spatz-gcc-7.1.1 CACHE PATH "GCC installation passed to clang via --gcc-toolchain")
+
+# Forward both settings to try_compile projects so compiler checks see the same paths.
+list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES SPATZ_HOME SPATZ_GCC_TOOLCHAIN)
+
+# Crucial: Point CMake to the specialized Clang toolchain instead of system cc
+set(SPATZ_TOOLCHAIN_DIR ${SPATZ_HOME}/sw/toolchain/llvm-project/build/bin)
+set(CMAKE_C_COMPILER   ${SPATZ_TOOLCHAIN_DIR}/clang)
+set(CMAKE_CXX_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang++)
+set(CMAKE_ASM_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang)
+set(CMAKE_OBJCOPY ${SPATZ_TOOLCHAIN_DIR}/llvm-objcopy)
+set(CMAKE_OBJDUMP ${SPATZ_TOOLCHAIN_DIR}/llvm-objdump)
+set(CMAKE_LINKER ${SPATZ_TOOLCHAIN_DIR}/ld.lld)
+set(CMAKE_EXECUTABLE_SUFFIX ".elf")
+
+set(ISA rv32imafdvzfh_xdma)
+
+# Compile options based on user's manual compilation commands.
+# Two-token options use the SHELL: prefix so option de-duplication cannot split them.
+add_compile_options(
+    "SHELL:-target riscv32-unknown-elf"
+    -mcpu=snitch
+    -mcmodel=small
+    -ffast-math
+    -fno-builtin-printf
+    -fno-common
+    -falign-loops=16
+    -ffunction-sections
+    -Wextra
+    # LLVM specific flags from user command
+    "SHELL:-mllvm -misched-topdown"
+    -menable-experimental-extensions
+    -mno-relax
+    -march=${ISA}
+    -mabi=ilp32d
+    # Newlib headers come from the GNU toolchain sources inside the Spatz checkout. Alternative:
+    # $ENV{GCC_INSTALL_DIR}/riscv32-unknown-elf/include (GCC_INSTALL_DIR is set by util/iis-env.sh).
+    -isystem ${SPATZ_HOME}/sw/toolchain/riscv-gnu-toolchain/riscv-newlib/newlib/libc/include
+    # Optimization and debug
+    -O3
+    -g
+)
+
+# Link options matching user command
+add_link_options(
+    "SHELL:-target riscv32-unknown-elf"
+    -mcpu=snitch
+    -march=${ISA}
+    -mabi=ilp32d
+    -mcmodel=small
+    -fuse-ld=lld
+    -nostartfiles
+    -ffast-math
+    -fno-common
+    -fno-builtin-printf
+    -static
+    -Wl,-z,norelro
+    -Wl,--gc-sections
+    -Wl,--no-relax
+    --gcc-toolchain=${SPATZ_GCC_TOOLCHAIN}
+)
+
+# libsnRuntime-cluster.a is handled by our target_link_libraries(deeployspatz INTERFACE spatz-runtime)
+link_libraries(-lm -lgcc -lm -lgcc)
diff --git a/conda_enviroment_topk_attention.yml b/conda_enviroment_topk_attention.yml
new file mode 100644
index 0000000000..8121982e85
--- /dev/null
+++ b/conda_enviroment_topk_attention.yml
@@ -0,0 +1,81 @@
+name: ~/.conda/envs/deeploy_conda_venv
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2025.12.2=h06a4308_0
+  - ld_impl_linux-64=2.44=h9e0c5a2_3
+  - libexpat=2.7.5=h7354ed3_0
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc=15.2.0=h69a1729_7
+  - libgcc-ng=15.2.0=h166f726_7
+  - libgomp=15.2.0=h4751f2c_7
+  - libnsl=2.0.0=h5eee18b_0
+  - libstdcxx=15.2.0=h39759b7_7
+  - libstdcxx-ng=15.2.0=hc03a8fd_7
+  - libuuid=1.41.5=h5eee18b_0
+  - libxcb=1.17.0=h9b100fa_0
+  - libzlib=1.3.1=hb25bd0a_0
+  - ncurses=6.5=h7934f7d_0
+  - openssl=3.5.5=h1b28b03_0
+  - packaging=25.0=py311h06a4308_1
+  - pip=26.0.1=pyhc872135_0
+  - pthread-stubs=0.3=h0ce48e5_1
+  - python=3.11.15=h741d88c_0
+  - readline=8.3=hc2a1206_0
+  - setuptools=80.10.2=py311h06a4308_0
+  - sqlite=3.51.2=h3e8d24a_0
+  - tk=8.6.15=h54e0aa7_0
+  - tzdata=2026a=he532380_0
+  - wheel=0.46.3=py311h06a4308_0
+  - xorg-libx11=1.8.12=h9b100fa_1
+  - xorg-libxau=1.0.12=h9b100fa_0
+  - xorg-libxdmcp=1.1.5=h9b100fa_0
+  - xorg-xorgproto=2024.1=h5eee18b_1
+  - xz=5.8.2=h448239c_0
+  - zlib=1.3.1=hb25bd0a_0
+  - pip:
+      - absl-py==2.4.0
+      - argparse==1.4.0
+      - beautifulsoup4==4.14.3
+      - certifi==2026.2.25
+      - chardet==5.2.0
+      - charset-normalizer==3.4.6
+      - contourpy==1.3.3
+      - cycler==0.12.1
+      - deeploy-pulp==0.2.1
+      - flatbuffers==25.12.19
+      - fonttools==4.62.1
+      - idna==3.11
+      - imagesize==2.0.0
+      - iniconfig==2.3.0
+      - jinja2==3.1.6
+      - kiwisolver==1.5.0
+      - lz4==4.4.5
+      - markdown-it-py==4.0.0
+      - markupsafe==3.0.3
+      - mdurl==0.1.2
+      - mpmath==1.3.0
+      - narwhals==2.18.1
+      - pillow==12.1.1
+      - plotly==6.6.0
+      - pluggy==1.6.0
+      - psutil==7.2.2
+      - ptyprocess==0.7.0
+      - pyparsing==3.3.2
+      - pytest==9.0.2
+      - python-dateutil==2.9.0.post0
+      - pytz==2026.1.post1
+      - six==1.17.0
+      - snowballstemmer==3.0.1
+      - soupsieve==2.8.3
+      - sphinxcontrib-jsmath==1.0.1
+      - sympy==1.14.0
+      - tabulate==0.10.0
+      - toml==0.10.2
+      - typing-extensions==4.15.0
+      - urllib3==2.6.3
+      - wcwidth==0.6.0
+prefix: ~/.conda/envs/deeploy_conda_venv
diff --git a/toolchain/gvsoc.patch b/toolchain/gvsoc.patch
new file mode 100644
index 0000000000..22e65922a9
--- /dev/null
+++ b/toolchain/gvsoc.patch
@@ -0,0 +1,12 @@
+diff --git a/engine/src/launcher.cpp b/engine/src/launcher.cpp
+index f0b1b654..48c83592 100644
+--- a/engine/src/launcher.cpp
++++ b/engine/src/launcher.cpp
+@@ -21,6 +21,7 @@
+ 
+ #include <pthread.h>
+ #include <signal.h>
++#include <unistd.h>
+ #include <algorithm>
+ 
+ #include <stdexcept>