pulp-platform · lee2716 · Jan 31, 2026 · Jan 31, 2026 · Feb 3, 2026 · Feb 3, 2026
@@ -36,3 +36,11 @@ jobs:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
       pytest-marker: "kernels and singlebuffer and l2"
+
+  # Calls the _runner-snitch-tiled-sequential.yml reusable workflow on the
+  # runner and Docker image reported by the select-env job, passing
+  # "models and singlebuffer and l2" as its pytest-marker input.
+  snitch-models-tiled-singlebuffer-L2:
+    needs: select-env
+    uses: ./.github/workflows/_runner-snitch-tiled-sequential.yml
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-marker: "models and singlebuffer and l2"
@@ -36,3 +36,11 @@ jobs:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
       pytest-marker: "kernels"
+
+  # Calls the _runner-snitch.yml reusable workflow on the runner and Docker
+  # image reported by the select-env job, passing "models" as its
+  # pytest-marker input.
+  snitch-models:
+    needs: select-env
+    uses: ./.github/workflows/_runner-snitch.yml
+    with:
+      runner: ${{ needs.select-env.outputs.runner }}
+      docker-image: ${{ needs.select-env.outputs.image }}
+      pytest-marker: "models"
@@ -31,3 +31,5 @@ ignore:
   - "**/toolchain/"
   # Ignore all files in .git
   - "**/.git/**"
+  # Ignore all files in .venv
+  - "**/.venv/"
@@ -3121,7 +3121,7 @@ def _exportGraph(self, folderPath, fileName):
         # VJUNG: ONNX-Graphsurgeon needs tensors to be in their export types
         constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)]
         for tensor in constTensors:
-            if tensor.dtype != tensor.export_dtype:
+            if hasattr(tensor, 'export_dtype') and tensor.dtype != tensor.export_dtype:
                 tensor.values = tensor.values.astype(tensor.export_dtype)
 
         model = gs.export_onnx(self.graph)

@@ -300,6 +300,9 @@
 BasicConcatBindings = [
     NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
                 ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+                ConcatTemplate.referenceTemplate, BasicTransformer)
 ]
 
 BasicQuantBindings = [

@@ -703,6 +703,43 @@ def computeOps(self):
         return numPx * opsPerPx
 
 
+class RMSNormLayer(ONNXLayer):
+    """Layer support for the ONNX RMSNormalization operator.
+
+    Supported opset: 23
+
+    It is computed as follows:
+        - XSquared = Mul(X, X)
+        - XSquaredMean = ReduceMean<axes=normalized_axes>(XSquared)
+        - MeanSquareEpsilon = Add(XSquaredMean, epsilon)
+        - RMS = Sqrt(MeanSquareEpsilon)
+        - Normalized = Div(X, RMS)
+        - Y = Mul(Normalized, Scale)
+
+    For more details, this is the official ONNX documentation:
+    https://onnx.ai/onnx/operators/onnx__RMSNormalization.html#rmsnormalization-23
+    """
+
+    def __init__(self, maps: List[NodeMapper]):
+        super().__init__(maps)
+
+    def computeOps(self):
+        """Estimate the number of arithmetic operations of one RMSNorm evaluation.
+
+        Reads ``inputSize``, ``NormalizedAxesSize`` and ``scale`` from the
+        parser's operator representation. ``scale`` must support an
+        element-wise comparison followed by ``.all()`` (presumably a NumPy
+        array holding the constant scale tensor -- confirm against the parser).
+
+        Returns:
+            The estimated operation count as an integer.
+        """
+        inputSize = self.mapper.parser.operatorRepresentation['inputSize']
+        NormalizedAxesSize = self.mapper.parser.operatorRepresentation['NormalizedAxesSize']
+        scale = self.mapper.parser.operatorRepresentation['scale']
+
+        # ReduceMean collapses every group of NormalizedAxesSize elements into a
+        # single value, so the per-group steps (mean division, epsilon add,
+        # sqrt) run once per group, not once per element. The guard keeps an
+        # empty tensor from dividing by zero.
+        numGroups = inputSize // NormalizedAxesSize if NormalizedAxesSize else 0
+
+        # a. XSquared = Mul(X, X) => inputSize ops
+        # b. XSquaredMean = ReduceMean<axes=normalized_axes>(XSquared)
+        #    => inputSize ops (additions) + numGroups ops (divisions)
+        # c. MeanSquareEpsilon = Add(XSquaredMean, epsilon) => numGroups ops
+        # d. RMS = Sqrt(MeanSquareEpsilon) => numGroups ops
+        # e. Normalized = Div(X, RMS) => inputSize ops
+        # f. Y = Mul(Normalized, Scale) => 0 if all(Scale == 1.0), else inputSize ops
+        scale_ops = 0 if (scale == 1.0).all() else inputSize
+        ops = 3 * inputSize + 3 * numGroups + scale_ops
+        return ops
+
+
 class CeilLayer(SingleOperationPerElementLayer):
     pass
 

@@ -11,6 +11,37 @@
 from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer
 
 
+def compute_broadcast_strides(shape1, shape2, out_shape):
+    """Compute per-input element strides for ONNX/NumPy-style broadcasting.
+
+    Both input shapes are padded with 1s from the left up to the rank of
+    ``out_shape``. Each input then gets one stride per output dimension:
+    the row-major stride of the padded input, or 0 for a dimension the
+    input is broadcast over (size 1 in the input, size > 1 in the output).
+
+    Args:
+        shape1: Shape of the first input, as any sequence of ints.
+        shape2: Shape of the second input, as any sequence of ints.
+        out_shape: Shape of the broadcast output, as any sequence of ints.
+
+    Returns:
+        A tuple ``(strides1, strides2)`` of lists, each with
+        ``len(out_shape)`` entries.
+
+    Example:
+        shape1=[8,8,8], shape2=[8]
+        -> strides1=[64,8,1], strides2=[0,0,1]
+    """
+    # Normalise to plain lists: a tuple would make the padding concatenation
+    # below raise a TypeError, and a NumPy array would turn it into an
+    # element-wise addition.
+    shape1 = list(shape1)
+    shape2 = list(shape2)
+    out_shape = list(out_shape)
+    ndim = len(out_shape)
+
+    pad1 = [1] * (ndim - len(shape1)) + shape1
+    pad2 = [1] * (ndim - len(shape2)) + shape2
+
+    def _calc_strides(padded_shape, out_shape):
+        strides = []
+        stride = 1
+        # Walk from the innermost dimension outwards, accumulating the
+        # row-major stride of the (padded) input as we go.
+        for i in range(ndim - 1, -1, -1):
+            if padded_shape[i] == 1 and out_shape[i] > 1:
+                # Broadcast dimension: every output index reads the same element.
+                strides.insert(0, 0)
+            else:
+                strides.insert(0, stride)
+            stride *= padded_shape[i] if padded_shape[i] > 1 else 1
+        return strides
+
+    strides1 = _calc_strides(pad1, out_shape)
+    strides2 = _calc_strides(pad2, out_shape)
+    return strides1, strides2
+
+
 class UnaryElementWiseParser(NodeParser):
 
     def parseNode(self, node: gs.Node) -> bool:
@@ -72,6 +103,10 @@ def parseNode(self, node: gs.Node) -> (bool):
             self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'])
             self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D']))
 
+            stash_type = node.attrs.get('stash_type', 1)
+            if stash_type != 1:
+                raise ValueError(f"iRMSNorm: only stash_type=1 (FP32) is supported, got {stash_type}")
+
         return ret
 
     def parseNodeCtxt(self,
@@ -87,8 +122,19 @@ def parseNodeCtxt(self,
         for idx, outputNode in enumerate(node.outputs):
             self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name
 
-        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape)
-        self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1]
+        input_shape = list(ctxt.lookup(node.inputs[0].name).shape)
+
+        axis = node.attrs.get('axis', -1)
+        if axis < 0:
+            axis = len(input_shape) + axis
+
+        self.operatorRepresentation['inputSize'] = int(np.prod(input_shape))
+        self.operatorRepresentation['NormalizedAxesSize'] = int(np.prod(input_shape[axis:]))
+        self.operatorRepresentation['scale'] = node.inputs[1].values
+
+        # Keep old keys for C template compatibility
+        self.operatorRepresentation['size'] = int(np.prod(input_shape))
+        self.operatorRepresentation['lastDimLength'] = int(input_shape[-1])
 
         return ctxt, True
 
@@ -488,23 +534,37 @@ def __init__(self):
         super().__init__()
 
     def parseNode(self, node: gs.Node) -> bool:
-
         ret = all([len(node.inputs) == 2, len(node.outputs) == 1])
-
         return ret
 
     def parseNodeCtxt(self,
                       ctxt: NetworkContext,
                       node: gs.Node,
                       channels_first: bool = True) -> Tuple[NetworkContext, bool]:
-
         data_in_1 = ctxt.lookup(node.inputs[0].name)
         data_in_2 = ctxt.lookup(node.inputs[1].name)
         data_out = ctxt.lookup(node.outputs[0].name)
+
         self.operatorRepresentation['data_in_1'] = data_in_1.name
         self.operatorRepresentation['data_in_2'] = data_in_2.name
         self.operatorRepresentation['data_out'] = data_out.name
-        self.operatorRepresentation['size'] = np.prod(data_in_1.shape)
+        self.operatorRepresentation['size'] = np.prod(data_out.shape)
+
+        # Check if broadcasting is needed
+        shape1 = list(data_in_1.shape)
+        shape2 = list(data_in_2.shape)
+        out_shape = list(data_out.shape)
+
+        need_broadcast = (shape1 != out_shape) or (shape2 != out_shape)
+        self.operatorRepresentation['need_broadcast'] = need_broadcast
+
+        if need_broadcast:
+            strides1, strides2 = compute_broadcast_strides(shape1, shape2, out_shape)
+
+            self.operatorRepresentation['ndim'] = len(out_shape)
+            self.operatorRepresentation['strides1'] = strides1
+            self.operatorRepresentation['strides2'] = strides2
+            self.operatorRepresentation['out_shape'] = out_shape
 
         return ctxt, True
 
@@ -2097,15 +2157,15 @@ def parseNodeCtxt(self,
                       node: gs.Node,
                       channels_first: bool = True) -> Tuple[NetworkContext, bool]:
 
-        inputs = ["input1", "input2"]
-        outputs = ["output"]
+        inputs = ["A", "B"]
+        outputs = ["C"]
         for idx, inputNode in enumerate(node.inputs):
             if idx < len(inputs):
                 self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name
         for idx, outputNode in enumerate(node.outputs):
             self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name
 
-        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['input1']).shape)
+        self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['A']).shape)
 
         return ctxt, True
 

@@ -6,5 +6,5 @@
 
 referenceTemplate = NodeTemplate("""
 // Division (Name: ${nodeName}, Op: ${nodeOp})
-SINGLE_CORE Div_fp${input1_type.referencedType.typeWidth}_fp${input2_type.referencedType.typeWidth}_fp${output_type.referencedType.typeWidth}(${input1}, ${input2}, ${output}, ${size});
+SINGLE_CORE Div_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}(${A}, ${B}, ${C}, ${size});
 """)
@@ -34,6 +34,12 @@ def alignToContext(self, ctxt: NetworkContext,
         bufferIn.aliases.add(bufferOut.name)
         bufferOut.aliases.add(bufferIn.name)
 
+        # Tiling still reads the legacy single-valued `_alias` attribute
+        # (TilerExtension / MemoryScheduler). Set it here so platforms that
+        # rely on Reshape pointer-passthrough during tiling don't each need
+        # to carry the same workaround in a subclass.
+        bufferOut._alias = bufferIn.name
+
         return ctxt, operatorRepresentation, []
 
 

@@ -12,11 +12,26 @@
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
 from Deeploy.TilingExtension.TileConstraint import TileConstraint
 from Deeploy.TilingExtension.TilerModel import TilerModel
-from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
 
 
 class BOPTileConstraint(TileConstraint):
-    """Tile constraint class for binary operators, i.e. operators that use two input tensors of equal dimensions
+    """Tile constraint class for binary operators, i.e. operators that have exactly 2 inputs and 1 output.
+
+    When the second input is a scalar (total size 1), it is kept full-size and only
+    the first input and the output are tiled together. This supports ONNX
+    broadcasting in operators that have a corresponding scalar kernel.
+
+    Warning:
+        Broadcasting support is partial -- only the case of a fully-scalar
+        second input (np.prod(input2.shape) == 1) is handled. Other ONNX
+        broadcasting patterns -- input1 scalar, partial broadcasting such
+        as (N, 1) + (1, M), single-dim broadcasting such as (N, M, K) +
+        (N, 1, K), or rank-mismatched shapes such as (N, M) + (M,) --
+        fall through to the non-scalar branch, where the dim-equality
+        constraints will fail to satisfy. Operators that need full ONNX
+        broadcasting must use a different tile constraint.
     """
 
     dataIn1Name = 'data_in_1'  #: str: Name of the first input tensor as defined by the operator's parser
@@ -34,14 +49,27 @@ def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt:
             tilerModel.addTensorDimToModel(ctxt, bufferName)
 
         input1Shape = ctxt.lookup(inputBuffer1Name).shape
-
-        for dim in range(len(input1Shape)):
-            inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim)
-            inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim)
-            outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim)
-
-            tilerModel.addConstraint(inputDim1Var == inputDim2Var)
-            tilerModel.addConstraint(inputDim1Var == outputDimVar)
+        input2Shape = list(ctxt.lookup(inputBuffer2Name).shape)
+        is_scalar = (np.prod(input2Shape) == 1)
+
+        if is_scalar:
+            # Scalar broadcasting: tile input1 and output together; input2 stays full-size.
+            for dim in range(len(input1Shape)):
+                inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim)
+                outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim)
+                tilerModel.addConstraint(inputDim1Var == outputDimVar)
+            for dim in range(len(input2Shape)):
+                inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim)
+                tilerModel.addConstraint(inputDim2Var == input2Shape[dim])
+        else:
+            # Element-wise: all three tensors tiled identically.
+            for dim in range(len(input1Shape)):
+                inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim)
+                inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim)
+                outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim)
+
+                tilerModel.addConstraint(inputDim1Var == inputDim2Var)
+                tilerModel.addConstraint(inputDim1Var == outputDimVar)
 
         return tilerModel
 
@@ -64,11 +92,18 @@ def serializeTilingSolution(
             newSize = np.prod(cube.dims)
             replacements["size"].append(newSize)
 
+        input2Shape = list(ctxt.lookup(operatorRepresentation[cls.dataIn2Name]).shape)
+        is_scalar = (np.prod(input2Shape) == 1)
+
         inputLoadSchedule = []
         outputLoadSchedule = []
 
         for cube in outputCubes:
-            inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube})
+            if is_scalar:
+                in2Cube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape))
+                inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: in2Cube})
+            else:
+                inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube})
 
         for out in outputCubes:
             outputLoadSchedule.append({cls.dataOutName: out})

@@ -6,7 +6,7 @@
 
 import numpy as np
 
-from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.AbstractDataTypes import FloatImmediate, Pointer
 from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker
 from Deeploy.DeeployTypes import ConstantBuffer, OperatorRepresentation, VariableBuffer
 
@@ -409,7 +409,10 @@ def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[
 
     def _inferNumLevels(self, inputs: List[VariableBuffer],
                         operatorRepresentation: OperatorRepresentation) -> List[int]:
-        return [2**(4 * self.input_types[0].referencedType.typeWidth)]
+        input_type = self.input_types[0].referencedType
+        if issubclass(input_type, FloatImmediate):
+            return [2**(input_type.typeWidth)]
+        return [2**(4 * input_type.typeWidth)]
 
     def _inferSignedness(self, inputs: List[VariableBuffer],
                          operatorRepresentation: OperatorRepresentation) -> List[bool]:
@@ -610,3 +613,25 @@ def _inferNumLevels(self, inputs: List[VariableBuffer],
     def _inferSignedness(self, inputs: List[VariableBuffer],
                          operatorRepresentation: OperatorRepresentation) -> List[bool]:
         return [True]
+
+
+class RMSNormChecker(SignPropTypeChecker):
+    """Sign-propagating type checker for RMSNorm nodes."""
+
+    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
+        super().__init__(input_types, output_types)
+
+    def _inferNumLevels(self, inputs: List[VariableBuffer],
+                        operatorRepresentation: OperatorRepresentation) -> List[int]:
+        """Return ``2**typeWidth`` of the first input's referenced type for the single output."""
+        # RMSNorm (square, mean, sqrt, reciprocal, multiply) is treated as
+        # keeping roughly the precision of its input.
+        inputWidth = self.input_types[0].referencedType.typeWidth
+        return [2**inputWidth]
+
+    def _inferSignedness(self, inputs: List[VariableBuffer],
+                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
+        """Report the output as signed exactly when the first input is flagged signed."""
+        outputSigned = True if inputs[0]._signed else False
+        return [outputSigned]
+
+