diff --git a/CHANGELOG.md b/CHANGELOG.md index b79ce714b9..67fdce7e29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Fix for python error when using python 3.12.11 [#189]( https://github.com/pulp-platform/Deeploy/pull/189) - Add support for Operators for Generic target needed in MAGIA [#193]( https://github.com/pulp-platform/Deeploy/pull/193) - Fix GAP9 L3 Board Tests: readfs Flash Ordering and Duplicate Input Data [#196](https://github.com/pulp-platform/Deeploy/pull/196) +- Add support for Operators for Generic target needed in MAGIA (again) [#195]( https://github.com/pulp-platform/Deeploy/pull/195) ### Added - Add many missing docstrings @@ -29,6 +30,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows - Per-layer microbenchmarking on PULPOpen via `--profileMicrobenchmark`: new `PULPMicrobenchmark` code-transformation pass + `perf_utils.h` helpers report cycles, instructions, stalls and cache misses per layer in `RunNetwork` - Add support for the Generic target for the following operators [Ceil](https://onnx.ai/onnx/operators/onnx__Ceil.html), [Floor](https://onnx.ai/onnx/operators/onnx__Floor.html), [Clip](https://onnx.ai/onnx/operators/onnx__Clip.html), [Sub](https://onnx.ai/onnx/operators/onnx__Sub.html), [Exp](https://onnx.ai/onnx/operators/onnx__Exp.html), [Sigmoid](https://onnx.ai/onnx/operators/onnx__Sigmoid.html), [Swish](https://onnx.ai/onnx/operators/onnx__Swish.html), [HardSigmoid](https://onnx.ai/onnx/operators/onnx__HardSigmoid.html), [HardSwish](https://onnx.ai/onnx/operators/onnx__HardSwish.html), [InstanceNormalization](https://onnx.ai/onnx/operators/onnx__InstanceNormalization.html), [GroupNormalization](https://onnx.ai/onnx/operators/onnx__GroupNormalization.html), 
[AveragePool](https://onnx.ai/onnx/operators/onnx__AveragePool.html), [GlobalAveragePool](https://onnx.ai/onnx/operators/onnx__GlobalAveragePool.html), [GlobalMaxPool](https://onnx.ai/onnx/operators/onnx__GlobalMaxPool.html). +- Add support for the Generic target for the following operators: [Elu](https://onnx.ai/onnx/operators/onnx__Elu.html), [LeakyRelu](https://onnx.ai/onnx/operators/onnx__LeakyRelu.html), [Selu](https://onnx.ai/onnx/operators/onnx__Selu.html), [Scatter](https://onnx.ai/onnx/operators/onnx__Scatter.html), [ScatterElements](https://onnx.ai/onnx/operators/onnx__ScatterElements.html), [Col2Im](https://onnx.ai/onnx/operators/onnx__Col2Im.html), [Resize](https://onnx.ai/onnx/operators/onnx__Resize.html). ### Changed - Use by default `devel` container for GAP9 CI @@ -44,6 +46,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Aligned CLI commands across the project - Added @runwangdl as a code owner - Skip emitting duplicate `testInputVector` data for inputs placed in L3 (loaded at runtime from the readfs hex instead), reducing test binary size +- Allow ONNX operators with absent optional inputs (which ONNX represents as empty-name inputs). ### Fixed - Add missing `shell: bash` directive to CI cache generation steps to ensure correct shell execution @@ -57,6 +60,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Reduce RunNetwork stack usage by scoping per-layer variables with braces and moving tileIdxPtr allocation into per-layer execution blocks - Fix invalid escape sequence python error in DeeployTypes.py: appearing when using pytest to launch regressions - Fix GAP9 board tests with `--defaultMemLevel L3` reading garbage inputs: place all gapy `--flash-property` options before the positional subcommand and use `image flash run` so the readfs partition (input hex files) is flashed to the device +- Fix output buffer shape computation in the `ConvTranspose` layer. 
### Removed - `testDMA.py` was an old test; we now have `test_dmas.py` instead. diff --git a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py index c70628729b..46a4896c12 100644 --- a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py +++ b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py @@ -39,8 +39,8 @@ def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, operatorRepresentation: OperatorRepresentation) -> NetworkContext: ctxt = super().typeInferOutput(ctxt, node, operatorRepresentation) - inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs] - outputs = [ctxt.lookup(outputNode.name) for outputNode in node.outputs] + inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs if inputNode.name] + outputs = [ctxt.lookup(outputNode.name) for outputNode in node.outputs if outputNode.name] signProp = all([hasattr(_input, "_signed") and hasattr(_input, "nLevels") for _input in inputs]) diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 44abe85112..ea9aaff67f 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -1110,6 +1110,10 @@ def parseInputs(cls, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: for inputNode in node.inputs: data_in = inputNode.name + # Skip absent optional inputs (ONNX represents them as empty-name Variables) + if not data_in: + continue + # Hoist constant inputs if type(inputNode) == gs.ir.tensor.Constant and not ctxt.is_global(data_in): ctxt.hoistConstant(inputNode) @@ -1277,7 +1281,7 @@ def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, """ newCtxt = ctxt.copy() - inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs] + inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs if inputNode.name] outputNames = [node.name for node in node.outputs] outputTypes = self.output_types @@ -1348,7 +1352,7 @@ def annotateDict(self, ctxt: NetworkContext, 
node: gs.Node, operatorRepresentati The NodeParser's operatorRepresentation """ - env = [node.name for node in node.inputs + node.outputs] + env = [node.name for node in node.inputs + node.outputs if node.name] for key, value in operatorRepresentation.items(): # check if the referenced buffer is in the environment if isinstance(value, str) and value in env: @@ -1903,7 +1907,9 @@ def broadcast(self, ctxt: NetworkContext, default_channels_first: bool = True) - broadcast to the target shape """ - inputShapes = [ctxt.lookup(node.name).shape for node in self.node.inputs] + # Absent optional inputs are represented in ONNX as empty-name Variables; skip them. + validInputNodes = [node for node in self.node.inputs if node.name] + inputShapes = [ctxt.lookup(node.name).shape for node in validInputNodes] outputShapes = [ctxt.lookup(node.name).shape for node in self.node.outputs] if not "channels_first" in self.mapper.parser.operatorRepresentation: @@ -1914,7 +1920,7 @@ def broadcast(self, ctxt: NetworkContext, default_channels_first: bool = True) - newInputShapes, newOutputShapes = self.computeShapes(inputShapes, outputShapes, self.mapper.parser.operatorRepresentation, channels_first) - for node, newShape in zip(self.node.inputs + self.node.outputs, newInputShapes + newOutputShapes): + for node, newShape in zip(validInputNodes + self.node.outputs, newInputShapes + newOutputShapes): if ctxt.is_local(node.name): ctxt.localObjects[node.name].shape = newShape # Update shape of tensors in onnx graph @@ -2103,7 +2109,7 @@ def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: npType = self._broadcastToNpType(ctxt.localObjects[node.name]._type) if npType is not None: node.dtype = npType - elif ctxt.is_global(node.name): + elif ctxt.is_global(node.name) and hasattr(ctxt.globalObjects[node.name], '_type'): npType = self._broadcastToNpType(ctxt.globalObjects[node.name]._type) if isinstance(ctxt.globalObjects[node.name], ConstantBuffer): if isinstance(node, gs.Constant): @@ 
-2954,6 +2960,8 @@ def generateBufferInitializationCode(self) -> str: callStack = '' for node in ctxt.globalObjects.values(): if isinstance(node, VariableBuffer) and not isinstance(node, StructBuffer): + if not hasattr(node, '_type'): + continue assert issubclass(node._type, Pointer), f"Global VariableBuffer {node.name} is not a Pointer!" if node._deploy: name = node.name @@ -2999,6 +3007,8 @@ def generateBufferAllocationCode(self) -> str: for node in ctxt.globalObjects.values(): if isinstance(node, VariableBuffer) and not isinstance(node, StructBuffer): + if not hasattr(node, '_type'): + continue assert issubclass(node._type, Pointer), f"Global VariableBuffer {node.name} is not a Pointer!" if node._deploy: name = node.name @@ -3535,6 +3545,8 @@ def _printMemorySummary(self): # We do not count structs for now, since they are not properly modeled if isinstance(_buffer, ConstantBuffer) or (isinstance(_buffer, VariableBuffer) and _buffer._deploy): # SCHEREMO: We only + if not hasattr(_buffer, '_type'): + continue if (hasattr(_buffer, "_memoryLevel") and _buffer._memoryLevel == level) or level == "None": staticSize += int((np.prod(_buffer.shape) * _buffer._type.referencedType.typeWidth // 8)) else: diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 21cf01e52a..f5483bf669 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -11,23 +11,24 @@ int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, ConvTemplate, \ - ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \ - FloatAveragePoolTemplate, FloatCeilTemplate, FloatClipTemplate, FloatConvTemplate, FloatDivTemplate, \ - FloatDWConvTemplate, 
FloatExpTemplate, FloatFloorTemplate, FloatGELUTemplate, FloatGemmTemplate, \ - FloatGlobalAveragePoolTemplate, FloatGlobalMaxPoolTemplate, FloatGroupNormTemplate, FloatHardSigmoidTemplate, \ - FloatHardSwishTemplate, FloatInstanceNormTemplate, FloatLayernormTemplate, FloatMatMulTemplate, \ - FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, FloatPowTemplate, FloatReduceMeanTemplate, \ - FloatReluTemplate, FloatSigmoidTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, FloatSubTemplate, \ - FloatSwishTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, \ - MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ - RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, SubTemplate, \ +from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, Col2ImTemplate, ConcatTemplate, \ + ConvTemplate, ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, \ + FloatAddTemplate, FloatAveragePoolTemplate, FloatCeilTemplate, FloatClipTemplate, FloatConvTemplate, \ + FloatDivTemplate, FloatDWConvTemplate, FloatEluTemplate, FloatExpTemplate, FloatFloorTemplate, FloatGELUTemplate, \ + FloatGemmTemplate, FloatGlobalAveragePoolTemplate, FloatGlobalMaxPoolTemplate, FloatGroupNormTemplate, \ + FloatHardSigmoidTemplate, FloatHardSwishTemplate, FloatInstanceNormTemplate, FloatLayernormTemplate, \ + FloatLeakyReluTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \ + FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSeluTemplate, FloatSigmoidTemplate, \ + FloatSoftmaxTemplate, FloatSqrtTemplate, FloatSubTemplate, FloatSwishTemplate, GatherTemplate, GemmTemplate, \ + IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, MaxPoolTemplate, MulTemplate, \ + PadTemplate, QuantTemplate, ReduceMeanTemplate, 
ReduceSumTemplate, RequantShiftTemplate, ReshapeTemplate, \ + ResizeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, ScatterTemplate, SliceTemplate, SubTemplate, \ TransposeTemplate, iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ - LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ - ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, PassThroughTypeChecker, QuantChecker, \ + ReduceMeanChecker, ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, \ + SliceChecker, SoftmaxChecker, TransposeChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -326,19 +327,35 @@ for type in FloatDataTypes ] -BasicConvTransposeBindings = [ +BasicConvTranspose1DBindings = [ NodeBinding( ConvChecker( [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias [PointerClass(type)]), - ConvTransposeTemplate.referenceTemplate, + ConvTransposeTemplate.referenceTemplate1D, BasicTransformer) for type in FloatDataTypes ] + [ NodeBinding( ConvChecker( [PointerClass(type), PointerClass(type)], # input, weight [PointerClass(type)]), - ConvTransposeTemplate.referenceTemplate, + ConvTransposeTemplate.referenceTemplate1D, + BasicTransformer) for type in FloatDataTypes +] + +BasicConvTranspose2DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias + [PointerClass(type)]), + ConvTransposeTemplate.referenceTemplate2D, + BasicTransformer) for type 
in FloatDataTypes +] + [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type)], # input, weight + [PointerClass(type)]), + ConvTransposeTemplate.referenceTemplate2D, BasicTransformer) for type in FloatDataTypes ] @@ -385,6 +402,21 @@ FloatHardSwishTemplate.referenceTemplate, BasicTransformer), ] +BasicEluBindings = [ + NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatEluTemplate.referenceTemplate, + BasicTransformer), +] + +BasicSeluBindings = [ + NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatSeluTemplate.referenceTemplate, + BasicTransformer), +] + +BasicLeakyReluBindings = [ + NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatLeakyReluTemplate.referenceTemplate, BasicTransformer), +] + BasicInstanceNormBindings = [ NodeBinding( DummyChecker( @@ -420,3 +452,22 @@ NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGlobalMaxPoolTemplate.referenceTemplate, BasicTransformer) ] + +BasicCol2ImBindings = [ + NodeBinding( + PassThroughTypeChecker([PointerClass(type), PointerClass(int32_t), + PointerClass(int32_t)], [PointerClass(type)]), Col2ImTemplate.referenceTemplate, + BasicTransformer) for type in (int8_t, uint8_t, float32_t) +] + +BasicScatterBindings = [ + NodeBinding( + PassThroughTypeChecker( + [PointerClass(type), PointerClass(int32_t), PointerClass(type)], [PointerClass(type)]), + ScatterTemplate.referenceTemplate, BasicTransformer) for type in (int8_t, uint8_t, float32_t) +] + +BasicResizeBindings = [ + NodeBinding(PassThroughTypeChecker([PointerClass(type)], [PointerClass(type)]), ResizeTemplate.referenceTemplate, + BasicTransformer) for type in (int8_t, uint8_t, float32_t) +] diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index d0a1e1db3c..bb91244447 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -662,45 +662,46 @@ def 
__init__(self, maps: List[NodeMapper]): def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, channels_first) -> Tuple[Shape, Shape]: - """ - Infers output shapes for ConvTranspose using only static info. - - inputShapes[0]: input tensor shape (e.g., [N, C_in, W] for 1D, [N, C_in, H, W] for 2D) - - inputShapes[1]: weight tensor shape (e.g., [C_in, C_out // group, kW] for 1D) - - outputShapes[0]: output tensor shape (to be updated) - """ newInputShapes = list(inputShapes) - newOutputShapes = list(outputShapes) + + input_shape = inputShapes[0] # [N, C_in, d0, ...] + weight_shape = inputShapes[1] # [C_in, C_out//group, k0, ...] group = operatorRepresentation.get('group', 1) - weight_shape = inputShapes[1] - if newOutputShapes and len(newOutputShapes[0]) >= 2: - # For 1D: weight_shape = [C_in, C_out // group, kW] - # For 2D: weight_shape = [C_in, C_out // group, kH, kW] - ch_out = weight_shape[1] * group - if channels_first: - newOutputShapes[0][1] = ch_out - else: - newOutputShapes[0][-1] = ch_out + batch = input_shape[0] + spatial_in = list(input_shape[2:]) if channels_first else list(input_shape[1:-1]) + ndim = len(spatial_in) - return newInputShapes, newOutputShapes + kernel_shape = list(weight_shape[2:]) + C_out = weight_shape[1] * group - def computeOps(self): - opRep = self.mapper.parser.operatorRepresentation + strides = operatorRepresentation.get('strides') or [1] * ndim + dilations = operatorRepresentation.get('dilations') or [1] * ndim + output_padding = operatorRepresentation.get('output_padding') or [0] * ndim + pads = operatorRepresentation.get('pads') or [0] * (2 * ndim) - groups = opRep.get('group', 1) - kernel_shape = np.prod(opRep['kernel_shape']) # es. 
[3, 3] -> 9 - ch_in = opRep['ch_im_in'] - ch_out = opRep['ch_im_out'] + spatial_out = [(spatial_in[d] - 1) * strides[d] - pads[d] - pads[d + ndim] + dilations[d] * + (kernel_shape[d] - 1) + output_padding[d] + 1 for d in range(ndim)] - opsPerPx = int(kernel_shape * ch_in * ch_out / groups) * 2 - - # ConvTranspose upscales spatial dims, quindi num pixel viene da output - if 'dim_im_out_y' in opRep: - numPx = opRep['dim_im_out_x'] * opRep['dim_im_out_y'] + if channels_first: + output_shape = [batch, C_out] + spatial_out else: - numPx = opRep['dim_im_out_x'] + output_shape = [batch] + spatial_out + [C_out] - return numPx * opsPerPx + return newInputShapes, [output_shape] + + def computeOps(self): + rep = self.mapper.parser.operatorRepresentation + + group = rep.get('group', 1) + kernel_shape = np.prod(rep['kernel_shape']) # es. [3, 3] -> 9 + channels = rep['channels'] + feature_maps = rep['feature_maps'] + + ops_per_px = int(kernel_shape * feature_maps * channels // group) * 2 + num_px = np.prod(rep['output_shape']) + + return num_px * ops_per_px class CeilLayer(SingleOperationPerElementLayer): @@ -750,6 +751,28 @@ def computeOps(self): return self.mapper.parser.operatorRepresentation['size'] * 5 +class EluLayer(ONNXLayer): + + def computeOps(self): + # input > 0 -> y = x (just an assignment) + # input <=0 -> y = alpha * (expf(x) - 1): exp, add, mul + # consider the worst case, which is 3 ops + return self.mapper.parser.operatorRepresentation['size'] * 3 + + +class SeluLayer(ONNXLayer): + + def computeOps(self): + # input > 0 -> y = gamma * x: mul + # input <=0 -> y = gamma * alpha * (expf(x) - 1): exp, add, 2 mul + # consider the worst case, which is 4 ops + return self.mapper.parser.operatorRepresentation['size'] * 4 + + +class LeakyReluLayer(SingleOperationPerElementLayer): + pass + + class InstanceNormLayer(ONNXLayer): def computeOps(self): @@ -792,3 +815,42 @@ def computeOps(self): opRep = self.mapper.parser.operatorRepresentation # (spatial_size - 1) comparisons 
per output channel return int(opRep['batch_size'] * opRep['num_channels'] * (opRep['spatial_size'] - 1)) + + +class Col2ImLayer(ONNXLayer): + + def computeOps(self): + # Col2Im iterates over every element of the input tensor and adds it + # into the corresponding output position. The total number of + # accumulations is exactly the number of input elements which is + # N × C × block_volume × L + rep = self.mapper.parser.operatorRepresentation + block_volume = int(np.prod(rep['block_shape'])) + L = int(np.prod(rep['col_dims'])) + return rep['batch_size'] * rep['channels'] * block_volume * L + + +class ScatterLayer(ONNXLayer): + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + if opRep.get('reduction', 'none') == 'none': + # no arithmetic operations + return 0 + else: + # 1 op per index element + return int(np.prod(opRep['indices_shape'])) + + +class ResizeLayer(ONNXLayer): + + def computeOps(self): + rep = self.mapper.parser.operatorRepresentation + size = rep['batch_size'] * rep['channels'] * int(np.prod(rep['output_shape'])) + spatial_dims: int = rep['spatial_dims'] + ops = 0 # default: Nearest-neighbour is a pure copy — no arithmetic operations. + if rep['mode'] == 'linear': # 2^spatial_dims multiply-accumulates per output element. + ops = size * (1 << spatial_dims) + elif rep['mode'] == 'cubic': # 4^spatial_dims multiply-accumulates per output element. 
+ ops = size * (4**spatial_dims) + return ops diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index aa8bd8724a..f4b8d5bae4 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -2743,127 +2743,190 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: + + if not all( + [node.op == 'ConvTranspose', + len(node.inputs) >= 2 and len(node.inputs) <= 3, + len(node.outputs) == 1]): + return False + # Extract ONNX attributes with defaults - strides = node.attrs.get('strides', [1]) + auto_pad: str = node.attrs.get('auto_pad', 'NOTSET') + group: int = node.attrs.get('group', 1) - pads = node.attrs.get('pads', [0, 0]) - kernel_shape = node.attrs.get('kernel_shape', None) - dilations = node.attrs.get('dilations', [1]) - group = node.attrs.get('group', 1) + if not all([ + auto_pad in ('NOTSET', 'SAME_UPPER', 'SAME_LOWER', 'VALID'), + group >= 1, + ]): + return False - # Check for required attributes - wellFormed = (kernel_shape is not None and len(node.outputs) == 1) - if wellFormed: - self.operatorRepresentation['strides'] = strides - self.operatorRepresentation['pads'] = pads - self.operatorRepresentation['kernel_shape'] = kernel_shape - self.operatorRepresentation['dilations'] = dilations - self.operatorRepresentation['group'] = group - self.operatorRepresentation['nodeName'] = node.name - self.operatorRepresentation['nodeOp'] = node.op - return wellFormed + self.operatorRepresentation['auto_pad'] = auto_pad + self.operatorRepresentation['group'] = group + + self.operatorRepresentation['dilations'] = node.attrs.get('dilations', None) # default: ones + self.operatorRepresentation['kernel_shape'] = node.attrs.get('kernel_shape', None) # default from weights + self.operatorRepresentation['output_padding'] = node.attrs.get('output_padding', None) # default: zeros + self.operatorRepresentation['output_shape'] = node.attrs.get('output_shape', None) # overwrite pads + 
self.operatorRepresentation['pads'] = node.attrs.get('pads', None) # default: zeros + self.operatorRepresentation['strides'] = node.attrs.get('strides', None) # default: ones + + self.operatorRepresentation['nodeOp'] = node.op + self.operatorRepresentation['nodeName'] = node.name + + return True def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True): - # Register buffer names for codegen - self.operatorRepresentation['data_in'] = node.inputs[0].name - self.operatorRepresentation['weight'] = node.inputs[1].name - self.operatorRepresentation['data_out'] = node.outputs[0].name + + rep = self.operatorRepresentation + + # inputs/outputs + data_in: VariableBuffer = ctxt.lookup(node.inputs[0].name) + weight: ConstantBuffer = ctxt.lookup(node.inputs[1].name) + data_out: VariableBuffer = ctxt.lookup(node.outputs[0].name) + + # check inputs/outputs + if not all([ + len(data_in.shape) == len(data_out.shape) == len(weight.shape), # same ndims + all(s > 0 for s in data_in.shape), # no empty dim + all(s > 0 for s in weight.shape), # no empty dim + all(s > 0 for s in data_out.shape), # no empty dim + data_in.shape[0] == data_out.shape[0], # same batch size + len(data_in.shape) > 2, # at least batch, channels/feature_maps, one spatial dim + ]): + return ctxt, False + + # retrieve info from inputs/outputs + batch_size, channels = data_in.shape[:2] + input_shape = data_in.shape[2:] # spatial input shape + output_shape = data_out.shape[2:] # spatial output shape + + spatial_dims = len(input_shape) + + kernel_shape = list(weight.shape[2:]) + feature_maps = weight.shape[1] * rep['group'] # input channels + + # optional inputs if len(node.inputs) == 3: - self.operatorRepresentation['bias'] = node.inputs[2].name - self.operatorRepresentation['has_bias'] = "true" - else: - self.operatorRepresentation['has_bias'] = "false" - # Get output shape from context - data_out = ctxt.lookup(node.outputs[0].name) - out_shape = data_out.shape - if len(out_shape) == 
3: - self.operatorRepresentation['dim_im_out_x'] = out_shape[2] - elif len(out_shape) == 4: - self.operatorRepresentation['dim_im_out_x'] = out_shape[2] - self.operatorRepresentation['dim_im_out_y'] = out_shape[3] - - stride_x, stride_y = 1, 1 - if "strides" in node.attrs: - stride_y = node.attrs["strides"][0] - stride_x = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_y - self.operatorRepresentation["stride_y"] = stride_y - self.operatorRepresentation["stride_x"] = stride_x - - if "kernel_shape" in node.attrs: - kernel_shape = node.attrs["kernel_shape"] - kernel_shape_x = kernel_shape[0] - # For 2D, kernel_shape may have two elements - kernel_shape_y = kernel_shape[1] if len(kernel_shape) > 1 else kernel_shape_x - else: - kernel_shape_x = 1 - kernel_shape_y = 1 + bias: ConstantBuffer = ctxt.lookup(node.inputs[2].name) + if not (len(bias.shape) == 1 and bias.shape[0] == feature_maps): + return ctxt, False + rep['bias'] = bias.name + + # attributes with possible inconsistences + kernel_shape_attr: list[int] = rep['kernel_shape'] or kernel_shape + output_shape_attr: list[int] = rep['output_shape'] or output_shape + # check possible inconsistences + if not all([ + kernel_shape_attr == kernel_shape, + output_shape_attr == output_shape, + ]): + return ctxt, False + + # other attributes + dilations: list[int] = rep['dilations'] or [1] * spatial_dims + output_padding: list[int] = rep['output_padding'] or [0] * spatial_dims + strides: list[int] = rep['strides'] or [1] * spatial_dims + + # auto_pad may lead to overwrite pads + if rep['auto_pad'] == 'NOTSET': + pads: list[int] = rep['pads'] or [0] * (2 * spatial_dims) + elif rep['auto_pad'] == 'VALID': + pads = [0] * (2 * spatial_dims) + else: # SAME_UPPER, SAME_LOWER + starts = [0] * spatial_dims + ends = [0] * spatial_dims + for i in range(spatial_dims): + total_padding = (strides[i] * (input_shape[i] - 1) + output_padding[i] + + ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]) + half = 
total_padding // 2 + if rep['auto_pad'] == 'SAME_UPPER': + starts[i] = half + ends[i] = total_padding - half + else: # SAME_LOWER + ends[i] = half + starts[i] = total_padding - half + pads = starts + ends + + # check other attributes + if not all([ + len(dilations) == spatial_dims, + len(output_padding) == spatial_dims, + len(pads) == 2 * spatial_dims, + len(strides) == spatial_dims, + all(d > 0 for d in dilations), + all(p >= 0 for p in output_padding), + all(p >= 0 for p in pads), + all(s > 0 for s in strides), + ]): + return ctxt, False + + # fill operatorRepresentation + rep['data_in'] = data_in.name + rep['weight'] = weight.name + rep['data_out'] = data_out.name + rep['has_bias'] = int('bias' in rep) + + rep['kernel_shape'] = kernel_shape + rep['output_shape'] = output_shape + rep['pads'] = pads + rep['strides'] = strides + rep['dilations'] = dilations + rep['output_padding'] = output_padding + + rep['batch_size'] = batch_size + rep['channels'] = channels + rep['feature_maps'] = feature_maps + rep['input_shape'] = input_shape - data_in = ctxt.lookup(node.inputs[0].name) - data_out = ctxt.lookup(node.outputs[0].name) - in_shape = data_in.shape - out_shape = data_out.shape - - self.operatorRepresentation['ch_im_in'] = in_shape[1] - self.operatorRepresentation['dim_im_in_y'] = in_shape[2] - self.operatorRepresentation['ch_im_out'] = out_shape[1] - self.operatorRepresentation['dim_im_out_y'] = out_shape[2] - - self.operatorRepresentation[ - 'batchOffsetIn'] = self.operatorRepresentation['ch_im_in'] * self.operatorRepresentation['dim_im_in_y'] - self.operatorRepresentation[ - 'batchOffsetOut'] = self.operatorRepresentation['ch_im_out'] * self.operatorRepresentation['dim_im_out_y'] return ctxt, True class ConvTranspose1DParser(ConvTransposeParser): - def __init__(self): - super().__init__() + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: - def parseNode(self, node: gs.Node) -> bool: - 
# 1D ConvTranspose expects 3D input/output and 3D weight - wellFormed = super().parseNode(node) - ret = False - if wellFormed: - ret = all([ - # Make sure strides are 2D - len(node.attrs['strides']) == 1, - len(node.attrs['pads']) == 2, - len(node.attrs['dilations']) == 1, - ]) - if ret: + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False - self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][0]) - self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][0]) + rep = self.operatorRepresentation + spatial_dims = len(rep['kernel_shape']) + if spatial_dims != 1: + return ctxt, False - return ret + rep['input_length'], = rep['input_shape'] + rep['output_length'], = rep['output_shape'] + rep['kernel_length'], = rep['kernel_shape'] + rep['stride'], = rep['strides'] + + return ctxt, True + + +class ConvTranspose2DParser(ConvTransposeParser): def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False - if ret: - data_in = newCtxt.lookup(node.inputs[0].name) - data_out = newCtxt.lookup(node.outputs[0].name) - in_shape = data_in.shape - out_shape = data_out.shape - self.operatorRepresentation['batch'] = in_shape[0] - self.operatorRepresentation['ch_im_in'] = in_shape[1] - self.operatorRepresentation['dim_im_in_y'] = in_shape[2] - self.operatorRepresentation['ch_im_out'] = out_shape[1] - self.operatorRepresentation['dim_im_out_y'] = out_shape[2] - self.operatorRepresentation[ - 
"batchOffsetIn"] = self.operatorRepresentation["ch_im_in"] * self.operatorRepresentation["dim_im_in_y"] - self.operatorRepresentation["batchOffsetOut"] = self.operatorRepresentation[ - "ch_im_out"] * self.operatorRepresentation["dim_im_out_y"] - return newCtxt, True - return ctxt, False + rep = self.operatorRepresentation + spatial_dims = len(rep['kernel_shape']) + if spatial_dims != 2: + return ctxt, False + + rep['input_height'], rep['input_width'] = rep['input_shape'] + rep['output_height'], rep['output_width'] = rep['output_shape'] + rep['kernel_height'], rep['kernel_width'] = rep['kernel_shape'] + rep['stride_h'], rep['stride_w'] = rep['strides'] + + return ctxt, True class SqrtParser(UnaryElementWiseParser): @@ -2959,6 +3022,34 @@ def parseNode(self, node: gs.Node) -> bool: return super().parseNode(node) and node.op == 'HardSwish' +class EluParser(UnaryElementWiseParser): + + def parseNode(self, node: gs.Node) -> bool: + if not (super().parseNode(node) and node.op == 'Elu'): + return False + self.operatorRepresentation['alpha'] = node.attrs.get('alpha', 1.0) + return True + + +class SeluParser(UnaryElementWiseParser): + + def parseNode(self, node: gs.Node) -> bool: + if not (super().parseNode(node) and node.op == 'Selu'): + return False + self.operatorRepresentation['alpha'] = node.attrs.get('alpha', 1.67326319217681884765625) + self.operatorRepresentation['gamma'] = node.attrs.get('gamma', 1.05070102214813232421875) + return True + + +class LeakyReluParser(UnaryElementWiseParser): + + def parseNode(self, node: gs.Node) -> bool: + if not (super().parseNode(node) and node.op == 'LeakyRelu'): + return False + self.operatorRepresentation['alpha'] = node.attrs.get('alpha', 0.01) + return True + + class NormalizationParser(NodeParser): def parseNode(self, node: gs.Node) -> bool: @@ -3124,3 +3215,236 @@ class GlobalMaxPoolParser(GlobalPoolParser): def parseNode(self, node: gs.Node) -> bool: return super().parseNode(node) and node.op == 'GlobalMaxPool' + + +class 
ScatterParser(NodeParser): + + def parseNode(self, node: gs.Node) -> bool: + + if not all([ + node.op == 'Scatter' or node.op == 'ScatterElements', + len(node.inputs) == 3, + len(node.outputs) == 1, + ]): + return False + + axis = node.attrs.get('axis', 0) + reduction = node.attrs.get('reduction', 'none') + + if reduction not in ('none', 'add', 'mul', 'max', 'min'): + return False + + self.operatorRepresentation['axis'] = axis + self.operatorRepresentation['reduction'] = reduction + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + indices = ctxt.lookup(node.inputs[1].name) + updates = ctxt.lookup(node.inputs[2].name) + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['indices'] = indices.name + self.operatorRepresentation['updates'] = updates.name + self.operatorRepresentation['data_out'] = data_out.name + + self.operatorRepresentation['ndim'] = len(data_in.shape) + self.operatorRepresentation['data_shape'] = list(data_in.shape) + self.operatorRepresentation['indices_shape'] = list(indices.shape) + + return ctxt, True + + +class Col2ImParser(NodeParser): + + def parseNode(self, node: gs.Node) -> bool: + + if not all([node.op == 'Col2Im', len(node.inputs) == 3, len(node.outputs) == 1]): + return False + + # Deeploy is a static ahead-of-time code generator: shape tensors that + # appear as C compound literals in the emitted code must be known at + # parse time. 
+ # image_shape / block_shape are therefore assumed to be constant and + # are not supported as variables + if not isinstance(node.inputs[1], gs.Constant) or not isinstance(node.inputs[2], gs.Constant): + return False + + image_shape = node.inputs[1].values.astype(int).tolist() + block_shape = node.inputs[2].values.astype(int).tolist() + spatial_dims = len(image_shape) + + if spatial_dims <= 0: + return False + + dilations = list(node.attrs.get('dilations', [1] * spatial_dims)) + pads = list(node.attrs.get('pads', [0] * (2 * spatial_dims))) + strides = list(node.attrs.get('strides', [1] * spatial_dims)) + + if not all([ + len(dilations) == spatial_dims, + len(pads) == 2 * spatial_dims, + len(strides) == spatial_dims, + all(s > 0 for s in image_shape), + all(s > 0 for s in block_shape), + all(d > 0 for d in dilations), + all(p >= 0 for p in pads), + all(s > 0 for s in strides), + ]): + return False + + col_dims = [(image_shape[p] + pads[p] + pads[p + spatial_dims] - dilations[p] * + (block_shape[p] - 1) - 1) // strides[p] + 1 for p in range(spatial_dims)] + if any(d <= 0 for d in col_dims): + return False + + self.operatorRepresentation['col_dims'] = col_dims + self.operatorRepresentation['image_shape'] = image_shape + self.operatorRepresentation['block_shape'] = block_shape + self.operatorRepresentation['spatial_dims'] = spatial_dims + self.operatorRepresentation['dilations'] = dilations + self.operatorRepresentation['pads'] = pads + self.operatorRepresentation['strides'] = strides + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in: VariableBuffer = ctxt.lookup(node.inputs[0].name) + data_out: VariableBuffer = ctxt.lookup(node.outputs[0].name) + + image_shape = self.operatorRepresentation['image_shape'] + block_shape = self.operatorRepresentation['block_shape'] + col_dims = self.operatorRepresentation['col_dims'] + + N, C = data_out.shape[0], 
data_out.shape[1] + block_volume = int(np.prod(block_shape)) + L = int(np.prod(col_dims)) + + if list(data_in.shape) != [N, C * block_volume, L]: + return ctxt, False + + if list(data_out.shape) != [N, C] + image_shape: + return ctxt, False + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['batch_size'] = N + self.operatorRepresentation['channels'] = C + + return ctxt, True + + +class ResizeParser(NodeParser): + + @staticmethod + def _is_empty(input: gs.Variable | gs.Constant | None) -> bool: + if input is None: + return True + if isinstance(input, gs.Constant): + return input.values.size <= 0 + if isinstance(input, gs.Variable): + return input.shape is None + return True + + def parseNode(self, node: gs.Node) -> bool: + + if not all([node.op == 'Resize', len(node.inputs) >= 1, len(node.outputs) == 1]): + return False + + antialias = node.attrs.get('antialias', 0) + axes = node.attrs.get('axes', None) # None -> all axes + coord_mode = node.attrs.get('coordinate_transformation_mode', 'half_pixel') + cubic_coeff_a = node.attrs.get('cubic_coeff_a', -0.75) + exclude_outside = node.attrs.get('exclude_outside', 0) + extrapolation_value = node.attrs.get('extrapolation_value', 0.0) + keep_aspect_ratio_policy = node.attrs.get('keep_aspect_ratio_policy', 'stretch') + mode = node.attrs.get('mode', 'nearest') + nearest_mode = node.attrs.get('nearest_mode', 'round_prefer_floor') + + if not all([ + coord_mode in ('half_pixel', 'half_pixel_symmetric', 'pytorch_half_pixel', 'align_corners', + 'asymmetric', 'tf_crop_and_resize'), + keep_aspect_ratio_policy in ('stretch', 'not_larger', 'not_smaller'), + mode in ('nearest', 'linear', 'cubic'), + nearest_mode in ('floor', 'ceil', 'round_prefer_floor', 'round_prefer_ceil'), + ]): + return False + + self.operatorRepresentation['antialias'] = antialias + self.operatorRepresentation['axes'] = axes + self.operatorRepresentation['coord_mode'] = 
coord_mode + self.operatorRepresentation['cubic_coeff_a'] = cubic_coeff_a + self.operatorRepresentation['exclude_outside'] = exclude_outside + self.operatorRepresentation['extrapolation_value'] = extrapolation_value + self.operatorRepresentation['keep_aspect_ratio_policy'] = keep_aspect_ratio_policy + self.operatorRepresentation['mode'] = mode + self.operatorRepresentation['nearest_mode'] = nearest_mode + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in: VariableBuffer = ctxt.lookup(node.inputs[0].name) + data_out: VariableBuffer = ctxt.lookup(node.outputs[0].name) + + if not all([ + len(data_in.shape) == len(data_out.shape), # same ndims + all(s > 0 for s in data_in.shape), # no empty dim + all(s > 0 for s in data_out.shape), # no empty dim + len(data_in.shape) > 2, # at least batch, channels, one spatial dim + data_in.shape[:2] == data_out.shape[:2], # batch_size and channels are unchanged + ]): + return ctxt, False + + roi: gs.Variable | gs.Constant | None = node.inputs[1] if len(node.inputs) > 1 else None + scales: gs.Constant | None = node.inputs[2] if len(node.inputs) > 2 else None + sizes: gs.Constant | None = node.inputs[3] if len(node.inputs) > 3 else None + + has_scales = not self._is_empty(scales) + has_sizes = not self._is_empty(sizes) + + if any([ + # ONNX requires exactly one of scales / sizes to be non-empty + has_scales and has_sizes, + (not has_scales) and (not has_sizes), + # scales and sizes assumed constants otherwise output shape + # cannot be inferred at parsing time + has_scales and not isinstance(scales, gs.Constant), + has_sizes and not isinstance(sizes, gs.Constant), + ]): + return ctxt, False + + if not self._is_empty(roi): + if isinstance(roi, gs.Constant): + _roi = roi.values.tolist() + elif isinstance(roi, gs.Variable): + _roi = ctxt.lookup(roi.name).name + else: + return ctxt, False + else: + _roi = None + + 
self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['batch_size'] = data_in.shape[0] + self.operatorRepresentation['channels'] = data_in.shape[1] + self.operatorRepresentation['spatial_dims'] = len(data_in.shape[2:]) + self.operatorRepresentation['input_shape'] = list(data_in.shape[2:]) + self.operatorRepresentation['output_shape'] = list(data_out.shape[2:]) + self.operatorRepresentation['roi'] = _roi + self.operatorRepresentation['scales'] = scales.values.tolist() if has_scales else None + self.operatorRepresentation['sizes'] = sizes.values.tolist() if has_sizes else None + + return ctxt, True diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py index 2aa1ef1c38..413dd6f878 100644 --- a/Deeploy/Targets/Generic/Platform.py +++ b/Deeploy/Targets/Generic/Platform.py @@ -7,32 +7,35 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicAveragePool1DBindings, BasicAveragePool2DBindings, \ - BasicBatchNormBindings, BasicCeilBindings, BasicClipBindings, BasicConcatBindings, BasicConv1DBindings, \ - BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, \ - BasicDWConv1DBinding, BasicDWConv2DBindings, BasicExpBindings, BasicFloorBindings, BasicGatherBindings, \ - BasicGELUBindings, BasicGEMMBindings, BasicGlobalAveragePoolBindings, BasicGlobalMaxPoolBindings, \ - BasicGroupNormBindings, BasicHardSigmoidBindings, BasicHardSwishBindings, BasicInstanceNormBindings, \ - BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, BasicLayerNormBindings, BasicMatMulBindings, \ - BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, \ - BasicPowBindings, 
BasicQuantBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, \ - BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSigmoidBindings, \ - BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicSubBindings, BasicSwishBindings, \ - BasicTransposeBindings, DummyBinding + BasicBatchNormBindings, BasicCeilBindings, BasicClipBindings, BasicCol2ImBindings, BasicConcatBindings, \ + BasicConv1DBindings, BasicConv2DBindings, BasicConvTranspose1DBindings, BasicConvTranspose2DBindings, \ + BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, \ + BasicEluBindings, BasicExpBindings, BasicFloorBindings, BasicGatherBindings, BasicGELUBindings, BasicGEMMBindings, \ + BasicGlobalAveragePoolBindings, BasicGlobalMaxPoolBindings, BasicGroupNormBindings, BasicHardSigmoidBindings, \ + BasicHardSwishBindings, BasicInstanceNormBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ + BasicLayerNormBindings, BasicLeakyReluBindings, BasicMatMulBindings, BasicMaxPool1DBindings, \ + BasicMaxPool2DBindings, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, \ + BasicQuantBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, \ + BasicResizeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicScatterBindings, \ + BasicSeluBindings, BasicSigmoidBindings, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, \ + BasicSubBindings, BasicSwishBindings, BasicTransposeBindings, DummyBinding from Deeploy.Targets.Generic.Layers import AddLayer, AveragePoolLayer, BatchNormalizationLayer, CeilLayer, ClipLayer, \ - ConcatLayer, ConvLayer, ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, ExpLayer, FloorLayer, \ - GatherLayer, GELULayer, GEMMLayer, GlobalAveragePoolLayer, GlobalMaxPoolLayer, GroupNormLayer, InstanceNormLayer, \ - ITAMaxLayer, 
LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ - ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SigmoidLayer, \ - SliceLayer, SoftmaxLayer, SqrtLayer, SubLayer, SwishLayer, TransposeLayer + Col2ImLayer, ConcatLayer, ConvLayer, ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, EluLayer, \ + ExpLayer, FloorLayer, GatherLayer, GELULayer, GEMMLayer, GlobalAveragePoolLayer, GlobalMaxPoolLayer, \ + GroupNormLayer, InstanceNormLayer, ITAMaxLayer, LayerNormLayer, LeakyReluLayer, MatMulLayer, MaxPoolLayer, \ + MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, \ + ReshapeLayer, ResizeLayer, RQIntegerDivLayer, RQSiGELULayer, ScatterLayer, SeluLayer, SigmoidLayer, SliceLayer, \ + SoftmaxLayer, SqrtLayer, SubLayer, SwishLayer, TransposeLayer from Deeploy.Targets.Generic.Parsers import AddParser, AveragePool1DParser, AveragePool2DParser, BatchNormParser, \ - CeilParser, ClipParser, ConcatParser, ConvTranspose1DParser, DebugParser, DequantParser, DivParser, DummyParser, \ - ExpParser, FlattenParser, FloorParser, GatherParser, GELUParser, GenericConv1DParser, GenericConv2DParser, \ - GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, GlobalAveragePoolParser, \ - GlobalMaxPoolParser, GroupNormParser, HardSigmoidParser, HardSwishParser, InstanceNormParser, IntegerDivParser, \ - ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, Pad1DParser, \ - Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, \ - ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SigmoidParser, SliceParser, SoftmaxParser, SqrtParser, \ - SubParser, SwishParser, TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + CeilParser, ClipParser, Col2ImParser, ConcatParser, ConvTranspose1DParser, 
ConvTranspose2DParser, DebugParser, \ + DequantParser, DivParser, DummyParser, EluParser, ExpParser, FlattenParser, FloorParser, GatherParser, GELUParser, \ + GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, \ + GenericMaxPool2DParser, GlobalAveragePoolParser, GlobalMaxPoolParser, GroupNormParser, HardSigmoidParser, \ + HardSwishParser, InstanceNormParser, IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, \ + LeakyReluParser, MatMulParser, MaxPool1DParser, MulParser, Pad1DParser, Pad2DParser, PowParser, QuantParser, \ + ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, ResizeParser, \ + RQIntegerDivParser, RQSiGELUParser, ScatterParser, SeluParser, SigmoidParser, SliceParser, SoftmaxParser, \ + SqrtParser, SubParser, SwishParser, TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -78,7 +81,8 @@ QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) BatchNormalizationMapper = NodeMapper(BatchNormParser(), BasicBatchNormBindings) -ConvTransposeMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTransposeBindings) +ConvTranspose1DMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTranspose1DBindings) +ConvTranspose2DMapper = NodeMapper(ConvTranspose2DParser(), BasicConvTranspose2DBindings) SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings) CeilMapper = NodeMapper(CeilParser(), BasicCeilBindings) FloorMapper = NodeMapper(FloorParser(), BasicFloorBindings) @@ -88,12 +92,18 @@ SwishMapper = NodeMapper(SwishParser(), BasicSwishBindings) HardSigmoidMapper = 
NodeMapper(HardSigmoidParser(), BasicHardSigmoidBindings) HardSwishMapper = NodeMapper(HardSwishParser(), BasicHardSwishBindings) +EluMapper = NodeMapper(EluParser(), BasicEluBindings) +SeluMapper = NodeMapper(SeluParser(), BasicSeluBindings) +LeakyReluMapper = NodeMapper(LeakyReluParser(), BasicLeakyReluBindings) InstanceNormMapper = NodeMapper(InstanceNormParser(), BasicInstanceNormBindings) GroupNormMapper = NodeMapper(GroupNormParser(), BasicGroupNormBindings) AveragePool1DMapper = NodeMapper(AveragePool1DParser(), BasicAveragePool1DBindings) AveragePool2DMapper = NodeMapper(AveragePool2DParser(), BasicAveragePool2DBindings) GlobalAveragePoolMapper = NodeMapper(GlobalAveragePoolParser(), BasicGlobalAveragePoolBindings) GlobalMaxPoolMapper = NodeMapper(GlobalMaxPoolParser(), BasicGlobalMaxPoolBindings) +ScatterMapper = NodeMapper(ScatterParser(), BasicScatterBindings) +Col2ImMapper = NodeMapper(Col2ImParser(), BasicCol2ImBindings) +ResizeMapper = NodeMapper(ResizeParser(), BasicResizeBindings) # Dummy nodes are intended for development purposes only! 
# They should always generate compiler errors to not accidentally end up in production code @@ -106,12 +116,14 @@ 'Concat': ConcatLayer([ConcatMapper]), 'DebugPrint': DebugPrintLayer([DebugMapper]), 'Div': DivLayer([DivMapper]), + 'Elu': EluLayer([EluMapper]), 'Flatten': ReshapeLayer([FlattenMapper]), 'Gather': GatherLayer([GatherMapper]), 'Gemm': GEMMLayer([GEMMMapper]), 'iGELU': GELULayer([GELUMapper]), 'Gelu': GELULayer([GELUMapper]), 'LayerNormalization': LayerNormLayer([LayerNormMapper]), + 'LeakyRelu': LeakyReluLayer([LeakyReluMapper]), 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), 'IntegerDiv': DivLayer([IntegerDivMapper]), 'IntegerMean': ReduceMeanLayer([ReduceMeanMapper]), @@ -140,11 +152,12 @@ 'Quant': QuantLayer([QuantMapper]), 'Dequant': DequantLayer([DequantMapper]), 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), - 'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]), + 'ConvTranspose': ConvTransposeLayer([ConvTranspose1DMapper, ConvTranspose2DMapper]), 'Ceil': CeilLayer([CeilMapper]), 'Floor': FloorLayer([FloorMapper]), 'Clip': ClipLayer([ClipMapper]), 'Exp': ExpLayer([ExpMapper]), + 'Selu': SeluLayer([SeluMapper]), 'Sigmoid': SigmoidLayer([SigmoidMapper]), 'Swish': SwishLayer([SwishMapper]), 'HardSigmoid': SigmoidLayer([HardSigmoidMapper]), @@ -154,6 +167,10 @@ 'AveragePool': AveragePoolLayer([AveragePool1DMapper, AveragePool2DMapper]), 'GlobalAveragePool': GlobalAveragePoolLayer([GlobalAveragePoolMapper]), 'GlobalMaxPool': GlobalMaxPoolLayer([GlobalMaxPoolMapper]), + 'Resize': ResizeLayer([ResizeMapper]), + 'Scatter': ScatterLayer([ScatterMapper]), + 'ScatterElements': ScatterLayer([ScatterMapper]), + 'Col2Im': Col2ImLayer([Col2ImMapper]), # # For example, you can use the DummpyMapper, in case you want to test # # deployment or optimizations with GlobalAveragePool nodes but did not yet # # implement the corresponding kernel diff --git a/Deeploy/Targets/Generic/Templates/Col2ImTemplate.py 
b/Deeploy/Targets/Generic/Templates/Col2ImTemplate.py new file mode 100644 index 0000000000..11d10988fe --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/Col2ImTemplate.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import FloatImmediate +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +def _typeSuffix(ref_type) -> str: + if issubclass(ref_type, FloatImmediate): + return f'fp{ref_type.typeWidth}' + elif ref_type.signed: + return f's{ref_type.typeWidth}' + else: + return f'u{ref_type.typeWidth}' + + +class _Col2ImTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['type_suffix'] = _typeSuffix(data_in._type.referencedType) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _Col2ImTemplate(""" +// Col2Im (Name: ${nodeName}, Op: ${nodeOp}) +Col2Im_${type_suffix}( + ${data_in}, ${data_out}, + ${batch_size}, ${channels}, ${spatial_dims}, + (int32_t[]){${', '.join(str(s) for s in image_shape)}}, + (int32_t[]){${', '.join(str(s) for s in block_shape)}}, + (int32_t[]){${', '.join(str(s) for s in dilations)}}, + (int32_t[]){${', '.join(str(s) for s in pads)}}, + (int32_t[]){${', '.join(str(s) for s in strides)}} +); +""") diff --git a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py index 9bf864c91f..0461283dec 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py @@ -4,10 +4,35 @@ from Deeploy.DeeployTypes import NodeTemplate -referenceTemplate = NodeTemplate(""" +referenceTemplate2D = NodeTemplate(""" <% 
-batchOffsetIn = ch_im_in * dim_im_in_y -batchOffsetOut = ch_im_out * dim_im_out_y +batch_stride_input = channels * input_height * input_width +batch_stride_output = feature_maps * output_height * output_width +%> + +// 2D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for (uint32_t n=0; n<${batch_size}; ++n) { + ConvTranspose2d_fp32( + ref_${data_out}_${data_in}, ${channels}, ${input_height}, + ${input_width}, ${weight}, ${feature_maps}, ${kernel_height}, + ${kernel_width}, ${stride_h}, ${stride_w}, ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, ${output_height}, ${output_width} + ); + + ref_${data_out}_${data_in} += ${batch_stride_input}; + ref_${data_out}_${data_out} += ${batch_stride_output}; + } +END_SINGLE_CORE +""") + +referenceTemplate1D = NodeTemplate(""" +<% +batch_stride_input = channels * input_length +batch_stride_output = feature_maps * output_length %> // 1D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp}) @@ -15,17 +40,15 @@ ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; - for (uint32_t n=0; n<${batch}; ++n) { + for (uint32_t n=0; n<${batch_size}; ++n) { ConvTranspose1d_fp32( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, - ${weight}, ${ch_im_out}, ${dim_kernel_y}, - ${stride_y}, - ${bias}, ${has_bias}, - ref_${data_out}_${data_out}, ${dim_im_out_y} + ref_${data_out}_${data_in}, ${channels}, ${input_length}, ${weight}, + ${feature_maps}, ${kernel_length}, ${stride}, ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, ${output_length} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${data_out}_${data_in} += ${batch_stride_input}; + ref_${data_out}_${data_out} += ${batch_stride_output}; } END_SINGLE_CORE """) diff --git 
a/Deeploy/Targets/Generic/Templates/FloatEluTemplate.py b/Deeploy/Targets/Generic/Templates/FloatEluTemplate.py new file mode 100644 index 0000000000..fc7a1886ae --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatEluTemplate.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _EluTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['size'] = int(np.prod(data_in.shape)) + operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _EluTemplate(""" +// ELU (Name: ${nodeName}, Op: ${nodeOp}) +Elu_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size}, ${alpha}); +""") diff --git a/Deeploy/Targets/Generic/Templates/FloatLeakyReluTemplate.py b/Deeploy/Targets/Generic/Templates/FloatLeakyReluTemplate.py new file mode 100644 index 0000000000..35804bd3d7 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatLeakyReluTemplate.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _LeakyReluTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['size'] = int(np.prod(data_in.shape)) + operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth + return ctxt, 
operatorRepresentation, [] + + +referenceTemplate = _LeakyReluTemplate(""" +// LeakyRelu (Name: ${nodeName}, Op: ${nodeOp}) +LeakyRelu_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size}, ${alpha}); +""") diff --git a/Deeploy/Targets/Generic/Templates/FloatSeluTemplate.py b/Deeploy/Targets/Generic/Templates/FloatSeluTemplate.py new file mode 100644 index 0000000000..2585a1966d --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatSeluTemplate.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _SeluTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['size'] = int(np.prod(data_in.shape)) + operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _SeluTemplate(""" +// SELU (Name: ${nodeName}, Op: ${nodeOp}) +Selu_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size}, ${alpha}, ${gamma}); +""") diff --git a/Deeploy/Targets/Generic/Templates/ResizeTemplate.py b/Deeploy/Targets/Generic/Templates/ResizeTemplate.py new file mode 100644 index 0000000000..3a1ce42acd --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ResizeTemplate.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import FloatImmediate +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + +_TYPE_TAG_MAP = { + 'fp32': 'RESIZE_TYPE_FLOAT32', + 's8': 'RESIZE_TYPE_INT8', + 'u8': 'RESIZE_TYPE_UINT8', + 's16': 
'RESIZE_TYPE_INT16', + 'u16': 'RESIZE_TYPE_UINT16', + 's32': 'RESIZE_TYPE_INT32', + 'u32': 'RESIZE_TYPE_UINT32', +} + +_MODE_MAP = { + "nearest": "RESIZE_MODE_NEAREST", + "linear": "RESIZE_MODE_LINEAR", + "cubic": "RESIZE_MODE_CUBIC", +} + +_COORD_MAP = { + "asymmetric": "RESIZE_COORD_ASYMMETRIC", + "half_pixel": "RESIZE_COORD_HALF_PIXEL", + "half_pixel_symmetric": "RESIZE_COORD_HALF_PIXEL_SYMMETRIC", + "align_corners": "RESIZE_COORD_ALIGN_CORNERS", + "pytorch_half_pixel": "RESIZE_COORD_PYTORCH_HALF_PIXEL", + "tf_crop_and_resize": "RESIZE_COORD_TF_CROP_AND_RESIZE", +} + +_NEAREST_MAP = { + "floor": "RESIZE_NEAREST_FLOOR", + "ceil": "RESIZE_NEAREST_CEIL", + "round_prefer_floor": "RESIZE_NEAREST_ROUND_PREFER_FLOOR", + "round_prefer_ceil": "RESIZE_NEAREST_ROUND_PREFER_CEIL", +} + + +def _typeSuffix(ref_type) -> str: + if issubclass(ref_type, FloatImmediate): + return f'fp{ref_type.typeWidth}' + elif ref_type.signed: + return f's{ref_type.typeWidth}' + else: + return f'u{ref_type.typeWidth}' + + +class _ResizeTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + rep = operatorRepresentation + + if rep.get('roi', None) is not None: + raise ValueError("Resize: 'roi' input is not supported.") + if rep.get('scales', None) is not None: + raise ValueError("Resize: 'scales' input is not supported; use 'sizes' instead. 
") + if rep.get('antialias', 0) != 0: + raise ValueError(f"Resize: antialias={rep['antialias']} is not supported by this kernel.") + if rep.get('exclude_outside', 0) != 0: + raise ValueError(f"Resize: exclude_outside={rep['exclude_outside']} is not supported by this kernel.") + if rep.get('axes', None) is not None: + raise ValueError(f"Resize: axes={rep['axes']} is not supported; all axes must be resized.") + if rep.get('keep_aspect_ratio_policy', 'stretch') != 'stretch': + raise ValueError( + f"Resize: keep_aspect_ratio_policy='{rep['keep_aspect_ratio_policy']}' is not supported by this kernel." + ) + if rep.get('coord_mode', 'half_pixel') == 'tf_crop_and_resize': + raise ValueError( + "Resize: coordinate_transformation_mode='tf_crop_and_resize' is not supported by this kernel.") + if rep.get('mode', 'nearest') == 'cubic': + raise ValueError("Resize: mode='cubic' is not supported by this kernel.") + + data_in = ctxt.lookup(rep['data_in']) + type_suffix = _typeSuffix(data_in._type.referencedType) + rep['type_tag'] = _TYPE_TAG_MAP[type_suffix] + rep['mode'] = _MODE_MAP[rep['mode']] + rep['coord_mode'] = _COORD_MAP[rep['coord_mode']] + rep['nearest_mode'] = _NEAREST_MAP[rep['nearest_mode']] + + return ctxt, rep, [] + + +referenceTemplate = _ResizeTemplate(""" +// Resize (Name: ${nodeName}, Op: ${nodeOp}) +Resize( + ${data_in}, ${data_out}, ${type_tag}, + ${batch_size}, ${channels}, ${spatial_dims}, + (int32_t[]){${', '.join(str(s) for s in input_shape)}}, + (int32_t[]){${', '.join(str(s) for s in output_shape)}}, + ${mode}, ${coord_mode}, ${nearest_mode} +); +""") diff --git a/Deeploy/Targets/Generic/Templates/ScatterTemplate.py b/Deeploy/Targets/Generic/Templates/ScatterTemplate.py new file mode 100644 index 0000000000..71172ab3d6 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ScatterTemplate.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, 
Tuple + +from Deeploy.AbstractDataTypes import FloatImmediate +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + +_REDUCTION_MAP = { + "none": "SCATTER_REDUCTION_NONE", + "add": "SCATTER_REDUCTION_ADD", + "mul": "SCATTER_REDUCTION_MUL", + "min": "SCATTER_REDUCTION_MIN", + "max": "SCATTER_REDUCTION_MAX", +} + + +def _typeSuffix(ref_type) -> str: + if issubclass(ref_type, FloatImmediate): + return f'fp{ref_type.typeWidth}' + elif ref_type.signed: + return f's{ref_type.typeWidth}' + else: + return f'u{ref_type.typeWidth}' + + +class _ScatterTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + operatorRepresentation['type_suffix'] = _typeSuffix(data_in._type.referencedType) + operatorRepresentation['reduction_c'] = _REDUCTION_MAP[operatorRepresentation['reduction']] + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _ScatterTemplate(""" +// Scatter (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE +Scatter_${type_suffix}(${data_in}, ${indices}, ${updates}, ${data_out}, ${ndim}, + (int32_t[]){${', '.join(str(s) for s in data_shape)}}, + (int32_t[]){${', '.join(str(s) for s in indices_shape)}}, + ${axis}, ${reduction_c} +); +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..4224cba538 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -2,59 +2,22 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Sequence, Type +from typing import List, Optional import numpy as np -from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker from Deeploy.DeeployTypes import ConstantBuffer, OperatorRepresentation, 
VariableBuffer -class ConcatChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - - maxNLevel = max(i.nLevels for i in inputs) - - return [maxNLevel] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: - assert (all([_inp._signed == True for _inp in inputs]) or all( - [[_inp._signed == False for _inp in inputs]])), "Some inputs in concat operation have different signs!" - - if inputs[0]._signed: - return [True] - else: - return [False] - - -class SliceChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) +class DummyChecker(SignPropTypeChecker): def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return [inputs[0].nLevels] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: - if inputs[0]._signed: - return [True] - else: - return [False] - + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] -class TransposeChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) +class PassThroughTypeChecker(SignPropTypeChecker): def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: @@ -62,75 +25,54 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: 
List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: - if inputs[0]._signed: - return [True] - else: - return [False] + return [bool(inputs[0]._signed)] -class PadChecker(SignPropTypeChecker): +SliceChecker = PassThroughTypeChecker +TransposeChecker = PassThroughTypeChecker +PadChecker = PassThroughTypeChecker +GatherChecker = PassThroughTypeChecker +ReshapeChecker = PassThroughTypeChecker +MaxPoolChecker = PassThroughTypeChecker +DebugPrintChecker = PassThroughTypeChecker - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [inputs[0].nLevels] +class SignedOutputTypeChecker(SignPropTypeChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed: - return [True] - else: - return [False] - + return [True] -class AddChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) +class ConcatChecker(SignPropTypeChecker): def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [inputs[0].nLevels + inputs[1].nLevels] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed or isinstance(inputs[1], ConstantBuffer): - return [True] - else: - return [False] - - -class GatherChecker(SignPropTypeChecker): + operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) + maxNLevel = max(i.nLevels for 
i in inputs) - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [inputs[0].nLevels] + return [maxNLevel] def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[bool]: + operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: + assert (all([_inp._signed == True for _inp in inputs]) or all( + [[_inp._signed == False for _inp in inputs]])), "Some inputs in concat operation have different signs!" + if inputs[0]._signed: return [True] else: return [False] -class ReshapeChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) +class AddChecker(SignPropTypeChecker): def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: - return [inputs[0].nLevels] + return [inputs[0].nLevels + inputs[1].nLevels] def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed: + if inputs[0]._signed or isinstance(inputs[1], ConstantBuffer): return [True] else: return [False] @@ -138,9 +80,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class MHSAChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [operatorRepresentation['n_levels']] @@ -150,28 +89,14 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [True] -class CLCAChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def 
_inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class CLCAChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] -class LinearAttentionChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class LinearAttentionChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -180,9 +105,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class GEMMChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [ @@ -195,14 +117,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [True] -class LayerNormChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class LayerNormChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -211,9 +126,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class 
MulChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(self.input_types[1].typeWidth)] @@ -228,9 +140,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class DivChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(self.output_types[0].referencedType.typeWidth)] @@ -245,9 +154,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class RQIntegerDivChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(self.output_types[0].referencedType.typeWidth)] @@ -262,9 +168,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class MatMulChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [np.max(inputs[0].shape) * np.max(inputs[1].shape) * 2**(self.input_types[0].referencedType.typeWidth)] @@ -281,9 +184,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class RQMatMulChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def 
_inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [operatorRepresentation['n_levels']] @@ -295,9 +195,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class RQGEMMChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [operatorRepresentation['n_levels']] @@ -307,14 +204,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [bool(operatorRepresentation["signed"])] -class ReduceMeanChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class ReduceMeanChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -326,9 +216,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class ReduceSumChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [operatorRepresentation['axisLength'] * 2**(self.input_types[0].referencedType.typeWidth)] @@ -343,9 +230,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class ReluChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - 
def _inferNumLevels(self, inputs, operatorRepresentation): return [2**(self.input_types[0].referencedType.typeWidth)] @@ -354,14 +238,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] -class SoftmaxChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class SoftmaxChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -370,9 +247,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class iNoNormChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(4 * self.input_types[0].referencedType.typeWidth)] @@ -385,14 +259,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] -class GELUChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class GELUChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -404,9 +271,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class HardswishChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], 
output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(4 * self.input_types[0].referencedType.typeWidth)] @@ -419,31 +283,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] -class RQHardswishChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed: - return [True] - else: - return [False] - - -class MaxPoolChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [inputs[0].nLevels] +class RQHardswishChecker(DummyChecker): def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -455,9 +295,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class ConvChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: weight = inputs[1] @@ -476,9 +313,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class RequantShiftChecker(SignPropTypeChecker): - def __init__(self, 
input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [operatorRepresentation['n_levels']] @@ -488,38 +322,8 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [operatorRepresentation["signed"]] -class DummyChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] - - -class DebugPrintChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [inputs[0].nLevels] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed: - return [True] - else: - return [False] - - class RQAddChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [operatorRepresentation['rqsOut_n_levels']] @@ -540,9 +344,6 @@ def checkOutputType(self, inputs: List[VariableBuffer], operatorRepresentation: class QuantChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, 
inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: # Calculate number of levels based on bit_width @@ -557,9 +358,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class DequantChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(self.output_types[0].referencedType.typeWidth)] @@ -571,9 +369,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class SoftmaxCrossEntropyLossChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: @@ -586,9 +381,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], class SGDChecker(SignPropTypeChecker): - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: return [2**(self.input_types[0].referencedType.typeWidth)] @@ -598,14 +390,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [True] -class BatchNormChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(self.input_types[0].referencedType.typeWidth)] +class BatchNormChecker(DummyChecker): def _inferSignedness(self, inputs: 
List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: diff --git a/DeeployTest/Tests/Kernels/FP32/Col2Im/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Col2Im/inputs.npz new file mode 100644 index 0000000000..9c4058dbcb Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Col2Im/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Col2Im/network.onnx b/DeeployTest/Tests/Kernels/FP32/Col2Im/network.onnx new file mode 100644 index 0000000000..fc03a38ccf Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Col2Im/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Col2Im/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Col2Im/outputs.npz new file mode 100644 index 0000000000..e81b80e67f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Col2Im/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/inputs.npz new file mode 100644 index 0000000000..f55ba85683 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/network.onnx b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/network.onnx new file mode 100644 index 0000000000..c4158441d2 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/outputs.npz new file mode 100644 index 0000000000..532fbdaa5a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_1D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/inputs.npz new file mode 100644 index 0000000000..c46f130fc3 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/network.onnx new file mode 100644 index 0000000000..d9bc804665 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/outputs.npz new file mode 100644 index 0000000000..fa872f8286 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose/Regular_2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Elu/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Elu/inputs.npz new file mode 100644 index 0000000000..070beaf015 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Elu/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Elu/network.onnx b/DeeployTest/Tests/Kernels/FP32/Elu/network.onnx new file mode 100644 index 0000000000..123fab1153 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Elu/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Elu/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Elu/outputs.npz new file mode 100644 index 0000000000..223e5bfc85 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Elu/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/LeakyRelu/inputs.npz b/DeeployTest/Tests/Kernels/FP32/LeakyRelu/inputs.npz new file mode 100644 index 0000000000..1ae95b34f8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/LeakyRelu/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/LeakyRelu/network.onnx b/DeeployTest/Tests/Kernels/FP32/LeakyRelu/network.onnx new file mode 100644 index 0000000000..038a4c7562 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/LeakyRelu/network.onnx differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/LeakyRelu/outputs.npz b/DeeployTest/Tests/Kernels/FP32/LeakyRelu/outputs.npz new file mode 100644 index 0000000000..3cf5e869a7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/LeakyRelu/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Resize/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Resize/inputs.npz new file mode 100644 index 0000000000..12d2c04d5d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Resize/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Resize/network.onnx b/DeeployTest/Tests/Kernels/FP32/Resize/network.onnx new file mode 100644 index 0000000000..569b3cf20a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Resize/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Resize/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Resize/outputs.npz new file mode 100644 index 0000000000..97ca7332b7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Resize/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ScatterElements/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ScatterElements/inputs.npz new file mode 100644 index 0000000000..2efbcd30c9 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ScatterElements/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ScatterElements/network.onnx b/DeeployTest/Tests/Kernels/FP32/ScatterElements/network.onnx new file mode 100644 index 0000000000..17031f8c19 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ScatterElements/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ScatterElements/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ScatterElements/outputs.npz new file mode 100644 index 0000000000..c7dae2412b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ScatterElements/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Selu/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Selu/inputs.npz new file mode 100644 index 0000000000..4565d80b6c 
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Selu/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Selu/network.onnx b/DeeployTest/Tests/Kernels/FP32/Selu/network.onnx new file mode 100644 index 0000000000..55a951a513 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Selu/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Selu/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Selu/outputs.npz new file mode 100644 index 0000000000..0811fb34d1 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Selu/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/Col2Im/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Col2Im/inputs.npz new file mode 100644 index 0000000000..5cccf345b2 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Col2Im/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/Col2Im/network.onnx b/DeeployTest/Tests/Kernels/Integer/Col2Im/network.onnx new file mode 100644 index 0000000000..9e31c06d27 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Col2Im/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/Col2Im/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Col2Im/outputs.npz new file mode 100644 index 0000000000..a36d20ef99 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Col2Im/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/Resize/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Resize/inputs.npz new file mode 100644 index 0000000000..9076dba614 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Resize/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/Resize/network.onnx b/DeeployTest/Tests/Kernels/Integer/Resize/network.onnx new file mode 100644 index 0000000000..fab06b2c64 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Resize/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/Resize/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Resize/outputs.npz 
new file mode 100644 index 0000000000..9ee4676afc Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Resize/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/ScatterElements/inputs.npz b/DeeployTest/Tests/Kernels/Integer/ScatterElements/inputs.npz new file mode 100644 index 0000000000..c034759e7c Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/ScatterElements/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/ScatterElements/network.onnx b/DeeployTest/Tests/Kernels/Integer/ScatterElements/network.onnx new file mode 100644 index 0000000000..77eca65237 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/ScatterElements/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/ScatterElements/outputs.npz b/DeeployTest/Tests/Kernels/Integer/ScatterElements/outputs.npz new file mode 100644 index 0000000000..41dbf1edde Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/ScatterElements/outputs.npz differ diff --git a/DeeployTest/test_generic_config.py b/DeeployTest/test_generic_config.py index eaea3d6400..abe14bfff0 100644 --- a/DeeployTest/test_generic_config.py +++ b/DeeployTest/test_generic_config.py @@ -12,6 +12,9 @@ "Kernels/FP32/AveragePool/Regular_2D", "Kernels/FP32/Ceil", "Kernels/FP32/Clip", + "Kernels/FP32/Col2Im", + "Kernels/FP32/ConvTranspose/Regular_1D", + "Kernels/FP32/ConvTranspose/Regular_2D", "Kernels/FP32/Conv/DW_2D_Bias", "Kernels/FP32/Conv/DW_2D_NoBias", "Kernels/FP32/Conv/DW_2D_ZeroValuedBias", @@ -19,6 +22,7 @@ "Kernels/FP32/Conv/Regular_2D_NoBias", "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias", "Kernels/FP32/Div", + "Kernels/FP32/Elu", "Kernels/FP32/Exp", "Kernels/FP32/Floor", "Kernels/FP32/GEMM/Regular", @@ -33,6 +37,7 @@ "Kernels/FP32/MaxPool/Regular_2D", "Kernels/FP32/Mul", "Kernels/FP32/LayerNorm", + "Kernels/FP32/LeakyRelu", "Kernels/FP32/RMSNorm", "Kernels/FP32/Pow/Scalar", "Kernels/FP32/Pow/Vector", @@ -55,6 +60,9 @@ 
"Kernels/FP32/ReduceMean/NoKeepDims/Axis2", "Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add", "Kernels/FP32/Reshape/SkipConnection", + "Kernels/FP32/Resize", + "Kernels/FP32/ScatterElements", + "Kernels/FP32/Selu", "Kernels/FP32/Sigmoid", "Kernels/FP32/Sqrt", "Kernels/FP32/Sub", @@ -64,6 +72,7 @@ "Kernels/Integer/Softmax/Regular", "Kernels/Integer/Add/MultIO", "Kernels/Integer/Add/Regular", + "Kernels/Integer/Col2Im", "Kernels/Integer/Conv/DW_1D", "Kernels/Integer/Conv/Regular_1D", "Kernels/Integer/Conv/DW_2D", @@ -77,6 +86,8 @@ "Kernels/Integer/Pad/Regular_2D", "Kernels/Integer/ReduceMean", "Kernels/Integer/ReduceSum", + "Kernels/Integer/Resize", + "Kernels/Integer/ScatterElements", "Kernels/Integer/Slice", "Kernels/Integer/Sub", # Special test from TinyViT model layers diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h index 2023b9e725..36c6e4cd74 100644 --- a/TargetLibraries/Generic/inc/DeeployBasicMath.h +++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h @@ -36,10 +36,12 @@ #include "kernel/BatchNorm.h" #include "kernel/Ceil.h" #include "kernel/Clip.h" -#include "kernel/ConvTranspose1d_fp32.h" +#include "kernel/Col2Im.h" +#include "kernel/ConvTranspose_fp32.h" #include "kernel/Convolution.h" #include "kernel/DWConvolution.h" #include "kernel/Div.h" +#include "kernel/Elu.h" #include "kernel/Exp.h" #include "kernel/Floor.h" #include "kernel/GELU.h" @@ -51,6 +53,7 @@ #include "kernel/HardSwish.h" #include "kernel/InstanceNorm.h" #include "kernel/Layernorm.h" +#include "kernel/LeakyRelu.h" #include "kernel/MatMul.h" #include "kernel/MaxPool.h" #include "kernel/Pow.h" @@ -60,6 +63,9 @@ #include "kernel/RQHardswish.h" #include "kernel/Relu.h" #include "kernel/RequantShift.h" +#include "kernel/Resize.h" +#include "kernel/Scatter.h" +#include "kernel/Selu.h" #include "kernel/Sigmoid.h" #include "kernel/Softmax.h" #include "kernel/Sqrt.h" diff --git a/TargetLibraries/Generic/inc/kernel/Col2Im.h 
b/TargetLibraries/Generic/inc/kernel/Col2Im.h new file mode 100644 index 0000000000..31e31e3180 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Col2Im.h @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __DEEPLOY_BASIC_MATH_COL2IM_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_COL2IM_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/******************************************************************************/ +/* Col2Im */ +/******************************************************************************/ + +/* Maximum supported number of spatial dimensions. */ +#define COL2IM_MAX_SPATIAL_DIMS 4 + +/* + * DECLARE_COL2IM_FN(SUFFIX, DATA_TYPE) + * + * Emits a forward declaration for Col2Im_<SUFFIX>. + * The matching definition lives in Col2Im.c via DEFINE_COL2IM_FN. + * + * Implements ONNX Col2Im semantics: + * input : (N, C * prod(block_shape), L) — column matrix + * output : (N, C, image_shape[0], ..., image_shape[P-1]) + * + * For each kernel position bk and output block ob the contribution is + * accumulated into the corresponding image location (with bounds checking + * to handle padding). The output is zero-initialised before accumulation. 
+ * + * pads layout: [p_0_begin, ..., p_{P-1}_begin, p_0_end, ..., p_{P-1}_end] + */ +#define DECLARE_COL2IM_FN(SUFFIX, DATA_TYPE) \ + void Col2Im_##SUFFIX(const DATA_TYPE *input, DATA_TYPE *output, int32_t N, \ + int32_t C, int32_t spatial_dims, \ + const int32_t *image_shape, const int32_t *block_shape, \ + const int32_t *dilations, const int32_t *pads, \ + const int32_t *strides) + +DECLARE_COL2IM_FN(fp32, float32_t); +DECLARE_COL2IM_FN(s8, int8_t); +DECLARE_COL2IM_FN(u8, uint8_t); + +#endif //__DEEPLOY_BASIC_MATH_COL2IM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/ConvTranspose1d_fp32.h b/TargetLibraries/Generic/inc/kernel/ConvTranspose1d_fp32.h deleted file mode 100644 index 40ef065992..0000000000 --- a/TargetLibraries/Generic/inc/kernel/ConvTranspose1d_fp32.h +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef CONV_TRANSPOSE1D_FP32_H -#define CONV_TRANSPOSE1D_FP32_H - -#include -#include - -void ConvTranspose1d_fp32(const float32_t *input, uint32_t C_in, uint32_t W_in, - const float32_t *weight, uint32_t C_out, uint32_t K, - uint32_t stride, const float32_t *bias, bool has_bias, - float32_t *output, uint32_t W_out); - -#endif // CONV_TRANSPOSE1D_FP32_H diff --git a/TargetLibraries/Generic/inc/kernel/ConvTranspose_fp32.h b/TargetLibraries/Generic/inc/kernel/ConvTranspose_fp32.h new file mode 100644 index 0000000000..7ff06f171e --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/ConvTranspose_fp32.h @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CONV_TRANSPOSE_FP32_H +#define CONV_TRANSPOSE_FP32_H + +#include +#include + +void ConvTranspose1d_fp32(const float32_t *input, uint32_t C_in, uint32_t W_in, + const float32_t *weight, uint32_t C_out, uint32_t K, + uint32_t stride, const float32_t *bias, bool has_bias, + float32_t *output, uint32_t 
W_out); + +void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *weight, + uint32_t C_out, uint32_t kH, uint32_t kW, + uint32_t stride_h, uint32_t stride_w, + const float32_t *bias, bool has_bias, + float32_t *output, uint32_t H_out, uint32_t W_out); + +#endif // CONV_TRANSPOSE_FP32_H diff --git a/TargetLibraries/Generic/inc/kernel/Elu.h b/TargetLibraries/Generic/inc/kernel/Elu.h new file mode 100644 index 0000000000..ac6d03c4ee --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Elu.h @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_ELU_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_ELU_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * element wise Exponential Linear Unit (ELU) function + */ + +/******************************************************************************/ +/* Elu */ +/******************************************************************************/ +void Elu_fp32_fp32(const float32_t *data_in, float32_t *data_out, int32_t size, + float32_t alpha); + +#endif //__DEEPLOY_BASIC_MATH_ELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/LeakyRelu.h b/TargetLibraries/Generic/inc/kernel/LeakyRelu.h new file mode 100644 index 0000000000..daa096c2a9 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/LeakyRelu.h @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_LEAKYRELU_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_LEAKYRELU_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * element wise LeakyRelu function + */ + +/******************************************************************************/ +/* LeakyRelu */ +/******************************************************************************/ +void LeakyRelu_fp32_fp32(const 
float32_t *data_in, float32_t *data_out, + int32_t size, float32_t alpha); + +#endif //__DEEPLOY_BASIC_MATH_LEAKYRELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Resize.h b/TargetLibraries/Generic/inc/kernel/Resize.h new file mode 100644 index 0000000000..153831f6bc --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Resize.h @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __DEEPLOY_BASIC_MATH_RESIZE_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_RESIZE_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* Maximum number of spatial dimensions (excludes batch N and channels C). */ +#define RESIZE_MAX_SPATIAL_DIMS 4 + +/* Element type — passed as a compile-time constant from generated code. */ +typedef enum { + RESIZE_TYPE_FLOAT32 = 0, + RESIZE_TYPE_INT8, + RESIZE_TYPE_UINT8, + RESIZE_TYPE_INT16, + RESIZE_TYPE_UINT16, + RESIZE_TYPE_INT32, + RESIZE_TYPE_UINT32, +} resize_type_t; + +/* Interpolation mode (mirrors ONNX Resize `mode` attribute). */ +typedef enum { + RESIZE_MODE_NEAREST = 0, + RESIZE_MODE_LINEAR, + RESIZE_MODE_CUBIC, +} resize_mode_t; + +/* Coordinate transformation mode. */ +typedef enum { + RESIZE_COORD_ASYMMETRIC = 0, + RESIZE_COORD_HALF_PIXEL, + RESIZE_COORD_HALF_PIXEL_SYMMETRIC, + RESIZE_COORD_PYTORCH_HALF_PIXEL, + RESIZE_COORD_ALIGN_CORNERS, + RESIZE_COORD_TF_CROP_AND_RESIZE, +} resize_coord_mode_t; + +/* Nearest-neighbour rounding mode. */ +typedef enum { + RESIZE_NEAREST_FLOOR = 0, + RESIZE_NEAREST_CEIL, + RESIZE_NEAREST_ROUND_PREFER_FLOOR, + RESIZE_NEAREST_ROUND_PREFER_CEIL, +} resize_nearest_mode_t; + +/* + * Resize — single function for all element types. 
+ * + * input / output – NCHW tensors (void* to stay type-agnostic) + * type_tag – element type; drives element size and float conversion + * N, C – batch size and number of channels + * spatial_dims – number of spatial dimensions (1..RESIZE_MAX_SPATIAL_DIMS) + * input_shape – spatial sizes of the input [d0, d1, …] + * output_shape – spatial sizes of the output [d0, d1, …] + * mode – interpolation mode + * coord_mode – coordinate transformation mode + * nearest_mode – rounding mode (only used when mode == RESIZE_MODE_NEAREST) + */ +void Resize(const void *input, void *output, resize_type_t type_tag, int32_t N, + int32_t C, int32_t spatial_dims, const int32_t *input_shape, + const int32_t *output_shape, resize_mode_t mode, + resize_coord_mode_t coord_mode, resize_nearest_mode_t nearest_mode); + +#endif // __DEEPLOY_BASIC_MATH_RESIZE_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Scatter.h b/TargetLibraries/Generic/inc/kernel/Scatter.h new file mode 100644 index 0000000000..8f4d2a41ca --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Scatter.h @@ -0,0 +1,45 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_SCATTER_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_SCATTER_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/******************************************************************************/ +/* Scatter */ +/******************************************************************************/ + +/* Maximum supported tensor rank. */ +#define SCATTER_MAX_NDIM 8 + +/* Reduction modes (mirrors ONNX ScatterElements `reduction` attribute). */ +typedef enum { + SCATTER_REDUCTION_NONE = 0, + SCATTER_REDUCTION_ADD, + SCATTER_REDUCTION_MUL, + SCATTER_REDUCTION_MIN, + SCATTER_REDUCTION_MAX, +} scatter_reduction_t; + +/* + * DECLARE_SCATTER_FN(SUFFIX, DATA_TYPE) + * + * Emits a forward declaration for Scatter_. 
+ * The matching definition lives in Scatter.c via DEFINE_SCATTER_FN.
+ */
+#define DECLARE_SCATTER_FN(SUFFIX, DATA_TYPE)                                  \
+  void Scatter_##SUFFIX(const DATA_TYPE *data, const int32_t *indices,         \
+                        const DATA_TYPE *updates, DATA_TYPE *output,           \
+                        int32_t ndim, const int32_t *data_shape,               \
+                        const int32_t *indices_shape, int32_t axis,            \
+                        scatter_reduction_t reduction)
+
+DECLARE_SCATTER_FN(fp32, float32_t);
+DECLARE_SCATTER_FN(s8, int8_t);
+DECLARE_SCATTER_FN(u8, uint8_t);
+
+#endif //__DEEPLOY_BASIC_MATH_SCATTER_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Selu.h b/TargetLibraries/Generic/inc/kernel/Selu.h
new file mode 100644
index 0000000000..225ec75df8
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Selu.h
@@ -0,0 +1,22 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_SELU_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_SELU_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * Element-wise SELU: output = gamma * (x >= 0 ? x : alpha * (exp(x) - 1))
+ */
+
+/******************************************************************************/
+/*                                    Selu                                    */
+/******************************************************************************/
+void Selu_fp32_fp32(const float32_t *input, float32_t *output, int32_t size,
+                    float32_t alpha, float32_t gamma);
+
+#endif //__DEEPLOY_BASIC_MATH_SELU_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/src/Col2Im.c b/TargetLibraries/Generic/src/Col2Im.c
new file mode 100644
index 0000000000..94e715e620
--- /dev/null
+++ b/TargetLibraries/Generic/src/Col2Im.c
@@ -0,0 +1,86 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DeeployBasicMath.h"
+
+/*
+ * DEFINE_COL2IM_FN(SUFFIX, DATA_TYPE)
+ *
+ * Defines Col2Im_<SUFFIX>. Returns without touching output when N, C, or any
+ * image/block/stride/dilation entry is non-positive, or any pad is negative.
+ */
+// clang-format off
+#define DEFINE_COL2IM_FN(SUFFIX, DATA_TYPE) \
+  DECLARE_COL2IM_FN(SUFFIX, DATA_TYPE) { \
+    if (spatial_dims < 1 || spatial_dims > COL2IM_MAX_SPATIAL_DIMS || \
+        N <= 0 || C <= 0) \
+      return; \
+    for (int32_t p = 0; p < spatial_dims; p++) { \
+      if (image_shape[p] <= 0 || block_shape[p] <= 0 || strides[p] <= 0 || \
+          dilations[p] <= 0 || pads[p] < 0 || pads[p + spatial_dims] < 0) \
+        return; \
+    } \
+    /* Compute per-dim sliding-window sizes, L, block_volume, image_volume. */ \
+    int32_t col_dims[COL2IM_MAX_SPATIAL_DIMS]; \
+    int32_t col_strides[COL2IM_MAX_SPATIAL_DIMS]; \
+    int32_t blk_strides[COL2IM_MAX_SPATIAL_DIMS]; \
+    int32_t img_strides[COL2IM_MAX_SPATIAL_DIMS]; \
+    int32_t L = 1, block_volume = 1, image_volume = 1; \
+    for (int32_t p = 0; p < spatial_dims; p++) { \
+      col_dims[p] = (image_shape[p] + pads[p] + pads[p + spatial_dims] \
+                     - dilations[p] * (block_shape[p] - 1) - 1) \
+                    / strides[p] + 1; \
+      if (col_dims[p] <= 0) return; \
+      L *= col_dims[p]; \
+      block_volume *= block_shape[p]; \
+      image_volume *= image_shape[p]; \
+    } \
+    /* Row-major strides for flat-index decomposition. */ \
+    col_strides[spatial_dims - 1] = 1; \
+    blk_strides[spatial_dims - 1] = 1; \
+    img_strides[spatial_dims - 1] = 1; \
+    for (int32_t p = spatial_dims - 2; p >= 0; p--) { \
+      col_strides[p] = col_strides[p + 1] * col_dims[p + 1]; \
+      blk_strides[p] = blk_strides[p + 1] * block_shape[p + 1]; \
+      img_strides[p] = img_strides[p + 1] * image_shape[p + 1]; \
+    } \
+    /* Zero-initialise output; widen to size_t before multiplying. */ \
+    memset(output, 0, (size_t)N * (size_t)C * (size_t)image_volume * sizeof(DATA_TYPE)); \
+    /* Accumulate each column entry into its image position. */ \
+    for (int32_t n = 0; n < N; n++) { \
+      for (int32_t c = 0; c < C; c++) { \
+        for (int32_t bk = 0; bk < block_volume; bk++) { \
+          /* Decompose kernel flat index once per bk. */ \
+          int32_t k_coords[COL2IM_MAX_SPATIAL_DIMS]; \
+          int32_t bk_rem = bk; \
+          for (int32_t p = 0; p < spatial_dims; p++) { \
+            k_coords[p] = bk_rem / blk_strides[p]; \
+            bk_rem -= k_coords[p] * blk_strides[p]; \
+          } \
+          for (int32_t ob = 0; ob < L; ob++) { \
+            /* Decompose output-block flat index and map to image coords. */ \
+            int32_t ob_rem = ob, img_flat = 0, in_bounds = 1; \
+            for (int32_t p = 0; p < spatial_dims; p++) { \
+              int32_t o_coord = ob_rem / col_strides[p]; \
+              ob_rem -= o_coord * col_strides[p]; \
+              int32_t h = o_coord * strides[p] - pads[p] \
+                          + k_coords[p] * dilations[p]; \
+              if (h < 0 || h >= image_shape[p]) { in_bounds = 0; break; } \
+              img_flat += h * img_strides[p]; \
+            } \
+            if (in_bounds) { \
+              int32_t in_flat = (n * C * block_volume \
+                                 + c * block_volume + bk) * L + ob; \
+              output[(n * C + c) * image_volume + img_flat] += input[in_flat]; \
+            } \
+          } \
+        } \
+      } \
+    } \
+  }
+// clang-format on
+
+DEFINE_COL2IM_FN(fp32, float32_t)
+DEFINE_COL2IM_FN(s8, int8_t)
+DEFINE_COL2IM_FN(u8, uint8_t)
diff --git a/TargetLibraries/Generic/src/ConvTranspose1d_fp32.c b/TargetLibraries/Generic/src/ConvTranspose1d_fp32.c
deleted file mode 100644
index 362058734e..0000000000
--- a/TargetLibraries/Generic/src/ConvTranspose1d_fp32.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include "DeeployBasicMath.h"
-
-void ConvTranspose1d_fp32(const float32_t *input, uint32_t C_in, uint32_t W_in,
-                          const float32_t *weight, uint32_t C_out, uint32_t K,
-                          uint32_t stride, const float32_t *bias, bool has_bias,
-                          float32_t *output, uint32_t W_out) {
-  /*
-    input: [C_in, W_in]
-    weight: [C_in, C_out, K]
-    output: [C_out, W_out]
-    bias: [C_out] optionally
-
-  */
-
-  // Output initialization
-  for (uint32_t c = 0; c < C_out; ++c) {
-    for (uint32_t w = 0; w < W_out; ++w) {
-      output[c * W_out + w] = 0.0f;
-    }
-  }
-
-  // For each output channel
-  for (uint32_t cout = 0; cout < C_out; ++cout) {
-    // For each input channel
-    for (uint32_t cin = 0; cin < C_in; ++cin) {
-      // For each input width
-      for (uint32_t w_in = 0; w_in < W_in; ++w_in) {
-        float32_t val = input[cin * W_in + w_in];
-        // Transposed convolution: output width is calculated based on stride
-        for (uint32_t k = 0; k < K; ++k) {
-
uint32_t w_out = w_in * stride + k;
-          if (w_out < W_out) {
-            // weight indexing: weight[cin, cout, k]
-            float32_t wgt = weight[cin * (C_out * K) + cout * K + k];
-            output[cout * W_out + w_out] += val * wgt;
-          }
-        }
-      }
-    }
-    if (has_bias) {
-      for (uint32_t w = 0; w < W_out; ++w) {
-        output[cout * W_out + w] += bias[cout];
-      }
-    }
-  }
-}
diff --git a/TargetLibraries/Generic/src/ConvTranspose_fp32.c b/TargetLibraries/Generic/src/ConvTranspose_fp32.c
new file mode 100644
index 0000000000..a64dcdb582
--- /dev/null
+++ b/TargetLibraries/Generic/src/ConvTranspose_fp32.c
@@ -0,0 +1,102 @@
+// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DeeployBasicMath.h"
+
+void ConvTranspose1d_fp32(const float32_t *input, uint32_t C_in, uint32_t L_in,
+                          const float32_t *weight, uint32_t C_out, uint32_t K,
+                          uint32_t stride, const float32_t *bias, bool has_bias,
+                          float32_t *output, uint32_t L_out) {
+  /*
+    input:  [C_in, L_in]
+    weight: [C_in, C_out, K]
+    output: [C_out, L_out]
+    bias:   [C_out] (optional)
+
+    Taps that would land at or beyond L_out are dropped, so an output buffer
+    shorter than (L_in - 1) * stride + K is never written out of bounds.
+  */
+
+  // Seed every output row with its bias (or zero) before accumulating.
+  for (uint32_t cout = 0; cout < C_out; ++cout) {
+    float32_t b = has_bias ? bias[cout] : 0.0f;
+    float32_t *out_row = output + cout * L_out;
+    for (uint32_t l = 0; l < L_out; ++l)
+      out_row[l] = b;
+  }
+
+  for (uint32_t cout = 0; cout < C_out; ++cout) {
+    float32_t *out_row = output + cout * L_out;
+    for (uint32_t cin = 0; cin < C_in; ++cin) {
+      const float32_t *in_row = input + cin * L_in;
+      const float32_t *wgt_row = weight + cin * (C_out * K) + cout * K;
+      for (uint32_t l_in = 0; l_in < L_in; ++l_in) {
+        float32_t val = in_row[l_in];
+        uint32_t base = l_in * stride;
+        if (base >= L_out)
+          continue;
+        // Clip the kernel window to the part that fits in [base, L_out).
+        uint32_t k_end = (K < L_out - base) ? K : (L_out - base);
+        for (uint32_t k = 0; k < k_end; ++k)
+          out_row[base + k] += val * wgt_row[k];
+      }
+    }
+  }
+}
+
+void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in,
+                          uint32_t W_in, const float32_t *weight,
+                          uint32_t C_out, uint32_t kH, uint32_t kW,
+                          uint32_t stride_h, uint32_t stride_w,
+                          const float32_t *bias, bool has_bias,
+                          float32_t *output, uint32_t H_out, uint32_t W_out) {
+  /*
+    input:  [C_in, H_in, W_in]
+    weight: [C_in, C_out, kH, kW]
+    output: [C_out, H_out, W_out]
+    bias:   [C_out] (optional)
+
+    Taps that would land outside the H_out x W_out plane are dropped, so an
+    undersized output buffer is never written out of bounds.
+  */
+
+  // Seed every output plane with its bias (or zero) before accumulating.
+  for (uint32_t cout = 0; cout < C_out; ++cout) {
+    float32_t b = has_bias ? bias[cout] : 0.0f;
+    float32_t *out_ch = output + cout * H_out * W_out;
+    for (uint32_t i = 0; i < H_out * W_out; ++i)
+      out_ch[i] = b;
+  }
+
+  for (uint32_t cout = 0; cout < C_out; ++cout) {
+    float32_t *out_ch = output + cout * H_out * W_out;
+    for (uint32_t cin = 0; cin < C_in; ++cin) {
+      const float32_t *in_ch = input + cin * H_in * W_in;
+      const float32_t *wgt_ch =
+          weight + cin * (C_out * kH * kW) + cout * (kH * kW);
+      for (uint32_t h_in = 0; h_in < H_in; ++h_in) {
+        const float32_t *in_row = in_ch + h_in * W_in;
+        uint32_t h_base = h_in * stride_h;
+        if (h_base >= H_out)
+          continue;
+        // Clip the kernel rows to the part that fits in [h_base, H_out).
+        uint32_t kh_end = (kH < H_out - h_base) ? kH : (H_out - h_base);
+        for (uint32_t w_in = 0; w_in < W_in; ++w_in) {
+          float32_t val = in_row[w_in];
+          uint32_t w_base = w_in * stride_w;
+          if (w_base >= W_out)
+            continue;
+          // Clip the kernel columns to the part that fits in [w_base, W_out).
+          uint32_t kw_end = (kW < W_out - w_base) ? kW : (W_out - w_base);
+          for (uint32_t kh = 0; kh < kh_end; ++kh) {
+            float32_t *out_row = out_ch + (h_base + kh) * W_out + w_base;
+            const float32_t *wgt_krow = wgt_ch + kh * kW;
+            for (uint32_t kw = 0; kw < kw_end; ++kw)
+              out_row[kw] += val * wgt_krow[kw];
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/Generic/src/Elu_fp32.c b/TargetLibraries/Generic/src/Elu_fp32.c
new file mode 100644
index 0000000000..c71a1c9282
--- /dev/null
+++ b/TargetLibraries/Generic/src/Elu_fp32.c
@@ -0,0 +1,20 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Elu_fp32_fp32(const float32_t *input, float32_t *output, int32_t size,
+                   float32_t alpha) {
+
+  for (int i = 0; i < size; i++) {
+    if (input[i] >= 0) {
+      output[i] = input[i];
+    } else {
+      output[i] = alpha * (expf(input[i]) - 1.0f);
+    }
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c b/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c
index 907de4bb90..6df133b8fe 100644
--- a/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c
+++ b/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c
@@ -22,7 +22,7 @@ void GlobalAveragePool_fp32_fp32(float32_t const *__restrict__ src,
       for (uint32_t i = 0; i < spatial_size;
++i) { sum += x[i]; } - dst[n * C + c] = sum / spatial_size; + dst[n * C + c] = sum / (float32_t)spatial_size; } } } \ No newline at end of file diff --git a/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c b/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c index 209404494c..92164eaa72 100644 --- a/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c +++ b/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c @@ -15,17 +15,13 @@ void GlobalMaxPool_fp32_fp32(float32_t const *__restrict__ src, } for (uint32_t n = 0; n < N; n++) { for (uint32_t c = 0; c < C; c++) { - - float32_t sum = 0.0f; const float32_t *x = src + (n * C + c) * spatial_size; - float32_t max = x[0]; for (uint32_t i = 1; i < spatial_size; i++) { if (x[i] > max) { max = x[i]; } } - dst[n * C + c] = max; } } diff --git a/TargetLibraries/Generic/src/HardSwish_fp32.c b/TargetLibraries/Generic/src/HardSwish_fp32.c index 4776586fff..632123bc98 100644 --- a/TargetLibraries/Generic/src/HardSwish_fp32.c +++ b/TargetLibraries/Generic/src/HardSwish_fp32.c @@ -11,6 +11,6 @@ void HardSwish_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) { for (int i = 0; i < size; i++) { float32_t x = data_in[i]; - data_out[i] = x * fmaxf(0, fminf(1, x / 6 + 0.5)); + data_out[i] = x * fmaxf(0, fminf(1, x / 6.0f + 0.5f)); } } diff --git a/TargetLibraries/Generic/src/Layernorm_fp32.c b/TargetLibraries/Generic/src/Layernorm_fp32.c index fb68df8dfe..b0eec7d8df 100644 --- a/TargetLibraries/Generic/src/Layernorm_fp32.c +++ b/TargetLibraries/Generic/src/Layernorm_fp32.c @@ -42,7 +42,7 @@ void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, float32_t *bias, float32_t epsilon, int32_t size, int32_t lastDimLength) { float32_t mean, variance, std, inv_std; - float32_t sum_dy, sum_dy_scaled, sum_dy_scaled_centered; + float32_t sum_dy, sum_dy_scaled; float32_t centered_input; for (int i = 0; i < (size / lastDimLength); i++) { @@ -53,26 +53,26 @@ void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, for 
(int j = 0; j < lastDimLength; j++) { mean += data_in[j + i * lastDimLength]; } - mean = mean / lastDimLength; + mean = mean / (float32_t)lastDimLength; for (int j = 0; j < lastDimLength; j++) { centered_input = data_in[j + i * lastDimLength] - mean; variance += centered_input * centered_input; } - variance = variance / lastDimLength; + variance = variance / (float32_t)lastDimLength; variance += epsilon; std = sqrtf(variance); inv_std = 1.0f / std; // RW: Step 2: Compute intermediate values needed for gradient calculation sum_dy = 0.0f; - sum_dy_scaled_centered = 0.0f; + sum_dy_scaled = 0.0f; // RW: Calculate sum(dy) and sum(dy * scale * (x - mean) / std) for (int j = 0; j < lastDimLength; j++) { sum_dy += grad_in[j + i * lastDimLength]; centered_input = data_in[j + i * lastDimLength] - mean; - sum_dy_scaled_centered += + sum_dy_scaled += grad_in[j + i * lastDimLength] * scale[j] * centered_input * inv_std; } @@ -85,9 +85,10 @@ void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, // (x-mean)/(N*std^2)*sum(dy*scale*(x-mean)/std)) grad_out[j + i * lastDimLength] = inv_std * scale[j] * - (grad_in[j + i * lastDimLength] - (sum_dy / lastDimLength) - - (centered_input * inv_std * inv_std / lastDimLength) * - sum_dy_scaled_centered); + (grad_in[j + i * lastDimLength] - + (sum_dy / (float32_t)lastDimLength) - + (centered_input * inv_std * inv_std / (float32_t)lastDimLength) * + sum_dy_scaled); } } } diff --git a/TargetLibraries/Generic/src/LeakyRelu_fp32.c b/TargetLibraries/Generic/src/LeakyRelu_fp32.c new file mode 100644 index 0000000000..3994b98937 --- /dev/null +++ b/TargetLibraries/Generic/src/LeakyRelu_fp32.c @@ -0,0 +1,19 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" + +void LeakyRelu_fp32_fp32(const float32_t *input, float32_t *output, + int32_t size, float32_t alpha) { + + for (int i = 0; i < size; i++) { + if (input[i] >= 0) { + 
output[i] = input[i];
+    } else {
+      output[i] = alpha * input[i];
+    }
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/Resize.c b/TargetLibraries/Generic/src/Resize.c
new file mode 100644
index 0000000000..e0b19791f3
--- /dev/null
+++ b/TargetLibraries/Generic/src/Resize.c
@@ -0,0 +1,268 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <math.h>
+
+#include "DeeployBasicMath.h"
+
+/* Number of bytes per element. */
+static inline uint32_t _resize_element_size(resize_type_t type_tag) {
+  switch (type_tag) {
+  case RESIZE_TYPE_FLOAT32:
+    return sizeof(float32_t);
+  case RESIZE_TYPE_INT16:
+  case RESIZE_TYPE_UINT16:
+    return sizeof(int16_t);
+  case RESIZE_TYPE_INT32:
+  case RESIZE_TYPE_UINT32:
+    return sizeof(int32_t);
+  default: /* INT8, UINT8 */
+    return sizeof(int8_t);
+  }
+}
+
+/* Read one element as float for use in the linear interpolation path. */
+static inline float32_t _resize_read(const void *buf, int32_t idx,
+                                     resize_type_t type_tag) {
+  switch (type_tag) {
+  case RESIZE_TYPE_FLOAT32:
+    return ((const float32_t *)buf)[idx];
+  case RESIZE_TYPE_INT8:
+    return (float32_t)((const int8_t *)buf)[idx];
+  case RESIZE_TYPE_UINT8:
+    return (float32_t)((const uint8_t *)buf)[idx];
+  case RESIZE_TYPE_INT16:
+    return (float32_t)((const int16_t *)buf)[idx];
+  case RESIZE_TYPE_UINT16:
+    return (float32_t)((const uint16_t *)buf)[idx];
+  case RESIZE_TYPE_INT32:
+    return (float32_t)((const int32_t *)buf)[idx];
+  default: /* RESIZE_TYPE_UINT32 */
+    return (float32_t)((const uint32_t *)buf)[idx];
+  }
+}
+
+/* Round val to the nearest integer, breaking ties toward the nearest even
+ * integer (banker's rounding). This matches numpy.round / the ONNX reference
+ * implementation. */
+static inline float32_t _round_half_to_even(float32_t val) {
+  float32_t f = floorf(val);
+  float32_t diff = val - f;
+  if (diff < 0.5f)
+    return f;
+  if (diff > 0.5f)
+    return f + 1.0f;
+  /* exactly 0.5: pick the even neighbour */
+  return (fmodf(f, 2.0f) == 0.0f) ? f : f + 1.0f;
+}
+
+/* Write a float result back as the element's native type. */
+static inline void _resize_write(void *buf, int32_t idx, float32_t val,
+                                 resize_type_t type_tag) {
+  switch (type_tag) {
+  case RESIZE_TYPE_FLOAT32:
+    ((float32_t *)buf)[idx] = val;
+    break;
+  case RESIZE_TYPE_INT8:
+    ((int8_t *)buf)[idx] = (int8_t)_round_half_to_even(val);
+    break;
+  case RESIZE_TYPE_UINT8:
+    ((uint8_t *)buf)[idx] = (uint8_t)_round_half_to_even(val);
+    break;
+  case RESIZE_TYPE_INT16:
+    ((int16_t *)buf)[idx] = (int16_t)_round_half_to_even(val);
+    break;
+  case RESIZE_TYPE_UINT16:
+    ((uint16_t *)buf)[idx] = (uint16_t)_round_half_to_even(val);
+    break;
+  case RESIZE_TYPE_INT32:
+    ((int32_t *)buf)[idx] = (int32_t)_round_half_to_even(val);
+    break;
+  default: /* RESIZE_TYPE_UINT32 */
+    ((uint32_t *)buf)[idx] = (uint32_t)_round_half_to_even(val);
+    break;
+  }
+}
+
+/* Map an output coordinate to its source coordinate in the input. */
+static float32_t _resize_get_coord(int32_t out_idx, int32_t in_size,
+                                   int32_t out_size,
+                                   resize_coord_mode_t coord_mode) {
+  float32_t x_scale = (float32_t)out_size / (float32_t)in_size;
+  switch (coord_mode) {
+
+  case RESIZE_COORD_HALF_PIXEL:
+    return ((float32_t)out_idx + 0.5f) / x_scale - 0.5f;
+
+  case RESIZE_COORD_HALF_PIXEL_SYMMETRIC: {
+    float32_t adjustment =
+        (float32_t)out_size / (floorf((float32_t)in_size + 0.5f) * x_scale);
+    return ((float32_t)out_idx + 0.5f) / x_scale * adjustment - 0.5f;
+  }
+  case RESIZE_COORD_ALIGN_CORNERS:
+    if (out_size == 1)
+      return 0.0f;
+    return (float32_t)out_idx * (float32_t)(in_size - 1) /
+           (float32_t)(out_size - 1);
+
+  case RESIZE_COORD_PYTORCH_HALF_PIXEL:
+    if (out_size == 1)
+      return 0.0f;
+    return ((float32_t)out_idx + 0.5f) / x_scale - 0.5f;
+
+  default: /* RESIZE_COORD_ASYMMETRIC */
+    return (float32_t)out_idx / x_scale;
+  }
+}
+
+/* Round a source coordinate to the nearest input index and clamp to [0,
+ * max_idx]. */
+static int32_t _resize_nearest_idx(float32_t x, int32_t max_idx,
+                                   resize_nearest_mode_t nearest_mode) {
+  int32_t in_idx;
+  switch (nearest_mode) {
+  case RESIZE_NEAREST_CEIL:
+    in_idx = (int32_t)ceilf(x);
+    break;
+  case RESIZE_NEAREST_ROUND_PREFER_FLOOR:
+    /* At exactly n+0.5 choose floor; otherwise standard round. */
+    in_idx = (x - floorf(x) == 0.5f) ? (int32_t)floorf(x) : (int32_t)roundf(x);
+    break;
+  case RESIZE_NEAREST_ROUND_PREFER_CEIL:
+    in_idx = (int32_t)roundf(x);
+    break;
+  default: /* RESIZE_NEAREST_FLOOR */
+    in_idx = (int32_t)floorf(x);
+    break;
+  }
+  return CLAMP(in_idx, 0, max_idx);
+}
+
+/* Resize an (N, C, spatial...) tensor to output_shape using the selected
+ * interpolation, coordinate and rounding modes. Returns without writing
+ * output when an argument is invalid or the mode is unimplemented (cubic
+ * interpolation, tf_crop_and_resize). */
+void Resize(const void *input, void *output, resize_type_t type_tag, int32_t N,
+            int32_t C, int32_t spatial_dims, const int32_t *input_shape,
+            const int32_t *output_shape, resize_mode_t mode,
+            resize_coord_mode_t coord_mode,
+            resize_nearest_mode_t nearest_mode) {
+
+  if (N <= 0 || C <= 0 || spatial_dims < 1 ||
+      spatial_dims > RESIZE_MAX_SPATIAL_DIMS)
+    return;
+
+  /* not implemented */
+  if (mode == RESIZE_MODE_CUBIC ||
+      coord_mode == RESIZE_COORD_TF_CROP_AND_RESIZE)
+    return;
+
+  /* Every extent must be positive: a zero input extent divides by zero in
+   * _resize_get_coord and leaves no valid source element to read. */
+  for (int32_t d = 0; d < spatial_dims; d++) {
+    if (input_shape[d] <= 0 || output_shape[d] <= 0)
+      return;
+  }
+
+  uint32_t elem_size = _resize_element_size(type_tag);
+
+  /* Row-major strides for the spatial dimensions. */
+  int32_t out_strides[RESIZE_MAX_SPATIAL_DIMS];
+  int32_t in_strides[RESIZE_MAX_SPATIAL_DIMS];
+  int32_t L_out = 1, L_in = 1;
+  out_strides[spatial_dims - 1] = 1;
+  in_strides[spatial_dims - 1] = 1;
+  for (int32_t d = spatial_dims - 2; d >= 0; d--) {
+    out_strides[d] = out_strides[d + 1] * output_shape[d + 1];
+    in_strides[d] = in_strides[d + 1] * input_shape[d + 1];
+  }
+  for (int32_t d = 0; d < spatial_dims; d++) {
+    L_out *= output_shape[d];
+    L_in *= input_shape[d];
+  }
+
+  for (int32_t n = 0; n < N; n++) {
+    for (int32_t c = 0; c < C; c++) {
+      int32_t in_base = (n * C + c) * L_in;
+      int32_t out_base = (n * C + c) * L_out;
+
+      for (int32_t oi = 0; oi < L_out; oi++) {
+        int32_t rem = oi;
+
+        if (mode == RESIZE_MODE_NEAREST) {
+          /* Nearest-neighbour: map each spatial coord and copy the element. */
+          int32_t in_flat = 0;
+          for (int32_t d = 0; d < spatial_dims; d++) {
+            int32_t out_idx = rem / out_strides[d];
+            rem -= out_idx * out_strides[d];
+            float32_t x = _resize_get_coord(out_idx, input_shape[d],
+                                            output_shape[d], coord_mode);
+            in_flat +=
+                _resize_nearest_idx(x, input_shape[d] - 1, nearest_mode) *
+                in_strides[d];
+          }
+          memcpy((char *)output + (uint32_t)(out_base + oi) * elem_size,
+                 (const char *)input +
+                     (uint32_t)(in_base + in_flat) * elem_size,
+                 elem_size);
+
+        } else {
+
+          float32_t x_in[RESIZE_MAX_SPATIAL_DIMS]; // fractional input coord
+          int32_t lo[RESIZE_MAX_SPATIAL_DIMS];     // index lower than x_in
+          int32_t hi[RESIZE_MAX_SPATIAL_DIMS];     // index higher than x_in
+          float32_t w_lo[RESIZE_MAX_SPATIAL_DIMS]; // low interpolation weight
+          float32_t w_hi[RESIZE_MAX_SPATIAL_DIMS]; // high interpolation weight
+
+          /*
+            prepares the data for the N-linear interpolation that follows.
+            For each spatial dimension d:
+            - Extract the per-dimension output coordinate from the flat index oi
+            - Map the output coordinate to a fractional input coordinate x_in
+            - Find the two bracketing input indices (lo, hi) along dimension d
+            - Compute interpolation weights (w_hi, w_lo)
+          */
+          for (int32_t d = 0; d < spatial_dims; d++) {
+            int32_t out_idx = rem / out_strides[d];
+            rem -= out_idx * out_strides[d]; // flat output index
+            x_in[d] = _resize_get_coord(out_idx, input_shape[d],
+                                        output_shape[d], coord_mode);
+            x_in[d] = CLAMP(x_in[d], 0.0f, (float32_t)(input_shape[d] - 1));
+            lo[d] = (int32_t)floorf(x_in[d]);
+            hi[d] = (lo[d] + 1 < input_shape[d]) ? lo[d] + 1 : lo[d];
+            w_hi[d] = x_in[d] - (float32_t)lo[d];
+            w_lo[d] = 1.0f - w_hi[d];
+          }
+
+          /*
+            N-linear interpolation: weighted sum over the 2^spatial_dims corners.
+
+            example: spatial_dims = 2 (bilinear), there are 2^2 = 4 corners
+            corner=0 (bits: 00) → (lo[0], lo[1])   weight = w_lo[0] * w_lo[1]
+            corner=1 (bits: 01) → (hi[0], lo[1])   weight = w_hi[0] * w_lo[1]
+            corner=2 (bits: 10) → (lo[0], hi[1])   weight = w_lo[0] * w_hi[1]
+            corner=3 (bits: 11) → (hi[0], hi[1])   weight = w_hi[0] * w_hi[1]
+          */
+          float32_t result = 0.0f;
+          for (int32_t corner = 0, n_corners = 1 << spatial_dims;
+               corner < n_corners; corner++) {
+            float32_t weight = 1.0f;
+            int32_t in_flat = 0;
+            for (int32_t d = 0; d < spatial_dims; d++) {
+              if ((corner >> d) & 1) {
+                weight *= w_hi[d];
+                in_flat += hi[d] * in_strides[d];
+              } else {
+                weight *= w_lo[d];
+                in_flat += lo[d] * in_strides[d];
+              }
+            }
+            result += weight * _resize_read(input, in_base + in_flat, type_tag);
+          }
+          _resize_write(output, out_base + oi, result, type_tag);
+        }
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/Generic/src/Scatter.c b/TargetLibraries/Generic/src/Scatter.c
new file mode 100644
index 0000000000..f7e5185013
--- /dev/null
+++ b/TargetLibraries/Generic/src/Scatter.c
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+
+/*
+ * DEFINE_SCATTER_FN(SUFFIX, DATA_TYPE)
+ *
+ * Defines Scatter_<SUFFIX>: copies data into output, then writes (or reduces)
+ * updates[i] into the output element whose coordinate along `axis` is
+ * indices[i] and whose other coordinates are those of i in indices_shape.
+ *
+ * Returns without writing when ndim is outside [1, SCATTER_MAX_NDIM], axis is
+ * outside [-ndim, ndim - 1], a data_shape entry is <= 0 or an indices_shape
+ * entry is < 0. Updates whose target coordinate is out of range are skipped.
+ */
+// clang-format off
+#define DEFINE_SCATTER_FN(SUFFIX, DATA_TYPE) \
+  DECLARE_SCATTER_FN(SUFFIX, DATA_TYPE) { \
+    if (ndim < 1 || ndim > SCATTER_MAX_NDIM) \
+      return; \
+    /* Negative axes count from the last dimension. */ \
+    if (axis < 0) \
+      axis += ndim; \
+    if (axis < 0 || axis >= ndim) \
+      return; \
+    int32_t data_size = 1; \
+    int32_t indices_size = 1; \
+    for (int32_t dim = 0; dim < ndim; dim++) { \
+      if (data_shape[dim] <= 0 || indices_shape[dim] < 0) \
+        return; \
+      data_size *= data_shape[dim]; \
+      indices_size *= indices_shape[dim]; \
+    } \
+    /* memcpy on identical buffers is undefined; skip it when in place. */ \
+    if (output != data) \
+      memcpy(output, data, (size_t)data_size * sizeof(DATA_TYPE)); \
+    int32_t stride_data[SCATTER_MAX_NDIM]; \
+    int32_t stride_idx[SCATTER_MAX_NDIM]; \
+    stride_data[ndim - 1] = 1; \
+    stride_idx[ndim - 1] = 1; \
+    for (int32_t dim = ndim - 2; dim >= 0; dim--) { \
+      stride_data[dim] = stride_data[dim + 1] * data_shape[dim + 1]; \
+      stride_idx[dim] = stride_idx[dim + 1] * indices_shape[dim + 1]; \
+    } \
+    for (int32_t fi = 0; fi < indices_size; fi++) { \
+      int32_t out_idx = 0; \
+      int32_t rem = fi; \
+      int32_t in_bounds = 1; \
+      for (int32_t dim = 0; dim < ndim; dim++) { \
+        int32_t coord = rem / stride_idx[dim]; \
+        rem -= coord * stride_idx[dim]; \
+        if (dim == axis) { \
+          /* Along the scatter axis the index tensor picks the target. */ \
+          coord = indices[fi]; \
+          if (coord < 0) coord += data_shape[dim]; \
+        } \
+        if (coord < 0 || coord >= data_shape[dim]) { in_bounds = 0; break; } \
+        out_idx += coord * stride_data[dim]; \
+      } \
+      if (!in_bounds) continue; \
+      switch (reduction) { \
+      case SCATTER_REDUCTION_ADD: \
+        output[out_idx] += updates[fi]; \
+        break; \
+      case SCATTER_REDUCTION_MUL: \
+        output[out_idx] *= updates[fi]; \
+        break; \
+      case SCATTER_REDUCTION_MIN: \
+        if (updates[fi] < output[out_idx]) \
+          output[out_idx] = updates[fi]; \
+        break; \
+      case SCATTER_REDUCTION_MAX: \
+        if (updates[fi] > output[out_idx]) \
+          output[out_idx] = updates[fi]; \
+        break; \
+      default: \
+        output[out_idx] = updates[fi]; break; \
+      } \
+    } \
+  }
+// clang-format on
+
+DEFINE_SCATTER_FN(fp32, float32_t)
+DEFINE_SCATTER_FN(s8, int8_t)
+DEFINE_SCATTER_FN(u8, uint8_t)
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/Selu_fp32.c b/TargetLibraries/Generic/src/Selu_fp32.c
new file mode 100644
index 0000000000..ac120c7e55
--- /dev/null
+++ b/TargetLibraries/Generic/src/Selu_fp32.c
@@ -0,0 +1,20 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Selu_fp32_fp32(const float32_t *input, float32_t *output, int32_t size,
+                    float32_t alpha, float32_t gamma) {
+
+  for (int i = 0; i < size; i++) {
+    float32_t tmp = input[i];
+    if (input[i] < 0) {
+      tmp = alpha * (expf(tmp) - 1.0f);
+    }
+    output[i] = gamma * tmp;
+  }
+}
\ No newline at end of file