diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c8a024c15..9ca0eda18f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch, Spatz)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch Spatz) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -46,6 +46,8 @@ elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) message(STATUS "Building for platform 'Chimera'") +elseif(platform STREQUAL Spatz) + message(STATUS "Building for platform 'Spatz'") else() message(FATAL_ERROR "Invalid platform '${platform}' specified!") endif() @@ -299,5 +301,33 @@ if(platform STREQUAL Chimera) endif() +if(platform STREQUAL Spatz) + + if(NOT DEFINED ENV{SPATZ_HOME}) + message(FATAL_ERROR "Environment variable SPATZ_HOME not set.") + endif() + + set(SPATZ_HOME $ENV{SPATZ_HOME}) + + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/toolchain_llvm.cmake) + + include(${CMAKE_CURRENT_LIST_DIR}/cmake/spatz/spatz.cmake) + + project(deeploy LANGUAGES C ASM) + + message(STATUS "============================= ${platform} Configuration ============================") + message(STATUS "[cMake ] ISA = " ${ISA}) + message(STATUS "================================================================================") + message(STATUS "") + + add_subdirectory(TargetLibraries/Generic) + add_subdirectory(TargetLibraries/Spatz) + 
target_include_directories(deeployspatz PUBLIC TargetLibraries/Generic/inc) + + add_subdirectory(DeeployTest) + target_link_libraries(deeploylib INTERFACE deeploybasic deeployspatz) + +endif() + print_simulation_config() diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..4b0ecfc258 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -19,12 +19,12 @@ GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ - iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate + iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate, TopKTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + SoftmaxChecker, TransposeChecker, TopKChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -327,3 +327,14 @@ ConvTransposeTemplate.referenceTemplate, BasicTransformer) for type in FloatDataTypes ] + +BasicTopKBindings = [ + NodeBinding( + TopKChecker( + [PointerClass(float32_t), PointerClass(int8_t)], # inputs + [PointerClass(float32_t), PointerClass(int8_t)] # outputs + ), + TopKTemplate.referenceTemplate, + BasicTransformer, + ) +] diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index 
cc733937cc..21e22992e6 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,9 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class TopKLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..b58f875c17 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -982,7 +982,7 @@ def parseNode(self, node: gs.Node) -> (bool): return False indices_shape = node.inputs[1].shape - assert np.prod(indices_shape) == 1, f"Only indices of size 1 supported. Got indices of shape {indices_shape}" + self.operatorRepresentation['num_indices'] = int(np.prod(indices_shape)) self.operatorRepresentation['axis'] = node.attrs['axis'] if 'axis' in node.attrs else 0 return True @@ -1002,10 +1002,17 @@ def parseNodeCtxt(self, axis = self.operatorRepresentation['axis'] shape = ctxt.lookup(node.inputs[0].name).shape - self.operatorRepresentation['batch'] = np.prod(shape[:axis]) - self.operatorRepresentation['batch_length'] = np.prod(shape[axis:]) - self.operatorRepresentation['axis_length'] = np.prod(shape[axis + 1:]) - self.operatorRepresentation['index'] = int(node.inputs[1].values.item()) + self.operatorRepresentation['batch'] = int(np.prod(shape[:axis])) if axis > 0 else 1 + self.operatorRepresentation['batch_length'] = int(np.prod(shape[axis:])) + self.operatorRepresentation['axis_length'] = int(np.prod(shape[axis + 1:])) if axis + 1 < len(shape) else 1 + + if self.operatorRepresentation['num_indices'] == 1: + try: + self.operatorRepresentation['index'] = int(node.inputs[1].values.item()) + except AttributeError: + self.operatorRepresentation['index'] = f"{self.operatorRepresentation['indices']}[0]" + else: + self.operatorRepresentation['index'] = 0 # in this case is not used but is needed for mako template return ctxt, True @@ -2886,3 
+2893,50 @@ def parseNodeCtxt(self,
         self.operatorRepresentation['size'] = int(np.prod(data_in.shape))
 
         return ctxt, True
+
+# TopKParser: selects the largest k elements from a vector
+class TopKParser(NodeParser):
+    """Parser for ONNX ``TopK`` nodes with a constant ``K`` input.
+
+    The bound templates flatten the input and always select the largest
+    elements, so anything else is rejected here instead of being mis-compiled.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        """Accept two-input / two-output ``TopK`` nodes that ask for the largest values."""
+        if node.op != 'TopK' or len(node.inputs) != 2 or len(node.outputs) != 2:
+            return False
+        # ONNX defines `largest` with default 1; largest=0 (smallest-k) is not implemented.
+        largest = node.attrs['largest'] if 'largest' in node.attrs else 1
+        return int(largest) == 1
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        """Fill the operator representation; returns ``(ctxt, False)`` for unsupported nodes."""
+        data_in = ctxt.lookup(node.inputs[0].name)
+        k_in = ctxt.lookup(node.inputs[1].name)
+        values_out = ctxt.lookup(node.outputs[0].name)
+        indices_out = ctxt.lookup(node.outputs[1].name)
+
+        # K has to be a compile-time constant: a buffer without `values` cannot be folded.
+        if not hasattr(k_in, 'values'):
+            return ctxt, False
+
+        # The templates index the input as one flat vector, so at most one
+        # dimension may be larger than 1.
+        if sum(1 for dim in data_in.shape if dim > 1) > 1:
+            return ctxt, False
+
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_in_size'] = int(np.prod(data_in.shape))
+        # np.ravel accepts both a 0-d and a 1-element K tensor.
+        self.operatorRepresentation['k_value'] = int(np.ravel(k_in.values)[0])
+        self.operatorRepresentation['values_out'] = values_out.name
+        self.operatorRepresentation['indices_out'] = indices_out.name
+
+        return ctxt, True
diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py
index e05e897270..2e4601bdd4 100644
--- a/Deeploy/Targets/Generic/Platform.py
+++ b/Deeploy/Targets/Generic/Platform.py
@@ -14,19 +14,19 @@
     BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \
     BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \
     BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \
-    DummyBinding
+    DummyBinding, BasicTopKBindings
 from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \
     ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \
     LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \
     ReduceSumLayer, ReluLayer, 
RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ - SoftmaxLayer, SqrtLayer, TransposeLayer + SoftmaxLayer, SqrtLayer, TransposeLayer, TopKLayer from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + TransposeParser, TopKParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -67,6 +67,7 @@ SoftmaxMapper = NodeMapper(SoftmaxParser(), BasicSoftmaxBindings) iSoftmaxMapper = NodeMapper(iSoftmaxParser(), BasicSoftmaxBindings) TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +TopKMapper = NodeMapper(TopKParser(), BasicTopKBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) @@ -113,6 +114,7 @@ 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 'Transpose': TransposeLayer([TransposeMapper]), + 'TopK': TopKLayer([TopKMapper]), 'Unsqueeze': 
ReshapeLayer([UnsqueezeMapper]), 'Slice': SliceLayer([SliceMapper]), 'Quant': QuantLayer([QuantMapper]), diff --git a/Deeploy/Targets/Generic/Templates/GatherTemplate.py b/Deeploy/Targets/Generic/Templates/GatherTemplate.py index dd5e534fa4..171fbab779 100644 --- a/Deeploy/Targets/Generic/Templates/GatherTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GatherTemplate.py @@ -10,8 +10,18 @@ width = int(data_in_type.referencedType.typeWidth/8) %> BEGIN_SINGLE_CORE +% if num_indices == 1: for (uint32_t i=0; i<${batch}; ++i) { memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width}); } +% else: +for (uint32_t i=0; i<${batch}; ++i) { + for (uint32_t j=0; j<${num_indices}; ++j) { + memcpy(${data_out} + i * (${num_indices} * ${axis_length}) + j * ${axis_length}, + ${data_in} + i * ${batch_length} + ${indices}[j] * ${axis_length}, + ${axis_length} * ${width}); + } +} +% endif END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/TopKTemplate.py b/Deeploy/Targets/Generic/Templates/TopKTemplate.py new file mode 100644 index 0000000000..3f9b6474fa --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/TopKTemplate.py @@ -0,0 +1,40 @@ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +referenceTemplate = NodeTemplate(""" +// TopK (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE +// Find the top ${k_value} values and their indices +// Assumes 1D input for simplicity +typedef struct { + ${data_in_type.referencedType.typeName} value; + uint32_t index; +} topk_pair_t; + +topk_pair_t pairs[${data_in_size}]; +for (uint32_t i = 0; i < ${data_in_size}; ++i) { + pairs[i].value = ((${data_in_type.referencedType.typeName}*)${data_in})[i]; + pairs[i].index = i; +} +// Simple selection sort for top-k +for (uint32_t i = 0; i < ${k_value}; ++i) { + uint32_t max_idx = i; + for (uint32_t j = i + 1; j < ${data_in_size}; 
++j) { + if (pairs[j].value > pairs[max_idx].value) { + max_idx = j; + } + } + // Swap + if (max_idx != i) { + topk_pair_t tmp = pairs[i]; + pairs[i] = pairs[max_idx]; + pairs[max_idx] = tmp; + } + // Write output + ((${values_out_type.referencedType.typeName}*)${values_out})[i] = pairs[i].value; + ((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = pairs[i].index; +} +END_SINGLE_CORE +""") \ No newline at end of file diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..5d363206f8 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -610,3 +610,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + +# TopKChecker: infers types for both values and indices outputs of TopK operation +class TopKChecker(SignPropTypeChecker): + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: + # Output 0: values (same as input), Output 1: indices (integer, usually not quantized) + # We assume indices output is not quantized (set to 0 or 1) + return [inputs[0].nLevels, 1] + + def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: + # Output 0: values (same signedness as input), Output 1: indices (unsigned) + return [inputs[0]._signed, False] \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Bindings.py b/Deeploy/Targets/Spatz/Bindings.py new file mode 100644 index 0000000000..b1456486d0 --- /dev/null +++ b/Deeploy/Targets/Spatz/Bindings.py @@ -0,0 +1,97 @@ +from functools import partial + +from Deeploy.DeeployTypes import 
CodeTransformation, NodeBinding +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration +from Deeploy.Targets.Spatz.CodeTransformationPasses.Benchmarking import SpatzBenchmarkInnerPass, SpatzBenchmarkOuterPass + +from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t +from Deeploy.Targets.Generic.TypeCheckers import GatherChecker, MatMulChecker, TopKChecker, SoftmaxChecker + +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterTiling import SnitchClusterTiling +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import SnitchSynchCoresPass +from Deeploy.Targets.Spatz.DMA.SpatzDma import SpatzDma +from Deeploy.Targets.Spatz.Templates import GatherTemplate, MatMulTemplate as SpatzMatMulTemplate, TopKTemplate, SoftmaxTemplate +from Deeploy.Targets.Generic.Templates import MatMulTemplate, FloatMatMulTemplate +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate + +TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure") +MemoryAwareFunctionCallClosure = partial(MemoryAwareClosureGeneration, + closureSuffix = "_closure", + startRegion = "L3", + endRegion = "L1") + +BasicTransformer = CodeTransformation( + [ArgumentStructGeneration(), + MemoryManagementGeneration(), + FutureGeneration()]) + +TiledTransformer = CodeTransformation([ + TilingVariableReplacement("L1"), + TilingCallClosure(writeback = False), + SnitchSynchCoresPass(), # snrt_cluster_hw_barrier() + # SpatzBenchmarkInnerPass(), # <- attention: 
increases runtime and benchmarks only when tiling loop has one iteration + TilingVariableReplacementUpdate("L1"), + SnitchClusterTiling("L3", "L1", SpatzDma()), + # SpatzBenchmarkOuterPass(), # <- attention: increases runtime and benchmarks only when tiling loop has one iteration + ArgumentStructGeneration(), + MemoryManagementGeneration("L1"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration("L3"), + MemoryManagementGeneration(), +]) + +SpatzGatherBindings = [ + NodeBinding( + GatherChecker( + [PointerClass(float32_t), PointerClass(type)], + [PointerClass(float32_t)] + ), + GatherTemplate.dynamicDMAtemplate, + TiledTransformer + ) for type in IntegerDataTypes +] + +# with tiled transformer +SpatzMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + SpatzMatMulTemplate.spatzSIMatMulTemplate, TiledTransformer), + NodeBinding( + MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SpatzMatMulTemplate.spatzFloatMatMulTemplate, TiledTransformer) +] + +# without tiled transformer +''' +SpatzMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + SpatzMatMulTemplate.spatzSIMatMulTemplate, BasicTransformer), + NodeBinding( + MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + SpatzMatMulTemplate.spatzFloatMatMulTemplate, BasicTransformer) +] +''' + +SpatzTopKBindings = [ + NodeBinding( + TopKChecker( + [PointerClass(float32_t), PointerClass(int32_t)], # inputs + [PointerClass(float32_t), PointerClass(int32_t)] # outputs + ), + TopKTemplate.minHeapTemplate, + TiledTransformer, + ) +] + + +SpatzSoftmaxBindings = [ + NodeBinding( + SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + SoftmaxTemplate.floatTilingTemplate, + TiledTransformer + ) +] \ No newline at end of file diff --git 
a/Deeploy/Targets/Spatz/CodeTransformationPasses/Benchmarking.py b/Deeploy/Targets/Spatz/CodeTransformationPasses/Benchmarking.py new file mode 100644 index 0000000000..0caa24f1b5 --- /dev/null +++ b/Deeploy/Targets/Spatz/CodeTransformationPasses/Benchmarking.py @@ -0,0 +1,23 @@ +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, NodeTemplate, CodeSnippet, _NoVerbosity + + +class SpatzBenchmarkInnerPass(CodeTransformationPass): + def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity): + if "include_benchmark" not in ctxt.globalObjects: + ctxt.hoistGlobalDefinition("include_benchmark", "#include \n") + if "include_printf" not in ctxt.globalObjects: + ctxt.hoistGlobalDefinition("include_printf", "#include \"printf.h\"\n") + tsop = NodeTemplate(""" tsop = benchmark_get_cycle();\n""") + teop = NodeTemplate(""" teop = benchmark_get_cycle();\n""") + executionBlock.codeSnippets.insert(1, CodeSnippet(tsop, {})) + executionBlock.codeSnippets.append(CodeSnippet(teop, {})) + return ctxt, executionBlock + +class SpatzBenchmarkOuterPass(CodeTransformationPass): + def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity): + t0 = NodeTemplate(""" uint32_t t0, tsop, teop, te;\n t0 = benchmark_get_cycle();\n""") + te = NodeTemplate(f"""te = benchmark_get_cycle();if (snrt_is_dm_core()) {{printf(\"Benchmark of {name}:\\n\");\nprintf(\"data_in=%d; op=%d; data_out=%d; total=%d\\n\\n\", tsop-t0, teop-tsop, te-teop, te-t0); }}\nsnrt_cluster_hw_barrier();""") + + executionBlock.addLeft(t0, {}) + executionBlock.addRight(te, {}) + return ctxt, executionBlock diff --git a/Deeploy/Targets/Spatz/DMA/SpatzDma.py b/Deeploy/Targets/Spatz/DMA/SpatzDma.py new file mode 100644 index 0000000000..e13df3aaa1 --- /dev/null +++ b/Deeploy/Targets/Spatz/DMA/SpatzDma.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: 2025 ETH 
Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy + + +class SnitchBarrierFuture(Future): + _initTemplate = NodeTemplate("") + _deinitTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") + _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait_all();") + + +class SnitchFuture(Future): + _initTemplate = NodeTemplate("snrt_dma_txid_t ${name} = (snrt_dma_txid_t) -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate(#remove if condition -1 + "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait_all();") + # "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait(${name});") + + +class SpatzDma(AsyncDma): + + _transferTemplates = { + 2: + NodeTemplate(""" + if (snrt_is_dm_core()) { + ${future} = snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat}); + } + """), + } + _waitingStrategy = PerTensorWaitingStrategy(SnitchFuture) + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + assert strideLoc[1] == 1 and strideExt[1] == 1, f"Supports only contigous transfers in the innermost dimension" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: 
Tuple[int, ...], direction: DmaDirection,
+                       future: Future) -> OperatorRepresentation:
+        operatorRepresentation: OperatorRepresentation = {
+            "dest": localBuffer.name if direction == "ExternalToLocal" else externalBuffer.name,
+            "src": externalBuffer.name if direction == "ExternalToLocal" else localBuffer.name,
+            "repeat": shape[0],
+            "size": shape[1],
+            "stride_dest": strideLoc[0] if direction == "ExternalToLocal" else strideExt[0],
+            "stride_src": strideExt[0] if direction == "ExternalToLocal" else strideLoc[0],
+            "future": future.name
+        }
+        return operatorRepresentation
diff --git a/Deeploy/Targets/Spatz/Deployer.py b/Deeploy/Targets/Spatz/Deployer.py
new file mode 100644
index 0000000000..4d99b61f54
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Deployer.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Dict, Type
+
+import onnx_graphsurgeon as gs
+
+from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
+from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass
+from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
+    NCHWtoNHWCPass, TransposeMatmulInputsPass
+from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass
+
+
+class SpatzDeployer(SignPropDeployer):
+    """Network deployer for the Spatz target.
+
+    Forwards every constructor argument, including ``inputOffsets``, to
+    ``SignPropDeployer`` and adds no lowering passes of its own.
+    """
+
+    def __init__(self,
+                 graph: gs.Graph,
+                 deploymentPlatform: DeploymentPlatform,
+                 inputTypes: Dict[str, Type[Pointer]],
+                 loweringOptimizer: TopologyOptimizer,
+                 scheduler: Callable = lambda x: x,
+                 name: str = 'DeeployNetwork',
+                 default_channels_first = False,
+                 deeployStateDir: str = "DeeployStateDir",
+                 inputOffsets: Dict[str, int] = {}):
+
+        # inputOffsets used to be accepted here but never handed on, so
+        # caller-supplied offsets were silently ignored; pass it through.
+        super().__init__(graph,
+                         deploymentPlatform,
+                         inputTypes,
+                         loweringOptimizer,
+                         scheduler,
+                         name,
+                         default_channels_first = default_channels_first,
+                         deeployStateDir = deeployStateDir,
+                         inputOffsets = inputOffsets)
diff --git a/Deeploy/Targets/Spatz/Platform.py b/Deeploy/Targets/Spatz/Platform.py
new file mode 100644
index 0000000000..5150f0928a
--- /dev/null
+++ b/Deeploy/Targets/Spatz/Platform.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+import numpy as np
+
+from Deeploy.DeeployTypes import VariableBuffer, TransientBuffer, ConstantBuffer, StructBuffer, \
+    NodeMapper, NodeTemplate, TopologyOptimizer, DeploymentEngine, DeploymentPlatform
+
+from Deeploy.Targets.Spatz.Templates import AllocateTemplate as SpatzAllocateTemplate
+from Deeploy.Targets.Spatz.Templates import FreeTemplate as SpatzFreeTemplate
+
+from Deeploy.Targets.Spatz.Tiler import SpatzMatMulTilingBindings, SpatzGatherTilingBindings, SpatzTopKTilingBindings, SpatzSoftmaxTilingBindings
+from Deeploy.Targets.Generic.Layers import GEMMLayer, SoftmaxLayer, TopKLayer, GatherLayer
+from Deeploy.Targets.Generic.Parsers import MatMulParser, SoftmaxParser, TopKParser, GatherParser
+
+MatMulMapper = NodeMapper(MatMulParser(), SpatzMatMulTilingBindings)
+SoftmaxMapper = NodeMapper(SoftmaxParser(), SpatzSoftmaxTilingBindings)
+TopKMapper = NodeMapper(TopKParser(), SpatzTopKTilingBindings)
+GatherMapper = NodeMapper(GatherParser(), SpatzGatherTilingBindings)
+
+SpatzMapping = {
+    'MatMul': GEMMLayer([MatMulMapper]),
+    'Softmax': SoftmaxLayer([SoftmaxMapper]),
+    'TopK': TopKLayer([TopKMapper]),
+    'Gather': GatherLayer([GatherMapper]),
+}
+
+
+class SpatzVariableBuffer(VariableBuffer):
+    initTemplate = SpatzAllocateTemplate.spatzInitTemplate
+    allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate
+    deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate
+
+    def _bufferRepresentation(self):
+
+        if hasattr(self, "_memoryLevel"):
+            memoryLevel = 
self._memoryLevel + else: + memoryLevel = None + + return { + "type": self._instance, + "name": self.name, + "size": int(np.prod(self.shape)), + "_memoryLevel": memoryLevel + } + +class SpatzTransientBuffer(TransientBuffer): + initTemplate = SpatzAllocateTemplate.spatzInitTemplate + allocTemplate = SpatzAllocateTemplate.spatzGenericAllocate + deallocTemplate = SpatzFreeTemplate.spatzLocalTemplate + + def _bufferRepresentation(self): + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + return { + "type": self._type, + "name": self.name, + "size": self.size, + "_memoryLevel": memoryLevel + } + + +class SpatzConstantBuffer(ConstantBuffer): + initTemplate = SpatzAllocateTemplate.spatzGlobalInitTemplate + allocTemplate = NodeTemplate("") + deallocTemplate = NodeTemplate("") + + def _bufferRepresentation(self): + operatorRepresentation = super()._bufferRepresentation() + + if hasattr(self, "_memoryLevel"): + memoryLevel = self._memoryLevel + else: + memoryLevel = None + + operatorRepresentation["_memoryLevel"] = memoryLevel + + return operatorRepresentation + + +class SpatzStructBuffer(StructBuffer): + initTemplate = SpatzAllocateTemplate.spatzStructInitTemplate + allocTemplate = SpatzAllocateTemplate.spatzStructAllocateTemplate + deallocTemplate = NodeTemplate("") + + +SpatzOptimizer = TopologyOptimizer([ +], name = "SpatzOptimizer") + +includeList = [ + "snrt.h", + "DeeploySpatzMath.h", +] + + +class SpatzEngine(DeploymentEngine): + def __init__(self, name: str, Mapping = SpatzMapping, initCode = "", includeList = includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class SpatzPlatform(DeploymentPlatform): + + def __init__( self, + engines = [SpatzEngine("SpatzVectorProcessor")], + variableBuffer = SpatzVariableBuffer, + transientBuffer = SpatzTransientBuffer, + constantBuffer = SpatzConstantBuffer, + structBuffer = SpatzStructBuffer, + includeList: List[str] = includeList + ): + 
super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + diff --git a/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py new file mode 100644 index 0000000000..0834283947 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/AllocateTemplate.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Declaration of a runtime-allocated buffer (just a pointer; the memory is +# obtained at runtime by the allocate template below). +spatzInitTemplate = NodeTemplate("${type.typeName} ${name}; // variable buffer of size ${size}\n") + +# Runtime allocation: L1 -> TCDM (snrt_l1alloc), L3/None -> DRAM (snrt_l3alloc). +spatzGenericAllocate = NodeTemplate(""" +% if _memoryLevel == "L1": +${name} = (${type.typeName}) snrt_l1alloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% elif _memoryLevel == "L3" or _memoryLevel is None: +${name} = (${type.typeName}) snrt_l3alloc(sizeof(${type.referencedType.typeName}) * ${size});\n +% else: +// COMPILER WARNING — unsupported memory level ${_memoryLevel}, defaulting to L3 +${name} = (${type.typeName}) snrt_l3alloc(${type.referencedType.typeWidth//8} * ${size}); +% endif +""") + +# Constant buffers: emitted as static initialized arrays. +spatzGlobalInitTemplate = NodeTemplate("static ${type.referencedType.typeName} ${name}[${size}] = {${values}};\n") + +# Struct buffers. 
+spatzStructInitTemplate = NodeTemplate(""" +static ${type.typeName} ${name}; +""") + +spatzStructAllocateTemplate = NodeTemplate(""" + ${name} = (${structDict.typeName}) ${str(structDict)}; +""") diff --git a/Deeploy/Targets/Spatz/Templates/FreeTemplate.py b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py new file mode 100644 index 0000000000..f67cb3de38 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/FreeTemplate.py @@ -0,0 +1,5 @@ +from Deeploy.DeeployTypes import NodeTemplate + +# snrt_l1alloc currently does not support free-ing of memory (spatz/sw/snRuntime/src/alloc.c) +spatzLocalTemplate = NodeTemplate("") +spatzGlobalTemplate = NodeTemplate("") \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Templates/GatherTemplate.py b/Deeploy/Targets/Spatz/Templates/GatherTemplate.py new file mode 100644 index 0000000000..b8b2ec20d1 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/GatherTemplate.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +dynamicDMAtemplate = NodeTemplate(""" +// Gather (Name: ${nodeName}, Op: ${nodeOp}) +// Dynamic DMA strategy (Spatz): +// - indices already transferred to local memory by the tiling pass +// - fetch selected rows directly from external data_in into local data_out +<% +width = int(data_in_type.referencedType.typeWidth/8) +%> + +// Currently supported configuration: axis=0 and batch=1 (matches existing Spatz Gather tests) +if ((${axis} != 0) || (${batch} != 1)) { + error(); +} else { + if (snrt_is_dm_core()) { + const size_t bytes_per_row = (size_t)${axis_length} * (size_t)${width}; // sizeof(${data_in_type.referencedType.typeName}) = ${width} + char *dst_base = (char *)${data_out}; + const char *src_base = (const char *)${data_in}; + + for (size_t j = 0; j < (size_t)${num_indices}; ++j) { + const size_t dst_off = j * bytes_per_row; + const size_t src_off = (size_t)${indices}[j] * 
bytes_per_row; + snrt_dma_start_1d((void *)(dst_base + dst_off), (const void *)(src_base + src_off), bytes_per_row); + } + + // Ensure all row DMAs complete before the tiling pass starts the output transfer. + snrt_dma_wait_all(); + } +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py new file mode 100644 index 0000000000..b528ee3fb4 --- /dev/null +++ b/Deeploy/Targets/Spatz/Templates/MatMulTemplate.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = 0 + operatorRepresentation['B_offset'] = 0 + operatorRepresentation['C_offset'] = 0 + if hasattr(A, "_signed") and hasattr(A, "nLevels"): + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + if hasattr(B, "_signed") and hasattr(B, "nLevels"): + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + if hasattr(C, "_signed") and hasattr(C, "nLevels"): + operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +# signed integer +spatzSIMatMulTemplate = _MatMulTemplate(""" +// MatMul (Name: ${nodeName}, Op: ${nodeOp}) +${A_type.typeName} ref_${data_out}_${A} = ${A}; +${B_type.typeName} ref_${data_out}_${B} = ${B}; +${data_out_type.typeName} ref_${data_out}_${data_out} = 
# Floating-point MatMul template.
# The kernel name is assembled from the bit widths of A, B and the output
# (Spatz_MatMul_fp<A>_fp<B>_fp<out>); single precision (fp32) is what is supported today.
# It is also possible to add half and double precision.
# The `% if batch==1` line is resolved when the template is rendered: a batch of one emits a
# single kernel call, anything else emits a loop that advances the three ref_ pointers by one
# matrix (M*N, N*O, M*O elements) per iteration.
spatzFloatMatMulTemplate = NodeTemplate("""
// Matmul (Name: ${nodeName}, Op: ${nodeOp})
${A_type.typeName} ref_${data_out}_${A} = ${A};
${B_type.typeName} ref_${data_out}_${B} = ${B};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

% if batch==1:
Spatz_MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
    ref_${data_out}_${A},
    ref_${data_out}_${B},
    ref_${data_out}_${data_out},
    ${M},
    ${N},
    ${O}
);

% else:
for(uint32_t i=0; i<${batch}; i++){
    Spatz_MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
        ref_${data_out}_${A},
        ref_${data_out}_${B},
        ref_${data_out}_${data_out},
        ${M},
        ${N},
        ${O}
    );

    ref_${data_out}_${A} += ${M} * ${N};
    ref_${data_out}_${B} += ${N} * ${O};
    ref_${data_out}_${data_out} += ${M} * ${O};
}
% endif
""")
# TopK via partial selection sort.
# The generated code copies the input into a scratch array, pairs it with an index array, and
# runs k_value rounds of selection sort: round i moves the largest remaining value (and its
# original index) to position i and writes both to the outputs.
#
# Template variables: k_value, data_in_size, data_in, values_out, indices_out and the
# corresponding *_type entries (element types are taken from their referencedType).
#
# Notes:
# - The swap temporaries use the same templated element types as the scratch arrays, so a
#   binding with other value/index types is not silently converted through float32_t/int32_t.
# - The body is wrapped in its own C block so the scratch pointers do not clash when several
#   TopK nodes are emitted into the same C function.
# - NOTE(review): the scratch buffers come from snrt_l1alloc and are never released here, so
#   every execution of this node consumes additional L1 memory - confirm this is acceptable.
# - NOTE(review): the code assumes k_value <= data_in_size; there is no clamp.
selectionSortTemplate = NodeTemplate("""
// TopK node: finds the top ${k_value} values and their indices
// Assumes 1D input
{
    ${data_in_type.referencedType.typeName} *values_tmp = snrt_l1alloc(sizeof(${data_in_type.referencedType.typeName})*${data_in_size});
    ${indices_out_type.referencedType.typeName} *indices_tmp = snrt_l1alloc(sizeof(${indices_out_type.referencedType.typeName})*${data_in_size});

    for (uint32_t i = 0; i < ${data_in_size}; ++i) {
        values_tmp[i] = ((${data_in_type.referencedType.typeName}*)${data_in})[i];
        indices_tmp[i] = i;
    }
    // Simple selection sort for top-k
    for (uint32_t i = 0; i < ${k_value}; ++i) {
        uint32_t max_idx = i;
        for (uint32_t j = i + 1; j < ${data_in_size}; ++j) {
            if (values_tmp[j] > values_tmp[max_idx]) {
                max_idx = j;
            }
        }
        // Swap
        if (max_idx != i) {
            ${data_in_type.referencedType.typeName} tmp_val = values_tmp[i];
            ${indices_out_type.referencedType.typeName} tmp_idx = indices_tmp[i];
            values_tmp[i] = values_tmp[max_idx];
            indices_tmp[i] = indices_tmp[max_idx];
            values_tmp[max_idx] = tmp_val;
            indices_tmp[max_idx] = tmp_idx;
        }
        // Write output
        ((${values_out_type.referencedType.typeName}*)${values_out})[i] = values_tmp[i];
        ((${indices_out_type.referencedType.typeName}*)${indices_out})[i] = indices_tmp[i];
    }
}
""")
diff --git a/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py new file mode 100644 index 0000000000..27a3c7ec3e --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/GatherTileConstraint.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + +class GatherTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + pointer: List[str] = [] + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + _buffer = ctxt.lookup(value) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, value) + + # no tile contraint for data_in, because is not moved by the tiling engine + if key == 'data_in': + continue + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = value, dimIdx = idx) == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, 
tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Dynamic-DMA Gather policy: + # - DMA only indices into local memory + # - Do NOT DMA the full data_in tile into local memory + # - DMA the output tile back to external memory + addrNames = ['indices', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + indicesBuffer = ctxt.lookup(operatorRepresentation['indices']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + indicesCube = HyperRectangle(offset = (0,) * len(indicesBuffer.shape), dims = tuple(indicesBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + # Gather execution policy (dynamic DMA): load indices in L1, execute once, then store output tile. + # data_in stays in external memory; selected rows are fetched directly into the local output buffer. + _ = dataInCube # Keep for clarity; intentionally unused in this schedule. 
+ inputLoadSchedule.append({'indices': indicesCube}) + outputLoadSchedule.append({'data_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git a/Deeploy/Targets/Spatz/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/MatMulTileConstraint.py new file mode 100644 index 0000000000..4211fed17d --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/MatMulTileConstraint.py @@ -0,0 +1,240 @@ +import math +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import int8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class MatMulTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # ===== GET NECESSARY INFORMATION ===== + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + tensorsShapeLenA = len(bufferA.shape) + tensorsShapeLenB = len(bufferB.shape) + tensorsShapeLenOutput = len(outputBuffer.shape) + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== + for _buffer in [bufferA, bufferB, outputBuffer]: + tilerModel.addTensorDimToModel(ctxt, _buffer.name) + + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # *Checks on wether dimesnions are reversed via the transA and transB flags + # A dims + AMatrixFirstDimVar = 
tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 2) + parseDict['transA']) + AMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 1) - parseDict['transA']) + + # B dims + BMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 2) + parseDict['transB']) + BMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 1) - parseDict['transB']) + + # Output dims + outputMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 2)) + outputMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 1)) + + # ===== ADD CONSTRAINTS ===== + # Add batch constraints + if (bufferA.shape[:-2] == bufferB.shape[:-2]): + for idx in range(tensorsShapeLenA - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = tensorsShapeLenA - 3 - idx)) + + for idx in range(tensorsShapeLenB - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = tensorsShapeLenB - 3 - idx)) + + # Add GEMM geometrical constraints + tilerModel.addConstraint(outputMatrixFirstDimVar == AMatrixFirstDimVar) + tilerModel.addConstraint(outputMatrixSecondDimVar == BMatrixSecondDimVar) + + tilerModel.addConstraint(AMatrixSecondDimVar == BMatrixFirstDimVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # ===== GET NECESSARY INFORMATION ===== + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + # 
===== EXTRACT TENSOR DIMS AS VARS ===== + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (len(bufferA.shape) - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (len(bufferB.shape) - 2) + parseDict['transB']) + + # ===== ADD CONSTRAINTS ===== + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + # Spatz row-stride alignment: the kernel loads B rows / stores output rows at + # a stride of O elements. On this Spatz config every row base must be 64-bit + # aligned, otherwise a chained vector load corrupts the upper lanes. So force + # the O tile size to be a multiple of (8 / elemBytes) (fp32 -> 2). With an even + # original O the remainder tile is even too (even - even*k = even), so every + # tile's row stride stays 8-byte aligned. (Odd original O is unsupported: its + # remainder tile is odd and would be misaligned.) 
+ outputBuffer = ctxt.lookup(name = parseDict['data_out']) + elemBytes = outputBuffer._type.referencedType.typeWidth // 8 + modulo = 8 // elemBytes + if modulo > 1: + outputODimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = len(outputBuffer.shape) - 1) + tilerModel.addTileSizeDivisibleConstraint(parseDict, "O", outputODimVar, modulo) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Get output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Get names, optimizer variables, buffers, and other information for elements of interest + addrNames = ['A', 'B', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + buffA = ctxt.lookup(operatorRepresentation['A']) + buffB = ctxt.lookup(operatorRepresentation['B']) + buffOut = ctxt.lookup(operatorRepresentation['data_out']) + + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + tensorsShapeLenA = len(buffA.shape) + tensorsShapeLenB = len(buffB.shape) + tensorsShapeOutput = len(buffOut.shape) + + # NSize depends on transA: if transA=0, N is last dim; if transA=1, N is second-to-last + NSize = buffA.shape[-1] if transA == 0 else buffA.shape[-2] + NOffset = 0 + + # Prepare input cubes lists + inputACubes = [] + inputBCubes = [] + + # Prepare replacements lists + replacements = {"M": [], "O": [], "batch": []} + + # Every output tile is constructed by a pair of input tiles. Reconstruct this pair. 
+ for cube in outputCubes: + # Get output dimensions + MOffset, OOffset = cube.offset[-2:] + MSize, OSize = cube.dims[-2:] + + # Check that batch tiling is set up properly + if len(cube.offset) > 2: + BatchSize = math.prod(cube.dims[:-2]) + + if len(cube.offset) > 3: + assert all(off == 0 for off in cube.offset[:-3]), ( + f"Unsupported tiling across leading batch dims: offsets={cube.offset}. " + "Only the last batch dim (besides M/O) may be tiled.") + else: + BatchSize = 1 + + # Prepare cube dimensions replacements + replacements["M"].append(MSize) + replacements["O"].append(OSize) + replacements["batch"].append(BatchSize) + + # ===== Compute A cube information ===== + # Matrix offsets and shape (swap based on transA) + if transA == 0: + AMatrixOffsets = (MOffset, NOffset) + AMatrixShape = (MSize, NSize) + else: + AMatrixOffsets = (NOffset, MOffset) + AMatrixShape = (NSize, MSize) + + # Batch offset and shape (with broadcasting handling) + ABatchOffsets = list() + ABatchShape = list() + + for idx in range(tensorsShapeLenA - 2): + if buffA.shape[tensorsShapeLenA - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + ABatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + ABatchShape.append(cube.dims[len(cube.dims) - 3 - idx]) + else: + ABatchOffsets.append(0) + ABatchShape.append(1) + + ACube = HyperRectangle( + tuple(reversed(ABatchOffsets)) + tuple(AMatrixOffsets), + tuple(reversed(ABatchShape)) + tuple(AMatrixShape)) + inputACubes.append(ACube) + + # ===== Compute B cube information ===== + # Matrix offsets and shape (swap based on transB) + if transB == 0: + BMatrixOffsets = (NOffset, OOffset) + BMatrixShape = (NSize, OSize) + else: + BMatrixOffsets = (OOffset, NOffset) + BMatrixShape = (OSize, NSize) + + # Batch offset and shape (with broadcasting handling) + BBatchOffsets = list() + BBatchShape = list() + + for idx in range(tensorsShapeLenB - 2): + if buffB.shape[tensorsShapeLenB - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + 
BBatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + BBatchShape.append(cube.dims[len(cube.dims) - 3 - idx]) + else: + BBatchOffsets.append(0) + BBatchShape.append(1) + + BCube = HyperRectangle( + tuple(reversed(BBatchOffsets)) + tuple(BMatrixOffsets), + tuple(reversed(BBatchShape)) + tuple(BMatrixShape)) + inputBCubes.append(BCube) + + # Prepare load schedule lists for computed cubes + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Prepare replacements + replacements["N"] = [NSize] * len(outputCubes) + + replacementTypes = { + "M": PointerClass(uint16_t), + "N": PointerClass(uint16_t), + "O": PointerClass(uint16_t), + "batch": PointerClass(uint16_t) + } + + # Update load schedule lists + # *With strict=True to fail fast if different list lenghts + for a, b in zip(inputACubes, inputBCubes, strict = True): + inputLoadSchedule.append({"A": a, "B": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + # Prepare tiling schedule object + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + + return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py new file mode 100644 index 0000000000..c34b84890f --- /dev/null +++ b/Deeploy/Targets/Spatz/TileConstraints/SoftmaxTileConstraint.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, TransientBuffer +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from 
class SoftmaxTileConstraint(TileConstraint):
    """Tile constraint for Softmax that keeps every referenced tensor untiled."""

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Register all referenced tensors and pin each dimension to its full size.

        Every string value of ``parseDict`` that names a global or local buffer is
        treated as a tensor reference (this also covers constant inputs). Transient
        buffers are skipped.
        """
        referencedTensors: List[str] = [
            entry for entry in parseDict.values()
            if isinstance(entry, str) and (ctxt.is_global(entry) or ctxt.is_local(entry))
        ]

        for name in referencedTensors:
            buf = ctxt.lookup(name)
            if isinstance(buf, TransientBuffer):
                continue

            tilerModel.addTensorDimToModel(ctxt, name)

            # Tile extent == tensor extent on every axis, i.e. no tiling.
            for axis, extent in enumerate(buf.shape):
                tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = name, dimIdx = axis) == extent)

        return tilerModel

    @staticmethod
    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
        """Return a shallow copy of ``parseDict``; no entry is replaced by a symbolic variable."""
        return parseDict.copy()

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Build a schedule that loads the full input and stores one output tile per step.

        Returns an empty variable replacement scheme together with the schedule.
        """
        tileRectangles = [absCube.rectangle for absCube in absoluteOutputCubes]

        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, ['data_in', 'data_out'])

        # The input cube always spans the whole data_in tensor.
        inputShape = ctxt.lookup(operatorRepresentation['data_in']).shape
        fullInputCube = HyperRectangle(offset = (0,) * len(inputShape), dims = tuple(inputShape))

        inputLoadSchedule = [{'data_in': fullInputCube} for _ in tileRectangles]
        outputLoadSchedule = [{'data_out': rectangle} for rectangle in tileRectangles]

        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)

        return VariableReplacementScheme({}, {}), schedule
+ tensorNames: List[str] = [] + + for value in parseDict.values(): + if not isinstance(value, str): + continue + if ctxt.is_global(value) or ctxt.is_local(value): + tensorNames.append(value) + + for tensorName in tensorNames: + _buffer = ctxt.lookup(tensorName) + if isinstance(_buffer, TransientBuffer): + continue + + tilerModel.addTensorDimToModel(ctxt, tensorName) + + for idx, shapeDim in enumerate(_buffer.shape): + tileDimVar = tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) + tilerModel.addConstraint(tileDimVar == shapeDim) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # k_value is a scalar parsed in operatorRepresentation, not a tensor to transfer. + addrNames = ['data_in', 'values_out', 'indices_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + dataInBuffer = ctxt.lookup(operatorRepresentation['data_in']) + + dataInCube = HyperRectangle(offset = (0,) * len(dataInBuffer.shape), dims = tuple(dataInBuffer.shape)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + # TopK execution policy: load full input in L1, execute once, then store both outputs. 
+ inputLoadSchedule.append({'data_in': dataInCube}) + outputLoadSchedule.append({'values_out': out, 'indices_out': out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + repScheme = VariableReplacementScheme({}, {}) + + return repScheme, schedule diff --git a/Deeploy/Targets/Spatz/Tiler.py b/Deeploy/Targets/Spatz/Tiler.py new file mode 100644 index 0000000000..f3f67102bc --- /dev/null +++ b/Deeploy/Targets/Spatz/Tiler.py @@ -0,0 +1,18 @@ +from Deeploy.Targets.Spatz.Bindings import SpatzMatMulBindings, SpatzGatherBindings, SpatzTopKBindings, SpatzSoftmaxBindings +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings +from Deeploy.Targets.Spatz.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.TopKTileConstraint import TopKTileConstraint +from Deeploy.Targets.Spatz.TileConstraints.SoftmaxTileConstraint import SoftmaxTileConstraint + +SpatzMatMulTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SpatzGatherTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzGatherBindings, + tileConstraint = GatherTileConstraint()) + +SpatzTopKTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzTopKBindings, + tileConstraint = TopKTileConstraint()) + +SpatzSoftmaxTilingBindings = TilingReadyNodeBindings(nodeBindings = SpatzSoftmaxBindings, + tileConstraint = SoftmaxTileConstraint()) diff --git a/Deeploy/TilingExtension/MemoryScheduler.py b/Deeploy/TilingExtension/MemoryScheduler.py index e46f50e6f7..c5fb2445f0 100644 --- a/Deeploy/TilingExtension/MemoryScheduler.py +++ b/Deeploy/TilingExtension/MemoryScheduler.py @@ -83,7 +83,10 @@ class MemoryScheduler(): _COSTVARIABLENAME = "H" _COSTPRODUCTNAME = "costProduct" - byteAlignment = 4 + # 8-byte (64-bit) alignment: 
required on Spatz so vector loads land on 64-bit + # boundaries (a misaligned vle that gets chained corrupts upper lanes). 8 is a + # superset of the previous 4-byte requirement, so it stays correct on other targets. + byteAlignment = 8 @staticmethod def overlap(lifetimeA: Tuple[int, int], lifetimeB: Tuple[int, int]) -> bool: diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..9a2aa6b9d9 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -131,7 +131,9 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List return solution, solutionLengths - assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!" + # Support multi-output nodes: use first output tensor to determine tiling structure. + # For operators like TopK with multiple outputs, all outputs share the same tiling pattern. + assert len(tilingSolution.outputTensorMemoryConstraints) >= 1, "Expected node to have at least one output!" outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) memoryPath = list(outTensorConstraint.memoryConstraints.keys()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 9b48d9456c..fd866de066 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -44,6 +44,51 @@ class Tiler(): + """Tiler for a computation graphs with memory-aware optimization. + + The Tiler class provides functionality for tiling operations to fit within + memory constraints of target hardware platforms. It performs memory allocation, constraint + solving, and scheduling to optimize execution within hierarchical memory systems. + + Parameters + ---------- + memoryHierarchy : MemoryHierarchy + The memory hierarchy specification defining available memory levels and their capacities. 
+ + Attributes + ---------- + arenaName : str + Name prefix for memory arena buffers. + memorySchedulerClass : Type[MemoryScheduler] + Class type for memory scheduler instances. + memoryHierarchy : MemoryHierarchy + The memory hierarchy configuration. + tilerModel : Optional[TilerModel] + The constraint solver model for tiling optimization. + innerMemoryScheduler : MemoryScheduler + Scheduler for inner memory level allocation. + outerMemoryScheduler : MemoryScheduler + Scheduler for outer memory level allocation. + symbolicMemoryConstraints : Optional[List[PatternMemoryConstraints]] + Symbolic memory constraints for the tiling problem. + visualizeMemoryAlloc : bool + Flag to enable memory allocation visualization. + memoryAllocStrategy : {"TetrisRandom", "TetrisCo-Opt", "MiniMalloc"} + Strategy for memory allocation. + searchStrategy : {"min", "max", "random-max"} + Search strategy for constraint solving. + + Examples + -------- + >>> L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 1024000) + >>> L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 512000) + >>> L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000) + >>> memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + >>> memoryHierarchy.setDefaultMemoryLevel("L3") + >>> tiler = Tiler(hierarchy) + >>> tiler.memoryAllocStrategy = "MiniMalloc" + >>> solution = tiler.computeTilingSchedule(context) + """ arenaName = "MEMORYARENA" memorySchedulerClass: Type[MemoryScheduler] = MemoryScheduler @@ -53,6 +98,17 @@ class Tiler(): # Initialize with the list of TemplateTCFbinding def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = None, workDir: Optional[str] = None): + """Initialize the Tiler with a memory hierarchy. + + Parameters + ---------- + memoryHierarchy : MemoryHierarchy + The memory hierarchy specification defining available memory levels. + testName : Optional[str], optional + Optional name for the test case, used for file naming. 
Defaults to None. + workDir : Optional[str], optional + Optional working directory for temporary files. Defaults to None. + """ self.memoryHierarchy = memoryHierarchy self.tilerModel: Optional[TilerModel] = None @@ -85,10 +141,39 @@ def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = N @property def worstCaseBufferSize(self): + """Get the worst-case buffer sizes for each memory level. + + Returns + ------- + Dict[str, int] + Dictionary mapping memory level names to their worst-case buffer sizes in bytes. + """ return self._worstCaseBufferSize def plotMemoryAlloc(self, memoryMap: Dict[str, List[List[MemoryBlock]]], ctxt: NetworkContext, deeployStateDir: str, memoryHierarchy: MemoryHierarchy): + """Generate interactive visualization of memory allocation patterns. + + Creates an HTML file with Plotly visualizations showing memory allocation + over time for each memory level in the hierarchy. + + Parameters + ---------- + memoryMap : Dict[str, List[List[MemoryBlock]]] + Memory allocation map containing blocks for each memory level and time step. + ctxt : NetworkContext + Network context containing buffer information. + deeployStateDir : str + Directory path where the visualization HTML file will be saved. + memoryHierarchy : MemoryHierarchy + Memory hierarchy configuration for the visualization. + + Notes + ----- + Generates a file named 'memory_alloc.html' in the specified directory. + Each memory level is visualized as a separate subplot showing buffer + lifetimes and address space usage. + """ os.makedirs(os.path.abspath(deeployStateDir), exist_ok = True) memoryAllocPlotPath = os.path.abspath(os.path.join(deeployStateDir, f"memory_alloc.html")) @@ -177,6 +262,29 @@ def plotSingleMemoryLevel(memoryLevel: MemoryLevel): def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, memoryMap: Dict[str, List[List[MemoryBlock]]]) -> NetworkContext: + """Convert network context to use static memory allocation. 
+ + Transforms the network context to use statically allocated memory arenas + based on the computed memory map. Updates buffer allocation templates + to reference specific offsets within memory arenas. + + Parameters + ---------- + ctxt : NetworkContext + The network context to be updated. + memoryMap : Dict[str, List[List[MemoryBlock]]] + Memory allocation map containing blocks for each memory level. + + Returns + ------- + NetworkContext + Updated network context with static memory allocation. + + Notes + ----- + Creates memory arena buffers for each memory level and updates + individual buffer allocation templates to use offsets within these arenas. + """ maxAddr: Dict[str, int] = {} @@ -254,6 +362,47 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, return ctxt def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memoryLevel: str): + """Perform memory allocation using the MiniMalloc external tool. + + Interfaces with the external MiniMalloc memory allocator to compute + optimal memory allocation for the given memory blocks and constraints. + + Parameters + ---------- + memoryMap : List[MemoryBlock] + List of memory blocks to be allocated. + ctxt : NetworkContext + Network context containing buffer information. + nodeMemoryConstraint : Optional[NodeMemoryConstraint] + Memory constraints for the current node, if available. + capacity : int + Total memory capacity available for allocation. + memoryLevel : str + Name of the memory level being allocated. + + Returns + ------- + List[MemoryBlock] + Updated memory blocks with assigned address spaces. + + Raises + ------ + KeyError + If MINIMALLOC_INSTALL_DIR environment variable is not set. + subprocess.CalledProcessError + If the MiniMalloc tool fails to execute successfully. + + Notes + ----- + Requires the MiniMalloc tool to be installed and the MINIMALLOC_INSTALL_DIR + environment variable to be set to the installation directory. 
+ """ + + # MiniMalloc has no alignment flag, so allocate in units of byteAlignment: + # round sizes up and capacity down to alignment units, and scale offsets back to + # bytes. This guarantees every returned buffer offset is byteAlignment-aligned + # (required on Spatz so vector loads land on 64-bit boundaries). + alignment = MemoryScheduler.byteAlignment with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file: writer = csv.writer(file, lineterminator = "\n") @@ -276,11 +425,14 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor 8) * nodeMemoryConstraint.tensorMemoryConstraints[ memoryBlock.name].memoryConstraints[memoryLevel].multiBufferCoefficient + # Size in alignment units (rounded up) so offsets come back aligned. + _bufferSizeAligned = (int(_bufferSize) + alignment - 1) // alignment + writer.writerow([ memoryBlock.name, str(memoryBlock.lifetime[0]), str(memoryBlock.lifetime[1] + 1), - str(int(_bufferSize)) + str(_bufferSizeAligned) ]) try: @@ -289,8 +441,8 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor raise KeyError("MINIMALLOC_INSTALL_DIR symbol not found!") minimallocOutput = subprocess.run([ - f"{minimallocInstallDir}/minimalloc", f"--capacity={capacity}", f"--input={self._minimalloc_input}.csv", - f"--output={self._minimalloc_output}.csv" + f"{minimallocInstallDir}/minimalloc", f"--capacity={capacity // alignment}", + f"--input={self._minimalloc_input}.csv", f"--output={self._minimalloc_output}.csv" ], capture_output = True, text = True) @@ -307,11 +459,39 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor for row in reader: for memoryBlock in memoryMap: if memoryBlock.name == row[0]: - memoryBlock._addrSpace = (int(row[-1]), int(row[-1]) + int(row[-2])) + # Scale offset/size back from alignment units to bytes (offsets are + # therefore multiples of `alignment`).
+ memoryBlock._addrSpace = (int(row[-1]) * alignment, + (int(row[-1]) + int(row[-2])) * alignment) return memoryMap def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: + """Compute the optimal tiling schedule for the network. + + Solves the constraint optimization problem to find the best tiling + solution that satisfies memory and computational constraints. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing the computational graph and constraints. + + Returns + ------- + TilingSolution + The computed tiling solution with memory constraints for each pattern. + + Raises + ------ + AssertionError + If the tiler model or symbolic memory constraints are not initialized. + + Notes + ----- + This method requires that setupModel() has been called previously to + initialize the constraint model and symbolic memory constraints. + """ assert self.tilerModel is not None and self.symbolicMemoryConstraints is not None, "Set up the model before trying to compute a schedule!" collector = self.tilerModel.trySolveModel() tilingSolution = self._getTilingSolution(self.tilerModel, ctxt, collector, self.symbolicMemoryConstraints) @@ -323,6 +503,29 @@ def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: return tilingSolution def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution) -> MemoryMap: + """Compute memory allocation map from the tiling solution. + + Generates a concrete memory allocation map that assigns specific + memory addresses to each buffer based on the tiling solution. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + tilingSolution : TilingSolution + The computed tiling solution. + + Returns + ------- + MemoryMap + Dictionary mapping memory level names to lists of memory blocks + for each time step. 
+ + Notes + ----- + The memory allocation strategy (TetrisRandom, TetrisCo-Opt, or MiniMalloc) + determines how the actual memory addresses are assigned. + """ memoryMap = {} for key in self.innerMemoryScheduler.memoryMap.keys(): @@ -348,6 +551,30 @@ def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution) def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSolution, memoryMap: Dict) -> NetworkContext: + """Annotate memory constraints with actual address space allocations. + + Updates the memory constraints in the tiling solution with the actual + address spaces computed during memory allocation. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + tilingSolution : TilingSolution + The tiling solution to be annotated. + memoryMap : Dict[str, List[List[MemoryBlock]]] + Memory allocation map with assigned address spaces. + + Returns + ------- + NetworkContext + Updated network context (returned for consistency). + + Notes + ----- + This method modifies the tiling solution in-place by adding address + space information to memory constraints. + """ for idx, pattern in enumerate(tilingSolution): for nodeIdx, nodeConstraint in enumerate(pattern.nodeConstraints): for tensorConstraint in nodeConstraint.tensorMemoryConstraints.values(): @@ -373,6 +600,32 @@ def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSoluti def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NetworkContext: + """Set up the constraint optimization model for tiling. + + Initializes the tiler model with geometric constraints, memory constraints, + and optimization objectives based on the network schedule and layer bindings. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing the computational graph. 
+ schedule : Schedule + Execution schedule defining the order of operations. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + NetworkContext + The network context (returned for consistency). + + Notes + ----- + This method must be called before computeTilingSchedule() to initialize + the constraint model and symbolic memory constraints. + """ wrapSchedule: List[SubGraph] = [] for entry in schedule: @@ -396,6 +649,37 @@ def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: Ord # SCHEREMO: Return a integer factor or IntVar variable for the multi Buffer coefficient given the tiling path, hop and tensorName. def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], hop: str, tensorName: str) -> Union[int, IntVar]: + """Determine multi-buffering coefficient for a tensor in the tiling strategy. + + Computes the buffering factor (e.g., double buffering = 2) for a given tensor + based on its type and usage pattern in the computation graph. This coefficient + is used to determine how many copies of the tensor should be kept in memory. + + Parameters + ---------- + tilerModel : TilerModel, (unused) + The constraint solver model. + ctxt : NetworkContext + Network context containing buffer information. + pattern : SubGraph, (unused) + The computation pattern being analyzed. + path : List[str], (unused) + Memory hierarchy path for the tensor. + hop : str, (unused) + Current memory level in the path. + tensorName : str + Name of the tensor to analyze. + + Returns + ------- + Union[int, IntVar] + Buffering coefficient (typically 1 for transient buffers, 2 for others). 
+ + Notes + ----- + The multi-buffering strategy helps overlap computation with data movement + by maintaining multiple copies of buffers at different memory levels. + """ varBuffer = ctxt.lookup(tensorName) @@ -426,6 +710,30 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstraints, pattern: SubGraph, ctxt: NetworkContext) -> PatternMemoryConstraints: + """Propagate I/O buffer strategy across the tiling pattern. + + Implements static n-tuple buffering strategy by propagating border tensor + constraints across all steps in the tiling pattern. + + Parameters + ---------- + tileConstraintPattern : PatternMemoryConstraints + Memory constraints for the tiling pattern. + pattern : SubGraph + The computation subgraph being tiled. + ctxt : NetworkContext + Network context containing buffer information. + + Returns + ------- + PatternMemoryConstraints + Updated pattern memory constraints with propagated I/O buffer strategy. + + Notes + ----- + This method ensures that border tensors (inputs/outputs of the pattern) + maintain consistent memory allocation across all computation steps. + """ borderTensorStep = NodeMemoryConstraint() for patternStep in tileConstraintPattern.nodeConstraints: @@ -438,6 +746,37 @@ def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstrai def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector, tensorConstraint: TensorMemoryConstraint) -> TensorMemoryConstraint: + """Resolve symbolic tensor memory constraints to concrete values. + + Converts symbolic variables in tensor memory constraints to their + concrete values from the solver solution. + + Parameters + ---------- + tilerModel : TilerModel + The constraint solver model with the solution. + ctxt : NetworkContext + Network context containing buffer information. 
+ collector : SolutionCollector + Solution collector from the constraint solver. + tensorConstraint : TensorMemoryConstraint + Symbolic tensor memory constraint to resolve. + + Returns + ------- + TensorMemoryConstraint + Tensor memory constraint with resolved concrete values. + + Raises + ------ + AssertionError + If the tiler model is not initialized. + + Notes + ----- + This method extracts the actual buffer sizes and shapes from the + solved constraint model and creates concrete memory constraints. + """ assert self.tilerModel is not None, "Can't resolve tensor memory constraints, tilerModel is None!" tensorName = tensorConstraint.tensorName @@ -472,6 +811,32 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo def _getTilingSolution(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector, allConstraints: List[PatternMemoryConstraints]) -> List[PatternMemoryConstraints]: + """Extract tiling solution from the solved constraint model. + + Processes all pattern memory constraints and resolves symbolic variables + to create a concrete tiling solution. + + Parameters + ---------- + tilerModel : TilerModel + The solved constraint model. + ctxt : NetworkContext + Network context containing buffer information. + collector : SolutionCollector + Solution collector from the constraint solver. + allConstraints : List[PatternMemoryConstraints] + List of all symbolic pattern memory constraints. + + Returns + ------- + List[PatternMemoryConstraints] + Resolved tiling solution with concrete memory constraints. + + Notes + ----- + Only constraints that require resolution (multi-level or transient buffers) + are processed. Global single-level buffers are skipped. 
+ """ retList = [] @@ -502,6 +867,29 @@ def _checkResolve(ctxt, tensorName, tensorConstraint): def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph]) -> TilerModel: + """Set up tensor dimension product variables in the tiler model. + + Adds variables representing the number of elements in each tensor + to the constraint model for each pattern in the schedule. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + + Returns + ------- + TilerModel + Updated tiler model with tensor dimension variables. + + Notes + ----- + Only processes tensors that are marked for deployment in the context. + """ for idx, pattern in enumerate(schedule): subGraph = gs.Graph(nodes = pattern) @@ -517,6 +905,33 @@ def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkCon def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer]) -> TilerModel: + """Set up geometric constraints for each layer in the schedule. + + Adds geometric and policy constraints from each layer's tile constraint + specification to the tiler model. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + + Returns + ------- + TilerModel + Updated tiler model with geometric constraints. + + Notes + ----- + Each pattern is treated as a decoupled sub-problem with respect to + geometric constraints. Dimension variables are regenerated for each + tensor using the copyIdx mechanism. 
+ """ # SCHEREMO: Each pattern is a decoupled sub-problem w.r.t the geometric constraints. # We need to regenerate dimension variables for each tensor @@ -542,6 +957,30 @@ def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContex return tilerModel def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph]) -> TilerModel: + """Set up optimization heuristics for the tiler model. + + Adds optimization objectives to maximize memory usage efficiency + for each pattern in the schedule. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + + Returns + ------- + TilerModel + Updated tiler model with optimization objectives. + + Notes + ----- + Creates pattern-level memory size variables and adds maximization + objectives to encourage efficient memory utilization. + """ for idx, pattern in enumerate(schedule): @@ -556,7 +995,7 @@ def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedul patternMemSizeExpr: IntVar = 0 for tensor in patternTensorList: - if not ctxt.lookup(tensor.name)._deploy: + if not ctxt.lookup(tensor.name)._deploy or isinstance(ctxt.lookup(tensor.name), ConstantBuffer): continue patternMemSizeExpr += tilerModel.getTensorNumberOfEltVar( @@ -581,6 +1020,34 @@ def _setupMemoryConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> Tuple[TilerModel, List[PatternMemoryConstraints]]: + """Set up memory constraints for the tiling optimization. + + Generates memory constraints for both inner and outer memory levels, + considering the memory hierarchy and scheduling requirements. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. 
+ ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + Tuple[TilerModel, List[PatternMemoryConstraints]] + Updated tiler model and list of all memory constraints. + + Notes + ----- + Sets up both outer (inter-pattern) and inner (intra-pattern) memory + constraints, considering the chosen memory allocation strategy. + """ allMemoryConstraints = self._generateAllMemoryConstraints(tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping) @@ -621,6 +1088,34 @@ def _generateAllMemoryConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> List[PatternMemoryConstraints]: + """Generate all memory constraints combining dynamic and constant tensors. + + Creates comprehensive memory constraints by combining dynamic tensor + constraints with constant tensor constraints for each pattern. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + List[PatternMemoryConstraints] + Complete list of memory constraints for all patterns. + + Notes + ----- + Combines results from _generateMemoryConstraints to create the complete + constraint set including both variable and constant buffers. 
+ """ dynamicTensorConstraints, constantTensorConstraints = self._generateMemoryConstraints( tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping) @@ -641,6 +1136,39 @@ def _generateMemoryConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping ) -> Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint]: + """Generate memory constraints for variable and constant buffers. + + Creates detailed memory constraints including outer/inner variable + buffer constraints, tiled tensor constraints, and constant buffer + constraints. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint] + Tuple containing: + - List of pattern memory constraints for dynamic tensors + - Node memory constraint for constant buffers + + Notes + ----- + Generates three levels of constraints: + 1. First-level: global buffers + higher-level tensors + 2. Tiled tensor constraints with double buffering + 3. In-place tensor constraints for unkilled tensors + """ # SCHEREMO: Construct non-double-buffered constraints of local variable buffers @@ -703,6 +1231,38 @@ def _generateMemoryConstraints( def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext, tensorMemoryConstraint: TensorMemoryConstraint, pattern: SubGraph) -> TensorMemoryConstraint: + """Generate tiling path for a tensor across memory hierarchy levels. 
+ + Creates memory constraints for a tensor that needs to move between + different levels of the memory hierarchy, including multi-buffering. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + tensorMemoryConstraint : TensorMemoryConstraint + Original tensor memory constraint with multiple levels. + pattern : SubGraph + The computation pattern using this tensor. + + Returns + ------- + TensorMemoryConstraint + Updated tensor memory constraint with complete tiling path. + + Raises + ------ + AssertionError + If the tensor constraint doesn't have exactly 2 memory levels, + or if the multi-buffer factor is invalid. + + Notes + ----- + Uses breadth-first search to find the path between memory levels + and applies multi-buffering strategy at each intermediate level. + """ assert len(tensorMemoryConstraint.memoryConstraints.keys() ) == 2, "Can't generate a tile path for more than one hierarchy level!" @@ -736,6 +1296,34 @@ def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext, def _generateIntermediateTilingSteps(self, tilerModel: TilerModel, ctxt: NetworkContext, sourceStep: NodeMemoryConstraint, destinationStep: NodeMemoryConstraint, pattern: SubGraph) -> NodeMemoryConstraint: + """Generate intermediate tiling steps between source and destination constraints. + + Creates tiling constraints for tensors that need to move between different + memory levels within a computation pattern. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + sourceStep : NodeMemoryConstraint + Memory constraints for the source step. + destinationStep : NodeMemoryConstraint + Memory constraints for the destination step. + pattern : SubGraph + The computation pattern being analyzed. + + Returns + ------- + NodeMemoryConstraint + Memory constraints for intermediate tiling steps. 
+ + Notes + ----- + Identifies tensors that require tiling (those with multiple memory + constraints) and generates appropriate tiling paths for them. + """ tileConstraintStep = NodeMemoryConstraint() mergedStep = sourceStep + destinationStep @@ -755,6 +1343,39 @@ def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkCont sourceConstraints: List[PatternMemoryConstraints], destinationConstraints: List[PatternMemoryConstraints], schedule: List[SubGraph]) -> List[PatternMemoryConstraints]: + """Generate tiling path constraints for all patterns in the schedule. + + Creates comprehensive tiling constraints by combining source and destination + constraints for each pattern and applying I/O buffer strategies. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + sourceConstraints : List[PatternMemoryConstraints] + Source memory constraints for each pattern. + destinationConstraints : List[PatternMemoryConstraints] + Destination memory constraints for each pattern. + schedule : List[SubGraph] + List of computation patterns in the schedule. + + Returns + ------- + List[PatternMemoryConstraints] + Complete tiling path constraints for all patterns. + + Raises + ------ + AssertionError + If source pattern constraints are not single-step. + + Notes + ----- + Assumes source patterns are constant and single-step since they + represent tensors that are live throughout the pattern execution. + """ tileConstraints = [] @@ -781,6 +1402,26 @@ def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkCont return tileConstraints def _generateBufferConstraints(self, ctxt: NetworkContext) -> NodeMemoryConstraint: + """Generate memory constraints for constant global buffers. + + Creates memory constraints for all constant buffers that are marked + for deployment in the network context. 
+ + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + + Returns + ------- + NodeMemoryConstraint + Memory constraints for all constant global buffers. + + Notes + ----- + Only processes constant buffers with _deploy flag set to True. + Each buffer is treated as an input tensor in the constraints. + """ constantGlobalConstraint: NodeMemoryConstraint = NodeMemoryConstraint() constantGlobalBuffers = [ @@ -805,6 +1446,37 @@ def _generateVariableBufferConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping ) -> Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]]: + """Generate memory constraints for variable buffers using flow analysis. + + Performs liveness analysis on the computation graph to determine + memory requirements for variable buffers at different points in execution. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]] + Tuple containing: + - Outer memory constraints (inter-pattern) + - Inner memory constraints (intra-pattern) + + Notes + ----- + Uses graph flow analysis to compute liveness information and generates + both outer (pattern-level) and inner (step-level) memory constraints. + Includes transient buffer constraints for each computation step. 
+ """ def deltaFlow( patternFlow: List[GenericFlowState[TensorMemLevelTuple]]) -> GenericFlowState[TensorMemLevelTuple]: @@ -877,6 +1549,35 @@ def deltaFlow( def _generatePatternStepTransientBufferConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, layerBinding: OrderedDict[str, ONNXLayer], step: gs.Node, targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NodeMemoryConstraint: + """Generate memory constraints for transient buffers in a pattern step. + + Computes memory requirements for temporary buffers needed during + the execution of a single computation step. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + step : gs.Node + The computation node being analyzed. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + NodeMemoryConstraint + Memory constraints for transient buffers in this step. + + Notes + ----- + Transient buffers are assumed to be allocated in the same memory + level as the main input of the computation step. Buffer sizes are + computed using the layer template's transient buffer size calculation. + """ patternStepTransientBufferSizes = NodeMemoryConstraint() @@ -907,6 +1608,26 @@ def _generatePatternStepTransientBufferConstraints( return patternStepTransientBufferSizes def assertLayerWiseTiling(self, schedule: List[List[gs.Node]]) -> bool: + """Assert that the schedule uses layer-wise tiling (one node per pattern). + + Verifies that each pattern in the schedule contains exactly one node, + which is required for certain memory allocation strategies. + + Parameters + ---------- + schedule : List[List[gs.Node]] + The execution schedule to validate. + + Returns + ------- + bool + True if all patterns contain exactly one node, False otherwise. 
+ + Notes + ----- + Layer-wise tiling is required when using the MiniMalloc memory + allocation strategy. + """ for pattern in schedule: if len(pattern) > 1: return False @@ -914,12 +1635,55 @@ def assertLayerWiseTiling(self, schedule: List[List[gs.Node]]) -> bool: return True def assertUniformMemoryLevelAllocation(self, ctxt: NetworkContext, defaultMemoryLevel: str) -> bool: + """Assert that all local buffers are allocated to the default memory level. + + Verifies that all local buffers in the network context are assigned + to the specified default memory level. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + defaultMemoryLevel : str + Name of the default memory level to check against. + + Returns + ------- + bool + True if all local buffers use the default memory level, False otherwise. + + Notes + ----- + Uniform memory level allocation is required when using the MiniMalloc + memory allocation strategy. + """ for buffer in ctxt.localObjects.values(): if buffer._memoryLevel != defaultMemoryLevel: return False return True def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None: + """Test the correctness of a computed tiling solution. + + Validates that buffer sizes in the tiling solution are properly + aligned according to memory alignment requirements. + + Parameters + ---------- + tilingSolution : TilingSolution + The tiling solution to validate. + + Raises + ------ + AssertionError + If any buffer is not properly aligned or if multi-buffer + coefficients are not integers. + + Notes + ----- + Checks that all allocated buffers meet the byte alignment requirements + specified in MemoryScheduler.byteAlignment. 
+ """ # LMACAN: Assert buffer sizes are word aligned as per comment in MemoryScheduler.py:MemoryScheduler._buildCostVector() byteAlignment = MemoryScheduler.byteAlignment for patternMemoryConstraint in tilingSolution: @@ -934,6 +1698,32 @@ def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None: def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]], graph: gs.Graph, schedule: Schedule) -> None: + """Test the correctness of a computed memory map. + + Validates that the memory map correctly represents buffer lifetimes + and ensures all required buffers are alive when needed. + + Parameters + ---------- + memoryMap : Dict[str, List[List[MemoryBlock]]] + The memory map to validate. + graph : gs.Graph + The computation graph. + schedule : Schedule + The execution schedule. + + Raises + ------ + AssertionError + If output buffers are not alive until the end, input buffers + are not alive at the beginning, or required buffers are not + alive during computation steps. + + Notes + ----- + Performs comprehensive validation of buffer lifetimes to ensure + the memory map is consistent with the computation requirements. + """ memoryBlockMap = { memoryBlock.name: memoryBlock for levelMemoryMap in memoryMap.values() for memoryBlock in levelMemoryMap[-1] @@ -960,12 +1750,52 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]] class TilerDeployerWrapper(NetworkDeployerWrapper): + """Wrapper for network deployers that adds tiling capabilities. + + Extends NetworkDeployerWrapper to provide automatic tiling and memory + management for neural network deployment on memory-constrained hardware. + + Parameters + ---------- + deployer : Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper] + The base deployer to wrap with tiling capabilities. + tilerCls : Type[Tiler], optional + The tiler class to use, by default Tiler. 
+ + Attributes + ---------- + tiler : Tiler + The tiler instance used for memory optimization. + + Raises + ------ + AssertionError + If the platform is not a MemoryPlatform or MemoryPlatformWrapper. + + Notes + ----- + The wrapper automatically handles tiling setup, constraint solving, + and memory allocation during the binding process. + """ def __init__(self, deployer: Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper], tilerCls: Type[Tiler] = Tiler, testName: Optional[str] = None, workDir: Optional[str] = None): + """Initialize the tiler deployer wrapper. + + Parameters + ---------- + deployer : Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper] + The base deployer to wrap. + tilerCls : Type[Tiler], optional + The tiler class to instantiate, by default Tiler. + testName : Optional[str], optional + Optional name for the test case, used for file naming. Defaults to None. + workDir : Optional[str], optional + Optional working directory for temporary files. Defaults to None. + """ super().__init__(deployer) assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" @@ -973,9 +1803,56 @@ def __init__(self, @property def worstCaseBufferSize(self): + """Get the worst-case buffer sizes reported by the wrapped tiler. + + Forwards the tiler's worst-case memory requirements for each + memory level without modification. + + Returns + ------- + Dict[str, int] + Dictionary mapping memory level names to their worst-case + buffer sizes in bytes. + + Notes + ----- + Delegates to ``self.tiler.worstCaseBufferSize``; as written here, no + input or output buffer sizes are added on top of the tiler's result. + """ return self.tiler.worstCaseBufferSize def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optional[MemoryMap] = None): + """Perform tiling and memory allocation for the network.
+ + Executes the complete tiling process including constraint setup, + optimization, memory allocation, and code generation updates. + + Parameters + ---------- + tilingSolution : Optional[TilingSolution], optional + Pre-computed tiling solution to use instead of computing one. + If None, the solution will be computed automatically. + memoryMap : Optional[MemoryMap], optional + Pre-computed memory map to use instead of computing one. + If None, the memory map will be computed automatically. + + Raises + ------ + AssertionError + If only one of tilingSolution or memoryMap is provided, + if MiniMalloc is used with non-layer-wise tiling, + or if tensors are not uniformly allocated when using MiniMalloc. + + Notes + ----- + When using MiniMalloc memory allocation strategy, additional + constraints apply: + - Only layer-wise execution is supported + - All tensors must be in the default memory level + + The method performs validation of the computed solutions and + updates the execution blocks with tiling information. + """ assert (tilingSolution is None and memoryMap is None) or (tilingSolution is not None and memoryMap is not None), \ "You need to provide both the manual tilingSolution and the memoryMap to override tiling." @@ -1022,6 +1899,21 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optio # SCHEREMO: Code generation STUB def bind(self): + """Bind the network with automatic tiling. + + Performs the complete binding process including layer binding + and automatic tiling optimization. + + Returns + ------- + bool + True if binding was successful, False otherwise. + + Notes + ----- + Calls the parent bind() method first, then performs tiling + if the initial binding was successful. 
+ """ if not super().bind(): return False @@ -1046,9 +1938,35 @@ def _printMemorySummary(self): def TilingReadyNodeBindings(nodeBindings: List[NodeBinding], tileConstraint: TileConstraint) -> List[NodeBinding]: - ''' - Apply the TillingReadyNodeTemplate to the template of each NodeBinding. - ''' + """Apply tiling constraints to a list of node bindings. + + Creates deep copies of the provided node bindings and attaches the + specified tile constraint to each binding's template. + + Parameters + ---------- + nodeBindings : List[NodeBinding] + List of node bindings to make tiling-ready. + tileConstraint : TileConstraint + The tile constraint to attach to each binding. + + Returns + ------- + List[NodeBinding] + List of node bindings with tiling constraints attached. + + Notes + ----- + The function creates deep copies to avoid modifying the original + node bindings. Each template in the copied bindings gets the + tileConstraint attribute set. + + Examples + -------- + >>> bindings = [binding1, binding2, binding3] + >>> constraint = MyTileConstraint() + >>> tiling_bindings = TilingReadyNodeBindings(bindings, constraint) + """ nodeBindingsCopy = copy.deepcopy(nodeBindings) #.copy() for binding in nodeBindingsCopy: binding.template.tileConstraint = tileConstraint diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py index db83974f0c..080211270b 100644 --- a/Deeploy/TilingExtension/TilerModel.py +++ b/Deeploy/TilingExtension/TilerModel.py @@ -10,6 +10,7 @@ import numpy as np from ortools.constraint_solver.pywrapcp import IntExpr, IntVar, SolutionCollector, Solver +from Deeploy.DeeployTypes import ConstantBuffer from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel @@ -170,6 +171,10 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId tensor = ctxt.lookup(tensorName) + # 
Skip constant buffers: they don't participate in tiling and don't need num_elements variables + if isinstance(tensor, ConstantBuffer): + return + tensorDimProductExpr = 1 for idx, _ in enumerate([ diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 0974fa337b..945aaa971d 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -16,18 +16,94 @@ @dataclass class MemoryTransfer(): + """ + Represents a memory transfer operation between two memory levels. + + This dataclass encapsulates the source and destination memory constraints + for a memory transfer operation in the tiling system, defining where data + is transferred from and to in the memory hierarchy. + + Attributes + ---------- + source : MemoryConstraint + The source memory constraint defining the memory level data is + transferred from. + destination : MemoryConstraint + The destination memory constraint defining the memory level data is + transferred to. + + Notes + ----- + This class is used in conjunction with memory hierarchies to define + data movement patterns during tiled neural network execution. + """ source: MemoryConstraint destination: MemoryConstraint @dataclass class HyperRectangle(): + """ + Represents a multi-dimensional rectangular region in tensor space. + + A HyperRectangle defines a rectangular tile or region within a + multi-dimensional tensor, specified by its position (offset) and + dimensions (size) in each axis. This is fundamental for tiled + processing of tensors where operations are performed on smaller + rectangular chunks. + + Attributes + ---------- + offset : Tuple[int, ...] + Position of the hyperrectangle in feature map space. Each element + represents the starting index along the corresponding dimension. + dims : Tuple[int, ...] + Size of the hyperrectangle along each dimension. Each element + represents the extent of the rectangle in the corresponding dimension. 
+ + Parameters + ---------- + offset : Tuple[int, ...] + Starting position of the rectangle in multi-dimensional space. + dims : Tuple[int, ...] + Dimensions/size of the rectangle in multi-dimensional space. + + Raises + ------ + AssertionError + If the offset and dims tuples have different lengths. + + Notes + ----- + The offset and dims must have the same rank (number of dimensions). + This ensures the hyperrectangle is well-defined in the tensor space. + + Examples + -------- + >>> rect = HyperRectangle((0, 5), (10, 15)) + >>> # Creates a 2D rectangle starting at (0,5) with size 10x15 + """ # position of the hyperrectangle in feature map space offset: Tuple[int, ...] # size of the hyperrectangle dims: Tuple[int, ...] def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]): + """ + Initialize a HyperRectangle with given offset and dimensions. + + Parameters + ---------- + offset : Tuple[int, ...] + Starting position of the rectangle in multi-dimensional space. + dims : Tuple[int, ...] + Dimensions/size of the rectangle in multi-dimensional space. + + Raises + ------ + AssertionError + If offset and dims have mismatching dimensions. + """ assert len(offset) == len( dims), f"HyperRectangle offset and dims for mismatching dimensions {offset} and {dims}" @@ -37,10 +113,58 @@ def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]): @dataclass class AbsoluteHyperRectangle: + """ + Represents a HyperRectangle with an absolute offset in memory space. + + This class combines a HyperRectangle with an absolute memory offset, + providing both the logical tensor coordinates and the physical memory + location. This is useful for tracking tiles that have been positioned + in specific memory locations during tiling operations. + + Attributes + ---------- + rectangle : HyperRectangle + The hyperrectangle defining the logical tensor region. + absoluteOffset : Tuple[int, ...] + The absolute offset in memory space where this rectangle is located. 
+ + Parameters + ---------- + rectangle : HyperRectangle + The hyperrectangle to associate with the absolute offset. + absoluteOffset : Tuple[int, ...] + The absolute position in memory space. + + Raises + ------ + AssertionError + If the absoluteOffset and rectangle.offset have mismatching dimensions. + + Notes + ----- + The absoluteOffset must have the same dimensionality as the rectangle's + offset to ensure consistent coordinate mapping between logical and physical + memory spaces. + """ rectangle: HyperRectangle absoluteOffset: Tuple[int, ...] def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]): + """ + Initialize an AbsoluteHyperRectangle with rectangle and absolute offset. + + Parameters + ---------- + rectangle : HyperRectangle + The hyperrectangle defining the logical tensor region. + absoluteOffset : Tuple[int, ...] + The absolute position in memory space. + + Raises + ------ + AssertionError + If absoluteOffset and rectangle.offset have mismatching dimensions. + """ assert len(absoluteOffset) == len( rectangle.offset ), f"AsoluteHyperRectangle's absoluteOffset and rectangle's offset for mismatching dimensions {absoluteOffset} and {rectangle.offset}" @@ -51,6 +175,46 @@ def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]): @dataclass class TilingSchedule(): + """ + Represents a complete schedule for tiled execution of neural network operations. + + A TilingSchedule defines how data should be loaded, processed, and stored + during tiled execution. It specifies the memory offsets for input and output + tensors, as well as the hyperrectangles that define which regions of data + are processed in each tiling step. + + Attributes + ---------- + inputBaseOffsets : Dict[str, List[int]] + Dictionary mapping tensor names to lists of base memory offsets for + input tiles. Each list should have length equal to the number of tiles. 
+ outputBaseOffsets : Dict[str, List[int]] + Dictionary mapping tensor names to lists of base memory offsets for + output tiles. Each list should have length equal to the number of tiles. + inputLoadSchedule : List[Dict[str, HyperRectangle]] + List of dictionaries, one per tile, mapping tensor names to the + hyperrectangles that should be loaded as input for that tile. + outputLoadSchedule : List[Dict[str, HyperRectangle]] + List of dictionaries, one per tile, mapping tensor names to the + hyperrectangles that should be stored as output for that tile. + + Parameters + ---------- + inputBaseOffsets : Dict[str, List[int]] + Input tensor base offsets for each tile. + outputBaseOffsets : Dict[str, List[int]] + Output tensor base offsets for each tile. + inputLoadSchedule : List[Dict[str, HyperRectangle]] + Input loading schedule for each tile. + outputLoadSchedule : List[Dict[str, HyperRectangle]] + Output storing schedule for each tile. + + Notes + ----- + The lengths of inputLoadSchedule and outputLoadSchedule should typically + be equal, representing the same number of tiles. Each schedule step + corresponds to processing one tile of the operation. + """ # the places to store input tiles # Should have length numTiles inputBaseOffsets: Dict[str, List[int]] @@ -70,6 +234,27 @@ class TilingSchedule(): def __init__(self, inputBaseOffsets: Dict[str, List[int]], outputBaseOffsets: Dict[str, List[int]], inputLoadSchedule: List[Dict[str, HyperRectangle]], outputLoadSchedule: List[Dict[str, HyperRectangle]]): + """ + Initialize a TilingSchedule with specified offsets and load schedules. + + Parameters + ---------- + inputBaseOffsets : Dict[str, List[int]] + Input tensor base offsets for each tile. + outputBaseOffsets : Dict[str, List[int]] + Output tensor base offsets for each tile. + inputLoadSchedule : List[Dict[str, HyperRectangle]] + Input loading schedule for each tile. + outputLoadSchedule : List[Dict[str, HyperRectangle]] + Output storing schedule for each tile. 
+ + Raises + ------ + AssertionError + If any key from inputBaseOffsets is missing from a schedule step + in inputLoadSchedule, or if any key from outputBaseOffsets is + missing from a schedule step in outputLoadSchedule. + """ # assert len(inputLoadSchedule) == len(outputLoadSchedule), "Didn't get equal amount of input and output tiles!" @@ -100,6 +285,30 @@ def __repr__(self) -> str: return outStr def __add__(self, other: TilingSchedule) -> TilingSchedule: + """ + Concatenate two TilingSchedule objects. + + Combines this tiling schedule with another by concatenating their + load schedules while maintaining the same base offsets. This is + useful for creating composite tiling schedules from multiple stages. + + Parameters + ---------- + other : TilingSchedule + The other TilingSchedule to concatenate with this one. + + Returns + ------- + TilingSchedule + A new TilingSchedule containing the concatenated load schedules + from both input schedules. + + Raises + ------ + AssertionError + If the other object is not a TilingSchedule, or if the tensor + keys don't match between the two schedules. + """ assert isinstance(other, TilingSchedule), f"Other {other} is not a TilingSchedule" @@ -124,10 +333,60 @@ def __add__(self, other: TilingSchedule) -> TilingSchedule: @dataclass class VariableReplacementScheme(): + """ + Defines how variables should be replaced with tile-specific values. + + This class manages the replacement of scalar variables with arrays of + tile-specific values during tiled execution. It tracks both the per-tile + replacement values and the corresponding data types for each variable. + + Attributes + ---------- + perTileReplacements : Dict[str, List] + Dictionary mapping variable names to lists of replacement values, + one value per tile. Each list should have length equal to the + number of tiles. + replacementTypes : Dict[str, Type[Pointer]] + Dictionary mapping variable names to their corresponding pointer + types for the replacement arrays. 
+ + Parameters + ---------- + perTileReplacements : Dict[str, List] + Per-tile replacement values for each variable. + replacementTypes : Dict[str, Type[Pointer]] + Type information for each replacement variable. + + Raises + ------ + AssertionError + If the keys in perTileReplacements and replacementTypes don't match + exactly, or if they have different numbers of entries. + + Notes + ----- + This scheme is used to replace compile-time constants with runtime + arrays during tiled execution, enabling different values for each tile. + """ perTileReplacements: Dict[str, List] replacementTypes: Dict[str, Type[Pointer]] def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[str, Type[Pointer]]): + """ + Initialize a VariableReplacementScheme with replacements and types. + + Parameters + ---------- + perTileReplacements : Dict[str, List] + Per-tile replacement values for each variable. + replacementTypes : Dict[str, Type[Pointer]] + Type information for each replacement variable. + + Raises + ------ + AssertionError + If the keys don't match exactly or have different counts. + """ assert len(perTileReplacements.keys()) == len( replacementTypes.keys()), "Exactly all replacements must have one type" @@ -138,6 +397,29 @@ def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[ self.replacementTypes = replacementTypes def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme: + """ + Concatenate two VariableReplacementScheme objects. + + Combines this replacement scheme with another by concatenating their + per-tile replacement lists. This is useful for merging replacement + schemes from multiple tiling stages. + + Parameters + ---------- + other : VariableReplacementScheme + The other VariableReplacementScheme to concatenate with this one. + + Returns + ------- + VariableReplacementScheme + A new VariableReplacementScheme with concatenated replacement lists. 
+ + Raises + ------ + AssertionError + If the other object is not a VariableReplacementScheme, or if + the variable keys don't match between the two schemes. + """ assert isinstance(other, VariableReplacementScheme), f"Other {other} is not a VariableReplacementScheme" @@ -161,6 +443,33 @@ def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme def minimizeVariableReplacement( scheme: VariableReplacementScheme, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, Dict]: + """ + Optimize a variable replacement scheme by eliminating constant replacements. + + Analyzes the replacement scheme and removes variables that have the same + value across all tiles, directly setting them in the operator representation + instead. This optimization reduces memory usage and improves performance. + + Parameters + ---------- + scheme : VariableReplacementScheme + The original variable replacement scheme to optimize. + operatorRepresentation : OperatorRepresentation + The operator representation that will be updated with constant values. + + Returns + ------- + Tuple[VariableReplacementScheme, Dict] + A tuple containing: + - The minimized VariableReplacementScheme with only non-constant variables + - A dictionary of updates to apply to the operator representation + + Notes + ----- + Variables with identical values across all tiles are considered constants + and are removed from the replacement scheme. Their single value is set + directly in the operator representation. + """ newPerTileRep = {} newRepTypes = {} @@ -175,6 +484,48 @@ def minimizeVariableReplacement( def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + """ + Minimize a hyperrectangle by collapsing dimensions where possible. + + Reduces the dimensionality of a hyperrectangle by merging consecutive + dimensions where the rectangle spans the entire reference shape. 
This + optimization is useful for memory transfers and reduces complexity. + + Parameters + ---------- + rect : HyperRectangle + The hyperrectangle to minimize. + referenceShape : Sequence[int] + The shape of the reference tensor that the rectangle is within. + + Returns + ------- + Tuple[HyperRectangle, Tuple[int, ...]] + A tuple containing: + - The minimized HyperRectangle with collapsed dimensions + - The corresponding minimized reference shape + + Raises + ------ + AssertionError + If the rectangle offset is non-zero when dimensions match the + reference shape (indicating the rectangle spans the full dimension). + + Notes + ----- + Dimensions are collapsed from right to left. When a rectangle dimension + equals the reference dimension and has zero offset, it can be merged + with adjacent dimensions to reduce the overall rank. + + Example + ------- + >>> rect = HyperRectangle((0, 0), (2, 2)) + >>> minimizeRectangle(rect, (4, 4)) + (HyperRectangle(offset=(0, 0), dims=(2, 2)), (2, 4)) + >>> rect = HyperRectangle((0, 0), (2, 2)) + >>> minimizeRectangle(rect, (4, 2)) + (HyperRectangle(offset=(0,), dims=(4,)), (8,)) + """ minRectShape: List[int] = [] minRectOffset: List[int] = [] minReferenceShape: List[int] = [] @@ -200,6 +551,37 @@ def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tu def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]: + """ + Pad a shape tuple to a target rank by prepending ones. + + Extends a shape tuple to a higher dimensionality by adding leading + dimensions of size 1. This is useful for broadcasting operations + and ensuring consistent tensor ranks. + + Parameters + ---------- + shape : Tuple[int, ...] + The original shape tuple to pad. + rank : int + The target rank (number of dimensions) for the padded shape. + + Returns + ------- + Tuple[int, ...] + The padded shape tuple with leading dimensions of size 1. 
+ + Raises + ------ + AssertionError + If the target rank is smaller than the current shape's rank. + + Examples + -------- + >>> padShape((3, 4), 4) + (1, 1, 3, 4) + >>> padShape((5,), 3) + (1, 1, 5) + """ assert rank >= len( shape), f"Cannot pad to rank smaller then shape's. Received rank: {rank}, shape rank: {len(shape)}" ret = tuple([1] * (rank - len(shape))) + shape @@ -208,6 +590,37 @@ def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]: def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]: + """ + Pad an offset tuple to a target rank by prepending zeros. + + Extends an offset tuple to a higher dimensionality by adding leading + offset values of 0. This ensures offset tuples match the rank of + their corresponding shapes. + + Parameters + ---------- + offset : Tuple[int, ...] + The original offset tuple to pad. + rank : int + The target rank (number of dimensions) for the padded offset. + + Returns + ------- + Tuple[int, ...] + The padded offset tuple with leading zeros. + + Raises + ------ + AssertionError + If the target rank is smaller than the current offset's rank. + + Examples + -------- + >>> padOffset((2, 3), 4) + (0, 0, 2, 3) + >>> padOffset((5,), 3) + (0, 0, 5) + """ assert rank >= len( offset), f"Cannot pad to rank smaller then offset's. Received rank: {rank}, offset rank: {len(offset)}" ret = tuple([0] * (rank - len(offset))) + offset @@ -216,6 +629,39 @@ def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]: def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[int, ...]: + """ + Pad a stride tuple to a target rank by prepending a specified stride value. + + Extends a stride tuple to a higher dimensionality by adding leading + stride values. This is useful for maintaining consistent stride + calculations across different tensor ranks. + + Parameters + ---------- + stride : Tuple[int, ...] + The original stride tuple to pad. 
+ rank : int + The target rank (number of dimensions) for the padded stride. + paddingStride : int + The stride value to use for padding (prepended dimensions). + + Returns + ------- + Tuple[int, ...] + The padded stride tuple with leading padding stride values. + + Raises + ------ + AssertionError + If the target rank is smaller than the current stride's rank. + + Examples + -------- + >>> padStride((4, 1), 4, 16) + (16, 16, 4, 1) + >>> padStride((1,), 3, 8) + (8, 8, 1) + """ assert rank >= len( stride), f"Cannot pad to rank smaller then stride's. Received rank: {rank}, stride rank: {len(stride)}" ret = tuple([paddingStride] * (rank - len(stride))) + stride @@ -224,6 +670,36 @@ def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[i def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]: + """ + Calculate memory strides from a tensor shape. + + Computes the stride values for each dimension of a tensor based on its + shape. Strides represent the number of elements to skip in memory when + moving one position along each dimension. + + Parameters + ---------- + shape : Sequence[int] + The shape of the tensor as a sequence of dimension sizes. + + Returns + ------- + Tuple[int, ...] + The stride values for each dimension, where the last dimension + has stride 1 and earlier dimensions have progressively larger strides. + + Notes + ----- + Strides are computed assuming row-major (C-style) memory layout. + The stride for dimension i is the product of all dimensions after i. + + Examples + -------- + >>> stridesFromShape([2, 3, 4]) + (12, 4, 1) + >>> stridesFromShape([5, 6]) + (6, 1) + """ strides = [1] * len(shape) for idx, dim in enumerate(reversed(shape[1:])): strides[idx + 1] = strides[idx] * dim @@ -231,18 +707,114 @@ def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]: def calculateFlatOffset(offsets: Sequence[int], strides: Sequence[int]) -> int: + """ + Calculate the flat memory offset from multi-dimensional coordinates. 
+ + Converts multi-dimensional tensor coordinates (offsets) to a single + flat memory offset using the provided stride information. This is + essential for translating tensor indices to memory addresses. + + Parameters + ---------- + offsets : Sequence[int] + The multi-dimensional coordinates/offsets in each dimension. + strides : Sequence[int] + The stride values for each dimension. + + Returns + ------- + int + The flat memory offset corresponding to the multi-dimensional position. + + Raises + ------ + AssertionError + If offsets and strides have different numbers of dimensions. + + Notes + ----- + The flat offset is computed as the sum of (offset[i] * stride[i]) + for all dimensions i. + + Examples + -------- + >>> calculateFlatOffset([1, 2, 3], [12, 4, 1]) + 23 + >>> calculateFlatOffset([0, 1], [6, 1]) + 1 + """ assert len(offsets) == len(strides), \ f"Offsets and strides have to have the same number of dimensions. Length offsets: {len(offsets)}, strides: {len(strides)}" return sum(offset * stride for offset, stride in zip(offsets, strides)) def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBuffer) -> int: + """ + Calculate the flat memory offset in bytes for a hyperrectangle tile. + + Computes the byte offset in memory for the starting position of a + hyperrectangle tile within a reference buffer. This accounts for + both the multi-dimensional positioning and the data type size. + + Parameters + ---------- + tile : HyperRectangle + The hyperrectangle tile whose offset should be calculated. + referenceBuffer : VariableBuffer + The reference buffer containing the tile, used for shape and type info. + + Returns + ------- + int + The flat memory offset in bytes from the buffer start to the tile start. + + Notes + ----- + The calculation combines multi-dimensional offset computation with + data type width to produce a byte-level memory offset. 
+ """ return int( calculateFlatOffset(tile.offset, stridesFromShape(referenceBuffer.shape)) * (referenceBuffer._type.referencedType.typeWidth // 8)) def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]: + """ + Compute hyperrectangle tiles for a memory transfer operation. + + Generates a list of hyperrectangle tiles that partition the source tensor + into smaller chunks that fit within the destination memory constraints. + This is fundamental for tiled execution where large tensors are processed + in smaller, memory-efficient pieces. + + Parameters + ---------- + memoryTransfer : MemoryTransfer + The memory transfer operation defining source and destination constraints. + + Returns + ------- + List[HyperRectangle] + A list of hyperrectangle tiles that cover the entire source tensor, + each fitting within the destination memory constraints. + + Raises + ------ + AssertionError + If source or destination shapes are undefined, if they have different + numbers of dimensions, or if any destination dimension is larger than + the corresponding source dimension. + + Notes + ----- + The tiling algorithm generates non-overlapping tiles that completely + cover the source tensor. Each tile is sized to fit within the destination + memory constraints, with edge tiles potentially being smaller to fit + exactly within the source tensor boundaries. + + The tiles are generated in row-major order, iterating through dimensions + from outermost to innermost. + """ assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!" assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!" @@ -256,6 +828,19 @@ def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRect assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. 
({dimSizeSmall} > {dimSizeLarge})" def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: + """ + Generate tile indices in row-major order. + + Parameters + ---------- + tileIndexEnd : List[int] + The end index for each dimension (exclusive). + + Yields + ------ + List[int] + Successive tile indices covering the entire index space. + """ tileCount = np.prod(tileIndexEnd) tileIndex = [0] * len(tileIndexEnd) for _ in range(tileCount): diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..71f632cbd2 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -50,6 +50,8 @@ elseif(DEEPLOY_ARCH STREQUAL SNITCH) add_subdirectory(Platforms/Snitch) elseif(DEEPLOY_ARCH STREQUAL CHIMERA) add_subdirectory(Platforms/Chimera) +elseif(DEEPLOY_ARCH STREQUAL SPATZ) + add_subdirectory(Platforms/Spatz) elseif(platform STREQUAL GAP9) # Search for hex files generated by Python code generator diff --git a/DeeployTest/Platforms/Spatz/CMakeLists.txt b/DeeployTest/Platforms/Spatz/CMakeLists.txt new file mode 100644 index 0000000000..6af333af2b --- /dev/null +++ b/DeeployTest/Platforms/Spatz/CMakeLists.txt @@ -0,0 +1,26 @@ +# Build the Deeploy test executable for the Spatz platform. +set(ProjectId ${TESTNAME}) + +# Explicit source list: a recursive glob would also match any main.c in +# subdirectories and is not re-evaluated when files are added. +set(SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/main.c +) + +# Benchmark support code taken from the Spatz checkout at SPATZ_HOME. +list(APPEND SOURCES + ${SPATZ_HOME}/sw/spatzBenchmarks/benchmark/benchmark.c +) + +add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES}) + +set(SPATZ_BENCHMARK_INCLUDE_DIR + ${SPATZ_HOME}/sw/spatzBenchmarks/include +) +target_include_directories(${ProjectId} PRIVATE ${SPATZ_BENCHMARK_INCLUDE_DIR}) +target_include_directories(network PUBLIC ${SPATZ_BENCHMARK_INCLUDE_DIR}) + +target_link_libraries(${ProjectId} PRIVATE network deeploylib) + +add_spatz_gvsoc_emulation(${ProjectId} "spatz_v2") +add_spatz_vsim_simulation(${ProjectId}) diff --git a/DeeployTest/Platforms/Spatz/main.c b/DeeployTest/Platforms/Spatz/main.c new file mode 100644 index
0000000000..4a413b48ed --- /dev/null +++ b/DeeployTest/Platforms/Spatz/main.c @@ -0,0 +1,105 @@ + +#include +#include +#include +#include "printf.h" + +#include "Network.h" +#include "testinputs.h" +#include "testoutputs.h" + +#ifndef DEEPLOY_ZERO_COPY_TEST_INPUTS +#define DEEPLOY_ZERO_COPY_TEST_INPUTS 1 +#endif + +// Optional: some generated networks provide this helper to avoid copying +// test inputs into Deeploy-owned buffers. +#ifndef DEEPLOYNETWORK_HAS_BIND_EXTERNAL_INPUTS +void DeeployNetwork_BindExternalInputs(void **external_inputs) __attribute__((weak)); +#endif + + +int main() { + const unsigned int core_id = snrt_cluster_core_idx(); + unsigned int timer_start, timer_end, timer; + + if (core_id == 0) printf("[INFO] Running on %d cores\n", snrt_cluster_core_num()); + if (snrt_is_dm_core()){printf("[INFO] DM core is core number %d\n", core_id);} + snrt_cluster_hw_barrier(); + + // do it only with one of the two spatz cores + if (snrt_is_dm_core()){ + printf("Initializing network...\r\n"); + InitNetwork(0, 1); + + // printf("Copying inputs to l3 buffer...\r\n"); +#if DEEPLOY_ZERO_COPY_TEST_INPUTS + if (DeeployNetwork_BindExternalInputs) { + DeeployNetwork_BindExternalInputs(testInputVector); + } else { + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); + } + } +#else + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); + } +#endif + + printf("Running network...\r\n"); + } + snrt_cluster_hw_barrier(); + + if (snrt_is_dm_core()){ timer_start = benchmark_get_cycle(); } + RunNetwork(core_id, 2); + + snrt_cluster_hw_barrier(); + + if (snrt_is_dm_core()){ + timer_end = benchmark_get_cycle(); + timer = timer_end - timer_start; + + printf("Network ran in %d cycles.\r\nChecking Outputs...\r\n", timer); + int32_t tot_err = 0; + uint32_t tot = 0; + 
OUTPUTTYPE diff; + OUTPUTTYPE expected, actual; + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + tot += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i]; + diff = expected - actual; + +#if ISOUTPUTFLOAT == 1 + // RUNWANG: Allow margin of error for float32_t + // MATTIA: if diff is a quiet nan 0x7FC00000 we want to error + if ((diff < -1e-4f) || (diff > 1e-4f) || *(uint32_t*)&diff == 0x7FC00000) { + tot_err += 1; + // printf("Expected: %f Actual: %f Diff: %f at Index %12u in Output %u\r\n", expected, actual, diff, i, buf); + printf("Expected: 0x%08x Actual: 0x%08x Diff: 0x%08x at Index %4u in Output %u\r\n", *(uint32_t*)&expected, *(uint32_t*)&actual, *(uint32_t*)&diff, i, buf); + } +#else + // RUNWANG: No margin for integer comparison + if (diff != 0) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } +#endif + } + } + + printf("Errors: %d out of %d \r\n", tot_err, tot); + } + + printf("core %d arrived at the end\r\n", core_id); + snrt_cluster_hw_barrier(); + printf("We are after hw barrier\r\n"); + + return 0; +} diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz new file mode 100644 index 0000000000..eb073685c7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx new file mode 100644 index 0000000000..c20c89bd05 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz new 
file mode 100644 index 0000000000..ed786d2e1d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Gather/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz new file mode 100644 index 0000000000..afc11e34d7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx new file mode 100644 index 0000000000..94e265be97 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/network.onnx @@ -0,0 +1,13 @@ +pytorch2.7.0:^ +& +VA/Softmax"Softmax* +axis  +main_graphZ +V +  + +b +A +  + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz new file mode 100644 index 0000000000..f5f6daea15 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/inputs.npz new file mode 100644 index 0000000000..cf71086f0a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/network.onnx new file mode 100644 index 0000000000..25a9df5ce5 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/outputs.npz new file mode 100644 index 0000000000..6fb79e45c5 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopK/TopK128L2048/outputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/network.onnx new file mode 100644 index 0000000000..d50f25b9a9 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/network.onnx @@ -0,0 +1,35 @@ +pytorch2.7.0:Í +) +Q +K/MatMul_output_0/MatMul"MatMul +E +/MatMul_output_0/Softmax_output_0/Softmax"Softmax* +axis  +, +/Softmax_output_0 +VA /MatMul_1"MatMul +main_graphZ +Q +  + +@Z +K +  +@ +€Z +V +  +€ +@b +A +  + +@j# +/MatMul_output_0 +  + +€j$ +/Softmax_output_0 +  + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/outputs.npz new file mode 100644 index 0000000000..15750ef660 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/DenseAttention_1.64.2048/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/network.onnx new file mode 100644 index 0000000000..3d8856b104 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/outputs.npz new file mode 100644 index 0000000000..f2d40944bd Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k10/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/network.onnx new file mode 100644 index 0000000000..1cfad6347b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/outputs.npz new file mode 100644 index 0000000000..7ff1584247 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k128/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/network.onnx 
b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/network.onnx new file mode 100644 index 0000000000..10882f4c35 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/outputs.npz new file mode 100644 index 0000000000..7ff1584247 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k256/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/network.onnx new file mode 100644 index 0000000000..e0a9db3b16 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/outputs.npz new file mode 100644 index 0000000000..2a25c8d109 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k32/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/network.onnx new file mode 100644 index 0000000000..5ecf54f7e7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/outputs.npz new file mode 100644 index 0000000000..7ff1584247 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k512/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/inputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/inputs.npz new file mode 100644 index 0000000000..9c1eb8c9be Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/network.onnx b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/network.onnx new file mode 100644 index 0000000000..ce16e85054 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/outputs.npz b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/outputs.npz new file mode 100644 index 0000000000..76aacbb6e0 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/TopKAttention/TopKAttention_1.64.2048_k64/outputs.npz differ diff --git a/DeeployTest/deeployRunner_spatz.py b/DeeployTest/deeployRunner_spatz.py new file mode 100644 index 
0000000000..5404defc13 --- /dev/null +++ b/DeeployTest/deeployRunner_spatz.py @@ -0,0 +1,12 @@ +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit( + main( + default_platform = "Spatz", + default_simulator = "gvsoc", + tiling_enabled = False, + ) + ) diff --git a/DeeployTest/deeployRunner_tiled_spatz.py b/DeeployTest/deeployRunner_tiled_spatz.py new file mode 100644 index 0000000000..6900d7010e --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_spatz.py @@ -0,0 +1,12 @@ +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit( + main( + default_platform = "Spatz", + default_simulator = "gvsoc", + tiling_enabled = True, + ) + ) diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..ec9fb3a7ce 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -123,6 +123,7 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.doublebuffer: assert args.defaultMemLevel in ["L3", "L2"] if args.defaultMemLevel == "L3": + # for double buffering on spatz set this to DBTiler and pass --doublebuffer to deeployRunner_spatz deployer = TilerDeployerWrapper(deployer, DBOnlyL3Tiler, testName = testIdentifier, workDir = args.dumpdir) else: deployer = TilerDeployerWrapper(deployer, DBTiler, testName = testIdentifier, workDir = args.dumpdir) @@ -250,11 +251,20 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg test_inputs = [test_inputs[0]] test_outputs = [test_outputs[-2]] - # Instantiate Classes Requried for Memory Level Annotation Extension - L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) - L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) - L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) - memoryLevels = [L3, L2, L1] + # Instantiate Classes Required for Memory Level Annotation Extension + if args.platform == "Spatz": + # Spatz 
cluster has only TCDM (L1) + external DRAM (L3). No on-chip L2. + # Declare L1 and L3 as direct neighbours so BFS-based tile-path + # generation does not insert a phantom L2 staging buffer. + L3 = MemoryLevel(name = "L3", neighbourNames = ["L1"], size = 64000000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L3"], size = args.l1) + memoryLevels = [L3, L1] + else: + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = args.l2) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = args.l1) + memoryLevels = [L3, L2, L1] + if args.neureka_wmem: memoryLevels.append(MemoryLevel(name = "WeightMemory_SRAM", neighbourNames = [], size = 4 * 1024 * 1024)) diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py index 39a44d9442..d34ee25e74 100644 --- a/DeeployTest/testUtils/codeGenerate.py +++ b/DeeployTest/testUtils/codeGenerate.py @@ -145,6 +145,25 @@ def generateTestNetworkHeader(deployer: NetworkDeployer) -> str: return retStr +def _generateBindExternalInputs(deployer: NetworkDeployer) -> str: + """Generate a bind function for all global network input buffers.""" + inputs = deployer.inputs() + + retStr = "void DeeployNetwork_BindExternalInputs(void **external_inputs) {\n" + retStr += " // NOTE: This is a hack to avoid the memcpy in main.c from \n" + retStr += " // testInputVector to DeeployNetwork_inputs, since they are both in L3\n" + + for index, node in enumerate(inputs): + typeName = node._type.referencedType.typeName + retStr += f" DeeployNetwork_input_{index} = ({typeName} *)external_inputs[{index}];\n" + + for index in range(len(inputs)): + retStr += f" DeeployNetwork_inputs[{index}] = (void *)DeeployNetwork_input_{index};\n" + + retStr += "}\n" + return retStr + + def generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: CodeGenVerbosity) -> str: retStr = "" @@ -198,6 +217,9 @@ def 
generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: C } """ + # TODO: make this work only for spatz and with the correct number of inputs every time + retStr += _generateBindExternalInputs(deployer) + return retStr diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 1dcddeea62..cdbd0af3db 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -6,6 +6,7 @@ import shutil import subprocess import sys +import threading from pathlib import Path from Deeploy.Logging import DEFAULT_LOGGER as log @@ -191,15 +192,43 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: log.debug(f"[Execution] Simulation command: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output = True, text = True, env = env) - - if result.stdout: - print(result.stdout, end = '') - if result.stderr: - print(result.stderr, end = '', file = sys.stderr) + process = subprocess.Popen( + cmd, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE, + text = True, + env = env, + bufsize = 1, + ) + + stdout_chunks = [] + stderr_chunks = [] + + def _stream_reader(pipe, chunks, is_stderr: bool = False) -> None: + assert pipe is not None + for line in iter(pipe.readline, ''): + chunks.append(line) + if is_stderr: + print(line, end = '', file = sys.stderr, flush = True) + else: + print(line, end = '', flush = True) + pipe.close() + + stdout_thread = threading.Thread(target = _stream_reader, args = (process.stdout, stdout_chunks), daemon = True) + stderr_thread = threading.Thread(target = _stream_reader, args = (process.stderr, stderr_chunks, True), daemon = True) + + stdout_thread.start() + stderr_thread.start() + + returncode = process.wait() + stdout_thread.join() + stderr_thread.join() + + stdout = ''.join(stdout_chunks) + stderr = ''.join(stderr_chunks) # Parse output for error count and cycles - test_result = parse_test_output(result.stdout, 
result.stderr) + test_result = parse_test_output(stdout, stderr) if not test_result.success and test_result.error_count == -1: log.warning(f"Could not parse error count from output") diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..0c98e254aa 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -348,6 +348,7 @@ def main(default_platform: Optional[str] = None, "snitch": "Snitch", "chimera": "Chimera", "softhier": "SoftHier", + "spatz": "Spatz", } if args.platform: @@ -388,6 +389,7 @@ def main(default_platform: Optional[str] = None, "Snitch": "gvsoc", "Chimera": "gvsoc", "SoftHier": "gvsoc", + "Spatz": "vsim", } simulator = simulator_map.get(platform, "host") log.info(f"No simulator specified, using default for {platform}: {simulator}") diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 9d526906f9..69a83f1e8d 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -10,6 +10,8 @@ from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper +from Deeploy.Targets.Spatz.Deployer import SpatzDeployer +from Deeploy.Targets.Spatz.Platform import SpatzOptimizer, SpatzPlatform from Deeploy.Targets.Chimera.Deployer import ChimeraDeployer from Deeploy.Targets.Chimera.Platform import ChimeraOptimizer, ChimeraPlatform from Deeploy.Targets.CortexM.Deployer import CMSISDeployer @@ -31,7 +33,7 @@ from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", 
"Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9", "Spatz"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -76,6 +78,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Chimera": Platform = ChimeraPlatform() + elif platformName == "Spatz": + Platform = SpatzPlatform() + else: raise RuntimeError(f"Deployment platform {platformName} is not implemented") @@ -272,6 +277,18 @@ def mapDeployer(platform: DeploymentPlatform, name = name, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + + elif isinstance(platform, (SpatzPlatform)): + deployer = SpatzDeployer( + graph, + platform, + inputTypes, + SpatzOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir + ) else: raise RuntimeError(f"Deployer for platform {platform} is not implemented") diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..6be4bef197 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -110,6 +110,14 @@ def param_id(param): "model_tests": SNITCH_MODEL_TESTS, "default_num_cores": SNITCH_DEFAULT_NUM_CORES, }, + "spatz": { + "platform": "Spatz", + "simulator": "vsim", + # TODO: Define KERNEL_TESTS and MODEL_TESTS for Spatz + "kernel_tests": [], + "model_tests": [], + # "default_num_cores": , + }, "gap9": { "platform": "GAP9", "simulator": "gvsoc", diff --git a/Makefile b/Makefile index d40a49da11..49b04baeb2 100644 --- a/Makefile +++ b/Makefile @@ -27,10 +27,12 @@ PICOLIBC_RV32IMF_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imf CHIMERA_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/chimera-sdk PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk SNITCH_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/snitch_cluster +SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/spatz QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu 
BANSHEE_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/banshee MEMPOOL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/mempool GVSOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc +GVSOC_SPATZ_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/gvsoc_spatz SOFTHIER_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/softhier MINIMALLOC_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/minimalloc XTL_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/xtl @@ -44,8 +46,10 @@ PICOLIBC_COMMIT_HASH ?= 31ff1b3601b379e4cab63837f253f59729ce1fef PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 +SPATZ_COMMIT_HASH ?= 6bd9f3094e237dab392983edb827105bce8e3e86 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated -GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 +# GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 # old +GVSOC_COMMIT_HASH ?= 209c147cbd293d5c1590694e68c489122c777acc # new MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea XTL_VERSION ?= 0.7.5 @@ -69,7 +73,7 @@ else $(error unsupported platform $(OS)) endif -all: toolchain emulators docs echo-bash +all: toolchain emulators # docs echo-bash echo-bash: @@ -79,8 +83,10 @@ echo-bash: @echo "export PULP_SDK_HOME=${PULP_SDK_INSTALL_DIR}" @echo "export CHIMERA_SDK_HOME=${CHIMERA_SDK_INSTALL_DIR}" @echo "export SNITCH_HOME=${SNITCH_INSTALL_DIR}" + @echo "export SPATZ_HOME=${SPATZ_INSTALL_DIR}" @echo "export GVSOC_INSTALL_DIR=${GVSOC_INSTALL_DIR}" @echo "export SOFTHIER_INSTALL_DIR=${SOFTHIER_INSTALL_DIR}" + @echo "export BANSHEE_INSTALL_DIR=${BANSHEE_INSTALL_DIR}" @echo "export LLVM_INSTALL_DIR=${LLVM_INSTALL_DIR}" @echo "export MEMPOOL_HOME=${MEMPOOL_INSTALL_DIR}" @echo "export CMAKE=$$(which cmake)" @@ -91,9 +97,9 @@ echo-bash: @echo "source ${PULP_SDK_INSTALL_DIR}/configs/siracusa.sh" -toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm 
picolibc-arm picolibc-riscv +toolchain: llvm llvm-compiler-rt-riscv llvm-compiler-rt-arm picolibc-arm picolibc-riscv xtensor minimalloc # xtensor needed for gvsoc, minimalloc for tiling -emulators: snitch_runtime pulp-sdk qemu banshee mempool +emulators: snitch_runtime spatz_runtime pulp-sdk qemu banshee mempool gvsoc ${TOOLCHAIN_DIR}/llvm-project: cd ${TOOLCHAIN_DIR} && \ @@ -124,6 +130,7 @@ ${LLVM_INSTALL_DIR}: ${TOOLCHAIN_DIR}/llvm-project llvm: ${LLVM_INSTALL_DIR} +# runtimes for different architectures ${LLVM_CLANG_RT_RISCV_RV32IM}: ${TOOLCHAIN_DIR}/llvm-project cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build-compiler-rt-riscv-rv32im \ && cd build-compiler-rt-riscv-rv32im; \ @@ -429,16 +436,55 @@ ${SNITCH_INSTALL_DIR}: ${TOOLCHAIN_DIR}/snitch_cluster snitch_runtime: ${SNITCH_INSTALL_DIR} +${TOOLCHAIN_DIR}/spatz: + cd ${TOOLCHAIN_DIR} && \ + git clone https://github.com/pulp-platform/spatz.git && \ + cd ${TOOLCHAIN_DIR}/spatz && git checkout ${SPATZ_COMMIT_HASH} && \ + git submodule update --init --recursive + +${SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/spatz + mkdir -p ${SPATZ_INSTALL_DIR} + cp -r ${TOOLCHAIN_DIR}/spatz/ ${SPATZ_INSTALL_DIR}/../ + cd ${SPATZ_INSTALL_DIR} && \ + make all -j8 && \ + python3.6 -m venv .venv && \ + .venv/bin/pip install jsonref jsonschema jstyleson dataclasses hjson mako && \ + source .venv/bin/activate && \ + source util/iis-env.sh && \ + make init && \ + cd hw/system/spatz_cluster/ && \ + make sw + +spatz_runtime: ${SPATZ_INSTALL_DIR} + +# ${TOOLCHAIN_DIR}/gvsoc_spatz: +# cd ${TOOLCHAIN_DIR} && \ +# git clone https://github.com/gvsoc/gvsoc.git gvsoc_spatz && \ +# cd ${TOOLCHAIN_DIR}/gvsoc_spatz && git checkout ${GVSOC_SPATZ_COMMIT_HASH} && \ +# git submodule update --init --recursive && \ +# python3 -m venv venv && source venv/bin/activate &&\ +# pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil && \ +# cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch +# +# +# 
${GVSOC_SPATZ_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc_spatz +# cd ${TOOLCHAIN_DIR}/gvsoc_spatz && \ +# source venv/bin/activate &&\ +# CXX=g++-11.2.0 CC=gcc-11.2.0 CMAKE=cmake-3.18.1 make all TARGETS=spatz_v2 INSTALLDIR=${GVSOC_SPATZ_INSTALL_DIR} +# +# gvsoc_spatz: ${GVSOC_SPATZ_INSTALL_DIR} + ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ git clone https://github.com/gvsoc/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ - pip install -r core/requirements.txt && pip install -r gapy/requirements.txt + pip3 install -r core/requirements.txt && pip3 install -r gapy/requirements.txt && pip3 install psutil &&\ + cd core && git apply ${TOOLCHAIN_DIR}/gvsoc.patch ${GVSOC_INSTALL_DIR}: ${TOOLCHAIN_DIR}/gvsoc cd ${TOOLCHAIN_DIR}/gvsoc && \ - XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera" build INSTALLDIR=${GVSOC_INSTALL_DIR} + XTENSOR_INSTALL_DIR=${XTENSOR_INSTALL_DIR}/include XTL_INSTALL_DIR=${XTL_INSTALL_DIR}/include XSIMD_INSTALL_DIR=${XSIMD_INSTALL_DIR}/include make all TARGETS="pulp.snitch.snitch_cluster_single siracusa chimera spatz_v2" build INSTALLDIR=${GVSOC_INSTALL_DIR} gvsoc: ${GVSOC_INSTALL_DIR} @@ -504,7 +550,7 @@ ${QEMU_INSTALL_DIR}: ${TOOLCHAIN_DIR}/qemu cd ${TOOLCHAIN_DIR}/qemu/ && \ mkdir -p build && cd build && \ ../configure --target-list=arm-softmmu,arm-linux-user,riscv32-softmmu,riscv32-linux-user \ - --prefix=${QEMU_INSTALL_DIR} && \ + --prefix=${QEMU_INSTALL_DIR} --disable-werror && \ make -j && \ make install @@ -543,7 +589,7 @@ ${TOOLCHAIN_DIR}/minimalloc: cd ${TOOLCHAIN_DIR} && \ git clone --recursive https://github.com/google/minimalloc.git && \ cd ${TOOLCHAIN_DIR}/minimalloc && git checkout ${MINIMALLOC_COMMMIT_HASH} && \ - cmake -DCMAKE_BUILD_TYPE=Release && make -j && \ + cmake -DCMAKE_BUILD_TYPE=Release 
-DCMAKE_CXX_STANDARD=17 && make -j && \ mkdir -p ${MINIMALLOC_INSTALL_DIR} && cp minimalloc ${MINIMALLOC_INSTALL_DIR} ${CHIMERA_SDK_INSTALL_DIR}: diff --git a/TargetLibraries/Spatz/CMakeLists.txt b/TargetLibraries/Spatz/CMakeLists.txt new file mode 100644 index 0000000000..ef0fd63ab8 --- /dev/null +++ b/TargetLibraries/Spatz/CMakeLists.txt @@ -0,0 +1,18 @@ +file(GLOB_RECURSE SOURCES + "src/**" +) + +list(APPEND SOURCES + ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c +) + +include(cmake/spatz-runtime-precompiled.cmake) + +add_deeploy_library(deeployspatz STATIC ${SOURCES}) +target_include_directories(deeployspatz + PUBLIC + ${CMAKE_CURRENT_LIST_DIR}/inc +) +target_include_directories(deeployspatz PRIVATE ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel) +target_include_directories(deeployspatz SYSTEM PUBLIC ${SPATZ_RUNTIME_INCLUDE}) +target_link_libraries(deeployspatz INTERFACE spatz-runtime) diff --git a/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake new file mode 100644 index 0000000000..42e15e1b31 --- /dev/null +++ b/TargetLibraries/Spatz/cmake/spatz-runtime-precompiled.cmake @@ -0,0 +1,27 @@ + +set(SPATZ_RUNTIME_BASE_INCLUDE + ${SPATZ_HOME}/sw/snRuntime/include + ${SPATZ_HOME}/sw/snRuntime/vendor + ${SPATZ_HOME}/sw/toolchain/riscv-opcodes +) + +set(SPATZ_CLUSTER_LINK_INCLUDE + ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime +) + +set(SPATZ_LINKER_SCRIPT ${SPATZ_HOME}/hw/system/spatz_cluster/sw/build/snRuntime/common.ld) +# set(SPATZ_LINKER_SCRIPT ${SNITCH_RUNTIME_HOME}/base.ld) +if(NOT EXISTS ${SPATZ_LINKER_SCRIPT}) + message(FATAL_ERROR "Spatz linker script not found: ${SPATZ_LINKER_SCRIPT}") +endif() + +set(SPATZ_CLUSTER_LINK_OPTIONS + -Wl,--gc-sections + -T ${SPATZ_LINKER_SCRIPT} +) + +set(SPATZ_RUNTIME_INCLUDE ${SPATZ_RUNTIME_BASE_INCLUDE}) + +add_library(spatz-runtime INTERFACE) +target_link_directories(spatz-runtime INTERFACE 
${SPATZ_CLUSTER_LINK_INCLUDE})
+target_link_libraries(spatz-runtime INTERFACE ${SPATZ_CLUSTER_LINK_OPTIONS} libsnRuntime-cluster.a)
diff --git a/TargetLibraries/Spatz/inc/DeeploySpatzMath.h b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h
new file mode 100644
index 0000000000..0157d8d966
--- /dev/null
+++ b/TargetLibraries/Spatz/inc/DeeploySpatzMath.h
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_SPATZ_MATH_HEADER_
+#define __DEEPLOY_SPATZ_MATH_HEADER_
+
+#include
+#include
+
+#include "DeeployBasicMath.h"
+
+void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
+                                 const float32_t *__restrict__ pSrcB,
+                                 float32_t *__restrict__ pDstY, uint32_t M,
+                                 uint32_t N, uint32_t O);
+
+void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size,
+                             int32_t last_dim_length);
+
+
+void compute_topk_min_heap( uint32_t k, uint32_t n, float32_t *data_in, float32_t *heap_values, int32_t *heap_indices);
+
+
+#define BEGIN_SINGLE_CORE if (core_id == 0) {
+#define END_SINGLE_CORE }
+#define SINGLE_CORE if (core_id == 0)
+
+#endif // __DEEPLOY_SPATZ_MATH_HEADER_
diff --git a/TargetLibraries/Spatz/inc/Util.h b/TargetLibraries/Spatz/inc/Util.h
new file mode 100644
index 0000000000..893d687fa1
--- /dev/null
+++ b/TargetLibraries/Spatz/inc/Util.h
@@ -0,0 +1,9 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SPATZ_UTIL_H
+#define SPATZ_UTIL_H
+
+void spatz_util_dummy(void);
+
+#endif // SPATZ_UTIL_H
diff --git a/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c
new file mode 100644
index 0000000000..1caf2c3004
--- /dev/null
+++ b/TargetLibraries/Spatz/src/MatMul_fp32_spatz.c
@@ -0,0 +1,196 @@
+#include "DeeploySpatzMath.h"
+#include
+#include
+
+// functions defined in ${SPATZ_HOME}/sw/spatzBenchmarks/sp-fmatmul/kernel/sp-fmatmul.c
+// they calculate matrix matrix multiplication
+void matmul_2xVL(float *c, const float *a, const float *b,
+                 const unsigned int m_start, const unsigned int m_end,
+                 const unsigned int N, const unsigned int P,
+                 const unsigned int p_start, const unsigned int p_end);
+
+
+void matmul_4xVL(float *c, const float *a, const float *b,
+                 const unsigned int m_start, const unsigned int m_end,
+                 const unsigned int N, const unsigned int P,
+                 const unsigned int p_start, const unsigned int p_end);
+
+
+void matmul_8xVL(float *c, const float *a, const float *b,
+                 const unsigned int m_start, const unsigned int m_end,
+                 const unsigned int N, const unsigned int P,
+                 const unsigned int p_start, const unsigned int p_end);
+
+
+// Vector-matrix product in the Gustavson (row-wise) formulation: each row of
+// B is scaled by the matching element of a and accumulated into a strip of
+// the output vector.
+//
+//   a        input vector, N elements
+//   b        first column of B handled by this call; rows are total_P apart
+//   c        output slice, local_P elements
+//   N        number of rows of B
+//   local_P  number of columns computed by this call
+//   total_P  row pitch of B in elements (full matrix width)
+//
+// NOTE: despite the "_m4" suffix the vsetvli below selects LMUL=8.
+void gemv_v32b_m4(const float *a, const float *b, float *c, int N, int local_P, int total_P) {
+  unsigned int p = 0;
+  // Loop only up to the local number of columns assigned to this core
+  while (p < (unsigned int)local_P) {
+    size_t gvl;
+    asm volatile("vsetvli %0, %1, e32, m8, ta, ma"
+                 : "=r"(gvl) : "r"((unsigned int)local_P - p));
+
+    const float *b_ = b + p;
+    asm volatile("vle32.v v8, (%0)" ::"r"(b_) : "memory");
+    asm volatile("vfmul.vf v16, v8, %0" ::"f"(a[0]));
+
+    for (int row = 1; row < N; row++) {
+      // CRITICAL: Must skip the TOTAL width of the matrix to reach the next row
+      b_ += total_P;
+      asm volatile("vle32.v v8, (%0)" ::"r"(b_) : "memory");
+      asm volatile("vfmacc.vf v16, %0, v8" ::"f"(a[row]));
+    }
+
+    // "memory" clobber: the store below writes c[] behind the compiler's back
+    asm volatile("vse32.v v16, (%0)" ::"r"(c + p) : "memory");
+    p += gvl;
+  }
+}
+
+// Vector-matrix product computed one output element at a time (inner
+// product): for each of the local_P columns, the strided column of B is
+// multiplied element-wise with a, accumulated in a vector register and then
+// reduced to a single scalar.
+//
+//   a        input vector, N elements
+//   b        first column of B handled by this call; rows are total_P apart
+//   c        output slice, local_P elements
+//   N        number of rows of B (length of each inner product)
+//   local_P  number of columns computed by this call
+//   total_P  row pitch of B in elements (full matrix width)
+void gemv_col_reduction(const float *a, const float *b, float *c, int N, int local_P, int total_P) {
+  // CRITICAL: Stride must use the original TOTAL width of matrix B
+  ptrdiff_t b_stride = total_P * sizeof(float);
+
+  // Loop only through the columns assigned to this core
+  for (int col = 0; col < local_P; col++) {
+    unsigned int row = 0;
+
+    // Clear vector register v0 (takes v0-v7) to accumulate partial products
+    size_t init_gvl;
+    asm volatile("vsetvli %0, zero, e32, m8, ta, ma" : "=r"(init_gvl));
+    asm volatile("vmv.v.i v0, 0");
+
+    // Loop through the N elements of the current column
+    while (row < (unsigned int)N) {
+      size_t gvl;
+      // Tail-undisturbed: a short final strip must not clobber the partial
+      // sums that earlier, longer strips left in the upper lanes of v0.
+      asm volatile("vsetvli %0, %1, e32, m8, tu, ma"
+                   : "=r"(gvl) : "r"((unsigned int)N - row));
+
+      // Pointer uses total_P to correctly jump down the rows
+      const float *b_ptr = b + (row * total_P) + col;
+      const float *a_ptr = a + row;
+
+      // Load strided elements from B into v8 (takes v8-v15)
+      asm volatile("vlse32.v v8, (%0), %1" ::"r"(b_ptr), "r"(b_stride) : "memory");
+
+      // Load contiguous elements from A into v16 (takes v16-v23)
+      asm volatile("vle32.v v16, (%0)" ::"r"(a_ptr) : "memory");
+
+      // Multiply and accumulate: v0 = v0 + (v8 * v16)
+      asm volatile("vfmacc.vv v0, v8, v16");
+
+      row += gvl;
+    }
+
+    // --- Reduction Phase ---
+    // Reset vl to VLMAX first. "vsetvli zero, zero" would keep the vl of the
+    // last (possibly short) strip and the reduction would then skip the
+    // partial sums held in the remaining lanes of v0.
+    asm volatile("vsetvli %0, zero, e32, m8, ta, ma" : "=r"(init_gvl));
+    asm volatile("vmv.v.i v24, 0");
+
+    // Reduce the accumulated vector v0 into the first element of v24
+    asm volatile("vfredosum.vs v24, v0, v24");
+
+    // Store only the single scalar result (1 element) into c[col]
+    size_t one = 1;
+    asm volatile("vsetvli zero, %0, e32, m1, ta, ma" :: "r"(one));
+    asm volatile("vse32.v v24, (%0)" ::"r"(c + col) : "memory");
+  }
+}
+
+// Scalar reference implementation of c (M x P) = a (M x N) * b (N x P) with
+// all matrices stored row-major (see the index arithmetic below). Not called
+// from within this file.
+void matmul_vanila(float *a, float *b, float *c, int M, int N, int P) {
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < P; j++) {
+      float sum = 0.0f;
+      for (int k = 0; k < N; k++) {
+        sum += a[i * N + k] * b[k * P + j];
+      }
+
+      c[i * P + j] = sum;
+    }
+  }
+}
+
+
+
+// FP32 matrix multiplication c (M x P) = a (M x N) * b (N x P), split across
+// the two cores by output column: core 0 computes columns [0, P/2) and any
+// other core computes columns [P/2, P).
+//
+// The result is only complete once both cores have called this function.
+// M == 1 is dispatched to one of the GEMV kernels above; all other shapes go
+// to the sp-fmatmul kernels.
+//
+// NOTE(review): whether the sp-fmatmul kernels tolerate an M that is not a
+// multiple of their row blocking cannot be seen here - confirm in sp-fmatmul.c.
+void Spatz_MatMul_fp32_fp32_fp32(const float32_t *__restrict__ a,
+                                 const float32_t *__restrict__ b,
+                                 float32_t *__restrict__ c, uint32_t M,
+                                 uint32_t N, uint32_t P) {
+  // const unsigned int num_cores = snrt_cluster_core_num(); = 2 for spatz
+  const unsigned int cid = snrt_cluster_core_idx();
+
+  if (M == 1) {
+    // TODO make this be more specific, probably needs to be N>5*P or some other constant
+    int cols_core0 = P / 2;
+    int cols_core1 = P - cols_core0; // Safely gets the remainder if P is odd
+    // Slices handled by the second core; b_offset stays const so the
+    // qualifier of the input matrix is not discarded.
+    const float *b_offset = b + cols_core0;
+    float *c_offset = c + cols_core0;
+    if (N>4*P){
+      if (cid == 0) {
+        gemv_col_reduction(a, b, c, N, cols_core0, P);
+      } else {
+        gemv_col_reduction(a, b_offset, c_offset, N, cols_core1, P);
+      }
+    } else {
+      if (cid == 0) {
+        gemv_v32b_m4(a, b, c, N, cols_core0, P);
+      } else {
+        gemv_v32b_m4(a, b_offset, c_offset, N, cols_core1, P);
+      }
+    }
+  } else {
+    unsigned int p_start, p_end;
+    if (cid == 0){ p_start = 0; p_end = (P/2);
+    } else { p_start = (P/2); p_end = P; }
+
+    if (M <= 4) {
+      matmul_2xVL(c, a, b, 0, M, N, P, p_start, p_end);
+    } else if (M <= 8) {
+      matmul_4xVL(c, a, b, 0, M, N, P, p_start, p_end);
+    } else {
+      matmul_8xVL(c, a, b, 0, M, N, P, p_start, p_end);
+    }
+  }
+}
diff --git a/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c
new file mode 100644
index 0000000000..bf4d24b221
--- /dev/null
+++ b/TargetLibraries/Spatz/src/Softmax_fp32_spatz.c
@@ -0,0 +1,163 @@
+#include "DeeployBasicMath.h"
+#include
+
+float32_t myexpf(float32_t x){
+  const float32_t inv_ln2 = 1.4426950409f;
+  const float32_t ln2 = 0.6931471806f;
+
+  // Range reduction: x = k * ln(2) + r, with r kept small so the polynomial is accurate.
+  float32_t scaled = x * inv_ln2;
+  int32_t k = (int32_t)(scaled + (scaled >= 0.0f ?
0.5f : -0.5f)); + float32_t r = x - ((float32_t)k * ln2); + + float32_t r2 = r * r; + float32_t r3 = r2 * r; + float32_t r4 = r3 * r; + float32_t r5 = r4 * r; + float32_t r6 = r5 * r; + float32_t r7 = r6 * r; + + float32_t poly = 1.0f + r + (r2 * 0.5f) + (r3 * 0.1666666667f) + (r4 * 0.0416666667f) + (r5 * 0.0083333333f) + (r6 * 0.0013888889f) + (r7 * 0.0001984127f); + + return ldexpf(poly, k); +} + +// Type-punning union to safely manipulate IEEE 754 float bits without breaking strict aliasing rules +union float_bits { + float f; + uint32_t i; +}; + +float expf_nodiv_reduced(float x) { + // Mathematical constants + const float LN2_HI = 0.69314575195f; // High bits of ln(2) + const float LN2_LO = 1.4286068203e-6f; // Low bits of ln(2) for quasi-double precision reduction + const float INV_LN2 = 1.4426950408f; // log2(e) = 1/ln(2) + + // Bound limits to prevent float overflow/underflow + if (x > 88.722839f) x = 88.722839f; + if (x < -87.336544f) return 0.0f; + + // 1. Argument Reduction: Find integer k closest to x / ln(2) + // We cast to integer to perform a fast round-to-nearest operation + // int32_t k = (int32_t)(x * INV_LN2 + (x >= 0.0f ? 0.5f : -0.5f)); + float sign_offset = __builtin_copysignf(0.5f, x); + int32_t k = (int32_t)(x * INV_LN2 + sign_offset); + + // Compute residual r = x - k * ln(2) using Cody-Waite reduction to minimize loss of significance + float r = x - ((float)k * LN2_HI) - ((float)k * LN2_LO); + + // 2. Taylor Polynomial Approximation of e^r (Horner's Method) + // Range of r is strictly bounded within [-0.34657, 0.34657] + // Coefficients are 1, 1, 1/2, 1/6, 1/24, 1/120 + float poly = 1.0f + r * (1.0f + r * (0.5f + r * (0.166666671f + r * (0.041666664f + r * 0.008333333f)))); + + // 3. 
Reconstruction: Generate 2^k via IEEE 754 bit-manipulation + // The exponent field is bits [30:23] with a bias of 127 + int32_t biased_exp = k + 127; + + union float_bits two_to_k; + two_to_k.i = ((uint32_t)biased_exp << 23); // Shift biased integer into the float exponent slot + + // e^x = e^r * 2^k + return poly * two_to_k.f; +} + +// inverse funciton that doesnt use fdiv.s +float32_t myinv(float32_t x){ + uint32_t i = *(uint32_t*)&x; + i = 0x7EEEEEEE - i; + float y = *(float*)&i; + + // Newton-Raphson steps (Multiplication only!) + y = y * (2.0f - x * y); + y = y * (2.0f - x * y); + y = y * (2.0f - x * y); + + return y; +} + +void Spatz_Softmax_fp32_fp32(float32_t *input, float32_t *output, int32_t size, int32_t last_dim_length) { + const unsigned int cid = snrt_cluster_core_idx(); + // two cores divided on the vector lenght + if (size == last_dim_length){ + static float32_t maxval[1]; + if (cid==0){ + float32_t max_val = -inf; + + for (int i = 0; i < last_dim_length; i++) { + if (input[i] > max_val) { max_val = input[i]; } + } + maxval[0] = max_val; + } + + snrt_cluster_hw_barrier(); + + static float32_t partial_sum[2]; + float32_t exp_val = 0.0f; + + if (cid==0){ + float32_t sum_core0 = 0.0f; + for (int i = 0; i < last_dim_length/2; i++) { + exp_val = expf_nodiv_reduced(input[i] - maxval[0]); + output[i] = exp_val; + sum_core0 += exp_val; + } + partial_sum[0] = sum_core0; + } else { + float32_t sum_core1 = 0.0f; + for (int i = last_dim_length/2; i < last_dim_length; i++) { + exp_val = expf_nodiv_reduced(input[i] - maxval[0]); + output[i] = exp_val; + sum_core1 += exp_val; + } + partial_sum[1] = sum_core1; + } + + snrt_cluster_hw_barrier(); + float32_t one_over_sum= 0.0f; + + if (cid == 0){ one_over_sum = myinv(partial_sum[0] + partial_sum[1]); } + snrt_cluster_hw_barrier(); + if (cid == 0){ for (int i = 0; i < last_dim_length; i++) { output[i] *= one_over_sum; } } + snrt_cluster_hw_barrier(); + return; + + } else { + // divide worload betw cores in batches + 
int32_t batch_size = size / last_dim_length; + unsigned int items_per_core = (batch_size + 1) / 2; + + unsigned int b_start, b_end; + + if (cid == 0) { + b_start = 0; + b_end = items_per_core; + } else { + b_start = items_per_core; + // Core 1 always ends at the total batch size + b_end = batch_size; + } + for (int b = b_start; b < b_end; b++) { + float32_t max_val = -inf; + float sum = 0.0f; + + for (int i = 0; i < last_dim_length; i++) { + if (input[b * last_dim_length + i] > max_val) { + max_val = input[b * last_dim_length + i]; + } + } + + for (int i = 0; i < last_dim_length; i++) { + float32_t exp_val = input[b * last_dim_length + i] - max_val; + output[b * last_dim_length + i] = expf_nodiv_reduced(exp_val); + sum += output[b * last_dim_length + i]; + } + + float32_t sum_1 = myinv(sum); + for (int i = 0; i < last_dim_length; i++) { + output[b * last_dim_length + i] = output[b * last_dim_length + i] * sum_1; + } + } + } +} diff --git a/TargetLibraries/Spatz/src/TopK_fp32_int32_spatz.c b/TargetLibraries/Spatz/src/TopK_fp32_int32_spatz.c new file mode 100644 index 0000000000..3802f330b3 --- /dev/null +++ b/TargetLibraries/Spatz/src/TopK_fp32_int32_spatz.c @@ -0,0 +1,126 @@ +#include "DeeployBasicMath.h" +#include +#include + +/* note: + * heap is stored in a vector + * minimum element is in root of heap (index 0 in the vector) + * left and right of a index are always > than root + */ +static inline __attribute__((always_inline)) void reorder_heap(uint32_t idx, uint32_t size, float32_t *heap_values, int32_t *heap_indices){ + for (;;) { + uint32_t left = 2 * idx + 1; + if (left >= size) { + break; + } + uint32_t smallest = left; + uint32_t right = left + 1; + if (right < size && heap_values[right] < heap_values[left]) { + smallest = right; + } + if (heap_values[smallest] < heap_values[idx]) { + float32_t tmp_val = heap_values[idx]; + int32_t tmp_idx = heap_indices[idx]; + heap_values[idx] = heap_values[smallest]; + heap_indices[idx] = heap_indices[smallest]; + 
heap_values[smallest] = tmp_val; + heap_indices[smallest] = tmp_idx; + idx = smallest; + } else { + break; + } + } +} + +// heap_value and _indices are arrays i can modify and work with, used as scratchpad, but also as output +void compute_topk_min_heap( uint32_t k, uint32_t n, float32_t *data_in, float32_t *heap_values, int32_t *heap_indices) { + // Initialize heap with first k elements + for (uint32_t i = 0; i < k; ++i) { heap_values[i] = data_in[i]; heap_indices[i] = (int32_t)i; } + + // Build min-heap by reordeing each sub heap starting fomr the smallest ones (k/2-1) to the biggest ones (0) + for (int32_t root = (int32_t)k / 2 - 1; root >= 0; --root) { + reorder_heap(root, k, heap_values, heap_indices); + } + + // Process remaining elements, keeping top k values in the min-heap + for (uint32_t i = k; i < n; ++i) { + float32_t value = data_in[i]; + if (value > heap_values[0]) { + heap_values[0] = value; + heap_indices[0] = (int32_t)i; + + reorder_heap(0, k, heap_values, heap_indices); + } + } + + /* heap sort */ + for (uint32_t i = k-1; i > 0; i--) { + // swap min and max, root and most bottom (biggest) leaf + float32_t root_val = heap_values[0]; float32_t root_idx = heap_indices[0]; + + heap_values[0] = heap_values[i]; heap_indices[0] = heap_indices[i]; + + heap_values[i] = root_val; heap_indices[i] = root_idx; + // reduce size and heapify + reorder_heap(0, i, heap_values, heap_indices); + } + +} + +// finds the k biggest elements from a vector of n elements, and returns them in data_out +void compute_topk_vector_instructions(uint32_t k, uint32_t n, float32_t *data_in, float32_t *data_out, int32_t *indices_out) { + + for (uint32_t i = 0; i < k; i++) { + float32_t global_max = -FLT_MAX; + int32_t global_max_idx = -1; + + uint32_t avl = n; + uint32_t vl; + float32_t *ptr = data_in; + uint32_t current_idx_offset = 0; + + // --- Pass 1: Find the maximum value and its index in the current array --- + while (avl > 0) { + asm volatile("vsetvli %0, %1, e32, m4, ta, ma" 
: "=r"(vl) : "r"(avl)); + + // Setup scalar helper registers for reduction initialization + float32_t block_max_scalar = -FLT_MAX; + + // Inline assembly to load, reduce, and find the index manually or via step tracking + // v24 will hold the loaded data chunks + asm volatile ( + "vle32.v v24, (%1)\n\t" + "vfmv.s.f v0, %2\n\t" // Init scalar reduction register with -FLT_MAX + "vfredmax.vs v0, v24, v0\n\t" // Find max in this vector block + "vfmv.f.s %0, v0\n\t" // Move block max back to C variable + : "=f"(block_max_scalar) + : "r"(ptr), "f"(-FLT_MAX) + : "v0", "v24" + ); + + // Check if the maximum found in this block beats our global tracker + if (block_max_scalar > global_max) { + // If it does, we sweep the block to catch the exact scalar index position + for (uint32_t j = 0; j < vl; j++) { + if (ptr[j] > global_max) { + global_max = ptr[j]; + global_max_idx = current_idx_offset + j; + } + } + } + + ptr += vl; + current_idx_offset += vl; + avl -= vl; + } + + // Save the found top element metadata to output arrays + data_out[i] = global_max; + indices_out[i] = global_max_idx; + + // --- Pass 2: Mask out the found maximum to prevent re-discovery --- + if (global_max_idx != -1) { + data_in[global_max_idx] = -FLT_MAX; + } + } +} \ No newline at end of file diff --git a/TargetLibraries/Spatz/src/Util.c b/TargetLibraries/Spatz/src/Util.c new file mode 100644 index 0000000000..9c30c11f49 --- /dev/null +++ b/TargetLibraries/Spatz/src/Util.c @@ -0,0 +1,5 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// SPDX-License-Identifier: Apache-2.0 + +// Minimal stub for Spatz runtime linkage +void spatz_util_dummy(void) {} diff --git a/cmake/simulation.cmake b/cmake/simulation.cmake index 55525feedd..983dc0e4ee 100644 --- a/cmake/simulation.cmake +++ b/cmake/simulation.cmake @@ -102,3 +102,19 @@ macro(add_gvsoc_emulation name target) USES_TERMINAL ) endmacro() + +macro(add_spatz_gvsoc_emulation name target) + set(GVSOC_WORKDIR 
${CMAKE_BINARY_DIR}/gvsoc_workdir)
+  make_directory(${GVSOC_WORKDIR})
+  set(GVSOC_BINARY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}")
+  add_custom_target(gvsoc_${name}
+    DEPENDS ${name}
+    WORKING_DIRECTORY ${GVSOC_WORKDIR}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/*.bin ${GVSOC_WORKDIR}/ || true
+    COMMAND bash -c "${GVSOC_INSTALL_DIR}/bin/gvrun --target ${target} --param chip/soc/binary=${GVSOC_BINARY} run"
+    COMMENT "Simulating deeploytest ${name} with gvsoc for the target ${target}"
+    POST_BUILD
+    USES_TERMINAL
+    VERBATIM
+  )
+endmacro()
diff --git a/cmake/spatz/spatz.cmake b/cmake/spatz/spatz.cmake
new file mode 100644
index 0000000000..b715f625c9
--- /dev/null
+++ b/cmake/spatz/spatz.cmake
@@ -0,0 +1,33 @@
+add_compile_definitions(
+  DEEPLOY_SPATZ_PLATFORM
+)
+
+set(DEEPLOY_ARCH SPATZ)
+
+set(num_threads ${NUM_CORES})
+
+# Adds a `vsim_<name>` target that runs `${QUESTA} bin/spatz_cluster.vsim` on the
+# built binary <name> from ${SPATZ_HOME}/hw/system/spatz_cluster. The trailing
+# `|| true` keeps the build going even when the simulator exits non-zero.
+macro(add_spatz_vsim_simulation name)
+  add_custom_target(vsim_${name}
+    WORKING_DIRECTORY ${SPATZ_HOME}/hw/system/spatz_cluster
+    DEPENDS ${name}
+    COMMAND ${QUESTA} bin/spatz_cluster.vsim
+      ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name} || true
+    COMMENT "Simulating deeploytest with vsim (Spatz cluster)"
+    POST_BUILD
+    USES_TERMINAL
+    VERBATIM
+  )
+endmacro()
+
+add_compile_options(
+  -ffast-math
+)
+
+add_link_options(
+  -ffast-math
+  -Wl,--gc-sections
+)
+
diff --git a/cmake/spatz/toolchain_llvm.cmake b/cmake/spatz/toolchain_llvm.cmake
new file mode 100644
index 0000000000..89f10f1954
--- /dev/null
+++ b/cmake/spatz/toolchain_llvm.cmake
@@ -0,0 +1,91 @@
+
+set(CMAKE_SYSTEM_NAME Generic)
+
+# SPATZ_HOME is set by the top-level CMakeLists.txt, but CMake re-reads this
+# toolchain file inside try_compile() projects where that variable is not in
+# scope; fall back to the environment so the paths below never become relative
+# to "/".
+if(NOT DEFINED SPATZ_HOME)
+  set(SPATZ_HOME "$ENV{SPATZ_HOME}")
+endif()
+
+# GCC installation handed to clang via --gcc-toolchain. The default is the
+# site-specific path this file used to hard-code; override with
+# -DSPATZ_GCC_TOOLCHAIN=<dir> on other machines.
+set(SPATZ_GCC_TOOLCHAIN "/usr/pack/riscv-1.0-kgf/spatz-gcc-7.1.1" CACHE PATH
+    "GCC installation passed to clang via --gcc-toolchain")
+
+# Forward both settings to try_compile() projects.
+list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES SPATZ_HOME SPATZ_GCC_TOOLCHAIN)
+
+# Crucial: Point CMake to the specialized Clang toolchain instead of system cc
+set(SPATZ_TOOLCHAIN_DIR ${SPATZ_HOME}/sw/toolchain/llvm-project/build/bin)
+
+set(CMAKE_C_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang)
+set(CMAKE_CXX_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang++)
+set(CMAKE_ASM_COMPILER ${SPATZ_TOOLCHAIN_DIR}/clang)
+set(CMAKE_OBJCOPY ${SPATZ_TOOLCHAIN_DIR}/llvm-objcopy)
+set(CMAKE_OBJDUMP ${SPATZ_TOOLCHAIN_DIR}/llvm-objdump)
+set(CMAKE_LINKER ${SPATZ_TOOLCHAIN_DIR}/ld.lld)
+set(CMAKE_EXECUTABLE_SUFFIX ".elf")
+
+set(ISA rv32imafdvzfh_xdma)
+
+# Compile options based on user's manual compilation commands
+add_compile_options(
+  -target riscv32-unknown-elf
+  # -MP
+  -mcpu=snitch
+  -mcmodel=small
+
+  -ffast-math
+  -fno-builtin-printf
+  -fno-common
+  -falign-loops=16
+  -ffunction-sections
+  -Wextra
+
+  # LLVM specific flags from user command
+  -mllvm -misched-topdown
+  -menable-experimental-extensions
+  -mno-relax
+
+  -march=${ISA}
+  -mabi=ilp32d
+  # Newlib headers: prefer $GCC_INSTALL_DIR (set by util/iis-env.sh to the
+  # cluster's spatz-gcc) over a source-built GNU toolchain inside spatz.
+  # -isystem $ENV{GCC_INSTALL_DIR}/riscv32-unknown-elf/include
+  -isystem ${SPATZ_HOME}/sw/toolchain/riscv-gnu-toolchain/riscv-newlib/newlib/libc/include
+
+  # Optimization and debug
+  -O3
+  -g
+)
+
+# Link options matching user command
+add_link_options(
+  -target riscv32-unknown-elf
+  -mcpu=snitch
+  -march=${ISA}
+  -mabi=ilp32d
+  -mcmodel=small
+
+  -fuse-ld=lld
+  -nostartfiles
+
+  -ffast-math
+  -fno-common
+  -fno-builtin-printf
+
+  -static
+  -Wl,-z,norelro
+  -Wl,--gc-sections
+  -Wl,--no-relax
+
+  --gcc-toolchain=${SPATZ_GCC_TOOLCHAIN}
+)
+
+# libsnRuntime-cluster.a is handled by our target_link_libraries(deeployspatz INTERFACE spatz-runtime)
+link_libraries(
+  -lm -lgcc -lm -lgcc
+)
diff --git a/conda_enviroment_topk_attention.yml b/conda_enviroment_topk_attention.yml
new file mode 100644
index 0000000000..8121982e85
--- /dev/null
+++ b/conda_enviroment_topk_attention.yml
@@ -0,0 +1,81 @@
+name: ~/.conda/envs/deeploy_conda_venv
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2025.12.2=h06a4308_0
+  - ld_impl_linux-64=2.44=h9e0c5a2_3
+  - libexpat=2.7.5=h7354ed3_0
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc=15.2.0=h69a1729_7
+  - libgcc-ng=15.2.0=h166f726_7
+  - libgomp=15.2.0=h4751f2c_7
+  - 
libnsl=2.0.0=h5eee18b_0 + - libstdcxx=15.2.0=h39759b7_7 + - libstdcxx-ng=15.2.0=hc03a8fd_7 + - libuuid=1.41.5=h5eee18b_0 + - libxcb=1.17.0=h9b100fa_0 + - libzlib=1.3.1=hb25bd0a_0 + - ncurses=6.5=h7934f7d_0 + - openssl=3.5.5=h1b28b03_0 + - packaging=25.0=py311h06a4308_1 + - pip=26.0.1=pyhc872135_0 + - pthread-stubs=0.3=h0ce48e5_1 + - python=3.11.15=h741d88c_0 + - readline=8.3=hc2a1206_0 + - setuptools=80.10.2=py311h06a4308_0 + - sqlite=3.51.2=h3e8d24a_0 + - tk=8.6.15=h54e0aa7_0 + - tzdata=2026a=he532380_0 + - wheel=0.46.3=py311h06a4308_0 + - xorg-libx11=1.8.12=h9b100fa_1 + - xorg-libxau=1.0.12=h9b100fa_0 + - xorg-libxdmcp=1.1.5=h9b100fa_0 + - xorg-xorgproto=2024.1=h5eee18b_1 + - xz=5.8.2=h448239c_0 + - zlib=1.3.1=hb25bd0a_0 + - pip: + - absl-py==2.4.0 + - argparse==1.4.0 + - beautifulsoup4==4.14.3 + - certifi==2026.2.25 + - chardet==5.2.0 + - charset-normalizer==3.4.6 + - contourpy==1.3.3 + - cycler==0.12.1 + - deeploy-pulp==0.2.1 + - flatbuffers==25.12.19 + - fonttools==4.62.1 + - idna==3.11 + - imagesize==2.0.0 + - iniconfig==2.3.0 + - jinja2==3.1.6 + - kiwisolver==1.5.0 + - lz4==4.4.5 + - markdown-it-py==4.0.0 + - markupsafe==3.0.3 + - mdurl==0.1.2 + - mpmath==1.3.0 + - narwhals==2.18.1 + - pillow==12.1.1 + - plotly==6.6.0 + - pluggy==1.6.0 + - psutil==7.2.2 + - ptyprocess==0.7.0 + - pyparsing==3.3.2 + - pytest==9.0.2 + - python-dateutil==2.9.0.post0 + - pytz==2026.1.post1 + - six==1.17.0 + - snowballstemmer==3.0.1 + - soupsieve==2.8.3 + - sphinxcontrib-jsmath==1.0.1 + - sympy==1.14.0 + - tabulate==0.10.0 + - toml==0.10.2 + - typing-extensions==4.15.0 + - urllib3==2.6.3 + - wcwidth==0.6.0 +prefix: ~/.conda/envs/deeploy_conda_venv diff --git a/toolchain/gvsoc.patch b/toolchain/gvsoc.patch new file mode 100644 index 0000000000..22e65922a9 --- /dev/null +++ b/toolchain/gvsoc.patch @@ -0,0 +1,12 @@ +diff --git a/engine/src/launcher.cpp b/engine/src/launcher.cpp +index f0b1b654..48c83592 100644 +--- a/engine/src/launcher.cpp ++++ b/engine/src/launcher.cpp +@@ -21,6 
+21,7 @@ + + #include + #include ++#include + #include + + #include