diff --git a/.gitignore b/.gitignore index d2753903..ac626b94 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ build-hw.log profile-ips.log magia_venv/ modelsim.ini +regression_logs/ sw/tests/*/build/ sw/tests/*/logs/ @@ -34,4 +35,10 @@ spatz/sw/bin/*.bin spatz/sw/bin/*.dump spatz/sw/bin/*.o -spatz/sw/headers_bin/*.h \ No newline at end of file +spatz/sw/headers_bin/*.h + +sw/kernel_pulp/bin/ +sw/kernel_pulp/headers_bin/ + +*.wlf +*.dbg \ No newline at end of file diff --git a/Bender.local b/Bender.local index d9f37247..098f121f 100644 --- a/Bender.local +++ b/Bender.local @@ -1,6 +1,6 @@ overrides: fpnew : { git: "https://github.com/pulp-platform/cvfpu.git" , rev: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 } - cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: 37a82d337ba60129c333d104c29e816d0698b53b } + cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: f5241403d5d65dbe1fffacd7035dd7ae1359c8ef } # RI5CY branch: lb/magia_core cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: a90101211048ba1a16cedbe4db963ab6e12569d7 } axi : { git: "https://github.com/pulp-platform/axi.git" , version: 0.39.5 } obi : { git: "https://github.com/pulp-platform/obi.git" , rev: 528dc65303d5ffb02fbc254324c6b53eac0dd6e5 } diff --git a/Bender.yml b/Bender.yml index fe353048..5c611be1 100644 --- a/Bender.yml +++ b/Bender.yml @@ -26,7 +26,7 @@ package: dependencies: redmule : { git: "https://github.com/pulp-platform/redmule.git" , rev: 944d4a4d45fe05147cfbf7f872af677578f3b15c } # branch: fc/ooo-mux cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: a90101211048ba1a16cedbe4db963ab6e12569d7 } # branch: vi/redmule_scaleup - cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: f5241403d5d65dbe1fffacd7035dd7ae1359c8ef } # branch: lb/magia_core + cv32e40p : { git: "https://github.com/FondazioneChipsIT/cv32e40p.git" , rev: 4458c6d } # branch: ng/pulp_cluster spatz : { git: 
"https://github.com/pulp-platform/spatz.git" , rev: 9380883fd36a4794d7f31e2c22e3fed3202aeb81 } # branch: lb/magia-spatz_cc idma : { git: "https://github.com/pulp-platform/iDMA.git" , rev: ff5d56fffb3767814db88d6bf8f381974ea33aa5 } # version: 0.6.4 hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , version: 1.6 } @@ -94,6 +94,8 @@ sources: - hw/tile/fractal_sync_xif_inst_decoder.sv - hw/tile/obi_slave_fsync.sv - hw/tile/obi_slave_ctrl_spatz.sv + - hw/tile/obi_slave_ctrl_cluster.sv + - hw/tile/tile_csr.sv - hw/tile/spatz_bootrom.sv - hw/tile/spatz_cc_wrapper.sv - hw/tile/core_data_demux_eu_direct.sv @@ -159,6 +161,8 @@ sources: - hw/tile/fractal_sync_xif_inst_decoder.sv - hw/tile/obi_slave_fsync.sv - hw/tile/obi_slave_ctrl_spatz.sv + - hw/tile/obi_slave_ctrl_cluster.sv + - hw/tile/tile_csr.sv - hw/tile/spatz_bootrom.sv - hw/tile/spatz_cc_wrapper.sv - hw/tile/core_data_demux_eu_direct.sv @@ -223,6 +227,8 @@ sources: - hw/tile/fractal_sync_xif_inst_decoder.sv - hw/tile/obi_slave_fsync.sv - hw/tile/obi_slave_ctrl_spatz.sv + - hw/tile/obi_slave_ctrl_cluster.sv + - hw/tile/tile_csr.sv - hw/tile/spatz_bootrom.sv - hw/tile/spatz_cc_wrapper.sv - hw/tile/core_data_demux_eu_direct.sv diff --git a/Makefile b/Makefile index 00484467..cda35baf 100644 --- a/Makefile +++ b/Makefile @@ -27,26 +27,40 @@ MAGIA_DIR ?= $(shell pwd) SW ?= sw BUILD_DIR ?= sim/work + ifneq (,$(wildcard /etc/iis.version)) QUESTA ?= questa-2025.1 - BENDER ?= bender BASE_PYTHON ?= python else QUESTA ?= - BENDER ?= ./bender BASE_PYTHON ?= python3 endif + +BENDER ?= bender BENDER_DIR ?= . -ISA ?= riscv ARCH ?= rv XLEN ?= 32 + ifeq ($(core), CV32E40X) - XTEN = imafc + XTEN = imafc + ISA = riscv + ABI ?= ilp + XABI ?= f +else ifeq ($(core), RI5CY) + XTEN = imcxgap9 + ISA = riscv + ABI ?= ilp + XABI ?= else - XTEN = imcxgap9 + # CV32E40P configured with ZFINX=1 in RTL: FP ops use the GPRs (no F register + # file). 
Toolchain must therefore use Zfinx (and Zhinxmin for FP16) and the + # plain ilp32 ABI; using `f` in march or ilp32f ABI would emit instructions + # that target the (non-existent) F regs. + XTEN = imc_xcvalu_xcvbi_xcvbitmanip_xcvhwlp_xcvmac_xcvmem_xcvsimd_xcvelw_zfinx_zhinxmin + ISA = cv32e40p + ABI ?= ilp + XABI ?= endif -ABI ?= ilp -XABI ?= f #ifeq ($(REDMULE_COMPLEX),1) # TEST_SRCS := sw/redmule_complex.c @@ -54,9 +68,19 @@ XABI ?= f # TEST_SRCS := sw/redmule.c #endif -TEST_DIR := sw/tests -# Auto-detect test location in any subdirectory -TEST_SUBDIR = $(filter-out .,$(shell find $(TEST_DIR) -name "$(test).c" -printf "%P\n" 2>/dev/null | head -1 | xargs dirname 2>/dev/null)) +# Auto-detect test location under sw/tests/ recursively — no cluster= flag needed. +# A directory named $(test) is searched first (handles both sw/tests/$(test)/ and +# sw/tests/cluster_tests/$(test)/); single-file tests fall back to sw/tests/$(test).c. +# cluster= is still accepted for backward compatibility but its value is ignored. +cluster ?= 0 # accepted for backward compat — value is ignored +num_clusters ?= 16 # number of PULP cluster tiles (N_TILES for 4x4 mesh) + +_FOUND_TEST_DIR := $(shell find sw/tests -maxdepth 5 -type d -name "$(test)" 2>/dev/null | head -1) +TEST_DIR = $(if $(_FOUND_TEST_DIR),$(patsubst %/,%,$(dir $(_FOUND_TEST_DIR))),sw/tests) +TEST_SUBDIR = $(filter-out .,$(shell find $(TEST_DIR) -name "$(test).c" -printf "%P\n" 2>/dev/null | head -1 | xargs dirname 2>/dev/null)) +# CV32-side test source. For cluster tests with a pulp_task/ subdirectory the +# entrypoint is main.c (handled in the $(OBJ) rule); for plain single-binary +# tests this matches the legacy $(test).c naming convention. 
TEST_SRCS = $(TEST_DIR)/$(if $(TEST_SUBDIR),$(TEST_SUBDIR)/)$(test).c compile_script ?= scripts/compile.tcl @@ -102,27 +126,72 @@ ifeq ($(core), CV32E40X) FLAGS += -DCV32E40X endif +ifeq ($(core), CV32E40P) + FLAGS += -DCV32E40P +endif + +ifeq ($(core), RI5CY) + FLAGS += -DRI5CY +endif + # Include directories INC += -Isw INC += -Isw/inc INC += -Isw/utils INC += -Ispatz/sw/headers_bin +INC += -Isw/kernel_pulp/headers_bin + +# ---------------------------------------------------------------------------- +# Kernel runtime selection — magia-sdk single-binary flow. +# +# A single CV32 ELF is always produced via sw/kernel/{crt0.S,link.ld}. +# If the test directory contains a `pulp_task/` subdirectory with at least +# one .c source, the PULP cluster sources are compiled into a separate +# position-independent ELF (sw/kernel_pulp/{pulp_crt0.S,pulp_program.ld}), +# converted to a flat binary, and packed into +# sw/kernel_pulp/headers_bin/_pulp_task_bin.h +# which the CV32 main.c `#include`s. The linker (sw/kernel/link.ld) KEEPs +# that array inside the `.pulp_binary` section, right after the optional +# Spatz binary in instrram (0xCC000000...). This mirrors what is done for +# Spatz tasks via spatz/sw/Makefile and the `.spatz_binary` section. 
+# ---------------------------------------------------------------------------- +TEST_DIR_PATH := $(if $(_FOUND_TEST_DIR),$(_FOUND_TEST_DIR),sw/tests/$(test)) +PULP_TASK_DIR_PATH := $(TEST_DIR_PATH)/pulp_task +PULP_TASKS := $(if $(wildcard $(PULP_TASK_DIR_PATH)/*.c),$(notdir $(basename $(wildcard $(PULP_TASK_DIR_PATH)/*.c))),) BOOTSCRIPT := sw/kernel/crt0.S LINKSCRIPT := sw/kernel/link.ld -CC=$(ISA)$(XLEN)-unknown-elf-gcc +PULP_SW_DIR := sw/kernel_pulp + +ifneq ($(core), CV32E40P) + CC=$(ISA)$(XLEN)-unknown-elf-gcc + OBJDUMP=$(ISA)$(XLEN)-unknown-elf-objdump + NM=$(ISA)$(XLEN)-unknown-elf-nm +else + CC=riscv64-unknown-elf-gcc + OBJDUMP=riscv64-unknown-elf-objdump + NM=riscv64-unknown-elf-nm +endif LD=$(CC) -OBJDUMP=$(ISA)$(XLEN)-unknown-elf-objdump -CC_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -O2 -g -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -fdata-sections -ffunction-sections -MMD -MP -LD_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -MMD -MP -nostartfiles -nostdlib -Wl,--gc-sections +ifneq ($(core), CV32E40P) + CC_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -O2 -g -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -fdata-sections -ffunction-sections -MMD -MP + LD_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -MMD -MP -nostartfiles -nostdlib -Wl,--gc-sections +else + CC_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -U__riscv__ -g -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -fdata-sections -ffunction-sections -MMD -MP + LD_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -U__riscv__ -MMD -MP -nostartfiles -nostdlib -Wl,--gc-sections +endif # Spatz embedded binary support (via header) SPATZ_SW_DIR := spatz/sw -# Auto-detect which Spatz tasks are used by looking for *_TASK symbols 
in CV32 code -# Example: HELLO_WORLD_TASK → hello_world_task -SPATZ_TASKS := $(shell grep -oP '\b(?!SPATZ_)[A-Z][A-Z0-9_]*_TASK\b' $(TEST_SRCS) 2>/dev/null | tr '[:upper:]' '[:lower:]' | awk '!seen[$$0]++') +# Auto-detect which Spatz tasks are used by looking for *_TASK symbols in CV32 code. +# Example: HELLO_WORLD_TASK -> hello_world_task. PULP task macros use the same +# naming scheme, so remove tasks backed by pulp_task/*.c from the Spatz list. +# When PULP tasks are embedded, the actual CV32 source is main.c. +_SPATZ_SRC_TO_GREP := $(if $(PULP_TASKS),$(TEST_DIR_PATH)/main.c,$(TEST_SRCS)) +_AUTO_TASKS := $(shell grep -oP '\b(?!SPATZ_)[A-Z][A-Z0-9_]*_TASK\b' $(_SPATZ_SRC_TO_GREP) 2>/dev/null | tr '[:upper:]' '[:lower:]' | awk '!seen[$$0]++') +SPATZ_TASKS := $(filter-out $(PULP_TASKS),$(_AUTO_TASKS)) # Setup build object dirs TEST_BUILD_DIR = $(TEST_DIR)/$(if $(TEST_SUBDIR),$(TEST_SUBDIR)/)$(test) @@ -131,6 +200,9 @@ OBJ=$(TEST_BUILD_DIR)/build/verif.o BIN=$(TEST_BUILD_DIR)/build/verif DUMP=$(TEST_BUILD_DIR)/build/verif.dump ODUMP=$(TEST_BUILD_DIR)/build/verif.objdump +# PULP cluster disassembly with global (runtime) addresses (only when PULP_TASKS set) +PULP_ELF=$(PULP_SW_DIR)/bin/$(test)_pulp_task_bin.elf +PULP_DUMP_GLOBAL=$(TEST_BUILD_DIR)/build/$(test)_pulp_task_global.dump ITB=$(TEST_BUILD_DIR)/build/verif.itb STIM_INSTR=$(TEST_BUILD_DIR)/build/stim_instr.txt STIM_DATA=$(TEST_BUILD_DIR)/build/stim_data.txt @@ -157,26 +229,49 @@ spatz-header: echo "[SPATZ] No Spatz tasks detected - skipping Spatz compilation"; \ fi +# Build PULP cluster binary (magia-sdk style): produces +# sw/kernel_pulp/headers_bin/_pulp_task_bin.h +# embedding the position-independent flat binary in section .pulp_binary. 
+.PHONY: pulp-header +pulp-header: + @if [ -n "$(PULP_TASKS)" ]; then \ + echo "[PULP] Auto-detected tasks: $(PULP_TASKS)"; \ + $(MAKE) -C $(PULP_SW_DIR) TEST_NAME=$(test) task="$(PULP_TASKS)" PULP_TASK_DIR=$(ROOT_DIR)/$(PULP_TASK_DIR_PATH) core=$(core) all; \ + else \ + echo "[PULP] No pulp_task/ directory — skipping PULP cluster compilation"; \ + fi + $(BIN): $(CRT) $(OBJ) @if [ -n "$(SPATZ_TASKS)" ]; then \ echo "[CV32-LINK] Linking with embedded Spatz binary (tasks: $(SPATZ_TASKS))"; \ else \ echo "[CV32-LINK] Linking without Spatz binary"; \ fi + @if [ -n "$(PULP_TASKS)" ]; then \ + echo "[CV32-LINK] Linking with embedded PULP binary (tasks: $(PULP_TASKS))"; \ + fi $(LD) $(LD_OPTS) -o $(BIN) $(CRT) $(OBJ) -T$(LINKSCRIPT) $(CRT): mkdir -p $(TEST_BUILD_DIR)/build $(CC) $(CC_OPTS) -c $(BOOTSCRIPT) -o $(CRT) -# Compile CV32 test (depends on spatz-header only if tasks detected) +# Compile CV32 test (depends on spatz/pulp headers only when tasks are used) ifneq ($(SPATZ_TASKS),) $(OBJ): spatz-header endif +ifneq ($(PULP_TASKS),) +$(OBJ): pulp-header +endif $(OBJ): mkdir -p $(TEST_BUILD_DIR)/build +ifneq ($(PULP_TASKS),) + @echo "[CV32] Compiling main core source: $(TEST_DIR_PATH)/main.c" + $(CC) $(CC_OPTS) -c $(TEST_DIR_PATH)/main.c $(FLAGS) $(INC) -o $(OBJ) +else $(CC) $(CC_OPTS) -c $(TEST_SRCS) $(FLAGS) $(INC) -o $(OBJ) +endif SHELL := /bin/bash @@ -196,14 +291,22 @@ python_deps: $(BASE_PYTHON) -m pip install --upgrade pip setuptools && \ $(BASE_PYTHON) -m pip install -r requirements.txt -# Generate instructions and data stimuli +# Generate instructions and data stimuli (single-binary flow). 
+ifneq ($(PULP_TASKS),) +all: $(STIM_INSTR) $(STIM_DATA) dis objdump itb pulp-dis +else all: $(STIM_INSTR) $(STIM_DATA) dis objdump itb +endif # Run the simulation run: $(CRT) +ifneq ($(PULP_TASKS),) + @rm -rf $(TEST_BUILD_DIR)/traces $(TEST_BUILD_DIR)/trace_core_*.log + @bash $(ROOT_DIR)/scripts/setup_traces.sh $(TEST_BUILD_DIR) $(num_clusters) +endif ifeq ($(gui), 0) cd $(TEST_BUILD_DIR); \ - $(QUESTA) vsim -c vopt_tb $(questa_run_fast_flag) -l transcript -do "run -a" \ + $(QUESTA) vsim -c vopt_tb $(questa_run_fast_flag) -l transcript \ +INST_HEX=$(inst_hex_name) \ +DATA_HEX=$(data_hex_name) \ +INST_ENTRY=$(inst_entry) \ @@ -212,7 +315,8 @@ ifeq ($(gui), 0) $(foreach i, $(shell seq 0 $(shell echo $$(($(num_cores)-1)))), \ +log_file_$(i)=$(log_path_$(i)) \ ) \ - +itb_file=$(itb_file) + +itb_file=$(itb_file) \ + -do "run -a" else cd $(TEST_BUILD_DIR); \ $(QUESTA) vsim vopt_tb $(questa_run_flag) -l transcript \ @@ -228,6 +332,15 @@ else ) \ +itb_file=$(itb_file) endif +ifneq ($(PULP_TASKS),) + @bash $(ROOT_DIR)/scripts/sort_traces.sh $(TEST_BUILD_DIR) $(num_clusters) +else + @for f in $(TEST_BUILD_DIR)/trace_core_*.log; do \ + [ -f "$$f" ] || continue; \ + hartid=$$(printf '%d' "0x$$(basename $$f .log | sed 's/trace_core_//')"); \ + [ $$hartid -ge $$(( 2 * $(num_clusters) )) ] && rm -f "$$f" || true; \ + done +endif # Download bender bender: @@ -248,13 +361,18 @@ ifeq ($(core), CV32E40X) bender_defs += -D CV32E40X else ifeq ($(core), CV32E40P) bender_defs += -D CV32E40P +else ifeq ($(core), RI5CY) + bender_defs += -D RI5CY else - $(error Detected unsupported core, must choose among CV32E40X and CV32E40P) + $(error Detected unsupported core, must choose among CV32E40X, CV32E40P and RI5CY) endif bender_targs += -t rtl bender_targs += -t test +ifeq ($(core), CV32E40P) bender_targs += -t cv32e40p_include_tracer +endif +# RI5CY: riscv_*.sv compiled unconditionally by the PULP cv32e40p package, no extra bender target needed # Targets needed to avoid error even though the 
module is not used @@ -310,7 +428,18 @@ bender_defs += -D SPATZ_XDMA=$(SPATZ_XDMA) bender_defs += -D SPATZ_RVF=$(SPATZ_RVF) bender_defs += -D SPATZ_RVV=$(SPATZ_RVV) +# RI5CY_CV32E40P_GIT / RI5CY_CV32E40P_REV: PULP repo override for core=RI5CY +RI5CY_CV32E40P_GIT := https://github.com/pulp-platform/cv32e40p.git +RI5CY_CV32E40P_REV := f5241403d5d65dbe1fffacd7035dd7ae1359c8ef +CV32E40P_GIT := https://github.com/FondazioneChipsIT/cv32e40p.git +CV32E40P_REV := 7e48663 + update-ips: +ifeq ($(core), RI5CY) + @sed -i 's|^ cv32e40p .*| cv32e40p : { git: "$(RI5CY_CV32E40P_GIT)" , rev: $(RI5CY_CV32E40P_REV) } # RI5CY branch: lb/magia_core|' Bender.local +else + @sed -i 's|^ cv32e40p .*| cv32e40p : { git: "$(CV32E40P_GIT)" , rev: $(CV32E40P_REV) } # branch: ng/pulp_cluster|' Bender.local +endif $(BENDER) update $(BENDER) script vsim \ --vlog-arg="$(compile_flag)" \ @@ -360,11 +489,15 @@ clean-sdk: rm -rf $(SW)/pulp-sdk clean: - rm -rf $(TEST_BUILD_DIR) + rm -rf $(TEST_BUILD_DIR)/build @if [ -d "$(SPATZ_SW_DIR)" ]; then \ echo "[CLEAN] Cleaning Spatz..."; \ $(MAKE) -C $(SPATZ_SW_DIR) clean; \ fi + @if [ -d "$(PULP_SW_DIR)" ]; then \ + echo "[CLEAN] Cleaning PULP cluster..."; \ + $(MAKE) -C $(PULP_SW_DIR) clean; \ + fi dis: $(OBJDUMP) -d -S $(BIN) > $(DUMP) @@ -375,6 +508,32 @@ objdump: itb: $(BASE_PYTHON) scripts/objdump2itb.py $(ODUMP) > $(ITB) +# PULP cluster disassembly with actual runtime (global) addresses. +# Extracts _pulp_binary_start from the CV32 ELF via nm, then uses +# --adjust-vma to shift the PIC PULP ELF addresses to match the traces. 
+.PHONY: pulp-dis +pulp-dis: $(BIN) + @LOAD_ADDR=$$($(NM) $(BIN) | grep ' _pulp_binary_start$$' | awk '{print "0x"$$1}'); \ + if [ -z "$$LOAD_ADDR" ]; then \ + echo "[PULP-DIS] WARNING: _pulp_binary_start not found in $(BIN) - skipping"; \ + else \ + echo "[PULP-DIS] _pulp_binary_start = $$LOAD_ADDR"; \ + $(OBJDUMP) -d -S --adjust-vma=$$LOAD_ADDR $(PULP_ELF) > $(PULP_DUMP_GLOBAL); \ + echo "[PULP-DIS] Written: $(PULP_DUMP_GLOBAL)"; \ + fi + +# Trace directory helpers. +# setup-traces: pre-creates the traces/tile_N/{main,cluster}/ tree so it is +# visible as soon as the simulation starts (called by 'run'). +# sort-traces : after sim, moves trace_core_*.log into the matching +# tile subdir (called by 'run'; also usable manually). +.PHONY: setup-traces sort-traces +setup-traces: + @bash $(ROOT_DIR)/scripts/setup_traces.sh $(TEST_BUILD_DIR) $(num_clusters) + +sort-traces: + @bash $(ROOT_DIR)/scripts/sort_traces.sh $(TEST_BUILD_DIR) $(num_clusters) + OP ?= gemm fp_fmt ?= FP16 M ?= 12 @@ -422,7 +581,7 @@ hw-all: hw-clean hw-lib hw-compile hw-opt # Nonfree components MAGIA_NONFREE_REMOTE ?= git@iis-git.ee.ethz.ch:pulp-restricted/magia-nonfree MAGIA_NONFREE_DIR ?= nonfree -MAGIA_NONFREE_COMMIT ?= v0.2 +MAGIA_NONFREE_COMMIT ?= d8322f07c92a39b0fed0134bb066b973d0d11c61 .PHONY: magia-nonfree-init MAGIA_NONFREE_DEPS ?= 1 diff --git a/README.md b/README.md index 641960ab..9d763c87 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,10 @@ The following *optional* parameters can be specified: `mesh_dv`: **0**|**1** (**Default**: 1). 0 simulation of a single tile; 1 simulation of the entire mesh. -`fast_sim`: **0**|**1** (**Default**: 0). 0 faster simulation that does not track signals; 1 simulation that tracks signals (for debugging). +`fast_sim`: **0**|**1** (**Default**: 0). 0 simulation that tracks signals (for debugging); 1 faster simulation that does not track signals. `gui`: **0**|**1** (**Default**: 0). 0 simulation without GUI; 1 simulation with GUI. 
`test`: **tile_test**|**mesh_test** (**Default**: mesh_test). Specifies which tests should be run. More fine-grain tests are available, see `sw/tests`. **Instructions to build HW/SW and run simulations**: diff --git a/bender_common.mk b/bender_common.mk index cff75149..3c0c4f0d 100644 --- a/bender_common.mk +++ b/bender_common.mk @@ -26,6 +26,7 @@ ifeq ($(core), CV32E40X) sim_targs += -t cv32e40x else ifeq ($(core), CV32E40P) sim_targs += -t cv32e40p +# RI5CY: no bender target needed — riscv_*.sv compiled unconditionally by the PULP cv32e40p package endif common_targs += -t magia_tile diff --git a/bender_synth.mk b/bender_synth.mk index 7025a8e7..e514289b 100644 --- a/bender_synth.mk +++ b/bender_synth.mk @@ -32,4 +32,4 @@ synth_defs += -D SPATZ_NR_PARALLEL_INSTR=$(SPATZ_NR_PARALLEL_INSTR) synth_defs += -D SPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) synth_defs += -D SPATZ_XDMA=$(SPATZ_XDMA) synth_defs += -D SPATZ_RVF=$(SPATZ_RVF) -synth_defs += -D SPATZ_RVV=$(SPATZ_RVV) \ No newline at end of file +synth_defs += -D SPATZ_RVV=$(SPATZ_RVV) diff --git a/hw/mesh/magia.sv b/hw/mesh/magia.sv index cf9df1e0..77e1996f 100644 --- a/hw/mesh/magia.sv +++ b/hw/mesh/magia.sv @@ -228,12 +228,17 @@ module magia .irq_i ( irq_i[i*N_TILES_X+j] ), - .debug_req_i , - .debug_havereset_o ( debug_havereset_o[i*N_TILES_X+j] ), - .debug_running_o ( debug_running_o[i*N_TILES_X+j] ), - .debug_halted_o ( debug_halted_o[i*N_TILES_X+j] ), - .debug_pc_valid_o ( debug_pc_valid_o[i*N_TILES_X+j] ), - .debug_pc_o ( debug_pc_o[i*N_TILES_X+j] ), + // Tile expects [N_CLUSTER_CORES:0] (1 main + 8 cluster cores). + // The top-level debug_req_i is not forwarded here: all bits are tied to '0, since an + // implicit name-based connection would leave bits [N:1] unconnected (X) + // and X-propagate into cv32e40p_controller.debug_req_pending, corrupting + // ctrl_fsm_cs (observed: ctrl_fsm_cs=0x1xx, debug_req_pending=x). 
+ .debug_req_i ( '0 ), + .debug_havereset_o ( debug_havereset_o[i*N_TILES_X+j] ), + .debug_running_o ( debug_running_o[i*N_TILES_X+j] ), + .debug_halted_o ( debug_halted_o[i*N_TILES_X+j] ), + .debug_pc_valid_o ( debug_pc_valid_o[i*N_TILES_X+j] ), + .debug_pc_o ( debug_pc_o[i*N_TILES_X+j] ), .fetch_enable_i , .core_sleep_o ( core_sleep_o[i*N_TILES_X+j] ), diff --git a/hw/mesh/magia_pkg.sv b/hw/mesh/magia_pkg.sv index 527dc31b..44e582e2 100644 --- a/hw/mesh/magia_pkg.sv +++ b/hw/mesh/magia_pkg.sv @@ -46,11 +46,11 @@ package magia_pkg; localparam int unsigned USR_W = 1; // Default User Width // Parameters used by the NoC - parameter int unsigned AXI_NOC_ID_W = 4; // AXI NoC ID Width: matches slave side id_width (4 bits) + parameter int unsigned AXI_NOC_ID_W = 6; // AXI NoC ID Width: matches slave side id_width (6 bits) parameter int unsigned AXI_NOC_U_W = USR_W; // Parameters used by the L2 - parameter int unsigned L2_ID_W = 2; // The ID Width reflects the slave ID Width of the Tile AXI XBAR (for 4 ports: log2(4)=2) + parameter int unsigned L2_ID_W = 3; // The ID Width reflects the slave ID Width of the Tile AXI XBAR (for 5 ports: ceil(log2(5))=3) parameter int unsigned L2_U_W = 1; // Parameter used for the Fractal Sync network diff --git a/hw/mesh/noc/floo_axi_nw_mesh_16x16_noc.sv b/hw/mesh/noc/floo_axi_nw_mesh_16x16_noc.sv index c28a32dd..c8eab555 100644 --- a/hw/mesh/noc/floo_axi_nw_mesh_16x16_noc.sv +++ b/hw/mesh/noc/floo_axi_nw_mesh_16x16_noc.sv @@ -603,7 +603,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_narrow_data_mst_addr_t; typedef logic[31:0] axi_narrow_data_mst_data_t; typedef logic[3:0] axi_narrow_data_mst_strb_t; -typedef logic[1:0] axi_narrow_data_mst_id_t; +typedef logic[2:0] axi_narrow_data_mst_id_t; typedef logic[0:0] axi_narrow_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_mst, axi_narrow_data_mst_req_t, axi_narrow_data_mst_rsp_t, axi_narrow_data_mst_addr_t, axi_narrow_data_mst_id_t, axi_narrow_data_mst_data_t, 
axi_narrow_data_mst_strb_t, axi_narrow_data_mst_user_t) @@ -611,7 +611,7 @@ typedef logic[0:0] axi_narrow_data_mst_user_t; typedef logic[31:0] axi_narrow_data_slv_addr_t; typedef logic[31:0] axi_narrow_data_slv_data_t; typedef logic[3:0] axi_narrow_data_slv_strb_t; -typedef logic[3:0] axi_narrow_data_slv_id_t; +typedef logic[5:0] axi_narrow_data_slv_id_t; typedef logic[0:0] axi_narrow_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_slv, axi_narrow_data_slv_req_t, axi_narrow_data_slv_rsp_t, axi_narrow_data_slv_addr_t, axi_narrow_data_slv_id_t, axi_narrow_data_slv_data_t, axi_narrow_data_slv_strb_t, axi_narrow_data_slv_user_t) @@ -619,7 +619,7 @@ typedef logic[0:0] axi_narrow_data_slv_user_t; typedef logic[31:0] axi_wide_data_mst_addr_t; typedef logic[255:0] axi_wide_data_mst_data_t; typedef logic[31:0] axi_wide_data_mst_strb_t; -typedef logic[1:0] axi_wide_data_mst_id_t; +typedef logic[2:0] axi_wide_data_mst_id_t; typedef logic[0:0] axi_wide_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_mst, axi_wide_data_mst_req_t, axi_wide_data_mst_rsp_t, axi_wide_data_mst_addr_t, axi_wide_data_mst_id_t, axi_wide_data_mst_data_t, axi_wide_data_mst_strb_t, axi_wide_data_mst_user_t) @@ -627,7 +627,7 @@ typedef logic[0:0] axi_wide_data_mst_user_t; typedef logic[31:0] axi_wide_data_slv_addr_t; typedef logic[255:0] axi_wide_data_slv_data_t; typedef logic[31:0] axi_wide_data_slv_strb_t; -typedef logic[1:0] axi_wide_data_slv_id_t; +typedef logic[2:0] axi_wide_data_slv_id_t; typedef logic[0:0] axi_wide_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_slv, axi_wide_data_slv_req_t, axi_wide_data_slv_rsp_t, axi_wide_data_slv_addr_t, axi_wide_data_slv_id_t, axi_wide_data_slv_data_t, axi_wide_data_slv_strb_t, axi_wide_data_slv_user_t) @@ -637,13 +637,13 @@ typedef logic[0:0] axi_wide_data_slv_user_t; localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; localparam axi_cfg_t AxiCfgW = '{ 
AddrWidth: 32, DataWidth: 256, UserWidth: 1, - InIdWidth: 2, - OutIdWidth: 2}; + InIdWidth: 3, + OutIdWidth: 3}; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_data_slv, axi_wide_data_slv, AxiCfgN, AxiCfgW, hdr_t) `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) diff --git a/hw/mesh/noc/floo_axi_nw_mesh_2x2_noc.sv b/hw/mesh/noc/floo_axi_nw_mesh_2x2_noc.sv index 6ccb4a7f..eee5c695 100644 --- a/hw/mesh/noc/floo_axi_nw_mesh_2x2_noc.sv +++ b/hw/mesh/noc/floo_axi_nw_mesh_2x2_noc.sv @@ -71,7 +71,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_narrow_data_mst_addr_t; typedef logic[31:0] axi_narrow_data_mst_data_t; typedef logic[3:0] axi_narrow_data_mst_strb_t; -typedef logic[1:0] axi_narrow_data_mst_id_t; +typedef logic[2:0] axi_narrow_data_mst_id_t; typedef logic[0:0] axi_narrow_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_mst, axi_narrow_data_mst_req_t, axi_narrow_data_mst_rsp_t, axi_narrow_data_mst_addr_t, axi_narrow_data_mst_id_t, axi_narrow_data_mst_data_t, axi_narrow_data_mst_strb_t, axi_narrow_data_mst_user_t) @@ -79,7 +79,7 @@ typedef logic[0:0] axi_narrow_data_mst_user_t; typedef logic[31:0] axi_narrow_data_slv_addr_t; typedef logic[31:0] axi_narrow_data_slv_data_t; typedef logic[3:0] axi_narrow_data_slv_strb_t; -typedef logic[3:0] axi_narrow_data_slv_id_t; +typedef logic[5:0] axi_narrow_data_slv_id_t; typedef logic[0:0] axi_narrow_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_slv, axi_narrow_data_slv_req_t, axi_narrow_data_slv_rsp_t, axi_narrow_data_slv_addr_t, axi_narrow_data_slv_id_t, axi_narrow_data_slv_data_t, axi_narrow_data_slv_strb_t, axi_narrow_data_slv_user_t) @@ -87,7 +87,7 @@ typedef logic[0:0] axi_narrow_data_slv_user_t; typedef logic[31:0] axi_wide_data_mst_addr_t; typedef logic[255:0] axi_wide_data_mst_data_t; typedef logic[31:0] axi_wide_data_mst_strb_t; -typedef logic[1:0] axi_wide_data_mst_id_t; +typedef logic[2:0] axi_wide_data_mst_id_t; typedef logic[0:0] axi_wide_data_mst_user_t; 
`AXI_TYPEDEF_ALL_CT(axi_wide_data_mst, axi_wide_data_mst_req_t, axi_wide_data_mst_rsp_t, axi_wide_data_mst_addr_t, axi_wide_data_mst_id_t, axi_wide_data_mst_data_t, axi_wide_data_mst_strb_t, axi_wide_data_mst_user_t) @@ -95,7 +95,7 @@ typedef logic[0:0] axi_wide_data_mst_user_t; typedef logic[31:0] axi_wide_data_slv_addr_t; typedef logic[255:0] axi_wide_data_slv_data_t; typedef logic[31:0] axi_wide_data_slv_strb_t; -typedef logic[1:0] axi_wide_data_slv_id_t; +typedef logic[2:0] axi_wide_data_slv_id_t; typedef logic[0:0] axi_wide_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_slv, axi_wide_data_slv_req_t, axi_wide_data_slv_rsp_t, axi_wide_data_slv_addr_t, axi_wide_data_slv_id_t, axi_wide_data_slv_data_t, axi_wide_data_slv_strb_t, axi_wide_data_slv_user_t) @@ -105,13 +105,13 @@ typedef logic[0:0] axi_wide_data_slv_user_t; localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; localparam axi_cfg_t AxiCfgW = '{ AddrWidth: 32, DataWidth: 256, UserWidth: 1, - InIdWidth: 2, - OutIdWidth: 2}; + InIdWidth: 3, + OutIdWidth: 3}; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_data_slv, axi_wide_data_slv, AxiCfgN, AxiCfgW, hdr_t) `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) diff --git a/hw/mesh/noc/floo_axi_nw_mesh_32x32_noc.sv b/hw/mesh/noc/floo_axi_nw_mesh_32x32_noc.sv index 05a6266c..985b5dea 100644 --- a/hw/mesh/noc/floo_axi_nw_mesh_32x32_noc.sv +++ b/hw/mesh/noc/floo_axi_nw_mesh_32x32_noc.sv @@ -2171,7 +2171,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_narrow_data_mst_addr_t; typedef logic[31:0] axi_narrow_data_mst_data_t; typedef logic[3:0] axi_narrow_data_mst_strb_t; -typedef logic[1:0] axi_narrow_data_mst_id_t; +typedef logic[2:0] axi_narrow_data_mst_id_t; typedef logic[0:0] axi_narrow_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_mst, axi_narrow_data_mst_req_t, axi_narrow_data_mst_rsp_t, 
axi_narrow_data_mst_addr_t, axi_narrow_data_mst_id_t, axi_narrow_data_mst_data_t, axi_narrow_data_mst_strb_t, axi_narrow_data_mst_user_t) @@ -2179,7 +2179,7 @@ typedef logic[0:0] axi_narrow_data_mst_user_t; typedef logic[31:0] axi_narrow_data_slv_addr_t; typedef logic[31:0] axi_narrow_data_slv_data_t; typedef logic[3:0] axi_narrow_data_slv_strb_t; -typedef logic[3:0] axi_narrow_data_slv_id_t; +typedef logic[5:0] axi_narrow_data_slv_id_t; typedef logic[0:0] axi_narrow_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_slv, axi_narrow_data_slv_req_t, axi_narrow_data_slv_rsp_t, axi_narrow_data_slv_addr_t, axi_narrow_data_slv_id_t, axi_narrow_data_slv_data_t, axi_narrow_data_slv_strb_t, axi_narrow_data_slv_user_t) @@ -2187,7 +2187,7 @@ typedef logic[0:0] axi_narrow_data_slv_user_t; typedef logic[31:0] axi_wide_data_mst_addr_t; typedef logic[255:0] axi_wide_data_mst_data_t; typedef logic[31:0] axi_wide_data_mst_strb_t; -typedef logic[1:0] axi_wide_data_mst_id_t; +typedef logic[2:0] axi_wide_data_mst_id_t; typedef logic[0:0] axi_wide_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_mst, axi_wide_data_mst_req_t, axi_wide_data_mst_rsp_t, axi_wide_data_mst_addr_t, axi_wide_data_mst_id_t, axi_wide_data_mst_data_t, axi_wide_data_mst_strb_t, axi_wide_data_mst_user_t) @@ -2195,7 +2195,7 @@ typedef logic[0:0] axi_wide_data_mst_user_t; typedef logic[31:0] axi_wide_data_slv_addr_t; typedef logic[255:0] axi_wide_data_slv_data_t; typedef logic[31:0] axi_wide_data_slv_strb_t; -typedef logic[1:0] axi_wide_data_slv_id_t; +typedef logic[2:0] axi_wide_data_slv_id_t; typedef logic[0:0] axi_wide_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_slv, axi_wide_data_slv_req_t, axi_wide_data_slv_rsp_t, axi_wide_data_slv_addr_t, axi_wide_data_slv_id_t, axi_wide_data_slv_data_t, axi_wide_data_slv_strb_t, axi_wide_data_slv_user_t) @@ -2205,13 +2205,13 @@ typedef logic[0:0] axi_wide_data_slv_user_t; localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, 
- OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; localparam axi_cfg_t AxiCfgW = '{ AddrWidth: 32, DataWidth: 256, UserWidth: 1, - InIdWidth: 2, - OutIdWidth: 2}; + InIdWidth: 3, + OutIdWidth: 3}; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_data_slv, axi_wide_data_slv, AxiCfgN, AxiCfgW, hdr_t) `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) diff --git a/hw/mesh/noc/floo_axi_nw_mesh_4x4_noc.sv b/hw/mesh/noc/floo_axi_nw_mesh_4x4_noc.sv index 56e69291..e17a6cf7 100644 --- a/hw/mesh/noc/floo_axi_nw_mesh_4x4_noc.sv +++ b/hw/mesh/noc/floo_axi_nw_mesh_4x4_noc.sv @@ -99,7 +99,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_narrow_data_mst_addr_t; typedef logic[31:0] axi_narrow_data_mst_data_t; typedef logic[3:0] axi_narrow_data_mst_strb_t; -typedef logic[1:0] axi_narrow_data_mst_id_t; +typedef logic[2:0] axi_narrow_data_mst_id_t; typedef logic[0:0] axi_narrow_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_mst, axi_narrow_data_mst_req_t, axi_narrow_data_mst_rsp_t, axi_narrow_data_mst_addr_t, axi_narrow_data_mst_id_t, axi_narrow_data_mst_data_t, axi_narrow_data_mst_strb_t, axi_narrow_data_mst_user_t) @@ -107,7 +107,7 @@ typedef logic[0:0] axi_narrow_data_mst_user_t; typedef logic[31:0] axi_narrow_data_slv_addr_t; typedef logic[31:0] axi_narrow_data_slv_data_t; typedef logic[3:0] axi_narrow_data_slv_strb_t; -typedef logic[3:0] axi_narrow_data_slv_id_t; +typedef logic[5:0] axi_narrow_data_slv_id_t; typedef logic[0:0] axi_narrow_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_slv, axi_narrow_data_slv_req_t, axi_narrow_data_slv_rsp_t, axi_narrow_data_slv_addr_t, axi_narrow_data_slv_id_t, axi_narrow_data_slv_data_t, axi_narrow_data_slv_strb_t, axi_narrow_data_slv_user_t) @@ -115,7 +115,7 @@ typedef logic[0:0] axi_narrow_data_slv_user_t; typedef logic[31:0] axi_wide_data_mst_addr_t; typedef logic[255:0] axi_wide_data_mst_data_t; typedef logic[31:0] axi_wide_data_mst_strb_t; -typedef logic[1:0] 
axi_wide_data_mst_id_t; +typedef logic[2:0] axi_wide_data_mst_id_t; typedef logic[0:0] axi_wide_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_mst, axi_wide_data_mst_req_t, axi_wide_data_mst_rsp_t, axi_wide_data_mst_addr_t, axi_wide_data_mst_id_t, axi_wide_data_mst_data_t, axi_wide_data_mst_strb_t, axi_wide_data_mst_user_t) @@ -123,7 +123,7 @@ typedef logic[0:0] axi_wide_data_mst_user_t; typedef logic[31:0] axi_wide_data_slv_addr_t; typedef logic[255:0] axi_wide_data_slv_data_t; typedef logic[31:0] axi_wide_data_slv_strb_t; -typedef logic[1:0] axi_wide_data_slv_id_t; +typedef logic[2:0] axi_wide_data_slv_id_t; typedef logic[0:0] axi_wide_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_slv, axi_wide_data_slv_req_t, axi_wide_data_slv_rsp_t, axi_wide_data_slv_addr_t, axi_wide_data_slv_id_t, axi_wide_data_slv_data_t, axi_wide_data_slv_strb_t, axi_wide_data_slv_user_t) @@ -133,13 +133,13 @@ typedef logic[0:0] axi_wide_data_slv_user_t; localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; localparam axi_cfg_t AxiCfgW = '{ AddrWidth: 32, DataWidth: 256, UserWidth: 1, - InIdWidth: 2, - OutIdWidth: 2}; + InIdWidth: 3, + OutIdWidth: 3}; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_data_slv, axi_wide_data_slv, AxiCfgN, AxiCfgW, hdr_t) `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) diff --git a/hw/mesh/noc/floo_axi_nw_mesh_8x8_noc.sv b/hw/mesh/noc/floo_axi_nw_mesh_8x8_noc.sv index 67e35dc4..71d28794 100644 --- a/hw/mesh/noc/floo_axi_nw_mesh_8x8_noc.sv +++ b/hw/mesh/noc/floo_axi_nw_mesh_8x8_noc.sv @@ -203,7 +203,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_narrow_data_mst_addr_t; typedef logic[31:0] axi_narrow_data_mst_data_t; typedef logic[3:0] axi_narrow_data_mst_strb_t; -typedef logic[1:0] axi_narrow_data_mst_id_t; +typedef logic[2:0] axi_narrow_data_mst_id_t; typedef logic[0:0] axi_narrow_data_mst_user_t; 
`AXI_TYPEDEF_ALL_CT(axi_narrow_data_mst, axi_narrow_data_mst_req_t, axi_narrow_data_mst_rsp_t, axi_narrow_data_mst_addr_t, axi_narrow_data_mst_id_t, axi_narrow_data_mst_data_t, axi_narrow_data_mst_strb_t, axi_narrow_data_mst_user_t) @@ -211,7 +211,7 @@ typedef logic[0:0] axi_narrow_data_mst_user_t; typedef logic[31:0] axi_narrow_data_slv_addr_t; typedef logic[31:0] axi_narrow_data_slv_data_t; typedef logic[3:0] axi_narrow_data_slv_strb_t; -typedef logic[3:0] axi_narrow_data_slv_id_t; +typedef logic[5:0] axi_narrow_data_slv_id_t; typedef logic[0:0] axi_narrow_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_slv, axi_narrow_data_slv_req_t, axi_narrow_data_slv_rsp_t, axi_narrow_data_slv_addr_t, axi_narrow_data_slv_id_t, axi_narrow_data_slv_data_t, axi_narrow_data_slv_strb_t, axi_narrow_data_slv_user_t) @@ -219,7 +219,7 @@ typedef logic[0:0] axi_narrow_data_slv_user_t; typedef logic[31:0] axi_wide_data_mst_addr_t; typedef logic[255:0] axi_wide_data_mst_data_t; typedef logic[31:0] axi_wide_data_mst_strb_t; -typedef logic[1:0] axi_wide_data_mst_id_t; +typedef logic[2:0] axi_wide_data_mst_id_t; typedef logic[0:0] axi_wide_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_mst, axi_wide_data_mst_req_t, axi_wide_data_mst_rsp_t, axi_wide_data_mst_addr_t, axi_wide_data_mst_id_t, axi_wide_data_mst_data_t, axi_wide_data_mst_strb_t, axi_wide_data_mst_user_t) @@ -227,7 +227,7 @@ typedef logic[0:0] axi_wide_data_mst_user_t; typedef logic[31:0] axi_wide_data_slv_addr_t; typedef logic[255:0] axi_wide_data_slv_data_t; typedef logic[31:0] axi_wide_data_slv_strb_t; -typedef logic[1:0] axi_wide_data_slv_id_t; +typedef logic[2:0] axi_wide_data_slv_id_t; typedef logic[0:0] axi_wide_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_slv, axi_wide_data_slv_req_t, axi_wide_data_slv_rsp_t, axi_wide_data_slv_addr_t, axi_wide_data_slv_id_t, axi_wide_data_slv_data_t, axi_wide_data_slv_strb_t, axi_wide_data_slv_user_t) @@ -237,13 +237,13 @@ typedef logic[0:0] axi_wide_data_slv_user_t; 
localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; localparam axi_cfg_t AxiCfgW = '{ AddrWidth: 32, DataWidth: 256, UserWidth: 1, - InIdWidth: 2, - OutIdWidth: 2}; + InIdWidth: 3, + OutIdWidth: 3}; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_data_slv, axi_wide_data_slv, AxiCfgN, AxiCfgW, hdr_t) `FLOO_TYPEDEF_NW_LINK_ALL(req, rsp, wide, req, rsp, wide) diff --git a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_16x16_config.yml b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_16x16_config.yml index f3939c6a..c7c3d239 100644 --- a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_16x16_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_16x16_config.yml @@ -12,14 +12,14 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "narrow_data_slv" type: "narrow" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 - name: "wide_data_mst" type: "wide" diff --git a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_2x2_config.yml b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_2x2_config.yml index d0af44c6..374a86d0 100644 --- a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_2x2_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_2x2_config.yml @@ -12,14 +12,14 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "narrow_data_slv" type: "narrow" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 - name: "wide_data_mst" type: "wide" diff --git a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_32x32_config.yml b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_32x32_config.yml index 5f7f4169..036fdfc9 100644 --- a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_32x32_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_32x32_config.yml @@ -12,14 +12,14 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 
user_width: 1 - name: "narrow_data_slv" type: "narrow" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 - name: "wide_data_mst" type: "wide" diff --git a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_4x4_config.yml b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_4x4_config.yml index b1f4debc..b4ed9cf9 100644 --- a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_4x4_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_4x4_config.yml @@ -12,14 +12,14 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "narrow_data_slv" type: "narrow" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 - name: "wide_data_mst" type: "wide" diff --git a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_8x8_config.yml b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_8x8_config.yml index c62a00cd..1ecce761 100644 --- a/hw/mesh/noc_configs/floonoc_axi_nw_mesh_8x8_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_nw_mesh_8x8_config.yml @@ -12,14 +12,14 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "narrow_data_slv" type: "narrow" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 - name: "wide_data_mst" type: "wide" diff --git a/hw/tile/cluster_event_map.sv b/hw/tile/cluster_event_map.sv index c3792c25..f32f32eb 100644 --- a/hw/tile/cluster_event_map.sv +++ b/hw/tile/cluster_event_map.sv @@ -44,7 +44,7 @@ module cluster_event_map #( for (genvar i = 0; i < NB_CORES; i++) begin : gen_event_mapping assign events_mapped_o[i] = { cluster_events_i[i][31:16], // [31:16] Custom cluster events (upper 16 bits) - 4'b0, // [15:12] Reserved + cluster_events_i[i][15:12], // [15:12] PULP cluster events acc_events_i[i], // [11:8] Accelerator events 2'b0, // [7:6] Reserved timer_events_i[i], // [5:4] Timer events diff --git a/hw/tile/core_data_demux_eu_direct.sv b/hw/tile/core_data_demux_eu_direct.sv index 
36f4fec2..fab0af66 100644 --- a/hw/tile/core_data_demux_eu_direct.sv +++ b/hw/tile/core_data_demux_eu_direct.sv @@ -52,35 +52,169 @@ module core_data_demux_eu_direct input magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp_i ); - enum logic {XBAR, EU} request_destination, request_destination_next; + // --------------------------------------------------------------------------- + // Address decode (combinatorial) + // --------------------------------------------------------------------------- - // Address range detection for EU direct access (pure combinatorial) logic use_eu_direct; + + assign use_eu_direct = core_data_req_i.req && + (core_data_req_i.addr >= EVENT_UNIT_ADDR_START) && + (core_data_req_i.addr < EVENT_UNIT_ADDR_END); + + // --------------------------------------------------------------------------- + // In-order response tracking — 2-entry FIFO of issued destinations + // + // Same approach as data_periph_demux in the PULP cluster. The FIFO tracks + // which path (EU or XBAR) each outstanding request was sent to and therefore + // which path must supply the *next* response to forward to the core. + // + // arriving_order[0] = most-recently pushed (newest) + // arriving_order[1] = oldest (= head, must respond first) + // + // CV32E40P LSU has DEPTH=2 outstanding, so a 2-entry FIFO is sufficient. + // A single registered bit (as used previously) is NOT enough: if two + // requests go to different paths the second grant overwrites the first + // destination, causing the response of the first to be routed to the wrong + // instruction or silently dropped. 
+ // --------------------------------------------------------------------------- + + typedef enum logic { XBAR_D = 1'b0, EU_D = 1'b1 } dest_e; + + dest_e destination; // combinatorial: destination of the current request + dest_e arriving_order [0:1]; // FIFO: [0]=newest, [1]=oldest + logic [1:0] num_outstanding; + dest_e head; // combinatorial: oldest pending destination + + assign destination = use_eu_direct ? EU_D : XBAR_D; + + always_comb begin : _HEAD_MUX_ + case (num_outstanding) + 2'd2: head = arriving_order[1]; // oldest + 2'd1: head = arriving_order[0]; + default: head = XBAR_D; // idle, value unused + endcase + end + + // --------------------------------------------------------------------------- + // Out-of-order response capture + // + // If a path's rvalid arrives while it is NOT at the head of the FIFO the + // response would be lost. A 1-entry capture register per path holds it + // until that path reaches the head. With DEPTH=2 at most one response can + // be queued behind the other at any time. + // --------------------------------------------------------------------------- + + logic eu_cap_rvalid_q; + logic [31:0] eu_cap_rdata_q; + logic eu_cap_err_q; + + logic xbar_cap_rvalid_q; + logic [31:0] xbar_cap_rdata_q; + logic xbar_cap_err_q; + + // "Effective" rvalid/rdata: direct arrival OR previously captured + logic eu_rvalid_eff, xbar_rvalid_eff; + logic [31:0] eu_rdata_eff, xbar_rdata_eff; + logic eu_err_eff, xbar_err_eff; + + // Direct arrival takes priority so the capture register is consumed first + // only when nothing arrives directly that cycle. + assign eu_rvalid_eff = eu_direct_rsp_i.rvalid | eu_cap_rvalid_q; + assign eu_rdata_eff = eu_direct_rsp_i.rvalid ? eu_direct_rsp_i.rdata : eu_cap_rdata_q; + assign eu_err_eff = eu_direct_rsp_i.rvalid ? eu_direct_rsp_i.err : eu_cap_err_q; + + assign xbar_rvalid_eff = xbar_data_rsp_i.rvalid | xbar_cap_rvalid_q; + assign xbar_rdata_eff = xbar_data_rsp_i.rvalid ? 
xbar_data_rsp_i.rdata : xbar_cap_rdata_q; + assign xbar_err_eff = xbar_data_rsp_i.rvalid ? xbar_data_rsp_i.err : xbar_cap_err_q; + + // Response forwarded to the core this cycle + logic resp_valid_to_core; + always_comb begin : _RESP_VALID_MUX_ + if (num_outstanding == '0) + resp_valid_to_core = 1'b0; + else if (head == EU_D) + resp_valid_to_core = eu_rvalid_eff; + else + resp_valid_to_core = xbar_rvalid_eff; + end + + // Capture/clear FFs + always_ff @(posedge clk_i, negedge rst_ni) begin : _CAPTURE_FFS_ + if (!rst_ni) begin + eu_cap_rvalid_q <= 1'b0; + eu_cap_rdata_q <= '0; + eu_cap_err_q <= 1'b0; + xbar_cap_rvalid_q <= 1'b0; + xbar_cap_rdata_q <= '0; + xbar_cap_err_q <= 1'b0; + end else begin + // EU: capture when rvalid arrives and EU is not at head; + // clear when EU response is forwarded to core. + if (eu_direct_rsp_i.rvalid && (num_outstanding > 0) && (head != EU_D)) begin + eu_cap_rvalid_q <= 1'b1; + eu_cap_rdata_q <= eu_direct_rsp_i.rdata; + eu_cap_err_q <= eu_direct_rsp_i.err; + end else if ((head == EU_D) && resp_valid_to_core) begin + eu_cap_rvalid_q <= 1'b0; + end + + // XBAR: same logic + if (xbar_data_rsp_i.rvalid && (num_outstanding > 0) && (head != XBAR_D)) begin + xbar_cap_rvalid_q <= 1'b1; + xbar_cap_rdata_q <= xbar_data_rsp_i.rdata; + xbar_cap_err_q <= xbar_data_rsp_i.err; + end else if ((head == XBAR_D) && resp_valid_to_core) begin + xbar_cap_rvalid_q <= 1'b0; + end + end + end + + // --------------------------------------------------------------------------- + // FIFO push/pop + // --------------------------------------------------------------------------- + logic request_granted; - - assign use_eu_direct = core_data_req_i.req && - (core_data_req_i.addr >= EVENT_UNIT_ADDR_START) && - (core_data_req_i.addr < EVENT_UNIT_ADDR_END); + logic fifo_push, fifo_pop; - // Grant occurs when request is accepted by the selected path assign request_granted = core_data_req_i.req && core_data_rsp_o.gnt; + assign fifo_push = request_granted; + assign fifo_pop 
= resp_valid_to_core && (num_outstanding > 0); - // Determine next destination when a request is granted - assign request_destination_next = use_eu_direct ? EU : XBAR; - - // Update response destination based on GRANTED request - always_ff @(posedge clk_i, negedge rst_ni) begin : _UPDATE_RESPONSE_DESTINATION_ + always_ff @(posedge clk_i, negedge rst_ni) begin : _FIFO_ if (!rst_ni) begin - request_destination <= XBAR; + arriving_order[0] <= XBAR_D; + arriving_order[1] <= XBAR_D; + num_outstanding <= 2'b0; end else begin - if (request_granted) begin - request_destination <= request_destination_next; + if (fifo_push && fifo_pop) begin + // Back-to-back: pop the old head, push the new destination + arriving_order[0] <= destination; + if (num_outstanding == 2) + arriving_order[1] <= arriving_order[0]; // shift oldest away + // num_outstanding unchanged + end else if (fifo_push) begin + arriving_order[1] <= arriving_order[0]; + arriving_order[0] <= destination; + num_outstanding <= num_outstanding + 1; + end else if (fifo_pop) begin + arriving_order[1] <= arriving_order[0]; + num_outstanding <= num_outstanding - 1; end end end + // --------------------------------------------------------------------------- + // Request forwarding + // Gate with num_outstanding < 2 to prevent FIFO overflow (CV32E40P DEPTH=2). + // Back-to-back allowed when a response is simultaneously being consumed. 
+ // --------------------------------------------------------------------------- + + logic can_issue; + assign can_issue = (num_outstanding < 2) || fifo_pop; + // To regular crossbar - assign xbar_data_req_o.req = core_data_req_i.req && !use_eu_direct; + assign xbar_data_req_o.req = core_data_req_i.req && !use_eu_direct && can_issue; assign xbar_data_req_o.addr = core_data_req_i.addr; assign xbar_data_req_o.be = core_data_req_i.be; assign xbar_data_req_o.wdata = core_data_req_i.wdata; @@ -92,38 +226,36 @@ module core_data_demux_eu_direct assign xbar_data_req_o.dbg = core_data_req_i.dbg; `endif - // To EU direct link (abstract interface) - // Pass relative offset to Event Unit (subtract base address) - // Event Unit expects offset within its address space [9:0], not absolute address - assign eu_direct_req_o.req = core_data_req_i.req && use_eu_direct; + // To EU direct link + assign eu_direct_req_o.req = core_data_req_i.req && use_eu_direct && can_issue; assign eu_direct_req_o.addr = core_data_req_i.addr - EVENT_UNIT_ADDR_START; - assign eu_direct_req_o.wen = ~core_data_req_i.we; // EU expects wen (write enable negated) + assign eu_direct_req_o.wen = ~core_data_req_i.we; assign eu_direct_req_o.wdata = core_data_req_i.wdata; assign eu_direct_req_o.be = core_data_req_i.be; - // Response routing - uses stored destination + // --------------------------------------------------------------------------- + // Response mux to core — select from head path (direct or captured) + // --------------------------------------------------------------------------- + always_comb begin : _HANDLE_RESP_ - case (request_destination) - XBAR: begin - core_data_rsp_o.rvalid = xbar_data_rsp_i.rvalid; - core_data_rsp_o.rdata = xbar_data_rsp_i.rdata; - core_data_rsp_o.err = xbar_data_rsp_i.err; + core_data_rsp_o.rvalid = resp_valid_to_core; + if (head == EU_D) begin + core_data_rsp_o.rdata = eu_rdata_eff; + core_data_rsp_o.err = eu_err_eff; `ifdef CV32E40X - core_data_rsp_o.exokay = 
xbar_data_rsp_i.exokay; + core_data_rsp_o.exokay = '0; `endif - end - EU: begin - core_data_rsp_o.rvalid = eu_direct_rsp_i.rvalid; - core_data_rsp_o.rdata = eu_direct_rsp_i.rdata; - core_data_rsp_o.err = eu_direct_rsp_i.err; + end else begin + core_data_rsp_o.rdata = xbar_rdata_eff; + core_data_rsp_o.err = xbar_err_eff; `ifdef CV32E40X - core_data_rsp_o.exokay = '0; + core_data_rsp_o.exokay = xbar_data_rsp_i.exokay; `endif - end - endcase + end end - // GNT is combinatorial - assign core_data_rsp_o.gnt = use_eu_direct ? eu_direct_rsp_i.gnt : xbar_data_rsp_i.gnt; + // GNT: combinatorial from selected path, gated by can_issue + assign core_data_rsp_o.gnt = can_issue && (use_eu_direct ? eu_direct_rsp_i.gnt + : xbar_data_rsp_i.gnt); endmodule \ No newline at end of file diff --git a/hw/tile/eu_direct_cut.sv b/hw/tile/eu_direct_cut.sv index 94665127..5635de6d 100644 --- a/hw/tile/eu_direct_cut.sv +++ b/hw/tile/eu_direct_cut.sv @@ -26,80 +26,96 @@ module eu_direct_cut parameter type eu_direct_rsp_t = logic, parameter bit Bypass = 1'b0, parameter bit BypassReq = Bypass, - parameter bit BypassRsp = Bypass + parameter bit BypassRsp = Bypass, + parameter int unsigned NB_CORES = 1 )( input logic clk_i, input logic rst_ni, - input eu_direct_req_t sbr_req_i, - output eu_direct_rsp_t sbr_rsp_o, + input eu_direct_req_t [NB_CORES -1 :0] sbr_req_i , + output eu_direct_rsp_t [NB_CORES -1 :0] sbr_rsp_o, - output eu_direct_req_t mgr_req_o, - input eu_direct_rsp_t mgr_rsp_i + output eu_direct_req_t [NB_CORES -1 :0] mgr_req_o, + input eu_direct_rsp_t [NB_CORES -1 :0] mgr_rsp_i ); - // Request payload (exclude req signal for valid/ready) + // ============================ + // Payload typedefs + // ============================ typedef struct packed { logic[magia_pkg::ADDR_W-1:0] addr; logic wen; logic[magia_pkg::DATA_W-1:0] wdata; logic[3:0] be; } eu_req_payload_t; - - eu_req_payload_t sbr_req_payload, mgr_req_payload; - - assign sbr_req_payload.addr = sbr_req_i.addr; - assign 
sbr_req_payload.wen = sbr_req_i.wen; - assign sbr_req_payload.wdata = sbr_req_i.wdata; - assign sbr_req_payload.be = sbr_req_i.be; - - assign mgr_req_o.addr = mgr_req_payload.addr; - assign mgr_req_o.wen = mgr_req_payload.wen; - assign mgr_req_o.wdata = mgr_req_payload.wdata; - assign mgr_req_o.be = mgr_req_payload.be; - - // Request spill register - spill_register #( - .T ( eu_req_payload_t ), - .Bypass ( BypassReq ) - ) i_req_spill ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .valid_i ( sbr_req_i.req ), - .ready_o ( sbr_rsp_o.gnt ), - .data_i ( sbr_req_payload ), - .valid_o ( mgr_req_o.req ), - .ready_i ( mgr_rsp_i.gnt ), - .data_o ( mgr_req_payload ) - ); - - // Response payload (exclude rvalid for valid/ready) + typedef struct packed { logic[magia_pkg::DATA_W-1:0] rdata; logic err; } eu_rsp_payload_t; - - eu_rsp_payload_t sbr_rsp_payload, mgr_rsp_payload; - - assign mgr_rsp_payload.rdata = mgr_rsp_i.rdata; - assign mgr_rsp_payload.err = mgr_rsp_i.err; - - assign sbr_rsp_o.rdata = sbr_rsp_payload.rdata; - assign sbr_rsp_o.err = sbr_rsp_payload.err; - - // Response spill register - spill_register #( - .T ( eu_rsp_payload_t ), - .Bypass ( BypassRsp ) - ) i_rsp_spill ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .valid_i ( mgr_rsp_i.rvalid ), - .ready_o ( ), - .data_i ( mgr_rsp_payload ), - .valid_o ( sbr_rsp_o.rvalid ), - .ready_i ( 1'b1 ), - .data_o ( sbr_rsp_payload ) - ); - -endmodule + + + // ============================ + // Per-core logic + // ============================ + for (genvar i = 0; i < NB_CORES; i++) begin : GEN_EU_CUT + + // ---------------------------- + // Request path + // ---------------------------- + eu_req_payload_t sbr_req_payload; + eu_req_payload_t mgr_req_payload; + + assign sbr_req_payload.addr = sbr_req_i[i].addr; + assign sbr_req_payload.wen = sbr_req_i[i].wen; + assign sbr_req_payload.wdata = sbr_req_i[i].wdata; + assign sbr_req_payload.be = sbr_req_i[i].be; + + assign mgr_req_o[i].addr = mgr_req_payload.addr; + assign 
mgr_req_o[i].wen = mgr_req_payload.wen; + assign mgr_req_o[i].wdata = mgr_req_payload.wdata; + assign mgr_req_o[i].be = mgr_req_payload.be; + + spill_register #( + .T ( eu_req_payload_t ), + .Bypass ( BypassReq ) + ) i_req_spill ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( sbr_req_i[i].req ), + .ready_o ( sbr_rsp_o[i].gnt ), + .data_i ( sbr_req_payload ), + .valid_o ( mgr_req_o[i].req ), + .ready_i ( mgr_rsp_i[i].gnt ), + .data_o ( mgr_req_payload ) + ); + + // ---------------------------- + // Response path + // ---------------------------- + eu_rsp_payload_t sbr_rsp_payload; + eu_rsp_payload_t mgr_rsp_payload; + + assign mgr_rsp_payload.rdata = mgr_rsp_i[i].rdata; + assign mgr_rsp_payload.err = mgr_rsp_i[i].err; + + assign sbr_rsp_o[i].rdata = sbr_rsp_payload.rdata; + assign sbr_rsp_o[i].err = sbr_rsp_payload.err; + + spill_register #( + .T ( eu_rsp_payload_t ), + .Bypass ( BypassRsp ) + ) i_rsp_spill ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( mgr_rsp_i[i].rvalid ), + .ready_o ( ), + .data_i ( mgr_rsp_payload ), + .valid_o ( sbr_rsp_o[i].rvalid ), + .ready_i ( 1'b1 ), + .data_o ( sbr_rsp_payload ) + ); + + end + +endmodule \ No newline at end of file diff --git a/hw/tile/magia_event_unit.sv b/hw/tile/magia_event_unit.sv index a63985e9..dbff502b 100644 --- a/hw/tile/magia_event_unit.sv +++ b/hw/tile/magia_event_unit.sv @@ -58,16 +58,16 @@ import magia_tile_pkg::*; input logic [NB_CORES-1:0] dbg_req_i, output logic [NB_CORES-1:0] core_dbg_req_o, - // EU Direct Link interface - input logic eu_direct_req_i, - input logic [31:0] eu_direct_addr_i, - input logic eu_direct_wen_i, - input logic [31:0] eu_direct_wdata_i, - input logic [3:0] eu_direct_be_i, - output logic eu_direct_gnt_o, - output logic eu_direct_rvalid_o, - output logic [31:0] eu_direct_rdata_o, - output logic eu_direct_err_o, + // EU Direct Link interface (one per core) + input logic [NB_CORES-1:0] eu_direct_req_i, + input logic [NB_CORES-1:0][31:0] eu_direct_addr_i, + input 
logic [NB_CORES-1:0] eu_direct_wen_i, + input logic [NB_CORES-1:0][31:0] eu_direct_wdata_i, + input logic [NB_CORES-1:0][3:0] eu_direct_be_i, + output logic [NB_CORES-1:0] eu_direct_gnt_o, + output logic [NB_CORES-1:0] eu_direct_rvalid_o, + output logic [NB_CORES-1:0][31:0] eu_direct_rdata_o, + output logic [NB_CORES-1:0] eu_direct_err_o, // OBI slave connection input core_obi_data_req_t obi_req_i, @@ -81,29 +81,31 @@ import magia_tile_pkg::*; // Internal signals logic soc_periph_evt_ready_internal; - // Convert abstract eu_direct interface to XBAR_PERIPH_BUS + // Convert abstract eu_direct interface to XBAR_PERIPH_BUS (one per core) // eu_direct_addr_i already contains relative offset (subtracted by demux) - assign eu_direct_link[0].req = eu_direct_req_i; - assign eu_direct_link[0].add = eu_direct_addr_i; - assign eu_direct_link[0].wen = eu_direct_wen_i; - assign eu_direct_link[0].wdata = eu_direct_wdata_i; - assign eu_direct_link[0].be = eu_direct_be_i; - assign eu_direct_link[0].id = '0; + generate + for (genvar k = 0; k < NB_CORES; k++) begin : gen_eu_direct_link + assign eu_direct_link[k].req = eu_direct_req_i[k]; + assign eu_direct_link[k].add = eu_direct_addr_i[k]; + assign eu_direct_link[k].wen = eu_direct_wen_i[k]; + assign eu_direct_link[k].wdata = eu_direct_wdata_i[k]; + assign eu_direct_link[k].be = eu_direct_be_i[k]; + assign eu_direct_link[k].id = '0; - // Convert XBAR_PERIPH_BUS response to abstract interface - // Event Unit handles all power management and grant logic internally - assign eu_direct_gnt_o = eu_direct_link[0].gnt; - assign eu_direct_rvalid_o = eu_direct_link[0].r_valid; - assign eu_direct_rdata_o = eu_direct_link[0].r_rdata; - assign eu_direct_err_o = eu_direct_link[0].r_opc; // r_opc: 0=OK, 1=ERROR + assign eu_direct_gnt_o[k] = eu_direct_link[k].gnt; + assign eu_direct_rvalid_o[k] = eu_direct_link[k].r_valid; + assign eu_direct_rdata_o[k] = eu_direct_link[k].r_rdata; + assign eu_direct_err_o[k] = eu_direct_link[k].r_opc; + end + 
endgenerate // Address range check and offset calculation localparam logic [magia_pkg::ADDR_W-1:0] EU_BASE_ADDR = magia_tile_pkg::EVENT_UNIT_ADDR_START; logic addr_in_range; logic [magia_pkg::ADDR_W-1:0] addr_offset; - assign addr_in_range = (obi_req_i.a.addr >= magia_tile_pkg::EVENT_UNIT_ADDR_START) && - (obi_req_i.a.addr <= magia_tile_pkg::EVENT_UNIT_ADDR_END); + assign addr_in_range = (obi_req_i.a.addr >= magia_tile_pkg::EVENT_UNIT_ADDR_START) && + (obi_req_i.a.addr < magia_tile_pkg::EVENT_UNIT_ADDR_END); assign addr_offset = obi_req_i.a.addr - EU_BASE_ADDR; // OBI to XBAR_PERIPH_BUS conversion - pass RELATIVE address (offset from base) diff --git a/hw/tile/magia_tile.sv b/hw/tile/magia_tile.sv index 188bead3..4b5b96ca 100644 --- a/hw/tile/magia_tile.sv +++ b/hw/tile/magia_tile.sv @@ -116,7 +116,7 @@ module magia_tile input logic[magia_pkg::N_IRQ-1:0] irq_i, - input logic debug_req_i, + input logic[magia_tile_pkg::N_CLUSTER_CORES:0] debug_req_i, output logic debug_havereset_o, output logic debug_running_o, output logic debug_halted_o, @@ -144,9 +144,9 @@ module magia_tile logic[magia_pkg::ADDR_W-1:0] tile_fsync_ctrl_end_addr; logic[magia_pkg::ADDR_W-1:0] tile_event_unit_start_addr; logic[magia_pkg::ADDR_W-1:0] tile_event_unit_end_addr; - logic[magia_pkg::ADDR_W-1:0] tile_spatz_ctrl_start_addr; - logic[magia_pkg::ADDR_W-1:0] tile_spatz_ctrl_end_addr; - + logic[magia_pkg::ADDR_W-1:0] tile_csr_start_addr; + logic[magia_pkg::ADDR_W-1:0] tile_csr_end_addr; + magia_tile_pkg::redmule_data_req_t redmule_data_req; magia_tile_pkg::redmule_data_rsp_t redmule_data_rsp; @@ -168,11 +168,11 @@ module magia_tile magia_tile_pkg::core_obi_data_req_t core_l1_data_amo_req; magia_tile_pkg::core_obi_data_rsp_t core_l1_data_amo_rsp; - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_rsp; // Index 0 -> 
core request, Index 1 -> ext request, Index 2 -> Spatz request + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request, Index 3-3+N_CLUSTER_CORES -> cluster cores + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request, Index 3-3+N_CLUSTER_CORES -> cluster cores - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request, Index 3-3+N_CLUSTER_CORES -> cluster cores + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request, Index 3-3+N_CLUSTER_CORES -> cluster cores magia_tile_pkg::core_obi_data_req_t ext_obi_data_req; magia_tile_pkg::core_obi_data_rsp_t ext_obi_data_rsp; @@ -314,10 +314,10 @@ module magia_tile logic fsync_error; // Event arrays for Event Unit (need proper 2D array structure) - logic [0:0] [3:0] acc_events_array; - logic [0:0] [1:0] dma_events_array; - logic [0:0] [1:0] timer_events_array; - logic [0:0][31:0] other_events_array; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] [3:0] acc_events_array; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] [1:0] dma_events_array; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] [1:0] timer_events_array; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] [31:0] other_events_array; // FlooNoC connections between NI and router id_t floo_id; @@ -348,13 +348,16 @@ module magia_tile logic x_result_ready; 
fpu_ss_pkg::x_result_t x_result; - // Event Unit signals - Corrected for single-core array interface - logic [0:0] eu_core_irq_req; // [0:0] array for single core - logic [0:0][magia_tile_pkg::EVENT_UNIT_IRQ_WIDTH-1:0] eu_core_irq_id; // [0:0][4:0] array - logic [0:0] eu_core_irq_ack; // [0:0] array - logic [0:0][magia_tile_pkg::EVENT_UNIT_IRQ_WIDTH-1:0] eu_core_irq_ack_id; // [0:0][4:0] array - logic [0:0] eu_core_clk_en; // [0:0] array - logic [0:0] eu_core_dbg_req; // [0:0] array + // Event Unit signals + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_core_irq_req; + logic [magia_tile_pkg::N_CLUSTER_CORES:0][magia_tile_pkg::EVENT_UNIT_IRQ_WIDTH-1:0] eu_core_irq_id; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_core_irq_ack; + logic [magia_tile_pkg::N_CLUSTER_CORES:0][magia_tile_pkg::EVENT_UNIT_IRQ_WIDTH-1:0] eu_core_irq_ack_id; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_core_clk_en; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_core_dbg_req; + // Per-core 32-bit irq vector for CV32E40P. EU IRQ is mapped to MEI (bit 11), + // all other bits forced to 0 to avoid X-propagation through irq_i. 
+ logic [magia_tile_pkg::N_CLUSTER_CORES:0][31:0] core_irq_vec; // Core data demux signals magia_tile_pkg::core_data_req_t core_data_req_to_xbar; @@ -362,9 +365,35 @@ module magia_tile magia_tile_pkg::eu_direct_req_t eu_direct_req; magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp; + // Cluster core data interface (converted directly to OBI xbar) + magia_tile_pkg::core_data_req_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_data_req; + magia_tile_pkg::core_data_rsp_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_data_rsp; + + // Cluster core OBI data interface (output from demux data2obi) + magia_tile_pkg::core_obi_data_req_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_obi_data_req; + magia_tile_pkg::core_obi_data_rsp_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_obi_data_rsp; + + // EU direct req/rsp arrays for the cut (CV32 core[0] + cluster cores[1..N]) + magia_tile_pkg::eu_direct_req_t [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_req_arr; + magia_tile_pkg::eu_direct_rsp_t [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_rsp_arr; + // EU direct with pipeline cut - magia_tile_pkg::eu_direct_req_t eu_direct_req_cut; - magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp_cut; + magia_tile_pkg::eu_direct_req_t [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_req_cut; + magia_tile_pkg::eu_direct_rsp_t [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_rsp_cut; + + // Flat EU direct signals for event unit connection + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_req_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0][31:0] eu_direct_addr_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_wen_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0][31:0] eu_direct_wdata_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0][3:0] eu_direct_be_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_gnt_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_rvalid_flat; + logic [magia_tile_pkg::N_CLUSTER_CORES:0][31:0] eu_direct_rdata_flat; + logic 
[magia_tile_pkg::N_CLUSTER_CORES:0] eu_direct_err_flat; + + // Core busy signal array for event unit + logic [magia_tile_pkg::N_CLUSTER_CORES:0] eu_core_busy; // Spatz CC signals snitch_pkg::interrupts_t spatz_irq; @@ -389,6 +418,49 @@ module magia_tile logic spatz_clk_en; logic spatz_clk; + + // Cluster signals + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_clk; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_clk_en; + logic [31:0] cluster_boot_addr [magia_tile_pkg::N_CLUSTER_CORES-1:0]; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_setback; + magia_tile_pkg::core_instr_req_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_instr_req; + magia_tile_pkg::core_instr_rsp_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_instr_rsp; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_fetch_enable; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_core_sleep; +`ifdef RI5CY + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_core_busy; // RI5CY cluster: core_busy_o intermediate +`endif + logic cluster_done; + // Per-core dispatch IRQ pulse from tile_csr. CV32E40P IRQ inputs are level + // sensitive, so the pulse is stretched until the worker acknowledges MEI. + // PULP cluster cores remain disconnected from the event-unit IRQ port. 
+ logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_start_irq; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_start_irq_pending; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_irq_ack; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0][magia_tile_pkg::CLIC_ID_W-1:0] cluster_irq_id; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0][31:0] cluster_irq_vec; + + + // Cluster icache interface (raw signals - struct type uses NR_FETCH_PORTS=1, not N_CLUSTER_CORES) + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_cache_req; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0][magia_tile_pkg::CLUSTER_FETCH_AW-1:0] cluster_cache_addr; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_cache_gnt; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_cache_rvalid; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0][magia_tile_pkg::CLUSTER_FETCH_DW-1:0] cluster_cache_rdata; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_cache_rerror; + + logic cluster_enable_prefetching; + snitch_icache_pkg::icache_l0_events_t [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_icache_l0_events; + snitch_icache_pkg::icache_l1_events_t cluster_icache_l1_events; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_icache_flush_valid; + logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_icache_flush_ready; + + magia_tile_pkg::core_axi_instr_req_t cluster_l2_instr_req; + magia_tile_pkg::core_axi_instr_rsp_t cluster_l2_instr_rsp; + + + /*******************************************************/ /** Internal Signal Definitions End **/ /*******************************************************/ @@ -403,8 +475,8 @@ module magia_tile assign tile_fsync_ctrl_end_addr = magia_tile_pkg::FSYNC_CTRL_ADDR_END; assign tile_event_unit_start_addr = magia_tile_pkg::EVENT_UNIT_ADDR_START; assign tile_event_unit_end_addr = magia_tile_pkg::EVENT_UNIT_ADDR_END; - assign tile_spatz_ctrl_start_addr = magia_tile_pkg::SPATZ_CTRL_ADDR_START; - assign tile_spatz_ctrl_end_addr = magia_tile_pkg::SPATZ_CTRL_ADDR_END; + assign 
tile_csr_start_addr = magia_tile_pkg::TILE_CSR_START; + assign tile_csr_end_addr = magia_tile_pkg::TILE_CSR_END; assign tile_reserved_start_addr = magia_tile_pkg::RESERVED_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; assign tile_reserved_end_addr = magia_tile_pkg::RESERVED_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; assign tile_l1_start_addr = magia_tile_pkg::L1_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; @@ -415,7 +487,7 @@ module magia_tile assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_STACK_IDX] = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END }; assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] = '{idx: 32'd5, start_addr: tile_event_unit_start_addr, end_addr: tile_event_unit_end_addr }; - assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX] = '{idx: 32'd6, start_addr: tile_spatz_ctrl_start_addr, end_addr: tile_spatz_ctrl_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_TILE_CSR_IDX] = '{idx: 32'd6, start_addr: tile_csr_start_addr, end_addr: tile_csr_end_addr }; `ifndef CV32E40X assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_REDMULE_CTRL_IDX] = '{idx: 32'd2, start_addr: tile_redmule_ctrl_start_addr, end_addr: tile_redmule_ctrl_end_addr }; assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_IDMA_IDX] = '{idx: 32'd3, start_addr: tile_idma_ctrl_start_addr, end_addr: tile_idma_ctrl_end_addr }; @@ -426,17 +498,18 @@ module magia_tile assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_BOOTROM_IDX] = '{idx: 32'd2, start_addr: 
magia_tile_pkg::SPATZ_BOOT_ADDR, end_addr: magia_tile_pkg::SPATZ_BOOT_ADDR + magia_tile_pkg::SPATZ_BOOTROM_SIZE}; - assign obi_xbar_en_default_idx = '1; // Routing to the AXI Xbar all requests with an address outside the range of the internal L1 and the external L2 assign obi_xbar_default_idx = '0; - assign axi_xbar_slv_req[magia_tile_pkg::AXI_SLV_CORE_DATA_IDX] = core_l2_data_req; - assign core_l2_data_rsp = axi_xbar_slv_rsp[magia_tile_pkg::AXI_SLV_CORE_DATA_IDX]; - assign axi_xbar_slv_req[magia_tile_pkg::AXI_SLV_CORE_INSTR_IDX] = core_l2_instr_req; - assign core_l2_instr_rsp = axi_xbar_slv_rsp[magia_tile_pkg::AXI_SLV_CORE_INSTR_IDX]; + assign axi_xbar_slv_req[magia_tile_pkg::AXI_SLV_CORE_DATA_IDX] = core_l2_data_req; + assign core_l2_data_rsp = axi_xbar_slv_rsp[magia_tile_pkg::AXI_SLV_CORE_DATA_IDX]; + assign axi_xbar_slv_req[magia_tile_pkg::AXI_SLV_CORE_INSTR_IDX] = core_l2_instr_req; + assign core_l2_instr_rsp = axi_xbar_slv_rsp[magia_tile_pkg::AXI_SLV_CORE_INSTR_IDX]; assign axi_xbar_slv_req[magia_tile_pkg::AXI_SLV_SPATZ_INSTR_IDX] = spatz_icache_axi_req; assign spatz_icache_axi_rsp = axi_xbar_slv_rsp[magia_tile_pkg::AXI_SLV_SPATZ_INSTR_IDX]; - + assign axi_xbar_slv_req[magia_tile_pkg::AXI_SLV_CLUSTER_INSTR_IDX] = cluster_l2_instr_req; + assign cluster_l2_instr_rsp = axi_xbar_slv_rsp[magia_tile_pkg::AXI_SLV_CLUSTER_INSTR_IDX]; + assign obi_xbar_slv_req[magia_tile_pkg::OBI_CORE_IDX] = core_obi_data_req; assign core_obi_data_rsp = obi_xbar_slv_rsp[magia_tile_pkg::OBI_CORE_IDX]; assign obi_xbar_slv_req[magia_tile_pkg::OBI_EXT_IDX] = ext_obi_data_req; @@ -444,6 +517,15 @@ module magia_tile assign obi_xbar_slv_req[magia_tile_pkg::OBI_SPATZ_IDX] = spatz_obi_req; assign spatz_obi_rsp = obi_xbar_slv_rsp[magia_tile_pkg::OBI_SPATZ_IDX]; + + + generate + for (genvar idx_core = 0; idx_core < magia_tile_pkg::N_CLUSTER_CORES; idx_core++) begin + assign obi_xbar_slv_req[magia_tile_pkg::OBI_SPATZ_IDX + 1 + idx_core] = cluster_obi_data_req[idx_core]; + assign 
cluster_obi_data_rsp[idx_core] = obi_xbar_slv_rsp[magia_tile_pkg::OBI_SPATZ_IDX + 1 + idx_core]; + end + endgenerate + assign axi_data_user = '0; assign obi_rsp_data_user = '0; @@ -825,6 +907,25 @@ module magia_tile ); `endif +/*********************** Cluster **********************************/ + + + + // Cluster icache interface: direct signal assignments + generate + for (genvar i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_cluster_icache_assign + assign cluster_cache_req[i] = cluster_instr_req[i].req; + assign cluster_cache_addr[i] = cluster_instr_req[i].addr; + assign cluster_instr_rsp[i].gnt = cluster_cache_gnt[i]; + assign cluster_instr_rsp[i].rvalid = cluster_cache_rvalid[i]; + assign cluster_instr_rsp[i].rdata = cluster_cache_rdata[i]; + assign cluster_instr_rsp[i].err = cluster_cache_rerror[i]; + end + endgenerate + + + + /*******************************************************/ /** Type Conversions End **/ /*******************************************************/ @@ -849,22 +950,49 @@ module magia_tile .eu_direct_rsp_i ( eu_direct_rsp ) ); + // Assemble EU direct req/rsp arrays. 
Only the CV32 control core uses the EU + // direct link; cluster cores go through the OBI xbar + assign eu_direct_req_arr[0] = eu_direct_req; + assign eu_direct_rsp = eu_direct_rsp_arr[0]; + + generate + for (genvar i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_eu_direct_arr + assign eu_direct_req_arr[i+1] = '0; + end + endgenerate + // EU direct pipeline cut eu_direct_cut #( - .eu_direct_req_t ( magia_tile_pkg::eu_direct_req_t ), - .eu_direct_rsp_t ( magia_tile_pkg::eu_direct_rsp_t ), - .Bypass ( 1'b0 ), - .BypassReq ( 1'b0 ), - .BypassRsp ( 1'b0 ) + .eu_direct_req_t ( magia_tile_pkg::eu_direct_req_t ), + .eu_direct_rsp_t ( magia_tile_pkg::eu_direct_rsp_t ), + .Bypass ( 1'b0 ), + .BypassReq ( 1'b0 ), + .BypassRsp ( 1'b0 ), + .NB_CORES ( magia_tile_pkg::N_CLUSTER_CORES +1 ) ) i_eu_direct_cut ( .clk_i ( sys_clk ), .rst_ni ( rst_ni ), - .sbr_req_i ( eu_direct_req ), - .sbr_rsp_o ( eu_direct_rsp ), + .sbr_req_i ( eu_direct_req_arr ), + .sbr_rsp_o ( eu_direct_rsp_arr ), .mgr_req_o ( eu_direct_req_cut ), .mgr_rsp_i ( eu_direct_rsp_cut ) ); + // Flatten eu_direct_cut output for event unit connection + generate + for (genvar k = 0; k < magia_tile_pkg::N_CLUSTER_CORES + 1; k++) begin : gen_eu_direct_flat + assign eu_direct_req_flat[k] = eu_direct_req_cut[k].req; + assign eu_direct_addr_flat[k] = eu_direct_req_cut[k].addr; + assign eu_direct_wen_flat[k] = eu_direct_req_cut[k].wen; + assign eu_direct_wdata_flat[k] = eu_direct_req_cut[k].wdata; + assign eu_direct_be_flat[k] = eu_direct_req_cut[k].be; + assign eu_direct_rsp_cut[k].gnt = eu_direct_gnt_flat[k]; + assign eu_direct_rsp_cut[k].rvalid = eu_direct_rvalid_flat[k]; + assign eu_direct_rsp_cut[k].rdata = eu_direct_rdata_flat[k]; + assign eu_direct_rsp_cut[k].err = eu_direct_err_flat[k]; + end + endgenerate + /*******************************************************/ /** Core Data Demux End **/ /*******************************************************/ @@ -1011,7 +1139,7 @@ module magia_tile 
`HCI_ASSIGN_TO_INTF(hci_core_if[i+1], spatz_hci_req[i], spatz_hci_rsp[i]) // Spatz CC HCI ports end endgenerate - `HCI_ASSIGN_TO_INTF(hci_redmule_if[0], redmule_data_req, redmule_data_rsp) // Only 1 RedMulE supported + `HCI_ASSIGN_TO_INTF(hci_redmule_if[0], redmule_data_req, redmule_data_rsp) // Only 1 RedMulE supported `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_OUT_CH_READ_IDX], idma_hci_read_req_out, idma_hci_read_rsp_out) // iDMA out HCI read channel `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_OUT_CH_WRITE_IDX], idma_hci_write_req_out, idma_hci_write_rsp_out) // iDMA out HCI write channel `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_IN_CH_READ_IDX], idma_hci_read_req_in, idma_hci_read_rsp_in) // iDMA in HCI read channel @@ -1112,7 +1240,7 @@ module magia_tile .PMA_CFG ( ), // No array of PMA configurations .CLIC ( magia_tile_pkg::CLIC_EN ), // Support for Smclic, Smclicshv and Smclicconfig .CLIC_ID_WIDTH ( magia_tile_pkg::CLIC_ID_W ) // Width of clic_irq_id_i and clic_irq_id_o - ) i_cv32e40x_core ( + ) i_cv32e40x_main_core ( // Clock and reset .clk_i ( sys_clk ), .rst_ni ( rst_ni ), @@ -1191,8 +1319,7 @@ module magia_tile .core_sleep_o , .wu_wfe_i ); -`else - // RI5CY core with integrated FPU and tracer +`elsif RI5CY riscv_core #( .N_EXT_PERF_COUNTERS ( magia_tile_pkg::N_EXT_PERF_COUNTERS ), .INSTR_RDATA_WIDTH ( magia_tile_pkg::INSTR_RDATA_WIDTH ), @@ -1214,7 +1341,7 @@ module magia_tile .APU_NDSFLAGS_CPU ( magia_tile_pkg::APU_NDSFLAGS_CPU ), .APU_NUSFLAGS_CPU ( magia_tile_pkg::APU_NUSFLAGS_CPU ), .DM_HaltAddress ( magia_tile_pkg::DM_HALT_ADDR ) - ) i_cv32e40p_core ( + ) i_ri5cy_main_core ( // Clock and Reset .clk_i ( core_clk ), // Use gated clock for core .rst_ni ( rst_ni ), @@ -1230,8 +1357,8 @@ module magia_tile .boot_addr_i ( boot_addr_i ), // Cluster/Core IDs - .cluster_id_i ( mhartid_i[9:4] ), - .core_id_i ( mhartid_i[3:0] ), + .cluster_id_i ( '0 ), + .core_id_i ( mhartid_i[3:0] ), // Instruction memory interface 
.instr_req_o ( core_instr_req.req ), @@ -1275,7 +1402,7 @@ module magia_tile .sec_lvl_o ( ), // Debug interface - .debug_req_i ( debug_req_i ), + .debug_req_i ( debug_req_i[0] ), // CPU control .fetch_enable_i ( fetch_enable_i ), @@ -1286,7 +1413,59 @@ module magia_tile .ext_perf_counters_i ( '0 ) ); - assign core_sleep_o = !core_busy_o; +`else +`ifndef CORE_TRACES + cv32e40p_top #( +`else + cv32e40p_wrapper #( +`endif + .COREV_PULP ( 1 ), // 1 = custom PULP ISA extensions enabled + .COREV_CLUSTER ( 1 ), + .FPU ( FPU ), + .ZFINX ( magia_tile_pkg::ZFINX ), + .FPU_ADDMUL_LAT ( 1 ), // Match C_LAT_FP32=1 in fpnew wrapper + .FPU_OTHERS_LAT ( 1 ), // Match C_LAT_NONCOMP=1 in fpnew wrapper + .NUM_MHPMCOUNTERS ( 29 ) + ) i_cv32e40p_main_core ( + // Clock and Reset + .clk_i ( core_clk ), // Use gated clock for core + .rst_ni ( rst_ni ), + + // Clock Interface + .pulp_clock_en_i ( sys_clk_en ), + .scan_cg_en_i ( test_mode_i ), + .boot_addr_i ( boot_addr_i ), + .mtvec_addr_i ( boot_addr_i ), // mtvec defaults to boot vector; SW can override via csrw + .dm_halt_addr_i ( magia_tile_pkg::DM_HALT_ADDR), + .hart_id_i ( mhartid_i ), + .dm_exception_addr_i ( magia_tile_pkg::DM_HALT_ADDR + 16'h000C), //to be checked + // Instruction interface + .instr_req_o ( core_instr_req.req ), + .instr_gnt_i ( core_instr_rsp.gnt ), + .instr_rvalid_i ( core_instr_rsp.rvalid ), + .instr_addr_o ( core_instr_req.addr ), + .instr_rdata_i ( core_instr_rsp.rdata ), + // Data interface + .data_req_o ( core_data_req.req ), + .data_gnt_i ( core_data_rsp.gnt ), + .data_rvalid_i ( core_data_rsp.rvalid ), + .data_addr_o ( core_data_req.addr ), + .data_be_o ( core_data_req.be ), + .data_wdata_o ( core_data_req.wdata ), + .data_we_o ( core_data_req.we ), + .data_rdata_i ( core_data_rsp.rdata ), + // Interrupts (irq_i is [31:0]; EU IRQ goes to MEI bit 11, others 0) + .irq_i ( core_irq_vec[0] ), + .irq_ack_o ( eu_core_irq_ack[0] ), + .irq_id_o ( eu_core_irq_ack_id[0] ), + // Debug interface + .debug_req_i ( debug_req_i[0] ), + //
CPU control + .fetch_enable_i ( fetch_enable_i ), + .core_sleep_o ( core_sleep_o ) + ); + + assign core_instr_req.memtype = 2'b00; assign core_instr_req.prot = 3'b000; @@ -1326,6 +1505,15 @@ module magia_tile assign obi_xbar_slv_cut_req[magia_tile_pkg::OBI_CORE_IDX] = obi_xbar_slv_req[magia_tile_pkg::OBI_CORE_IDX]; assign obi_xbar_slv_rsp[magia_tile_pkg::OBI_CORE_IDX] = obi_xbar_slv_cut_rsp[magia_tile_pkg::OBI_CORE_IDX]; + + generate + for (genvar idx_core = 0; idx_core < magia_tile_pkg::N_CLUSTER_CORES; idx_core++) begin + assign obi_xbar_slv_cut_req[magia_tile_pkg::OBI_SPATZ_IDX + 1 + idx_core] = obi_xbar_slv_req[magia_tile_pkg::OBI_SPATZ_IDX + 1 + idx_core]; + assign obi_xbar_slv_rsp[magia_tile_pkg::OBI_SPATZ_IDX + 1 + idx_core] = obi_xbar_slv_cut_rsp[magia_tile_pkg::OBI_SPATZ_IDX + 1 + idx_core]; + end + endgenerate + + obi_xbar #( .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), .MgrPortObiCfg ( ), @@ -1909,42 +2097,116 @@ module magia_tile /*******************************************************/ /** Floating-Point Unit End **/ /*******************************************************/ -/** Spatz Control Slave Beginning **/ +/** Tile CSR Beginning **/ /*******************************************************/ - + /* obi_slave_ctrl_spatz #( - .BaseAddr ( magia_tile_pkg::SPATZ_CTRL_ADDR_START ) + .BaseAddr ( magia_tile_pkg::TILE_CSR_START ) ) i_spatz_ctrl ( .clk_i ( sys_clk ), .rst_ni ( rst_ni ), - .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX] ), - .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX] ), + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_TILE_CSR_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_TILE_CSR_IDX] ), .clk_en_o ( spatz_clk_en ), .start_o ( spatz_start ), .done_o ( spatz_done ) ); + */ + + tile_csr #( + .BaseAddr ( magia_tile_pkg::TILE_CSR_START ) + ) i_tile_csr ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .obi_req_i ( 
core_mem_data_req[magia_tile_pkg::OBI_XBAR_TILE_CSR_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_TILE_CSR_IDX] ), + .spatz_clk_en_o ( spatz_clk_en ), + .spatz_start_o ( spatz_start ), + .spatz_done_o ( spatz_done ), + .cluster_clk_en_o ( cluster_clk_en ), + .cluster_boot_addr_o ( cluster_boot_addr ), + .cluster_fetch_en_o ( cluster_fetch_enable ), + .cluster_done_o ( cluster_done ), + .cluster_start_irq_o ( cluster_start_irq ) + ); + /*******************************************************/ -/** Spatz Control Slave End **/ +/** Tile CSR End **/ /*******************************************************/ /** Event Unit Beginning **/ /*******************************************************/ // Event array assignments for proper 2D array structure - assign acc_events_array[0] = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, spatz_done}; - assign dma_events_array[0] = {idma_obi2axi_done, idma_axi2obi_done}; - assign timer_events_array[0] = 2'b00; - assign other_events_array[0] = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, spatz_start, 23'b0}; // iDMA status events [31:28]|iDMA errors [27:26]|Fsync [25:24]|Spatz start [23]|Reserved [22:0] + // Per-tile shared HW event lines. PULP DONE is exposed to the CV32 Event Unit + // on event bit 12. Cluster cores are dispatched by + // tile_csr MEI pulses and do not consume EU direct-link events. 
+ logic [3:0] acc_events_shared; + logic [1:0] dma_events_shared; + logic [1:0] timer_events_shared; + logic [31:0] other_events_shared; + + assign acc_events_shared = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, spatz_done}; + assign dma_events_shared = {idma_obi2axi_done, idma_axi2obi_done}; + assign timer_events_shared = 2'b00; + assign other_events_shared = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, spatz_start, 7'b0, 3'b0, cluster_done, 12'b0}; // iDMA status [31:28] | iDMA errors [27:26] | Fsync [25:24] | Spatz start [23] | PULP done [12] + + // Broadcast event lines to CV32 (idx 0) and to every cluster core (idx 1..N). + generate + for (genvar i = 0; i <= magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_eu_events_broadcast + assign acc_events_array[i] = acc_events_shared; + assign dma_events_array[i] = dma_events_shared; + assign timer_events_array[i] = timer_events_shared; + assign other_events_array[i] = other_events_shared; + end + endgenerate + + // Drive cluster icache control signals + assign cluster_enable_prefetching = 1'b0; + assign cluster_icache_flush_valid = '0; + + // Core busy array for Event Unit (CV32 + cluster cores) + assign eu_core_busy[0] = ~core_sleep_o; + generate + for (genvar i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_eu_core_busy + assign eu_core_busy[i+1] = ~cluster_core_sleep[i]; + end + endgenerate + + // Build the per-core 32-bit irq_i vector. The EU IRQ request is mapped to + // the Machine External Interrupt (bit 11), which is the only standard + // RISC-V interrupt bit for external devices and is enabled by IRQ_MASK in + // CV32E40P. All other bits are forced to 0 to prevent X-propagation + // (otherwise an unconnected [31:0] input would be 'z, get masked to X by + // IRQ_MASK, and corrupt the controller FSM during cv.elw). 
+ generate + for (genvar i = 0; i <= magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_core_irq_vec + assign core_irq_vec[i] = {20'b0, eu_core_irq_req[i], 11'b0}; + end + endgenerate `ifdef CV32E40X assign eu_core_irq_ack = eu_core_irq_req; assign eu_core_irq_ack_id = eu_core_irq_id; assign core_busy_o = !core_sleep_o; +`else + // PULP cluster cores are no longer wired to the event unit's IRQ port: + // tie their ack/ack_id slots so the EU sees them as idle/never-acking. + generate + for (genvar i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_cluster_irq_ack_tie + assign eu_core_irq_ack[i+1] = 1'b0; + assign eu_core_irq_ack_id[i+1] = '0; + end + endgenerate +`ifdef RI5CY + // RI5CY outputs core_busy_o (active-high: 1 = busy); derive core_sleep_o (active-high: 1 = sleeping) + assign core_sleep_o = ~core_busy_o; +`endif `endif magia_event_unit #( - .NB_CORES ( 1 ), + .NB_CORES ( 1 + magia_tile_pkg::N_CLUSTER_CORES ), // control core + cluster cores .NB_SW_EVT ( 1 ), .NB_BARR ( 2 ), .NB_HW_MUT ( 1 ), @@ -1970,7 +2232,7 @@ module magia_tile .core_irq_ack_id_i( eu_core_irq_ack_id ), // Core control - .core_busy_i ( core_busy_o ), + .core_busy_i ( eu_core_busy ), .core_clock_en_o ( eu_core_clk_en ), // Debug @@ -1978,15 +2240,15 @@ module magia_tile .core_dbg_req_o ( eu_core_dbg_req ), // EU Direct Link Interface (with cut for timing) - .eu_direct_req_i ( eu_direct_req_cut.req ), - .eu_direct_addr_i ( eu_direct_req_cut.addr ), - .eu_direct_wen_i ( eu_direct_req_cut.wen ), - .eu_direct_wdata_i ( eu_direct_req_cut.wdata ), - .eu_direct_be_i ( eu_direct_req_cut.be ), - .eu_direct_gnt_o ( eu_direct_rsp_cut.gnt ), - .eu_direct_rvalid_o ( eu_direct_rsp_cut.rvalid ), - .eu_direct_rdata_o ( eu_direct_rsp_cut.rdata ), - .eu_direct_err_o ( eu_direct_rsp_cut.err ), + .eu_direct_req_i ( eu_direct_req_flat ), + .eu_direct_addr_i ( eu_direct_addr_flat ), + .eu_direct_wen_i ( eu_direct_wen_flat ), + .eu_direct_wdata_i ( eu_direct_wdata_flat ), + .eu_direct_be_i ( 
eu_direct_be_flat ), + .eu_direct_gnt_o ( eu_direct_gnt_flat ), + .eu_direct_rvalid_o ( eu_direct_rvalid_flat ), + .eu_direct_rdata_o ( eu_direct_rdata_flat ), + .eu_direct_err_o ( eu_direct_err_flat ), // OBI Peripheral Slave Interface .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] ), @@ -2123,8 +2385,8 @@ module magia_tile ) i_axi_to_reg_bootrom ( .clk_i ( sys_clk ), .rst_ni ( rst_ni ), - .axi_req_i ( axi_xbar_mst_req[magia_tile_pkg::AXI_MST_BOOTROM_IDX] ), - .axi_rsp_o ( axi_xbar_mst_rsp[magia_tile_pkg::AXI_MST_BOOTROM_IDX] ), + .axi_req_i ( axi_xbar_mst_req[magia_tile_pkg::AXI_MST_BOOTROM_IDX] ), + .axi_rsp_o ( axi_xbar_mst_rsp[magia_tile_pkg::AXI_MST_BOOTROM_IDX] ), .reg_req_o ( bootrom_reg_req ), .reg_rsp_i ( bootrom_reg_rsp ), .reg_id_o ( bootrom_reg_id ), @@ -2154,4 +2416,259 @@ module magia_tile /** Spatz Bootrom End **/ /*******************************************************/ +/*******************************************************/ +/** Cluster Beginning **/ +/*******************************************************/ + +// PULP cluster cores: clock is always enabled. They are disconnected from the +// event-unit clock-enable path and rely on WFI + MEI (from tile_csr PULP_START) +// for sleep/wake semantics, matching the new dynamic dispatch model. +for (genvar j = 0; j < magia_tile_pkg::N_CLUSTER_CORES; j++) begin : gen_cluster_clk_gate + tc_clk_gating i_cluster_clk_gate ( + .clk_i ( sys_clk ), + .en_i ( 1'b1 ), + .test_en_i ( test_mode_i ), + .clk_o ( cluster_clk[j] ) + ); +end + +always_ff @(posedge sys_clk or negedge rst_ni) begin + if (!rst_ni) begin + cluster_start_irq_pending <= '0; + end else begin + for (int unsigned i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin + // Clear on any ack: cluster cores have exactly one IRQ source (the + // dispatch pulse), so irq_ack always refers to that source. + // CV32E40P acks with irq_id_o=11 (MEI, from priority encoder on irq_i[31:0]).
+ // RI5CY acks with irq_id_o=0 (reflects irq_id_i which is tied to '0). + // Checking irq_id==11 would never fire for RI5CY, leaving pending stuck + // HIGH and causing repeated trap-handler re-entry after every mret. + if (cluster_irq_ack[i]) begin + cluster_start_irq_pending[i] <= 1'b0; + end else if (cluster_start_irq[i]) begin + cluster_start_irq_pending[i] <= 1'b1; + end + end + end +end + +// Build per-core IRQ vector for PULP cluster cores: MEI bit (11) is driven by +// the stretched dispatch request; all other interrupt bits forced to 0. +for (genvar k = 0; k < magia_tile_pkg::N_CLUSTER_CORES; k++) begin : gen_cluster_irq_vec + assign cluster_irq_vec[k] = {20'b0, cluster_start_irq_pending[k], 11'b0}; +end + +generate + for (genvar i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin : CORE + `ifdef RI5CY + // RI5CY core with integrated FPU and tracer + // cluster_id_i identifies WHICH cluster (= tile), same for all cores in a tile. + // core_id_i identifies WHICH core within the cluster (0-indexed). + // Use mhartid_i+1 for cluster_id so tile-0 cluster cores never get 0 (0 = standalone main core). 
+ riscv_core #( + .N_EXT_PERF_COUNTERS ( magia_tile_pkg::N_EXT_PERF_COUNTERS ), + .INSTR_RDATA_WIDTH ( magia_tile_pkg::INSTR_RDATA_WIDTH ), + .PULP_SECURE ( magia_tile_pkg::PULP_SECURE ), + .N_PMP_ENTRIES ( magia_tile_pkg::N_PMP_ENTRIES ), + .USE_PMP ( magia_tile_pkg::USE_PMP ), + .PULP_CLUSTER ( magia_tile_pkg::PULP_CLUSTER ), + .FPU ( magia_tile_pkg::FPU ), + .Zfinx ( magia_tile_pkg::ZFINX ), + .FP_DIVSQRT ( magia_tile_pkg::FP_DIVSQRT ), + .SHARED_FP ( magia_tile_pkg::SHARED_FP ), + .SHARED_DSP_MULT ( magia_tile_pkg::SHARED_DSP_MULT ), + .SHARED_INT_MULT ( magia_tile_pkg::SHARED_INT_MULT ), + .SHARED_INT_DIV ( magia_tile_pkg::SHARED_INT_DIV ), + .SHARED_FP_DIVSQRT ( magia_tile_pkg::SHARED_FP_DIVSQRT ), + .WAPUTYPE ( magia_tile_pkg::WAPUTYPE ), + .APU_NARGS_CPU ( magia_tile_pkg::APU_NARGS_CPU ), + .APU_WOP_CPU ( magia_tile_pkg::APU_WOP_CPU ), + .APU_NDSFLAGS_CPU ( magia_tile_pkg::APU_NDSFLAGS_CPU ), + .APU_NUSFLAGS_CPU ( magia_tile_pkg::APU_NUSFLAGS_CPU ), + .DM_HaltAddress ( magia_tile_pkg::DM_HALT_ADDR ) + ) i_RI5CY_core ( + // Clock and Reset + .clk_i ( cluster_clk[i] ), // Always-on per-core cluster clock (NOT the EU-gated main-core clock) + .rst_ni ( rst_ni ), + + // Clock enable and test mode + .clock_en_i ( sys_clk_en ), + .test_en_i ( test_mode_i ), + + // Floating-point register file disable (for Zfinx) + .fregfile_disable_i ( 1'b0 ), // FPU enabled, use dedicated FP regfile + + // Boot configuration + .boot_addr_i ( cluster_boot_addr[i] ), + + // Cluster/Core IDs + .cluster_id_i ( 6'(mhartid_i) + 6'd1 ), // which cluster (tile+1, 1-indexed) + .core_id_i ( 4'(i) ), // which core within the cluster + + // Instruction memory interface + .instr_req_o ( cluster_instr_req[i].req ), + .instr_addr_o ( cluster_instr_req[i].addr ), + .instr_gnt_i ( cluster_instr_rsp[i].gnt ), + .instr_rvalid_i ( cluster_instr_rsp[i].rvalid ), + .instr_rdata_i ( cluster_instr_rsp[i].rdata ), + + // Data memory interface + .data_req_o ( cluster_data_req[i].req ), + .data_addr_o ( 
cluster_data_req[i].addr ), + .data_be_o ( cluster_data_req[i].be ), + .data_wdata_o ( cluster_data_req[i].wdata ), + .data_we_o ( cluster_data_req[i].we ), + .data_gnt_i ( cluster_data_rsp[i].gnt ), + .data_rvalid_i ( cluster_data_rsp[i].rvalid ), + .data_rdata_i ( cluster_data_rsp[i].rdata ), + + // APU interface (disabled - not connected) + .apu_master_req_o ( ), + .apu_master_ready_o ( ), + .apu_master_gnt_i ( '0 ), + + .apu_master_operands_o ( ), + .apu_master_op_o ( ), + .apu_master_type_o ( ), + .apu_master_flags_o ( ), + + .apu_master_valid_i ( '0 ), + .apu_master_result_i ( '0 ), + .apu_master_flags_i ( '0 ), + + // Interrupts + .irq_i ( cluster_start_irq_pending[i] ), + .irq_ack_o ( cluster_irq_ack[i] ), + .irq_id_o ( cluster_irq_id[i] ), + .irq_sec_i ( '0 ), + .irq_id_i ( '0 ), + + // Security level (unused) + .sec_lvl_o ( ), + + // Debug interface + .debug_req_i ( debug_req_i[i+1] ), + + // CPU control + .fetch_enable_i ( cluster_fetch_enable[i] ), + .core_busy_o ( cluster_core_busy[i] ), + + + // Performance counters + .ext_perf_counters_i ( '0 ) + ); + assign cluster_core_sleep[i] = ~cluster_core_busy[i]; // RI5CY: core_busy_o is active-high; derive core_sleep (active-high) + `else + `ifndef CORE_TRACES + cv32e40p_top #( + `else + cv32e40p_wrapper #( + `endif + .COREV_PULP ( 1 ), // 1 = custom PULP ISA extensions enabled + .COREV_CLUSTER ( 1 ), + .FPU ( FPU ), + .ZFINX ( magia_tile_pkg::ZFINX ), + .FPU_ADDMUL_LAT ( 1 ), // Match C_LAT_FP32=1 in fpnew wrapper + .FPU_OTHERS_LAT ( 1 ), // Match C_LAT_NONCOMP=1 in fpnew wrapper + .NUM_MHPMCOUNTERS ( 29 ) + ) i_cv32e40p_core ( + // Clock and Reset + .clk_i ( cluster_clk[i] ), // Per-core cluster clock (clock-gate enable tied to 1'b1, i.e. always on) + .rst_ni ( rst_ni ), + + // Clock Interface — cluster cores always have clock enabled; rely on + // WFI / MEI (dispatch IRQ) for sleep/wake.
+ .pulp_clock_en_i ( 1'b1 ), + .scan_cg_en_i ( test_mode_i ), + .boot_addr_i ( cluster_boot_addr[i] ), // From tile CSR, dynamic per tile + .mtvec_addr_i ( cluster_boot_addr[i] ), // mtvec defaults to boot vector; SW can override via csrw + .dm_halt_addr_i ( magia_tile_pkg::DM_HALT_ADDR), + .hart_id_i ( 2 * magia_pkg::N_TILES + mhartid_i * magia_tile_pkg::N_CLUSTER_CORES + i ), + .dm_exception_addr_i ( magia_tile_pkg::DM_HALT_ADDR + 16'h000C), //to be checked + // Instruction interface + .instr_req_o ( cluster_instr_req[i].req ), + .instr_addr_o ( cluster_instr_req[i].addr ), + .instr_gnt_i ( cluster_instr_rsp[i].gnt ), + .instr_rvalid_i ( cluster_instr_rsp[i].rvalid ), + .instr_rdata_i ( cluster_instr_rsp[i].rdata ), + // Data interface (converted directly to OBI xbar) + .data_req_o ( cluster_data_req[i].req ), + .data_addr_o ( cluster_data_req[i].addr ), + .data_be_o ( cluster_data_req[i].be ), + .data_wdata_o ( cluster_data_req[i].wdata ), + .data_we_o ( cluster_data_req[i].we ), + .data_gnt_i ( cluster_data_rsp[i].gnt ), + .data_rvalid_i ( cluster_data_rsp[i].rvalid ), + .data_rdata_i ( cluster_data_rsp[i].rdata ), + // Interrupts: PULP cluster cores receive only the per-core dispatch IRQ + // (MEI bit 11) from tile_csr. They are disconnected from the event unit. 
.irq_i ( cluster_irq_vec[i] ), + .irq_ack_o ( cluster_irq_ack[i] ), + .irq_id_o ( cluster_irq_id[i] ), + // Debug interface + .debug_req_i ( debug_req_i[i+1] ), + // CPU control + .fetch_enable_i ( cluster_fetch_enable[i] ), + .core_sleep_o ( cluster_core_sleep[i] ) + ); + `endif + end +endgenerate + + // Cluster core data-interface to OBI conversion (no EU direct-link demux here: cluster cores reach the EU through the OBI xbar) + generate + for (genvar i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin : gen_cluster_data_obi + data2obi_req i_cluster_data2obi ( + .data_req_i ( cluster_data_req[i] ), + .obi_req_o ( cluster_obi_data_req[i] ) + ); + + obi2data_rsp i_cluster_obi2data ( + .obi_rsp_i ( cluster_obi_data_rsp[i] ), + .data_rsp_o ( cluster_data_rsp[i] ) + ); + end + endgenerate + + + + magia_tile_icache_wrap #( + .NumFetchPorts ( magia_tile_pkg::N_CLUSTER_CORES ), + .L0_LINE_COUNT ( magia_tile_pkg::CLUSTER_L0_LINE_COUNT ), + .LINE_WIDTH ( magia_tile_pkg::CLUSTER_LINE_WIDTH ), + .LINE_COUNT ( magia_tile_pkg::CLUSTER_LINE_COUNT ), + .WAY_COUNT ( magia_tile_pkg::CLUSTER_WAY_COUNT ), + .FetchAddrWidth ( magia_tile_pkg::CLUSTER_FETCH_AW ), + .FetchDataWidth ( magia_tile_pkg::CLUSTER_FETCH_DW ), + .AxiAddrWidth ( magia_tile_pkg::CLUSTER_FILL_AW ), + .AxiDataWidth ( magia_tile_pkg::CLUSTER_FILL_DW ), + .sram_cfg_data_t ( /* Not Used */ ), + .sram_cfg_tag_t ( /* Not Used */ ), + .axi_req_t ( magia_tile_pkg::core_axi_instr_req_t ), + .axi_rsp_t ( magia_tile_pkg::core_axi_instr_rsp_t ) + ) cluster_icache_top_i ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .fetch_req_i ( cluster_cache_req ), + .fetch_addr_i ( cluster_cache_addr ), + .fetch_gnt_o ( cluster_cache_gnt ), + .fetch_rvalid_o ( cluster_cache_rvalid ), + .fetch_rdata_o ( cluster_cache_rdata ), + .fetch_rerror_o ( cluster_cache_rerror ), + + .enable_prefetching_i ( cluster_enable_prefetching ), + .icache_l0_events_o ( cluster_icache_l0_events ), + .icache_l1_events_o ( cluster_icache_l1_events ), + .flush_valid_i ( cluster_icache_flush_valid ), + .flush_ready_o (
cluster_icache_flush_ready ), + + .sram_cfg_data_i ('0), + .sram_cfg_tag_i ('0), + + .axi_req_o ( cluster_l2_instr_req ), + .axi_rsp_i ( cluster_l2_instr_rsp ) + ); + + + endmodule: magia_tile \ No newline at end of file diff --git a/hw/tile/magia_tile_pkg.sv b/hw/tile/magia_tile_pkg.sv index 0a140585..836db59b 100644 --- a/hw/tile/magia_tile_pkg.sv +++ b/hw/tile/magia_tile_pkg.sv @@ -52,21 +52,21 @@ package magia_tile_pkg; // Address map localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_START = 32'h0000_0100; - localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE = 32'h0000_0100; localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_END = REDMULE_CTRL_ADDR_START + REDMULE_CTRL_SIZE; localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_START = REDMULE_CTRL_ADDR_END; localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_SIZE = 32'h0000_0400; localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_END = IDMA_CTRL_ADDR_START + IDMA_CTRL_SIZE; localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_START = IDMA_CTRL_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE = 32'h0000_0100; localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_END = FSYNC_CTRL_ADDR_START + FSYNC_CTRL_SIZE; localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START = FSYNC_CTRL_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE = 32'h0000_1000; + localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE = 32'h0000_1000; localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_END = EVENT_UNIT_ADDR_START + EVENT_UNIT_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_ADDR_START = EVENT_UNIT_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_SIZE = 32'h0000_0100; - localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_ADDR_END = SPATZ_CTRL_ADDR_START + 
SPATZ_CTRL_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START = SPATZ_CTRL_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] TILE_CSR_START = EVENT_UNIT_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] TILE_CSR_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] TILE_CSR_END = TILE_CSR_START + TILE_CSR_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START = TILE_CSR_END; localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_SIZE = 32'h0000_E800; localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_END = RESERVED_ADDR_START + RESERVED_SIZE; localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_START = RESERVED_ADDR_END; @@ -223,7 +223,7 @@ package magia_tile_pkg; // Parameters used by the HCI parameter int unsigned N_HWPE = 1; // Number of HWPEs attached to the port - parameter int unsigned N_CORE = 1 + SPATZ_HCI_PORTS; // Number of Core ports: 1 CV32 + Spatz HCI ports (RVD=1: 11 total, RVD=0: 6 total) + parameter int unsigned N_CORE = 1 + SPATZ_HCI_PORTS; // Number of core-side HCI ports (CV32 + Spatz TCDM ports) parameter int unsigned N_DMA = 4; // Number of DMA ports (1 out read channel, 1 out write channel, 1 in read channel and 1 in write channel) typedef enum logic[1:0]{ HCI_DMA_OUT_CH_READ_IDX = 2'b00, @@ -268,7 +268,7 @@ package magia_tile_pkg; // Parameters used by cv32e40p core parameter int unsigned N_EXT_PERF_COUNTERS = 0; // Number of external performance counters parameter int unsigned INSTR_RDATA_WIDTH = 32; // Instruction data width - parameter bit PULP_SECURE = 1'b0; // PULP security features + parameter bit PULP_SECURE = 1'b1; // PULP security features (must be 1 for writable mtvec; PULP_SECURE=0 hardwires mtvec_q to boot_addr_i) parameter int unsigned N_PMP_ENTRIES = 16; // Number of PMP entries parameter bit USE_PMP = 1'b1; // Enable PMP parameter bit PULP_CLUSTER = 1'b1; // PULP cluster mode @@ -308,32 +308,36 @@ package magia_tile_pkg; parameter int unsigned REDMULE_UW = UWH; // RedMulE User 
Width // Parameters used by OBI - parameter int unsigned AUSER_WIDTH = 1; // Width of the auser signal (see OBI documentation): not used by the CV32E40X - parameter int unsigned WUSER_WIDTH = 1; // Width of the wuser signal (see OBI documentation): not used by the CV32E40X - parameter int unsigned ACHK_WIDTH = 1; // Width of the achk signal (see OBI documentation): not used by the CV32E40X - parameter int unsigned RUSER_WIDTH = 1; // Width of the ruser signal (see OBI documentation): not used by the CV32E40X - parameter int unsigned RCHK_WIDTH = 1; // Width of the rchk signal (see OBI documentation): not used by the CV32E40X - parameter int unsigned AID_WIDTH = 1; // Width of the aid signal (address channel identifier, see OBI documentation) - parameter int unsigned RID_WIDTH = 1; // Width of the rid signal (response channel identifier, see OBI documentation) - parameter int unsigned MID_WIDTH = 1; // Width of the mid signal (manager identifier, see OBI documentation) - parameter int unsigned OBI_ID_WIDTH = 1; // Width of the id - configuration + parameter int unsigned AUSER_WIDTH = 1; // Width of the auser signal (see OBI documentation): not used by the CV32E40X + parameter int unsigned WUSER_WIDTH = 1; // Width of the wuser signal (see OBI documentation): not used by the CV32E40X + parameter int unsigned ACHK_WIDTH = 1; // Width of the achk signal (see OBI documentation): not used by the CV32E40X + parameter int unsigned RUSER_WIDTH = 1; // Width of the ruser signal (see OBI documentation): not used by the CV32E40X + parameter int unsigned RCHK_WIDTH = 1; // Width of the rchk signal (see OBI documentation): not used by the CV32E40X + parameter int unsigned AID_WIDTH = 1; // Width of the aid signal (address channel identifier, see OBI documentation) + parameter int unsigned RID_WIDTH = 1; // Width of the rid signal (response channel identifier, see OBI documentation) + parameter int unsigned MID_WIDTH = 1; // Width of the mid signal (manager identifier, see OBI 
documentation) + parameter int unsigned OBI_ID_WIDTH = 1; // Width of the id - configuration + parameter int unsigned N_CLUSTER_CORES = 8; // Number of cores in the cluster (control core not considered) `ifdef CV32E40X - parameter int unsigned N_SBR = 4; // Number of slaves (HCI, AXI XBAR, Event_Unit, Spatz_Ctrl) + parameter int unsigned N_SBR = 5; // Number of slaves (HCI, AXI XBAR, Event_Unit, Tile_CSR, + unused RESERVED alias) `else - parameter int unsigned N_SBR = 7; // Number of OBI slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Spatz_Ctrl) + parameter int unsigned N_SBR = 7; // Number of OBI slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Tile_CSR) `endif - parameter int unsigned N_MGR = 3; // Number of masters (Core, AXI XBAR, Spatz CC) + parameter int unsigned N_MGR = 3 + N_CLUSTER_CORES; // Number of masters (Core, AXI XBAR, Spatz CC, plus N_CLUSTER_CORES cluster cores) parameter int unsigned N_MAX_TRAN = 1; // Number of maximum outstanding transactions `ifdef CV32E40X - parameter int unsigned N_ADDR_RULE = 6; // Number of address rules (L2, L1, Stack, Reserved, Event_Unit, Spatz_Ctrl) + parameter int unsigned N_ADDR_RULE = 6; // Number of address rules (L2, L1, Stack, Reserved, Event_Unit, Tile_CSR) `else - parameter int unsigned N_ADDR_RULE = 9; // Number of OBI address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Spatz_Ctrl) + parameter int unsigned N_ADDR_RULE = 9; // Number of OBI address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Tile_CSR) `endif - localparam int unsigned N_BIT_SBR = $clog2(N_SBR); // Number of bits required to identify each slave - + localparam int unsigned N_BIT_SBR = $clog2(N_SBR); // Number of bits required to identify each slave + localparam int unsigned N_BIT_MGR = $clog2(N_MGR); // Number of bits required to identify each master + localparam int unsigned N_BIT_CLUSTER_CORES = $clog2(N_CLUSTER_CORES); // Number of bits required to identify each 
core in the cluster // Parameters used by AXI - parameter int unsigned AXI_DATA_ID_W = 2; // Width of AXI data IDs (4 xbar slave ports) - parameter int unsigned AXI_ID_W = 2; // Width of the AXI Unified Communication Channel ID + parameter int unsigned AXI_DATA_ID_W = 3; // Width of the AXI Data ID (3 bits for 5 slave ports on crossbar: 2^3=8) + parameter int unsigned AXI_INSTR_ID_W = 3; // Width of the AXI Instruction ID (3 bits for 5 slave ports on crossbar) + parameter int unsigned AXI_ID_W = 3; // Width of the AXI Unified Communication Channel ID (3 bits for 5 slave ports) + localparam int unsigned AXI_MST_ID_W = 6; // Width of master port ID (slave 3b + prepend 3b for 5 ports) parameter int unsigned AXI_DATA_U_W = magia_pkg::USR_W; // Width of the AXI Data User parameter int unsigned AXI_INSTR_U_W = magia_pkg::USR_W; // Width of the AXI Instruction User parameter int unsigned AXI_U_W = magia_pkg::USR_W; // Width of the AXI Unified Communication Channel User @@ -457,7 +461,7 @@ package magia_tile_pkg; parameter bit FSYNC_STALL = 1; // Fractal Sync Stall during synchronization // Parameters of the AXI XBAR - parameter int unsigned AxiXbarNoSlvPorts = 4; // Number of Slave Ports (ext, Core Data, CV32 I$, Spatz I$) + parameter int unsigned AxiXbarNoSlvPorts = 5; // Number of Slave Ports (ext, Core Data, CV32 I$, Spatz I$, Cluster I$) parameter int unsigned AxiXbarNoMstPorts = 3; // Number of Master Ports (to ext, to internal L1, to Spatz bootrom) localparam int unsigned AxiXbarSlvAxiIDWidth = AXI_DATA_ID_W; // Number of bits to indentify each Slave Port parameter int unsigned AxiXbarMaxWTrans = 16; // Maximum number of outstanding transactions per write @@ -489,6 +493,19 @@ package magia_tile_pkg; parameter int unsigned SPATZ_ICACHE_WAYS = 2; // Spatz i$ number of ways (2-way set associative) localparam int unsigned SPATZ_L0_EARLY_TAG_W = snitch_pkg::PAGE_SHIFT - $clog2(SPATZ_ICACHE_LINE_WIDTH/8); // L0 early tag width + //Cluster ICache parameters (dedicated 
icache for cluster cores) + parameter int unsigned CLUSTER_NR_FETCH_PORTS = N_CLUSTER_CORES; // i$ Number of request (fetch) ports + parameter int unsigned CLUSTER_L0_LINE_COUNT = 32*N_CLUSTER_CORES; // i$ L0 Cache Line Count + parameter int unsigned CLUSTER_LINE_WIDTH = 128; // i$ Cache Line Width; >= 64 + parameter int unsigned CLUSTER_LINE_COUNT = 32*N_CLUSTER_CORES; // i$ The number of cache lines per set. Power of two; >= 2. + parameter int unsigned CLUSTER_WAY_COUNT = 32; // i$ The set associativity of the cache. Power of two; >= 1. + parameter int unsigned CLUSTER_L0_PARITY_W = 0; // i$ Parity of the L0 cache + parameter int unsigned CLUSTER_L1_PARITY_W = CLUSTER_L0_PARITY_W; // i$ Parity of the L1 cache + parameter int unsigned CLUSTER_FETCH_AW = magia_pkg::ADDR_W; // i$ Fetch interface address width. Same as FETCH_AW; >= 1. + parameter int unsigned CLUSTER_FETCH_DW = magia_pkg::DATA_W; // i$ Fetch interface data width. Power of two; >= 8. + parameter int unsigned CLUSTER_FILL_AW = magia_pkg::ADDR_W; // i$ Fill interface address width. Same as FILL_AW; >= 1. + parameter int unsigned CLUSTER_FILL_DW = magia_pkg::DATA_W; // i$ Fill interface data width. Power of two; >= 8. 
+ // Parameters used by the FPU parameter bit FPU_ZFINX = 1; // FPU use Zfinx extension instead of the F ISA extention parameter int unsigned FPU_BUFFER_DEPTH = 8; // FPU FIFO depth that buffers instructions coming from core @@ -526,6 +543,7 @@ package magia_tile_pkg; OBI_CORE_IDX = 0 } obi_xbar_idx_e; + typedef struct packed { logic req; logic[magia_pkg::INSTR_W-1:0] addr; @@ -608,24 +626,24 @@ package magia_tile_pkg; `ifdef CV32E40X typedef enum logic[2:0]{ - OBI_XBAR_STACK_IDX = 5, - OBI_XBAR_SPATZ_CTRL_IDX = 4, - OBI_XBAR_EVENT_UNIT_IDX = 3, - OBI_XBAR_RESERVED_IDX = 2, - OBI_XBAR_L1SPM_IDX = 1, - OBI_XBAR_L2_IDX = 0 + OBI_XBAR_STACK_IDX = 5, + OBI_XBAR_TILE_CSR_IDX = 4, + OBI_XBAR_EVENT_UNIT_IDX = 3, + OBI_XBAR_RESERVED_IDX = 2, + OBI_XBAR_L1SPM_IDX = 1, + OBI_XBAR_L2_IDX = 0 } obi_mem_array_idx_e; `else typedef enum logic[3:0]{ - OBI_XBAR_STACK_IDX = 8, - OBI_XBAR_RESERVED_IDX = 7, - OBI_XBAR_SPATZ_CTRL_IDX = 6, - OBI_XBAR_EVENT_UNIT_IDX = 5, - OBI_XBAR_FSYNC_CTRL_IDX = 4, - OBI_XBAR_IDMA_IDX = 3, - OBI_XBAR_REDMULE_CTRL_IDX = 2, - OBI_XBAR_L1SPM_IDX = 1, - OBI_XBAR_L2_IDX = 0 + OBI_XBAR_STACK_IDX = 8, + OBI_XBAR_RESERVED_IDX = 7, + OBI_XBAR_TILE_CSR_IDX = 6, + OBI_XBAR_EVENT_UNIT_IDX = 5, + OBI_XBAR_FSYNC_CTRL_IDX = 4, + OBI_XBAR_IDMA_IDX = 3, + OBI_XBAR_REDMULE_CTRL_IDX = 2, + OBI_XBAR_L1SPM_IDX = 1, + OBI_XBAR_L2_IDX = 0 } obi_mem_array_idx_e; `endif @@ -637,12 +655,12 @@ package magia_tile_pkg; AXI_XBAR_L2_IDX = 0 } axi_mem_array_idx_e; - - typedef enum logic[1:0]{ - AXI_SLV_SPATZ_INSTR_IDX = 3, - AXI_SLV_EXT_IDX = 2, - AXI_SLV_CORE_DATA_IDX = 1, - AXI_SLV_CORE_INSTR_IDX = 0 + typedef enum logic[2:0]{ + AXI_SLV_CLUSTER_INSTR_IDX = 4, + AXI_SLV_SPATZ_INSTR_IDX = 3, + AXI_SLV_EXT_IDX = 2, + AXI_SLV_CORE_DATA_IDX = 1, + AXI_SLV_CORE_INSTR_IDX = 0 } axi_xbar_slv_idx_e; @@ -800,4 +818,7 @@ package magia_tile_pkg; `OBI_TYPEDEF_DEFAULT_REQ_T(spatz_obi32_req_t, spatz_obi32_a_chan_t) `OBI_TYPEDEF_RSP_T(spatz_obi32_rsp_t, spatz_obi32_r_chan_t) + + + endpackage: 
magia_tile_pkg \ No newline at end of file diff --git a/hw/tile/obi_slave_ctrl_cluster.sv b/hw/tile/obi_slave_ctrl_cluster.sv new file mode 100644 index 00000000..d5c929eb --- /dev/null +++ b/hw/tile/obi_slave_ctrl_cluster.sv @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + + +module obi_slave_ctrl_cluster + import magia_tile_pkg::*; +#( + parameter logic [31:0] BaseAddr = 32'h00001700 // Base address for PULP cluster control registers +) ( + input logic clk_i, + input logic rst_ni, + + // OBI interface + input core_obi_data_req_t obi_req_i, + output core_obi_data_rsp_t obi_rsp_o, + + // Control outputs to PULP cluster cores + output logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] clk_en_o, // broadcast (replicated) + output logic [31:0] boot_addr_o [magia_tile_pkg::N_CLUSTER_CORES-1:0], + output logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] fetch_en_o, // broadcast (replicated) + output logic done_o, // DONE IRQ pulse when all selected cores complete + output logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] start_irq_o // per-core 1-cycle dispatch IRQ pulse +); + +//----------------------------------------------------------------------------- +// Register map (offsets from BaseAddr, instantiated by tile_csr at BaseAddr+0x40 = 0x1740) +// 0x00 CLK_EN : RW broadcast 
(1=enable all cores, 0=disable all) +// writing CLK_EN also resets the READY counter +// 0x04 BINARY : RW PULP binary entry point (boot address) +// 0x08 NB_CORES_TO_WAIT: RW number of cores expected to ACK + DONE per dispatch +// 0x0C DONE : W each PULP core writes 1 after task returns; +// after NB_CORES_TO_WAIT writes done_o pulses +// 0x10 TASKBIN : RW task function address; read by PULP cores +// 0x14 DATA : RW context pointer passed as arg0 to the task +// 0x18 START : RW CV32 writes one-hot core_mask -> 1-cycle IRQ pulse +// PULP cores write 0 to ACK (before task); +// register clears when all NB_CORES_TO_WAIT ACKs received +// 0x1C READY : R reads as 1 once N_CLUSTER_CORES cores have written; +// W each PULP core writes 1 after boot (counter increment) +//----------------------------------------------------------------------------- +localparam logic [31:0] CLUSTER_CLK_EN = 32'h00; +localparam logic [31:0] CLUSTER_BINARY = 32'h04; +localparam logic [31:0] CLUSTER_NB_CORES_TO_WAIT = 32'h08; +localparam logic [31:0] CLUSTER_DONE = 32'h0C; +localparam logic [31:0] CLUSTER_TASKBIN = 32'h10; +localparam logic [31:0] CLUSTER_DATA = 32'h14; +localparam logic [31:0] CLUSTER_START = 32'h18; +localparam logic [31:0] CLUSTER_READY = 32'h1C; + +// Address decode (offset from base) +logic [31:0] addr_offset; +logic addr_valid; + +assign addr_offset = obi_req_i.a.addr - BaseAddr; +assign addr_valid = (obi_req_i.a.addr >= BaseAddr) && + (obi_req_i.a.addr < (BaseAddr + 32)); // 8 registers * 4 bytes + +// Registers +logic clk_en_q, clk_en_d; +logic [31:0] entry_point_q, entry_point_d; +logic [magia_tile_pkg::N_BIT_CLUSTER_CORES:0] nb_cores_to_wait_q, nb_cores_to_wait_d; +logic done_q, done_d; +logic [31:0] taskbin_q, taskbin_d; +logic [31:0] data_q, data_d; +logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] start_q, start_d; + +// Counters +logic [magia_tile_pkg::N_BIT_CLUSTER_CORES:0] nb_recv_done_reqs_q, nb_recv_done_reqs_d; +logic [magia_tile_pkg::N_BIT_CLUSTER_CORES:0] 
nb_recv_ack_reqs_q, nb_recv_ack_reqs_d; +logic [magia_tile_pkg::N_BIT_CLUSTER_CORES:0] nb_recv_ready_reqs_q, nb_recv_ready_reqs_d; + +// One-cycle start IRQ pulse register +logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] start_irq_q, start_irq_d; + +// Response pipeline +logic rvalid_q, rvalid_d; +logic [31:0] rdata_q, rdata_d; + +assign obi_rsp_o.gnt = obi_req_i.req && addr_valid; +assign obi_rsp_o.rvalid = rvalid_q; +assign obi_rsp_o.r.rdata = rdata_q; +assign obi_rsp_o.r.rid = '0; +assign obi_rsp_o.r.err = 1'b0; +assign obi_rsp_o.r.r_optional = '0; + +// ============================================ +// Register write logic (combinational) +// ============================================ +always_comb begin + // Defaults: hold + clk_en_d = clk_en_q; + entry_point_d = entry_point_q; + nb_cores_to_wait_d = nb_cores_to_wait_q; + done_d = 1'b0; + taskbin_d = taskbin_q; + data_d = data_q; + start_d = start_q; + nb_recv_done_reqs_d = nb_recv_done_reqs_q; + nb_recv_ack_reqs_d = nb_recv_ack_reqs_q; + nb_recv_ready_reqs_d = nb_recv_ready_reqs_q; + start_irq_d = '0; // default: no IRQ pulse this cycle + + if (obi_req_i.req && addr_valid && obi_req_i.a.we) begin + case (addr_offset) + CLUSTER_CLK_EN: begin + // Broadcast: any non-zero enables all cores; 0 disables all. + clk_en_d = |obi_req_i.a.wdata; + // Reset READY counter so CV32 can re-poll after each init. 
+ nb_recv_ready_reqs_d = '0; + end + CLUSTER_BINARY: begin + entry_point_d = obi_req_i.a.wdata; + end + CLUSTER_NB_CORES_TO_WAIT: begin + nb_cores_to_wait_d = obi_req_i.a.wdata[magia_tile_pkg::N_BIT_CLUSTER_CORES:0]; + end + CLUSTER_DONE: begin + // Each PULP core writes 1 here on completion (write data ignored) + nb_recv_done_reqs_d = nb_recv_done_reqs_q + 1; + end + CLUSTER_TASKBIN: begin + taskbin_d = obi_req_i.a.wdata; + end + CLUSTER_DATA: begin + data_d = obi_req_i.a.wdata; + end + CLUSTER_START: begin + if (obi_req_i.a.wdata != 32'h0) begin + // CV32 dispatch: latch core_mask, reset counters, fire 1-cycle IRQ pulses + start_d = obi_req_i.a.wdata[magia_tile_pkg::N_CLUSTER_CORES-1:0]; + nb_recv_ack_reqs_d = '0; + nb_recv_done_reqs_d = '0; + start_irq_d = obi_req_i.a.wdata[magia_tile_pkg::N_CLUSTER_CORES-1:0]; + end else begin + // PULP core ACK (write 0): count; when all done, clear register + nb_recv_ack_reqs_d = nb_recv_ack_reqs_q + 1; + end + end + CLUSTER_READY: begin + // PULP core boot complete: count; saturate at N_CLUSTER_CORES + if (nb_recv_ready_reqs_q < magia_tile_pkg::N_CLUSTER_CORES) begin + nb_recv_ready_reqs_d = nb_recv_ready_reqs_q + 1; + end + end + default: ; + endcase + end + + // Fire a one-cycle DONE pulse when all expected cores have completed. 
+ if (nb_recv_done_reqs_d >= nb_cores_to_wait_q && nb_cores_to_wait_q != '0) begin + done_d = 1'b1; + nb_recv_done_reqs_d = '0; + end + + // Clear PULP_START when all expected cores have ACK'd (write 0) + if (nb_recv_ack_reqs_d >= nb_cores_to_wait_q && nb_cores_to_wait_q != '0) begin + start_d = '0; + nb_recv_ack_reqs_d = '0; + end +end + +// ============================================ +// Register sequential logic +// ============================================ +always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + clk_en_q <= 1'b0; + entry_point_q <= 32'hCC000080; + nb_cores_to_wait_q <= magia_tile_pkg::N_CLUSTER_CORES; + done_q <= 1'b0; + taskbin_q <= 32'h0; + data_q <= 32'h0; + start_q <= '0; + nb_recv_done_reqs_q <= '0; + nb_recv_ack_reqs_q <= '0; + nb_recv_ready_reqs_q <= '0; + start_irq_q <= '0; + rvalid_q <= 1'b0; + rdata_q <= 32'h0; + end else begin + clk_en_q <= clk_en_d; + entry_point_q <= entry_point_d; + nb_cores_to_wait_q <= nb_cores_to_wait_d; + done_q <= done_d; + taskbin_q <= taskbin_d; + data_q <= data_d; + start_q <= start_d; + nb_recv_done_reqs_q <= nb_recv_done_reqs_d; + nb_recv_ack_reqs_q <= nb_recv_ack_reqs_d; + nb_recv_ready_reqs_q <= nb_recv_ready_reqs_d; + start_irq_q <= start_irq_d; + rvalid_q <= rvalid_d; + rdata_q <= rdata_d; + end +end + +// ============================================ +// OBI read response logic (combinational) +// ============================================ +logic ready_reg_val; +assign ready_reg_val = (nb_recv_ready_reqs_q == magia_tile_pkg::N_CLUSTER_CORES); + +always_comb begin + rdata_d = 32'h0; + rvalid_d = obi_req_i.req && addr_valid; + + if (obi_req_i.req && addr_valid && !obi_req_i.a.we) begin + case (addr_offset) + CLUSTER_CLK_EN: rdata_d = {31'h0, clk_en_q}; + CLUSTER_BINARY: rdata_d = entry_point_q; + CLUSTER_NB_CORES_TO_WAIT: rdata_d = {{(32-magia_tile_pkg::N_BIT_CLUSTER_CORES-1){1'b0}}, nb_cores_to_wait_q}; + CLUSTER_DONE: rdata_d = {31'h0, done_q}; + CLUSTER_TASKBIN: rdata_d = 
taskbin_q; + CLUSTER_DATA: rdata_d = data_q; + CLUSTER_START: rdata_d = {{(32-magia_tile_pkg::N_CLUSTER_CORES){1'b0}}, start_q}; + CLUSTER_READY: rdata_d = {31'h0, ready_reg_val}; + default: rdata_d = 32'hDEADBEEF; + endcase + end +end + +// Outputs: broadcast clk_en and fetch_en to all cores (replicated bit) +assign clk_en_o = {magia_tile_pkg::N_CLUSTER_CORES{clk_en_q}}; +assign fetch_en_o = {magia_tile_pkg::N_CLUSTER_CORES{clk_en_q}}; +assign done_o = done_q; +assign start_irq_o = start_irq_q; + +// All cores share the same boot address (PULP binary entry point) +always_comb begin + for (int i = 0; i < magia_tile_pkg::N_CLUSTER_CORES; i++) begin + boot_addr_o[i] = entry_point_q; + end +end + +endmodule \ No newline at end of file diff --git a/hw/tile/tile_csr.sv b/hw/tile/tile_csr.sv new file mode 100644 index 00000000..8fe512cd --- /dev/null +++ b/hw/tile/tile_csr.sv @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + + + +/* + * Register map (offsets from BaseAddr): + * 0x00–0x18 Spatz registers (handled by obi_slave_ctrl_spatz) + * 0x40–0x5C PULP/Cluster regs (handled by obi_slave_ctrl_cluster) + */ +module tile_csr + import magia_tile_pkg::*; +#( + parameter logic [31:0] BaseAddr = 32'h00001700, + parameter int unsigned BOOT_ADDR = 32'hCC000000 +) ( + input logic clk_i, + input logic rst_ni, + + // Single OBI slave interface + input core_obi_data_req_t obi_req_i, + output core_obi_data_rsp_t obi_rsp_o, + + // Spatz Control outputs + output logic spatz_clk_en_o, + output logic spatz_start_o, + output logic spatz_done_o, + + // Cluster Control outputs + output logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_clk_en_o, + output logic [31:0] cluster_boot_addr_o [magia_tile_pkg::N_CLUSTER_CORES-1:0], + output logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_fetch_en_o, + output logic cluster_done_o, + // Per-core 1-cycle dispatch IRQ pulse (drives MEI bit 11 of each PULP core) + output logic [magia_tile_pkg::N_CLUSTER_CORES-1:0] cluster_start_irq_o +); + + // ============================================ + // Internal OBI signals for sub-modules + // ============================================ + core_obi_data_rsp_t spatz_obi_rsp; + core_obi_data_rsp_t cluster_obi_rsp; + + // Both sub-modules receive the same OBI request. + // Each checks addr_valid against its own BaseAddr/range, + // so only one will grant at a time. 
+ + // ============================================ + // Spatz control registers (offset 0x00–0x18) + // ============================================ + obi_slave_ctrl_spatz #( + .BaseAddr ( BaseAddr ) + ) i_spatz_ctrl ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .obi_req_i ( obi_req_i ), + .obi_rsp_o ( spatz_obi_rsp ), + .clk_en_o ( spatz_clk_en_o ), + .start_o ( spatz_start_o ), + .done_o ( spatz_done_o ) + ); + + // ============================================ + // Cluster control registers (offset 0x40–0x5C) + // ============================================ + obi_slave_ctrl_cluster #( + .BaseAddr ( BaseAddr + 32'h40 ) + ) i_cluster_ctrl ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .obi_req_i ( obi_req_i ), + .obi_rsp_o ( cluster_obi_rsp ), + .clk_en_o ( cluster_clk_en_o ), + .boot_addr_o ( cluster_boot_addr_o), + .fetch_en_o ( cluster_fetch_en_o ), + .done_o ( cluster_done_o ), + .start_irq_o ( cluster_start_irq_o) + ); + + // ============================================ + // OBI response mux + // ============================================ + // Non-overlapping address ranges guarantee at most one + // sub-module grants/responds at a time. + assign obi_rsp_o.gnt = spatz_obi_rsp.gnt | cluster_obi_rsp.gnt; + assign obi_rsp_o.rvalid = spatz_obi_rsp.rvalid | cluster_obi_rsp.rvalid; + assign obi_rsp_o.r.rdata = spatz_obi_rsp.rvalid ? 
spatz_obi_rsp.r.rdata : cluster_obi_rsp.r.rdata; + assign obi_rsp_o.r.rid = '0; + assign obi_rsp_o.r.err = spatz_obi_rsp.r.err | cluster_obi_rsp.r.err; + assign obi_rsp_o.r.r_optional = '0; + +endmodule \ No newline at end of file diff --git a/scripts/parse_s19.pl b/scripts/parse_s19.pl index 7f7aff62..359f2026 100755 --- a/scripts/parse_s19.pl +++ b/scripts/parse_s19.pl @@ -137,7 +137,7 @@ sub hex2int { # sub int2hex { my $i=shift; # read in the integer - my $h; # define hex value + my $h = ''; # define hex value for my $n (0..7){ # 8 digits my $e=16 ** (7-$n); # calculate exponent if ($e > $i){ # if 2^e is larger diff --git a/scripts/s19tomem.py b/scripts/s19tomem.py index 4520fde7..e0165477 100755 --- a/scripts/s19tomem.py +++ b/scripts/s19tomem.py @@ -18,15 +18,24 @@ import numpy as np import sys -# Instructions start at 0xcc00_0000 -# Data starts at 0xcc01_0000 -# Stack starts at 0x0001_0000 -# We only keep last 2 bytes so memory will be filled with no offset. -# The CPU will also reference it as to not have any offset. -MEM_START = 0xcc000000 +# Default memory layout: main-core ELF at 0xCC000000. +# Override with optional argv[4] (instr base) and argv[5] (data base) +# for the PULP cluster-core ELF which lives at 0xC0000000 / 0xC0100000. 
+DEFAULT_MEM_START = 0xcc000000 +DEFAULT_DATA_OFFSET = 0x10000 # data = instr_base + 0x10000 (main) or explicit + +if len(sys.argv) >= 5: + MEM_START = int(sys.argv[4], 16) +else: + MEM_START = DEFAULT_MEM_START + +if len(sys.argv) >= 6: + DATA_BASE = int(sys.argv[5], 16) +else: + DATA_BASE = MEM_START + DEFAULT_DATA_OFFSET + INSTR_SIZE = 0x8000 INSTR_END = MEM_START + INSTR_SIZE -DATA_BASE = MEM_START + 0x10000 DATA_SIZE = 0x30000 DATA_END = DATA_BASE + DATA_SIZE STACK_BASE = 0x10000 diff --git a/scripts/setup_traces.sh b/scripts/setup_traces.sh new file mode 100755 index 00000000..a7e8b3ff --- /dev/null +++ b/scripts/setup_traces.sh @@ -0,0 +1,39 @@ +# Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Authors: Niccolò Giuliani, Fondazione Chips-IT + +# setup_traces.sh — pre-simulation: create the per-tile trace directory tree. +# +# Makes the empty tree visible from the start of simulation: +# +# /traces/tile_N/main/ ← will receive CV32 main-core trace +# /traces/tile_N/cluster/ ← will receive PULP cores traces +# +# The actual trace files are moved in by sort_traces.sh after vsim exits +# (the cv32e40p tracer writes them to /trace_core_.log). 
+# +# Usage: setup_traces.sh [num_clusters] + +SIM_DIR=${1:-.} +NUM_CLUSTERS=${2:-16} + +echo "[setup-traces] Creating ${NUM_CLUSTERS} tile dirs under ${SIM_DIR}/traces/" + +for t in $(seq 0 $(( NUM_CLUSTERS - 1 ))); do + mkdir -p "${SIM_DIR}/traces/tile_${t}/main" "${SIM_DIR}/traces/tile_${t}/cluster" +done + +echo "[setup-traces] Done. Traces will be moved into place after simulation exits." diff --git a/scripts/sort_traces.sh b/scripts/sort_traces.sh new file mode 100755 index 00000000..58d9c02d --- /dev/null +++ b/scripts/sort_traces.sh @@ -0,0 +1,80 @@ +# Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Authors: Niccolò Giuliani, Fondazione Chips-IT + +# sort_traces.sh — distribute per-core trace files into per-tile subdirectories. +# +# Usage: sort_traces.sh [num_clusters] [pulp_core_count] +# +# For a 4x4 MAGIA mesh: +# - Main core of tile N → mhartid = N (0x00..0x0F) +# - PULP core C of tile N → mhartid = 2*N_TILES + N*PULP_CORE_COUNT + C +# +# Result layout: +# /traces/tile_N/main/trace_core_XXXXXXXX.log +# /traces/tile_N/cluster/trace_core_XXXXXXXX.log + +SIM_DIR=${1:-.} +NUM_CLUSTERS=${2:-16} +PULP_CORE_COUNT=${3:-8} + +PULP_HARTID_BASE=$(( 2 * NUM_CLUSTERS )) + +# Move or confirm a single trace file into its destination directory. +# Handles three cases robustly: +# 1. src is a dangling symlink → remove it (file never written) +# 2. 
src is a symlink and already at dst → remove the symlink (file already there) +# 3. src is a regular file → mv it into dst/ +place_trace() { + local src="$1" dst_dir="$2" label="$3" + local fname; fname=$(basename "${src}") + local dst="${dst_dir}/${fname}" + + if [ -L "${src}" ]; then + # Symlink left over from a previous run: remove it. + # The real file is already inside traces/ (or was never written). + rm -f "${src}" + [ -f "${dst}" ] && echo " ${label} → already in place (symlink cleaned)" + elif [ -f "${src}" ]; then + if [ "${src}" -ef "${dst}" ]; then + # Same inode (hardlink edge-case): nothing to do. + echo " ${label} → already in place" + else + mv "${src}" "${dst_dir}/" && echo " ${label} → $(basename "${dst_dir%/*}")/$(basename "${dst_dir}")/${fname}" + fi + fi +} + +echo "[sort-traces] Sorting ${NUM_CLUSTERS} tiles x ${PULP_CORE_COUNT} PULP cores into ${SIM_DIR}/traces/" + +for t in $(seq 0 $(( NUM_CLUSTERS - 1 ))); do + MAIN_DIR="${SIM_DIR}/traces/tile_${t}/main" + CLUSTER_DIR="${SIM_DIR}/traces/tile_${t}/cluster" + mkdir -p "${MAIN_DIR}" "${CLUSTER_DIR}" + + # Main CV32 core (one per tile, mhartid = t) + f=$(printf "%s/trace_core_%08x.log" "${SIM_DIR}" ${t}) + place_trace "${f}" "${MAIN_DIR}" "tile ${t} main" + + # PULP cluster cores (PULP_CORE_COUNT per tile) + for c in $(seq 0 $(( PULP_CORE_COUNT - 1 ))); do + h=$(( PULP_HARTID_BASE + t * PULP_CORE_COUNT + c )) + f=$(printf "%s/trace_core_%08x.log" "${SIM_DIR}" ${h}) + place_trace "${f}" "${CLUSTER_DIR}" "tile ${t} core${c}" + done +done + +echo "[sort-traces] Done." 
diff --git a/setup_env.sh b/setup_env.sh index 88394275..b3fdd0d9 100644 --- a/setup_env.sh +++ b/setup_env.sh @@ -14,9 +14,9 @@ export PATH=/usr/pack/gcc-5.2.0-af/x86_64-rhe6-linux/bin:$PATH export PATH=/usr/local/anaconda3-2023.07/condabin:$PATH export PATH=/home/visachi/.local/bin:$PATH export XLEN=32 -if [[ "$core" == "CV32E40P" ]]; then -echo "Exporting ISA extentions: I, M, C, GAP9" -export XTEN=imcxgap9 +if [[ "$core" == "CV32E40P" ]] || [[ "$core" == "RI5CY" ]]; then + echo "Exporting ISA extentions: I, M, C, PULP" + export XTEN=imfc else echo "Exporting ISA extentions: I, M, A, F, C" export XTEN=imafc diff --git a/sw/kernel/crt0.S b/sw/kernel/crt0.S index 678f3182..2bf26a93 100644 --- a/sw/kernel/crt0.S +++ b/sw/kernel/crt0.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2019 ETH Zurich and University of Bologna + * Copyright (C) 2018-2026 ETH Zurich, University of Bologna and Fondazione Chips-IT * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,76 +13,81 @@ * See the License for the specific language governing permissions and * limitations under the License. * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Authors: Germain Haugou, ETH (germain.haugou@iis.ee.ethz.ch) - * Francesco Conti, ETHZ & UNIBO + * + * Unified crt0 for the CV32 main core (one per tile, mhartid 0..NUM_TILES-1). + * + * Aligned with the magia-sdk single-binary flow: PULP cluster cores no + * longer share this code path — they fetch their own position-independent + * binary embedded in section .pulp_binary at _pulp_binary_start (see + * sw/kernel/link.ld and sw/kernel_pulp/pulp_crt0.S). 
+ * + * exit() signals the testbench via the legacy write at + * 0xCCff0000 + 2*mhartid ← (s0 ^ 0x800) & 0x7ff */ .section .text .global _start _start: - - # Cluster PEs will also starts here to avoid aligning another entry point - # Just re-route them to the right entry csrr a0, mhartid + # Compute (tile_y, tile_x) just in case the test needs them in a0/a1. andi a1, a0, 0x1f srli a0, a0, 5 - # Enabling CV32E40P mstatus.MIE - li t0, 0x1 + + # Enable CV32E40P mstatus.MIE (bit 3 = 0x8, not bit 0) + li t0, 0x8 csrrs zero, mstatus, t0 - # Enabling CV32E40P SW interrupt (mie[3]) + # Enable CV32E40P SW interrupt (mie[3]) li t0, 0x8 csrrs zero, mie, t0 - # clear the bss segment + # Clear the BSS segment. la t0, _bss_start la t1, _bss_end 1: + beq t0, t1, 2f sw zero, 0(t0) addi t0, t0, 4 - bltu t0, t1, 1b + j 1b +2: - /* Stack initialization */ - la x2, stack + # Stack init. + la x2, stack -.section .text + # main(0, 0) + addi a0, x0, 0 + addi a1, x0, 0 + la t2, main + jalr x1, t2 + mv s0, a0 - // On all other chips we simply pass 0. - addi a0, x0, 0 - addi a1, x0, 0 - - // Jump to main program entry point (argc = a0, argv = a1). - la t2, main - jalr x1, t2 - mv s0, a0 - - /* If program returns from main, call exit routine */ - mv a0, s0 - la t2, exit - jalr x1, t2 + # If main returns, call exit. + mv a0, s0 + la t2, exit + jalr x1, t2 .global _init .global _fini _init: _fini: - # These don't have to do anything since we use init_array/fini_array. ret +# -------------------------------------------------------------------- +# Exit: testbench handshake (CV32 main cores only). +# Write halfword (s0 ^ 0x800) & 0x7ff to 0xCCff0000 + mhartid*2. 
+# -------------------------------------------------------------------- exit: - csrr a0, mhartid - andi s0, s0, 0x7ff - li a1, 0xCCff0000 - li a2, 0x800 - xor s0, s0, a2 - add a1, a1, a0 - add a1, a1, a0 - sh s0, 0(a1) - wfi + csrr a0, mhartid + andi s0, s0, 0x7ff + li a1, 0xCCff0000 + li a2, 0x800 + xor s0, s0, a2 + add a1, a1, a0 + add a1, a1, a0 + sh s0, 0(a1) + wfi .section .vectors, "ax" .option norvc; .org 0x80 - jal x0, _start \ No newline at end of file + jal x0, _start diff --git a/sw/kernel/link.ld b/sw/kernel/link.ld index 420dd196..b3090e6e 100644 --- a/sw/kernel/link.ld +++ b/sw/kernel/link.ld @@ -21,25 +21,39 @@ __DYNAMIC = 0; MEMORY { instrram : ORIGIN = 0xcc000000, LENGTH = 0x8000 - dataram : ORIGIN = 0xcc010000, LENGTH = 0xF00000 - stack : ORIGIN = 0x00010000, LENGTH = 0xC000 /* 48K - CV32 stack only */ + dataram : ORIGIN = 0xcc010000, LENGTH = 0xF00000 /* MAGIA: data at 0xCC010000 */ + stack : ORIGIN = 0x00010000, LENGTH = 0x0C000 /* CV32/PULP stacks below Spatz [0x1C000, 0x20000) */ } -/* Stack information variables */ -_min_stack = 0x1000; /* 4K - minimum stack space to reserve */ -_stack_len = LENGTH(stack); -_stack_start = ORIGIN(stack) + LENGTH(stack); +/* Stack information variables. + * Spatz owns [0x1C000, 0x20000) in the tile SPM (_stack_start = 0x1FFF8 in + * spatz_program.ld); this region stops at 0x1C000 to avoid overlap. + * Each hart gets _stack_slice_size bytes; crt0 computes: + * sp = stack - local_id * _stack_slice_size + * Slot 0 = CV32 main core, slots 1..8 = PULP cluster cores. 
*/ +_stack_slice_size = 0x800; /* 2 KB per hart */ +_stack_hart_count = 9; /* 1 main + 8 PULP cores */ +_min_stack = _stack_slice_size * _stack_hart_count; /* 18 KB total */ +_stack_len = LENGTH(stack); +_stack_start = ORIGIN(stack) + LENGTH(stack); +ASSERT(_min_stack <= _stack_len, "Configured stacks exceed stack memory"); /* We have to align each sector to word boundaries as our current s19->slm * conversion scripts are not able to handle non-word aligned sections. */ +PHDRS +{ + text PT_LOAD; + data PT_LOAD; +} + SECTIONS { .vectors : { . = ALIGN(4); KEEP(*(.vectors)) - } > instrram + } > instrram :text .text : { . = ALIGN(4); @@ -59,21 +73,32 @@ SECTIONS *(.lit) *(.shdata) _endtext = .; - } > instrram + } > instrram :text /* Spatz embedded binary - inlined from header file */ /* Positioned right after CV32 .text in instrram */ - + . = ALIGN(4); _spatz_binary_start = .; - + .spatz_binary : { KEEP(*(.spatz_binary)) - } > instrram - + } > instrram :text + _spatz_binary_end = .; _spatz_binary_size = _spatz_binary_end - _spatz_binary_start; + /* PULP embedded binary - inlined from header file (magia-sdk flow). + CV32E40P stores mtvec as mtvec_addr_i[31:8], so keep the PULP + boot address and trap base in the same 256-byte page. 
*/ + .pulp_binary ALIGN(256) : { + _pulp_binary_start = .; + KEEP(*(.pulp_binary)) + _pulp_binary_end = .; + } > instrram :text + + _pulp_binary_size = _pulp_binary_end - _pulp_binary_start; + /*--------------------------------------------------------------------*/ /* Global constructor/destructor segment */ /*--------------------------------------------------------------------*/ @@ -83,7 +108,7 @@ SECTIONS PROVIDE_HIDDEN (__preinit_array_start = .); KEEP (*(.preinit_array)) PROVIDE_HIDDEN (__preinit_array_end = .); - } > dataram + } > dataram :data .init_array : { @@ -91,7 +116,7 @@ SECTIONS KEEP (*(SORT(.init_array.*))) KEEP (*(.init_array )) PROVIDE_HIDDEN (__init_array_end = .); - } > dataram + } > dataram :data .fini_array : { @@ -99,29 +124,29 @@ SECTIONS KEEP (*(SORT(.fini_array.*))) KEEP (*(.fini_array )) PROVIDE_HIDDEN (__fini_array_end = .); - } > dataram + } > dataram :data + + .data : { + . = ALIGN(4); + sdata = .; + _sdata = .; + *(.data); + *(.data.*) + edata = .; + _edata = .; + } > dataram :data .rodata : { . = ALIGN(4); *(.rodata); *(.rodata.*) - } > dataram + } > dataram :data .shbss : { . = ALIGN(4); *(.shbss) - } > dataram - - .data : { - . = ALIGN(4); - sdata = .; - _sdata = .; - *(.data); - *(.data.*) - edata = .; - _edata = .; - } > dataram + } > dataram :data .bss : { @@ -133,15 +158,17 @@ SECTIONS *(.sbss.*) *(COMMON) _bss_end = .; - } > dataram + } > dataram :data - /* ensure there is enough room for stack */ - .stack (NOLOAD): { + /* Stack placed at the TOP of the CV32/PULP region so crt0 can compute + * per-hart SP = stack - local_id * _stack_slice_size. */ + .stack ORIGIN(stack) + LENGTH(stack) - _min_stack (NOLOAD): { . = ALIGN(4); + _stack_bottom = .; . = . + _min_stack ; . = ALIGN(4); stack = . ; _stack = . 
; - } > stack + } > stack :NONE } \ No newline at end of file diff --git a/sw/kernel_pulp/Makefile b/sw/kernel_pulp/Makefile new file mode 100644 index 00000000..47b04cac --- /dev/null +++ b/sw/kernel_pulp/Makefile @@ -0,0 +1,186 @@ +# Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT +# +# Licensed under the Solderpad Hardware License, Version 0.51 +# (the "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://solderpad.org/licenses/SHL-0.51 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: SHL-0.51 +# +# PULP cluster task Makefile for MAGIA. +# +# Mirrors spatz/sw/Makefile: builds a position-independent flat binary +# from $(TEST_NAME)'s pulp_task sources and packs it into +# sw/kernel_pulp/headers_bin/$(TEST_NAME)_pulp_task_bin.h +# so the main CV32 ELF can KEEP it inside the .pulp_binary section. +# +# Invocation (from top-level Makefile): +# make -C sw/kernel_pulp TEST_NAME=NAME task="TASK_A TASK_B" +# PULP_TASK_DIR=DIR + +ISA ?= riscv +ARCH ?= rv +XLEN ?= 32 +core ?= CV32E40P + +ifeq ($(core), CV32E40X) + XTEN := imafc + CC = $(ISA)$(XLEN)-unknown-elf-gcc + OBJCOPY = $(ISA)$(XLEN)-unknown-elf-objcopy + OBJDUMP = $(ISA)$(XLEN)-unknown-elf-objdump + XABI :=f + ABI := ilp +else ifeq ($(core), RI5CY) + XTEN := imcxgap9 + CC = riscv$(XLEN)-unknown-elf-gcc + OBJCOPY = riscv$(XLEN)-unknown-elf-objcopy + OBJDUMP = riscv$(XLEN)-unknown-elf-objdump + XABI := + ABI := ilp +else + # CV32E40P with ZFINX=1 (FP ops on GPRs) — see top Makefile.
+ XTEN := imc_xcvalu_xcvbi_xcvbitmanip_xcvhwlp_xcvmac_xcvmem_xcvsimd_xcvelw_zfinx_zhinxmin + CC = riscv64-unknown-elf-gcc + OBJCOPY = riscv64-unknown-elf-objcopy + OBJDUMP = riscv64-unknown-elf-objdump + XABI := + ABI := ilp +endif + +TEST_NAME ?= base_pulp_test +task ?= +PULP_TASK_DIR ?= + +ARCH_FLAGS = -march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) +# -msmall-data-limit=0: never use gp/zero-relative addressing for globals. +# Required because the PIC binary is linked with ORIGIN=0, so .sdata +# symbols would otherwise be accessed via `lw rd, imm(zero)`, which +# hits MMIO addresses (iDMA CTRL at 0x200-0x5FF) instead of the +# embedded binary's relocated data section. +# -mno-relax: prevent the linker from collapsing auipc+addi back into li +# for small absolute values (would also break PIC at runtime). +# -fno-jump-tables: switch statements otherwise compile to a jump table +# whose base address is referenced ABSOLUTELY (e.g. `li a4, 0x618`). +# Since this binary is linked at ORIGIN=0 but copied/relocated to +# 0xCC00_xxxx at runtime without a dynamic loader, that absolute base +# is never fixed up: the indirect jump lands in unmapped space and the +# cluster icache stalls forever on a fill that loops in the NoC. +# Forcing compare-chains keeps every branch PC-relative (PIC-safe). +# Affects all cores (CV32E40P/RI5CY) and both mesh_dv settings. +CFLAGS = $(ARCH_FLAGS) -O2 -g -Wall -Wextra -Wno-unused-parameter \ + -Wno-unused-variable -Wno-unused-function -Wundef \ + -ffunction-sections -fdata-sections \ + -fPIC -mcmodel=medany \ + -fno-jump-tables \ + -msmall-data-limit=0 \ + -nostartfiles -nostdlib \ + -DPULP_CORE_COUNT=$(PULP_CORE_COUNT) \ + -D$(core) -U__riscv__ +CFLAGS += -I../utils -I../../hw/include -I. +# RI5CY toolchain (riscv32-unknown-elf-gcc / gap9 binutils) uses an older +# assembler that does not recognise the 3-operand FLW pseudo-instruction +# `flw rd, symbol, rt` emitted by newer gcc when rv32*f* is in the march. 
+# -mexplicit-relocs forces gcc to emit lui+flw(%hi/%lo) instead. +ifeq ($(core), RI5CY) +CFLAGS += -mexplicit-relocs +endif + +LDFLAGS = $(ARCH_FLAGS) -nostartfiles -nostdlib \ + -Wl,--gc-sections -Wl,--allow-multiple-definition \ + -Wl,-T,$(KERNEL_DIR)/pulp_program.ld +comma := , +TASK_UNDEF_FLAGS = $(foreach t,$(task),-Wl$(comma)--undefined=$(t)) + +PULP_CORE_COUNT ?= 8 + +BINARY_NAME = $(TEST_NAME)_pulp_task_bin +HEADER_NAME = $(TEST_NAME)_pulp_task_bin + +BIN_DIR = bin +HEADER_DIR = headers_bin +KERNEL_DIR = . + +# Task source files (resolved against PULP_TASK_DIR if provided, otherwise +# expected to be space-separated absolute/relative paths). +ifeq ($(PULP_TASK_DIR),) +TASK_SRCS = $(task) +else +TASK_SRCS = $(addprefix $(PULP_TASK_DIR)/,$(addsuffix .c,$(task))) +endif + +CRT0 = $(KERNEL_DIR)/pulp_crt0.S +CRT0_OBJ = $(BIN_DIR)/$(TEST_NAME)_pulp_crt0.o +ELF = $(BIN_DIR)/$(BINARY_NAME).elf +BIN = $(BIN_DIR)/$(BINARY_NAME).bin +DUMP = $(BIN_DIR)/$(BINARY_NAME).dump +HEADER = $(HEADER_DIR)/$(HEADER_NAME).h + +BIN2HEADER = ../../scripts/bin2header.py + +.PHONY: all clean dirs bin header + +all: bin + @rm -f $(HEADER) + @$(MAKE) header + +bin: dirs $(BIN) $(DUMP) +header: dirs $(HEADER) + +dirs: + @mkdir -p $(BIN_DIR) $(HEADER_DIR) + +$(CRT0_OBJ): $(CRT0) + @echo "[PULP] Compiling crt0..." + $(CC) $(ARCH_FLAGS) -DPULP_CORE_COUNT=$(PULP_CORE_COUNT) -c -o $@ $(CRT0) + +$(ELF): $(CRT0_OBJ) $(TASK_SRCS) $(KERNEL_DIR)/pulp_program.ld + @if [ -z "$(TASK_SRCS)" ]; then \ + echo "[PULP] ERROR: No task sources for task=$(task)"; \ + exit 1; \ + fi + @echo "[PULP] Building ELF (tasks: $(task))" + $(CC) $(CFLAGS) $(LDFLAGS) $(TASK_UNDEF_FLAGS) -o $@ $(CRT0_OBJ) $(TASK_SRCS) + +$(BIN): $(ELF) + @echo "[PULP] Generating flat binary..." + $(OBJCOPY) -O binary $< $@ + +$(DUMP): $(ELF) + @echo "[PULP] Generating disassembly..." + $(OBJDUMP) -D -S $< > $@ + +$(HEADER): $(BIN) $(ELF) + @echo "[PULP] Generating header..." 
+ @python3 $(BIN2HEADER) $(BIN) $(HEADER) \ + --name $(HEADER_NAME) \ + --section .pulp_binary \ + --address "dynamic (_pulp_binary_start)" + @GUARD=$$(echo "$(HEADER_NAME)" | tr 'a-z' 'A-Z'); \ + sed -i "/#endif \/\* __$${GUARD}_H__ \*\//d" $(HEADER) + @echo "" >> $(HEADER) + @echo "/* Binary start address - defined by CV32 linker */" >> $(HEADER) + @echo "extern uint32_t _pulp_binary_start;" >> $(HEADER) + @echo "#define PULP_BINARY_START ((uint32_t)&_pulp_binary_start)" >> $(HEADER) + @echo "" >> $(HEADER) + @echo "/* PULP task entry points (offset from PULP_BINARY_START). */" >> $(HEADER) + @echo "/* Extracted from $(BINARY_NAME).elf: any global function whose name */" >> $(HEADER) + @echo "/* contains \"task\" becomes a dispatchable entry point. */" >> $(HEADER) + @$(OBJDUMP) -t $(ELF) | \ + awk '$$2 == "g" && $$4 ~ /^\.text/ && $$NF ~ /task/ { \ + name = $$NF; upper = toupper(name); \ + printf "#define %s (PULP_BINARY_START + 0x%s)\n", upper, $$1 \ + }' >> $(HEADER) + @echo "" >> $(HEADER) + @GUARD=$$(echo "$(HEADER_NAME)" | tr 'a-z' 'A-Z'); \ + echo "#endif /* __$${GUARD}_H__ */" >> $(HEADER) + +clean: + rm -rf $(BIN_DIR) $(HEADER_DIR) + +.PRECIOUS: $(ELF) $(BIN) diff --git a/sw/kernel_pulp/pulp_crt0.S b/sw/kernel_pulp/pulp_crt0.S new file mode 100644 index 00000000..510ac453 --- /dev/null +++ b/sw/kernel_pulp/pulp_crt0.S @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * PULP cluster CRT0 for embedded binary in CV32 instrram (magia_v3). + * + * Bare-metal dynamic dispatch programming model. + * + * Boot sequence per hart: + * 1. CV32 writes _pulp_binary_start to PULP_BINARY (0x1744). + * 2. CV32 writes 1 to PULP_CLK_EN (0x1740) -> fetch_enable goes high to + * all cluster cores, which reset to _pulp_binary_start and execute + * _start. + * 3. Each hart computes its local ID, sets up its 2 KB stack slice and + * clears BSS (all-harts race, benign zero-store). + * 4. Install trap_handler in mtvec (direct mode, mtvec[1:0] = 00). + * 5. Enable mstatus.MIE and mie.MEIE (bit 11). + * 6. Write 1 to PULP_READY (0x175C) so CV32's pulp_init() can advance. + * 7. Enter dispatcher_loop: WFI; on MEI, jump to trap_handler. + * + * trap_handler (per dispatch): + * - Read PULP_TASKBIN (0x1750) -> task function pointer. + * - Read PULP_DATA (0x1754) -> first argument (a0). + * - Write 0 to PULP_START (0x1758) to ACK *before* calling the task; this + * is what unblocks CV32's pulp_run_task() poll. + * - jalr the task function. + * - Write 1 to PULP_DONE (0x174C) so ClusterRegs counts the completion; + * after NB_CORES_TO_WAIT writes it fires PULP_DONE -> CV32 EU bit 12. + * - mret back into the dispatcher_loop (WFI). + * + * NOTE on alignment: CV32E40P's mtvec has mtvec[7:2] hardwired to 0, so the + * trap base is always 256-byte aligned regardless of what we write. We must + * therefore align trap_handler to 256 bytes; otherwise the trap entry would + * jump to a lower address inside _start and execute garbage. + * + * Stack layout (absolute tile addresses, grows downward): + * Slot 0 (CV32 main): [0x1B800, 0x1C000) — reserved by sw/kernel/crt0.S + * PULP STACK_TOP = 0x1B800 + * PULP core 0: [0x1B000, 0x1B800) (2 KB) + * ... 
+ * PULP core 7: [0x17800, 0x18000) (2 KB) + * + * NOTE: stack and PULP_* MMIO addresses are loaded with 'li' (not 'la') + * because the binary is position-independent (ORIGIN=0x0) and 'la' would + * produce a PC-relative reference that breaks when loaded at the instrram + * offset. + */ + +#ifndef PULP_CORE_COUNT +#define PULP_CORE_COUNT 8 +#endif + +#define PULP_DONE_ADDR 0x0000174C /* PULP_CTRL_BASE + 0x0C */ +#define PULP_TASKBIN_ADDR 0x00001750 /* PULP_CTRL_BASE + 0x10 */ +#define PULP_DATA_ADDR 0x00001754 /* PULP_CTRL_BASE + 0x14 */ +#define PULP_START_ADDR 0x00001758 /* PULP_CTRL_BASE + 0x18 */ +#define PULP_READY_ADDR 0x0000175C /* PULP_CTRL_BASE + 0x1C */ + +/* PULP STACK_TOP sits 2 KB below the tile-wide top (0x1C000) so the slot + * at [0x1B800, 0x1C000) stays reserved for the CV32 main core (slot 0). */ +#define STACK_TOP 0x0001B800 +#define LOG2_SLICE 11 /* log2(2 KB) — each hart gets 2 KB */ + +.section .text.start, "ax" +.global _start +_start: + /* Compute local core ID = mhartid % PULP_CORE_COUNT */ + csrr a0, mhartid + li t0, PULP_CORE_COUNT + remu a1, a0, t0 + + /* Per-hart stack pointer: sp = STACK_TOP - local_id * 2KB */ + li x2, STACK_TOP + slli t0, a1, LOG2_SLICE + sub x2, x2, t0 + + /* Clear BSS — all harts write 0, benign race. + * + * NOTE: __bss_start / __bss_end are PIC symbols (linker script has + * ORIGIN=0), so their resolved values are small (a few hundred bytes). + * Without 'norelax' the linker collapses 'la' (auipc+addi) into 'li' + * because the absolute value fits in a 12-bit immediate — that would + * make the loop write zeros to tile-local MMIO (iDMA CTRL region at + * 0x200-0x5FF), hanging the bus. Force PC-relative addressing here. */ + .option push + .option norelax + la t0, __bss_start + la t1, __bss_end + .option pop +1: bgeu t0, t1, 2f + sw zero, 0(t0) + addi t0, t0, 4 + j 1b +2: + /* Install trap_handler in mtvec, direct mode (mtvec[1:0] = 00).
+ * CV32E40P hardwires mtvec[7:2]=0 (only mtvec[31:8] and mtvec[0] are + * writable), so the trap base address is rounded down to a 256-byte + * boundary. trap_handler must therefore be 256-byte aligned. */ + .option push + .option norelax + la t0, trap_handler + .option pop + csrw mtvec, t0 + + /* Enable MEIE (bit 11 of mie) */ + li t0, 0x800 + csrrs zero, mie, t0 + /* Enable MIE (bit 3 of mstatus) */ + li t0, 0x8 + csrrs zero, mstatus, t0 + + /* Signal "core armed": write 1 to PULP_READY. CV32's pulp_init() polls + * this until ClusterRegs has counted N_CLUSTER_CORES writes. */ + li t0, PULP_READY_ADDR + li t1, 1 + sw t1, 0(t0) + +dispatcher_loop: + wfi + j dispatcher_loop + +/*-------------------------------------------------------------------------- + * Machine External Interrupt trap handler (target of the MEI vector slot). + * + * The dispatcher loop only executes WFI + jump, so no caller state needs + * preserving — we may freely clobber t0/t1/a0/ra inside the handler. + * + * NOTE on mepc semantics: when MEI is taken from WFI, mepc points to the + * instruction *after* the WFI (the back-jump). mret therefore returns to + * that jump, which loops us back to WFI. Equivalent to "wfi again". + *--------------------------------------------------------------------------*/ +.balign 256 +.global trap_handler +trap_handler: + /* a0 = *PULP_DATA (first argument to task) */ + li t0, PULP_DATA_ADDR + lw a0, 0(t0) + + /* t1 = *PULP_TASKBIN (task function pointer) */ + li t0, PULP_TASKBIN_ADDR + lw t1, 0(t0) + + /* ACK: write 0 to PULP_START before invoking the task. This unblocks + * CV32's pulp_run_task() poll once all selected cores have written. */ + li t0, PULP_START_ADDR + sw zero, 0(t0) + + /* Call task(data) */ + jalr ra, 0(t1) + + /* Signal completion: write 1 to PULP_DONE. ClusterRegs accumulates and + * fires cluster_done when NB_CORES_TO_WAIT writes have arrived. 
*/ + li t0, PULP_DONE_ADDR + li t1, 1 + sw t1, 0(t0) + + mret + +.global _init +.global _fini +_init: +_fini: + ret diff --git a/sw/kernel_pulp/pulp_program.ld b/sw/kernel_pulp/pulp_program.ld new file mode 100644 index 00000000..984a100b --- /dev/null +++ b/sw/kernel_pulp/pulp_program.ld @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Linker script for PULP embedded binary in CV32 instrram (magia_v3). + * + * Aligned with magia-sdk targets/magia_v3/pulp/src/pulp_program.ld. + * + * Memory layout when embedded: + * 0xCC000000: CV32 .text + * 0xCC000000 + CV32_size: .spatz_binary (Spatz code, if present) + * 0xCC000000 + offset: _pulp_binary_start (.pulp_binary, this binary) + * + * Binary is position-independent (compiled with -fPIC -mcmodel=medany). + * ORIGIN=0x0 means all section addresses are relative to load address. + * + * Local stack allocation (tile address space, absolute, set by pulp_crt0.S): + * CV32 stack: 0x0001B800 – 0x0001C000 (2 KB, slot 0 — CV32 main core) + * PULP stack: 0x00017800 – 0x0001B800 (16 KB, 8 harts × 2 KB each) + * Spatz stack: 0x0001C000 – 0x00020000 (16 KB) + */ + +OUTPUT_ARCH(riscv) +ENTRY(_start) + +MEMORY +{ + prog (rwxa) : ORIGIN = 0x00000000, LENGTH = 0x10000 /* 64 KB, relative */ +} + +SECTIONS +{ + .text : { + . = ALIGN(4); + KEEP(*(.text.start)) + *(.text) + *(.text.*) + .
= ALIGN(4); + } > prog + + .rodata : { + . = ALIGN(4); + *(.rodata) + *(.rodata.*) + . = ALIGN(4); + } > prog + + .data : { + . = ALIGN(4); + *(.data) + *(.data.*) + *(.sdata) + *(.sdata.*) + . = ALIGN(4); + } > prog + + .bss : { + . = ALIGN(4); + __bss_start = .; + *(.bss) + *(.bss.*) + *(.sbss) + *(.sbss.*) + *(COMMON) + . = ALIGN(4); + __bss_end = .; + } > prog + + /DISCARD/ : { + *(.comment) + *(.note.*) + } +} diff --git a/sw/tests/cluster_tests/fpu_cluster_test/main.c b/sw/tests/cluster_tests/fpu_cluster_test/main.c new file mode 100644 index 00000000..745c7796 --- /dev/null +++ b/sw/tests/cluster_tests/fpu_cluster_test/main.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * fpu_cluster_test - main core (CV32) binary. + * + * Boots the PULP cluster cores, dispatches the FPU task and waits for them + * to complete through the CV32 Event Unit PULP_DONE event. + * Each cluster core runs a small set of single-precision FPU operations + * (fadd.s, fmul.s, fsub.s) and writes a pass/fail word to L2. + * The CV32 main then reads back those words and reports the result. + */ + +#include +#include "magia_tile_utils.h" +#include "cluster_utils.h" +#include "fpu_cluster_test_pulp_task_bin.h" + +/* Result slot: 4 bytes per cluster core. + * Cluster cores write 0xFEEDxxxx where xxxx = error count.
*/ +#define FPU_RESULT_BASE (L2_BASE + 0x00060000) +#define FPU_RESULT_MAGIC (0xFEED0000u) +#define FPU_RESULT_MASK (0xFFFF0000u) + +static inline uint32_t get_hartid(void) { + uint32_t hartid; + asm volatile("csrr %0, mhartid" : "=r"(hartid)); + return hartid; +} + +int main(void) { + int print_summary = (get_hartid() == 0); + + /* Boot the PULP cluster cores into their dispatcher loop. */ + if (print_summary) + printf("[fpu_cluster_test] running %d PULP cores\n", PULP_CORE_COUNT); + cluster_boot(PULP_BINARY_START); + + /* Arm EU before dispatching the task to avoid missing DONE. */ + cluster_arm_done_event(); + + cluster_dispatch_task(FPU_CLUSTER_TEST_TASK, 0xFFu); + + /* Sleep (cv.elw) until all cluster cores have signalled task done. */ + cluster_wait_done_eu(); + + /* Read back per-core results from L2 and report. */ + unsigned int total_errors = 0; + unsigned int passed_cores = 0; + for (int core_idx = 0; core_idx < PULP_CORE_COUNT; core_idx++) { + uint32_t word = mmio32(FPU_RESULT_BASE + 4 * core_idx); + if ((word & FPU_RESULT_MASK) != FPU_RESULT_MAGIC) { + if (print_summary) + printf("[fpu_cluster_test] core %d MISSING slot=0x%08x\n", core_idx, word); + total_errors++; + } else { + unsigned int errs = word & 0xFFFFu; + if (errs == 0) { + passed_cores++; + } else { + if (print_summary) + printf("[fpu_cluster_test] core %d FAIL %u mismatches\n", core_idx, errs); + total_errors += errs; + } + } + } + + if (print_summary) { + if (total_errors == 0) + printf("[fpu_cluster_test] PASS: %u/%d cores, 3 ops/core\n", + passed_cores, PULP_CORE_COUNT); + else + printf("[fpu_cluster_test] FAIL: %u errors, %u/%d cores passed\n", + total_errors, passed_cores, PULP_CORE_COUNT); + } + + return (int)total_errors; + +} diff --git a/sw/tests/cluster_tests/fpu_cluster_test/pulp_task/fpu_cluster_test_task.c b/sw/tests/cluster_tests/fpu_cluster_test/pulp_task/fpu_cluster_test_task.c new file mode 100644 index 00000000..86121424 --- /dev/null +++ 
b/sw/tests/cluster_tests/fpu_cluster_test/pulp_task/fpu_cluster_test_task.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * fpu_cluster_test - PULP cluster-core binary. + * + * Mirrors sw/tests/fpu_test.c but runs on all 8 cluster cores concurrently. + * Each core performs: + * fadd.s: 12.34 + 56.78 ≈ 69.12 (tolerance 0.1) + * fsub.s: 56.78 - 12.34 ≈ 44.44 (tolerance 0.1) + * fmul.s: 12.34 * 2.0 ≈ 24.68 (tolerance 0.1) + * + * 12.34 and 56.78 are NOT exact in float32, so comparisons use the same + * 0.1 tolerance as fpu_test.c rather than bit-exact equality. + * + * Each core writes a 32-bit word to L2: + * 0xFEED0000 | + * The CV32 main reads these back to determine pass/fail. + */ + +#include +#include "magia_tile_utils.h" + +/* L2 result area: 4 bytes per cluster core (local_id 0..7). */ +#define FPU_RESULT_BASE (L2_BASE + 0x00060000) +#define FPU_RESULT_MAGIC (0xFEED0000u) + +#define A_VAL (12.34f) +#define B_VAL (56.78f) +#define ADD_EXP (69.12f) +#define SUB_EXP (44.44f) +#define MUL_EXP (24.68f) +#define FP_TH (0.1f) + +#define abs_diff(x, y) (((x) > (y)) ? ((x) - (y)) : ((y) - (x))) + +static inline uint32_t get_hartid(void) { + uint32_t id; + asm volatile("csrr %0, mhartid" : "=r"(id)); + return id; +} + +/* Enable the FPU: set mstatus.FS to INITIAL (01). 
*/ +static inline void enable_fpu(void) { + asm volatile ( + "li t0, 0x2000\n" + "csrs mstatus, t0\n" + ::: "t0" + ); +} + +void fpu_cluster_test_task(void *data) { + (void)data; + enable_fpu(); + + uint32_t hartid = get_hartid(); + uint32_t pulp_gid = hartid - PULP_HARTID_BASE; /* 0..127 */ + uint32_t local_id = pulp_gid % PULP_CORE_COUNT; /* 0..7 */ + + unsigned int errors = 0; + + volatile float a = A_VAL; + volatile float b = B_VAL; + + /* fadd.s: a + b ≈ 69.12 */ + float c_add = a + b; + if (abs_diff(c_add, ADD_EXP) > FP_TH) errors++; + + /* fsub.s: b - a ≈ 44.44 */ + float c_sub = b - a; + if (abs_diff(c_sub, SUB_EXP) > FP_TH) errors++; + + /* fmul.s: a * 2.0 ≈ 24.68 */ + volatile float two = 2.0f; + float c_mul = a * two; + if (abs_diff(c_mul, MUL_EXP) > FP_TH) errors++; + + if (errors != 0) { + printf("[PULP FPU] core %u: %u errors (add=%s sub=%s mul=%s)\n", + local_id, errors, + (abs_diff(c_add, ADD_EXP) <= FP_TH) ? "OK" : "FAIL", + (abs_diff(c_sub, SUB_EXP) <= FP_TH) ? "OK" : "FAIL", + (abs_diff(c_mul, MUL_EXP) <= FP_TH) ? "OK" : "FAIL"); + } + + /* Write result to L2 slot for this core. */ + mmio32(FPU_RESULT_BASE + 4 * local_id) = FPU_RESULT_MAGIC | (errors & 0xFFFFu); +} diff --git a/sw/tests/cluster_tests/hello_pulp/main.c b/sw/tests/cluster_tests/hello_pulp/main.c new file mode 100644 index 00000000..c33bed62 --- /dev/null +++ b/sw/tests/cluster_tests/hello_pulp/main.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * hello_pulp — main core (CV32) binary. + * + * This ELF is linked at 0xCC000000 and executed by the CV32 main core + * of each tile (mhartid 0..NUM_CLUSTERS-1). + * + * Flow: + * 1) Print a "hello" banner. + * 2) Boot the PULP cluster cores into their dispatcher loop + * (cluster_boot -> pulp_init: programs PULP_BINARY, broadcasts + * CLK_EN, polls PULP_READY). + * 3) Arm the CV32 Event Unit for PULP_DONE (EU bit 12). + * 4) Dispatch the hello task to all 8 PULP cores by programming + * NB_CORES_TO_WAIT, TASKBIN and START. + * 5) Sleep in WFE until the DONE quorum reaches the Event Unit. + * 6) Print the "done" message. + */ + +#include "magia_tile_utils.h" +#include "cluster_utils.h" +#include "hello_pulp_pulp_task_bin.h" + +static inline uint32_t get_hartid(void) { + uint32_t hartid; + asm volatile("csrr %0, mhartid" + :"=r"(hartid):); + #ifndef RI5CY + return hartid; + #else + // RI5CY mhartid CSR: { 21'b0, cluster_id_i[5:0], 1'b0, core_id_i[3:0] } + // cluster_id_i = mhartid_tile + 1 (which tile/cluster, 1-indexed; 0 = standalone main core) + // core_id_i = i (which core within the cluster, 0-indexed) + uint32_t cluster_id = (hartid >> 5) & 0x3F; // = tile_hartid + 1 (same for all cores in a tile) + uint32_t core_id = hartid & 0xF; // = i (unique per core within tile) + if (cluster_id == 0) + return core_id; // standalone main tile core + return PULP_HARTID_BASE + (cluster_id - 1) * PULP_CORE_COUNT + core_id; + #endif +} + +int main(void) { + uint32_t hartid = get_hartid(); + + printf("[Main core %u] Hello World!\n", hartid); + + /* Boot the PULP cluster cores into their dispatcher loop. */ + cluster_boot(PULP_BINARY_START); + + /* Arm EU before dispatching the task to avoid missing DONE. 
*/ + cluster_arm_done_event(); + + /* Dispatch the hello task to all 8 cluster cores of this tile. */ + cluster_dispatch_task(HELLO_PULP_TASK, 0xFFu); + + /* Sleep (cv.elw) until every cluster core of this tile has signalled + * task completion. */ + cluster_wait_done_eu(); + /* + printf("[Main core %u] All %d cluster cores done!\n", + hartid, PULP_CORE_COUNT); + */ + return 0; +} diff --git a/sw/tests/cluster_tests/hello_pulp/pulp_task/hello_pulp_task.c b/sw/tests/cluster_tests/hello_pulp/pulp_task/hello_pulp_task.c new file mode 100644 index 00000000..4abe95da --- /dev/null +++ b/sw/tests/cluster_tests/hello_pulp/pulp_task/hello_pulp_task.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * hello_pulp — PULP cluster-core task. + * + * Linked at 0x0 (PIC, ORIGIN=0) and embedded as .pulp_binary inside the + * CV32 ELF (single-binary flow). The PULP cores boot here via crt0 and + * stay in their dispatcher loop until the CV32 dispatches this function + * with cluster_dispatch_task(HELLO_PULP_TASK, mask). + * + * The task is entered as `void hello_pulp_task(void *data)`; `data` is + * whatever pointer the CV32 wrote to PULP_DATA (NULL here). When it + * returns, the trap handler writes 1 to PULP_DONE and re-enters WFI. 
+ */ + +#include "magia_tile_utils.h" + +static inline uint32_t get_hartid(void) { + uint32_t hartid; + asm volatile("csrr %0, mhartid" + :"=r"(hartid):); + #ifndef RI5CY + return hartid; + #else + // RI5CY mhartid CSR: { 21'b0, cluster_id_i[5:0], 1'b0, core_id_i[3:0] } + // cluster_id_i = mhartid_tile + 1 (which tile/cluster, 1-indexed; 0 = standalone main core) + // core_id_i = i (which core within the cluster, 0-indexed) + uint32_t cluster_id = (hartid >> 5) & 0x3F; // = tile_hartid + 1 (same for all cores in a tile) + uint32_t core_id = hartid & 0xF; // = i (unique per core within tile) + if (cluster_id == 0) + return core_id; // standalone main tile core + return PULP_HARTID_BASE + (cluster_id - 1) * PULP_CORE_COUNT + core_id; + #endif +} + +void hello_pulp_task(void *data) { + (void)data; + + uint32_t hartid = get_hartid(); + uint32_t pulp_gid = hartid - PULP_HARTID_BASE; + uint32_t local_id = pulp_gid % PULP_CORE_COUNT; + uint32_t tile_id = pulp_gid / PULP_CORE_COUNT; + + /* Only core 0 of each tile prints, to avoid interleaving on the + shared per-tile UART peripheral at 0xFFFF0004. */ + if (local_id == 0) + printf("[Tile %u PULP-%u mhartid %u] Hello World!\n", + tile_id, local_id, hartid); +} diff --git a/sw/tests/cluster_tests/hello_redmule_pulp/main.c b/sw/tests/cluster_tests/hello_redmule_pulp/main.c new file mode 100644 index 00000000..1778bfa6 --- /dev/null +++ b/sw/tests/cluster_tests/hello_redmule_pulp/main.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * hello_redmule_pulp — CV32 main core entry point. + * CV32 runs RedMulE once (sanity check), then boots the PULP cluster + * and dispatches the task. Each cluster core GEMMs into its own private + * Y slot; CV32 verifies all 8 slots after PULP_DONE (EU bit 12). + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "cluster_utils.h" +#include "redmule_mm_utils.h" +#include "event_unit_utils.h" +#include "idma_mm_utils.h" +#include "hello_redmule_pulp_pulp_task_bin.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +/* Address map. + * X, W, Y live in tile-local L1 (private per tile, no cross-tile race). + * The golden reference Z is read directly from z_oup[] in the main + * ELF .rodata (already in L2), so no per-tile L2 copy is needed. */ +#define X_BASE (L1_BASE + 0x00012048) /* shared, read-only */ +#define W_BASE (L1_BASE + 0x00016048) /* shared, read-only */ +/* PER-CORE output Y: each of the 8 cluster cores computes into its OWN + private slot at Y_BASE + c*Y_STRIDE, so no two cores ever touch the same + word -> the result is fully deterministic (no shared-buffer race). The + CV32 main pre-loads every slot with the y_inp bias before dispatch, so + the cores never reload the bias themselves. + MUST match Y_BASE/Y_STRIDE in pulp_task/hello_redmule_pulp_task.c. */ +#define Y_BASE (L1_BASE + 0x0001A048) +#define Y_STRIDE (0x00001000) /* 4 KB per core (8 slots) */ + +/* Enable the FPU: set mstatus.FS to INITIAL (01). + * Without this any FPU instruction faults. CV32E40P (and the cluster + * cores) need this even though FP is enabled at compile time. 
*/ +static inline void enable_fpu(void) { + asm volatile ( + "li t0, 0x2000\n" /* mstatus.FS[14:13] = 01 (Initial) */ + "csrs mstatus, t0\n" + ::: "t0" + ); +} + +/* iDMA L2→L1 transfer: issue, WFE on A2O_DONE, re-poll until truly idle. */ +static inline void idma_load_l2_to_l1(uint32_t l2_src, uint32_t l1_dst, uint32_t size_bytes) { + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_A2O_DONE_MASK); + (void)idma_L2ToL1(l2_src, l1_dst, (unsigned short)size_bytes); /* size_bytes is narrowed to unsigned short here; the largest size passed in this file is N_SIZE * K_SIZE * 2 = 8192 bytes */ + do { + eu_idma_wait_a2o_completion(EU_WAIT_MODE_WFE); + } while (idma_mm_is_busy_dir(/*is_l1_to_l2=*/0, /*stream_id=*/0)); +} + +/* M=1 to keep simulation fast; N/K fixed by the W matrix layout. */ +#define M_SIZE (1) +#define N_SIZE (64) +#define K_SIZE (64) + +#define USE_WFE (1) +#define DIFF_TH (0x0011) + +/* Helper: same body as redmule_test_event_unit.c::main, factored so we + can call it once from the CV32 main and verify results. Returns + number of mismatches. */ +static unsigned int redmule_run_and_verify(void) { + /* Bulk-load X, W, Y for this tile via iDMA. All 16 tiles run this + in parallel; cross-tile contention is handled the same way the + cluster cores serialize on the HWPE: each iDMA load issues, the + core sleeps in WFE, and on wake re-checks the HW state until it + is truly idle (see idma_load_l2_to_l1). 
*/ + idma_load_l2_to_l1((uint32_t)x_inp, X_BASE, M_SIZE * N_SIZE * 2); + idma_load_l2_to_l1((uint32_t)w_inp, W_BASE, N_SIZE * K_SIZE * 2); + idma_load_l2_to_l1((uint32_t)y_inp, Y_BASE, M_SIZE * K_SIZE * 2); + + /* Initialize and configure RedMulE */ + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) + ; + + redmule_cfg((unsigned int)X_BASE, (unsigned int)W_BASE, (unsigned int)Y_BASE, + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16, (uint8_t)Float16); + + /* Initialize Event Unit for RedMulE */ + eu_redmule_init(); + + printf("Testing matrix multiplication with RedMulE...\n"); + hwpe_trigger_job(); + + if (USE_WFE) { + /* Drain HCI writeback: redmule_evt fires when the LAST writeback + has been ISSUED, not when it's been committed to L1. Without + this loop, the verify below can race the in-flight stores and + see stale Y values on a few scattered indices (observed on + tiles 14/15 where the per-tile timing is tightest). Same + pattern used by pulp_main.c after the cluster jobs. */ + do { + eu_redmule_wait_completion(EU_WAIT_MODE_WFE); + } while (hwpe_get_status() != 0); + printf("Detected WFE...\n"); + } else { + do { + eu_redmule_wait_completion(EU_WAIT_MODE_POLLING); + } while (hwpe_get_status() != 0); + printf("Detected polling completion...\n"); + } + /* Make any in-flight HCI stores observable to subsequent loads. */ + asm volatile ("fence" ::: "memory"); + printf("Verifying results...\n"); + + hwpe_cg_disable(); /* NOTE(review): main() later dispatches a cluster task that uses RedMulE again - confirm that task does not need the clock gate left enabled. */ + + unsigned int num_errors = 0; + uint16_t computed, expected, diff; + for (int i = 0; i < M_SIZE*K_SIZE; i++) { + computed = mmio16(Y_BASE + 2*i); + expected = z_oup[i]; + diff = (computed > expected) ? 
(computed - expected) : (expected - computed); + if (diff > DIFF_TH) { + num_errors++; + printf("**ERROR**: Y[%8x](=0x%4x) != Z[%0d](=0x%4x)\n", + Y_BASE + 2*i, computed, i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + return num_errors; +} + +int main(void) { /* Runs the CV32 RedMulE sanity check, pre-loads every per-core Y slot, boots the cluster, dispatches the task with core mask 0xFF, then compares every slot against z_oup; returns the total mismatch count. */ + enable_fpu(); + + /* Sanity-check run: also primes the HWPE so cluster cores don't read + garbage (0xFFFFFFFE) from REDMULE_ACQUIRE on first acquire. */ + unsigned int err_main = redmule_run_and_verify(); + printf("CV32 main RedMulE sanity-check: %0d errors\n", err_main); + + /* Pre-load every per-core Y slot with y_inp (slot 0 overwritten by the sanity check above). */ + for (int c = 0; c < PULP_CORE_COUNT; c++) + idma_load_l2_to_l1((uint32_t)y_inp, Y_BASE + c * Y_STRIDE, + M_SIZE * K_SIZE * 2); + + printf("Booting PULP cluster cores...\n"); + cluster_boot(PULP_BINARY_START); + + /* Arm EU before dispatching the task to avoid missing DONE. */ + cluster_arm_done_event(); + + cluster_dispatch_task(HELLO_REDMULE_PULP_TASK, 0xFFu); + + /* Sleep (cv.elw) until all cluster cores have exited. */ + cluster_wait_done_eu(); + + /* Verify EVERY cluster core's private Y slot == golden Z. */ + unsigned int err_pulp = 0; + uint16_t computed, expected, diff; + for (int c = 0; c < PULP_CORE_COUNT; c++) { + uint32_t ybase = Y_BASE + c * Y_STRIDE; + for (int i = 0; i < M_SIZE*K_SIZE; i++) { + computed = mmio16(ybase + 2*i); + expected = z_oup[i]; + diff = (computed > expected) ? 
(computed - expected) : (expected - computed); + if (diff > DIFF_TH) { + err_pulp++; + printf("**PULP_ERR**: core=%0d i=%0d Y=0x%4x Z=0x%4x diff=0x%4x\n", + c, i, computed, expected, diff); + } + } + } + printf("PULP cluster pass finished with %0d errors\n", err_pulp); + + return (err_main + err_pulp); +} diff --git a/sw/tests/cluster_tests/hello_redmule_pulp/pulp_task/hello_redmule_pulp_task.c b/sw/tests/cluster_tests/hello_redmule_pulp/pulp_task/hello_redmule_pulp_task.c new file mode 100644 index 00000000..b9120eaa --- /dev/null +++ b/sw/tests/cluster_tests/hello_redmule_pulp/pulp_task/hello_redmule_pulp_task.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * hello_redmule_pulp — PULP cluster task (PIC, ORIGIN=0). + * All 8 cores serialize on hwpe_acquire_job(); the winner programs RedMulE + * and polls STATUS. Each core writes into its own Y slot (Y_BASE + id*Y_STRIDE) + * so there is no shared-buffer race. crt0 writes PULP_DONE on return. + */ + +#include "magia_tile_utils.h" +#include "redmule_mm_utils.h" +#include "magia_utils.h" + +/* Do NOT include x_input.h/w_input.h/y_input.h: with ORIGIN=0 their symbol + addresses alias tile MMIO at runtime. Use X_BASE/W_BASE/Y_BASE instead. */ + +/* Same reduced size as main.c — must stay in sync. 
*/ +#define M_SIZE (1) +#define N_SIZE (64) +#define K_SIZE (64) + +#define X_BASE (L1_BASE + 0x00012048) /* shared, read-only */ +#define W_BASE (L1_BASE + 0x00016048) /* shared, read-only */ +#define Y_BASE (L1_BASE + 0x0001A048) +#define Y_STRIDE (0x00001000) + +/* Per-core marker area in L2: 4B per cluster core, 8 cluster cores per + tile, 16 tiles -> 8*16 = 128 entries. Used only to prove every core + reached the kernel. */ +#define MARKER_BASE (L2_BASE + 0x00050000) + +/* Enable the FPU on this hart: set mstatus.FS to INITIAL (01). Without + this any FPU instruction faults. RedMulE's hwpe-mac-engine itself + does not need this (it's a separate HWPE), but the cluster core + may execute float register copies in libgcc helpers / printf, and + future tests may issue FP instructions directly. */ +static inline void enable_fpu(void) { + asm volatile ( + "li t0, 0x2000\n" + "csrs mstatus, t0\n" + ::: "t0" + ); +} + +void hello_redmule_pulp_task(void *data) { /* Cluster task entry point; data is unused. Each core writes a marker word, acquires RedMulE, runs one GEMM into its own Y slot and busy-waits on the HWPE status until it reads 0. */ + (void)data; + enable_fpu(); + + uint32_t hartid = get_hartid(); + uint32_t pulp_gid = hartid - PULP_HARTID_BASE; /* 0..127 */ + uint32_t local_id = pulp_gid % PULP_CORE_COUNT; /* 0..7 within tile */ + uint32_t tile_id = pulp_gid / PULP_CORE_COUNT; /* 0..15 across mesh */ + + /* Drop a marker so the test trace can confirm we got here. */ + mmio32(MARKER_BASE + 4 * pulp_gid) = 0xC1057ED0u + local_id; + + /* Only core 0 of each tile prints, to avoid interleaving on the + per-tile UART peripheral at 0xFFFF0004. 
*/ + if (local_id == 0) { + printf("[Tile %u PULP-%u mhartid %u] entering RedMulE contention\n", + tile_id, local_id, hartid); + } + + int job_id; + while ((job_id = hwpe_acquire_job()) < 0) + ; + + // Wait for the HWPE to be fully idle before (re)configuring it + + while (hwpe_get_status() != 0); + asm volatile("fence" ::: "memory"); + + uint32_t my_y = Y_BASE + local_id * Y_STRIDE; + + redmule_cfg((unsigned)X_BASE, (unsigned)W_BASE, (unsigned)my_y, + M_SIZE, N_SIZE, K_SIZE, + (uint8_t)gemm_ops, (uint8_t)Float16, (uint8_t)Float16); + + hwpe_trigger_job(); + + while (hwpe_get_status() != 0) + ; + asm volatile("fence" ::: "memory"); + + hwpe_cg_disable(); /* NOTE(review): every core calls this as soon as its own job ends, while other cores of the tile may still be waiting for or running RedMulE, and no hwpe_cg_enable() call is visible in this task - confirm this ordering is safe. */ + + if (local_id == 0) { + printf("[Tile %u PULP-%u mhartid %u] RedMulE done, exiting\n", + tile_id, local_id, hartid); + } + /* trap_handler writes 1 to PULP_DONE on return. */ +} diff --git a/sw/tests/cluster_tests/hello_spatz_pulp/main.c b/sw/tests/cluster_tests/hello_spatz_pulp/main.c new file mode 100644 index 00000000..5b89f0fa --- /dev/null +++ b/sw/tests/cluster_tests/hello_spatz_pulp/main.c @@ -0,0 +1,174 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * hello_spatz_pulp — main core (CV32) binary. 
+ * + * Demonstrates the CV32 + Spatz + PULP cluster pipeline on one tile: + * + * 1) Fill FP16 input vectors X (= 1.0) and Y (= 2.0) in tile-local L1. + * 2) Initialize Spatz + Event Unit; offload Z = X + Y (vecsum16) to Spatz. + * 3) Sleep via WFE (EU bit 8 = spatz_done) until Spatz signals completion. + * 4) Check Spatz exit code; disable Spatz clock. + * 5) Reset EU, boot PULP, arm PULP_DONE WFE and dispatch the cluster task. + * 6) Each cluster core sums its 1/8th of Z (raw uint16 bit-patterns) → L2. + * 7) CV32 wakes from WFE, collects per-core partial sums, verifies total. + * + * Expected result: + * Z[i] = 3.0 FP16 = 0x4200 for all i. + * Per-core partial sum = 32 × 0x4200 = 0x84000. + * Grand total (8 cores) = 256 × 0x4200 = 0x420000. + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "cluster_utils.h" +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" + +#include "hello_spatz_pulp_pulp_task_bin.h" +#include "hello_spatz_pulp_task_bin.h" /* generated: SPATZ_BINARY_START, VECSUM16_TASK */ + +/* ------------------------------------------------------------------------- + * Memory layout (tile-local L1 + shared L2) + * ------------------------------------------------------------------------- */ +#define VLEN 256 /* number of FP16 elements */ + +#define X_BASE (L1_BASE + 0x00000000) /* X[256] : 512 B */ +#define Y_BASE (L1_BASE + 0x00001000) /* Y[256] : 512 B */ +#define Z_BASE (L1_BASE + 0x00002000) /* Z[256] : 512 B (Spatz out) */ +#define PARAMS_BASE (L1_BASE + 0x00003000) /* vecsum_params_t : 16 B */ + +/* One uint32 slot per cluster core in L2. 
*/ +#define RESULT_BASE (L2_BASE + 0x00060000) + +/* ------------------------------------------------------------------------- + * FP16 constants + * ------------------------------------------------------------------------- */ +#define FP16_ONE (0x3C00u) /* 1.0 */ +#define FP16_TWO (0x4000u) /* 2.0 */ +#define FP16_THREE (0x4200u) /* 3.0 = 1.0 + 2.0 */ + +/* Per-core partial sum = (VLEN / PULP_CORE_COUNT) elements × FP16_THREE */ +#define GOLDEN_PARTIAL ((VLEN / PULP_CORE_COUNT) * FP16_THREE) /* 0x84000 */ +/* Grand total = VLEN elements × FP16_THREE */ +#define GOLDEN_TOTAL (VLEN * FP16_THREE) /* 0x420000 */ + +/* ------------------------------------------------------------------------- + * Parameter struct for vecsum16_task (must match vecsum16_task.c). + * ------------------------------------------------------------------------- */ +typedef struct { + uint32_t x_addr; + uint32_t y_addr; + uint32_t z_addr; + uint32_t n_size; +} vecsum_params_t; + +int main(void) { /* Returns the number of failed checks: a non-zero Spatz exit code, each per-core partial sum that differs from GOLDEN_PARTIAL, and a grand total that differs from GOLDEN_TOTAL each add one. */ + unsigned int errors = 0; + + /* ------------------------------------------------------------------ + * Step 1: populate X and Y in L1 with constant FP16 patterns. + * ------------------------------------------------------------------ */ + volatile uint16_t *X = (volatile uint16_t *)X_BASE; + volatile uint16_t *Y = (volatile uint16_t *)Y_BASE; + for (int i = 0; i < VLEN; i++) { + X[i] = FP16_ONE; + Y[i] = FP16_TWO; + } + + /* ------------------------------------------------------------------ + * Step 2: initialize Event Unit and Spatz. + * ------------------------------------------------------------------ */ + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + + printf("[CV32] Initializing Spatz...\n"); + spatz_init(SPATZ_BINARY_START); + + /* Write parameter struct for vecsum16_task into L1. 
*/ + volatile vecsum_params_t *params = (volatile vecsum_params_t *)PARAMS_BASE; + params->x_addr = X_BASE; + params->y_addr = Y_BASE; + params->z_addr = Z_BASE; + params->n_size = VLEN; + + /* ------------------------------------------------------------------ + * Step 3: trigger Spatz and wait for completion via WFE. + * + * spatz_run_task() sets TASKBIN, writes START=1, then polls START + * until Spatz firmware clears it (task acknowledged). + * eu_wait_spatz_wfe() then sleeps until the DONE event asserts. + * ------------------------------------------------------------------ */ + printf("[CV32] Launching Spatz vecsum16 (N=%d)...\n", VLEN); + spatz_pass_params(PARAMS_BASE); + spatz_run_task(VECSUM16_TASK); + eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK); + + if (spatz_get_exit_code() != 0) { + printf("[CV32] ERROR: Spatz returned exit code 0x%08x\n", + spatz_get_exit_code()); + errors++; + } else { + printf("[CV32] Spatz vecsum16 done OK.\n"); + } + + spatz_clk_dis(); + + /* ------------------------------------------------------------------ + * Step 4: boot PULP, dispatch the cluster task and wait for DONE via WFE. + * + * eu_init() fully resets the EU after the Spatz wait. The cluster done + * event is armed after boot and before START is written. + * ------------------------------------------------------------------ */ + eu_init(); + + printf("[CV32] Booting PULP cluster cores...\n"); + cluster_boot(PULP_BINARY_START); + cluster_arm_done_event(); + cluster_dispatch_task(HELLO_SPATZ_PULP_TASK, 0xFFu); + cluster_wait_done_eu(); + + /* ------------------------------------------------------------------ + * Step 5: collect per-core partial sums from L2 and verify. 
+ * ------------------------------------------------------------------ */ + printf("[CV32] Verifying results...\n"); + uint32_t grand_total = 0; + for (int i = 0; i < PULP_CORE_COUNT; i++) { + uint32_t partial = mmio32(RESULT_BASE + 4 * i); + grand_total += partial; + if (partial != GOLDEN_PARTIAL) { + printf(" core %d partial=0x%08x expected=0x%08x FAIL\n", + i, partial, (unsigned)GOLDEN_PARTIAL); + errors++; + } else { + printf(" core %d partial=0x%08x PASS\n", i, partial); + } + } + + if (grand_total == GOLDEN_TOTAL) { + printf("[CV32] Grand total=0x%08x PASS\n", grand_total); + } else { + printf("[CV32] Grand total=0x%08x expected=0x%08x FAIL\n", + grand_total, (unsigned)GOLDEN_TOTAL); + errors++; + } + + return (int)errors; +} diff --git a/sw/tests/cluster_tests/hello_spatz_pulp/pulp_task/hello_spatz_pulp_task.c b/sw/tests/cluster_tests/hello_spatz_pulp/pulp_task/hello_spatz_pulp_task.c new file mode 100644 index 00000000..3cdcd0f2 --- /dev/null +++ b/sw/tests/cluster_tests/hello_spatz_pulp/pulp_task/hello_spatz_pulp_task.c @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2026 Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Niccolò Giuliani, Fondazione Chips-IT + */ + +/* + * hello_spatz_pulp — PULP cluster-core task. + * + * Dispatched via cluster_dispatch_task(HELLO_SPATZ_PULP_TASK, 0xFF). 
Each of the + * 8 cluster cores reads its 1/8th slice of the Z vector (filled by Spatz on + * the CV32 side: Z[i] = 1.0 + 2.0 = 3.0 FP16 = 0x4200), sums the raw uint16 + * bit-patterns into a uint32 partial sum, and writes the result to its + * per-core L2 slot. On return the trap handler writes 1 to PULP_DONE. + * + * Memory layout (shared with main.c): + * Z_BASE = L1_BASE + 0x00002000 (256 FP16 elements, Spatz output) + * RESULT_BASE = L2_BASE + 0x00060000 (8 × uint32, one per cluster core) + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "cluster_utils.h" + +#define VLEN 256 +#define Z_BASE (L1_BASE + 0x00002000) +#define RESULT_BASE (L2_BASE + 0x00060000) + +void hello_spatz_pulp_task(void *data) { /* Cluster task entry point; data is unused. Sums this core's VLEN / PULP_CORE_COUNT consecutive 16-bit words of Z and stores the sum in the core's RESULT_BASE slot. */ + (void)data; + + uint32_t local_id = cluster_core_id(); + uint32_t chunk = VLEN / PULP_CORE_COUNT; /* 32 elements per core */ + uint32_t start = local_id * chunk; + + /* Sum raw FP16 bit-patterns in this core's slice of Z. */ + uint32_t partial_sum = 0; + for (uint32_t i = start; i < start + chunk; i++) + partial_sum += mmio16(Z_BASE + 2 * i); + + /* Write result to per-core L2 slot. 
*/ + mmio32(RESULT_BASE + 4 * local_id) = partial_sum; /* NOTE(review): the slot index depends only on local_id, so cores with the same local_id in different tiles would write the same L2 word - confirm this task is only run on one tile. */ + + if (local_id == 0) + printf("[PULP core 0] partial_sum=0x%08x\n", partial_sum); +} diff --git a/sw/tests/eu_tests/event_unit_test.c b/sw/tests/eu_tests/event_unit_test.c index 2518aac9..6faf7c3c 100644 --- a/sw/tests/eu_tests/event_unit_test.c +++ b/sw/tests/eu_tests/event_unit_test.c @@ -40,9 +40,9 @@ #define W_BASE_2 (L1_BASE + 0x00022048) #define Y_BASE_2 (L1_BASE + 0x00026048) -#define Z_BASE_1 (L2_BASE + 0x00001000) -#define Z_BASE_2 (L2_BASE + 0x00005000) -#define Z_BASE_4 (L2_BASE + 0x0000D000) +#define Z_BASE_1 (L2_BASE + 0x00042000) +#define Z_BASE_2 (L2_BASE + 0x00046000) +#define Z_BASE_4 (L2_BASE + 0x0004A000) #define DMA_BUFFER_1 (L1_BASE + 0x00036048) #define DMA_BUFFER_2 (L1_BASE + 0x0003A048) diff --git a/sw/tests/eu_tests/idma_test_event_unit.c b/sw/tests/eu_tests/idma_test_event_unit.c index 7b9112fc..dbe3f94d 100644 --- a/sw/tests/eu_tests/idma_test_event_unit.c +++ b/sw/tests/eu_tests/idma_test_event_unit.c @@ -30,8 +30,8 @@ #define X_BASE (L1_BASE + 0x00012048) #define Y_BASE (L1_BASE + 0x00016048) -#define Z_BASE (L2_BASE + 0x00001000) -#define W_BASE (L2_BASE + 0x00005000) +#define Z_BASE (L2_BASE + 0x00042000) /* offset safe: clear of instrram (0xCC000000..0xCC007FFF) and dataram header */ +#define W_BASE (L2_BASE + 0x00046000) #define M_SIZE (96) #define N_SIZE (64) @@ -101,7 +101,6 @@ int main(void) { printf("iDMA moving data from L2 to L1...\n"); uint32_t transfer_id_1 = idma_L2ToL1(src_addr, dst_addr, len); - if (USE_WFE) { eu_idma_wait_a2o_completion(EU_WAIT_MODE_WFE); @@ -136,14 +135,15 @@ int main(void) { printf("src_std_3: 0x%8x\n", src_std_3); printf("reps_3: 0x%8x\n", reps_3); #endif - + // Clear Event Unit and ensure O2A mask is enabled eu_clear_events(0xFFFFFFFF); 
eu_enable_events(EU_IDMA_O2A_DONE_MASK); - printf("iDMA moving data from L1 to L2...\n"); uint32_t transfer_id_2 = idma_L1ToL2(src_addr, dst_addr, len); + printf("iDMA moving data from L1 to L2...\n"); + if 
(USE_WFE) { eu_idma_wait_o2a_completion(EU_WAIT_MODE_WFE); @@ -153,31 +153,34 @@ int main(void) { } #ifdef CONCURRENT - // Setup concurrent transfer L2->L1 to Y_BASE - dst_addr = (uint32_t)Y_BASE; - src_addr = (uint32_t)Z_BASE; - len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element + // Setup concurrent transfers: L2->L1 to Y_BASE and L1->L2 to W_BASE. + uint32_t a2o_dst_addr = (uint32_t)Y_BASE; + uint32_t a2o_src_addr = (uint32_t)Z_BASE; + uint32_t o2a_dst_addr = (uint32_t)W_BASE; + uint32_t o2a_src_addr = (uint32_t)X_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element #if VERBOSE > 10 - printf("dst_addr: 0x%8x (Y_BASE)\n", dst_addr); - printf("src_addr: 0x%8x (Z_BASE)\n", src_addr); + printf("a2o_dst_addr: 0x%8x (Y_BASE)\n", a2o_dst_addr); + printf("a2o_src_addr: 0x%8x (Z_BASE)\n", a2o_src_addr); + printf("o2a_dst_addr: 0x%8x (W_BASE)\n", o2a_dst_addr); + printf("o2a_src_addr: 0x%8x (X_BASE)\n", o2a_src_addr); printf("len: %0d\n", len); #endif - // Start both transfers concurrently - uint32_t transfer_id_o2a = transfer_id_2; // OBI2AXI (L1->L2) already started - uint32_t transfer_id_a2o = idma_L2ToL1(src_addr, dst_addr, len); // Start AXI2OBI (L2->L1) - // Clear Event Unit and ensure both masks are enabled eu_clear_events(0xFFFFFFFF); eu_enable_events(EU_IDMA_ALL_DONE_MASK); + uint32_t transfer_id_o2a = idma_L1ToL2(o2a_src_addr, o2a_dst_addr, len); + uint32_t transfer_id_a2o = idma_L2ToL1(a2o_src_addr, a2o_dst_addr, len); + printf("iDMA moving concurrently data from L1 to L2 and from L2 to L1...\n"); if (USE_WFE) { - eu_idma_wait_completion(EU_WAIT_MODE_WFE); + eu_multi_wait_all(0, 1, 1, 0, EU_WAIT_MODE_WFE); printf("Detected WFE...\n"); } else { - eu_idma_wait_completion(EU_WAIT_MODE_POLLING); + eu_multi_wait_all(0, 1, 1, 0, EU_WAIT_MODE_POLLING); } #endif diff --git a/sw/tests/hello_mesh.c b/sw/tests/hello_mesh.c index 48e5c5d9..4e51666e 100644 --- a/sw/tests/hello_mesh.c +++ b/sw/tests/hello_mesh.c @@ -23,7 +23,7 @@ #include "magia_utils.h" 
int main(void) { - // h_pprintf("Hello World! it is hartid "); pprintf(ds(get_hartid())); pprintln; + //h_pprintf("Hello World! it is hartid "); pprintf(ds(get_hartid())); pprintln; printf("Hello World! it is tile/hart %0d\n", get_hartid()); return 0; diff --git a/sw/tests/mm_tests/idma_test_mm.c b/sw/tests/mm_tests/idma_test_mm.c index 07554c90..c097a125 100644 --- a/sw/tests/mm_tests/idma_test_mm.c +++ b/sw/tests/mm_tests/idma_test_mm.c @@ -27,8 +27,8 @@ #define X_BASE (L1_BASE + 0x00012048) #define Y_BASE (L1_BASE + 0x00016048) -#define Z_BASE (L2_BASE + 0x00001000) -#define W_BASE (L2_BASE + 0x00005000) +#define Z_BASE (L2_BASE + 0x00042000) /* offset safe: clear of instrram (0xCC000000..0xCC007FFF) and dataram header */ +#define W_BASE (L2_BASE + 0x00046000) #define M_SIZE (96) #define N_SIZE (64) diff --git a/sw/tests/spatz_tests/matmul_compare_spatz_test.c b/sw/tests/spatz_tests/matmul_compare_spatz_test.c index d0cffc10..cc3cbf2d 100644 --- a/sw/tests/spatz_tests/matmul_compare_spatz_test.c +++ b/sw/tests/spatz_tests/matmul_compare_spatz_test.c @@ -192,4 +192,4 @@ int main(void) { return mismatch_errors; -} +} \ No newline at end of file diff --git a/sw/utils/cluster_utils.h b/sw/utils/cluster_utils.h new file mode 100644 index 00000000..1b8d29fa --- /dev/null +++ b/sw/utils/cluster_utils.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2023-2026 ETH Zurich, University of Bologna and Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Bare-metal PULP Cluster Utilities for MAGIA. + * + * Two usage perspectives: + * + * CV32 (main core) — orchestrator: + * cluster_boot(binary) boot all PULP cores into the dispatcher + * loop (= pulp_init); polls PULP_READY. + * cluster_arm_done_event() clear/enable the CV32 EU done event + * cluster_dispatch_task() write NB_CORES_TO_WAIT, TASKBIN, START; + * returns once selected cores have ACK'd + * cluster_wait_done_polling() spin on the CV32 EU done event + * cluster_done_pending() non-blocking EU done-event check + * cluster_wait_done_eu() WFE on PULP_DONE (EU bit 12) + * cluster_stop() de-assert PULP CLK_EN + * + * PULP cluster core — worker: + * cluster_core_id() local index within the cluster (0..N-1) + * cluster_tile_id() tile index of this core + * cluster_chunk_offset/size() data partition helpers + * + * Hardware (obi_slave_ctrl_cluster.sv) memory map @ PULP_CTRL_BASE = 0x1740: + * see magia_tile_utils.h. EU bit 12 = PULP_DONE quorum. 
+ */ + +#ifndef CLUSTER_UTILS_H +#define CLUSTER_UTILS_H + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "magia_pulp_utils.h" +#include "event_unit_utils.h" + +// ============================================================================= +// CV32 (main core) — orchestrator API +// ============================================================================= + +static inline void cluster_boot(uint32_t binary_start) { /* Thin wrapper around pulp_init(). */ + pulp_init(binary_start); +} + +static inline void cluster_dispatch_task(uint32_t task_addr, uint32_t core_mask) { /* Thin wrapper around pulp_run_task(). */ + pulp_run_task(task_addr, core_mask); +} + +static inline void cluster_dispatch_task_with_params(uint32_t task_addr, + uint32_t params_ptr, + uint32_t core_mask) { /* Thin wrapper around pulp_run_task_with_params(). */ + pulp_run_task_with_params(task_addr, params_ptr, core_mask); +} + +static inline void cluster_stop(void) { /* Thin wrapper around pulp_clk_dis(). */ + pulp_clk_dis(); +} + +static inline void cluster_wait_done_polling(void) { /* Waits in polling mode; the value returned by eu_cluster_done_wait() is discarded. */ + (void)eu_cluster_done_wait(EU_WAIT_MODE_POLLING); +} + +static inline uint32_t cluster_done_pending(void) { /* Returns 1 when eu_check_events(EU_CLUSTER_DONE_MASK) is non-zero, otherwise 0. */ + return eu_check_events(EU_CLUSTER_DONE_MASK) != 0; +} + +static inline void cluster_arm_done_event(void) { /* Thin wrapper around eu_cluster_done_init(). */ + eu_cluster_done_init(); +} + +static inline void cluster_wait_done_eu(void) { /* Waits in WFE mode; the value returned by eu_cluster_done_wait() is discarded. 
*/ + (void)eu_cluster_done_wait(EU_WAIT_MODE_WFE); +} + +// ============================================================================= +// Cluster core (worker) — identity helpers +// ============================================================================= + +static inline uint32_t cluster_core_id(void) { /* (mhartid - PULP_HARTID_BASE) % PULP_CORE_COUNT. NOTE(review): uses the raw mhartid CSR value - confirm it is already numbered from PULP_HARTID_BASE on every supported core. */ + uint32_t hartid; + asm volatile("csrr %0, mhartid" : "=r"(hartid)); + return (hartid - PULP_HARTID_BASE) % PULP_CORE_COUNT; +} + +static inline uint32_t cluster_tile_id(void) { /* (mhartid - PULP_HARTID_BASE) / PULP_CORE_COUNT, computed from the raw mhartid CSR value. */ + uint32_t hartid; + asm volatile("csrr %0, mhartid" : "=r"(hartid)); + return (hartid - PULP_HARTID_BASE) / PULP_CORE_COUNT; +} + +// ============================================================================= +// Cluster core (worker) — data partitioning helpers +// 
============================================================================= + +static inline uint32_t cluster_chunk_offset(uint32_t total, uint32_t n_cores, + uint32_t core_id) { /* Start offset of core_id's chunk when total items are split into chunks of total / n_cores items (integer division, so n_cores must be non-zero). */ + return (total / n_cores) * core_id; +} + +static inline uint32_t cluster_chunk_size(uint32_t total, uint32_t n_cores, + uint32_t core_id) { /* Chunk length for core_id: total / n_cores, except for core_id == n_cores - 1, which also takes the remainder. n_cores must be non-zero. */ + uint32_t base = total / n_cores; + return (core_id == n_cores - 1) ? (total - base * core_id) : base; +} + +#endif /* CLUSTER_UTILS_H */ diff --git a/sw/utils/event_unit_utils.h b/sw/utils/event_unit_utils.h index a3156fab..3bdf421e 100644 --- a/sw/utils/event_unit_utils.h +++ b/sw/utils/event_unit_utils.h @@ -118,6 +118,11 @@ #define EU_SPATZ_START_MASK (1 << EU_SPATZ_START_BIT) #define EU_SPATZ_ALL_MASK (EU_SPATZ_DONE_MASK | EU_SPATZ_START_MASK) +// Bare-metal PULP cluster completion event generated by the tile CSR. 
+#define EU_CLUSTER_DONE_BIT 12 +#define EU_CLUSTER_DONE_MASK (1 << EU_CLUSTER_DONE_BIT) +#define EU_CLUSTER_EVT_MASK EU_CLUSTER_DONE_MASK + // Wait modes typedef enum { EU_WAIT_MODE_POLLING = 0, @@ -128,17 +133,25 @@ typedef enum { // LOW-LEVEL HAL (PULP-compatible evt_read32) //============================================================================= -// evt_read32: blocking read with p.elw instruction +/* cv.elw on CV32E40P, p.elw on older PULP toolchains */ static inline unsigned int evt_read32(unsigned int base, unsigned int offset) { unsigned int value; unsigned int addr = base + offset; - // Direct p.elw inline assembly for PULP cores (RI5CY, CV32E40P) +#if defined(__cv32e40p__) || defined(CV32E40P) + __asm__ __volatile__ ( + "cv.elw %0, 0(%1)" + : "=r" (value) + : "r" (addr) + : "memory" + ); +#else __asm__ __volatile__ ( "p.elw %0, 0(%1)" : "=r" (value) : "r" (addr) : "memory" ); +#endif return value; } @@ -238,6 +251,31 @@ static inline unsigned int eu_evt_maskWaitAndClr(unsigned int evtMask) { return result; } +//============================================================================= +// PULP CLUSTER FUNCTIONS 
+//============================================================================= + +static inline void eu_cluster_done_init(void) { + /* Absolute write (not OR) — clears stale RedMulE/iDMA events that + would otherwise wake the WFE before the real PULP_DONE. */ + mmio32(EU_CORE_MASK) = 0x00000000; /* disable ALL events (absolute) */ + eu_clear_events(0xFFFFFFFF); /* drop any stale latched events */ + eu_enable_events(EU_CLUSTER_DONE_MASK); /* enable ONLY cluster-done (b12) */ +} + +static inline uint32_t eu_cluster_done_wait(eu_wait_mode_t mode) { /* Enables the cluster-done event, then returns whatever eu_wait_events() returns for it in the requested wait mode. */ + eu_enable_events(EU_CLUSTER_DONE_MASK); + return eu_wait_events(EU_CLUSTER_DONE_MASK, mode, 1000000); /* NOTE(review): 1000000 is presumably a timeout or iteration bound for eu_wait_events() - confirm, and confirm what the return value is when the bound is hit. */ +} + +static inline void eu_pulp_init(void) { /* Alias of eu_cluster_done_init(). */ + eu_cluster_done_init(); +} + +static inline uint32_t eu_pulp_wait(eu_wait_mode_t mode) { /* Alias of eu_cluster_done_wait(). */ + return eu_cluster_done_wait(mode); +} + //============================================================================= // REDMULE FUNCTIONS //============================================================================= diff --git a/sw/utils/magia_pulp_utils.h b/sw/utils/magia_pulp_utils.h new file mode 100644 index 00000000..b04bd519 --- /dev/null +++ b/sw/utils/magia_pulp_utils.h @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2026 ETH Zurich, University of Bologna and Fondazione Chips-IT + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Bare-metal PULP Cluster Utility Functions (CV32 control side). 
+ * + * Dynamic dispatch model (CV32 -> PULP cluster): + * 1. pulp_init(binary) boot all cores into the PULP dispatcher loop; + * polls PULP_READY until every core is armed. + * 2. pulp_run_task(task,mask) dispatch the task function to a subset of + * cores: writes NB_CORES_TO_WAIT = popcount(mask), + * TASKBIN = task, then PULP_START = mask which + * fires per-core MEI pulses. CV32 polls + * PULP_START until it self-clears (all ACKs). + * 3. cluster_wait_done_eu()/_polling() wait for DONE quorum (PULP_DONE). + * + * pulp_run_task_with_params() also writes PULP_DATA so the task receives a + * context pointer as its first argument (a0 in the trap handler). + * + * Register map: see magia_tile_utils.h (PULP_CTRL_BASE). + */ +#ifndef MAGIA_PULP_UTILS_H +#define MAGIA_PULP_UTILS_H + +#include +#include "magia_tile_utils.h" + +/* ---- Low-level register helpers ---------------------------------------- */ + +static inline void pulp_clk_en(void) { mmio32(PULP_CLK_EN) = 1; } +static inline void pulp_clk_dis(void) { mmio32(PULP_CLK_EN) = 0; } + +static inline void pulp_set_binary(uint32_t addr) { + mmio32(PULP_BINARY) = addr; +} + +static inline void pulp_set_nb_cores_to_wait(uint32_t nb_cores) { + mmio32(PULP_NB_CORES_TO_WAIT) = nb_cores; +} + +static inline void pulp_set_func(uint32_t task_addr) { + mmio32(PULP_TASKBIN) = task_addr; +} + +static inline void pulp_pass_params(uint32_t params_ptr) { + mmio32(PULP_DATA) = params_ptr; +} + +static inline uint32_t _pulp_popcount(uint32_t x) { + x = x - ((x >> 1) & 0x55555555u); + x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); + x = (x + (x >> 4)) & 0x0f0f0f0fu; + return (x * 0x01010101u) >> 24; +} + +/* ---- High-level dispatch API ------------------------------------------- */ + +/** + * @brief Boot the PULP cluster: write the binary entry point, enable all + * cores (CLK_EN broadcast), then wait until every core has armed its + * dispatcher (PULP_READY == 1). 
+ */ +static inline void pulp_init(uint32_t binary_start) { + pulp_set_binary(binary_start); + pulp_clk_en(); + while ((mmio32(PULP_READY) & 1u) == 0u) { } +} + +/** + * @brief Dispatch @p task_addr to the cores selected by @p core_mask. + * Returns once every selected core has ACK'd the start (write-0 to + * PULP_START), i.e. once the cores have entered the task function. + * Use cluster_wait_done_eu()/cluster_wait_done_polling() for completion. + */ +static inline void pulp_run_task(uint32_t task_addr, uint32_t core_mask) { + pulp_set_nb_cores_to_wait(_pulp_popcount(core_mask)); + pulp_set_func(task_addr); + mmio32(PULP_START) = core_mask; + while (mmio32(PULP_START) != 0u) { } +} + +/** + * @brief Dispatch a task with a context pointer passed as first argument. + */ +static inline void pulp_run_task_with_params(uint32_t task_addr, + uint32_t params_ptr, + uint32_t core_mask) { + pulp_pass_params(params_ptr); + pulp_run_task(task_addr, core_mask); +} + +#endif /* MAGIA_PULP_UTILS_H */ diff --git a/sw/utils/magia_tile_utils.h b/sw/utils/magia_tile_utils.h index 88783ad3..d2e15859 100644 --- a/sw/utils/magia_tile_utils.h +++ b/sw/utils/magia_tile_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna and Fondazione Chips-IT * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,7 +40,39 @@ #define EVENT_UNIT_BASE (0x00000700) #define EVENT_UNIT_END (0x000016FF) #define SPATZ_CTRL_BASE (0x00001700) -#define SPATZ_CTRL_END (0x000017FF) +#define SPATZ_CTRL_END (0x0000173F) +/* PULP Cluster Control registers (tile_csr + 0x40), bare-metal dispatch model + * +0x00 PULP_CLK_EN : R/W broadcast enable. CV32 writes 1 to start + * ALL cores fetching from PULP_BINARY; writes 0 + * to disable. Writes also reset READY counter. 
+ * +0x04 PULP_BINARY : entry point address (boot vector) for all + * cluster cores + * +0x08 PULP_NB_CORES_TO_WAIT : popcount of dispatch mask (ACK + DONE quorum) + * +0x0C PULP_DONE : W = each PULP hart signals completion; + * after the quorum the CSR emits EU bit 12 + * +0x10 PULP_TASKBIN : R/W per-dispatch task function address read + * by each PULP core in its trap handler + * +0x14 PULP_DATA : R/W per-dispatch opaque data ptr passed as + * first argument to the task + * +0x18 PULP_START : R/W CV32 writes one-hot mask -> per-core + * 1-cycle MEI pulse; cores write 0 to ACK; the + * register self-clears when all N ACKs arrive + * +0x1C PULP_READY : R = 1 once N_CLUSTER_CORES cores have booted; + * W = each core posts 1 when its dispatcher is + * armed (counter increment) + */ +#define PULP_CTRL_BASE (0x00001740) +#define PULP_CLK_EN (PULP_CTRL_BASE + 0x00) +#define PULP_BINARY (PULP_CTRL_BASE + 0x04) +#define PULP_NB_CORES_TO_WAIT (PULP_CTRL_BASE + 0x08) +#define PULP_DONE (PULP_CTRL_BASE + 0x0C) +#define PULP_TASKBIN (PULP_CTRL_BASE + 0x10) +#define PULP_DATA (PULP_CTRL_BASE + 0x14) +#define PULP_START (PULP_CTRL_BASE + 0x18) +#define PULP_READY (PULP_CTRL_BASE + 0x1C) +#define PULP_CTRL_END (0x000017FF) +#define PULP_CORE_COUNT (8) +#define PULP_HARTID_BASE (32) /* 2 * NUM_CLUSTERS (16) */ #define RESERVED_START (0x00001800) #define RESERVED_END (0x0000FFFF) #define STACK_START (0x00010000) @@ -126,7 +158,7 @@ static inline void sentinel_end(){ } static inline void ccount_en(){ -#ifdef CV32E40X +#if defined(CV32E40X) || defined(CV32E40P) asm volatile("csrrci zero, 0x320, 0x1" ::); #else asm volatile("csrw 0x7E0, %0" :: "r"(0x1)); @@ -135,16 +167,16 @@ static inline void ccount_en(){ } static inline void ccount_dis(){ -#ifdef CV32E40X +#if defined(CV32E40X) || defined(CV32E40P) asm volatile("csrrsi zero, 0x320, 0x1" ::); #else - asm volatile("csrw 0x7E1, %0" :: "r"(0x0)); + asm volatile("csrw 0x7E1, %0" :: "r"(0x0)); #endif } static inline uint32_t 
get_cyclel(){ uint32_t cyclel; -#ifdef CV32E40X +#if defined(CV32E40X) || defined(CV32E40P) asm volatile("csrr %0, cycle" :"=r"(cyclel):); #else diff --git a/sw/utils/magia_utils.h b/sw/utils/magia_utils.h index 9443c9f8..22c42765 100644 --- a/sw/utils/magia_utils.h +++ b/sw/utils/magia_utils.h @@ -43,11 +43,23 @@ #define pprintf(x) ( psprint(get_hartid(), x)) #define pprintln ( pprintf("\n")) -static inline uint32_t get_hartid(){ +static inline uint32_t get_hartid(void) { uint32_t hartid; asm volatile("csrr %0, mhartid" :"=r"(hartid):); - return hartid; + #ifndef RI5CY + return hartid; + #else + // RI5CY mhartid CSR: { 21'b0, cluster_id_i[5:0], 1'b0, core_id_i[3:0] } + // cluster_id_i = mhartid_tile + 1 (which tile/cluster, 1-indexed; 0 = standalone main core) + // core_id_i = i (which core within the cluster, 0-indexed) + uint32_t cluster_id = (hartid >> 5) & 0x3F; // = tile_hartid + 1 (same for all cores in a tile) + uint32_t core_id = hartid & 0xF; // = i (unique per core within tile) + //printf ("%d\n",PULP_HARTID_BASE + (cluster_id - 1) * PULP_CORE_COUNT + core_id); + if (cluster_id == 0) + return core_id; // standalone main tile core + return PULP_HARTID_BASE + (cluster_id - 1) * PULP_CORE_COUNT + core_id; + #endif } static inline void amo_increment(volatile uint32_t addr, volatile uint32_t amnt){ @@ -84,21 +96,21 @@ char* utoa(unsigned int value, unsigned int base, char* result) { char* bs(uint32_t x) { uint32_t hartid = get_hartid(); - char *address = STR_BASE + L1_TILE_OFFSET*hartid; + char *address = (char *)(uintptr_t)(STR_BASE + L1_TILE_OFFSET * hartid); return utoa(x, 2, address); } char* ds(uint32_t x) { uint32_t hartid = get_hartid(); - char *address = STR_BASE + L1_TILE_OFFSET*hartid; + char *address = (char *)(uintptr_t)(STR_BASE + L1_TILE_OFFSET * hartid); return utoa(x, 10, address); } char* hs(uint32_t x) { uint32_t hartid = get_hartid(); - char *address = STR_BASE + L1_TILE_OFFSET*hartid; + char *address = (char *)(uintptr_t)(STR_BASE + 
L1_TILE_OFFSET * hartid); return utoa(x, 16, address); } diff --git a/sw/utils/tinyprintf.h b/sw/utils/tinyprintf.h index df789e78..9c9ad61f 100644 --- a/sw/utils/tinyprintf.h +++ b/sw/utils/tinyprintf.h @@ -106,8 +106,9 @@ For further details see source code. regs Kusti, 23.10.2004 */ +#define NULL 0 -void putf(char *null, char c) { +void putf(void *null, char c) { *(volatile int *) (0xFFFF0004) = (int)c; } @@ -377,7 +378,7 @@ static char a2u(char ch, const char **src, int base, unsigned int *nump) return ch; } -static void putchw(void *putp, putcf putf__, struct param *p) +static void putchw(void *putp, putcf putf, struct param *p) { char ch; int n = p->width; @@ -429,7 +430,7 @@ static void putchw(void *putp, putcf putf__, struct param *p) } } -void tfp_format(void *putp, putcf putf__, const char *fmt, va_list va) +void tfp_format(void *putp, putcf putf, const char *fmt, va_list va) { struct param p; #ifdef PRINTF_LONG_SUPPORT @@ -602,10 +603,21 @@ void tfp_format(void *putp, putcf putf__, const char *fmt, va_list va) } #if TINYPRINTF_DEFINE_TFP_PRINTF -static putcf stdout_putf; -static void *stdout_putp; +static putcf stdout_putf = NULL; +static void *stdout_putp = NULL; -void init_printf(void *putp, putcf putf__) +static inline putcf tfp_default_putf(void) +{ +#if defined(__riscv) + putcf fn; + __asm__ volatile ("lla %0, putf" : "=r"(fn)); + return fn; +#else + return putf; +#endif +} + +void init_printf(void *putp, putcf putf) { stdout_putf = putf; stdout_putp = putp; @@ -615,7 +627,8 @@ void tfp_printf(char *fmt, ...) { va_list va; va_start(va, fmt); - tfp_format(stdout_putp, stdout_putf, fmt, va); + putcf out = stdout_putf ? 
stdout_putf : tfp_default_putf(); + tfp_format(stdout_putp, out, fmt, va); va_end(va); } #endif diff --git a/target/sim/src/mesh/magia_tb.sv b/target/sim/src/mesh/magia_tb.sv index e9cd9c43..25d82ca5 100644 --- a/target/sim/src/mesh/magia_tb.sv +++ b/target/sim/src/mesh/magia_tb.sv @@ -30,11 +30,13 @@ module magia_tb; initial begin // Fetch plusargs or use safe (fail-fast) defaults - if (!$value$plusargs("INST_HEX=%s" , inst_hex)) inst_hex = ""; - if (!$value$plusargs("DATA_HEX=%s" , data_hex)) data_hex = ""; - if (!$value$plusargs("BOOT_ADDR=%h", boot_addr)) boot_addr = 0; + if (!$value$plusargs("INST_HEX=%s" , inst_hex)) inst_hex = ""; + if (!$value$plusargs("DATA_HEX=%s" , data_hex)) data_hex = ""; + if (!$value$plusargs("BOOT_ADDR=%h", boot_addr)) boot_addr = 0; - // Preload data (dummy L2 MEM) and instructions (I$) + // Single-binary flow: the CV32 ELF (@ 0xCC000000) embeds the optional + // Spatz/PULP task binaries in dedicated linker sections + // (.spatz_binary / .pulp_binary), so only one stimulus pair is preloaded. 
fixture.vip.inst_preload(inst_hex); fixture.vip.data_preload(data_hex); diff --git a/target/sim/src/tile/floo_axi_nw_mesh_1x2_pkg.sv b/target/sim/src/tile/floo_axi_nw_mesh_1x2_pkg.sv index edeb5783..3e8c9ee1 100644 --- a/target/sim/src/tile/floo_axi_nw_mesh_1x2_pkg.sv +++ b/target/sim/src/tile/floo_axi_nw_mesh_1x2_pkg.sv @@ -52,7 +52,7 @@ typedef struct packed { localparam sam_rule_t[SamNumRules-1:0] Sam = '{ '{ idx: '{x: 0, y: 0, port_id: 0}, start_addr: 32'hc0000000, - end_addr: 32'he0000000},// L20 + end_addr: 32'hffffffff},// L20 '{ idx: '{x: 1, y: 0, port_id: 0}, start_addr: 32'h00000000, end_addr: 32'h00100000} // MagiaTile0 @@ -73,7 +73,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_narrow_data_mst_addr_t; typedef logic[31:0] axi_narrow_data_mst_data_t; typedef logic[3:0] axi_narrow_data_mst_strb_t; -typedef logic[1:0] axi_narrow_data_mst_id_t; +typedef logic[2:0] axi_narrow_data_mst_id_t; typedef logic[0:0] axi_narrow_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_mst, axi_narrow_data_mst_req_t, axi_narrow_data_mst_rsp_t, axi_narrow_data_mst_addr_t, axi_narrow_data_mst_id_t, axi_narrow_data_mst_data_t, axi_narrow_data_mst_strb_t, axi_narrow_data_mst_user_t) @@ -81,7 +81,7 @@ typedef logic[0:0] axi_narrow_data_mst_user_t; typedef logic[31:0] axi_narrow_data_slv_addr_t; typedef logic[31:0] axi_narrow_data_slv_data_t; typedef logic[3:0] axi_narrow_data_slv_strb_t; -typedef logic[3:0] axi_narrow_data_slv_id_t; +typedef logic[5:0] axi_narrow_data_slv_id_t; typedef logic[0:0] axi_narrow_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_data_slv, axi_narrow_data_slv_req_t, axi_narrow_data_slv_rsp_t, axi_narrow_data_slv_addr_t, axi_narrow_data_slv_id_t, axi_narrow_data_slv_data_t, axi_narrow_data_slv_strb_t, axi_narrow_data_slv_user_t) @@ -89,7 +89,7 @@ typedef logic[0:0] axi_narrow_data_slv_user_t; typedef logic[31:0] axi_wide_data_mst_addr_t; typedef logic[255:0] axi_wide_data_mst_data_t; typedef logic[31:0] 
axi_wide_data_mst_strb_t; -typedef logic[1:0] axi_wide_data_mst_id_t; +typedef logic[2:0] axi_wide_data_mst_id_t; typedef logic[0:0] axi_wide_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_mst, axi_wide_data_mst_req_t, axi_wide_data_mst_rsp_t, axi_wide_data_mst_addr_t, axi_wide_data_mst_id_t, axi_wide_data_mst_data_t, axi_wide_data_mst_strb_t, axi_wide_data_mst_user_t) @@ -97,7 +97,7 @@ typedef logic[0:0] axi_wide_data_mst_user_t; typedef logic[31:0] axi_wide_data_slv_addr_t; typedef logic[255:0] axi_wide_data_slv_data_t; typedef logic[31:0] axi_wide_data_slv_strb_t; -typedef logic[1:0] axi_wide_data_slv_id_t; +typedef logic[2:0] axi_wide_data_slv_id_t; typedef logic[0:0] axi_wide_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_wide_data_slv, axi_wide_data_slv_req_t, axi_wide_data_slv_rsp_t, axi_wide_data_slv_addr_t, axi_wide_data_slv_id_t, axi_wide_data_slv_data_t, axi_wide_data_slv_strb_t, axi_wide_data_slv_user_t) @@ -106,13 +106,13 @@ typedef logic[0:0] axi_wide_data_slv_user_t; `FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t) localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 32, DataWidth: 32, - InIdWidth: 4, - OutIdWidth: 2, + InIdWidth: 6, + OutIdWidth: 3, UserWidth: 1}; localparam axi_cfg_t AxiCfgW = '{ AddrWidth: 32, DataWidth: 256, - InIdWidth: 2, - OutIdWidth: 2, + InIdWidth: 3, + OutIdWidth: 3, UserWidth: 1}; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_data_slv, axi_wide_data_slv, AxiCfgN, AxiCfgW, hdr_t) diff --git a/target/sim/src/tile/magia_tile_fixture.sv b/target/sim/src/tile/magia_tile_fixture.sv index 041ccf57..1f8e4714 100644 --- a/target/sim/src/tile/magia_tile_fixture.sv +++ b/target/sim/src/tile/magia_tile_fixture.sv @@ -82,7 +82,7 @@ module magia_tile_fixture; logic[magia_pkg::N_IRQ-1:0] irq; - logic debug_req; + logic[magia_tile_pkg::N_CLUSTER_CORES:0] debug_req; logic debug_havereset; logic debug_running; logic debug_halted; diff --git a/target/sim/src/tile/magia_tile_tb.sv b/target/sim/src/tile/magia_tile_tb.sv index 
fe0ea6ff..ee259aba 100644 --- a/target/sim/src/tile/magia_tile_tb.sv +++ b/target/sim/src/tile/magia_tile_tb.sv @@ -30,11 +30,13 @@ module magia_tile_tb; initial begin // Fetch plusargs or use safe (fail-fast) defaults - if (!$value$plusargs("INST_HEX=%s" , inst_hex)) inst_hex = ""; - if (!$value$plusargs("DATA_HEX=%s" , data_hex)) data_hex = ""; - if (!$value$plusargs("BOOT_ADDR=%h", boot_addr)) boot_addr = 0; + if (!$value$plusargs("INST_HEX=%s" , inst_hex)) inst_hex = ""; + if (!$value$plusargs("DATA_HEX=%s" , data_hex)) data_hex = ""; + if (!$value$plusargs("BOOT_ADDR=%h", boot_addr)) boot_addr = 0; - // Preload data (dummy L2 MEM) and instructions (I$) + // Single-binary flow: the CV32 ELF embeds the optional Spatz/PULP + // task binaries in dedicated sections (.spatz_binary / .pulp_binary) + // of instrram, so only one stimulus pair is preloaded. fixture.vip.inst_preload(inst_hex); fixture.vip.data_preload(data_hex); diff --git a/target/sim/src/tile/magia_tile_vip.sv b/target/sim/src/tile/magia_tile_vip.sv index 801e4a8b..339d3181 100644 --- a/target/sim/src/tile/magia_tile_vip.sv +++ b/target/sim/src/tile/magia_tile_vip.sv @@ -83,7 +83,7 @@ module magia_tile_vip output logic[magia_pkg::N_IRQ-1:0] irq, - output logic debug_req, + output logic[magia_tile_pkg::N_CLUSTER_CORES:0] debug_req, input logic debug_havereset, input logic debug_running, input logic debug_halted, @@ -110,7 +110,7 @@ module magia_tile_vip assign dm_exception_addr = '0; assign mhartid = '0; assign mimpid_patch = '0; - assign debug_req = 1'b0; + assign debug_req = '0; assign wu_wfe = 1'b0; assign ht_fsync_if_o[0].wake = 1'b0; assign ht_fsync_if_o[0].lvl = '0;