Introduction
Welcome to tt-mlir, Tenstorrent's open-source MLIR-based compiler infrastructure designed to optimize and deploy machine learning models and other computational workloads on Tenstorrent hardware. This documentation provides an overview of the key components, features, and usage of tt-mlir.
Architecture & Dialect Overview
tt-mlir is structured around several core dialects and components that facilitate the compilation process from high-level representations to low-level code generation. While the architecture diagram below illustrates Tenstorrent’s compiler flow, it also reflects the various dialect abstractions defined within tt-mlir.
tt-mlir is a library of reusable components that can be used to build compilers targeting Tenstorrent hardware. Other compiler technologies may choose to leverage whichever abstractions best suit their needs.
// TTIR: Named ops on tensors (akin to shlo, tosa, etc)
//
// This should be the default IR for users who need a higher-level
// abstraction over tensors.
//
// Example IR:
// A linear layer (matmul-plus-bias, per the operand shapes:
// 64x128 * 128x64 + 64x64 -> 64x64) expressed as a single named TTIR op.
func.func @simple_linear(
%arg0: tensor<64x128xbf16>,
%arg1: tensor<128x64xbf16>,
%bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> {
// Destination-passing style: an empty destination tensor is allocated
// first and passed as the op's final operand.
%0 = ttir.empty() : tensor<64x64xbf16>
%1 = "ttir.linear"(%arg0, %arg1, %bias, %0) : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16>
return %1 : tensor<64x64xbf16>
}
// TTNN: Named ops on tensors (akin to shlo, tosa, etc)
//
// The TTNN dialect models the tt-nn API (from the tt-metalium project)
// as closely as possible. It is intended to be a high-level IR over tensors.
//
// Example IR:
// Layout aliases: each maps a 2D logical tensor onto a 1x1 device grid as a
// memref of 32x32 bf16 tiles, interleaved in DRAM.
#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<4x2x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x2x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
// The same linear layer as the TTIR example, but with every tensor type
// carrying an explicit TTNN layout encoding.
func.func @simple_linear(
%arg0: tensor<64x128xbf16, #ttnn_layout>,
%arg1: tensor<128x64xbf16, #ttnn_layout1>,
%arg2: tensor<64x64xbf16, #ttnn_layout2>
) -> tensor<64x64xbf16, #ttnn_layout2> {
%0 = "ttnn.linear"(%arg0, %arg1, %arg2) <{
transpose_a = false,
transpose_b = false
}> : (tensor<64x128xbf16, #ttnn_layout>, tensor<128x64xbf16, #ttnn_layout1>, tensor<64x64xbf16, #ttnn_layout2>) -> tensor<64x64xbf16, #ttnn_layout2>
// Inputs are explicitly deallocated once the result has been produced
// (force = false, i.e. a non-forced release).
"ttnn.deallocate"(%arg2) <{force = false}> : (tensor<64x64xbf16, #ttnn_layout2>) -> ()
"ttnn.deallocate"(%arg1) <{force = false}> : (tensor<128x64xbf16, #ttnn_layout1>) -> ()
"ttnn.deallocate"(%arg0) <{force = false}> : (tensor<64x128xbf16, #ttnn_layout>) -> ()
return %0 : tensor<64x64xbf16, #ttnn_layout2>
}
// D2M: Generic compute dialect (akin to linalg)
//
// The D2M dialect models generic compute on tensors and memrefs,
// similar to linalg.generic, but with constructs that map well to
// the Tenstorrent execution model (e.g., sharded tensors, grids,
// explicit datamovement).
//
// Example IR:
// Metal layouts: how each logical tensor is aligned to 32x32 tiles and placed
// in L1 memory.
#layout = #ttcore.metal_layout<logical_shape = 64x128, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#layout1 = #ttcore.metal_layout<logical_shape = 128x96, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#layout2 = #ttcore.metal_layout<logical_shape = 64x96, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
// Matmul indexing maps over (d0, d1, d2) = (m, n, k): A(m, k), B(k, n), C(m, n).
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
// NOTE(review): the parameters of the two iterator_type aliases, the
// #d2m.thread attribute, and the !d2m.cb types below were stripped by an
// HTML/markdown renderer (angle-bracket spans eaten as tags). They are
// restored here from the surviving iterator_types list, the ^compute0 block
// name, and the d2m.wait/d2m.reserve result types.
#parallel = #ttcore.iterator_type<parallel>
#reduction = #ttcore.iterator_type<reduction>
func.func @simple_matmul(
%arg0: tensor<64x128xbf16>,
%arg1: tensor<128x96xbf16>
) -> tensor<64x96xbf16> {
%0 = d2m.empty() : tensor<64x96xbf16>
// Convert each row-major host tensor into its tiled on-device layout.
%1 = d2m.empty() : tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>
%2 = d2m.to_layout %arg0, %1 : tensor<64x128xbf16> into tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout> -> tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>
%3 = d2m.empty() : tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>
%4 = d2m.to_layout %arg1, %3 : tensor<128x96xbf16> into tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1> -> tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>
%5 = d2m.empty() : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
%6 = d2m.to_layout %0, %5 : tensor<64x96xbf16> into tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2> -> tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
// Generic compute op: one compute thread over a 1x1 grid, with matmul
// indexing maps and (parallel, parallel, reduction) iterators.
%7 = d2m.generic {
block_factors = [1, 1, 1],
grid = #ttcore.grid<1x1>,
indexing_maps = [#map, #map1, #map2],
iterator_types = [#parallel, #parallel, #reduction],
threads = [#d2m.thread<compute>]
} ins(%2, %4 : tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>, tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>)
outs(%6 : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>) {
^compute0(%cb0: !d2m.cb<tensor<2x4x!ttcore.tile<32x32, bf16>>>, %cb1: !d2m.cb<tensor<4x3x!ttcore.tile<32x32, bf16>>>, %cb2: !d2m.cb<tensor<2x3x!ttcore.tile<32x32, bf16>>>):
// Acquire views of the input circular buffers and reserve the output.
%10 = d2m.wait %cb0 : <tensor<2x4x!ttcore.tile<32x32, bf16>>> -> tensor<2x4x!ttcore.tile<32x32, bf16>>
%11 = d2m.wait %cb1 : <tensor<4x3x!ttcore.tile<32x32, bf16>>> -> tensor<4x3x!ttcore.tile<32x32, bf16>>
%12 = d2m.reserve %cb2 : <tensor<2x3x!ttcore.tile<32x32, bf16>>> -> tensor<2x3x!ttcore.tile<32x32, bf16>>
// Tile-level matmul expressed with linalg.generic over tiles.
%13 = linalg.generic {
indexing_maps = [#map, #map1, #map2],
iterator_types = ["parallel", "parallel", "reduction"]
} ins(%10, %11 : tensor<2x4x!ttcore.tile<32x32, bf16>>, tensor<4x3x!ttcore.tile<32x32, bf16>>) outs(%12 : tensor<2x3x!ttcore.tile<32x32, bf16>>) {
^bb0(%in: !ttcore.tile<32x32, bf16>, %in_0: !ttcore.tile<32x32, bf16>, %out: !ttcore.tile<32x32, bf16>):
%14 = "d2m.tile_matmul"(%in, %in_0, %out) : (!ttcore.tile<32x32, bf16>, !ttcore.tile<32x32, bf16>, !ttcore.tile<32x32, bf16>) -> !ttcore.tile<32x32, bf16>
linalg.yield %14 : !ttcore.tile<32x32, bf16>
} -> tensor<2x3x!ttcore.tile<32x32, bf16>>
d2m.yield %13 : (tensor<2x3x!ttcore.tile<32x32, bf16>>)
} : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
// Convert the tiled result back to a row-major host tensor.
%8 = d2m.empty() : tensor<64x96xbf16>
%9 = d2m.to_layout %7, %8 : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2> into tensor<64x96xbf16> -> tensor<64x96xbf16>
return %9 : tensor<64x96xbf16>
}
// TTKernel: tt-metal device kernel dialect.
//
// The TTKernel dialect models low-level kernels that run on Tenstorrent
// devices. It exposes concepts such as circular buffers, tile registers,
// noc transactions and explicit synchronization. It is intended to be a
// 1-1 mapping to tt-metalium kernels.
//
// Example Datamovement Kernel IR:
// A minimal NOC (datamovement) thread kernel. The arg_spec declares two
// compile-time circular-buffer-port arguments; the thread attribute marks it
// as running on a NOC RISC core.
func.func private @datamovement_kernel() attributes {
ttkernel.arg_spec = #ttkernel.arg_spec<ct_args = [
<arg_type = cb_port, operand_index = 0>,
<arg_type = cb_port, operand_index = 1>
]>,
ttkernel.thread = #ttkernel.thread<noc>
} {
// Bind the two circular buffers passed as compile-time args.
%0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
%1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
%c1_i32 = arith.constant 1 : i32
// Reserve one slot in CB 0, then publish it to the consumer. (In a real
// kernel a NOC read would fill the slot between these two calls.)
ttkernel.cb_reserve_back(%0, %c1_i32) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
ttkernel.cb_push_back(%0, %c1_i32) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
return
}
//
// Example Compute Kernel IR:
// Compute-thread kernel for the matmul example: each loop iteration consumes
// one tile from each input CB, accumulates a matmul block, and publishes the
// result through CB 2.
func.func private @compute_kernel8() attributes {
ttkernel.arg_spec = #ttkernel.arg_spec<ct_args = [
<arg_type = cb_port, operand_index = 0>,
<arg_type = cb_port, operand_index = 1>,
<arg_type = cb_port, operand_index = 2>
]>,
ttkernel.thread = #ttkernel.thread<compute>
} {
// Bind the three circular buffers passed as compile-time args.
%0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
%1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
%2 = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
%c0 = arith.constant 0 : index
%c1_i32 = arith.constant 1 : i32
%c1_i32_0 = arith.constant 1 : i32
%c1_i32_1 = arith.constant 1 : i32
%c1_i32_2 = arith.constant 1 : i32
%c0_i32 = arith.constant 0 : i32
"ttkernel.mm_block_init"(%0, %1, %2, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c0_3 = arith.constant 0 : index
// Main loop: 4 iterations, one tile pair per iteration.
scf.for %arg0 = %c0_3 to %c4 step %c1 {
%c1_i32_4 = arith.constant 1 : i32
ttkernel.cb_wait_front(%0, %c1_i32_4) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
%c1_i32_5 = arith.constant 1 : i32
ttkernel.cb_wait_front(%1, %c1_i32_5) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
%c1_i32_6 = arith.constant 1 : i32
ttkernel.cb_reserve_back(%2, %c1_i32_6) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
ttkernel.tile_regs_acquire() : () -> ()
// On every iteration after the first, copy the previously packed tile
// from the output CB back into the tile registers (accumulation).
%3 = arith.cmpi ne, %arg0, %c0_3 : index
scf.if %3 {
ttkernel.copy_tile_init(%2) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>) -> ()
ttkernel.copy_tile(%2, %c0_3, %c0_3) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index) -> ()
}
"ttkernel.mm_block_init_short"(%0, %1, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
%c0_7 = arith.constant 0 : index
%c0_8 = arith.constant 0 : index
"ttkernel.experimental::matmul_block"(%0, %1, %c0_7, %c0_8, %c0, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0, %c1_i32_2) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32, i32) -> ()
// Pack the result tile into CB 2 and release the consumed input tiles.
ttkernel.pack_tile(%c0_3, %2, %c0_3, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
ttkernel.cb_pop_front(%0, %c1_i32_4) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
ttkernel.cb_pop_front(%1, %c1_i32_5) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
ttkernel.cb_push_back(%2, %c1_i32_6) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
// NOTE(review): this constant was originally printed as %c1_i32_4, which
// redefined the SSA name introduced earlier in this same block (invalid
// IR); renamed to %c1_i32_7 so the example parses.
%c1_i32_7 = arith.constant 1 : i32
ttkernel.cb_wait_front(%2, %c1_i32_7) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
ttkernel.cb_pop_front(%2, %c1_i32_7) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
}
return
}
// TTMetal: tt-metal host/device interop dialect.
//
// The TTMetal dialect models host-side operations for managing Tenstorrent
// devices, such as buffer allocation, data transfer, and enqueueing programs.
//
// Example IR:
// Host-side TTMetal program for the matmul example: stages each operand into
// L1, enqueues tilize/matmul/untilize programs, and reads the result back.
func.func @simple_matmul(
%arg0: memref<64x128xbf16>,
%arg1: memref<128x96xbf16>
) -> memref<64x96xbf16> {
// Stage %arg0 into L1 (%1) and run a program converting it to tiles (%0).
%0 = "ttmetal.create_buffer"() <{address = 13312 : i64}> : () -> memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
%1 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
"ttmetal.enqueue_write_buffer"(%arg0, %1) : (memref<64x128xbf16>, memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.enqueue_program"(%1, %0, %1, %0) <{
cb_ports = array<i64: 0, 1>,
kernelConfigs = [
#ttmetal.noc_config<@datamovement_kernel0, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args<ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
#ttmetal.noc_config<@datamovement_kernel1, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
#ttmetal.compute_config<@compute_kernel2, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
],
operandSegmentSizes = array<i32: 2, 2>
}> : (memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%1) : (memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
// Stage %arg1 into L1 (%3) and convert it to tiles (%2) the same way.
%2 = "ttmetal.create_buffer"() <{address = 11264 : i64}> : () -> memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
%3 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
"ttmetal.enqueue_write_buffer"(%arg1, %3) : (memref<128x96xbf16>, memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.enqueue_program"(%3, %2, %3, %2) <{
cb_ports = array<i64: 0, 1>,
kernelConfigs = [
#ttmetal.noc_config<@datamovement_kernel3, #ttmetal.core_range<0x0, 4x3>,#ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
#ttmetal.noc_config<@datamovement_kernel4, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
#ttmetal.compute_config<@compute_kernel5, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
],
operandSegmentSizes = array<i32: 2, 2>
}> : (memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%3) : (memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
// Output buffer (%4) plus two scratch buffers (%5, %6) for the matmul
// program.
%4 = "ttmetal.create_buffer"() <{address = 9216 : i64}> : () -> memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
%5 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>
// NOTE(review): this create_buffer was missing from the original listing
// (%6 was used below but never defined); reconstructed from the operand
// types of the enqueue_program that consumes it. The address is
// illustrative.
%6 = "ttmetal.create_buffer"() <{address = 5120 : i64}> : () -> memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>
"ttmetal.enqueue_program"(%0, %2, %4, %5, %6, %4) <{
cb_ports = array<i64: 0, 1, 2>,
kernelConfigs = [
#ttmetal.noc_config<@datamovement_kernel6, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>, <semaphore[0]>, <semaphore[1]>, <semaphore[2]>, <semaphore[3]>, <buffer_address[0]>]>, noc0>,
#ttmetal.noc_config<@datamovement_kernel7, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>, <semaphore[0]>, <semaphore[1]>, <semaphore[2]>, <semaphore[3]>, <buffer_address[1]>]>, noc1>,
#ttmetal.compute_config<@compute_kernel8, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>]>, hifi4, false, false, false, [default]>
],
operandSegmentSizes = array<i32: 3, 3>
}> : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%6) : (memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%2) : (memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%5) : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%0) : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
// Convert the tiled result (%4) back to row-major (%7) and read it to host.
%alloc = memref.alloc() : memref<64x96xbf16>
%7 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
"ttmetal.enqueue_program"(%4, %7, %4, %7) <{
cb_ports = array<i64: 0, 1>,
kernelConfigs = [
#ttmetal.noc_config<@datamovement_kernel9, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
#ttmetal.noc_config<@datamovement_kernel10, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
#ttmetal.compute_config<@compute_kernel11, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
],
operandSegmentSizes = array<i32: 2, 2>
}> : (memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.deallocate_buffer"(%4) : (memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
"ttmetal.enqueue_read_buffer"(%7, %alloc) : (memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<64x96xbf16>) -> ()
// Block until all enqueued work has completed before freeing and returning.
"ttmetal.finish"() : () -> ()
"ttmetal.deallocate_buffer"(%7) : (memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
return %alloc : memref<64x96xbf16>
}