Introduction

Welcome to tt-mlir, Tenstorrent's open-source MLIR-based compiler infrastructure designed to optimize and deploy machine learning models and other computational workloads on Tenstorrent hardware. This documentation provides an overview of the key components, features, and usage of tt-mlir.

Architecture & Dialect Overview

tt-mlir is structured around several core dialects and components that carry the compilation process from high-level representations down to low-level code generation. The architecture diagram below illustrates Tenstorrent’s overall compiler flow and shows where the dialect abstractions defined within tt-mlir fit.

tt-mlir is a library of reusable components that can be used to build compilers targeting Tenstorrent hardware. Other compiler technologies may choose to leverage whichever abstractions best suit their needs.
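
For example, a frontend can drive these abstractions through the in-tree ttmlir-opt and ttmlir-translate tools. The sketch below is illustrative, assuming the TTIR example further down is saved as simple_linear.mlir; pipeline names come from the in-tree tools and may vary between versions (see ttmlir-opt --help):

# Lower TTIR to the TTNN dialect, then serialize the result to a TTNN
# flatbuffer that the tt-mlir runtime can execute.
ttmlir-opt --ttir-to-ttnn-backend-pipeline simple_linear.mlir -o simple_linear_ttnn.mlir
ttmlir-translate --ttnn-to-flatbuffer simple_linear_ttnn.mlir -o simple_linear.ttnn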

[Architecture diagram: StableHLO lowers into TTIR; from TTIR, tt-mlir lowers either to TTNN (emitted as C++ via EmitC or serialized to a TTNN flatbuffer) or through D2M to TTKernel and TTMetal (serialized to a TTMetal flatbuffer); the tt-mlir runtime executes these artifacts on top of tt-metalium (tt-nn, tt-metal).]

// TTIR: Named ops on tensors (akin to shlo, tosa, etc.)
//
// This should be the default IR for users who need a higher-level
// abstraction over tensors.
//
// Example IR:
func.func @simple_linear(
  %arg0: tensor<64x128xbf16>,
  %arg1: tensor<128x64xbf16>,
  %bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> {
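  // TTIR ops use destination-passing style: the destination tensor is
  // created with ttir.empty and passed as the trailing operand.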
  %0 = ttir.empty() : tensor<64x64xbf16>
  %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16>
  return %1 : tensor<64x64xbf16>
}

// TTNN: Named ops on tensors, modeling the tt-nn API
//
// The TTNN dialect models the tt-nn API (from the tt-metalium project)
// as closely as possible. It is intended to be a high-level IR over tensors.
//
// Example IR:
#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<4x2x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x2x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
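// Each tensor type carries a #ttnn_layout attribute describing its grid,
// tiled element type, memory space (#dram), and interleaved layout.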
func.func @simple_linear(
    %arg0: tensor<64x128xbf16, #ttnn_layout>,
    %arg1: tensor<128x64xbf16, #ttnn_layout1>,
    %arg2: tensor<64x64xbf16, #ttnn_layout2>
  ) -> tensor<64x64xbf16, #ttnn_layout2> {
  %0 = "ttnn.linear"(%arg0, %arg1, %arg2) <{
    transpose_a = false,
    transpose_b = false
  }> : (tensor<64x128xbf16, #ttnn_layout>, tensor<128x64xbf16, #ttnn_layout1>, tensor<64x64xbf16, #ttnn_layout2>) -> tensor<64x64xbf16, #ttnn_layout2>
  "ttnn.deallocate"(%arg2) <{force = false}> : (tensor<64x64xbf16, #ttnn_layout2>) -> ()
  "ttnn.deallocate"(%arg1) <{force = false}> : (tensor<128x64xbf16, #ttnn_layout1>) -> ()
  "ttnn.deallocate"(%arg0) <{force = false}> : (tensor<64x128xbf16, #ttnn_layout>) -> ()
  return %0 : tensor<64x64xbf16, #ttnn_layout2>
}

// D2M: Generic compute dialect (akin to linalg)
//
// The D2M dialect models generic compute on tensors and memrefs,
// similar to linalg.generic, but with constructs that map well to
// the Tenstorrent execution model (e.g., sharded tensors, grids,
// explicit datamovement).
//
// Example IR:
#layout = #ttcore.metal_layout<logical_shape = 64x128, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#layout1 = #ttcore.metal_layout<logical_shape = 128x96, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#layout2 = #ttcore.metal_layout<logical_shape = 64x96, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#parallel = #ttcore.iterator_type<parallel>
#reduction = #ttcore.iterator_type<reduction>
func.func @simple_matmul(
    %arg0: tensor<64x128xbf16>,
    %arg1: tensor<128x96xbf16>
  ) -> tensor<64x96xbf16> {
  %0 = d2m.empty() : tensor<64x96xbf16>
  %1 = d2m.empty() : tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>
  %2 = d2m.to_layout %arg0, %1 : tensor<64x128xbf16> into tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout> -> tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>
  %3 = d2m.empty() : tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>
  %4 = d2m.to_layout %arg1, %3 : tensor<128x96xbf16> into tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1> -> tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>
  %5 = d2m.empty() : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
  %6 = d2m.to_layout %0, %5 : tensor<64x96xbf16> into tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2> -> tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
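  // The generic op launches a compute thread on a 1x1 grid: it waits on
  // the input circular buffers, reserves space in the output buffer, and
  // performs the tiled matmul as a linalg.generic over d2m.tile_matmul.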
  %7 = d2m.generic {
    block_factors = [1, 1, 1],
    grid = #ttcore.grid<1x1>,
    indexing_maps = [#map, #map1, #map2],
    iterator_types = [#parallel, #parallel, #reduction],
    threads = [#d2m.thread<compute>]
  } ins(%2, %4 : tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>, tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>)
   outs(%6 : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>)  {
  ^compute0(%cb0: !d2m.cb<tensor<2x4x!ttcore.tile<32x32, bf16>>>, %cb1: !d2m.cb<tensor<4x3x!ttcore.tile<32x32, bf16>>>, %cb2: !d2m.cb<tensor<2x3x!ttcore.tile<32x32, bf16>>>):
    %10 = d2m.wait %cb0 : <tensor<2x4x!ttcore.tile<32x32, bf16>>> -> tensor<2x4x!ttcore.tile<32x32, bf16>>
    %11 = d2m.wait %cb1 : <tensor<4x3x!ttcore.tile<32x32, bf16>>> -> tensor<4x3x!ttcore.tile<32x32, bf16>>
    %12 = d2m.reserve %cb2 : <tensor<2x3x!ttcore.tile<32x32, bf16>>> -> tensor<2x3x!ttcore.tile<32x32, bf16>>
    %13 = linalg.generic {
      indexing_maps = [#map, #map1, #map2],
      iterator_types = ["parallel", "parallel", "reduction"]
    } ins(%10, %11 : tensor<2x4x!ttcore.tile<32x32, bf16>>, tensor<4x3x!ttcore.tile<32x32, bf16>>) outs(%12 : tensor<2x3x!ttcore.tile<32x32, bf16>>) {
    ^bb0(%in: !ttcore.tile<32x32, bf16>, %in_0: !ttcore.tile<32x32, bf16>, %out: !ttcore.tile<32x32, bf16>):
      %14 = "d2m.tile_matmul"(%in, %in_0, %out) : (!ttcore.tile<32x32, bf16>, !ttcore.tile<32x32, bf16>, !ttcore.tile<32x32, bf16>) -> !ttcore.tile<32x32, bf16>
      linalg.yield %14 : !ttcore.tile<32x32, bf16>
    } -> tensor<2x3x!ttcore.tile<32x32, bf16>>
    d2m.yield %13 : (tensor<2x3x!ttcore.tile<32x32, bf16>>)
  } : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
  %8 = d2m.empty() : tensor<64x96xbf16>
  %9 = d2m.to_layout %7, %8 : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2> into tensor<64x96xbf16> -> tensor<64x96xbf16>
  return %9 : tensor<64x96xbf16>
}

// TTKernel: tt-metal device kernel dialect.
//
// The TTKernel dialect models low-level kernels that run on Tenstorrent
// devices. It exposes concepts such as circular buffers, tile registers,
// NoC transactions, and explicit synchronization. It is intended to map
// 1:1 onto tt-metalium kernels.
//
// Example Datamovement Kernel IR:
func.func private @datamovement_kernel() attributes {
    ttkernel.arg_spec = #ttkernel.arg_spec<ct_args = [
      <arg_type = cb_port, operand_index = 0>,
      <arg_type = cb_port, operand_index = 1>
    ]>,
    ttkernel.thread = #ttkernel.thread<noc>
  } {
  %0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1024, bf16>
  %c1_i32 = arith.constant 1 : i32
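  // Producer side of the circular-buffer protocol: reserve space at the
  // back of the buffer, then publish it to the consumer with cb_push_back.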
  ttkernel.cb_reserve_back(%0, %c1_i32) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
  ttkernel.cb_push_back(%0, %c1_i32) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
  return
}
//
// Example Compute Kernel IR:
func.func private @compute_kernel8() attributes {
    ttkernel.arg_spec = #ttkernel.arg_spec<ct_args = [
      <arg_type = cb_port, operand_index = 0>,
      <arg_type = cb_port, operand_index = 1>,
      <arg_type = cb_port, operand_index = 2>
    ]>,
    ttkernel.thread = #ttkernel.thread<compute>
  } {
  %0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %2 = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %c0 = arith.constant 0 : index
  %c1_i32 = arith.constant 1 : i32
  %c1_i32_0 = arith.constant 1 : i32
  %c1_i32_1 = arith.constant 1 : i32
  %c1_i32_2 = arith.constant 1 : i32
  %c0_i32 = arith.constant 0 : i32
  "ttkernel.mm_block_init"(%0, %1, %2, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c0_3 = arith.constant 0 : index
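  // Loop over the K dimension: each iteration consumes one tile from each
  // input circular buffer, re-loads the running partial sum via copy_tile
  // (skipped on the first iteration), accumulates with matmul_block, and
  // pushes the updated partial tile through the output circular buffer.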
  scf.for %arg0 = %c0_3 to %c4 step %c1 {
    %c1_i32_4 = arith.constant 1 : i32
    ttkernel.cb_wait_front(%0, %c1_i32_4) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    %c1_i32_5 = arith.constant 1 : i32
    ttkernel.cb_wait_front(%1, %c1_i32_5) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    %c1_i32_6 = arith.constant 1 : i32
    ttkernel.cb_reserve_back(%2, %c1_i32_6) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.tile_regs_acquire() : () -> ()
    %3 = arith.cmpi ne, %arg0, %c0_3 : index
    scf.if %3 {
      ttkernel.copy_tile_init(%2) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>) -> ()
      ttkernel.copy_tile(%2, %c0_3, %c0_3) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index) -> ()
    }
    "ttkernel.mm_block_init_short"(%0, %1, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
    %c0_7 = arith.constant 0 : index
    %c0_8 = arith.constant 0 : index
    "ttkernel.experimental::matmul_block"(%0, %1, %c0_7, %c0_8, %c0, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0, %c1_i32_2) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32, i32) -> ()
    ttkernel.pack_tile(%c0_3, %2, %c0_3, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
    ttkernel.cb_pop_front(%0, %c1_i32_4) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.cb_pop_front(%1, %c1_i32_5) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.cb_push_back(%2, %c1_i32_6) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    %c1_i32_9 = arith.constant 1 : i32
    ttkernel.cb_wait_front(%2, %c1_i32_9) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.cb_pop_front(%2, %c1_i32_9) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
  }
  return
}

// TTMetal: tt-metal host/device interop dialect.
//
// The TTMetal dialect models host-side operations for managing Tenstorrent
// devices, such as buffer allocation, data transfer, and enqueueing programs.
//
// Example IR:
func.func @simple_matmul(
    %arg0: memref<64x128xbf16>,
    %arg1: memref<128x96xbf16>
  ) -> memref<64x96xbf16> {
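  // Host-side program: allocate device (L1) buffers, enqueue host-to-device
  // writes, enqueue programs (two datamovement kernels plus one compute
  // kernel over a core range), and read the result back to the host.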
  %0 = "ttmetal.create_buffer"() <{address = 13312 : i64}> : () -> memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
  %1 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_write_buffer"(%arg0, %1) : (memref<64x128xbf16>, memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.enqueue_program"(%1, %0, %1, %0) <{
    cb_ports = array<i64: 0, 1>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel0, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args<ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel1, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel2, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 2, 2>
  }> : (memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%1) : (memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  %2 = "ttmetal.create_buffer"() <{address = 11264 : i64}> : () -> memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
  %3 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_write_buffer"(%arg1, %3) : (memref<128x96xbf16>, memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.enqueue_program"(%3, %2, %3, %2) <{
    cb_ports = array<i64: 0, 1>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel3, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel4, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel5, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 2, 2>
  }> : (memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%3) : (memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  %4 = "ttmetal.create_buffer"() <{address = 9216 : i64}> : () -> memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
  %5 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_program"(%0, %2, %4, %5, %6, %4) <{
    cb_ports = array<i64: 0, 1, 2>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel6, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>, <semaphore[0]>, <semaphore[1]>, <semaphore[2]>, <semaphore[3]>, <buffer_address[0]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel7, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>, <semaphore[0]>, <semaphore[1]>, <semaphore[2]>, <semaphore[3]>, <buffer_address[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel8, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 3, 3>
  }> : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%6) : (memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%2) : (memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%5) : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%0) : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  %alloc = memref.alloc() : memref<64x96xbf16>
  %7 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_program"(%4, %7, %4, %7) <{
    cb_ports = array<i64: 0, 1>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel9, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel10, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel11, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 2, 2>
  }> : (memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%4) : (memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.enqueue_read_buffer"(%7, %alloc) : (memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<64x96xbf16>) -> ()
  "ttmetal.finish"() : () -> ()
  "ttmetal.deallocate_buffer"(%7) : (memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  return %alloc : memref<64x96xbf16>
}