Introduction

Welcome to tt-mlir, Tenstorrent's open-source MLIR-based compiler infrastructure designed to optimize and deploy machine learning models and other computational workloads on Tenstorrent hardware. This documentation provides an overview of the key components, features, and usage of tt-mlir.

Architecture & Dialect Overview

The compilation pipeline ingests StableHLO (or TTIR directly) and lowers through the tt-mlir dialects: TTIR lowers either to TTNN (modeling the tt-nn API) or through D2M to TTKernel and TTMetal (modeling tt-metalium). Final artifacts are emitted as EmitC C++, a TTNN flatbuffer, or a TTMetal flatbuffer, which the tt-mlir runtime executes on tt-nn / tt-metal.

The dialects illustrated below are: TTIR, TTNN, D2M, TTKernel, and TTMetal.
// TTIR: Named ops on tensors (akin to shlo, tosa, etc)
//
// This should be the default IR for users who need a higher-level
// abstraction over tensors.
//
// Example IR:
func.func @simple_linear(
  %arg0: tensor<64x128xbf16>,
  %arg1: tensor<128x64xbf16>,
  %bias: tensor<64x64xbf16>) -> tensor<64x64xbf16> {
  // Destination-passing style: allocate the output tensor first and pass it
  // as the final operand of the op that produces it.
  %0 = ttir.empty() : tensor<64x64xbf16>
  // linear fuses the matmul of %arg0 x %arg1 with the %bias addition,
  // writing the result into the destination operand %0.
  %1 = "ttir.linear"(%arg0, %arg1, %bias, %0) : (tensor<64x128xbf16>, tensor<128x64xbf16>, tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16>
  return %1 : tensor<64x64xbf16>
}

// TTNN: Named ops on tensors (akin to shlo, tosa, etc)
//
// The TTNN dialect models the tt-nn API (from the tt-metalium project)
// as closely as possible. It is intended to be a high-level IR over tensors.
//
// Example IR:
// Layout aliases: each encodes an affine map, a 1x1 core grid, a tiled
// bf16 memref shape, DRAM placement, and interleaved memory layout.
#ttnn_layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<4x2x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x2x!ttcore.tile<32x32, bf16>, #dram>, <interleaved>>
func.func @simple_linear(
    %arg0: tensor<64x128xbf16, #ttnn_layout>,
    %arg1: tensor<128x64xbf16, #ttnn_layout1>,
    %arg2: tensor<64x64xbf16, #ttnn_layout2>
  ) -> tensor<64x64xbf16, #ttnn_layout2> {
  // ttnn.linear mirrors the tt-nn API call: matmul of %arg0 x %arg1 plus the
  // %arg2 bias, with optional input transposes disabled here.
  %0 = "ttnn.linear"(%arg0, %arg1, %arg2) <{
    transpose_a = false,
    transpose_b = false
  }> : (tensor<64x128xbf16, #ttnn_layout>, tensor<128x64xbf16, #ttnn_layout1>, tensor<64x64xbf16, #ttnn_layout2>) -> tensor<64x64xbf16, #ttnn_layout2>
  // Explicitly release device buffers once they are no longer needed.
  "ttnn.deallocate"(%arg2) <{force = false}> : (tensor<64x64xbf16, #ttnn_layout2>) -> ()
  "ttnn.deallocate"(%arg1) <{force = false}> : (tensor<128x64xbf16, #ttnn_layout1>) -> ()
  "ttnn.deallocate"(%arg0) <{force = false}> : (tensor<64x128xbf16, #ttnn_layout>) -> ()
  return %0 : tensor<64x64xbf16, #ttnn_layout2>
}

// D2M: Generic compute dialect (akin to linalg)
//
// The D2M dialect models generic compute on tensors and memrefs,
// similar to linalg.generic, but with constructs that map well to
// the Tenstorrent execution model (e.g., sharded tensors, grids,
// explicit datamovement).
//
// Example IR:
// Layout aliases describing how the logical tensors are tilized and placed
// in L1 memory.
#layout = #ttcore.metal_layout<logical_shape = 64x128, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#layout1 = #ttcore.metal_layout<logical_shape = 128x96, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
#layout2 = #ttcore.metal_layout<logical_shape = 64x96, dim_alignments = 32x32, collapsed_intervals = dense<[[0, 1], [1, 2]]> : tensor<2x2xi64>, undef, l1>
// Standard matmul indexing maps over (m, n, k).
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
// NOTE(review): the angle-bracketed parameters below were lost in the HTML
// rendering of this page and have been reconstructed; verify against
// tt-mlir output.
#parallel = #ttcore.iterator_type<parallel>
#reduction = #ttcore.iterator_type<reduction>
func.func @simple_matmul(
    %arg0: tensor<64x128xbf16>,
    %arg1: tensor<128x96xbf16>
  ) -> tensor<64x96xbf16> {
  %0 = d2m.empty() : tensor<64x96xbf16>
  // Tilize both inputs and the output into 32x32-tile tensors in L1.
  %1 = d2m.empty() : tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>
  %2 = d2m.to_layout %arg0, %1 : tensor<64x128xbf16> into tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout> -> tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>
  %3 = d2m.empty() : tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>
  %4 = d2m.to_layout %arg1, %3 : tensor<128x96xbf16> into tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1> -> tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>
  %5 = d2m.empty() : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
  %6 = d2m.to_layout %0, %5 : tensor<64x96xbf16> into tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2> -> tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
  // d2m.generic is analogous to linalg.generic, but its region runs on a
  // grid of cores and its operands arrive through circular buffers.
  %7 = d2m.generic {
    block_factors = [1, 1, 1],
    grid = #ttcore.grid<1x1>,
    indexing_maps = [#map, #map1, #map2],
    iterator_types = [#parallel, #parallel, #reduction],
    threads = [#d2m.thread<compute>]
  } ins(%2, %4 : tensor<1x1x2x4x!ttcore.tile<32x32, bf16>, #layout>, tensor<1x1x4x3x!ttcore.tile<32x32, bf16>, #layout1>)
   outs(%6 : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>)  {
  // The compute thread waits on input circular buffers, reserves space in
  // the output circular buffer, then runs a tile-level matmul.
  ^compute0(%cb0: !d2m.cb<tensor<2x4x!ttcore.tile<32x32, bf16>>>, %cb1: !d2m.cb<tensor<4x3x!ttcore.tile<32x32, bf16>>>, %cb2: !d2m.cb<tensor<2x3x!ttcore.tile<32x32, bf16>>>):
    %10 = d2m.wait %cb0 : <tensor<2x4x!ttcore.tile<32x32, bf16>>> -> tensor<2x4x!ttcore.tile<32x32, bf16>>
    %11 = d2m.wait %cb1 : <tensor<4x3x!ttcore.tile<32x32, bf16>>> -> tensor<4x3x!ttcore.tile<32x32, bf16>>
    %12 = d2m.reserve %cb2 : <tensor<2x3x!ttcore.tile<32x32, bf16>>> -> tensor<2x3x!ttcore.tile<32x32, bf16>>
    %13 = linalg.generic {
      indexing_maps = [#map, #map1, #map2],
      iterator_types = ["parallel", "parallel", "reduction"]
    } ins(%10, %11 : tensor<2x4x!ttcore.tile<32x32, bf16>>, tensor<4x3x!ttcore.tile<32x32, bf16>>) outs(%12 : tensor<2x3x!ttcore.tile<32x32, bf16>>) {
    ^bb0(%in: !ttcore.tile<32x32, bf16>, %in_0: !ttcore.tile<32x32, bf16>, %out: !ttcore.tile<32x32, bf16>):
      %14 = "d2m.tile_matmul"(%in, %in_0, %out) : (!ttcore.tile<32x32, bf16>, !ttcore.tile<32x32, bf16>, !ttcore.tile<32x32, bf16>) -> !ttcore.tile<32x32, bf16>
      linalg.yield %14 : !ttcore.tile<32x32, bf16>
    } -> tensor<2x3x!ttcore.tile<32x32, bf16>>
    d2m.yield %13 : (tensor<2x3x!ttcore.tile<32x32, bf16>>)
  } : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2>
  // Untilize the result back to a row-major tensor.
  %8 = d2m.empty() : tensor<64x96xbf16>
  %9 = d2m.to_layout %7, %8 : tensor<1x1x2x3x!ttcore.tile<32x32, bf16>, #layout2> into tensor<64x96xbf16> -> tensor<64x96xbf16>
  return %9 : tensor<64x96xbf16>
}

// TTKernel: tt-metal device kernel dialect.
//
// The TTKernel dialect models low-level kernels that run on Tenstorrent
// devices. It exposes concepts such as circular buffers, tile registers,
// noc transactions and explicit synchronization. It is intended to be a
// 1-1 mapping to tt-metalium kernels.
//
// Example Datamovement Kernel IR:
// Kernel arguments are declared via attributes: two compile-time circular
// buffer ports, and a thread type of `noc` (datamovement).
func.func private @datamovement_kernel() attributes {
    ttkernel.arg_spec = #ttkernel.arg_spec<ct_args = [
      <arg_type = cb_port, operand_index = 0>,
      <arg_type = cb_port, operand_index = 1>
    ]>,
    ttkernel.thread = #ttkernel.thread<noc>
  } {
  // Bind the compile-time args to typed circular-buffer handles.
  %0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1024, bf16>
  %c1_i32 = arith.constant 1 : i32
  // Reserve one page of space in cb0, then publish it to the consumer.
  ttkernel.cb_reserve_back(%0, %c1_i32) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
  ttkernel.cb_push_back(%0, %c1_i32) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
  return
}
//
// Example Compute Kernel IR:
// Compute-thread kernel: three circular-buffer ports (two inputs, one
// output) bound as compile-time args.
func.func private @compute_kernel8() attributes {
    ttkernel.arg_spec = #ttkernel.arg_spec<ct_args = [
      <arg_type = cb_port, operand_index = 0>,
      <arg_type = cb_port, operand_index = 1>,
      <arg_type = cb_port, operand_index = 2>
    ]>,
    ttkernel.thread = #ttkernel.thread<compute>
  } {
  %0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %2 = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
  %c0 = arith.constant 0 : index
  %c1_i32 = arith.constant 1 : i32
  %c1_i32_0 = arith.constant 1 : i32
  %c1_i32_1 = arith.constant 1 : i32
  %c1_i32_2 = arith.constant 1 : i32
  %c0_i32 = arith.constant 0 : i32
  // One-time matmul configuration for the destination registers.
  "ttkernel.mm_block_init"(%0, %1, %2, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c0_3 = arith.constant 0 : index
  // Accumulate over the k dimension (4 tiles), one tile per iteration.
  scf.for %arg0 = %c0_3 to %c4 step %c1 {
    %c1_i32_4 = arith.constant 1 : i32
    ttkernel.cb_wait_front(%0, %c1_i32_4) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    %c1_i32_5 = arith.constant 1 : i32
    ttkernel.cb_wait_front(%1, %c1_i32_5) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    %c1_i32_6 = arith.constant 1 : i32
    ttkernel.cb_reserve_back(%2, %c1_i32_6) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.tile_regs_acquire() : () -> ()
    // On every iteration after the first, reload the partial accumulator
    // that was spilled to the output circular buffer.
    %3 = arith.cmpi ne, %arg0, %c0_3 : index
    scf.if %3 {
      ttkernel.copy_tile_init(%2) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>) -> ()
      ttkernel.copy_tile(%2, %c0_3, %c0_3) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index) -> ()
    }
    "ttkernel.mm_block_init_short"(%0, %1, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
    %c0_7 = arith.constant 0 : index
    %c0_8 = arith.constant 0 : index
    "ttkernel.experimental::matmul_block"(%0, %1, %c0_7, %c0_8, %c0, %c0_i32, %c1_i32_1, %c1_i32, %c1_i32_0, %c1_i32_2) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32, i32) -> ()
    ttkernel.pack_tile(%c0_3, %2, %c0_3, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
    ttkernel.cb_pop_front(%0, %c1_i32_4) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.cb_pop_front(%1, %c1_i32_5) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.cb_push_back(%2, %c1_i32_6) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    // Fixed SSA redefinition: this constant was previously also named
    // %c1_i32_4, which is invalid since that value is defined above in the
    // same region.
    %c1_i32_7 = arith.constant 1 : i32
    ttkernel.cb_wait_front(%2, %c1_i32_7) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
    ttkernel.cb_pop_front(%2, %c1_i32_7) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, i32) -> ()
  }
  return
}

// TTMetal: tt-metal host/device interop dialect.
//
// The TTMetal dialect models host-side operations for managing Tenstorrent
// devices, such as buffer allocation, data transfer, and enqueueing programs.
//
// Example IR:
func.func @simple_matmul(
    %arg0: memref<64x128xbf16>,
    %arg1: memref<128x96xbf16>
  ) -> memref<64x96xbf16> {
  // --- Input A: allocate L1 buffers, upload, and tilize. ---
  %0 = "ttmetal.create_buffer"() <{address = 13312 : i64}> : () -> memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
  %1 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_write_buffer"(%arg0, %1) : (memref<64x128xbf16>, memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.enqueue_program"(%1, %0, %1, %0) <{
    cb_ports = array<i64: 0, 1>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel0, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args<ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel1, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel2, #ttmetal.core_range<0x0, 2x4>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 2, 2>
  }> : (memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%1) : (memref<2x4x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  // --- Input B: allocate L1 buffers, upload, and tilize. ---
  %2 = "ttmetal.create_buffer"() <{address = 11264 : i64}> : () -> memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
  %3 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_write_buffer"(%arg1, %3) : (memref<128x96xbf16>, memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.enqueue_program"(%3, %2, %3, %2) <{
    cb_ports = array<i64: 0, 1>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel3, #ttmetal.core_range<0x0, 4x3>,#ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel4, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel5, #ttmetal.core_range<0x0, 4x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 2, 2>
  }> : (memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%3) : (memref<4x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  // --- Matmul program: output buffer plus scratch buffers for both inputs. ---
  %4 = "ttmetal.create_buffer"() <{address = 9216 : i64}> : () -> memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>
  %5 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>
  // NOTE(review): this definition of %6 was missing from the original
  // listing (it was used below but never defined); the buffer type follows
  // from the enqueue_program signature, the address is reconstructed —
  // verify against the generated IR.
  %6 = "ttmetal.create_buffer"() <{address = 5120 : i64}> : () -> memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_program"(%0, %2, %4, %5, %6, %4) <{
    cb_ports = array<i64: 0, 1, 2>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel6, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>, <semaphore[0]>, <semaphore[1]>, <semaphore[2]>, <semaphore[3]>, <buffer_address[0]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel7, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>, <semaphore[0]>, <semaphore[1]>, <semaphore[2]>, <semaphore[3]>, <buffer_address[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel8, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>, <cb_port[2]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 3, 3>
  }> : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>, memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%6) : (memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%2) : (memref<4x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%5) : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048, 2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%0) : (memref<2x4x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  // --- Result: untilize, read back to host memory, and clean up. ---
  %alloc = memref.alloc() : memref<64x96xbf16>
  %7 = "ttmetal.create_buffer"() <{address = 1024 : i64}> : () -> memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>
  "ttmetal.enqueue_program"(%4, %7, %4, %7) <{
    cb_ports = array<i64: 0, 1>,
    kernelConfigs = [
      #ttmetal.noc_config<@datamovement_kernel9, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc0>,
      #ttmetal.noc_config<@datamovement_kernel10, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, noc1>,
      #ttmetal.compute_config<@compute_kernel11, #ttmetal.core_range<0x0, 2x3>, #ttmetal.kernel_args< ct_args = [<cb_port[0]>, <cb_port[1]>]>, hifi4, false, false, false, [default]>
    ],
    operandSegmentSizes = array<i32: 2, 2>
  }> : (memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>, memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.deallocate_buffer"(%4) : (memref<2x3x1x1x!ttcore.tile<32x32, bf16>, #ttcore.shard<2048x2048>, #ttcore.memory_space<l1>>) -> ()
  "ttmetal.enqueue_read_buffer"(%7, %alloc) : (memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>, memref<64x96xbf16>) -> ()
  // Block until all enqueued work has completed before releasing the buffer.
  "ttmetal.finish"() : () -> ()
  "ttmetal.deallocate_buffer"(%7) : (memref<2x3x32x32xbf16, #ttcore.shard<64x2>, #ttcore.memory_space<l1>>) -> ()
  return %alloc : memref<64x96xbf16>
}