// aten/src/ATen/native/cuda/TensorModeKernel.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/cuda/TensorModeKernel.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/native/CanUse32BitIndexMath.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorCompare.h>

constexpr int MAX_BLOCK_SIZE = AT_ROCM_ENABLED() ? 256 : 1024;

// Maximum size per grid dimension that we assume (compute capability >= 2.0)
constexpr int64_t MAX_GRID_SIZE = 65535LL;
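
// Illustrative note (added; not part of the original file): on a CUDA build
// AT_ROCM_ENABLED() is 0, so MAX_BLOCK_SIZE is 1024 threads per block; on a
// ROCm build it is 256. MAX_GRID_SIZE caps each of the three grid dimensions
// at 65535, the per-dimension limit assumed above for compute capability >= 2.0.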

namespace at::native {

void mode_kernel_impl(
    Tensor& values,
    Tensor& indices,
    const Tensor& self,
    int64_t dim,
    bool keepdim) {
  auto self_sizes = ensure_nonempty_vec(self.sizes().vec());
  int64_t ndim = ensure_nonempty_dim(self.dim());
  int64_t slice_size = ensure_nonempty_size(self, dim);
  int64_t slices = self.numel() / slice_size;

  // Resize the output value and index Tensors to the appropriate sizes (i.e.
  // the same as the input Tensor, except at dim, where the size is 1)
  assert(0 <= dim && static_cast<size_t>(dim) < self_sizes.size());
  self_sizes[dim] = 1;

  if (!keepdim) {
    if (values.ndimension() >= dim) {
      values.unsqueeze_(dim);
    }
    if (indices.ndimension() >= dim) {
      indices.unsqueeze_(dim);
    }
  }

  at::native::resize_output(values, self_sizes);
  at::native::resize_output(indices, self_sizes);
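
  // Illustrative shape example (added; not in the original source): for a
  // self of size [4, 7, 5] with dim = 1, self_sizes becomes [4, 1, 5], so
  // values and indices are resized to [4, 1, 5]; when keepdim is false, the
  // squeeze_(dim) calls at the end of this function reduce them to [4, 5].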

  // If slice_size is 1, copy the input to values and set indices to 0
  if (slice_size == 1) {
    values.copy_(self);
    indices.fill_(0);
    if (!keepdim) {
      values.squeeze_(dim);
      indices.squeeze_(dim);
    }
    return;
  }
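
  // Worked example (added; not in the original source): for a self of size
  // [3, 1] with dim = 1, every slice holds exactly one element, so that
  // element is trivially the mode and its index within the slice is 0; after
  // the squeeze above, values holds the same data as self.squeeze(1) and
  // indices is a zero tensor of size [3].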

  // Beginning our optimized implementation. The first thing we want to do is
  // transpose the input Tensor along the sort dimension, and then make it
  // contiguous.
  auto transposed = self.transpose(dim, ndim - 1);
  auto contiguous = transposed.contiguous();

  // We also need to view the values and indices Tensors as transposed in order
  // to properly determine the offset into the underlying storage in which to
  // place the mode and index for a particular set of dimension values.
  auto values_transposed = values.transpose(dim, ndim - 1);
  auto indices_transposed = indices.transpose(dim, ndim - 1);
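
  // Illustrative shape example (added; not in the original source): for a
  // self of size [4, 7, 5] with dim = 1, `transposed` has size [4, 5, 7] and
  // `contiguous` lays every length-7 slice out contiguously along the
  // innermost dimension, which is the layout the kernels below expect.
  // Likewise, values_transposed and indices_transposed view the [4, 1, 5]
  // outputs as [4, 5, 1], so each slice's mode and index land at the
  // matching offset.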

  // Requirements for the fused kernel implementation:
  //
  // 1. slice_size <= 2 * max threads per block
  // 2. Uses one block per slice, so the number of slices must be less than
  //    the maximum number of blocks for a kernel launch
  // 3. Can use 32-bit index math for indexing (mainly just for implementation
  //    conciseness, could be changed)
  //
  // MAX_BLOCK_SIZE and MAX_GRID_SIZE are defined at the top of this file
  // (see also ATen/native/cuda/SortingCommon.cuh)
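
  // Illustrative example (added; not in the original source): on a CUDA build
  // 2 * MAX_BLOCK_SIZE is 2048, so something like
  //
  //   at::Tensor small = at::randn({8, 1024}, at::kCUDA);  // slice_size = 1024
  //   at::Tensor big   = at::randn({8, 4096}, at::kCUDA);  // slice_size = 4096
  //
  // would take the fused path for `small` but fall through to the sort-based
  // launch_apply_mode_kernel path for `big` (assuming 32-bit indexing holds).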
  if (slice_size <= 2 * MAX_BLOCK_SIZE &&
      slices <= MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE &&
      canUse32BitIndexMath(self)) {
    launch_fused_mode_kernel(
        values_transposed, indices_transposed, contiguous, slice_size, slices);
  } else {
    // [Note: CUDA torch.mode clones self]
    //
    // If `transposed` is already contiguous, `.contiguous()` returns a tensor
    // backed by the same storage as `self`. Since we do not want to modify
    // self, we clone it.
    if (transposed.is_same(contiguous)) {
      contiguous = contiguous.clone();
    }
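
    // Illustrative note (added; not in the original source): the aliasing
    // checked above occurs, e.g., when dim is already the innermost dimension
    // of a contiguous tensor, as in at::mode(x, /*dim=*/-1) on a contiguous x.
    // Cloning keeps the caller's data intact, since the fallback below is
    // handed `contiguous` directly rather than a private copy of `self`.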

    launch_apply_mode_kernel(
        values_transposed, indices_transposed, contiguous, dim, ndim);
  }

  if (!keepdim) {
    values.squeeze_(dim);
    indices.squeeze_(dim);
  }
}

REGISTER_CUDA_DISPATCH(mode_stub, &mode_kernel_impl);
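
// Illustrative usage (added; not part of the original file): the stub
// registered above is how at::mode reaches this kernel for CUDA tensors,
// roughly:
//
//   at::Tensor t = at::randint(0, 3, {4, 1000},
//                              at::TensorOptions().device(at::kCUDA));
//   auto [values, indices] = at::mode(t, /*dim=*/1, /*keepdim=*/false);
//   // values: size [4], the most frequent value in each row
//   // indices: size [4], an index at which that value occurs in the row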
} // namespace at::native