#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/cuda/TensorModeKernel.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/native/CanUse32BitIndexMath.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorCompare.h>

// Maximum number of threads per block used by the mode kernels; ROCm builds
// use a smaller block size than CUDA builds.
constexpr int MAX_BLOCK_SIZE = AT_ROCM_ENABLED() ? 256 : 1024;

// Maximum size per grid dimension that we assume (compute capability >= 2.0)
constexpr int64_t MAX_GRID_SIZE = 65535LL;

namespace at::native {

void mode_kernel_impl(
    Tensor& values,
    Tensor& indices,
    const Tensor& self,
    int64_t dim,
    bool keepdim) {
  auto self_sizes = ensure_nonempty_vec(self.sizes().vec());
  int64_t ndim = ensure_nonempty_dim(self.dim());
  int64_t slice_size = ensure_nonempty_size(self, dim);
  int64_t slices = self.numel() / slice_size;

  // Resize output value, index Tensors to appropriate sizes (i.e. the same as
  // the input Tensor, except that the size at dim is 1)
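  // For example, a (3, 4, 5) input with dim == 1 resizes values and indices
  // to (3, 1, 5); the length-1 dimension is squeezed away at the end when
  // keepdim is false.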
  assert(0 <= dim && static_cast<size_t>(dim) < self_sizes.size());
  self_sizes[dim] = 1;

  if (!keepdim) {
    if (values.ndimension() >= dim) {
      values.unsqueeze_(dim);
    }
    if (indices.ndimension() >= dim) {
      indices.unsqueeze_(dim);
    }
  }

  at::native::resize_output(values, self_sizes);
  at::native::resize_output(indices, self_sizes);

  // If sliceSize is 1, copy input to values and set indices
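  // For example, taking the mode over dim == 1 of a (5, 1) tensor hits this
  // path: each slice holds a single element, which is trivially its own mode
  // at index 0.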
  if (slice_size == 1) {
    values.copy_(self);
    indices.fill_(0);
    if (!keepdim) {
      values.squeeze_(dim);
      indices.squeeze_(dim);
    }
    return;
  }

  // Beginning our optimized implementation. The first thing we want to do is
  // transpose the input Tensor along the sort dimension and then make it
  // contiguous.
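  // After this step each slice along the mode dimension occupies a contiguous
  // run of slice_size elements in memory, which is the layout the kernels
  // launched below operate on.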
  auto transposed = self.transpose(dim, ndim - 1);
  auto contiguous = transposed.contiguous();

  // We also need to view the values and indices Tensors as transposed in order
  // to properly determine the offset into the underlying storage in which to
  // place the mode and index for a particular set of dimension values.
  auto values_transposed = values.transpose(dim, ndim - 1);
  auto indices_transposed = indices.transpose(dim, ndim - 1);

  // Requirements for fused kernel implementation:
  //
  // 1. sliceSize <= 2 * max threads per block
  // 2. uses one block per slice, so the number of slices must be less than the
  //    maximum number of blocks for a kernel launch
  // 3. can use 32-bit index math for indexing (mainly just for implementation
  //    conciseness, could be changed)
  //
  // MAX_BLOCK_SIZE and MAX_GRID_SIZE are defined at the top of this file.
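  // Concretely, with the constants above the fused path handles slices of up
  // to 2048 elements on CUDA (2 * 1024) and up to 512 elements on ROCm
  // (2 * 256).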
  if (slice_size <= 2 * MAX_BLOCK_SIZE &&
      slices <= MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE &&
      canUse32BitIndexMath(self)) {
    launch_fused_mode_kernel(
        values_transposed, indices_transposed, contiguous, slice_size, slices);
  } else {
    // [Note: CUDA torch.mode clones self]
    //
    // If transposed is already contiguous, it will return a tensor with the
    // same storage. So, since we do not want to modify self, we clone it.
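    // This happens, for example, when dim is already the last dimension and
    // self is contiguous: the transpose is then a no-op view over self's
    // storage, and contiguous() returns that same view.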
    if (transposed.is_same(contiguous)) {
      contiguous = contiguous.clone();
    }

    launch_apply_mode_kernel(
        values_transposed, indices_transposed, contiguous, dim, ndim);
  }

  if (!keepdim) {
    values.squeeze_(dim);
    indices.squeeze_(dim);
  }
}

REGISTER_CUDA_DISPATCH(mode_stub, &mode_kernel_impl);
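
// Minimal usage sketch (illustrative only, not part of this file): the stub
// registered above is what ends up running when torch.mode is called on a
// CUDA tensor, e.g.
//   auto t = at::randint(0, 3, {4, 1000},
//       at::TensorOptions().device(at::kCUDA));
//   auto [values, indices] = at::mode(t, /*dim=*/1, /*keepdim=*/false);
// where values holds the per-row mode and indices the corresponding index
// along dim.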
} // namespace at::native