xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24*c217d954SCole Faust #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
25*c217d954SCole Faust #include "arm_compute/core/Utils.h"
26*c217d954SCole Faust #include "arm_compute/core/Validate.h"
27*c217d954SCole Faust #include "arm_compute/core/utils/misc/ShapeCalculator.h"
28*c217d954SCole Faust #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
29*c217d954SCole Faust #include "src/core/CPP/Validate.h"
30*c217d954SCole Faust #include "src/core/NEON/INEKernel.h"
31*c217d954SCole Faust #include "src/core/helpers/AutoConfiguration.h"
32*c217d954SCole Faust #include "src/core/helpers/WindowHelpers.h"
33*c217d954SCole Faust 
34*c217d954SCole Faust #include <arm_neon.h>
35*c217d954SCole Faust 
36*c217d954SCole Faust namespace arm_compute
37*c217d954SCole Faust {
38*c217d954SCole Faust namespace cpu
39*c217d954SCole Faust {
40*c217d954SCole Faust namespace kernels
41*c217d954SCole Faust {
42*c217d954SCole Faust using namespace arm_compute::misc::shape_calculator;
43*c217d954SCole Faust 
configure(const ITensorInfo * src,ITensorInfo * dst,const PoolingLayerInfo & info,const CPUInfo & cpu_info)44*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
45*c217d954SCole Faust {
46*c217d954SCole Faust     ARM_COMPUTE_UNUSED(cpu_info);
47*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
48*c217d954SCole Faust 
49*c217d954SCole Faust     // dst initialization if not yet initialized
50*c217d954SCole Faust     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
51*c217d954SCole Faust 
52*c217d954SCole Faust #if defined(__aarch64__)
53*c217d954SCole Faust     const bool requantize = src->quantization_info() != dst->quantization_info();
54*c217d954SCole Faust 
55*c217d954SCole Faust     switch(src->data_type())
56*c217d954SCole Faust     {
57*c217d954SCole Faust         case DataType::QASYMM8:
58*c217d954SCole Faust             if(requantize)
59*c217d954SCole Faust             {
60*c217d954SCole Faust                 create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
61*c217d954SCole Faust             }
62*c217d954SCole Faust             else
63*c217d954SCole Faust             {
64*c217d954SCole Faust                 create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
65*c217d954SCole Faust             }
66*c217d954SCole Faust             break;
67*c217d954SCole Faust         case DataType::QASYMM8_SIGNED:
68*c217d954SCole Faust             if(requantize)
69*c217d954SCole Faust             {
70*c217d954SCole Faust                 create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
71*c217d954SCole Faust             }
72*c217d954SCole Faust             else
73*c217d954SCole Faust             {
74*c217d954SCole Faust                 create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
75*c217d954SCole Faust             }
76*c217d954SCole Faust             break;
77*c217d954SCole Faust #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
78*c217d954SCole Faust         case DataType::F16:
79*c217d954SCole Faust             create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
80*c217d954SCole Faust             break;
81*c217d954SCole Faust #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
82*c217d954SCole Faust         case DataType::F32:
83*c217d954SCole Faust             create_arm_pooling<float, float>(src, dst, info, cpu_info);
84*c217d954SCole Faust             break;
85*c217d954SCole Faust         default:
86*c217d954SCole Faust             break;
87*c217d954SCole Faust     }
88*c217d954SCole Faust #endif // defined(__aarch64__)
89*c217d954SCole Faust 
90*c217d954SCole Faust     Window win = calculate_max_window(*dst, Steps());
91*c217d954SCole Faust     INEKernel::configure(win);
92*c217d954SCole Faust }
93*c217d954SCole Faust 
validate(const ITensorInfo * src,const ITensorInfo * dst,const PoolingLayerInfo & info)94*c217d954SCole Faust Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
95*c217d954SCole Faust {
96*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
97*c217d954SCole Faust 
98*c217d954SCole Faust #ifndef __aarch64__
99*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
100*c217d954SCole Faust #endif /* __aarch64__ */
101*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
102*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
103*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
104*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
105*c217d954SCole Faust                                     "Only AVG and MAX pooling are supported by assembly kernels");
106*c217d954SCole Faust 
107*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
108*c217d954SCole Faust 
109*c217d954SCole Faust     if(dst->total_size() > 0)
110*c217d954SCole Faust     {
111*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
112*c217d954SCole Faust 
113*c217d954SCole Faust         const auto src_qinfo = src->quantization_info().uniform();
114*c217d954SCole Faust         const auto dst_qinfo = dst->quantization_info().uniform();
115*c217d954SCole Faust 
116*c217d954SCole Faust         if(src_qinfo != dst_qinfo)
117*c217d954SCole Faust         {
118*c217d954SCole Faust             const float multiplier = src_qinfo.scale / dst_qinfo.scale;
119*c217d954SCole Faust             int32_t     dst_multiplier{};
120*c217d954SCole Faust             int32_t     dst_shift{};
121*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
122*c217d954SCole Faust         }
123*c217d954SCole Faust         else
124*c217d954SCole Faust         {
125*c217d954SCole Faust             if(src->data_type() == DataType::QASYMM8)
126*c217d954SCole Faust             {
127*c217d954SCole Faust                 const bool has_padding = info.pad_stride_info.has_padding();
128*c217d954SCole Faust                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
129*c217d954SCole Faust             }
130*c217d954SCole Faust         }
131*c217d954SCole Faust     }
132*c217d954SCole Faust     else
133*c217d954SCole Faust     {
134*c217d954SCole Faust         if(src->data_type() == DataType::QASYMM8)
135*c217d954SCole Faust         {
136*c217d954SCole Faust             // If dst is not configured, the quantization info are the same
137*c217d954SCole Faust             const bool has_padding = info.pad_stride_info.has_padding();
138*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
139*c217d954SCole Faust         }
140*c217d954SCole Faust     }
141*c217d954SCole Faust     return Status{};
142*c217d954SCole Faust }
143*c217d954SCole Faust 
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)144*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
145*c217d954SCole Faust {
146*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
147*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
148*c217d954SCole Faust     ARM_COMPUTE_UNUSED(window);
149*c217d954SCole Faust     ARM_COMPUTE_UNUSED(info);
150*c217d954SCole Faust 
151*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON(tensors.empty());
152*c217d954SCole Faust 
153*c217d954SCole Faust     const ITensor *src       = tensors.get_const_tensor(TensorType::ACL_SRC);
154*c217d954SCole Faust     ITensor       *dst       = tensors.get_tensor(TensorType::ACL_DST);
155*c217d954SCole Faust     ITensor       *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
156*c217d954SCole Faust 
157*c217d954SCole Faust     const auto in_ptr        = src->buffer() + src->info()->offset_first_element_in_bytes();
158*c217d954SCole Faust     auto       out_ptr       = dst->buffer() + dst->info()->offset_first_element_in_bytes();
159*c217d954SCole Faust     auto       working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
160*c217d954SCole Faust 
161*c217d954SCole Faust     const auto src_shape   = src->info()->tensor_shape();
162*c217d954SCole Faust     const auto dst_shape   = dst->info()->tensor_shape();
163*c217d954SCole Faust     const auto src_padding = src->info()->padding();
164*c217d954SCole Faust     const auto dst_padding = dst->info()->padding();
165*c217d954SCole Faust 
166*c217d954SCole Faust     const size_t ld_src_col   = src_shape[0] + src_padding.left + src_padding.right;
167*c217d954SCole Faust     const size_t ld_src_row   = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
168*c217d954SCole Faust     const size_t ld_src_batch = ld_src_row * src_shape[2];
169*c217d954SCole Faust     const size_t ld_dst_col   = dst_shape[0] + dst_padding.left + dst_padding.right;
170*c217d954SCole Faust     const size_t ld_dst_row   = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
171*c217d954SCole Faust     const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
172*c217d954SCole Faust 
173*c217d954SCole Faust     _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
174*c217d954SCole Faust                          out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
175*c217d954SCole Faust                          working_space, info.thread_id, info.num_threads);
176*c217d954SCole Faust }
177*c217d954SCole Faust 
get_working_size(unsigned int num_threads) const178*c217d954SCole Faust size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
179*c217d954SCole Faust {
180*c217d954SCole Faust     return _kernel_asm->get_working_size(num_threads);
181*c217d954SCole Faust }
182*c217d954SCole Faust 
is_configured() const183*c217d954SCole Faust bool CpuPool2dAssemblyWrapperKernel::is_configured() const
184*c217d954SCole Faust {
185*c217d954SCole Faust     return _kernel_asm != nullptr;
186*c217d954SCole Faust }
187*c217d954SCole Faust 
188*c217d954SCole Faust template <typename Typesrc, typename Typedst>
create_arm_pooling(const ITensorInfo * src,ITensorInfo * dst,const PoolingLayerInfo & info,const CPUInfo & cpu_info)189*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
190*c217d954SCole Faust {
191*c217d954SCole Faust     const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
192*c217d954SCole Faust 
193*c217d954SCole Faust     arm_conv::pooling::PoolingWindow window{};
194*c217d954SCole Faust     window.cols = static_cast<unsigned int>(info.pool_size.x());
195*c217d954SCole Faust     window.rows = static_cast<unsigned int>(info.pool_size.y());
196*c217d954SCole Faust 
197*c217d954SCole Faust     arm_conv::pooling::PoolingStride stride{};
198*c217d954SCole Faust     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
199*c217d954SCole Faust 
200*c217d954SCole Faust     const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
201*c217d954SCole Faust 
202*c217d954SCole Faust     constexpr unsigned int idx_width    = 1;
203*c217d954SCole Faust     constexpr unsigned int idx_height   = 2;
204*c217d954SCole Faust     constexpr unsigned int idx_channels = 0;
205*c217d954SCole Faust     constexpr unsigned int idx_batches  = 3;
206*c217d954SCole Faust 
207*c217d954SCole Faust     const unsigned int n_batches  = src->dimension(idx_batches);
208*c217d954SCole Faust     const unsigned int src_rows   = src->dimension(idx_height);
209*c217d954SCole Faust     const unsigned int src_cols   = src->dimension(idx_width);
210*c217d954SCole Faust     const unsigned int n_channels = src->dimension(idx_channels);
211*c217d954SCole Faust     const unsigned int dst_rows   = dst->dimension(idx_height);
212*c217d954SCole Faust     const unsigned int dst_cols   = dst->dimension(idx_width);
213*c217d954SCole Faust 
214*c217d954SCole Faust     arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
215*c217d954SCole Faust 
216*c217d954SCole Faust     // Configure assembly pooling kernel
217*c217d954SCole Faust     auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
218*c217d954SCole Faust     if(pooling_kernel_asm == nullptr)
219*c217d954SCole Faust     {
220*c217d954SCole Faust         // Configuration not supported: Leave function unconfigured:
221*c217d954SCole Faust         return;
222*c217d954SCole Faust     }
223*c217d954SCole Faust 
224*c217d954SCole Faust     _kernel_asm = std::move(pooling_kernel_asm);
225*c217d954SCole Faust }
226*c217d954SCole Faust 
227*c217d954SCole Faust template <typename Typesrc, typename Typedst>
create_arm_pooling_requant(const ITensorInfo * src,ITensorInfo * dst,const PoolingLayerInfo & info,const CPUInfo & cpu_info)228*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
229*c217d954SCole Faust {
230*c217d954SCole Faust     const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
231*c217d954SCole Faust 
232*c217d954SCole Faust     arm_conv::pooling::PoolingWindow window{};
233*c217d954SCole Faust     window.cols = static_cast<unsigned int>(info.pool_size.x());
234*c217d954SCole Faust     window.rows = static_cast<unsigned int>(info.pool_size.y());
235*c217d954SCole Faust 
236*c217d954SCole Faust     arm_conv::pooling::PoolingStride stride{};
237*c217d954SCole Faust     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
238*c217d954SCole Faust 
239*c217d954SCole Faust     const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
240*c217d954SCole Faust 
241*c217d954SCole Faust     constexpr unsigned int idx_width    = 1;
242*c217d954SCole Faust     constexpr unsigned int idx_height   = 2;
243*c217d954SCole Faust     constexpr unsigned int idx_channels = 0;
244*c217d954SCole Faust     constexpr unsigned int idx_batches  = 3;
245*c217d954SCole Faust 
246*c217d954SCole Faust     const unsigned int n_batches  = src->dimension(idx_batches);
247*c217d954SCole Faust     const unsigned int src_rows   = src->dimension(idx_height);
248*c217d954SCole Faust     const unsigned int src_cols   = src->dimension(idx_width);
249*c217d954SCole Faust     const unsigned int n_channels = src->dimension(idx_channels);
250*c217d954SCole Faust     const unsigned int dst_rows   = dst->dimension(idx_height);
251*c217d954SCole Faust     const unsigned int dst_cols   = dst->dimension(idx_width);
252*c217d954SCole Faust 
253*c217d954SCole Faust     arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
254*c217d954SCole Faust 
255*c217d954SCole Faust     const auto src_qinfo = src->quantization_info().uniform();
256*c217d954SCole Faust     const auto dst_qinfo = dst->quantization_info().uniform();
257*c217d954SCole Faust 
258*c217d954SCole Faust     const float multiplier = src_qinfo.scale / dst_qinfo.scale;
259*c217d954SCole Faust     int32_t     dst_multiplier{};
260*c217d954SCole Faust     int32_t     dst_shift{};
261*c217d954SCole Faust     quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
262*c217d954SCole Faust 
263*c217d954SCole Faust     const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
264*c217d954SCole Faust                                                        dst_qinfo.offset,
265*c217d954SCole Faust                                                        dst_shift, // left shift
266*c217d954SCole Faust                                                        0,         // right shift
267*c217d954SCole Faust                                                        dst_multiplier);
268*c217d954SCole Faust 
269*c217d954SCole Faust     // Configure assembly pooling kernel with requantization
270*c217d954SCole Faust     auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
271*c217d954SCole Faust     if(pooling_kernel_asm == nullptr)
272*c217d954SCole Faust     {
273*c217d954SCole Faust         // Configuration not supported: Leave function unconfigured:
274*c217d954SCole Faust         return;
275*c217d954SCole Faust     }
276*c217d954SCole Faust 
277*c217d954SCole Faust     _kernel_asm = std::move(pooling_kernel_asm);
278*c217d954SCole Faust }
279*c217d954SCole Faust 
get_mws(const CPUInfo & platform,size_t thread_count) const280*c217d954SCole Faust size_t CpuPool2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
281*c217d954SCole Faust {
282*c217d954SCole Faust     ARM_COMPUTE_UNUSED(thread_count);
283*c217d954SCole Faust     ARM_COMPUTE_UNUSED(platform);
284*c217d954SCole Faust 
285*c217d954SCole Faust     return ICPPKernel::default_mws;
286*c217d954SCole Faust }
287*c217d954SCole Faust } // namespace kernels
288*c217d954SCole Faust } // namespace cpu
289*c217d954SCole Faust } // namespace arm_compute
290