1*c217d954SCole Faust /*
2*c217d954SCole Faust * Copyright (c) 2021-2022 Arm Limited.
3*c217d954SCole Faust *
4*c217d954SCole Faust * SPDX-License-Identifier: MIT
5*c217d954SCole Faust *
6*c217d954SCole Faust * Permission is hereby granted, free of charge, to any person obtaining a copy
7*c217d954SCole Faust * of this software and associated documentation files (the "Software"), to
8*c217d954SCole Faust * deal in the Software without restriction, including without limitation the
9*c217d954SCole Faust * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10*c217d954SCole Faust * sell copies of the Software, and to permit persons to whom the Software is
11*c217d954SCole Faust * furnished to do so, subject to the following conditions:
12*c217d954SCole Faust *
13*c217d954SCole Faust * The above copyright notice and this permission notice shall be included in all
14*c217d954SCole Faust * copies or substantial portions of the Software.
15*c217d954SCole Faust *
16*c217d954SCole Faust * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*c217d954SCole Faust * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*c217d954SCole Faust * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19*c217d954SCole Faust * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*c217d954SCole Faust * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*c217d954SCole Faust * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*c217d954SCole Faust * SOFTWARE.
23*c217d954SCole Faust */
24*c217d954SCole Faust #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
25*c217d954SCole Faust #include "arm_compute/core/Utils.h"
26*c217d954SCole Faust #include "arm_compute/core/Validate.h"
27*c217d954SCole Faust #include "arm_compute/core/utils/misc/ShapeCalculator.h"
28*c217d954SCole Faust #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
29*c217d954SCole Faust #include "src/core/CPP/Validate.h"
30*c217d954SCole Faust #include "src/core/NEON/INEKernel.h"
31*c217d954SCole Faust #include "src/core/helpers/AutoConfiguration.h"
32*c217d954SCole Faust #include "src/core/helpers/WindowHelpers.h"
33*c217d954SCole Faust
34*c217d954SCole Faust #include <arm_neon.h>
35*c217d954SCole Faust
36*c217d954SCole Faust namespace arm_compute
37*c217d954SCole Faust {
38*c217d954SCole Faust namespace cpu
39*c217d954SCole Faust {
40*c217d954SCole Faust namespace kernels
41*c217d954SCole Faust {
42*c217d954SCole Faust using namespace arm_compute::misc::shape_calculator;
43*c217d954SCole Faust
configure(const ITensorInfo * src,ITensorInfo * dst,const PoolingLayerInfo & info,const CPUInfo & cpu_info)44*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
45*c217d954SCole Faust {
46*c217d954SCole Faust ARM_COMPUTE_UNUSED(cpu_info);
47*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
48*c217d954SCole Faust
49*c217d954SCole Faust // dst initialization if not yet initialized
50*c217d954SCole Faust auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
51*c217d954SCole Faust
52*c217d954SCole Faust #if defined(__aarch64__)
53*c217d954SCole Faust const bool requantize = src->quantization_info() != dst->quantization_info();
54*c217d954SCole Faust
55*c217d954SCole Faust switch(src->data_type())
56*c217d954SCole Faust {
57*c217d954SCole Faust case DataType::QASYMM8:
58*c217d954SCole Faust if(requantize)
59*c217d954SCole Faust {
60*c217d954SCole Faust create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
61*c217d954SCole Faust }
62*c217d954SCole Faust else
63*c217d954SCole Faust {
64*c217d954SCole Faust create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
65*c217d954SCole Faust }
66*c217d954SCole Faust break;
67*c217d954SCole Faust case DataType::QASYMM8_SIGNED:
68*c217d954SCole Faust if(requantize)
69*c217d954SCole Faust {
70*c217d954SCole Faust create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
71*c217d954SCole Faust }
72*c217d954SCole Faust else
73*c217d954SCole Faust {
74*c217d954SCole Faust create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
75*c217d954SCole Faust }
76*c217d954SCole Faust break;
77*c217d954SCole Faust #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
78*c217d954SCole Faust case DataType::F16:
79*c217d954SCole Faust create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
80*c217d954SCole Faust break;
81*c217d954SCole Faust #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
82*c217d954SCole Faust case DataType::F32:
83*c217d954SCole Faust create_arm_pooling<float, float>(src, dst, info, cpu_info);
84*c217d954SCole Faust break;
85*c217d954SCole Faust default:
86*c217d954SCole Faust break;
87*c217d954SCole Faust }
88*c217d954SCole Faust #endif // defined(__aarch64__)
89*c217d954SCole Faust
90*c217d954SCole Faust Window win = calculate_max_window(*dst, Steps());
91*c217d954SCole Faust INEKernel::configure(win);
92*c217d954SCole Faust }
93*c217d954SCole Faust
validate(const ITensorInfo * src,const ITensorInfo * dst,const PoolingLayerInfo & info)94*c217d954SCole Faust Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
95*c217d954SCole Faust {
96*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
97*c217d954SCole Faust
98*c217d954SCole Faust #ifndef __aarch64__
99*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
100*c217d954SCole Faust #endif /* __aarch64__ */
101*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
102*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
103*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
104*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
105*c217d954SCole Faust "Only AVG and MAX pooling are supported by assembly kernels");
106*c217d954SCole Faust
107*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
108*c217d954SCole Faust
109*c217d954SCole Faust if(dst->total_size() > 0)
110*c217d954SCole Faust {
111*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
112*c217d954SCole Faust
113*c217d954SCole Faust const auto src_qinfo = src->quantization_info().uniform();
114*c217d954SCole Faust const auto dst_qinfo = dst->quantization_info().uniform();
115*c217d954SCole Faust
116*c217d954SCole Faust if(src_qinfo != dst_qinfo)
117*c217d954SCole Faust {
118*c217d954SCole Faust const float multiplier = src_qinfo.scale / dst_qinfo.scale;
119*c217d954SCole Faust int32_t dst_multiplier{};
120*c217d954SCole Faust int32_t dst_shift{};
121*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
122*c217d954SCole Faust }
123*c217d954SCole Faust else
124*c217d954SCole Faust {
125*c217d954SCole Faust if(src->data_type() == DataType::QASYMM8)
126*c217d954SCole Faust {
127*c217d954SCole Faust const bool has_padding = info.pad_stride_info.has_padding();
128*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
129*c217d954SCole Faust }
130*c217d954SCole Faust }
131*c217d954SCole Faust }
132*c217d954SCole Faust else
133*c217d954SCole Faust {
134*c217d954SCole Faust if(src->data_type() == DataType::QASYMM8)
135*c217d954SCole Faust {
136*c217d954SCole Faust // If dst is not configured, the quantization info are the same
137*c217d954SCole Faust const bool has_padding = info.pad_stride_info.has_padding();
138*c217d954SCole Faust ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
139*c217d954SCole Faust }
140*c217d954SCole Faust }
141*c217d954SCole Faust return Status{};
142*c217d954SCole Faust }
143*c217d954SCole Faust
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)144*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
145*c217d954SCole Faust {
146*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
147*c217d954SCole Faust ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
148*c217d954SCole Faust ARM_COMPUTE_UNUSED(window);
149*c217d954SCole Faust ARM_COMPUTE_UNUSED(info);
150*c217d954SCole Faust
151*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(tensors.empty());
152*c217d954SCole Faust
153*c217d954SCole Faust const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
154*c217d954SCole Faust ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
155*c217d954SCole Faust ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
156*c217d954SCole Faust
157*c217d954SCole Faust const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
158*c217d954SCole Faust auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
159*c217d954SCole Faust auto working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
160*c217d954SCole Faust
161*c217d954SCole Faust const auto src_shape = src->info()->tensor_shape();
162*c217d954SCole Faust const auto dst_shape = dst->info()->tensor_shape();
163*c217d954SCole Faust const auto src_padding = src->info()->padding();
164*c217d954SCole Faust const auto dst_padding = dst->info()->padding();
165*c217d954SCole Faust
166*c217d954SCole Faust const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
167*c217d954SCole Faust const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
168*c217d954SCole Faust const size_t ld_src_batch = ld_src_row * src_shape[2];
169*c217d954SCole Faust const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
170*c217d954SCole Faust const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
171*c217d954SCole Faust const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
172*c217d954SCole Faust
173*c217d954SCole Faust _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
174*c217d954SCole Faust out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
175*c217d954SCole Faust working_space, info.thread_id, info.num_threads);
176*c217d954SCole Faust }
177*c217d954SCole Faust
get_working_size(unsigned int num_threads) const178*c217d954SCole Faust size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
179*c217d954SCole Faust {
180*c217d954SCole Faust return _kernel_asm->get_working_size(num_threads);
181*c217d954SCole Faust }
182*c217d954SCole Faust
is_configured() const183*c217d954SCole Faust bool CpuPool2dAssemblyWrapperKernel::is_configured() const
184*c217d954SCole Faust {
185*c217d954SCole Faust return _kernel_asm != nullptr;
186*c217d954SCole Faust }
187*c217d954SCole Faust
188*c217d954SCole Faust template <typename Typesrc, typename Typedst>
create_arm_pooling(const ITensorInfo * src,ITensorInfo * dst,const PoolingLayerInfo & info,const CPUInfo & cpu_info)189*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
190*c217d954SCole Faust {
191*c217d954SCole Faust const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
192*c217d954SCole Faust
193*c217d954SCole Faust arm_conv::pooling::PoolingWindow window{};
194*c217d954SCole Faust window.cols = static_cast<unsigned int>(info.pool_size.x());
195*c217d954SCole Faust window.rows = static_cast<unsigned int>(info.pool_size.y());
196*c217d954SCole Faust
197*c217d954SCole Faust arm_conv::pooling::PoolingStride stride{};
198*c217d954SCole Faust std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
199*c217d954SCole Faust
200*c217d954SCole Faust const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
201*c217d954SCole Faust
202*c217d954SCole Faust constexpr unsigned int idx_width = 1;
203*c217d954SCole Faust constexpr unsigned int idx_height = 2;
204*c217d954SCole Faust constexpr unsigned int idx_channels = 0;
205*c217d954SCole Faust constexpr unsigned int idx_batches = 3;
206*c217d954SCole Faust
207*c217d954SCole Faust const unsigned int n_batches = src->dimension(idx_batches);
208*c217d954SCole Faust const unsigned int src_rows = src->dimension(idx_height);
209*c217d954SCole Faust const unsigned int src_cols = src->dimension(idx_width);
210*c217d954SCole Faust const unsigned int n_channels = src->dimension(idx_channels);
211*c217d954SCole Faust const unsigned int dst_rows = dst->dimension(idx_height);
212*c217d954SCole Faust const unsigned int dst_cols = dst->dimension(idx_width);
213*c217d954SCole Faust
214*c217d954SCole Faust arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
215*c217d954SCole Faust
216*c217d954SCole Faust // Configure assembly pooling kernel
217*c217d954SCole Faust auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
218*c217d954SCole Faust if(pooling_kernel_asm == nullptr)
219*c217d954SCole Faust {
220*c217d954SCole Faust // Configuration not supported: Leave function unconfigured:
221*c217d954SCole Faust return;
222*c217d954SCole Faust }
223*c217d954SCole Faust
224*c217d954SCole Faust _kernel_asm = std::move(pooling_kernel_asm);
225*c217d954SCole Faust }
226*c217d954SCole Faust
227*c217d954SCole Faust template <typename Typesrc, typename Typedst>
create_arm_pooling_requant(const ITensorInfo * src,ITensorInfo * dst,const PoolingLayerInfo & info,const CPUInfo & cpu_info)228*c217d954SCole Faust void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
229*c217d954SCole Faust {
230*c217d954SCole Faust const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
231*c217d954SCole Faust
232*c217d954SCole Faust arm_conv::pooling::PoolingWindow window{};
233*c217d954SCole Faust window.cols = static_cast<unsigned int>(info.pool_size.x());
234*c217d954SCole Faust window.rows = static_cast<unsigned int>(info.pool_size.y());
235*c217d954SCole Faust
236*c217d954SCole Faust arm_conv::pooling::PoolingStride stride{};
237*c217d954SCole Faust std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
238*c217d954SCole Faust
239*c217d954SCole Faust const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
240*c217d954SCole Faust
241*c217d954SCole Faust constexpr unsigned int idx_width = 1;
242*c217d954SCole Faust constexpr unsigned int idx_height = 2;
243*c217d954SCole Faust constexpr unsigned int idx_channels = 0;
244*c217d954SCole Faust constexpr unsigned int idx_batches = 3;
245*c217d954SCole Faust
246*c217d954SCole Faust const unsigned int n_batches = src->dimension(idx_batches);
247*c217d954SCole Faust const unsigned int src_rows = src->dimension(idx_height);
248*c217d954SCole Faust const unsigned int src_cols = src->dimension(idx_width);
249*c217d954SCole Faust const unsigned int n_channels = src->dimension(idx_channels);
250*c217d954SCole Faust const unsigned int dst_rows = dst->dimension(idx_height);
251*c217d954SCole Faust const unsigned int dst_cols = dst->dimension(idx_width);
252*c217d954SCole Faust
253*c217d954SCole Faust arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
254*c217d954SCole Faust
255*c217d954SCole Faust const auto src_qinfo = src->quantization_info().uniform();
256*c217d954SCole Faust const auto dst_qinfo = dst->quantization_info().uniform();
257*c217d954SCole Faust
258*c217d954SCole Faust const float multiplier = src_qinfo.scale / dst_qinfo.scale;
259*c217d954SCole Faust int32_t dst_multiplier{};
260*c217d954SCole Faust int32_t dst_shift{};
261*c217d954SCole Faust quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
262*c217d954SCole Faust
263*c217d954SCole Faust const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
264*c217d954SCole Faust dst_qinfo.offset,
265*c217d954SCole Faust dst_shift, // left shift
266*c217d954SCole Faust 0, // right shift
267*c217d954SCole Faust dst_multiplier);
268*c217d954SCole Faust
269*c217d954SCole Faust // Configure assembly pooling kernel with requantization
270*c217d954SCole Faust auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
271*c217d954SCole Faust if(pooling_kernel_asm == nullptr)
272*c217d954SCole Faust {
273*c217d954SCole Faust // Configuration not supported: Leave function unconfigured:
274*c217d954SCole Faust return;
275*c217d954SCole Faust }
276*c217d954SCole Faust
277*c217d954SCole Faust _kernel_asm = std::move(pooling_kernel_asm);
278*c217d954SCole Faust }
279*c217d954SCole Faust
get_mws(const CPUInfo & platform,size_t thread_count) const280*c217d954SCole Faust size_t CpuPool2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
281*c217d954SCole Faust {
282*c217d954SCole Faust ARM_COMPUTE_UNUSED(thread_count);
283*c217d954SCole Faust ARM_COMPUTE_UNUSED(platform);
284*c217d954SCole Faust
285*c217d954SCole Faust return ICPPKernel::default_mws;
286*c217d954SCole Faust }
287*c217d954SCole Faust } // namespace kernels
288*c217d954SCole Faust } // namespace cpu
289*c217d954SCole Faust } // namespace arm_compute
290