xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/CpuQuantizeKernel.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2017-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24*c217d954SCole Faust #include "src/cpu/kernels/CpuQuantizeKernel.h"
25*c217d954SCole Faust 
26*c217d954SCole Faust #include "arm_compute/core/Error.h"
27*c217d954SCole Faust #include "arm_compute/core/Helpers.h"
28*c217d954SCole Faust #include "arm_compute/core/Utils.h"
29*c217d954SCole Faust #include "arm_compute/core/Validate.h"
30*c217d954SCole Faust #include "arm_compute/core/Window.h"
31*c217d954SCole Faust #include "src/core/NEON/NEAsymm.h"
32*c217d954SCole Faust #include "src/core/NEON/NEMath.h"
33*c217d954SCole Faust #include "src/core/NEON/wrapper/wrapper.h"
34*c217d954SCole Faust #include "src/core/helpers/AutoConfiguration.h"
35*c217d954SCole Faust #include "src/core/helpers/WindowHelpers.h"
36*c217d954SCole Faust 
37*c217d954SCole Faust #include "src/core/CPP/Validate.h"
38*c217d954SCole Faust 
39*c217d954SCole Faust #include <arm_neon.h>
40*c217d954SCole Faust #include <map>
41*c217d954SCole Faust 
42*c217d954SCole Faust namespace arm_compute
43*c217d954SCole Faust {
44*c217d954SCole Faust namespace cpu
45*c217d954SCole Faust {
46*c217d954SCole Faust namespace kernels
47*c217d954SCole Faust {
48*c217d954SCole Faust namespace
49*c217d954SCole Faust {
/** Number of elements consumed per iteration of the vectorised loops in this file. */
constexpr int window_step = 16;
51*c217d954SCole Faust 
validate_arguments(const ITensorInfo * src,const ITensorInfo * dst)52*c217d954SCole Faust Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
53*c217d954SCole Faust {
54*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
55*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
56*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
57*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
58*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
59*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
60*c217d954SCole Faust 
61*c217d954SCole Faust     return Status{};
62*c217d954SCole Faust }
63*c217d954SCole Faust 
64*c217d954SCole Faust template <typename T>
load_value(const T * input_ptr)65*c217d954SCole Faust inline float32x4x4_t load_value(const T *input_ptr)
66*c217d954SCole Faust {
67*c217d954SCole Faust     using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
68*c217d954SCole Faust     return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
69*c217d954SCole Faust }
70*c217d954SCole Faust 
71*c217d954SCole Faust template <>
load_value(const float * input_ptr)72*c217d954SCole Faust inline float32x4x4_t load_value(const float *input_ptr)
73*c217d954SCole Faust {
74*c217d954SCole Faust     return { wrapper::vloadq(input_ptr),
75*c217d954SCole Faust              wrapper::vloadq(input_ptr + 4),
76*c217d954SCole Faust              wrapper::vloadq(input_ptr + 8),
77*c217d954SCole Faust              wrapper::vloadq(input_ptr + 12) };
78*c217d954SCole Faust }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Specialisation for float16_t input: load 16 consecutive halves and convert each group of four to float32.
 *
 * @param[in] input_ptr Pointer to the first of the 16 half-precision values to read.
 *
 * @return The 16 values converted with vcvt_f32_f16, four per register, in memory order.
 */
template <>
inline float32x4x4_t load_value(const float16_t *input_ptr)
{
    const float32x4_t q0 = vcvt_f32_f16(wrapper::vload(input_ptr));
    const float32x4_t q1 = vcvt_f32_f16(wrapper::vload(input_ptr + 4));
    const float32x4_t q2 = vcvt_f32_f16(wrapper::vload(input_ptr + 8));
    const float32x4_t q3 = vcvt_f32_f16(wrapper::vload(input_ptr + 12));

    return { q0, q1, q2, q3 };
}

#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
90*c217d954SCole Faust 
91*c217d954SCole Faust template <typename element_type>
92*c217d954SCole Faust using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
93*c217d954SCole Faust 
94*c217d954SCole Faust template <typename quantized_type>
95*c217d954SCole Faust vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
96*c217d954SCole Faust 
97*c217d954SCole Faust template <>
vquantize_qasymm8(const float32x4x4_t & qv,const UniformQuantizationInfo & qi)98*c217d954SCole Faust vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
99*c217d954SCole Faust {
100*c217d954SCole Faust     return vquantize(qv, qi);
101*c217d954SCole Faust }
102*c217d954SCole Faust 
103*c217d954SCole Faust template <>
vquantize_qasymm8(const float32x4x4_t & qv,const UniformQuantizationInfo & qi)104*c217d954SCole Faust vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
105*c217d954SCole Faust {
106*c217d954SCole Faust     return vquantize_signed(qv, qi);
107*c217d954SCole Faust }
108*c217d954SCole Faust 
109*c217d954SCole Faust } // namespace
110*c217d954SCole Faust 
configure(const ITensorInfo * src,ITensorInfo * dst)111*c217d954SCole Faust void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
112*c217d954SCole Faust {
113*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
114*c217d954SCole Faust     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
115*c217d954SCole Faust 
116*c217d954SCole Faust     static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map =
117*c217d954SCole Faust     {
118*c217d954SCole Faust         { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
119*c217d954SCole Faust         { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
120*c217d954SCole Faust         { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
121*c217d954SCole Faust 
122*c217d954SCole Faust         { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
123*c217d954SCole Faust         { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
124*c217d954SCole Faust         { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
125*c217d954SCole Faust 
126*c217d954SCole Faust         { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> },
127*c217d954SCole Faust 
128*c217d954SCole Faust         { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
129*c217d954SCole Faust         { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
130*c217d954SCole Faust         { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
131*c217d954SCole Faust 
132*c217d954SCole Faust #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
133*c217d954SCole Faust         { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
134*c217d954SCole Faust         { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
135*c217d954SCole Faust         { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
136*c217d954SCole Faust #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
137*c217d954SCole Faust     };
138*c217d954SCole Faust 
139*c217d954SCole Faust     std::string function_to_call("op_");
140*c217d954SCole Faust     function_to_call += string_from_data_type(src->data_type()) + "_";
141*c217d954SCole Faust     function_to_call += string_from_data_type(dst->data_type());
142*c217d954SCole Faust 
143*c217d954SCole Faust     auto it = quant_map.find(function_to_call);
144*c217d954SCole Faust 
145*c217d954SCole Faust     if(it == quant_map.end())
146*c217d954SCole Faust     {
147*c217d954SCole Faust         ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
148*c217d954SCole Faust     }
149*c217d954SCole Faust     _func = it->second;
150*c217d954SCole Faust 
151*c217d954SCole Faust     // Configure kernel window
152*c217d954SCole Faust     Window win_config = calculate_max_window(*src, Steps());
153*c217d954SCole Faust     ICpuKernel::configure(win_config);
154*c217d954SCole Faust }
155*c217d954SCole Faust 
validate(const ITensorInfo * src,const ITensorInfo * dst)156*c217d954SCole Faust Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
157*c217d954SCole Faust {
158*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
159*c217d954SCole Faust     return Status{};
160*c217d954SCole Faust }
161*c217d954SCole Faust 
162*c217d954SCole Faust template <typename TIn, typename TOut>
run_quantize_qsymm8(const ITensor * src,ITensor * dst,const Window & window)163*c217d954SCole Faust void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
164*c217d954SCole Faust {
165*c217d954SCole Faust     const auto window_start_x = static_cast<int>(window.x().start());
166*c217d954SCole Faust     const auto window_end_x   = static_cast<int>(window.x().end());
167*c217d954SCole Faust 
168*c217d954SCole Faust     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
169*c217d954SCole Faust     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
170*c217d954SCole Faust     if(is_data_type_quantized_asymmetric(src->info()->data_type()))
171*c217d954SCole Faust     {
172*c217d954SCole Faust         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
173*c217d954SCole Faust     }
174*c217d954SCole Faust     // Collapse window and reset first dimension to handle tail calculations manually
175*c217d954SCole Faust     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
176*c217d954SCole Faust     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
177*c217d954SCole Faust 
178*c217d954SCole Faust     Iterator input(src, win_collapsed);
179*c217d954SCole Faust     Iterator output(dst, win_collapsed);
180*c217d954SCole Faust     execute_window_loop(win_collapsed, [&](const Coordinates &)
181*c217d954SCole Faust     {
182*c217d954SCole Faust         auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
183*c217d954SCole Faust         auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
184*c217d954SCole Faust         int x = window_start_x;
185*c217d954SCole Faust         for(; x <= (window_end_x - window_step); x += window_step)
186*c217d954SCole Faust         {
187*c217d954SCole Faust             wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
188*c217d954SCole Faust         }
189*c217d954SCole Faust         // Compute left-over elements
190*c217d954SCole Faust         for(; x < window_end_x; ++x)
191*c217d954SCole Faust         {
192*c217d954SCole Faust             output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
193*c217d954SCole Faust         }
194*c217d954SCole Faust     },
195*c217d954SCole Faust     input, output);
196*c217d954SCole Faust }
197*c217d954SCole Faust 
198*c217d954SCole Faust template <typename TIn, typename TOut>
run_quantize_qasymm8(const ITensor * src,ITensor * dst,const Window & window)199*c217d954SCole Faust void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
200*c217d954SCole Faust {
201*c217d954SCole Faust     const auto window_start_x = static_cast<int>(window.x().start());
202*c217d954SCole Faust     const auto window_end_x   = static_cast<int>(window.x().end());
203*c217d954SCole Faust 
204*c217d954SCole Faust     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
205*c217d954SCole Faust     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
206*c217d954SCole Faust     if(is_data_type_quantized_asymmetric(src->info()->data_type()))
207*c217d954SCole Faust     {
208*c217d954SCole Faust         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
209*c217d954SCole Faust     }
210*c217d954SCole Faust #ifdef __aarch64__
211*c217d954SCole Faust     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
212*c217d954SCole Faust #else  //__aarch64__
213*c217d954SCole Faust     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
214*c217d954SCole Faust #endif //__aarch64__
215*c217d954SCole Faust 
216*c217d954SCole Faust     // Collapse window and reset first dimension to handle tail calculations manually
217*c217d954SCole Faust     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
218*c217d954SCole Faust     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
219*c217d954SCole Faust 
220*c217d954SCole Faust     Iterator input(src, win_collapsed);
221*c217d954SCole Faust     Iterator output(dst, win_collapsed);
222*c217d954SCole Faust     execute_window_loop(win_collapsed, [&](const Coordinates &)
223*c217d954SCole Faust     {
224*c217d954SCole Faust         auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
225*c217d954SCole Faust         auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
226*c217d954SCole Faust 
227*c217d954SCole Faust         int x = window_start_x;
228*c217d954SCole Faust         for(; x <= (window_end_x - window_step); x += window_step)
229*c217d954SCole Faust         {
230*c217d954SCole Faust             wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
231*c217d954SCole Faust         }
232*c217d954SCole Faust         // Compute left-over elements
233*c217d954SCole Faust         for(; x < window_end_x; ++x)
234*c217d954SCole Faust         {
235*c217d954SCole Faust             output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
236*c217d954SCole Faust         }
237*c217d954SCole Faust     },
238*c217d954SCole Faust     input, output);
239*c217d954SCole Faust }
240*c217d954SCole Faust 
241*c217d954SCole Faust template <typename T>
run_quantize_qasymm16(const ITensor * src,ITensor * dst,const Window & window)242*c217d954SCole Faust void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
243*c217d954SCole Faust {
244*c217d954SCole Faust     const auto window_start_x = static_cast<int>(window.x().start());
245*c217d954SCole Faust     const auto window_end_x   = static_cast<int>(window.x().end());
246*c217d954SCole Faust 
247*c217d954SCole Faust     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
248*c217d954SCole Faust     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
249*c217d954SCole Faust     if(is_data_type_quantized_asymmetric(src->info()->data_type()))
250*c217d954SCole Faust     {
251*c217d954SCole Faust         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
252*c217d954SCole Faust     }
253*c217d954SCole Faust #ifdef __aarch64__
254*c217d954SCole Faust     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
255*c217d954SCole Faust #else  //__aarch64__
256*c217d954SCole Faust     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
257*c217d954SCole Faust #endif //__aarch64__
258*c217d954SCole Faust 
259*c217d954SCole Faust     // Collapse window and reset first dimension to handle tail calculations manually
260*c217d954SCole Faust     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
261*c217d954SCole Faust     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
262*c217d954SCole Faust 
263*c217d954SCole Faust     Iterator input(src, win_collapsed);
264*c217d954SCole Faust     Iterator output(dst, win_collapsed);
265*c217d954SCole Faust     execute_window_loop(win_collapsed, [&](const Coordinates &)
266*c217d954SCole Faust     {
267*c217d954SCole Faust         auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
268*c217d954SCole Faust         auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
269*c217d954SCole Faust 
270*c217d954SCole Faust         int x = window_start_x;
271*c217d954SCole Faust         for(; x <= (window_end_x - window_step); x += window_step)
272*c217d954SCole Faust         {
273*c217d954SCole Faust             uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
274*c217d954SCole Faust             vst1q_u16(&output_ptr[x], tmp.val[0]);
275*c217d954SCole Faust             vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
276*c217d954SCole Faust         }
277*c217d954SCole Faust         // Compute left-over elements
278*c217d954SCole Faust         for(; x < window_end_x; ++x)
279*c217d954SCole Faust         {
280*c217d954SCole Faust             output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
281*c217d954SCole Faust         }
282*c217d954SCole Faust     },
283*c217d954SCole Faust     input, output);
284*c217d954SCole Faust }
285*c217d954SCole Faust 
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)286*c217d954SCole Faust void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
287*c217d954SCole Faust {
288*c217d954SCole Faust     ARM_COMPUTE_UNUSED(info);
289*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
290*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
291*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON(_func == nullptr);
292*c217d954SCole Faust 
293*c217d954SCole Faust     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
294*c217d954SCole Faust     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
295*c217d954SCole Faust     (this->*_func)(src, dst, window);
296*c217d954SCole Faust }
297*c217d954SCole Faust 
name() const298*c217d954SCole Faust const char *CpuQuantizeKernel::name() const
299*c217d954SCole Faust {
300*c217d954SCole Faust     return "CpuQuantizeKernel";
301*c217d954SCole Faust }
302*c217d954SCole Faust } // namespace kernels
303*c217d954SCole Faust } // namespace cpu
304*c217d954SCole Faust } // namespace arm_compute
305