/*
 * Copyright (c) 2020-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>

namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);

    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);

    // Check biases if they exist
    if(bias != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
    }

    if(dst->total_size() != 0)
    {
        if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED))
        {
            ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
        }

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
    }

    return Status{};
}

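// Requantization pre-step: adds the GEMMLowp result offset to each of the four
// int32x4 lanes and multiplies by the integer multiplier (result_mult_int).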
inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
{
    // Add the offset terms to GEMM's result
    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);

    // Multiply by result_mult_int
    in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
    in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
    in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
    in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value,
       typename wrapper::traits::neon_vector<T, 16>::type>::type
       convert_to_8bit(const int16x8x2_t in_s16)
{
    return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value,
       typename wrapper::traits::neon_vector<T, 16>::type>::type
       convert_to_8bit(const int16x8x2_t in_s16)
{
    return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
}

template <typename T>
inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min,
                                                                                typename wrapper::traits::neon_vector<T, 16>::type max)
{
    // Shift final result (negative value shift right)
    in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
    in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
    in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
    in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);

    // Convert S32 to S16
    const int16x8x2_t in_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
        }
    };

    // Convert S16 to S8 or U8
    typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);

    out = wrapper::vmax(out, min);
    out = wrapper::vmin(out, max);

    return out;
}
} // namespace

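// Per element, this kernel computes:
//   out = clamp(((in [+ bias] + gemmlowp_offset) * gemmlowp_multiplier) >> gemmlowp_shift, clamp_min, clamp_max)
// The main loop processes 16 int32 accumulators per iteration with Neon; left-over elements are handled scalarly.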
template <typename T>
void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
{
    using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;

    const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset);
    const int32x4_t result_shift_s32  = vdupq_n_s32(-_output_stage->gemmlowp_shift);
    const int       window_step_x     = 16;
    const auto      window_start_x    = static_cast<int>(window.x().start());
    const auto      window_end_x      = static_cast<int>(window.x().end());

    const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits<T>::lowest();
    const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits<T>::max();

    VectorType min = wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{});
    VectorType max = wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{});

    Window win(window);
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, win);
    Iterator out(dst, win);

    if(bias != nullptr)
    {
        Window win_biases;
        win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
        win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));

        Iterator bias_i(bias, win_biases);
        execute_window_loop(win, [&](const Coordinates &)
        {
            // Compute 16 elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                int32x4x4_t in_s32 =
                {
                    {
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
                    }
                };

                const int32x4x4_t bias_s32 =
                {
                    {
                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
                    }
                };

                // Add the bias to GEMM's result
                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);

                // Add the offset terms to GEMM's result and multiply by result_mult_int
                scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);

                wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
                int       in_value   = *(reinterpret_cast<const int *>(in.ptr()) + x);

                // Quantize
                in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;

                // Store the result
                *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
            }
        },
        in, bias_i, out);
    }
    else
    {
        execute_window_loop(win, [&](const Coordinates &)
        {
            // Compute 16 elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                int32x4x4_t in_s32 =
                {
                    {
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
                    }
                };

                // Add the offset terms to GEMM's result and multiply by result_mult_int
                scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);

                wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
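                // Same scalar quantization as the bias path above, minus the bias term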
                int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);

                // Quantize
                in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;

                // Store the result
                *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
            }
        },
        in, out);
    }
}

void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
{
    ARM_COMPUTE_UNUSED(bias);
    // Perform validate step
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, output_stage);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
                                                  bias,
                                                  dst,
                                                  output_stage));

    _output_stage = output_stage;

    // Configure kernel window
    Window win = calculate_max_window(*src, Steps());

    ICpuKernel::configure(win);

    // Check if we need to clamp the result using min and max
    _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound)
                        && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
                             && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
    if(_output_stage->output_data_type == DataType::QASYMM8)
    {
        _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>;
    }
    else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
    {
        _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>;
    }
    else
    {
        ARM_COMPUTE_ERROR("Data type not supported");
    }
}

Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
    return Status{};
}

void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");

    auto src  = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS);
    auto dst  = tensors.get_tensor(TensorType::ACL_DST);
    (this->*_func)(src, bias, dst, window);
}

const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const
{
    return "CpuGemmLowpQuantizeDownInt32ScaleKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute