1 /* 2 * Copyright (c) 2022 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #pragma once 26 27 #include "src/core/NEON/kernels/assembly/winograd.hpp" 28 29 #include "src/core/NEON/kernels/arm_conv/addressing.hpp" 30 31 #include <algorithm> 32 #include <cstring> 33 #include <functional> 34 #include <limits> 35 36 namespace arm_conv { 37 namespace winograd { 38 namespace output_transform { 39 40 /* Driver class for the Winograd output transforms. 41 * 42 * This provides a base implementation which handles iteration over the output 43 * tensor; subclasses are responsible for managing working space and executing 44 * the transform on individual tiles. 45 */ 46 template <typename TIn, typename TOut=TIn> 47 class TransformBase : public ITransform 48 { 49 const std::string m_name; 50 const unsigned int m_output_rows, m_output_cols; 51 const unsigned int m_kernel_rows, m_kernel_cols; 52 53 protected: get_working_space_per_thread(const ConvolutionArgs &) const54 virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const 55 { 56 return 0; 57 } 58 initialise_thread_working_space(const ConvolutionArgs &,void *) const59 virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const 60 { 61 // Nothing to do 62 } 63 64 virtual void execute_tile( 65 unsigned int n_channels, 66 const TIn *inptr, size_t ld_in_matrix, 67 const TIn *bias, 68 TOut *outptr, size_t ld_out_row, size_t ld_out_col, 69 TOut activation_min, TOut activation_max, 70 unsigned int valid_rows, unsigned int valid_cols, 71 void *working_space 72 ) const = 0; 73 execute_internal(const ConvolutionArgs & args,const TIn * inptr,size_t ld_in_batch,size_t ld_in_matrix,size_t ld_in_row,const TIn * bias,TOut * outptr,size_t ld_out_batch,size_t ld_out_row,size_t ld_out_col,void * working_space,unsigned int thread_id,unsigned int n_threads) const74 void execute_internal( 75 const ConvolutionArgs &args, 76 const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, 77 const TIn *bias, 78 TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, 79 void *working_space, unsigned int thread_id, unsigned int n_threads 80 ) const 81 { 82 // Get the working space for this thread, and initialise it. 83 working_space = reinterpret_cast<char *>(working_space) + 84 this->get_working_space_per_thread(args) * thread_id; 85 this->initialise_thread_working_space(args, working_space); 86 87 // Get the activation values 88 auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity()); 89 auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity()); 90 switch (args.activation.type) 91 { 92 case arm_gemm::Activation::Type::BoundedReLU: 93 activation_max = static_cast<TOut>(args.activation.param1); 94 // Fall through 95 case arm_gemm::Activation::Type::ReLU: 96 activation_min = static_cast<TOut>(0); 97 break; 98 default: 99 break; 100 } 101 102 // Determine the number of tiles in a row, we use this to get the right 103 // offset into the input data. 104 const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols(); 105 106 // Execute over all batches 107 for (unsigned int batch = 0; batch < args.n_batches; batch++) 108 { 109 auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row; 110 auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows(); 111 inptr += ld_in_batch; 112 outptr += ld_out_batch; 113 114 // Stripe rows of tiles over threads. 115 for (auto out_i = thread_id * this->get_output_rows(); 116 out_i < args.output_shape.rows; 117 out_i += n_threads * this->get_output_rows()) 118 { 119 auto inptr_tile = inptr_row; 120 auto outptr_tile = outptr_row; 121 inptr_row += n_threads * n_tile_cols * ld_in_row; 122 outptr_row += n_threads * this->get_output_rows() * ld_out_row; 123 124 // Iterate over all columns 125 for (auto out_j = 0u; out_j < args.output_shape.cols; 126 out_j += this->get_output_cols()) 127 { 128 // Execute the tile 129 this->execute_tile( 130 args.n_output_channels, 131 inptr_tile, ld_in_matrix, 132 bias, 133 outptr_tile, ld_out_row, ld_out_col, 134 activation_min, activation_max, 135 args.output_shape.rows - out_i, // Number of valid rows remaining 136 args.output_shape.cols - out_j, // Number of valid columns remaining 137 working_space 138 ); 139 140 // Progress the pointers 141 inptr_tile += ld_in_row; 142 outptr_tile += this->get_output_cols() * ld_out_col; 143 } 144 } 145 } 146 } 147 148 public: TransformBase(const std::string & name,unsigned int output_rows,unsigned int output_cols,unsigned int kernel_rows,unsigned int kernel_cols)149 TransformBase(const std::string &name, 150 unsigned int output_rows, unsigned int output_cols, 151 unsigned int kernel_rows, unsigned int kernel_cols) 152 : m_name(name), 153 m_output_rows(output_rows), m_output_cols(output_cols), 154 m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols) 155 { 156 } 157 get_name(void) const158 const std::string &get_name(void) const override { return m_name; } 159 get_input_rows(void) const160 unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; } get_input_cols(void) const161 unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; } 162 get_output_rows(void) const163 unsigned int get_output_rows(void) const override final { return m_output_rows; } get_output_cols(void) const164 unsigned int get_output_cols(void) const override final { return m_output_cols; } 165 get_kernel_rows(void) const166 unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; } get_kernel_cols(void) const167 unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; } 168 get_working_space_size(const ConvolutionArgs & args,unsigned int n_threads) const169 size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override 170 { 171 return n_threads * this->get_working_space_per_thread(args); 172 } 173 execute(const ConvolutionArgs & args,const void * inptr,size_t ld_in_batch,size_t ld_in_matrix,size_t ld_in_row,const void * bias,void * outptr,size_t ld_out_batch,size_t ld_out_row,size_t ld_out_col,void * working_space,unsigned int thread_id,unsigned int n_threads) const174 void execute( 175 const ConvolutionArgs &args, 176 const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, 177 const void *bias, 178 void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, 179 void *working_space, unsigned int thread_id, unsigned int n_threads 180 ) const override 181 { 182 execute_internal( 183 args, 184 reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row, 185 reinterpret_cast<const TIn *>(bias), 186 reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col, 187 working_space, thread_id, n_threads 188 ); 189 } 190 }; 191 192 template <typename TIn, typename TOut=TIn> 193 class TransformUnpadded : public TransformBase<TIn, TOut> 194 { 195 using Kernel = std::function<void( 196 unsigned int n_channels, 197 const TIn *inptr, size_t ld_in_matrix, 198 const TIn *bias, 199 TOut *outptr, size_t ld_out_row, size_t ld_out_col, 200 TOut activation_min, TOut activation_max 201 )>; 202 const Kernel m_kernel; 203 204 protected: get_working_space_per_thread(const ConvolutionArgs & args) const205 size_t get_working_space_per_thread(const ConvolutionArgs &args) const override 206 { 207 // We create a buffer the size of the output tile 208 const auto n_output_points = this->get_output_rows() * this->get_output_cols(); 209 return sizeof(TOut) * n_output_points * args.n_output_channels; 210 } 211 execute_tile(unsigned int n_channels,const TIn * inptr,size_t ld_in_matrix,const TIn * bias,TOut * outptr,size_t ld_out_row,size_t ld_out_col,TOut activation_min,TOut activation_max,unsigned int valid_rows,unsigned int valid_cols,void * working_space) const212 void execute_tile( 213 unsigned int n_channels, 214 const TIn *inptr, size_t ld_in_matrix, 215 const TIn *bias, 216 TOut *outptr, size_t ld_out_row, size_t ld_out_col, 217 TOut activation_min, TOut activation_max, 218 unsigned int valid_rows, unsigned int valid_cols, 219 void *working_space 220 ) const override final 221 { 222 // Get copies of the output tensor parameters 223 auto kernel_outptr = outptr; 224 auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col; 225 226 // If there's padding on either the left or the right, then we execute the 227 // kernel into the output buffer and then perform a copy. 228 if (valid_rows < this->get_output_rows() || 229 valid_cols < this->get_output_cols()) 230 { 231 // Override the kernel output parameters 232 kernel_outptr = reinterpret_cast<TOut *>(working_space); 233 kernel_ld_out_col = n_channels; 234 kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols(); 235 } 236 237 // Execute the kernel 238 m_kernel( 239 n_channels, 240 inptr, ld_in_matrix, 241 bias, 242 kernel_outptr, kernel_ld_out_row, kernel_ld_out_col, 243 activation_min, activation_max 244 ); 245 246 // If necessary, copy from the working space into the destination tensor. 247 if (valid_rows < this->get_output_rows() || 248 valid_cols < this->get_output_cols()) 249 { 250 const auto last_row = std::min(valid_rows, this->get_output_rows()); 251 const auto last_col = std::min(valid_cols, this->get_output_cols()); 252 253 for (auto i = 0u; i < last_row; i++) 254 { 255 auto patch_tile = kernel_outptr; 256 auto out_tile = outptr; 257 kernel_outptr += kernel_ld_out_row; 258 outptr += ld_out_row; 259 260 for (auto j = 0u; j < last_col; j++) 261 { 262 memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels); 263 patch_tile += kernel_ld_out_col; 264 out_tile += ld_out_col; 265 } 266 } 267 } 268 } 269 270 public: TransformUnpadded(const std::string & name,unsigned int output_rows,unsigned int output_cols,unsigned int kernel_rows,unsigned int kernel_cols,const Kernel kernel)271 TransformUnpadded(const std::string &name, 272 unsigned int output_rows, unsigned int output_cols, 273 unsigned int kernel_rows, unsigned int kernel_cols, 274 const Kernel kernel) 275 : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols), 276 m_kernel(kernel) 277 { 278 } 279 280 /* Utility method to get a transposed variant of a kernel, this transposed 281 * version simply calls the original kernel with the output row and column 282 * strides swapped. 283 */ get_transposed_kernel(const Kernel & kernel)284 static constexpr Kernel get_transposed_kernel(const Kernel &kernel) 285 { 286 return [kernel] ( 287 const unsigned int n_channels, 288 const TIn *const inptr, const size_t ld_in_matrix, 289 const TIn *const bias, 290 TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col, 291 const TOut activation_min, const TOut activation_max 292 ) { 293 kernel(n_channels, inptr, ld_in_matrix, bias, 294 outptr, ld_out_col, ld_out_row, 295 activation_min, activation_max); 296 }; 297 } 298 }; 299 300 } // namespace output_transform 301 } // namespace winograd 302 } // namespace arm_conv 303