// xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/convolution/winograd/output_transform.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "src/core/NEON/kernels/assembly/winograd.hpp"

#include "src/core/NEON/kernels/arm_conv/addressing.hpp"

#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>

namespace arm_conv {
namespace winograd {
namespace output_transform {

/* Driver class for the Winograd output transforms.
 *
 * This provides a base implementation which handles iteration over the output
 * tensor; subclasses are responsible for managing working space and executing
 * the transform on individual tiles.
 */
46  template <typename TIn, typename TOut=TIn>
47  class TransformBase : public ITransform
48  {
49    const std::string m_name;
50    const unsigned int m_output_rows, m_output_cols;
51    const unsigned int m_kernel_rows, m_kernel_cols;
52  
53    protected:
get_working_space_per_thread(const ConvolutionArgs &) const54    virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
55    {
56      return 0;
57    }
58  
initialise_thread_working_space(const ConvolutionArgs &,void *) const59    virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
60    {
61      // Nothing to do
62    }
63  
64    virtual void execute_tile(
65      unsigned int n_channels,
66      const TIn *inptr, size_t ld_in_matrix,
67      const TIn *bias,
68      TOut *outptr, size_t ld_out_row, size_t ld_out_col,
69      TOut activation_min, TOut activation_max,
70      unsigned int valid_rows, unsigned int valid_cols,
71      void *working_space
72    ) const = 0;
73  
execute_internal(const ConvolutionArgs & args,const TIn * inptr,size_t ld_in_batch,size_t ld_in_matrix,size_t ld_in_row,const TIn * bias,TOut * outptr,size_t ld_out_batch,size_t ld_out_row,size_t ld_out_col,void * working_space,unsigned int thread_id,unsigned int n_threads) const74    void execute_internal(
75      const ConvolutionArgs &args,
76      const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
77      const TIn *bias,
78      TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
79      void *working_space, unsigned int thread_id, unsigned int n_threads
80    ) const
81    {
82      // Get the working space for this thread, and initialise it.
83      working_space = reinterpret_cast<char *>(working_space) +
84                      this->get_working_space_per_thread(args) * thread_id;
85      this->initialise_thread_working_space(args, working_space);
86  
87      // Get the activation values
88      auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
89      auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
90      switch (args.activation.type)
91      {
92        case arm_gemm::Activation::Type::BoundedReLU:
93          activation_max = static_cast<TOut>(args.activation.param1);
94          // Fall through
95        case arm_gemm::Activation::Type::ReLU:
96          activation_min = static_cast<TOut>(0);
97          break;
98        default:
99          break;
100      }
101  
102      // Determine the number of tiles in a row, we use this to get the right
103      // offset into the input data.
104      const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();
105  
106      // Execute over all batches
107      for (unsigned int batch = 0; batch < args.n_batches; batch++)
108      {
109        auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row;
110        auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows();
111        inptr += ld_in_batch;
112        outptr += ld_out_batch;
113  
114        // Stripe rows of tiles over threads.
115        for (auto out_i = thread_id * this->get_output_rows();
116             out_i < args.output_shape.rows;
117             out_i += n_threads * this->get_output_rows())
118        {
119          auto inptr_tile = inptr_row;
120          auto outptr_tile = outptr_row;
121          inptr_row += n_threads * n_tile_cols * ld_in_row;
122          outptr_row += n_threads * this->get_output_rows() * ld_out_row;
123  
124          // Iterate over all columns
125          for (auto out_j = 0u; out_j < args.output_shape.cols;
126               out_j += this->get_output_cols())
127          {
128            // Execute the tile
129            this->execute_tile(
130              args.n_output_channels,
131              inptr_tile, ld_in_matrix,
132              bias,
133              outptr_tile, ld_out_row, ld_out_col,
134              activation_min, activation_max,
135              args.output_shape.rows - out_i,  // Number of valid rows remaining
136              args.output_shape.cols - out_j,  // Number of valid columns remaining
137              working_space
138            );
139  
140            // Progress the pointers
141            inptr_tile += ld_in_row;
142            outptr_tile += this->get_output_cols() * ld_out_col;
143          }
144        }
145      }
146    }
147  
148    public:
TransformBase(const std::string & name,unsigned int output_rows,unsigned int output_cols,unsigned int kernel_rows,unsigned int kernel_cols)149    TransformBase(const std::string &name,
150                  unsigned int output_rows, unsigned int output_cols,
151                  unsigned int kernel_rows, unsigned int kernel_cols)
152    : m_name(name),
153      m_output_rows(output_rows), m_output_cols(output_cols),
154      m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
155    {
156    }
157  
get_name(void) const158    const std::string &get_name(void) const override { return m_name; }
159  
get_input_rows(void) const160    unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
get_input_cols(void) const161    unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }
162  
get_output_rows(void) const163    unsigned int get_output_rows(void) const override final { return m_output_rows; }
get_output_cols(void) const164    unsigned int get_output_cols(void) const override final { return m_output_cols; }
165  
get_kernel_rows(void) const166    unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
get_kernel_cols(void) const167    unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }
168  
get_working_space_size(const ConvolutionArgs & args,unsigned int n_threads) const169    size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
170    {
171      return n_threads * this->get_working_space_per_thread(args);
172    }
173  
execute(const ConvolutionArgs & args,const void * inptr,size_t ld_in_batch,size_t ld_in_matrix,size_t ld_in_row,const void * bias,void * outptr,size_t ld_out_batch,size_t ld_out_row,size_t ld_out_col,void * working_space,unsigned int thread_id,unsigned int n_threads) const174    void execute(
175      const ConvolutionArgs &args,
176      const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
177      const void *bias,
178      void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
179      void *working_space, unsigned int thread_id, unsigned int n_threads
180    ) const override
181    {
182      execute_internal(
183        args,
184        reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
185        reinterpret_cast<const TIn *>(bias),
186        reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
187        working_space, thread_id, n_threads
188      );
189    }
190  };
192  template <typename TIn, typename TOut=TIn>
193  class TransformUnpadded : public TransformBase<TIn, TOut>
194  {
195    using Kernel = std::function<void(
196      unsigned int n_channels,
197      const TIn *inptr, size_t ld_in_matrix,
198      const TIn *bias,
199      TOut *outptr, size_t ld_out_row, size_t ld_out_col,
200      TOut activation_min, TOut activation_max
201    )>;
202    const Kernel m_kernel;
203  
204    protected:
get_working_space_per_thread(const ConvolutionArgs & args) const205    size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
206    {
207      // We create a buffer the size of the output tile
208      const auto n_output_points = this->get_output_rows() * this->get_output_cols();
209      return sizeof(TOut) * n_output_points * args.n_output_channels;
210    }
211  
execute_tile(unsigned int n_channels,const TIn * inptr,size_t ld_in_matrix,const TIn * bias,TOut * outptr,size_t ld_out_row,size_t ld_out_col,TOut activation_min,TOut activation_max,unsigned int valid_rows,unsigned int valid_cols,void * working_space) const212    void execute_tile(
213      unsigned int n_channels,
214      const TIn *inptr, size_t ld_in_matrix,
215      const TIn *bias,
216      TOut *outptr, size_t ld_out_row, size_t ld_out_col,
217      TOut activation_min, TOut activation_max,
218      unsigned int valid_rows, unsigned int valid_cols,
219      void *working_space
220    ) const override final
221    {
222      // Get copies of the output tensor parameters
223      auto kernel_outptr = outptr;
224      auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col;
225  
226      // If there's padding on either the left or the right, then we execute the
227      // kernel into the output buffer and then perform a copy.
228      if (valid_rows < this->get_output_rows() ||
229          valid_cols < this->get_output_cols())
230      {
231        // Override the kernel output parameters
232        kernel_outptr = reinterpret_cast<TOut *>(working_space);
233        kernel_ld_out_col = n_channels;
234        kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols();
235      }
236  
237      // Execute the kernel
238      m_kernel(
239        n_channels,
240        inptr, ld_in_matrix,
241        bias,
242        kernel_outptr, kernel_ld_out_row, kernel_ld_out_col,
243        activation_min, activation_max
244      );
245  
246      // If necessary, copy from the working space into the destination tensor.
247      if (valid_rows < this->get_output_rows() ||
248          valid_cols < this->get_output_cols())
249      {
250        const auto last_row = std::min(valid_rows, this->get_output_rows());
251        const auto last_col = std::min(valid_cols, this->get_output_cols());
252  
253        for (auto i = 0u; i < last_row; i++)
254        {
255          auto patch_tile = kernel_outptr;
256          auto out_tile = outptr;
257          kernel_outptr += kernel_ld_out_row;
258          outptr += ld_out_row;
259  
260          for (auto j = 0u; j < last_col; j++)
261          {
262            memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels);
263            patch_tile += kernel_ld_out_col;
264            out_tile += ld_out_col;
265          }
266        }
267      }
268    }
269  
270    public:
TransformUnpadded(const std::string & name,unsigned int output_rows,unsigned int output_cols,unsigned int kernel_rows,unsigned int kernel_cols,const Kernel kernel)271    TransformUnpadded(const std::string &name,
272                      unsigned int output_rows, unsigned int output_cols,
273                      unsigned int kernel_rows, unsigned int kernel_cols,
274                      const Kernel kernel)
275    : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
276      m_kernel(kernel)
277    {
278    }
279  
280    /* Utility method to get a transposed variant of a kernel, this transposed
281     * version simply calls the original kernel with the output row and column
282     * strides swapped.
283     */
get_transposed_kernel(const Kernel & kernel)284    static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
285    {
286      return [kernel] (
287        const unsigned int n_channels,
288        const TIn *const inptr, const size_t ld_in_matrix,
289        const TIn *const bias,
290        TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
291        const TOut activation_min, const TOut activation_max
292      ) {
293        kernel(n_channels, inptr, ld_in_matrix, bias,
294               outptr, ld_out_col, ld_out_row,
295               activation_min, activation_max);
296      };
297    }
298  };

}  // namespace output_transform
}  // namespace winograd
}  // namespace arm_conv
303