// output_transform.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3) - OpenGrok cross reference for /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/convolution/winograd/output_transform.hpp

/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "src/core/NEON/kernels/assembly/winograd.hpp"

#include "src/core/NEON/kernels/arm_conv/addressing.hpp"

#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>
#include <string>
#include <utility>

namespace arm_conv {
namespace winograd {
namespace output_transform {

/* Driver class for the Winograd output transforms.
 *
 * This provides a base implementation which handles iteration over the output
 * tensor; subclasses are responsible for managing working space and executing
 * the transform on individual tiles.
 *
 * TIn is the element type read from the input (and from the bias), TOut is the
 * element type written to the output tensor. Every `ld_*` stride handled by
 * this class is added directly to a typed pointer, so strides are counted in
 * elements rather than in bytes.
 */
template <typename TIn, typename TOut=TIn>
class TransformBase : public ITransform
{
  const std::string m_name;
  const unsigned int m_output_rows, m_output_cols;
  const unsigned int m_kernel_rows, m_kernel_cols;

  protected:
  /* Number of bytes of scratch memory required by each thread. The default
   * implementation requires none.
   */
  virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
  {
    return 0;
  }

  /* Hook called once per `execute_internal` call with the calling thread's
   * slice of the working space, before any tile is executed. The default
   * implementation does nothing.
   */
  virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
  {
    // Nothing to do
  }

  /* Transform a single tile.
   *
   * `inptr` and `outptr` address the tile being processed; `valid_rows` and
   * `valid_cols` are the number of output rows/columns remaining from the
   * tile's origin to the edge of the output tensor, and so can be either
   * smaller or larger than the tile size. `ld_in_matrix` is forwarded
   * unmodified from the caller of `execute`. `working_space` is the calling
   * thread's slice of the working space.
   */
  virtual void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max,
    unsigned int valid_rows, unsigned int valid_cols,
    void *working_space
  ) const = 0;

  /* Typed implementation of `execute`.
   *
   * Rows of tiles are striped over threads: thread `thread_id` handles rows
   * of tiles `thread_id`, `thread_id + n_threads`, ... of every batch, and
   * within a row visits the tiles from the first column to the last.
   *
   * Tiles of the input are addressed as
   *   inptr + batch*ld_in_batch + (tile_i*n_tile_cols + tile_j)*ld_in_row
   * and the matching output tile starts at
   *   outptr + batch*ld_out_batch + out_i*ld_out_row + out_j*ld_out_col.
   *
   * Every address is computed from indices, in `size_t`, and only for tiles
   * which are actually executed. This avoids both 32-bit intermediate
   * products and the formation of pointers beyond the tensors (e.g., for a
   * thread whose `thread_id` exceeds the number of rows of tiles).
   */
  void execute_internal(
    const ConvolutionArgs &args,
    const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
    const TIn *bias,
    TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const
  {
    // Get the working space for this thread, and initialise it.
    working_space = reinterpret_cast<char *>(working_space) +
                    this->get_working_space_per_thread(args) * thread_id;
    this->initialise_thread_working_space(args, working_space);

    // Get the activation values; these default to -/+ infinity and are then
    // narrowed according to the requested activation.
    auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
    auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
    switch (args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        activation_max = static_cast<TOut>(args.activation.param1);
        // Fall through - a bounded ReLU also has a lower bound of zero
      case arm_gemm::Activation::Type::ReLU:
        activation_min = static_cast<TOut>(0);
        break;
      default:
        break;
    }

    // Determine the number of tiles in a row, we use this to get the right
    // offset into the input data.
    const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();

    // Execute over all batches
    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      const TIn *const inptr_batch = inptr + batch * ld_in_batch;
      TOut *const outptr_batch = outptr + batch * ld_out_batch;

      // Stripe rows of tiles over threads; `tile_i` is the index of the row
      // of tiles whose first output row is `out_i`.
      size_t tile_i = thread_id;
      for (auto out_i = thread_id * this->get_output_rows();
           out_i < args.output_shape.rows;
           out_i += n_threads * this->get_output_rows(), tile_i += n_threads)
      {
        const TIn *const inptr_row = inptr_batch + tile_i * n_tile_cols * ld_in_row;
        TOut *const outptr_row = outptr_batch + static_cast<size_t>(out_i) * ld_out_row;

        // Iterate over all columns; `tile_j` is the index of the tile whose
        // first output column is `out_j`.
        size_t tile_j = 0;
        for (auto out_j = 0u; out_j < args.output_shape.cols;
             out_j += this->get_output_cols(), tile_j++)
        {
          // Execute the tile
          this->execute_tile(
            args.n_output_channels,
            inptr_row + tile_j * ld_in_row, ld_in_matrix,
            bias,
            outptr_row + static_cast<size_t>(out_j) * ld_out_col, ld_out_row, ld_out_col,
            activation_min, activation_max,
            args.output_shape.rows - out_i,  // Number of valid rows remaining
            args.output_shape.cols - out_j,  // Number of valid columns remaining
            working_space
          );
        }
      }
    }
  }

  public:
  /* Construct a transform called `name`, which produces tiles of
   * `output_rows` x `output_cols` for a kernel of `kernel_rows` x
   * `kernel_cols`.
   */
  TransformBase(const std::string &name,
                unsigned int output_rows, unsigned int output_cols,
                unsigned int kernel_rows, unsigned int kernel_cols)
  : m_name(name),
    m_output_rows(output_rows), m_output_cols(output_cols),
    m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
  {
  }

  const std::string &get_name(void) const override { return m_name; }

  // The input tile size is derived as (kernel size + output tile size - 1).
  unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
  unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }

  unsigned int get_output_rows(void) const override final { return m_output_rows; }
  unsigned int get_output_cols(void) const override final { return m_output_cols; }

  unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
  unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }

  /* Total size, in bytes, of the working space needed by `n_threads` threads;
   * each thread uses a slice of `get_working_space_per_thread(args)` bytes.
   */
  size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
  {
    return n_threads * this->get_working_space_per_thread(args);
  }

  /* Type-erased entry point: reinterprets `inptr` and `bias` as `const TIn *`
   * and `outptr` as `TOut *`, then forwards every argument to
   * `execute_internal`.
   */
  void execute(
    const ConvolutionArgs &args,
    const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
    const void *bias,
    void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const override
  {
    execute_internal(
      args,
      reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
      reinterpret_cast<const TIn *>(bias),
      reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
      working_space, thread_id, n_threads
    );
  }
};

/* Output transform built around a kernel which always writes a complete
 * output tile.
 *
 * When a tile overhangs the bottom or right edge of the output tensor the
 * kernel is run into a per-thread staging buffer instead, and only the valid
 * part of the tile is then copied into the output tensor.
 */
template <typename TIn, typename TOut=TIn>
class TransformUnpadded : public TransformBase<TIn, TOut>
{
  // Signature of the wrapped kernel; it receives no information about the
  // number of valid rows or columns.
  using Kernel = std::function<void(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max
  )>;
  const Kernel m_kernel;

  protected:
  /* Each thread needs a staging buffer able to hold one complete output tile
   * for all output channels.
   */
  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
  {
    // We create a buffer the size of the output tile
    const auto n_output_points = this->get_output_rows() * this->get_output_cols();
    return sizeof(TOut) * n_output_points * args.n_output_channels;
  }

  /* Execute the kernel for one tile.
   *
   * Complete tiles are written straight into the output tensor. Tiles with
   * fewer valid rows or columns than the tile size are written into
   * `working_space` (laid out densely as rows x columns x channels) and the
   * valid region is then copied out, `n_channels` elements at a time.
   */
  void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max,
    unsigned int valid_rows, unsigned int valid_cols,
    void *working_space
  ) const override final
  {
    // A tile is partial if it overhangs the bottom or the right edge of the
    // output tensor.
    const bool is_partial_tile = valid_rows < this->get_output_rows() ||
                                 valid_cols < this->get_output_cols();

    if (!is_partial_tile)
    {
      // The whole tile is valid, so the kernel can write into the tensor.
      m_kernel(
        n_channels,
        inptr, ld_in_matrix,
        bias,
        outptr, ld_out_row, ld_out_col,
        activation_min, activation_max
      );
      return;
    }

    // Execute the kernel into the densely packed staging buffer.
    TOut *const staging = static_cast<TOut *>(working_space);
    const size_t staging_ld_col = n_channels;
    const size_t staging_ld_row = staging_ld_col * this->get_output_cols();

    m_kernel(
      n_channels,
      inptr, ld_in_matrix,
      bias,
      staging, staging_ld_row, staging_ld_col,
      activation_min, activation_max
    );

    // Copy the valid region from the staging buffer into the destination
    // tensor. Addresses are computed from indices so that no pointer beyond
    // the copied region is ever formed.
    const auto n_rows = std::min(valid_rows, this->get_output_rows());
    const auto n_cols = std::min(valid_cols, this->get_output_cols());

    for (auto i = 0u; i < n_rows; i++)
    {
      const TOut *const src_row = staging + i * staging_ld_row;
      TOut *const dst_row = outptr + i * ld_out_row;

      for (auto j = 0u; j < n_cols; j++)
      {
        std::memcpy(dst_row + j * ld_out_col,
                    src_row + j * staging_ld_col,
                    sizeof(TOut) * n_channels);
      }
    }
  }

  public:
  /* Construct a transform called `name` with the given output tile and kernel
   * sizes, which uses `kernel` to transform each tile. The kernel is taken by
   * value and moved into place to avoid a second copy of the std::function.
   */
  TransformUnpadded(const std::string &name,
                    unsigned int output_rows, unsigned int output_cols,
                    unsigned int kernel_rows, unsigned int kernel_cols,
                    Kernel kernel)
  : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
    m_kernel(std::move(kernel))
  {
  }

  /* Utility method to get a transposed variant of a kernel, this transposed
   * version simply calls the original kernel with the output row and column
   * strides swapped.
   *
   * The returned std::function holds its own copy of `kernel`. This method is
   * deliberately not `constexpr`: std::function is not a literal type, so the
   * result could never be used in a constant expression.
   */
  static Kernel get_transposed_kernel(const Kernel &kernel)
  {
    return [kernel] (
      const unsigned int n_channels,
      const TIn *const inptr, const size_t ld_in_matrix,
      const TIn *const bias,
      TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
      const TOut activation_min, const TOut activation_max
    ) {
      kernel(n_channels, inptr, ld_in_matrix, bias,
             outptr, ld_out_col, ld_out_row,
             activation_min, activation_max);
    };
  }
};

}  // namespace output_transform
}  // namespace winograd
}  // namespace arm_conv