/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_

#include <vector>

#include "absl/base/const_init.h"
#include "absl/base/thread_annotations.h"
#include "absl/container/flat_hash_map.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/gpu_asm_opts.h"
#include "tensorflow/compiler/xla/stream_executor/kernel.h"
#include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
#include "tensorflow/compiler/xla/stream_executor/platform/port.h"
#include "tensorflow/compiler/xla/stream_executor/stream_executor_pimpl.h"
#if GOOGLE_CUDA
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.h"
#endif  // GOOGLE_CUDA

namespace stream_executor {
namespace gpu {
class GpuContext;
}  // namespace gpu

// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
// capability of the device associated with 'device_ordinal'.
//
// 'options' is used to query for the CUDA location in case it was customized
// via a flag, and to control ptxas optimizations.
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options);
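//
// A minimal usage sketch (assumes a valid device ordinal; 'kPtx' is a
// hypothetical null-terminated PTX string):
//
//   GpuAsmOpts opts;
//   port::StatusOr<std::vector<uint8>> cubin =
//       CompileGpuAsm(/*device_ordinal=*/0, kPtx, opts);
//   if (cubin.ok()) {
//     // cubin->data() / cubin->size() hold the compiled machine code.
//   }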

// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
// capability given by 'cc_major' and 'cc_minor'.
//
// 'options' is used to query for the CUDA location in case it was customized
// via a flag, and to control ptxas optimizations.
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options);
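//
// For example, to target sm_80 regardless of the local device (reusing the
// hypothetical 'kPtx' and 'opts' from the sketch above):
//
//   auto cubin = CompileGpuAsm(/*cc_major=*/8, /*cc_minor=*/0, kPtx, opts);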

// Same as CompileGpuAsm, but caches the result, and returns an unowned view
// of the compiled binary.
//
// A copy of the string provided in 'ptx' will be made.
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options);
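//
// A sketch of repeated use (the second call is served from the cache; the
// returned span stays valid because the cache owns the bytes):
//
//   auto binary = CompileGpuAsmOrGetCached(/*device_ordinal=*/0, kPtx, opts);
//   auto again = CompileGpuAsmOrGetCached(/*device_ordinal=*/0, kPtx, opts);
//   // For identical inputs, both spans view the same cached compilation.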

struct CubinOrPTXImage {
  std::string profile;
  std::vector<uint8> bytes;
};

// Bundles the GPU machine code (cubins) and PTX if requested and returns the
// resulting binary (i.e. a fatbin) as a byte array.
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<CubinOrPTXImage> images, GpuAsmOpts options);
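//
// A sketch of bundling one cubin and its PTX into a fatbin. The profile
// strings follow the fatbinary convention (e.g. "sm_80" for a cubin,
// "compute_80" for PTX); 'cubin_bytes' and 'ptx_bytes' are hypothetical:
//
//   std::vector<CubinOrPTXImage> images = {
//       {"sm_80", cubin_bytes},
//       {"compute_80", ptx_bytes},
//   };
//   auto fatbin = BundleGpuAsm(std::move(images), opts);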

struct HsacoImage {
  std::string gfx_arch;
  std::vector<uint8> bytes;
};

// Bundles the GPU machine code (HSA Code Objects) and returns the resulting
// binary (i.e. a fatbin) as a byte array.
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir);
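//
// A sketch for the ROCm path ('gfx_arch' names such as "gfx908" follow AMD's
// target-ID convention; 'hsaco_bytes' and the root path are hypothetical):
//
//   std::vector<HsacoImage> images = {{"gfx908", hsaco_bytes}};
//   auto fatbin = BundleGpuAsm(std::move(images), "/opt/rocm");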

// Links multiple relocatable GPU images (e.g. results of ptxas -c) into a
// single image.
port::StatusOr<std::vector<uint8>> LinkGpuAsm(
    gpu::GpuContext* context, std::vector<CubinOrPTXImage> images);
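//
// A sketch of linking two relocatable cubins, each produced by 'ptxas -c'
// ('ctx', 'a_bytes', and 'b_bytes' are hypothetical):
//
//   auto linked = LinkGpuAsm(ctx, {{"sm_80", a_bytes}, {"sm_80", b_bytes}});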

#if GOOGLE_CUDA
// Maintains a process-wide cache of pointers to loaded kernels, keyed by the
// current CUDA context, the kernel name, and the PTX text. Loads the kernel
// on a cache miss.
template <typename... Args>
port::StatusOr<std::shared_ptr<TypedKernel<Args...>>> LoadKernelOrGetPtr(
    StreamExecutor* executor, absl::string_view kernel_name,
    absl::string_view ptx, absl::Span<const uint8> cubin_data) {
  using KernelPtrCacheKey =
      std::tuple<CUcontext, absl::string_view, absl::string_view>;

  static absl::Mutex kernel_ptr_cache_mutex(absl::kConstInit);
  static auto& kernel_ptr_cache ABSL_GUARDED_BY(kernel_ptr_cache_mutex) =
      *new absl::flat_hash_map<KernelPtrCacheKey,
                               std::shared_ptr<TypedKernel<Args...>>>();
  CUcontext current_context = cuda::CurrentContextOrDie();
  KernelPtrCacheKey kernel_ptr_cache_key{current_context, kernel_name, ptx};
  absl::MutexLock lock(&kernel_ptr_cache_mutex);

  auto it = kernel_ptr_cache.find(kernel_ptr_cache_key);
  if (it == kernel_ptr_cache.end()) {
    // Cache miss: load and type the kernel, then remember it for this
    // (context, name, ptx) key.
    TF_ASSIGN_OR_RETURN(
        std::shared_ptr<TypedKernel<Args...>> loaded,
        executor->CreateTypedKernel<Args...>(kernel_name, ptx, cubin_data));
    it =
        kernel_ptr_cache.emplace(kernel_ptr_cache_key, std::move(loaded)).first;
  }

  CHECK(it != kernel_ptr_cache.end());
  return it->second;
}
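
// A usage sketch (the kernel name "add_one", its signature, 'kPtx', and
// 'executor' are hypothetical; an empty span means no cubin is supplied):
//
//   TF_ASSIGN_OR_RETURN(
//       std::shared_ptr<TypedKernel<DeviceMemory<float>>> kernel,
//       (LoadKernelOrGetPtr<DeviceMemory<float>>(executor, "add_one", kPtx,
//                                                /*cubin_data=*/{})));
//   // Subsequent calls with the same context, name, and PTX return the
//   // cached pointer instead of reloading the kernel.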
#endif  // GOOGLE_CUDA

}  // namespace stream_executor

#endif  // TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_