1*c217d954SCole Faust /*
2*c217d954SCole Faust * Copyright (c) 2017-2019, 2021 Arm Limited.
3*c217d954SCole Faust *
4*c217d954SCole Faust * SPDX-License-Identifier: MIT
5*c217d954SCole Faust *
6*c217d954SCole Faust * Permission is hereby granted, free of charge, to any person obtaining a copy
7*c217d954SCole Faust * of this software and associated documentation files (the "Software"), to
8*c217d954SCole Faust * deal in the Software without restriction, including without limitation the
9*c217d954SCole Faust * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10*c217d954SCole Faust * sell copies of the Software, and to permit persons to whom the Software is
11*c217d954SCole Faust * furnished to do so, subject to the following conditions:
12*c217d954SCole Faust *
13*c217d954SCole Faust * The above copyright notice and this permission notice shall be included in all
14*c217d954SCole Faust * copies or substantial portions of the Software.
15*c217d954SCole Faust *
16*c217d954SCole Faust * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*c217d954SCole Faust * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*c217d954SCole Faust * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19*c217d954SCole Faust * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*c217d954SCole Faust * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*c217d954SCole Faust * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*c217d954SCole Faust * SOFTWARE.
23*c217d954SCole Faust */
24*c217d954SCole Faust #include "OpenCLTimer.h"
25*c217d954SCole Faust
26*c217d954SCole Faust #include "../Framework.h"
27*c217d954SCole Faust #include "../Utils.h"
28*c217d954SCole Faust
29*c217d954SCole Faust #include "arm_compute/graph/INode.h"
30*c217d954SCole Faust #include "arm_compute/runtime/CL/CLScheduler.h"
31*c217d954SCole Faust
32*c217d954SCole Faust #ifndef ARM_COMPUTE_CL
33*c217d954SCole Faust #error "You can't use OpenCLTimer without OpenCL"
34*c217d954SCole Faust #endif /* ARM_COMPUTE_CL */
35*c217d954SCole Faust
36*c217d954SCole Faust namespace arm_compute
37*c217d954SCole Faust {
38*c217d954SCole Faust namespace test
39*c217d954SCole Faust {
40*c217d954SCole Faust namespace framework
41*c217d954SCole Faust {
42*c217d954SCole Faust template <bool output_timestamps>
id() const43*c217d954SCole Faust std::string OpenCLClock<output_timestamps>::id() const
44*c217d954SCole Faust {
45*c217d954SCole Faust if(output_timestamps)
46*c217d954SCole Faust {
47*c217d954SCole Faust return "OpenCLTimestamps";
48*c217d954SCole Faust }
49*c217d954SCole Faust else
50*c217d954SCole Faust {
51*c217d954SCole Faust return "OpenCLTimer";
52*c217d954SCole Faust }
53*c217d954SCole Faust }
54*c217d954SCole Faust
55*c217d954SCole Faust template <bool output_timestamps>
OpenCLClock(ScaleFactor scale_factor)56*c217d954SCole Faust OpenCLClock<output_timestamps>::OpenCLClock(ScaleFactor scale_factor)
57*c217d954SCole Faust : _kernels(),
58*c217d954SCole Faust _real_function(nullptr),
59*c217d954SCole Faust #ifdef ARM_COMPUTE_GRAPH_ENABLED
60*c217d954SCole Faust _real_graph_function(nullptr),
61*c217d954SCole Faust #endif /* ARM_COMPUTE_GRAPH_ENABLED */
62*c217d954SCole Faust _prefix(),
63*c217d954SCole Faust _timer_enabled(false)
64*c217d954SCole Faust {
65*c217d954SCole Faust auto q = CLScheduler::get().queue();
66*c217d954SCole Faust cl_command_queue_properties props = q.getInfo<CL_QUEUE_PROPERTIES>();
67*c217d954SCole Faust if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
68*c217d954SCole Faust {
69*c217d954SCole Faust CLScheduler::get().set_queue(cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE));
70*c217d954SCole Faust }
71*c217d954SCole Faust
72*c217d954SCole Faust switch(scale_factor)
73*c217d954SCole Faust {
74*c217d954SCole Faust case ScaleFactor::NONE:
75*c217d954SCole Faust _scale_factor = 1.f;
76*c217d954SCole Faust _unit = "ns";
77*c217d954SCole Faust break;
78*c217d954SCole Faust case ScaleFactor::TIME_US:
79*c217d954SCole Faust _scale_factor = 1000.f;
80*c217d954SCole Faust _unit = "us";
81*c217d954SCole Faust break;
82*c217d954SCole Faust case ScaleFactor::TIME_MS:
83*c217d954SCole Faust _scale_factor = 1000000.f;
84*c217d954SCole Faust _unit = "ms";
85*c217d954SCole Faust break;
86*c217d954SCole Faust case ScaleFactor::TIME_S:
87*c217d954SCole Faust _scale_factor = 1000000000.f;
88*c217d954SCole Faust _unit = "s";
89*c217d954SCole Faust break;
90*c217d954SCole Faust default:
91*c217d954SCole Faust ARM_COMPUTE_ERROR("Invalid scale");
92*c217d954SCole Faust }
93*c217d954SCole Faust }
94*c217d954SCole Faust
95*c217d954SCole Faust template <bool output_timestamps>
test_start()96*c217d954SCole Faust void OpenCLClock<output_timestamps>::test_start()
97*c217d954SCole Faust {
98*c217d954SCole Faust // Start intercepting enqueues:
99*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(_real_function != nullptr);
100*c217d954SCole Faust _real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
101*c217d954SCole Faust auto interceptor = [this](
102*c217d954SCole Faust cl_command_queue command_queue,
103*c217d954SCole Faust cl_kernel kernel,
104*c217d954SCole Faust cl_uint work_dim,
105*c217d954SCole Faust const size_t *gwo,
106*c217d954SCole Faust const size_t *gws,
107*c217d954SCole Faust const size_t *lws,
108*c217d954SCole Faust cl_uint num_events_in_wait_list,
109*c217d954SCole Faust const cl_event * event_wait_list,
110*c217d954SCole Faust cl_event * event)
111*c217d954SCole Faust {
112*c217d954SCole Faust if(this->_timer_enabled)
113*c217d954SCole Faust {
114*c217d954SCole Faust kernel_info info;
115*c217d954SCole Faust cl::Kernel cpp_kernel(kernel, true);
116*c217d954SCole Faust std::stringstream ss;
117*c217d954SCole Faust ss << this->_prefix << cpp_kernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
118*c217d954SCole Faust if(gws != nullptr)
119*c217d954SCole Faust {
120*c217d954SCole Faust ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]";
121*c217d954SCole Faust }
122*c217d954SCole Faust if(lws != nullptr)
123*c217d954SCole Faust {
124*c217d954SCole Faust ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]";
125*c217d954SCole Faust }
126*c217d954SCole Faust info.name = ss.str();
127*c217d954SCole Faust cl_event tmp;
128*c217d954SCole Faust cl_int retval = this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
129*c217d954SCole Faust info.event = tmp;
130*c217d954SCole Faust this->_kernels.push_back(std::move(info));
131*c217d954SCole Faust
132*c217d954SCole Faust if(event != nullptr)
133*c217d954SCole Faust {
134*c217d954SCole Faust //return cl_event from the intercepted call
135*c217d954SCole Faust clRetainEvent(tmp);
136*c217d954SCole Faust *event = tmp;
137*c217d954SCole Faust }
138*c217d954SCole Faust return retval;
139*c217d954SCole Faust }
140*c217d954SCole Faust else
141*c217d954SCole Faust {
142*c217d954SCole Faust return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event);
143*c217d954SCole Faust }
144*c217d954SCole Faust };
145*c217d954SCole Faust CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
146*c217d954SCole Faust
147*c217d954SCole Faust #ifdef ARM_COMPUTE_GRAPH_ENABLED
148*c217d954SCole Faust ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
149*c217d954SCole Faust _real_graph_function = graph::TaskExecutor::get().execute_function;
150*c217d954SCole Faust // Start intercepting tasks:
151*c217d954SCole Faust auto task_interceptor = [this](graph::ExecutionTask & task)
152*c217d954SCole Faust {
153*c217d954SCole Faust if(task.node != nullptr && !task.node->name().empty())
154*c217d954SCole Faust {
155*c217d954SCole Faust this->_prefix = task.node->name() + "/";
156*c217d954SCole Faust }
157*c217d954SCole Faust else
158*c217d954SCole Faust {
159*c217d954SCole Faust this->_prefix = "";
160*c217d954SCole Faust }
161*c217d954SCole Faust this->_real_graph_function(task);
162*c217d954SCole Faust this->_prefix = "";
163*c217d954SCole Faust };
164*c217d954SCole Faust graph::TaskExecutor::get().execute_function = task_interceptor;
165*c217d954SCole Faust #endif /* ARM_COMPUTE_GRAPH_ENABLED */
166*c217d954SCole Faust }
167*c217d954SCole Faust
168*c217d954SCole Faust template <bool output_timestamps>
start()169*c217d954SCole Faust void OpenCLClock<output_timestamps>::start()
170*c217d954SCole Faust {
171*c217d954SCole Faust _kernels.clear();
172*c217d954SCole Faust _timer_enabled = true;
173*c217d954SCole Faust }
174*c217d954SCole Faust template <bool output_timestamps>
stop()175*c217d954SCole Faust void OpenCLClock<output_timestamps>::stop()
176*c217d954SCole Faust {
177*c217d954SCole Faust _timer_enabled = false;
178*c217d954SCole Faust }
179*c217d954SCole Faust
180*c217d954SCole Faust template <bool output_timestamps>
test_stop()181*c217d954SCole Faust void OpenCLClock<output_timestamps>::test_stop()
182*c217d954SCole Faust {
183*c217d954SCole Faust // Restore real function
184*c217d954SCole Faust CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function;
185*c217d954SCole Faust _real_function = nullptr;
186*c217d954SCole Faust #ifdef ARM_COMPUTE_GRAPH_ENABLED
187*c217d954SCole Faust graph::TaskExecutor::get().execute_function = _real_graph_function;
188*c217d954SCole Faust _real_graph_function = nullptr;
189*c217d954SCole Faust #endif /* ARM_COMPUTE_GRAPH_ENABLED */
190*c217d954SCole Faust }
191*c217d954SCole Faust
192*c217d954SCole Faust template <bool output_timestamps>
measurements() const193*c217d954SCole Faust Instrument::MeasurementsMap OpenCLClock<output_timestamps>::measurements() const
194*c217d954SCole Faust {
195*c217d954SCole Faust MeasurementsMap measurements;
196*c217d954SCole Faust unsigned int kernel_number = 0;
197*c217d954SCole Faust for(auto const &kernel : _kernels)
198*c217d954SCole Faust {
199*c217d954SCole Faust cl_ulong queued;
200*c217d954SCole Faust cl_ulong flushed;
201*c217d954SCole Faust cl_ulong start;
202*c217d954SCole Faust cl_ulong end;
203*c217d954SCole Faust kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &queued);
204*c217d954SCole Faust kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &flushed);
205*c217d954SCole Faust kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
206*c217d954SCole Faust kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
207*c217d954SCole Faust std::string name = kernel.name + " #" + support::cpp11::to_string(kernel_number++);
208*c217d954SCole Faust
209*c217d954SCole Faust if(output_timestamps)
210*c217d954SCole Faust {
211*c217d954SCole Faust measurements.emplace("[start]" + name, Measurement(start / static_cast<cl_ulong>(_scale_factor), _unit));
212*c217d954SCole Faust measurements.emplace("[queued]" + name, Measurement(queued / static_cast<cl_ulong>(_scale_factor), _unit));
213*c217d954SCole Faust measurements.emplace("[flushed]" + name, Measurement(flushed / static_cast<cl_ulong>(_scale_factor), _unit));
214*c217d954SCole Faust measurements.emplace("[end]" + name, Measurement(end / static_cast<cl_ulong>(_scale_factor), _unit));
215*c217d954SCole Faust }
216*c217d954SCole Faust else
217*c217d954SCole Faust {
218*c217d954SCole Faust measurements.emplace(name, Measurement((end - start) / _scale_factor, _unit));
219*c217d954SCole Faust }
220*c217d954SCole Faust }
221*c217d954SCole Faust
222*c217d954SCole Faust return measurements;
223*c217d954SCole Faust }
224*c217d954SCole Faust
225*c217d954SCole Faust template <bool output_timestamps>
test_measurements() const226*c217d954SCole Faust Instrument::MeasurementsMap OpenCLClock<output_timestamps>::test_measurements() const
227*c217d954SCole Faust {
228*c217d954SCole Faust MeasurementsMap measurements;
229*c217d954SCole Faust
230*c217d954SCole Faust if(output_timestamps)
231*c217d954SCole Faust {
232*c217d954SCole Faust // The OpenCL clock and the wall clock are not in sync, so we use
233*c217d954SCole Faust // this trick to calculate the offset between the two clocks:
234*c217d954SCole Faust ::cl::Event event;
235*c217d954SCole Faust cl_ulong now_gpu;
236*c217d954SCole Faust
237*c217d954SCole Faust // Enqueue retrieve current CPU clock and enqueue a dummy marker
238*c217d954SCole Faust std::chrono::high_resolution_clock::time_point now_cpu = std::chrono::high_resolution_clock::now();
239*c217d954SCole Faust CLScheduler::get().queue().enqueueMarker(&event);
240*c217d954SCole Faust
241*c217d954SCole Faust CLScheduler::get().queue().finish();
242*c217d954SCole Faust //Access the time at which the marker was enqueued:
243*c217d954SCole Faust event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &now_gpu);
244*c217d954SCole Faust
245*c217d954SCole Faust measurements.emplace("Now Wall clock", Measurement(now_cpu.time_since_epoch().count() / 1000, "us"));
246*c217d954SCole Faust measurements.emplace("Now OpenCL", Measurement(now_gpu / static_cast<cl_ulong>(_scale_factor), _unit));
247*c217d954SCole Faust }
248*c217d954SCole Faust
249*c217d954SCole Faust return measurements;
250*c217d954SCole Faust }
251*c217d954SCole Faust
252*c217d954SCole Faust } // namespace framework
253*c217d954SCole Faust } // namespace test
254*c217d954SCole Faust } // namespace arm_compute
255*c217d954SCole Faust
256*c217d954SCole Faust template class arm_compute::test::framework::OpenCLClock<true>;
257*c217d954SCole Faust template class arm_compute::test::framework::OpenCLClock<false>;
258