xref: /aosp_15_r20/external/pytorch/torch/_functorch/benchmark_utils.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1# mypy: ignore-errors
2
3import contextlib
4import json
5import operator
6import os
7import time
8
9import torch
10from torch.profiler import profile, ProfilerActivity
11
12
def synchronize():
    """Default device barrier: does nothing.

    dump_chrome_trace() rebinds this module-level name to
    torch.cuda.synchronize when a non-CPU device is requested and CUDA is
    available; until then calling it is a no-op.
    """
    return None
15
16
def dump_chrome_trace(
    f,
    input,
    trace_filename,
    optimize_ctx,
    activities,
    num_runs=1,
    devices=None,
    kwargs_for_f=None,
    kwargs_for_profiler=None,
):
    """
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    The function is executed in three phases: 5 untimed warm-up runs, [num_runs]
    timed runs without the profiler, and [num_runs] more runs under
    torch.profiler whose trace is exported to [trace_filename].

    Args:
        f: callable to profile, invoked as f(input, **kwargs_for_f)

        input: first positional argument passed to :attr:`f`

        trace_filename: path the chrome trace is exported to

        optimize_ctx: context manager the runs are wrapped in. It is entered once
            for the warm-up/timed phase and again for the profiled phase, so it
            has to support being entered more than once.

        activities: activities that the profiler will record,
            e.g. [ProfilerActivity.CUDA]

        num_runs: number of timed runs and of profiled runs, default to 1

        devices: a device name or a list/tuple of device names, default to
            ["cuda"]. Only the exact value ["cpu"] (or "cpu" / ("cpu",)) skips
            CUDA synchronization.

        kwargs_for_f: extra keyword arguments forwarded to :attr:`f`

        kwargs_for_profiler: extra keyword arguments forwarded to
            torch.profiler.profile

    Return:
        float: wall-clock seconds taken by the [num_runs] timed runs, measured
        without the profiler
    """

    if devices is None:
        devices = ["cuda"]
    elif isinstance(devices, str):
        # A bare string such as "cpu" would never compare equal to ["cpu"]
        # below, so wrap it to make the CPU-only check behave as intended.
        devices = [devices]
    elif isinstance(devices, tuple):
        devices = list(devices)

    # NOTE: this rebinds the module-level `synchronize`; the rebinding stays in
    # effect for later calls, including CPU-only ones.
    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
            synchronize()
        # Re-seed so the timed runs start from the same RNG state as the warm-up
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
            synchronize()
        t1 = time.perf_counter()
    timing = t1 - t0

    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            # Drain pending work before the profiled runs begin
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
                synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing
73
74
def get_chrome_trace_events(filename):
    """
    Load a chrome trace JSON file and return its list of events.

    Args:
        filename: path of the JSON trace file to read

    Return:
        the value stored under the top-level "traceEvents" key

    Raises:
        KeyError: if the JSON document has no "traceEvents" key
    """
    # The context manager closes the handle even when parsing fails; the
    # previous implementation left the file open.
    with open(filename) as trace_file:
        data = json.load(trace_file)
    return data["traceEvents"]
80
81
def is_gpu_compute_event(event):
    """
    Tell whether a trace event is a complete ("ph" == "X") event whose "pid"
    is listed in the module-level gpu_pids.

    Events lacking a "pid" or a "ph" key are rejected.
    """
    if "pid" not in event:
        return False
    if event["pid"] not in gpu_pids:
        return False
    if "ph" not in event:
        return False
    return event["ph"] == "X"
90
91
def get_sorted_gpu_events(events):
    """
    Keep only the events accepted by is_gpu_compute_event() and return them
    as a new list ordered by ascending "ts".
    """
    gpu_events = [ev for ev in events if is_gpu_compute_event(ev)]
    gpu_events.sort(key=operator.itemgetter("ts"))
    return gpu_events
99
100
def get_duration(sorted_gpu_events):
    """
    Sum the time covered by the given events, counting overlapping spans once.

    Each event contributes only the part of [ts, ts + dur] that lies beyond
    the furthest end time seen so far, so the events are expected to be
    ordered by ascending "ts". Returns 0 for an empty sequence.
    """
    if not sorted_gpu_events:
        return 0

    first = sorted_gpu_events[0]
    covered_until = first["ts"] + first["dur"]
    busy = first["dur"]

    for ev in sorted_gpu_events[1:]:
        finish = ev["ts"] + ev["dur"]
        # Only the portion past what is already covered is new busy time
        begin = max(ev["ts"], covered_until)
        busy = busy + max(finish - begin, 0)
        covered_until = max(covered_until, finish)

    return busy
113
114
def get_sorted_gpu_mm_conv_events(events):
    """
    Return the GPU events (as selected and ordered by get_sorted_gpu_events)
    whose "name" contains one of the substrings "gemm", "conv", "cutlass"
    or "wgrad".
    """
    kernel_markers = ("gemm", "conv", "cutlass", "wgrad")

    def is_mm_conv_event(event):
        if "name" not in event:
            return False
        kernel_name = event["name"]
        return any(marker in kernel_name for marker in kernel_markers)

    return [ev for ev in get_sorted_gpu_events(events) if is_mm_conv_event(ev)]
131
132
# "pid" values of the trace processes whose "process_labels" entry mentions
# "GPU". Rebuilt by compute_utilization() on every call and read by
# is_gpu_compute_event().
gpu_pids = []
134
135
def compute_utilization(filename: str, total_length: float):
    """
    Derive GPU utilization figures from a chrome trace written by the pytorch profiler.

    The module-level gpu_pids list is rebuilt from the trace's
    "process_labels" events as a side effect.

    Args:
        filename(str): Name of chrome traces file produced by pytorch profiler

        total_length(float): total length of the process without profiler in second

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution),
        each being busy time divided by total_length
    """
    global gpu_pids

    trace_events = get_chrome_trace_events(filename)

    # Collect the pids of the processes labelled as GPU
    gpu_pids = []
    for ev in trace_events:
        if (
            "name" in ev
            and ev["name"] == "process_labels"
            and "GPU" in ev["args"]["labels"]
        ):
            gpu_pids.append(ev["pid"])

    # Seconds -> microseconds, presumably the unit of "ts"/"dur" in the trace
    window = total_length * 1e6

    all_gpu_events = get_sorted_gpu_events(trace_events)
    utilization = get_duration(all_gpu_events) / window

    mm_conv_events = get_sorted_gpu_mm_conv_events(trace_events)
    mm_conv_utilization = get_duration(mm_conv_events) / window

    return utilization, mm_conv_utilization
168
169
def benchmark_utilization(
    f,
    input,
    trace_folder,
    optimize_ctx=None,
    trace_file_name="tmp_chrome_trace",
    num_runs=1,
):
    """
    Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
    running f(input) with [optimize_ctx] [num_runs] times.
    It will produce a chrome trace file in trace_folder/trace_file_name.json

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace; created
            (with a message printed) when it does not exist yet

        optimize_ctx: the context in which f will run, default to a null context

        trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, default to 1.

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)

    """
    if not os.path.exists(trace_folder):
        # exist_ok tolerates the folder being created by someone else between
        # the existence check and this call.
        os.makedirs(trace_folder, exist_ok=True)
        print("create folder " + trace_folder)

    if optimize_ctx is None:
        optimize_ctx = contextlib.nullcontext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    total_length = dump_chrome_trace(
        f,
        input,
        chrome_trace_file_name,
        optimize_ctx,
        [ProfilerActivity.CUDA],
        num_runs=num_runs,
        # Passed as a list to match dump_chrome_trace's own default of ["cuda"]
        devices=["cuda"],
    )
    utilization, mm_conv_utilization = compute_utilization(
        chrome_trace_file_name, total_length
    )

    return utilization, mm_conv_utilization
232