# mypy: ignore-errors

import contextlib
import json
import operator
import os
import time

import torch
from torch.profiler import profile, ProfilerActivity


def synchronize():
    """Do nothing.

    Placeholder device-synchronization hook. ``dump_chrome_trace`` rebinds this
    module-level name to ``torch.cuda.synchronize`` when a non-CPU device is
    requested and CUDA is available.
    """


def dump_chrome_trace(
    f,
    input,
    trace_filename,
    optimize_ctx,
    activities,
    num_runs=1,
    devices=None,
    kwargs_for_f=None,
    kwargs_for_profiler=None,
):
    """
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    The function is first warmed up (5 runs), then timed without the profiler,
    and finally run again under the profiler to produce the trace.

    Args:
        f: callable to run.
        input: first positional argument passed to ``f``.
        trace_filename: path the chrome trace is exported to.
        optimize_ctx: context manager entered around the runs. It is entered
            twice (timed run and profiled run), so it must be re-enterable.
        activities: activities the profiler will record,
            e.g. ``[ProfilerActivity.CUDA]``.
        num_runs: number of timed/profiled runs, excluding warm-up.
        devices: list of device names; defaults to ``["cuda"]``. A single
            string such as ``"cuda"`` is accepted and treated as a one-element
            list.
        kwargs_for_f: extra keyword arguments for ``f``.
        kwargs_for_profiler: extra keyword arguments for ``torch.profiler.profile``.

    Return:
        Total wall-clock runtime in seconds of the ``num_runs`` runs, measured
        without the profiler.
    """
    if devices is None:
        devices = ["cuda"]
    elif isinstance(devices, str):
        # A bare string would never compare equal to ["cpu"] below.
        devices = [devices]
    else:
        devices = list(devices)

    # NOTE: this rebinds the module-level ``synchronize`` and the rebinding
    # persists after this call returns (kept for backward compatibility).
    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
            synchronize()
        # Re-seed so the timed runs see the same RNG stream as the profiled runs.
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
            synchronize()
        t1 = time.perf_counter()
    timing = t1 - t0

    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
                synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing


def get_chrome_trace_events(filename):
    """Load a chrome trace JSON file and return its ``"traceEvents"`` list."""
    # Use a context manager so the file handle is closed deterministically.
    with open(filename) as trace_file:
        data = json.load(trace_file)
    return data["traceEvents"]


def is_gpu_compute_event(event):
    """Return True if ``event`` belongs to a GPU pid and is a complete ("X") event.

    GPU pids are read from the module-level ``gpu_pids`` list, which is
    populated by ``compute_utilization``.
    """
    return (
        "pid" in event
        and event["pid"] in gpu_pids
        and event.get("ph") == "X"
    )


def get_sorted_gpu_events(events):
    """Return the GPU compute events of ``events`` sorted by start timestamp ("ts")."""
    gpu_events = [event for event in events if is_gpu_compute_event(event)]
    return sorted(gpu_events, key=operator.itemgetter("ts"))


def get_duration(sorted_gpu_events):
    """Return the total time covered by the events, counting overlaps once.

    Args:
        sorted_gpu_events: events sorted by "ts", each with "ts" and "dur".

    Return:
        Sum of the lengths of the union of the ``[ts, ts + dur]`` intervals,
        in the same unit as "ts"/"dur"; 0 for an empty input.
    """
    if not sorted_gpu_events:
        return 0
    first = sorted_gpu_events[0]
    current_end_time = first["ts"] + first["dur"]
    total_duration = first["dur"]
    for event in sorted_gpu_events[1:]:
        # Only the part of this event extending past what was already
        # counted contributes to the total.
        start_time = max(event["ts"], current_end_time)
        end_time = event["ts"] + event["dur"]
        total_duration += max(end_time - start_time, 0)
        current_end_time = max(current_end_time, end_time)
    return total_duration


def get_sorted_gpu_mm_conv_events(events):
    """Return the sorted GPU compute events whose name looks like a matmul/conv kernel.

    An event matches when its "name" contains any of
    "gemm", "conv", "cutlass" or "wgrad".
    """
    mm_conv_keywords = ("gemm", "conv", "cutlass", "wgrad")

    def is_mm_conv_event(event):
        return "name" in event and any(
            keyword in event["name"] for keyword in mm_conv_keywords
        )

    return [
        event for event in get_sorted_gpu_events(events) if is_mm_conv_event(event)
    ]


# pids of the GPU processes found in the most recently analysed trace;
# rebuilt by every call to ``compute_utilization``.
gpu_pids = []


def compute_utilization(filename: str, total_length: float):
    """
    Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
    and percent of times spent on matmul and convolution

    Args:
        filename(str): Name of chrome traces file produced by pytorch profiler

        total_length(float): total length of the process without profiler in second

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)

    Raises:
        ZeroDivisionError: if ``total_length`` is 0.
    """
    events = get_chrome_trace_events(filename)

    # get pids of GPU events
    global gpu_pids
    gpu_pids = [
        event["pid"]
        for event in events
        if event.get("name") == "process_labels"
        # Tolerate label events without "args"/"labels" instead of raising.
        and "GPU" in event.get("args", {}).get("labels", "")
    ]

    # Trace timestamps are compared against the runtime converted from
    # seconds with a factor of 1e6.
    total_length = total_length * 1e6
    sorted_gpu_events = get_sorted_gpu_events(events)
    utilization = get_duration(sorted_gpu_events) / total_length

    sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
    mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length

    return utilization, mm_conv_utilization


def benchmark_utilization(
    f,
    input,
    trace_folder,
    optimize_ctx=None,
    trace_file_name="tmp_chrome_trace",
    num_runs=1,
):
    """
    Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
    running f(input) with [optimize_ctx] [num_runs] times.
    It will produce a chrome trace file in trace_folder/trace_file_name.json

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace

        optimize_ctx: the context in which f will run

        trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, default to 1.

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)

    """
    if not os.path.exists(trace_folder):
        # exist_ok guards against the folder appearing between the check and
        # the creation.
        os.makedirs(trace_folder, exist_ok=True)
        print("create folder " + trace_folder)

    if optimize_ctx is None:
        optimize_ctx = contextlib.nullcontext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    total_length = dump_chrome_trace(
        f,
        input,
        chrome_trace_file_name,
        optimize_ctx,
        [ProfilerActivity.CUDA],
        num_runs=num_runs,
        devices=["cuda"],
    )
    utilization, mm_conv_utilization = compute_utilization(
        chrome_trace_file_name, total_length
    )

    return utilization, mm_conv_utilization