#!/usr/bin/env python3

"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of hang, the script would still finish in a finite amount of time.
The output will be written to a log file result.log

Example usage:
    python run_cuda_memcheck.py ../test_torch.py 600

Note that running cuda-memcheck could be very slow.
"""

import argparse
import asyncio
import multiprocessing
import os
import subprocess
import sys

import cuda_memcheck_common as cmc
import tqdm

import torch


ALL_TESTS = []
GPUS = torch.cuda.device_count()

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    # NOTE: adjacent string literals concatenate; the trailing space before the
    # join is required, otherwise the help text runs words together.
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
    "cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
args = parser.parse_args()


# Filters that ignores cublas/cudnn errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Return True iff every error reported in `output` (raw cuda-memcheck
    text) originates from cublas/cudnn/cufft frames, i.e. the whole report
    can be ignored unless --strict was given.
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # in case the simple parser fails parsing the output of cuda memcheck
        # then this error is never ignored.
        return False
    count_ignored_errors = 0
    for e in report.errors:
        # join the stack once instead of once per substring test
        stack = "".join(e.stack)
        if "libcublas" in stack or "libcudnn" in stack or "libcufft" in stack:
            count_ignored_errors += 1
    return count_ignored_errors == report.num_errors


# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

# Discover tests:
# To get a list of tests, run:
#   pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.Popen(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
stdout, stderr = proc.communicate()
lines = stdout.decode().strip().splitlines()
for line in lines:
    if "(fixtures used:" in line:
        # "TestFoo::test_bar (fixtures used: ...)" -> "TestFoo.test_bar"
        line = line.strip().split()[0]
        line = line[line.find("::") + 2 :]
        line = line.replace("::", ".")
        ALL_TESTS.append(line)


# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    """Heuristic: a test is CPU-only if its name mentions cpu but not cuda."""
    name = name.lower()
    return ("cpu" in name) and "cuda" not in name


ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

# Split all tests into chunks, and only run the selected chunk
ALL_TESTS.sort()
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
start = chunk_size * args.rank
end = chunk_size * (args.rank + 1)
ALL_TESTS = ALL_TESTS[start:end]

# Run tests:
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done by using the coroutine feature in new Python versions. A number of coroutines are created;
# they create subprocesses and awaiting them to finish. The number of running subprocesses could be
# specified by the user and by default is the same as the number of CPUs in the machine.
# These subprocesses are balanced across different GPUs on the system by assigning one devices per process,
# or as specified by the user
progress = 0
if not args.ci:
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
else:
    logfile = sys.stdout

    # create a fake progress bar that does not display anything
    class ProgressbarStub:
        def update(self, *args):
            return

    progressbar = ProgressbarStub()


async def run1(coroutine_id):
    """Worker coroutine: pull tests off the shared ALL_TESTS queue until it is
    exhausted, running each test under cuda-memcheck in its own subprocess
    with a timeout. `coroutine_id` selects which GPU this worker uses.
    """
    global progress

    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(
            gpu_assignments
        ), "Please specify GPU assignment for each process, separated by :"
        gpuid = gpu_assignments[coroutine_id]

    while progress < len(ALL_TESTS):
        # No lock needed: asyncio is single-threaded and there is no await
        # between reading and incrementing `progress`.
        test = ALL_TESTS[progress]
        progress += 1
        cmd = f"CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            proc.kill()
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        del proc
        progressbar.update(1)


async def main():
    """Fan out args.nproc workers and wait for all of them to finish."""
    tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
    for t in tasks:
        await t


if __name__ == "__main__":
    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete()
    asyncio.run(main())
    # flush and release result.log (but never close sys.stdout in --ci mode)
    if logfile is not sys.stdout:
        logfile.close()