xref: /aosp_15_r20/external/pytorch/test/scripts/run_cuda_memcheck.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #!/usr/bin/env python3
2 
3 """This script runs cuda-memcheck on the specified unit test. Each test case
4 is run in its isolated process with a timeout so that:
5 1) different test cases won't influence each other, and
6 2) in case of hang, the script would still finish in a finite amount of time.
7 The output will be written to a log file result.log
8 
9 Example usage:
10     python run_cuda_memcheck.py ../test_torch.py 600
11 
12 Note that running cuda-memcheck could be very slow.
13 """
14 
15 import argparse
16 import asyncio
17 import multiprocessing
18 import os
19 import subprocess
20 import sys
21 
22 import cuda_memcheck_common as cmc
23 import tqdm
24 
25 import torch
26 
27 
# Master list of discovered test names (filled in by the discovery step
# below) and the number of CUDA devices visible to this process.
ALL_TESTS = []
GPUS = torch.cuda.device_count()

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    # NOTE: adjacent string literals are concatenated by the parser; the
    # trailing space keeps "because" and "cublas" from running together
    # in the rendered --help text.
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
    "cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    # Trailing space added so "error" and "to stdout" do not run together.
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
args = parser.parse_args()
71 
72 
# Filters that ignores cublas/cudnn errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Return True iff every error in the cuda-memcheck *output* comes from
    cublas/cudnn/cufft, i.e. the whole report may be ignored.

    Returns False when the output cannot be parsed, so unparseable reports
    are never silently ignored.
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # in case the simple parser fails parsing the output of cuda memcheck
        # then this error is never ignored.
        return False

    def _from_ignored_lib(error):
        # Join the stack once (the original joined it once per library name).
        stack = "".join(error.stack)
        return any(lib in stack for lib in ("libcublas", "libcudnn", "libcufft"))

    count_ignored_errors = sum(1 for e in report.errors if _from_ignored_lib(e))
    return count_ignored_errors == report.num_errors
91 
92 
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
completed = subprocess.run(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
for raw_line in completed.stdout.decode().strip().splitlines():
    # Only lines describing a collected test case mention its fixtures.
    if "(fixtures used:" not in raw_line:
        continue
    # First token looks like "path/to/file.py::TestClass::test_name";
    # keep everything after the first "::" and dot-separate the rest.
    token = raw_line.strip().split()[0]
    token = token[token.find("::") + 2 :]
    ALL_TESTS.append(token.replace("::", "."))
113 
114 
# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    """Case-insensitively decide whether *name* denotes a CPU-only test:
    it mentions "cpu" but never "cuda"."""
    lowered = name.lower()
    if "cuda" in lowered:
        return False
    return "cpu" in lowered
120 
121 
# Drop CPU-only tests; cuda-memcheck is only interesting for CUDA tests.
ALL_TESTS = [t for t in ALL_TESTS if not is_cpu_only(t)]

# Split all tests into chunks, and run only the chunk selected by --rank.
# Sorting first makes the split deterministic across invocations.
ALL_TESTS.sort()
chunk_size = -(-len(ALL_TESTS) // args.split)  # ceiling division
start = args.rank * chunk_size
end = start + chunk_size
ALL_TESTS = ALL_TESTS[start:end]
130 
# Run tests:
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done by using the coroutine feature in new Python versions.  A number of coroutines are created;
# they create subprocesses and awaiting them to finish. The number of running subprocesses could be
# specified by the user and by default is the same as the number of CPUs in the machine.
# These subprocesses are balanced across different GPUs on the system by assigning one devices per process,
# or as specified by the user
# Index of the next test to claim; shared by all worker coroutines in run1().
progress = 0
if not args.ci:
    # Local run: write results to result.log and show an interactive bar.
    # The file handle is kept open for the whole run and closed implicitly
    # at interpreter exit.
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
else:
    # CI run: print results straight to stdout so they appear in the CI log.
    logfile = sys.stdout

    # create a fake progress bar that does not display anything
    class ProgressbarStub:
        # Mirrors the tqdm `update(n)` API as a no-op.
        def update(self, *args):
            return

    progressbar = ProgressbarStub()
151 
152 
async def run1(coroutine_id):
    """Worker coroutine: repeatedly claim the next undone test and run it
    under cuda-memcheck in an isolated subprocess until all tests are taken.

    ``coroutine_id`` selects which GPU(s) this worker's subprocesses see via
    CUDA_VISIBLE_DEVICES. Results are written to the module-level ``logfile``;
    in ``--ci`` mode a failure (or a hang, unless ``--nohang``) terminates the
    whole script via ``sys.exit``.
    """
    global progress

    # Pick the GPU assignment for this worker: round-robin over all GPUs,
    # or the user-supplied ":"-separated per-process list.
    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(
            gpu_assignments
        ), "Please specify GPU assignment for each process, separated by :"
        gpuid = gpu_assignments[coroutine_id]

    # The shared `progress` counter is a safe work queue here because there is
    # no `await` between reading it and incrementing it (single-threaded loop).
    while progress < len(ALL_TESTS):
        test = ALL_TESTS[progress]
        progress += 1
        cmd = f"CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            proc.kill()
            # Reap the killed child so it does not linger as a zombie and its
            # transport is cleaned up before the next subprocess is spawned.
            await proc.wait()
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                # Hide reports consisting purely of cublas/cudnn/cufft noise
                # unless --strict was requested.
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        del proc
        progressbar.update(1)
196 
197 
async def main():
    """Spawn ``args.nproc`` worker coroutines and wait for each in turn."""
    pending = []
    for worker_id in range(args.nproc):
        pending.append(asyncio.ensure_future(run1(worker_id)))
    for worker in pending:
        await worker
202 
203 
if __name__ == "__main__":
    # asyncio.get_event_loop()/run_until_complete is the deprecated pattern
    # (get_event_loop emits a DeprecationWarning when no loop is running since
    # Python 3.10); asyncio.run creates and properly closes a fresh loop.
    asyncio.run(main())
207