xref: /aosp_15_r20/external/pytorch/test/scripts/cuda_memcheck_common.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
# This file contains a simple parser for the reports
# produced by cuda-memcheck.
3
4
class ParseError(Exception):
    """Raised when the simple parser cannot parse a cuda-memcheck report."""
7
8
class Report:
    """A report is a container of errors, plus a summary of how many errors were found.

    Attributes set by ``__init__``:
        text: the summary line exactly as it was passed in.
        num_errors: the error count read from the summary line (lowered to
            10000 in the truncated-report case handled below).
        errors: the collection of errors that was passed in.
    """

    def __init__(self, text, errors):
        """Build a report from its summary line and the errors collected for it.

        Args:
            text: the summary line, something like ``ERROR SUMMARY: 1 error``
                or ``ERROR SUMMARY: 2 errors``. The third whitespace-separated
                token is read as the error count.
            errors: the errors belonging to this report; ``len(errors)`` is
                checked against the count found in ``text``.

        Raises:
            ParseError: if no integer count can be read from ``text``, or if
                the count does not match ``len(errors)``.
        """
        self.text = text
        try:
            self.num_errors = int(text.strip().split()[2])
        except (IndexError, ValueError) as exc:
            # A summary line with a missing or non-numeric count is a parse
            # failure like any other: surface it as ParseError instead of
            # leaking IndexError/ValueError to callers that catch ParseError.
            raise ParseError("Malformed error summary: %r" % (text,)) from exc
        self.errors = errors
        if len(errors) != self.num_errors:
            if len(errors) == 10000 and self.num_errors > 10000:
                # When there are more than 10k errors, cuda-memcheck only displays 10k
                self.num_errors = 10000
            else:
                raise ParseError("Number of errors does not match")
26
27
class Error:
    """A single error section taken from the output of cuda-memcheck.

    Every error in the report consists of an error message followed by a backtrace, e.g.:

    ========= Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaGetLastError.
    =========     Saved host backtrace up to driver entry point at error
    =========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x38c7b3]
    =========     Host Frame:/usr/local/cuda/lib64/libcudart.so.10.1 (cudaGetLastError + 0x163) [0x4c493]
    =========     Host Frame:/home/xgao/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch.so [0x5b77a05]
    =========     Host Frame:/home/xgao/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch.so [0x39d6d1d]
    =========     .....
    """

    def __init__(self, lines):
        # The first line of the section is the error message itself.
        self.message = lines[0]
        # The second line (the "Saved host backtrace ..." caption in the sample
        # above) is skipped; every line after it becomes one stack entry with
        # its surrounding whitespace removed.
        frames = []
        for entry in lines[2:]:
            frames.append(entry.strip())
        self.stack = frames
45
46
def parse(message):
    """Parse the textual report printed by cuda-memcheck.

    The parser is deliberately simple: it only splits the report into separate errors and a
    summary, and each error is further split into an error message and a backtrace. No further
    details are parsed.

    A report contains multiple errors and a summary of how many errors were detected. It looks like:

    ========= CUDA-MEMCHECK
    ========= Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaPointerGetAttributes.
    =========     Saved host backtrace up to driver entry point at error
    =========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x38c7b3]
    =========     Host Frame:/usr/local/cuda/lib64/libcudart.so.10.1 (cudaPointerGetAttributes + 0x1a9) [0x428b9]
    =========     Host Frame:/home/xgao/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch.so [0x5b778a9]
    =========     .....
    =========
    ========= Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaGetLastError.
    =========     Saved host backtrace up to driver entry point at error
    =========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x38c7b3]
    =========     Host Frame:/usr/local/cuda/lib64/libcudart.so.10.1 (cudaGetLastError + 0x163) [0x4c493]
    =========     .....
    =========
    ========= .....
    =========
    ========= Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaGetLastError.
    =========     Saved host backtrace up to driver entry point at error
    =========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x38c7b3]
    =========     .....
    =========     Host Frame:python (_PyEval_EvalFrameDefault + 0x6a0) [0x1d0ad0]
    =========     Host Frame:python (_PyEval_EvalCodeWithName + 0xbb9) [0x116db9]
    =========
    ========= ERROR SUMMARY: 4 errors

    Returns a Report built from the summary line and the errors collected before it.
    Raises ParseError if the input ends without an "ERROR SUMMARY:" line.
    """
    marker = "========="
    banner = marker + " CUDA-MEMCHECK"
    # Number of characters to drop from a report line: the marker plus one separator.
    skip = len(marker) + 1

    collected = []
    current = []
    banner_seen = False
    collecting = False

    for raw in message.splitlines():
        if raw == banner:
            banner_seen = True
            continue
        # Ignore everything before the banner and any line that is not part of the report.
        if not (banner_seen and raw.startswith(marker)):
            continue
        body = raw[skip:]
        if body.startswith("ERROR SUMMARY:"):
            return Report(body, collected)
        if collecting:
            if body == "":
                # An empty report line closes the error section being collected.
                collected.append(Error(current))
                collecting = False
            else:
                current.append(body)
        else:
            # First line seen outside a section opens a new one.
            current = [body]
            collecting = True
    raise ParseError("No error summary found")
104