xref: /aosp_15_r20/external/pytorch/tools/stats/upload_metrics.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1*da0073e9SAndroid Build Coastguard Workerfrom __future__ import annotations
2*da0073e9SAndroid Build Coastguard Worker
3*da0073e9SAndroid Build Coastguard Workerimport datetime
4*da0073e9SAndroid Build Coastguard Workerimport inspect
5*da0073e9SAndroid Build Coastguard Workerimport os
6*da0073e9SAndroid Build Coastguard Workerimport time
7*da0073e9SAndroid Build Coastguard Workerimport uuid
8*da0073e9SAndroid Build Coastguard Workerfrom decimal import Decimal
9*da0073e9SAndroid Build Coastguard Workerfrom typing import Any
10*da0073e9SAndroid Build Coastguard Workerfrom warnings import warn
11*da0073e9SAndroid Build Coastguard Worker
12*da0073e9SAndroid Build Coastguard Worker
# boto3 is an optional dependency: when it cannot be imported we simply
# skip the upload step. The decision is made once, here, so that callers
# never have to think about whether boto3 is available.
try:
    import boto3  # type: ignore[import]
except ImportError as e:
    EMIT_METRICS = False
    print(f"Unable to import boto3. Will not be emitting metrics.... Reason: {e}")
else:
    EMIT_METRICS = True

# The runner machines may live in a different AWS account than the metrics
# table, so the table is addressed by its full ARN rather than by name.
TORCHCI_METRICS_TABLE_ARN = "arn:aws:dynamodb:us-east-1:308535385114:table/torchci-metrics"
30*da0073e9SAndroid Build Coastguard Worker
31*da0073e9SAndroid Build Coastguard Worker
class EnvVarMetric:
    """
    Describes a metric field whose value is read from an environment variable.

    Attributes are the field name to report (`name`), the environment
    variable to read (`env_var`), whether a missing value is an error
    (`required`), and an optional callable used to convert the raw string.
    """

    name: str
    env_var: str
    required: bool = True
    # Callable applied to the raw env var string; when unset the string is
    # returned unchanged.
    type_conversion_fn: Any = None

    def __init__(
        self,
        name: str,
        env_var: str,
        required: bool = True,
        type_conversion_fn: Any = None,
    ) -> None:
        self.name, self.env_var = name, env_var
        self.required = required
        self.type_conversion_fn = type_conversion_fn

    def value(self) -> Any:
        """
        Read the environment variable and return its (optionally converted) value.

        Returns None when the variable is unset or empty and the metric is
        optional. Raises ValueError when it is unset or empty and required.
        """
        raw = os.environ.get(self.env_var)

        # Github CI will set some env vars to an empty string, so an empty
        # value is treated exactly like a missing one.
        is_unset = raw is None or raw == ""
        if is_unset and self.required:
            raise ValueError(
                f"Missing {self.name}. Please set the {self.env_var} "
                "environment variable to pass in this value."
            )
        if is_unset:
            return None

        convert = self.type_conversion_fn
        return convert(raw) if convert else raw
68*da0073e9SAndroid Build Coastguard Worker
69*da0073e9SAndroid Build Coastguard Worker
# Process-wide stats that are merged into every metric emitted by this process.
global_metrics: dict[str, Any] = dict()


def add_global_metric(metric_name: str, metric_value: Any) -> None:
    """
    Register a stat to be emitted alongside every metric from the current process.

    Registering the same name again replaces the earlier value. If a call to
    emit_metric supplies a metric with the same name, the value given to
    emit_metric takes precedence over the one stored here.
    """
    global_metrics.update({metric_name: metric_value})
80*da0073e9SAndroid Build Coastguard Worker
81*da0073e9SAndroid Build Coastguard Worker
def emit_metric(
    metric_name: str,
    metrics: dict[str, Any],
) -> None:
    """
    Upload a metric to DynamoDB (and from there, Rockset).

    Even if EMIT_METRICS is set to False, this function will still run the code to
    validate and shape the metrics, skipping just the upload.

    Parameters:
        metric_name:
            Name of the metric. Every unique metric should have a different name
            and be emitted just once per run attempt.
            Metrics are namespaced by their module and the function that emitted them.
        metrics: The actual data to record. Global metrics registered for the
            process are merged in; values given here win on duplicate names.

    Some default values are populated from environment variables. The required
    ones are GITHUB_REPOSITORY, GITHUB_WORKFLOW, GITHUB_JOB, GITHUB_RUN_ID,
    GITHUB_RUN_NUMBER, GITHUB_RUN_ATTEMPT, JOB_ID and JOB_NAME; the optional
    ones are BUILD_ENVIRONMENT, TEST_CONFIG and PR_NUMBER. If a required
    variable is missing or empty (or a value fails its type conversion with a
    ValueError), a warning is issued and nothing is emitted.

    Raises:
        ValueError: if `metrics` is None, or if the merged metrics use any key
            that is reserved for the automatically populated fields.
    """

    if metrics is None:
        raise ValueError("You didn't ask to upload any metrics!")

    # Merge the given metrics with the global metrics, overwriting any duplicates
    # with the given metrics.
    metrics = {**global_metrics, **metrics}

    # We use these env vars to determine basic info about the workflow run.
    # By using env vars, we don't have to pass this info around to every function.
    # It also helps ensure that we only emit metrics during CI
    env_var_metrics = [
        EnvVarMetric("repo", "GITHUB_REPOSITORY"),
        EnvVarMetric("workflow", "GITHUB_WORKFLOW"),
        EnvVarMetric("build_environment", "BUILD_ENVIRONMENT", required=False),
        EnvVarMetric("job", "GITHUB_JOB"),
        EnvVarMetric("test_config", "TEST_CONFIG", required=False),
        EnvVarMetric("pr_number", "PR_NUMBER", required=False, type_conversion_fn=int),
        EnvVarMetric("run_id", "GITHUB_RUN_ID", type_conversion_fn=int),
        EnvVarMetric("run_number", "GITHUB_RUN_NUMBER", type_conversion_fn=int),
        EnvVarMetric("run_attempt", "GITHUB_RUN_ATTEMPT", type_conversion_fn=int),
        EnvVarMetric("job_id", "JOB_ID", type_conversion_fn=int),
        EnvVarMetric("job_name", "JOB_NAME"),
    ]

    # Use info about the function that invoked this one as a namespace and a way to filter metrics.
    calling_frame = inspect.currentframe().f_back  # type: ignore[union-attr]
    calling_frame_info = inspect.getframeinfo(calling_frame)  # type: ignore[arg-type]
    calling_file = os.path.basename(calling_frame_info.filename)
    # inspect.getmodule() returns None when it cannot determine the module;
    # fall back to the frame's own globals instead of raising AttributeError.
    module = inspect.getmodule(calling_frame)
    if module is not None:
        calling_module = module.__name__
    else:
        calling_module = calling_frame.f_globals.get("__name__", "<unknown>")  # type: ignore[union-attr]
    calling_function = calling_frame_info.function

    # Read every env var exactly once. A missing required variable raises
    # ValueError, which turns the whole call into a warned no-op.
    try:
        env_values: dict[str, Any] = {}
        for env_metric in env_var_metrics:
            env_value = env_metric.value()
            # Falsy values (unset optional vars, but also e.g. 0) are left out,
            # matching the behavior callers already rely on.
            if env_value:
                env_values[env_metric.name] = env_value
    except ValueError as e:
        warn(f"Not emitting metrics for {metric_name}. {e}")
        return

    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # format has no offset directive, so the rendered string is unchanged.
    utc_now = datetime.datetime.now(datetime.timezone.utc)

    reserved_metrics = {
        "metric_name": metric_name,
        "calling_file": calling_file,
        "calling_module": calling_module,
        "calling_function": calling_function,
        "timestamp": utc_now.strftime("%Y-%m-%d %H:%M:%S.%f"),
        **env_values,
    }

    # Prefix key with metric name and timestamp to derisk chance of a uuid1 name collision
    reserved_metrics[
        "dynamo_key"
    ] = f"{metric_name}_{int(time.time())}_{uuid.uuid1().hex}"

    # Ensure the metrics dict doesn't contain any reserved keys, and report
    # every offending key by its full name.
    used_reserved_keys = [str(k) for k in metrics if k in reserved_metrics]
    if used_reserved_keys:
        raise ValueError(
            f"Metrics dict contains reserved keys: [{', '.join(used_reserved_keys)}]"
        )

    # boto3 doesn't support uploading float values to DynamoDB, so convert them all to decimals.
    metrics = _convert_float_values_to_decimals(metrics)

    if EMIT_METRICS:
        try:
            session = boto3.Session(region_name="us-east-1")
            session.resource("dynamodb").Table(TORCHCI_METRICS_TABLE_ARN).put_item(
                Item={
                    **reserved_metrics,
                    **metrics,
                }
            )
        except Exception as e:
            # We don't want to fail the job if we can't upload the metric.
            # We still raise the ValueErrors outside this try block since those indicate improperly configured metrics
            warn(f"Error uploading metric {metric_name} to DynamoDB: {e}")
            return
    else:
        print(f"Not emitting metrics for {metric_name}. Boto wasn't imported.")
177*da0073e9SAndroid Build Coastguard Worker
178*da0073e9SAndroid Build Coastguard Worker
179*da0073e9SAndroid Build Coastguard Workerdef _convert_float_values_to_decimals(data: dict[str, Any]) -> dict[str, Any]:
180*da0073e9SAndroid Build Coastguard Worker    # Attempt to recurse
181*da0073e9SAndroid Build Coastguard Worker    def _helper(o: Any) -> Any:
182*da0073e9SAndroid Build Coastguard Worker        if isinstance(o, float):
183*da0073e9SAndroid Build Coastguard Worker            return Decimal(str(o))
184*da0073e9SAndroid Build Coastguard Worker        if isinstance(o, list):
185*da0073e9SAndroid Build Coastguard Worker            return [_helper(v) for v in o]
186*da0073e9SAndroid Build Coastguard Worker        if isinstance(o, dict):
187*da0073e9SAndroid Build Coastguard Worker            return {_helper(k): _helper(v) for k, v in o.items()}
188*da0073e9SAndroid Build Coastguard Worker        if isinstance(o, tuple):
189*da0073e9SAndroid Build Coastguard Worker            return tuple(_helper(v) for v in o)
190*da0073e9SAndroid Build Coastguard Worker        return o
191*da0073e9SAndroid Build Coastguard Worker
192*da0073e9SAndroid Build Coastguard Worker    return {k: _helper(v) for k, v in data.items()}
193