#!/usr/bin/env python3 from __future__ import annotations import datetime import json import signal import time from typing import Any import psutil # type: ignore[import] def get_processes_running_python_tests() -> list[Any]: python_processes = [] for process in psutil.process_iter(): try: if "python" in process.name() and process.cmdline(): python_processes.append(process) except (psutil.NoSuchProcess, psutil.AccessDenied): # access denied or the process died pass return python_processes def get_per_process_cpu_info() -> list[dict[str, Any]]: processes = get_processes_running_python_tests() per_process_info = [] for p in processes: info = { "pid": p.pid, "cmd": " ".join(p.cmdline()), "cpu_percent": p.cpu_percent(), "rss_memory": p.memory_info().rss, } # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info # requires higher user privileges and could throw AccessDenied error, i.e. mac try: memory_full_info = p.memory_full_info() info["uss_memory"] = memory_full_info.uss if "pss" in memory_full_info: # only availiable in linux info["pss_memory"] = memory_full_info.pss except psutil.AccessDenied as e: # It's ok to skip this pass per_process_info.append(info) return per_process_info def get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]: processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) per_process_info = [] for p in processes: info = {"pid": p.pid, "gpu_memory": p.usedGpuMemory} per_process_info.append(info) return per_process_info def rocm_get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]: processes = amdsmi.amdsmi_get_gpu_process_list(handle) per_process_info = [] for p in processes: try: proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) except AttributeError: # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 # BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi proc_info = p info = { "pid": proc_info["pid"], "gpu_memory": proc_info["memory_usage"]["vram_mem"], } per_process_info.append(info) return per_process_info if __name__ == "__main__": handle = None try: import pynvml # type: ignore[import] try: pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(0) except pynvml.NVMLError: pass except ModuleNotFoundError: # no pynvml avaliable, probably because not cuda pass try: import amdsmi # type: ignore[import] try: amdsmi.amdsmi_init() amdsmi_handle = amdsmi.amdsmi_get_processor_handles()[0] except amdsmi.AmdSmiException: pass except ModuleNotFoundError: # no amdsmi is available pass kill_now = False def exit_gracefully(*args: Any) -> None: global kill_now kill_now = True signal.signal(signal.SIGTERM, exit_gracefully) while not kill_now: try: stats = { "time": datetime.datetime.utcnow().isoformat("T") + "Z", "total_cpu_percent": psutil.cpu_percent(), "per_process_cpu_info": get_per_process_cpu_info(), } if handle is not None: stats["per_process_gpu_info"] = get_per_process_gpu_info(handle) # https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) stats["total_gpu_utilization"] = gpu_utilization.gpu stats["total_gpu_mem_utilization"] = gpu_utilization.memory if amdsmi_handle is not None: stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info( amdsmi_handle ) stats["total_gpu_utilization"] = amdsmi.amdsmi_get_gpu_activity( amdsmi_handle )["gfx_activity"] stats["total_gpu_mem_utilization"] = amdsmi.amdsmi_get_gpu_activity( amdsmi_handle )["umc_activity"] except Exception as e: stats = { "time": datetime.datetime.utcnow().isoformat("T") + "Z", "error": str(e), } finally: print(json.dumps(stats)) time.sleep(1)