Source code for ptyrad.runtime.diagnostics

"""
System and environment diagnostic reporting.

This module provides utilities to query and log the current hardware 
(CPU, Memory, GPU) and software (OS, Python, dependencies) environment. 
It includes specific support for detecting SLURM cluster allocations and 
identifying NVIDIA Multi-Instance GPU (MIG) configurations.
"""

import logging
import subprocess

from ptyrad.runtime.logging import report

logger = logging.getLogger(__name__)


def is_mig_enabled(*, timeout=30.0):
    """Detect whether any NVIDIA GPU on the system is operating in MIG mode.

    Multi-Instance GPU (MIG) allows a physical GPU to be securely partitioned
    into multiple separate GPU instances. This function queries `nvidia-smi`
    to check if this hardware partitioning is currently active, which is
    important because certain multi-GPU communication backends (like NCCL)
    do not fully support MIG slices.

    Detection is best-effort: every failure is reported through ``report``
    and turned into ``False`` rather than raised to the caller.

    Args:
        timeout (float): Maximum number of seconds to wait for `nvidia-smi`
            to finish. Without a limit, a hung `nvidia-smi` would block the
            caller indefinitely. Defaults to 30 seconds.

    Returns:
        bool: True if MIG mode is enabled on any detected GPU, False if it
        is disabled, or if the detection fails (e.g., `nvidia-smi` not
        found, exits with a non-zero status, or times out).
    """
    try:
        # One output line per GPU, e.g. "Enabled" or "Disabled".
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=mig.mode.current", "--format=csv,noheader"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,
        )

        # A non-zero exit status means the query itself failed.
        if result.returncode != 0:
            report(f"Error running nvidia-smi: {result.stderr.strip()}")
            return False

        # MIG counts as active as soon as a single GPU reports "Enabled".
        return any(
            line.strip() == "Enabled" for line in result.stdout.splitlines()
        )

    except FileNotFoundError:
        # `nvidia-smi` is not available
        report("nvidia-smi not found. Unable to detect MIG mode.")
        return False
    except subprocess.TimeoutExpired:
        # `subprocess.run` has already killed the child process at this point.
        report(
            f"nvidia-smi did not respond within {timeout} s. "
            "Unable to detect MIG mode."
        )
        return False
    except Exception as e:
        # Catch other unexpected errors so diagnostics never abort the caller
        report(f"Error detecting MIG mode: {e}")
        return False