{
  "name": "nemo-mbridge-multi-node-slurm",
  "version": "0.1.0",
  "description": "Convert single-node scripts to multi-node Slurm sbatch jobs and debug common multi-node failures. Covers srun-native vs uv run torch.distributed approaches, container setup, NCCL timeouts, OOM sizing for MoE models, and interactive allocation.",
  "keywords": [
    "github-import",
    "NVIDIA",
    "skills"
  ],
  "license": "Apache-2.0",
  "source": {
    "repo": "NVIDIA/skills",
    "ref": "main",
    "path": "skills/nemo-mbridge-multi-node-slurm",
    "url": "https://github.com/NVIDIA/skills/tree/main/skills/nemo-mbridge-multi-node-slurm"
  }
}
