{
  "name": "mcore-run-on-slurm",
  "version": "0.1.0",
  "description": "How to launch distributed Megatron-LM training jobs on a SLURM cluster. Covers a minimal sbatch skeleton, environment-variable setup for torch.distributed.run, CUDA_DEVICE_MAX_CONNECTIONS rules across hardware and parallelism modes, container conventions, monitoring, and per-rank failure diagnosis.",
  "keywords": [
    "github-import",
    "NVIDIA",
    "skills"
  ],
  "license": "Apache-2.0",
  "source": {
    "repo": "NVIDIA/skills",
    "ref": "main",
    "path": "skills/mcore-run-on-slurm",
    "url": "https://github.com/NVIDIA/skills/tree/main/skills/mcore-run-on-slurm"
  }
}