[
  {
    "id": "nemo-automodel-distributed-training-001-strategy-selection",
    "question": "I am training a 70B LLM on 8 nodes and want tensor plus pipeline parallelism in NeMo AutoModel. Which distributed strategy should I use and what YAML fields matter?",
    "expected_skill": "nemo-automodel-distributed-training",
    "expected_script": null,
    "ground_truth": "The agent routes to nemo-automodel-distributed-training, recommends fsdp2 for large multi-node models that need TP and PP, and explains the distributed YAML keys strategy, tp_size, pp_size, cp_size, ep_size, and pipeline settings. It notes that dp_size is inferred from world_size divided by the product of TP, PP, and CP.",
    "expected_behavior": [
      "Routes to nemo-automodel-distributed-training",
      "Recommends strategy: fsdp2 for TP plus PP",
      "Mentions tp_size and pp_size as the key parallelism controls",
      "Mentions pipeline sub-config fields such as pp_schedule or pp_microbatch_size",
      "Explains that dp_size is inferred from world_size / (tp_size * pp_size * cp_size)",
      "Does not recommend Megatron FSDP for pipeline parallelism"
    ]
  },
  {
    "id": "nemo-automodel-distributed-training-002-moe-expert-parallel",
    "question": "I am training an MoE model in NeMo AutoModel and want expert parallelism across GPUs. What distributed config should I start with?",
    "expected_skill": "nemo-automodel-distributed-training",
    "expected_script": null,
    "ground_truth": "The agent routes to nemo-automodel-distributed-training, recommends FSDP2 with ep_size greater than 1 for MoE expert parallelism, and explains that this creates a separate moe_mesh. It should mention that the moe sub-config maps to MoEParallelizerConfig, that ep_size must divide dp_size times cp_size, and that MegatronFSDP does not support expert parallelism.",
    "expected_behavior": [
      "Routes to nemo-automodel-distributed-training",
      "Recommends fsdp2 with ep_size > 1",
      "Mentions the separate moe_mesh for expert parallelism",
      "Mentions the moe sub-config when relevant",
      "Mentions the ep_size divisibility constraint",
      "States that MegatronFSDP does not support EP",
      "Does not suggest DDP for expert parallelism"
    ]
  },
  {
    "id": "nemo-automodel-distributed-training-003-megatron-limitations",
    "question": "Can I use megatron_fsdp in NeMo AutoModel if I need pipeline parallelism, expert parallelism, or sequence_parallel?",
    "expected_skill": "nemo-automodel-distributed-training",
    "expected_script": null,
    "ground_truth": "The agent routes to nemo-automodel-distributed-training and says no: megatron_fsdp does not support pipeline parallelism, expert parallelism, or sequence_parallel in NeMo AutoModel. It should recommend fsdp2 when PP, EP, TP plus PP, or sequence_parallel is required, and it should mention that DDP is only simple data parallelism.",
    "expected_behavior": [
      "Routes to nemo-automodel-distributed-training",
      "States megatron_fsdp does not support pipeline parallelism",
      "States megatron_fsdp does not support expert parallelism",
      "States megatron_fsdp does not support sequence_parallel",
      "Recommends fsdp2 for PP, EP, or sequence_parallel",
      "Mentions DDP is simple data parallelism only"
    ]
  }
]
