# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# =============================================================================
# Required
# =============================================================================

# One of: dev | staging | prod
environment = "dev"
location    = "westus3"
# Short prefix for all resource names (no spaces)
resource_prefix = "nvpai"
instance        = "001"

# =============================================================================
# Resource Group
# =============================================================================

should_create_resource_group = true

# Uncomment to override the auto-generated name:
# resource_group_name = ""

# =============================================================================
# Networking
# =============================================================================

virtual_network_config = {
  address_space = "10.0.0.0/16"

  # Main subnet
  subnet_address_prefix = "10.0.16.0/20"

  # Additional subnet prefixes, commented out in this sample:
  # subnet_address_prefix_vm       = "10.0.32.0/20" # VM subnet (optional)
  # subnet_address_prefix_pe       = "10.0.48.0/20" # Private endpoint subnet
  # subnet_address_prefix_resolver = "10.0.4.0/28"  # DNS resolver subnet
}

subnet_address_prefixes_aks     = ["10.0.80.0/20"]
subnet_address_prefixes_aks_pod = ["10.0.96.0/20"]

should_create_vm_subnet   = false
should_enable_nat_gateway = true
# Set to ["1"] for a zone-pinned NAT gateway
nat_gateway_zones = []

# =============================================================================
# Privacy / Security
# =============================================================================

should_enable_private_endpoint = true
# Requires VPN — set false for Hybrid mode
should_enable_private_aks_cluster   = true
should_enable_public_network_access = true
should_enable_microsoft_defender    = true
# Set true for prod Key Vault
should_enable_purge_protection = false

# Flags named for the current user (Key Vault admin, storage blob)
should_add_current_user_key_vault_admin = true
should_add_current_user_storage_blob    = true

# =============================================================================
# AKS System Node Pool
# =============================================================================

system_node_pool_vm_size    = "Standard_D16ds_v5"
system_node_pool_node_count = 3

# Auto-scaling flag with its min/max node counts
should_enable_system_node_pool_auto_scaling = true
system_node_pool_min_count                  = 3
system_node_pool_max_count                  = 6

# Uncomment to set zones for the system pool:
# system_node_pool_zones = ["1", "2", "3"]

# =============================================================================
# GPU Node Pools
# =============================================================================

# Default layout: one Spot pool on A10 VMs
node_pools = {
  gpu = {
    vm_size                 = "Standard_NV36ads_A10_v5"
    subnet_address_prefixes = ["10.0.112.0/20"]
    zones                   = []

    # Spot priority; the label and second taint below carry the matching
    # scalesetpriority=spot marker.
    priority        = "Spot"
    eviction_policy = "Delete"

    node_labels = { "kubernetes.azure.com/scalesetpriority" = "spot" }
    node_taints = [
      "nvidia.com/gpu:NoSchedule",
      "kubernetes.azure.com/scalesetpriority=spot:NoSchedule",
    ]

    # min_count and max_count are equal in this sample
    should_enable_auto_scaling = true
    min_count                  = 4
    max_count                  = 4

    # Driver installation is left to the NVIDIA GPU Operator rather than AKS,
    # as Microsoft recommends; this lets the default GPU Operator Helm chart
    # be used unchanged.
    # https://learn.microsoft.com/en-us/azure/aks/nvidia-gpu-operator#get-the-credentials-for-your-cluster
    gpu_driver = "None"
  }
}

# Multi-pool example — RTX PRO 6000 + H100:
# node_pools = {
#   rtx-pro = {
#     vm_size                 = "Standard_NC128ds_xl_RTXPRO6000BSE_v6"
#     subnet_address_prefixes = ["10.0.112.0/20"]
#     node_taints             = ["nvidia.com/gpu:NoSchedule"]
#     node_labels             = { "nvidia.com/gpu.deploy.driver" = "false" }
#     gpu_driver              = "None"
#     priority                = "Regular"
#     eviction_policy         = "Delete"
#     should_enable_auto_scaling = true
#     min_count               = 4
#     max_count               = 4
#     zones = null
#   }
#   h100 = {
#     vm_size                 = "Standard_NC40ads_H100_v5"
#     subnet_address_prefixes = ["10.0.128.0/20"]
#     node_taints             = ["nvidia.com/gpu:NoSchedule"]
#     node_labels             = {}
#     gpu_driver              = "None"
#     priority                = "Regular"
#     eviction_policy         = "Delete"
#     should_enable_auto_scaling = true
#     min_count               = 4
#     max_count               = 4
#     zones = null
#   }
# }

# =============================================================================
# PostgreSQL
# =============================================================================

should_deploy_postgresql = true
postgresql_version       = "16"
postgresql_sku_name      = "GP_Standard_D2s_v3"
postgresql_storage_mb    = 32768
postgresql_zone          = null

# Uncomment to override; defaults to var.location
# postgresql_location = ""

# One entry per database; the map key is presumably the database name —
# confirm against the module's variable definition.
postgresql_databases = {
  osmo = {
    charset   = "utf8"
    collation = "en_US.utf8"
  }
}

# High availability is switched off in this sample and no standby zone is set.
postgresql_high_availability = {
  standby_availability_zone = null
  should_enable             = false
}

# =============================================================================
# Redis
# =============================================================================

should_deploy_redis                   = true
should_enable_redis_high_availability = false
redis_sku_name                        = "Balanced_B10"
redis_clustering_policy               = "EnterpriseCluster"

# =============================================================================
# Observability
# =============================================================================

# Deployment toggles for the observability components.
# NOTE(review): "ampls" and "dce" presumably stand for Azure Monitor Private
# Link Scope and Data Collection Endpoint — confirm against the module variables.
should_deploy_monitor_workspace = true
should_deploy_grafana           = true
should_deploy_dce               = true
should_deploy_ampls             = true

# =============================================================================
# AzureML
# =============================================================================

should_include_aks_dns_zone       = true
should_enable_aml_diagnostic_logs = false
should_deploy_aml_compute         = false

# Compute settings; should_deploy_aml_compute above is false in this sample.
aml_compute_config = {
  vm_size  = "Standard_NC4as_T4_v3"
  priority = "LowPriority"

  # Instance-count bounds and idle time (seconds, per the key name)
  min_instances  = 0
  max_instances  = 1
  idle_time_secs = 300
}

# =============================================================================
# Storage Lifecycle
# =============================================================================

should_create_data_lake_storage = false

# Raw bags
should_enable_raw_bags_lifecycle_policy = true
raw_bags_retention_days                 = 30

# Converted datasets
should_enable_converted_datasets_lifecycle_policy = true
converted_datasets_cool_tier_days                 = 90

# Reports
should_enable_reports_lifecycle_policy = true
reports_cool_tier_days                 = 30
reports_archive_tier_days              = 180

# =============================================================================
# Osmo
# =============================================================================

osmo_config = {
  # Identity flags
  should_enable_identity   = true
  should_federate_identity = true

  # Namespace names
  control_plane_namespace = "osmo-control-plane"
  operator_namespace      = "osmo-operator"
  workflows_namespace     = "osmo-workflows"
}

# =============================================================================
# Tags
# =============================================================================

tags = {
  managed-by = "terraform"
  project    = "nvpai"
}
