Internal change (diffbased). #11269

Open · wants to merge 1 commit into base: master
test/kubernetes/benchmarks/stablediffusion.go (4 changes: 2 additions & 2 deletions)

@@ -34,7 +34,7 @@ import (

const (
// Container image for Stable Diffusion XL.
- stableDiffusionImage = k8s.ImageRepoPrefix + "gpu/stable-diffusion-xl"
+ stableDiffusionImage = k8s.ImageRepoPrefix + "gpu/stable-diffusion-xl:latest"
)

// kubernetesPodRunner implements `stablediffusion.ContainerRunner`.
@@ -171,7 +171,7 @@ func RunStableDiffusionXL(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kuber
t.Skipf("refiner failed in previous benchmark; skipping benchmark with refiner")
}
}
- testCtx, testCancel := context.WithTimeout(ctx, 15*time.Minute)
+ testCtx, testCancel := context.WithTimeout(ctx, 50*time.Minute)
defer testCancel()
prompt := &stablediffusion.XLPrompt{
Query: test.query,
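The two changes in this file pin the Stable Diffusion XL image to an explicit `:latest` tag and raise the per-test deadline from 15 to 50 minutes. The deadline uses the standard `context.WithTimeout` pattern; here is a minimal sketch of that pattern, where `runBenchmark` is a hypothetical stand-in for the real Stable Diffusion run, not code from this PR:

package main

import (
	"context"
	"fmt"
	"time"
)

// runBenchmark is a hypothetical stand-in for the Stable Diffusion XL run.
func runBenchmark(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second): // pretend the workload takes 2 seconds
		return nil
	case <-ctx.Done():
		return ctx.Err() // the deadline elapsed before the workload finished
	}
}

func main() {
	// Same shape as RunStableDiffusionXL: bound the run so a hung pod
	// cannot stall the whole benchmark suite.
	testCtx, testCancel := context.WithTimeout(context.Background(), 50*time.Minute)
	defer testCancel()
	if err := runBenchmark(testCtx); err != nil {
		fmt.Printf("benchmark aborted: %v\n", err)
	}
}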
test/kubernetes/testcluster/objects.go (100 changes: 15 additions & 85 deletions)

@@ -19,7 +19,6 @@ import (
"errors"
"fmt"
"reflect"
"strconv"

cspb "google.golang.org/genproto/googleapis/container/v1"
"google.golang.org/protobuf/proto"
@@ -181,16 +180,14 @@ type RuntimeType string

// List of known runtime types.
const (
- RuntimeTypeGVisor            = RuntimeType("gvisor")
- RuntimeTypeUnsandboxed       = RuntimeType("runc")
- RuntimeTypeGVisorNvidia      = RuntimeType("gvisor-nvidia")
- RuntimeTypeGVisorTPU         = RuntimeType("gvisor-tpu")
- RuntimeTypeUnsandboxedNvidia = RuntimeType("runc-nvidia")
- RuntimeTypeUnsandboxedTPU    = RuntimeType("runc-tpu")
+ RuntimeTypeGVisor         = RuntimeType("gvisor")
+ RuntimeTypeUnsandboxed    = RuntimeType("runc")
+ RuntimeTypeGVisorTPU      = RuntimeType("gvisor-tpu")
+ RuntimeTypeUnsandboxedTPU = RuntimeType("runc-tpu")
)

// ApplyNodepool modifies the nodepool to configure it to use the runtime.
- func (t RuntimeType) ApplyNodepool(nodepool *cspb.NodePool, accelType AcceleratorType, accelShape string, accelRes string) {
+ func (t RuntimeType) ApplyNodepool(nodepool *cspb.NodePool) {
if nodepool.GetConfig().GetLabels() == nil {
nodepool.GetConfig().Labels = map[string]string{}
}
@@ -204,81 +201,27 @@ func (t RuntimeType) ApplyNodepool(nodepool *cspb.NodePool, accelType Accelerato
case RuntimeTypeUnsandboxed:
nodepool.GetConfig().Labels[NodepoolRuntimeKey] = string(RuntimeTypeUnsandboxed)
// Do nothing.
- case RuntimeTypeGVisorNvidia:
- nodepool.Config.SandboxConfig = &cspb.SandboxConfig{
- Type: cspb.SandboxConfig_GVISOR,
- }
- accelCount, err := strconv.Atoi(accelShape)
- if err != nil {
- panic(fmt.Sprintf("GPU count must be a valid number, got %v", accelShape))
- }
- if accelCount == 0 {
- panic("GPU count needs to be >=1")
- }
- nodepool.Config.MachineType = DefaultNvidiaMachineType
- nodepool.Config.Accelerators = []*cspb.AcceleratorConfig{
- {
- AcceleratorType: string(accelType),
- AcceleratorCount: int64(accelCount),
- },
- }
- nodepool.Config.Labels[NodepoolRuntimeKey] = string(RuntimeTypeGVisorNvidia)
- nodepool.Config.Labels[NodepoolNumAcceleratorsKey] = strconv.Itoa(accelCount)
case RuntimeTypeGVisorTPU:
- nodepool.Config.MachineType = TPUAcceleratorMachineTypeMap[accelType]
- if err := setNodePlacementPolicyCompact(nodepool, accelShape); err != nil {
- panic(fmt.Sprintf("failed to set node placement policy: %v", err))
- }
nodepool.Config.Labels[gvisorNodepoolKey] = gvisorRuntimeClass
nodepool.Config.Labels[NodepoolRuntimeKey] = string(RuntimeTypeGVisorTPU)
- nodepool.Config.Labels[NodepoolTPUTopologyKey] = accelShape
nodepool.Config.Taints = append(nodepool.Config.Taints, &cspb.NodeTaint{
Key: gvisorNodepoolKey,
Value: gvisorRuntimeClass,
Effect: cspb.NodeTaint_NO_SCHEDULE,
})
- case RuntimeTypeUnsandboxedNvidia:
- accelCount, err := strconv.Atoi(accelShape)
- if err != nil {
- panic(fmt.Sprintf("GPU count must be a valid number, got %v", accelShape))
- }
- if accelCount == 0 {
- panic("GPU count needs to be >=1")
- }
- nodepool.Config.MachineType = DefaultNvidiaMachineType
- nodepool.Config.Accelerators = []*cspb.AcceleratorConfig{
- {
- AcceleratorType: string(accelType),
- AcceleratorCount: int64(accelCount),
- },
- }
- nodepool.Config.Labels[NodepoolRuntimeKey] = string(RuntimeTypeUnsandboxedNvidia)
- nodepool.Config.Labels[NodepoolNumAcceleratorsKey] = strconv.Itoa(accelCount)
case RuntimeTypeUnsandboxedTPU:
- nodepool.Config.MachineType = TPUAcceleratorMachineTypeMap[accelType]
- if err := setNodePlacementPolicyCompact(nodepool, accelShape); err != nil {
- panic(fmt.Sprintf("failed to set node placement policy: %v", err))
- }
nodepool.Config.Labels[NodepoolRuntimeKey] = string(RuntimeTypeUnsandboxedTPU)
- nodepool.Config.Labels[NodepoolTPUTopologyKey] = accelShape
default:
panic(fmt.Sprintf("unsupported runtime %q", t))
}
- if accelRes != "" {
- nodepool.Config.ReservationAffinity = &cspb.ReservationAffinity{
- ConsumeReservationType: cspb.ReservationAffinity_SPECIFIC_RESERVATION,
- Key: "compute.googleapis.com/reservation-name",
- Values: []string{accelRes},
- }
- }
}

- // setNodePlacementPolicyCompact sets the node placement policy to COMPACT
+ // SetNodePlacementPolicyCompact sets the node placement policy to COMPACT
// and with the given TPU topology.
// This is done by reflection because the NodePool_PlacementPolicy proto
// message isn't available in the latest exported version of the genproto API.
// This is only used for TPU nodepools so not critical for most benchmarks.
- func setNodePlacementPolicyCompact(nodepool *cspb.NodePool, tpuTopology string) error {
+ func SetNodePlacementPolicyCompact(nodepool *cspb.NodePool, tpuTopology string) error {
placementPolicyField := reflect.ValueOf(nodepool).Elem().FieldByName("PlacementPolicy")
if !placementPolicyField.IsValid() {
return errors.New("nodepool does not have a PlacementPolicy field")
@@ -305,7 +248,15 @@ func (t RuntimeType) ApplyPodSpec(podSpec *v13.PodSpec) {
case RuntimeTypeGVisor:
podSpec.RuntimeClassName = proto.String(gvisorRuntimeClass)
podSpec.NodeSelector[NodepoolRuntimeKey] = string(RuntimeTypeGVisor)
+ podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
+ Key: "nvidia.com/gpu",
+ Operator: v13.TolerationOpExists,
+ })
case RuntimeTypeUnsandboxed:
+ podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
+ Key: "nvidia.com/gpu",
+ Operator: v13.TolerationOpExists,
+ })
// Allow the pod to schedule on gVisor nodes as well.
// This enables the use of `--test-nodepool-runtime=runc` to run
// unsandboxed benchmarks on gVisor test clusters.
@@ -315,34 +266,13 @@ func (t RuntimeType) ApplyPodSpec(podSpec *v13.PodSpec) {
Operator: v13.TolerationOpEqual,
Value: gvisorRuntimeClass,
})
- case RuntimeTypeGVisorNvidia:
- podSpec.RuntimeClassName = proto.String(gvisorRuntimeClass)
- podSpec.NodeSelector[NodepoolRuntimeKey] = string(RuntimeTypeGVisorNvidia)
- podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
- Key: "nvidia.com/gpu",
- Operator: v13.TolerationOpExists,
- })
case RuntimeTypeGVisorTPU:
podSpec.RuntimeClassName = proto.String(gvisorRuntimeClass)
podSpec.NodeSelector[NodepoolRuntimeKey] = string(RuntimeTypeGVisorTPU)
podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
Key: "google.com/tpu",
Operator: v13.TolerationOpExists,
})
- case RuntimeTypeUnsandboxedNvidia:
- podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
- Key: "nvidia.com/gpu",
- Operator: v13.TolerationOpExists,
- })
- // Allow the pod to schedule on gVisor nodes as well.
- // This enables the use of `--test-nodepool-runtime=runc-nvidia` to run
- // unsandboxed benchmarks on gVisor test clusters.
- podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
- Effect: v13.TaintEffectNoSchedule,
- Key: gvisorNodepoolKey,
- Operator: v13.TolerationOpEqual,
- Value: gvisorRuntimeClass,
- })
case RuntimeTypeUnsandboxedTPU:
podSpec.Tolerations = append(podSpec.Tolerations, v13.Toleration{
Key: "google.com/tpu",
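Net effect of this file: the Nvidia-specific runtime variants and the accelerator plumbing (accelType, accelShape, accelRes) are dropped from ApplyNodepool, the nvidia.com/gpu tolerations move into the plain gvisor/runc cases, and the TPU placement helper is exported for callers. A rough sketch of what a call site might look like under the new API; the nodepool setup, the "2x2x1" topology string, and the import path are illustrative assumptions, not taken from this PR:

package main

import (
	"log"

	cspb "google.golang.org/genproto/googleapis/container/v1"

	"gvisor.dev/gvisor/test/kubernetes/testcluster" // assumed import path
)

func main() {
	// Minimal nodepool; real callers start from a fuller base config.
	nodepool := &cspb.NodePool{Config: &cspb.NodeConfig{}}

	// ApplyNodepool no longer takes accelerator arguments.
	testcluster.RuntimeTypeGVisorTPU.ApplyNodepool(nodepool)

	// TPU topology is now the caller's job, via the exported helper.
	// "2x2x1" is an example topology, not a value from this PR.
	if err := testcluster.SetNodePlacementPolicyCompact(nodepool, "2x2x1"); err != nil {
		log.Fatalf("setting compact placement policy: %v", err)
	}
}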
test/kubernetes/testcluster/testcluster.go (2 changes: 1 addition & 1 deletion)

@@ -363,7 +363,7 @@ func (t *TestCluster) HasGVisorTestRuntime(ctx context.Context) (bool, error) {
if err != nil {
return false, err
}
- return testNodePool.runtime == RuntimeTypeGVisor || testNodePool.runtime == RuntimeTypeGVisorNvidia, nil
+ return testNodePool.runtime == RuntimeTypeGVisor || testNodePool.runtime == RuntimeTypeGVisorTPU, nil
}

// CreatePod is a helper to create a pod.
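With gvisor-nvidia removed, the predicate now counts gvisor-tpu nodepools as gVisor runtimes. A hedged sketch of how a test might gate on it; the helper function and its wiring are assumptions for illustration, not part of this PR:

package mytests // hypothetical test package

import (
	"context"
	"testing"

	"gvisor.dev/gvisor/test/kubernetes/testcluster" // assumed import path
)

// skipUnlessGVisor skips tests on clusters whose test nodepool does not run
// a gVisor runtime ("gvisor" or, after this change, "gvisor-tpu").
func skipUnlessGVisor(ctx context.Context, t *testing.T, cluster *testcluster.TestCluster) {
	hasGVisor, err := cluster.HasGVisorTestRuntime(ctx)
	if err != nil {
		t.Fatalf("cannot determine test runtime: %v", err)
	}
	if !hasGVisor {
		t.Skip("test requires a gVisor nodepool")
	}
}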