From 6413d86138cf1938b38182576d11774fe9f23594 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 31 Oct 2024 15:33:35 +0100 Subject: [PATCH 1/3] Honor fail-on-init-error when no resources are found As implemented GFD will not fail if no resources are detected -- even if fail-on-init-error is set. This change ensures that fail-on-init-error is honored if no resources are detected. Signed-off-by: Evan Lezar --- cmd/gpu-feature-discovery/main.go | 5 ++++- internal/resource/factory.go | 24 +++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/cmd/gpu-feature-discovery/main.go b/cmd/gpu-feature-discovery/main.go index 2f259e31e..c5abb0773 100644 --- a/cmd/gpu-feature-discovery/main.go +++ b/cmd/gpu-feature-discovery/main.go @@ -155,7 +155,10 @@ func start(c *cli.Context, cfg *Config) error { } klog.Infof("\nRunning with config:\n%v", string(configJSON)) - manager := resource.NewManager(config) + manager, err := resource.NewManager(config) + if err != nil { + return err + } vgpul := vgpu.NewVGPULib(vgpu.NewNvidiaPCILib()) var clientSets flags.ClientSets diff --git a/internal/resource/factory.go b/internal/resource/factory.go index a3f96577c..ffa2d3a43 100644 --- a/internal/resource/factory.go +++ b/internal/resource/factory.go @@ -17,6 +17,8 @@ package resource import ( + "fmt" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" "k8s.io/klog/v2" @@ -24,8 +26,17 @@ import ( ) // NewManager is a factory method that creates a resource Manager based on the specified config. -func NewManager(config *spec.Config) Manager { - return WithConfig(getManager(), config) +func NewManager(config *spec.Config) (Manager, error) { + manager, err := getManager() + if err != nil { + if *config.Flags.FailOnInitError { + return nil, err + } + klog.ErrorS(err, "using empty manager") + return NewNullManager(), nil + } + + return WithConfig(manager, config), nil } // WithConfig modifies a manager depending on the specified config. 
@@ -39,7 +50,7 @@ func WithConfig(manager Manager, config *spec.Config) Manager { } // getManager returns the resource manager depending on the system configuration. -func getManager() Manager { +func getManager() (Manager, error) { // logWithReason logs the output of the has* / is* checks from the info.Interface logWithReason := func(f func() (bool, string), tag string) bool { is, reason := f() @@ -63,12 +74,11 @@ func getManager() Manager { if hasNVML { klog.Info("Using NVML manager") - return NewNVMLManager() + return NewNVMLManager(), nil } else if isTegra { klog.Info("Using CUDA manager") - return NewCudaManager() + return NewCudaManager(), nil } - klog.Warning("No valid resources detected; using empty manager.") - return NewNullManager() + return nil, fmt.Errorf("no valid resource detected") } From b4597653cb6024a612e76cb20546446d8562d93c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 31 Oct 2024 15:41:26 +0100 Subject: [PATCH 2/3] [no-relnote] Ignore integer overflow linter errors Signed-off-by: Evan Lezar --- .golangci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.golangci.yml b/.golangci.yml index 22d53eda1..84b5ecfee 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -22,6 +22,9 @@ linters-settings: local-prefixes: github.com/NVIDIA/k8s-device-plugin issues: + exclude: + # Exclude all integer overflow errors on this branch. + - "G115: integer overflow conversion" exclude-rules: # We use math/rand instead of crypto/rand for unique names in e2e tests. 
- path: tests/e2e/ From d986cdfa5ad3e77713375b868f93dd2e6ef15f74 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 31 Oct 2024 16:01:35 +0100 Subject: [PATCH 3/3] [no-relnote] Use Fail for non-formatted string Signed-off-by: Evan Lezar --- tests/e2e/framework/framework.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/framework/framework.go b/tests/e2e/framework/framework.go index fbf017aff..ab26fbc9d 100644 --- a/tests/e2e/framework/framework.go +++ b/tests/e2e/framework/framework.go @@ -190,7 +190,7 @@ func (f *Framework) AfterEach(ctx context.Context) { for namespaceKey, namespaceErr := range nsDeletionErrors { messages = append(messages, fmt.Sprintf("Couldn't delete ns: %q: %s (%#v)", namespaceKey, namespaceErr, namespaceErr)) } - e2elog.Failf(strings.Join(messages, ",")) + e2elog.Fail(strings.Join(messages, ",")) } }()