From d040320c2b84ee7ef812118c9e5a5a5c6225452f Mon Sep 17 00:00:00 2001 From: Carlos Panato Date: Wed, 24 Feb 2021 10:15:19 +0100 Subject: [PATCH] aws-janitor-boskos: add clean time and process time metrics to collect the response time when cleaning and for the entire process when doing the janitor process --- cmd/aws-janitor-boskos/main.go | 48 +++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/cmd/aws-janitor-boskos/main.go b/cmd/aws-janitor-boskos/main.go index c3d6c226..7e574943 100644 --- a/cmd/aws-janitor-boskos/main.go +++ b/cmd/aws-janitor-boskos/main.go @@ -25,8 +25,15 @@ import ( "github.com/aws/aws-sdk-go/aws/credentials" "github.com/aws/aws-sdk-go/aws/session" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" + + "k8s.io/test-infra/pkg/flagutil" + "k8s.io/test-infra/prow/config" + prowflagutil "k8s.io/test-infra/prow/flagutil" "k8s.io/test-infra/prow/logrusutil" + prowmetrics "k8s.io/test-infra/prow/metrics" + "sigs.k8s.io/boskos/aws-janitor/account" "sigs.k8s.io/boskos/aws-janitor/regions" "sigs.k8s.io/boskos/aws-janitor/resources" @@ -52,6 +59,19 @@ var ( includeTags common.CommaSeparatedStrings excludeTM resources.TagMatcher includeTM resources.TagMatcher + + instrumentationOptions prowflagutil.InstrumentationOptions + + cleaningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "aws_janitor_boskos_cleaning_time_seconds", + ConstLabels: prometheus.Labels{}, + Buckets: prometheus.ExponentialBuckets(1, 1.4, 30), + }, []string{"resource_type", "status"}) + + sweepsGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "aws_janitor_boskos_sweeps", + ConstLabels: prometheus.Labels{}, + }, []string{"resource_type"}) ) const ( @@ -64,10 +84,16 @@ func init() { "Resources with any of these tags will not be managed by the janitor. Given as a comma-separated list of tags in key[=value] format; excluding the value will match any tag with that key. Keys can be repeated.") flag.Var(&includeTags, "include-tags", "Resources must include all of these tags in order to be managed by the janitor. Given as a comma-separated list of tags in key[=value] format; excluding the value will match any tag with that key. Keys can be repeated.") + + prometheus.MustRegister(cleaningTimeHistogram) + prometheus.MustRegister(sweepsGauge) } func main() { logrusutil.ComponentInit() + for _, o := range []flagutil.OptionGroup{&instrumentationOptions} { + o.AddFlags(flag.CommandLine) + } flag.Parse() level, err := logrus.ParseLevel(*logLevel) @@ -76,6 +102,13 @@ func main() { } logrus.SetLevel(level) + for _, o := range []flagutil.OptionGroup{&instrumentationOptions} { + if err := o.Validate(false); err != nil { + logrus.Fatalf("Invalid options: %v", err) + } + } + prowmetrics.ExposeMetrics("aws-janitor-boskos", config.PushGateway{}, instrumentationOptions.MetricsPort) + if d, err := time.ParseDuration(*sweepSleep); err != nil { sweepSleepDuration = time.Second * 30 } else { @@ -115,13 +148,17 @@ func run(boskos *client.Client) error { } else if err != nil { return errors.Wrap(err, "Couldn't retrieve resources from Boskos") } else { + startProcess := time.Now() logrus.WithField("name", res.Name).Info("Acquired resource") if err := cleanResource(res); err != nil { + collectMetric(startProcess, res.Name, "failed-clean") return errors.Wrapf(err, "Couldn't clean resource %q", res.Name) } if err := boskos.ReleaseOne(res.Name, common.Free); err != nil { + collectMetric(startProcess, res.Name, "failed-release") return errors.Wrapf(err, "Failed to release resoures %q", res.Name) } + collectMetric(startProcess, res.Name, "released") logrus.WithField("name", res.Name).Info("Released resource") } } @@ -165,8 +202,13 @@ func cleanResource(res *common.Resource) error { } } - duration := time.Since(start) - - logrus.WithFields(logrus.Fields{"name": res.Name, "duration": duration.Seconds()}).Info("Finished cleaning") + sweepsGauge.WithLabelValues(res.Name).Set(float64(*sweepCount)) + collectMetric(start, res.Name, "clean") + logrus.WithFields(logrus.Fields{"name": res.Name, "duration": time.Since(start).Seconds(), "sweeps": *sweepCount}).Info("Finished cleaning") return nil } + +func collectMetric(startTime time.Time, rType, status string) { + duration := time.Since(startTime).Seconds() + cleaningTimeHistogram.WithLabelValues(rType, status).Observe(duration) +}