From 525d469189282000a77ff42775488954f3bce927 Mon Sep 17 00:00:00 2001 From: hriships Date: Tue, 17 Jul 2018 14:06:40 +0530 Subject: [PATCH] Fixes #258 Reset the pods if Jenkins fails to wake up within certain duration work is in progress --- internal/api/api.go | 109 +++++++++++++++++- internal/openshift/client/openshift_client.go | 85 ++++++++++++-- internal/testutils/mock/mock_idler_api.go | 4 +- .../testutils/mock/mock_openshift_client.go | 15 ++- 4 files changed, 195 insertions(+), 18 deletions(-) diff --git a/internal/api/api.go b/internal/api/api.go index c248e56..d9dd90f 100644 --- a/internal/api/api.go +++ b/internal/api/api.go @@ -17,12 +17,15 @@ import ( "github.com/fabric8-services/fabric8-jenkins-idler/metric" "github.com/julienschmidt/httprouter" log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" ) const ( // OpenShiftAPIParam is the parameter name under which the OpenShift cluster API URL is passed using // Idle, UnIdle and IsIdle. OpenShiftAPIParam = "openshift_api_url" + PodResetRetryLimit = 5 + PodRetryInterval = 60 ) var ( @@ -57,7 +60,7 @@ type IdlerAPI interface { // ClusterDNSView writes a JSON representation of the current cluster state to the response writer. ClusterDNSView(w http.ResponseWriter, r *http.Request, ps httprouter.Params) - // Reset deletes a pod and starts a new one + // ResetNSPods deletes a pod and starts a new one Reset(w http.ResponseWriter, r *http.Request, ps httprouter.Params) } @@ -151,10 +154,8 @@ func (api *idler) UnIdle(w http.ResponseWriter, r *http.Request, ps httprouter.P return } - // unidle now for _, service := range pidler.JenkinsServices { startTime := time.Now() - err = api.openShiftClient.UnIdle(openshiftURL, openshiftToken, ns, service) elapsedTime := time.Since(startTime).Seconds() if err != nil { @@ -162,7 +163,8 @@ func (api *idler) UnIdle(w http.ResponseWriter, r *http.Request, ps httprouter.P respondWithError(w, http.StatusInternalServerError, err) return } - + // tries best to undle the pod + go resetServicePods(api.openShiftClient, openshiftURL, openshiftToken, ns, service, 0); Recorder.RecordReqDuration(service, "UnIdle", http.StatusOK, elapsedTime) } @@ -221,7 +223,7 @@ func (api *idler) Info(w http.ResponseWriter, r *http.Request, ps httprouter.Par func (api *idler) Reset(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { - logger := log.WithFields(log.Fields{"component": "api", "function": "Reset"}) + logger := log.WithFields(log.Fields{"component": "api", "function": "ResetNSPods"}) openShiftAPI, openShiftBearerToken, err := api.getURLAndToken(r) if err != nil { @@ -231,7 +233,7 @@ func (api *idler) Reset(w http.ResponseWriter, r *http.Request, ps httprouter.Pa return } - err = api.openShiftClient.Reset(openShiftAPI, openShiftBearerToken, ps.ByName("namespace")) + err = api.openShiftClient.ResetNSPods(openShiftAPI, openShiftBearerToken, ps.ByName("namespace")) if err != nil { logger.Error(err) w.WriteHeader(http.StatusInternalServerError) @@ -340,3 +342,98 @@ func writeResponse(w http.ResponseWriter, status int, response any) { w.WriteHeader(status) json.NewEncoder(w).Encode(response) } + +/*could be better to move it to some other file*/ +func resetServicePods(client client.OpenShiftClient, url string, token string, ns string, service string, retry int) bool { + + initiating := false + resetOccurred := false + + if retry >= PodResetRetryLimit { + log.Errorf("failed to reset the %s service in namespace %s", service, ns) + return false + } + + time.Sleep(PodRetryInterval * time.Second) + + states, err := client.PodState(url, token, ns, service); + if err != nil { + log.Error(err) + return false + } + + for n, s := range states { + if podInitiating(s) { + initiating = true; + break; + } + + if (failedToInitiatePod(s)) { + resetOccurred = true + error := client.ResetPod(url, token, ns, n) + if error != nil { + log.Warningf("failed to delete pod %s", n) + } + break; + } + } + + if initiating { + return resetServicePods(client, url, token, ns, service, retry) + } else if resetOccurred { + retry++ + return resetServicePods(client, url, token, ns, service, retry) + } else { + return true + } +} + +func podInitiating(status v1.PodStatus) bool { + return status.Phase == "Pending" && initiatingContainers(status) +} + +func initiatingContainers(status v1.PodStatus) bool { + + for _, cs := range status.ContainerStatuses { + if cs.State.Waiting != nil || cs.State.Running == nil { + return true + } + } + + for _, ics := range status.InitContainerStatuses { + if ics.State.Waiting != nil || ics.State.Running == nil { + return true + } + } + + return false +} + +func failedToInitiatePod(status v1.PodStatus) bool { + return status.Phase == "Failed" || status.Phase == "Pending" && containersFailed(status) +} + +func containersFailed(status v1.PodStatus) bool { + + for _, cs := range status.ContainerStatuses { + currentState := cs.State + lastState := cs.LastTerminationState + + if currentState.Terminated != nil && currentState.Terminated.Reason == "Error" && cs.RestartCount >= 1 || + lastState.Terminated != nil && lastState.Terminated.Reason == "Error" && cs.RestartCount >= 1 { + return true + } + } + + for _, ics := range status.InitContainerStatuses { + currentState := ics.State + lastState := ics.LastTerminationState + + if currentState.Terminated != nil && currentState.Terminated.Reason == "Error" && ics.RestartCount >= 1 || + lastState.Terminated != nil && lastState.Terminated.Reason == "Error" && ics.RestartCount >= 1 { + return true + } + } + + return false +} \ No newline at end of file diff --git a/internal/openshift/client/openshift_client.go b/internal/openshift/client/openshift_client.go index e0a57be..9069bdb 100644 --- a/internal/openshift/client/openshift_client.go +++ b/internal/openshift/client/openshift_client.go @@ -28,7 +28,9 @@ type OpenShiftClient interface { WhoAmI(apiURL string, bearerToken string) (string, error) WatchBuilds(apiURL string, bearerToken string, buildType string, callback func(model.Object) error) error WatchDeploymentConfigs(apiURL string, bearerToken string, namespaceSuffix string, callback func(model.DCObject) error) error - Reset(apiURL string, bearerToken string, namespace string) error + ResetPod(apiURL string, bearerToken string, namespace string, podName string) error + ResetNSPods(apiURL string, bearerToken string, namespace string) error + PodState(apiURL string, bearerToken string, namespace string, selector string) (map[string]v1.PodStatus, error) } type user struct { @@ -145,10 +147,10 @@ func (o openShift) Idle(apiURL string, bearerToken string, namespace string, ser return } -// Reset deletes a pod and start a new one -func (o *openShift) Reset(apiURL string, bearerToken string, namespace string) error { - logger.Infof("resetting pods in " + namespace) +// ResetPod deletes a pod and start a new one +func (o *openShift) ResetPod(apiURL string, bearerToken string, namespace string, podName string) error { +/* req, err := o.reqAPI(apiURL, bearerToken, "GET", namespace, "pods", nil) if err != nil { return err @@ -167,24 +169,60 @@ func (o *openShift) Reset(apiURL string, bearerToken string, namespace string) e } for _, element := range podList.Items { - - podName := element.GetName() +*/ if strings.Contains(podName, "deploy") { - continue + return nil; } - log.Infof("Resetting the pod %q", podName) + log.Infof("resetting pod %q in %q", podName, namespace) req, err := o.reqAPI(apiURL, bearerToken, "DELETE", namespace, "pods/"+podName, nil) if err != nil { return err } - resp, err = o.do(req) + resp, err := o.do(req) if err != nil { return err } defer bodyClose(resp) + + + return nil +} + +// ResetNSPods deletes a pod and start a new one +func (o *openShift) ResetNSPods(apiURL string, bearerToken string, namespace string) error { + + req, err := o.reqAPI(apiURL, bearerToken, "GET", namespace, "pods", nil) + if err != nil { + return err + } + + resp, err := o.do(req) + if err != nil { + return err + } + + defer bodyClose(resp) + + podList := &v1.PodList{} + err = json.NewDecoder(resp.Body).Decode(podList) + if err != nil { + return err + } + + for _, element := range podList.Items { + podName := element.Name + if strings.Contains(podName, "deploy") { + return nil; + } + + err = o.ResetPod(apiURL, bearerToken, namespace, podName) + if err != nil { + return err + } } + return nil } @@ -515,6 +553,35 @@ func (o *openShift) patch(req *http.Request) (b []byte, err error) { return } +func (o *openShift) PodState(apiURL string, bearerToken string, namespace string, selector string) (map[string]v1.PodStatus, error) { + status := make(map[string]v1.PodStatus) + + selector = fmt.Sprintf("%s=%s", "deploymentconfig",selector) + req, err := o.reqOAPI(apiURL, bearerToken, "GET", namespace, "pods?labelSelector="+selector, nil) + if err != nil { + return status, err + } + + resp, err := o.do(req) + if err != nil { + return status, err + } + + defer bodyClose(resp) + + podList := &v1.PodList{} + err = json.NewDecoder(resp.Body).Decode(podList) + if err != nil { + return status, err + } + + for _, pod := range podList.Items { + status[pod.Name] = pod.Status + } + + return status, nil; +} + func bodyClose(resp *http.Response) { io.Copy(ioutil.Discard, resp.Body) resp.Body.Close() diff --git a/internal/testutils/mock/mock_idler_api.go b/internal/testutils/mock/mock_idler_api.go index 4ed0ecf..b9fc992 100644 --- a/internal/testutils/mock/mock_idler_api.go +++ b/internal/testutils/mock/mock_idler_api.go @@ -47,10 +47,10 @@ func (i *IdlerAPI) Status(w http.ResponseWriter, r *http.Request, ps httprouter. w.WriteHeader(http.StatusOK) } -// Reset mock resets pods +// ResetNSPods mock resets pods func (i *IdlerAPI) Reset(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { w.WriteHeader(http.StatusOK) - w.Write([]byte("Reset")) + w.Write([]byte("ResetNSPods")) } // ClusterDNSView writes a JSON representation of the current cluster state to the response writer. diff --git a/internal/testutils/mock/mock_openshift_client.go b/internal/testutils/mock/mock_openshift_client.go index 935a8ad..68b3d72 100644 --- a/internal/testutils/mock/mock_openshift_client.go +++ b/internal/testutils/mock/mock_openshift_client.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/fabric8-services/fabric8-jenkins-idler/internal/model" + "k8s.io/api/core/v1" ) // OpenShiftClient is a client for OpenShift API @@ -43,7 +44,7 @@ func (c *OpenShiftClient) State(apiURL string, bearerToken string, namespace str return c.IdleState, nil } -// Reset deletes a pod and start a new one +// ResetNSPods deletes a pod and start a new one func (c *OpenShiftClient) Reset(apiURL string, bearerToken string, namespace string) error { if c.IdleError != "" { return fmt.Errorf(c.IdleError) @@ -77,6 +78,18 @@ func (c *OpenShiftClient) WatchDeploymentConfigs(apiURL string, bearerToken stri return nil } +func (o *OpenShiftClient) ResetPod(apiURL string, bearerToken string, namespace string, podName string) error { + return nil +} + +func (c *OpenShiftClient) ResetNSPods(apiURL string, bearerToken string, namespace string) error { + return nil +} + +func (c *OpenShiftClient) PodState(apiURL string, bearerToken string, namespace string, selector string) (map[string] v1.PodStatus, error) { + return nil, nil +} + // ResetCounts resets calls made to the idler(idle/unidle) to 0. func (c *OpenShiftClient) ResetCounts() { c.UnIdleCallCount = 0