diff --git a/pkg/disruption/backend/sampler/response_checker.go b/pkg/disruption/backend/sampler/response_checker.go index 469222486ccf..c2ec9ec27c1b 100644 --- a/pkg/disruption/backend/sampler/response_checker.go +++ b/pkg/disruption/backend/sampler/response_checker.go @@ -27,21 +27,29 @@ func (c checker) CheckResponse(rr backend.RequestResponse) error { } if _, retry := backend.IsRetryAfter(resp); retry { - if rr.ShutdownInProgress() { - // We will deem a Retry-After response as a failure except while - // the server is shutting down and is sending 429 to request(s) - // that are arriving late. (this points to a faulty load balancer) + if !rr.ShutdownInProgress() { + // For now, any retry-after not occurring during a graceful shutdown is + // deemed as error since we don't expect the server to be sending + // retry-after in CI. + return &KnownError{ + category: "APIServerAvailability", + err: fmt.Errorf("server overwhelmed: %v body: %v", resp.Status, string(rr.ResponseBody)), + } + } + + if rr.ConnectionReused() != "true" { + // The server should reach the HTTP 429 shutdown stage before it has been + // marked unhealthy by load balancers. return &KnownError{ category: "FaultyLoadBalancer", err: fmt.Errorf("very late request: %v body: %v", resp.Status, string(rr.ResponseBody)), } } - // For now, any other retry-after is deemed as error since we don't - // expect the server to be sending retry-after in CI. - return &KnownError{ - category: "APIServerAvailability", - err: fmt.Errorf("server overwhelmed: %v body: %v", resp.Status, string(rr.ResponseBody)), - } + + // Requests on reused connections may be initiated until the moment the connection + // is closed. If that happens in the final moments of shutdown, an HTTP 429 is + // expected. + return nil } if resp.StatusCode < 200 || resp.StatusCode > 399 { diff --git a/pkg/monitor/backenddisruption/disruption_backend_sampler.go b/pkg/monitor/backenddisruption/disruption_backend_sampler.go index f66279a40af9..e78be25357de 100644 --- a/pkg/monitor/backenddisruption/disruption_backend_sampler.go +++ b/pkg/monitor/backenddisruption/disruption_backend_sampler.go @@ -409,6 +409,14 @@ func (b *BackendSampler) CheckConnection(ctx context.Context) (string, error) { sampleErr = bodyReadErr case b.expectedStatusCode > 0 && b.expectedStatusCode == resp.StatusCode: // don't fail + case b.GetConnectionType() == monitorapi.ReusedConnectionType && resp.StatusCode == 429 && bytes.Contains(body, []byte(`The apiserver is shutting down, please try again later.`)): + // This is the response sent by API servers in the final stage of graceful shutdown + // when --shutdown-send-retry-after is enabled. The REST client in k8s.io/client-go + // recognizes it and automatically retries without exposing an error to the client + // program. It is designed to shunt clients to other API servers that are not + // shutting down. Only requests on reused connections should see this, because we + // expect terminating API servers to be disabled as load balancer backends before + // this stage of the shutdown process. case resp.StatusCode < 200 || resp.StatusCode > 399: sampleErr = fmt.Errorf("error running request: %v: %v", resp.Status, string(body)) default: diff --git a/pkg/monitor/backenddisruption/disruption_backend_sampler_test.go b/pkg/monitor/backenddisruption/disruption_backend_sampler_test.go index 460543ae17b3..007aab16bad7 100644 --- a/pkg/monitor/backenddisruption/disruption_backend_sampler_test.go +++ b/pkg/monitor/backenddisruption/disruption_backend_sampler_test.go @@ -37,6 +37,14 @@ func TestBackendSampler_checkConnection(t *testing.T) { time.Sleep(2 * time.Second) w.WriteHeader(200) w.Write([]byte("200")) + case req.URL.Path == "/429-retry-after-shutdown": + w.Header().Set("Retry-After", "5") + w.WriteHeader(429) + w.Write([]byte("The apiserver is shutting down, please try again later.")) + case req.URL.Path == "/429-retry-after": + w.Header().Set("Retry-After", "5") + w.WriteHeader(429) + w.Write([]byte("429")) default: w.WriteHeader(404) } @@ -126,6 +134,33 @@ func TestBackendSampler_checkConnection(t *testing.T) { }, wantErr: false, // cancelling doesn't produce errors, it just means we were shutdown }, + { + name: "new-429-retry-after-shutdown", + fields: fields{ + disruptionBackendName: "429", + connectionType: monitorapi.NewConnectionType, + path: "/429-retry-after-shutdown", + }, + wantErr: true, // load balancer shouldn't send new connections to this instance + }, + { + name: "reused-429-retry-after-shutdown", + fields: fields{ + disruptionBackendName: "429", + connectionType: monitorapi.ReusedConnectionType, + path: "/429-retry-after-shutdown", + }, + wantErr: false, + }, + { + name: "reused-429-retry-after", + fields: fields{ + disruptionBackendName: "429", + connectionType: monitorapi.ReusedConnectionType, + path: "/429-retry-after", + }, + wantErr: true, // reason for 429 unrecognized + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {