Commit 8eef599

Merge branch 'ndyakov/feature/CAE-1313-maint-cluster' into ndyakov/additional-e2e-proxy-tests
2 parents: a99127f + dbf6fd1

File tree: 9 files changed, +204 -13 lines

commands_test.go

Lines changed: 9 additions & 1 deletion
@@ -8870,11 +8870,15 @@ var _ = Describe("Commands", func() {
 		It("returns latencies", func() {
 			const key = "latency-monitor-threshold"
 
+			// reset all latencies first to ensure clean state
+			err := client.LatencyReset(ctx).Err()
+			Expect(err).NotTo(HaveOccurred())
+
 			old := client.ConfigGet(ctx, key).Val()
 			client.ConfigSet(ctx, key, "1")
 			defer client.ConfigSet(ctx, key, old[key])
 
-			err := client.Do(ctx, "DEBUG", "SLEEP", 0.01).Err()
+			err = client.Do(ctx, "DEBUG", "SLEEP", 0.01).Err()
 			Expect(err).NotTo(HaveOccurred())
 
 			result, err := client.Latency(ctx).Result()
@@ -8921,6 +8925,10 @@ var _ = Describe("Commands", func() {
 		It("reset latencies by add event name args", func() {
 			const key = "latency-monitor-threshold"
 
+			// reset all latencies first to ensure clean state
+			err := client.LatencyReset(ctx).Err()
+			Expect(err).NotTo(HaveOccurred())
+
 			old := client.ConfigGet(ctx, key).Val()
 			// Use a higher threshold (100ms) to avoid capturing normal operations
 			// that could cause flakiness due to timing variations
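Both specs now clear the latency log before asserting, so events recorded by earlier tests cannot leak into the results. As a standalone illustration of that hygiene, a minimal sketch assuming the go-redis/v9 import path, a local server at an illustrative address, and the LatencyReset/Latency helpers the tests call:

package main

import (
	"context"
	"fmt"

	"github.com/redis/go-redis/v9"
)

func main() {
	ctx := context.Background()
	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})

	// Clear previously recorded latency events so the next read reflects
	// only what happens after this point, the same reset the tests add.
	if err := rdb.LatencyReset(ctx).Err(); err != nil {
		panic(err)
	}

	result, err := rdb.Latency(ctx).Result()
	if err != nil {
		panic(err)
	}
	fmt.Println(result) // empty immediately after a reset
}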

error.go

Lines changed: 14 additions & 0 deletions
@@ -124,6 +124,9 @@ func shouldRetry(err error, retryTimeout bool) bool {
 	if proto.IsTryAgainError(err) {
 		return true
 	}
+	if proto.IsNoReplicasError(err) {
+		return true
+	}
 
 	// Fallback to string checking for backward compatibility with plain errors
 	s := err.Error()
@@ -145,6 +148,9 @@ func shouldRetry(err error, retryTimeout bool) bool {
 	if strings.HasPrefix(s, "MASTERDOWN ") {
 		return true
 	}
+	if strings.HasPrefix(s, "NOREPLICAS ") {
+		return true
+	}
 
 	return false
 }
@@ -342,6 +348,14 @@ func IsOOMError(err error) bool {
 	return proto.IsOOMError(err)
 }
 
+// IsNoReplicasError checks if an error is a Redis NOREPLICAS error, even if wrapped.
+// NOREPLICAS errors occur when not enough replicas acknowledge a write operation.
+// This typically happens with WAIT/WAITAOF commands or CLUSTER SETSLOT with synchronous
+// replication when the required number of replicas cannot confirm the write within the timeout.
+func IsNoReplicasError(err error) bool {
+	return proto.IsNoReplicasError(err)
+}
+
 //------------------------------------------------------------------------------
 
 type timeoutError interface {
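With NOREPLICAS now classified as retryable inside the client, the exported IsNoReplicasError also lets application code build its own policy. A minimal sketch, assuming the go-redis/v9 import path; doWithRetry, its key/value, and the backoff are hypothetical, not part of this commit:

package main

import (
	"context"
	"time"

	"github.com/redis/go-redis/v9"
)

// doWithRetry retries a write only when the failure is a NOREPLICAS error,
// i.e. not enough replicas acknowledged the write; every other error is
// returned to the caller immediately.
func doWithRetry(ctx context.Context, rdb *redis.Client, attempts int) error {
	var err error
	for i := 0; i < attempts; i++ {
		err = rdb.Set(ctx, "key", "value", 0).Err()
		if err == nil || !redis.IsNoReplicasError(err) {
			return err // success, or an error we should not retry
		}
		// crude linear backoff before the next attempt
		time.Sleep(time.Duration(i+1) * 50 * time.Millisecond)
	}
	return err
}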

error_test.go

Lines changed: 2 additions & 1 deletion
@@ -45,7 +45,8 @@ var _ = Describe("error", func() {
 		proto.ParseErrorReply([]byte("-READONLY You can't write against a read only replica")):  true,
 		proto.ParseErrorReply([]byte("-CLUSTERDOWN The cluster is down")):                       true,
 		proto.ParseErrorReply([]byte("-TRYAGAIN Command cannot be processed, please try again")): true,
-		proto.ParseErrorReply([]byte("-ERR other")): false,
+		proto.ParseErrorReply([]byte("-NOREPLICAS Not enough good replicas to write")):          true,
+		proto.ParseErrorReply([]byte("-ERR other")):                                             false,
 	}
 
 	for err, expected := range data {

error_wrapping_test.go

Lines changed: 10 additions & 4 deletions
@@ -239,10 +239,10 @@ func TestErrorWrappingInHookScenario(t *testing.T) {
 // TestShouldRetryWithTypedErrors tests that shouldRetry works with typed errors
 func TestShouldRetryWithTypedErrors(t *testing.T) {
 	tests := []struct {
-		name        string
-		errorMsg    string
-		shouldRetry bool
-		retryTimeout bool
+		name         string
+		errorMsg     string
+		shouldRetry  bool
+		retryTimeout bool
 	}{
 		{
 			name: "LOADING error should retry",
@@ -280,6 +280,12 @@ func TestShouldRetryWithTypedErrors(t *testing.T) {
 			shouldRetry:  true,
 			retryTimeout: false,
 		},
+		{
+			name:         "NOREPLICAS error should retry",
+			errorMsg:     "NOREPLICAS Not enough good replicas to write",
+			shouldRetry:  true,
+			retryTimeout: false,
+		},
 	}
 
 	for _, tt := range tests {

internal/pool/pool.go

Lines changed: 6 additions & 0 deletions
@@ -321,6 +321,12 @@ func (p *ConnPool) newConn(ctx context.Context, pooled bool) (*Conn, error) {
 		return nil, ErrPoolExhausted
 	}
 
+	// Protect against nil context due to race condition in queuedNewConn
+	// where the context can be set to nil after timeout/cancellation
+	if ctx == nil {
+		ctx = context.Background()
+	}
+
 	dialCtx, cancel := context.WithTimeout(ctx, p.cfg.DialTimeout)
 	defer cancel()
 	cn, err := p.dialConn(dialCtx, pooled)
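The guard exists because deriving a child context from a nil parent panics at runtime. A standalone sketch of the failure mode and the fix, independent of the pool internals:

package main

import (
	"context"
	"fmt"
	"time"
)

// deriveDialCtx mirrors the guarded pattern above: context.WithTimeout
// panics with "cannot create context from nil parent" if handed a nil
// context, so a nil parent is normalized to context.Background() first.
func deriveDialCtx(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
	if ctx == nil {
		ctx = context.Background()
	}
	return context.WithTimeout(ctx, timeout)
}

func main() {
	// Without the nil check this call would panic, which is exactly what
	// the queuedNewConn race could trigger inside the pool.
	ctx, cancel := deriveDialCtx(nil, time.Second)
	defer cancel()
	fmt.Println(ctx.Err()) // <nil> until the timeout fires
}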

internal/pool/pool_test.go

Lines changed: 58 additions & 0 deletions
@@ -1037,6 +1037,64 @@ var _ = Describe("queuedNewConn", func() {
 		testPool.Put(ctx, reqBConn)
 		Eventually(func() int { return testPool.QueueLen() }, "600ms").Should(Equal(0))
 	})
+
+	// Test for race condition where nil context can be passed to newConn
+	// This reproduces the issue reported in GitHub where queuedNewConn panics
+	// with "cannot create context from nil parent"
+	It("should handle nil context race condition in queuedNewConn", func() {
+		// Create a pool with very short timeouts to trigger the race condition
+		testPool := pool.NewConnPool(&pool.Options{
+			Dialer: func(ctx context.Context) (net.Conn, error) {
+				// Add a small delay to increase chance of race condition
+				time.Sleep(50 * time.Millisecond)
+				return dummyDialer(ctx)
+			},
+			PoolSize:           int32(10),
+			MaxConcurrentDials: 10,
+			PoolTimeout:        10 * time.Millisecond, // Very short timeout
+			DialTimeout:        100 * time.Millisecond,
+			ConnMaxIdleTime:    time.Millisecond,
+		})
+		defer testPool.Close()
+
+		// Try to trigger the race condition by making many concurrent requests
+		// with short timeouts
+		const numGoroutines = 50
+		var wg sync.WaitGroup
+		errors := make(chan error, numGoroutines)
+
+		for i := 0; i < numGoroutines; i++ {
+			wg.Add(1)
+			go func() {
+				defer GinkgoRecover()
+				defer wg.Done()
+
+				// Use a very short context timeout to trigger the race
+				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond)
+				defer cancel()
+
+				_, err := testPool.Get(ctx)
+				if err != nil {
+					// We expect timeout errors, but not panics
+					errors <- err
+				}
+			}()
+		}
+
+		wg.Wait()
+		close(errors)
+
+		// Check that we got timeout errors (expected) but no panics
+		// The test passes if it doesn't panic
+		timeoutCount := 0
+		for err := range errors {
+			if err == context.DeadlineExceeded || err == pool.ErrPoolTimeout {
+				timeoutCount++
+			}
+		}
+
+		// We should have at least some timeouts due to the short timeout
+		Expect(timeoutCount).To(BeNumerically(">", 0))
+	})
 })
 
 func init() {

internal/pool/want_conn_test.go

Lines changed: 53 additions & 0 deletions
@@ -442,3 +442,56 @@ func BenchmarkWantConnQueue_EnqueueDequeue(b *testing.B) {
 		q.dequeue()
 	}
 }
+
+// TestWantConn_RaceConditionNilContext tests the race condition where
+// getCtxForDial can return nil after the context is cancelled.
+// This test verifies that the fix in newConn handles nil context gracefully.
+func TestWantConn_RaceConditionNilContext(t *testing.T) {
+	// This test simulates the race condition described in the issue:
+	// 1. Main goroutine creates a wantConn with a context
+	// 2. Background goroutine starts but hasn't called getCtxForDial yet
+	// 3. Main goroutine times out and calls cancel(), setting w.ctx to nil
+	// 4. Background goroutine calls getCtxForDial() and gets nil
+	// 5. Background goroutine calls newConn(nil, true) which should not panic
+
+	dialCtx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
+
+	w := &wantConn{
+		ctx:       dialCtx,
+		cancelCtx: cancel,
+		result:    make(chan wantConnResult, 1),
+	}
+
+	// Simulate the race condition by canceling the context
+	// and then trying to get it
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	go func() {
+		defer wg.Done()
+		// Small delay to ensure cancel happens first
+		time.Sleep(10 * time.Millisecond)
+
+		// This should return nil after cancel
+		ctx := w.getCtxForDial()
+
+		// Verify that we got nil context
+		if ctx != nil {
+			t.Errorf("Expected nil context after cancel, got %v", ctx)
+		}
+	}()
+
+	// Cancel the context immediately
+	w.cancel()
+
+	wg.Wait()
+
+	// Verify the wantConn state
+	if !w.done {
+		t.Error("wantConn should be marked as done after cancel")
+	}
+	if w.ctx != nil {
+		t.Error("wantConn.ctx should be nil after cancel")
+	}
+}
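For orientation, a simplified sketch of the wantConn shape this test exercises; the locking and field details are assumptions reconstructed from what the test touches, not the verbatim internal/pool source:

package pool

import (
	"context"
	"sync"
)

// Assumed shape of the type under test; not the actual implementation.
type wantConnResult struct {
	cn  *Conn
	err error
}

type wantConn struct {
	mu        sync.Mutex
	ctx       context.Context // nil once the request is cancelled
	cancelCtx context.CancelFunc
	done      bool
	result    chan wantConnResult
}

// getCtxForDial returns the context a background dialer should use.
// After cancel() has run it returns nil, which is the value newConn
// now tolerates.
func (w *wantConn) getCtxForDial() context.Context {
	w.mu.Lock()
	defer w.mu.Unlock()
	return w.ctx
}

// cancel marks the request done and drops the context reference, opening
// the window in which a concurrent dialer observes ctx == nil.
func (w *wantConn) cancel() {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.done = true
	w.ctx = nil
	if w.cancelCtx != nil {
		w.cancelCtx()
	}
}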

internal/proto/redis_errors.go

Lines changed: 39 additions & 0 deletions
@@ -212,6 +212,25 @@ func NewOOMError(msg string) *OOMError {
 	return &OOMError{msg: msg}
 }
 
+// NoReplicasError is returned when not enough replicas acknowledge a write.
+// This error occurs when using WAIT/WAITAOF commands or CLUSTER SETSLOT with
+// synchronous replication, and the required number of replicas cannot confirm
+// the write within the timeout period.
+type NoReplicasError struct {
+	msg string
+}
+
+func (e *NoReplicasError) Error() string {
+	return e.msg
+}
+
+func (e *NoReplicasError) RedisError() {}
+
+// NewNoReplicasError creates a new NoReplicasError with the given message.
+func NewNoReplicasError(msg string) *NoReplicasError {
+	return &NoReplicasError{msg: msg}
+}
+
 // parseTypedRedisError parses a Redis error message and returns a typed error if applicable.
 // This function maintains backward compatibility by keeping the same error messages.
 func parseTypedRedisError(msg string) error {
@@ -235,6 +254,8 @@ func parseTypedRedisError(msg string) error {
 		return NewTryAgainError(msg)
 	case strings.HasPrefix(msg, "MASTERDOWN "):
 		return NewMasterDownError(msg)
+	case strings.HasPrefix(msg, "NOREPLICAS "):
+		return NewNoReplicasError(msg)
 	case msg == "ERR max number of clients reached":
 		return NewMaxClientsError(msg)
 	case strings.HasPrefix(msg, "NOAUTH "), strings.HasPrefix(msg, "WRONGPASS "), strings.Contains(msg, "unauthenticated"):
@@ -486,3 +507,21 @@ func IsOOMError(err error) bool {
 	// Fallback to string checking for backward compatibility
 	return strings.HasPrefix(err.Error(), "OOM ")
 }
+
+// IsNoReplicasError checks if an error is a NoReplicasError, even if wrapped.
+func IsNoReplicasError(err error) bool {
+	if err == nil {
+		return false
+	}
+	var noReplicasErr *NoReplicasError
+	if errors.As(err, &noReplicasErr) {
+		return true
+	}
+	// Check if wrapped error is a RedisError with NOREPLICAS prefix
+	var redisErr RedisError
+	if errors.As(err, &redisErr) && strings.HasPrefix(redisErr.Error(), "NOREPLICAS ") {
+		return true
+	}
+	// Fallback to string checking for backward compatibility
+	return strings.HasPrefix(err.Error(), "NOREPLICAS ")
+}
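IsNoReplicasError therefore matches along three paths: the typed error itself, anything wrapping it, and plain errors that merely carry the NOREPLICAS prefix. A small example-style sketch (as it might appear in a package proto test file) walking all three:

package proto

import (
	"errors"
	"fmt"
)

// ExampleIsNoReplicasError demonstrates the three detection paths above.
func ExampleIsNoReplicasError() {
	typed := NewNoReplicasError("NOREPLICAS Not enough good replicas to write")
	wrapped := fmt.Errorf("pipeline exec: %w", typed)
	plain := errors.New("NOREPLICAS Not enough good replicas to write")

	fmt.Println(IsNoReplicasError(typed))   // matched by errors.As on the typed error
	fmt.Println(IsNoReplicasError(wrapped)) // matched after unwrapping the %w chain
	fmt.Println(IsNoReplicasError(plain))   // matched by the string-prefix fallback
	// Output:
	// true
	// true
	// true
}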

internal/proto/redis_errors_test.go

Lines changed: 13 additions & 7 deletions
@@ -9,12 +9,12 @@ import (
 // TestTypedRedisErrors tests that typed Redis errors are created correctly
 func TestTypedRedisErrors(t *testing.T) {
 	tests := []struct {
-		name        string
-		errorMsg    string
-		expectedType interface{}
-		expectedMsg string
-		checkFunc   func(error) bool
-		extractAddr func(error) string
+		name         string
+		errorMsg     string
+		expectedType interface{}
+		expectedMsg  string
+		checkFunc    func(error) bool
+		extractAddr  func(error) string
 	}{
 		{
 			name: "LOADING error",
@@ -132,6 +132,13 @@ func TestTypedRedisErrors(t *testing.T) {
 			expectedMsg: "OOM command not allowed when used memory > 'maxmemory'",
 			checkFunc:   IsOOMError,
 		},
+		{
+			name:         "NOREPLICAS error",
+			errorMsg:     "NOREPLICAS Not enough good replicas to write",
+			expectedType: &NoReplicasError{},
+			expectedMsg:  "NOREPLICAS Not enough good replicas to write",
+			checkFunc:    IsNoReplicasError,
+		},
 	}
 
 	for _, tt := range tests {
@@ -389,4 +396,3 @@ func TestBackwardCompatibility(t *testing.T) {
 		})
 	}
 }
-
392-
