healthcheck: stop checking when max attempts are reached

This commit adds a check to the returned error from calling retryCheck
such that when the max number of attempts is reached, the health check
will quit.
This commit is contained in:
yyforyongyu
2021-09-02 21:13:23 +08:00
parent 3204e2d74b
commit d19ee28089
2 changed files with 70 additions and 39 deletions

View File

@@ -86,6 +86,8 @@ func (m *Monitor) Stop() error {
return fmt.Errorf("monitor already stopped") return fmt.Errorf("monitor already stopped")
} }
log.Info("Health monitor shutting down")
close(m.quit) close(m.quit)
m.wg.Wait() m.wg.Wait()
@@ -166,10 +168,18 @@ func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
for { for {
select { select {
case <-o.Interval.Ticks(): case <-o.Interval.Ticks():
o.retryCheck(quit, shutdown) // retryCheck will return errMaxAttemptsReached when
// the max attempts are reached. In that case we will
// stop the ticker and quit.
if o.retryCheck(quit, shutdown) {
log.Debugf("Health check: max attempts " +
"failed, monitor exiting")
return
}
// Exit if we receive the instruction to shutdown. // Exit if we receive the instruction to shutdown.
case <-quit: case <-quit:
log.Debug("Health check: monitor quit")
return return
} }
} }
@@ -178,8 +188,11 @@ func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
// retryCheck calls a check function until it succeeds, or we reach our // retryCheck calls a check function until it succeeds, or we reach our
// configured number of attempts, waiting for our back off period between failed // configured number of attempts, waiting for our back off period between failed
// calls. If we fail to obtain a passing health check after the allowed number // calls. If we fail to obtain a passing health check after the allowed number
// of calls, we will request shutdown. // of calls, we will request shutdown. It returns a bool to indicate whether
func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) { // the max number of attempts is reached.
func (o *Observation) retryCheck(quit chan struct{},
shutdown shutdownFunc) bool {
var count int var count int
for count < o.Attempts { for count < o.Attempts {
@@ -197,13 +210,14 @@ func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
"%v", o, o.Timeout) "%v", o, o.Timeout)
case <-quit: case <-quit:
return log.Debug("Health check: monitor quit")
return false
} }
// If our error is nil, we have passed our health check, so we // If our error is nil, we have passed our health check, so we
// can exit. // can exit.
if err == nil { if err == nil {
return return false
} }
// If we have reached our allowed number of attempts, this // If we have reached our allowed number of attempts, this
@@ -211,8 +225,7 @@ func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
if count == o.Attempts { if count == o.Attempts {
shutdown("Health check: %v failed after %v "+ shutdown("Health check: %v failed after %v "+
"calls", o, o.Attempts) "calls", o, o.Attempts)
return true
return
} }
log.Infof("Health check: %v, call: %v failed with: %v, "+ log.Infof("Health check: %v, call: %v failed with: %v, "+
@@ -225,7 +238,10 @@ func (o *Observation) retryCheck(quit chan struct{}, shutdown shutdownFunc) {
case <-time.After(o.Backoff): case <-time.After(o.Backoff):
case <-quit: case <-quit:
return log.Debug("Health check: monitor quit")
return false
} }
} }
return false
} }

View File

@@ -132,6 +132,10 @@ func TestRetryCheck(t *testing.T) {
// expectedShutdown is true if we expect a shutdown to be // expectedShutdown is true if we expect a shutdown to be
// triggered because all of our calls failed. // triggered because all of our calls failed.
expectedShutdown bool expectedShutdown bool
// maxAttemptsReached specifies whether the max allowed
// attempts are reached from calling retryCheck.
maxAttemptsReached bool
}{ }{
{ {
name: "first call succeeds", name: "first call succeeds",
@@ -139,6 +143,7 @@ func TestRetryCheck(t *testing.T) {
attempts: 2, attempts: 2,
timeout: time.Hour, timeout: time.Hour,
expectedShutdown: false, expectedShutdown: false,
maxAttemptsReached: false,
}, },
{ {
name: "first call fails", name: "first call fails",
@@ -146,6 +151,7 @@ func TestRetryCheck(t *testing.T) {
attempts: 1, attempts: 1,
timeout: time.Hour, timeout: time.Hour,
expectedShutdown: true, expectedShutdown: true,
maxAttemptsReached: true,
}, },
{ {
name: "fail then recover", name: "fail then recover",
@@ -153,6 +159,7 @@ func TestRetryCheck(t *testing.T) {
attempts: 2, attempts: 2,
timeout: time.Hour, timeout: time.Hour,
expectedShutdown: false, expectedShutdown: false,
maxAttemptsReached: false,
}, },
{ {
name: "always fail", name: "always fail",
@@ -160,6 +167,7 @@ func TestRetryCheck(t *testing.T) {
attempts: 2, attempts: 2,
timeout: time.Hour, timeout: time.Hour,
expectedShutdown: true, expectedShutdown: true,
maxAttemptsReached: true,
}, },
{ {
name: "no calls", name: "no calls",
@@ -167,6 +175,7 @@ func TestRetryCheck(t *testing.T) {
attempts: 0, attempts: 0,
timeout: time.Hour, timeout: time.Hour,
expectedShutdown: false, expectedShutdown: false,
maxAttemptsReached: false,
}, },
{ {
name: "call times out", name: "call times out",
@@ -174,6 +183,7 @@ func TestRetryCheck(t *testing.T) {
attempts: 1, attempts: 1,
timeout: 1, timeout: 1,
expectedShutdown: true, expectedShutdown: true,
maxAttemptsReached: true,
}, },
} }
@@ -203,8 +213,11 @@ func TestRetryCheck(t *testing.T) {
// on us sending errors into the mocked caller's error // on us sending errors into the mocked caller's error
// channel. // channel.
done := make(chan struct{}) done := make(chan struct{})
retryResult := false
go func() { go func() {
observation.retryCheck(quit, shutdownFunc) retryResult = observation.retryCheck(
quit, shutdownFunc,
)
close(done) close(done)
}() }()
@@ -218,6 +231,8 @@ func TestRetryCheck(t *testing.T) {
// check function before we start checking results. // check function before we start checking results.
<-done <-done
require.Equal(t, test.maxAttemptsReached, retryResult,
"retryCheck returned unexpected error")
require.Equal(t, test.expectedShutdown, shutdown, require.Equal(t, test.expectedShutdown, shutdown,
"unexpected shutdown state") "unexpected shutdown state")
}) })