diff --git a/setup/base/base.go b/setup/base/base.go index 0636c7b8..2e3a3a19 100644 --- a/setup/base/base.go +++ b/setup/base/base.go @@ -18,6 +18,7 @@ import ( "context" "crypto/tls" "database/sql" + "encoding/json" "fmt" "io" "net" @@ -467,8 +468,13 @@ func (b *BaseDendrite) SetupAndServeHTTP( w.WriteHeader(200) }) b.DendriteAdminMux.HandleFunc("/monitor/health", func(w http.ResponseWriter, r *http.Request) { - if b.ProcessContext.IsDegraded() { + if isDegraded, reasons := b.ProcessContext.IsDegraded(); isDegraded { w.WriteHeader(503) + _ = json.NewEncoder(w).Encode(struct { + Warnings []string `json:"warnings"` + }{ + Warnings: reasons, + }) return } w.WriteHeader(200) diff --git a/setup/jetstream/nats.go b/setup/jetstream/nats.go index 7409fd6c..af4eb294 100644 --- a/setup/jetstream/nats.go +++ b/setup/jetstream/nats.go @@ -169,9 +169,9 @@ func setupNATS(process *process.ProcessContext, cfg *config.JetStream, nc *natsc // We've managed to add the stream in memory. What's on the // disk will be left alone, but our ability to recover from a // future crash will be limited. Yell about it. - sentry.CaptureException(fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible", namespaced.Name)) - logrus.Warn("Stream is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible") - process.Degraded() + err := fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory", namespaced.Name) + sentry.CaptureException(err) + process.Degraded(err) } } } diff --git a/setup/process/process.go b/setup/process/process.go index 06ef6021..b2d2844a 100644 --- a/setup/process/process.go +++ b/setup/process/process.go @@ -2,19 +2,18 @@ package process import ( "context" - "fmt" "sync" "github.com/getsentry/sentry-go" "github.com/sirupsen/logrus" - "go.uber.org/atomic" ) type ProcessContext struct { - wg *sync.WaitGroup // used to wait for components to shutdown - ctx context.Context // cancelled when Stop is called - shutdown context.CancelFunc // shut down Dendrite - degraded atomic.Bool + mu sync.RWMutex + wg *sync.WaitGroup // used to wait for components to shutdown + ctx context.Context // cancelled when Stop is called + shutdown context.CancelFunc // shut down Dendrite + degraded map[string]struct{} // reasons why the process is degraded } func NewProcessContext() *ProcessContext { @@ -50,13 +49,25 @@ func (b *ProcessContext) WaitForComponentsToFinish() { b.wg.Wait() } -func (b *ProcessContext) Degraded() { - if b.degraded.CompareAndSwap(false, true) { - logrus.Warn("Dendrite is running in a degraded state") - sentry.CaptureException(fmt.Errorf("Process is running in a degraded state")) +func (b *ProcessContext) Degraded(err error) { + b.mu.Lock() + defer b.mu.Unlock() + if _, ok := b.degraded[err.Error()]; !ok { + logrus.WithError(err).Warn("Dendrite has entered a degraded state") + sentry.CaptureException(err) + b.degraded[err.Error()] = struct{}{} } } -func (b *ProcessContext) IsDegraded() bool { - return b.degraded.Load() +func (b *ProcessContext) IsDegraded() (bool, []string) { + b.mu.RLock() + defer b.mu.RUnlock() + if len(b.degraded) == 0 { + return false, nil + } + reasons := make([]string, 0, len(b.degraded)) + for reason := range b.degraded { + reasons = append(reasons, reason) + } + return true, reasons }