diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index bb44bbf485..4c4dba65b8 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -5,6 +5,7 @@ sort: 15 # CHANGELOG * BUGFIX: properly remove stale parts outside the configured retention if `-retentionPeriod` is smaller than one month. Previously stale parts could remain active for up to a month after they go outside the retention. +* BUGFIX: stop the process on panic errors, since such errors may leave the process in inconsistent state. Previously panics could be recovered, which could result in unexpected hard-to-debug further behavior of running process. ## tip diff --git a/lib/httpserver/httpserver.go b/lib/httpserver/httpserver.go index 4bcdf21a38..022fe4d195 100644 --- a/lib/httpserver/httpserver.go +++ b/lib/httpserver/httpserver.go @@ -11,6 +11,7 @@ import ( "net" "net/http" "net/http/pprof" + "os" "path" "runtime" "strconv" @@ -190,6 +191,20 @@ var metricsHandlerDuration = metrics.NewHistogram(`vm_http_request_duration_seco var connTimeoutClosedConns = metrics.NewCounter(`vm_http_conn_timeout_closed_conns_total`) func handlerWrapper(s *server, w http.ResponseWriter, r *http.Request, rh RequestHandler) { + // All the VictoriaMetrics code assumes that panic stops the process. + // Unfortunately, the standard net/http.Server recovers from panics in request handlers, + // so VictoriaMetrics state can become inconsistent after the recovered panic. + // The following recover() code works around this by explicitly stopping the process after logging the panic. + // See https://github.com/golang/go/issues/16542#issuecomment-246549902 for details. + defer func() { + if err := recover(); err != nil { + buf := make([]byte, 1<<20) + n := runtime.Stack(buf, false) + fmt.Fprintf(os.Stderr, "panic: %v\n\n%s", err, buf[:n]) + os.Exit(1) + } + }() + requestsTotal.Inc() if whetherToCloseConn(r) { connTimeoutClosedConns.Inc()