[Fixed] Profiling and Monitoring timeout issues

The http servers for those two were recently modified to set a ReadTimeout and WriteTimeout. The WriteTimeout specifically caused issues for Profiling since it is common to ask sampling of several seconds. Pprof code would reject the request if it detected that http server's WriteTimeout was more than sampling in request. For monitoring, any situation that would cause the monitoring code to take more than 2 seconds to gather information (could be due to locking, amount of objects to return, time required for sorting, etc..) would also cause cURL to return empty response or WebBrowser to fail to display the page. Resolves #600
2026-04-14 10:10:42 -07:00 · 2017-11-08 14:52:27 -07:00
parent 92c4481d1b
commit 22cff99e58
4 changed files with 83 additions and 22 deletions
--- a/server/server.go
+++ b/server/server.go
@@ -85,6 +85,11 @@ type Server struct {
 		trace  int32
 		debug  int32
 	}
+
+	// Used by tests to check that http.Servers do
+	// not set any timeout.
+	monitoringServer *http.Server
+	profilingServer  *http.Server
 }

 // Make sure all are 64bits for atomic use
@@ -477,20 +482,24 @@ func (s *Server) StartProfiler() {
 	srv := &http.Server{
 		Addr:           hp,
 		Handler:        http.DefaultServeMux,
-		ReadTimeout:    2 * time.Second,
-		WriteTimeout:   2 * time.Second,
 		MaxHeaderBytes: 1 << 20,
 	}

 	s.mu.Lock()
 	s.profiler = l
+	s.profilingServer = srv
 	s.mu.Unlock()

 	go func() {
 		// if this errors out, it's probably because the server is being shutdown
 		err := srv.Serve(l)
 		if err != nil {
-			s.Fatalf("error starting profiler: %s", err)
+			s.mu.Lock()
+			shutdown := s.shutdown
+			s.mu.Unlock()
+			if !shutdown {
+				s.Fatalf("error starting profiler: %s", err)
+			}
 		}
 		s.done <- true
 	}()
@@ -606,16 +615,18 @@ func (s *Server) startMonitoring(secure bool) error {
 	// Stacksz
 	mux.HandleFunc(StackszPath, s.HandleStacksz)

+	// Do not set a WriteTimeout because it could cause cURL/browser
+	// to return empty response or unable to display page if the
+	// server needs more time to build the response.
 	srv := &http.Server{
 		Addr:           hp,
 		Handler:        mux,
-		ReadTimeout:    2 * time.Second,
-		WriteTimeout:   2 * time.Second,
 		MaxHeaderBytes: 1 << 20,
 	}
 	s.mu.Lock()
 	s.http = httpListener
 	s.httpHandler = mux
+	s.monitoringServer = srv
 	s.mu.Unlock()

 	go func() {
@@ -919,7 +930,7 @@ func (s *Server) MonitorAddr() *net.TCPAddr {
 	return s.http.Addr().(*net.TCPAddr)
 }

-// RouteAddr returns the net.Addr object for the route listener.
+// ClusterAddr returns the net.Addr object for the route listener.
 func (s *Server) ClusterAddr() *net.TCPAddr {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -929,6 +940,16 @@ func (s *Server) ClusterAddr() *net.TCPAddr {
 	return s.routeListener.Addr().(*net.TCPAddr)
 }

+// ProfilerAddr returns the net.Addr object for the route listener.
+func (s *Server) ProfilerAddr() *net.TCPAddr {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.profiler == nil {
+		return nil
+	}
+	return s.profiler.Addr().(*net.TCPAddr)
+}
+
 // ReadyForConnections returns `true` if the server is ready to accept client
 // and, if routing is enabled, route connections. If after the duration
 // `dur` the server is still not ready, returns `false`.
--- a/server/server_test.go
+++ b/server/server_test.go
@@ -19,7 +19,6 @@ func DefaultOptions() *Options {
 		Port:     -1,
 		HTTPPort: -1,
 		Cluster:  ClusterOpts{Port: -1},
-		ProfPort: -1,
 		NoLog:    true,
 		NoSigs:   true,
 		Debug:    true,
@@ -442,3 +441,51 @@ func TestCustomRouterAuthentication(t *testing.T) {
 	}

 }
+
+func TestMonitoringNoTimeout(t *testing.T) {
+	s := runMonitorServer()
+	defer s.Shutdown()
+
+	s.mu.Lock()
+	srv := s.monitoringServer
+	s.mu.Unlock()
+
+	if srv == nil {
+		t.Fatalf("Monitoring server not set")
+	}
+	if srv.ReadTimeout != 0 {
+		t.Fatalf("ReadTimeout should not be set, was set to %v", srv.ReadTimeout)
+	}
+	if srv.WriteTimeout != 0 {
+		t.Fatalf("WriteTimeout should not be set, was set to %v", srv.WriteTimeout)
+	}
+}
+
+func TestProfilingNoTimeout(t *testing.T) {
+	opts := DefaultOptions()
+	opts.ProfPort = -1
+	s := RunServer(opts)
+	defer s.Shutdown()
+
+	paddr := s.ProfilerAddr()
+	if paddr == nil {
+		t.Fatalf("Profiler not started")
+	}
+	pport := paddr.Port
+	if pport <= 0 {
+		t.Fatalf("Expected profiler port to be set, got %v", pport)
+	}
+	s.mu.Lock()
+	srv := s.profilingServer
+	s.mu.Unlock()
+
+	if srv == nil {
+		t.Fatalf("Profiling server not set")
+	}
+	if srv.ReadTimeout != 0 {
+		t.Fatalf("ReadTimeout should not be set, was set to %v", srv.ReadTimeout)
+	}
+	if srv.WriteTimeout != 0 {
+		t.Fatalf("WriteTimeout should not be set, was set to %v", srv.WriteTimeout)
+	}
+}
--- a/test/monitor_test.go
+++ b/test/monitor_test.go
@@ -71,7 +71,7 @@ func runMonitorServerNoHTTPPort() *server.Server {
 }

 func resetPreviousHTTPConnections() {
-	http.DefaultTransport = &http.Transport{}
+	http.DefaultTransport.(*http.Transport).CloseIdleConnections()
 }

 // Make sure that we do not run the http server for monitoring unless asked.
--- a/test/route_discovery_test.go
+++ b/test/route_discovery_test.go
@@ -289,8 +289,6 @@ func TestStressSeedSolicitWorks(t *testing.T) {
 			var err error
 			maxTime := time.Now().Add(5 * time.Second)
 			for time.Now().Before(maxTime) {
-				resetPreviousHTTPConnections()
-
 				for j := 0; j < len(serversInfo); j++ {
 					err = checkConnected(t, serversInfo, j, true)
 					// If error, start this for loop from beginning
@@ -430,8 +428,6 @@ func TestStressChainedSolicitWorks(t *testing.T) {
 			var err error
 			maxTime := time.Now().Add(5 * time.Second)
 			for time.Now().Before(maxTime) {
-				resetPreviousHTTPConnections()
-
 				for j := 0; j < len(serversInfo); j++ {
 					err = checkConnected(t, serversInfo, j, false)
 					// If error, start this for loop from beginning
@@ -554,25 +550,22 @@ func expectRidsNoFatal(t *testing.T, direct bool, rz *server.Routez, rids []stri

 // Helper to easily grab routez info.
 func readHTTPRoutez(t *testing.T, url string) *server.Routez {
+	resetPreviousHTTPConnections()
 	resp, err := http.Get(url + "routez")
 	if err != nil {
-		t.Fatalf("Expected no error: Got %v\n", err)
-	}
-	if resp.StatusCode != 200 {
-		// Do one retry - FIXME(dlc) - Why does this fail when running the solicit tests b2b?
-		resp, _ = http.Get(url + "routez")
-		if resp.StatusCode != 200 {
-			t.Fatalf("Expected a 200 response, got %d\n", resp.StatusCode)
-		}
+		stackFatalf(t, "Expected no error: Got %v\n", err)
 	}
 	defer resp.Body.Close()
+	if resp.StatusCode != 200 {
+		stackFatalf(t, "Expected a 200 response, got %d\n", resp.StatusCode)
+	}
 	body, err := ioutil.ReadAll(resp.Body)
 	if err != nil {
-		t.Fatalf("Got an error reading the body: %v\n", err)
+		stackFatalf(t, "Got an error reading the body: %v\n", err)
 	}
 	r := server.Routez{}
 	if err := json.Unmarshal(body, &r); err != nil {
-		t.Fatalf("Got an error unmarshalling the body: %v\n", err)
+		stackFatalf(t, "Got an error unmarshalling the body: %v\n", err)
 	}
 	return &r
 }