Merge pull request #1122 from nats-io/latency-v2

Latency tracking updates.
This commit is contained in:
Derek Collison
2019-09-11 17:41:59 -07:00
committed by GitHub
5 changed files with 71 additions and 45 deletions

View File

@@ -486,12 +486,13 @@ func (a *Account) IsExportService(service string) bool {
// IsExportServiceTracking will indicate if given publish subject is an export service with tracking enabled.
func (a *Account) IsExportServiceTracking(service string) bool {
a.mu.RLock()
defer a.mu.RUnlock()
ea, ok := a.exports.services[service]
if ok && ea == nil {
a.mu.RUnlock()
return false
}
if ok && ea != nil && ea.latency != nil {
a.mu.RUnlock()
return true
}
// FIXME(dlc) - Might want to cache this in the hot path checking for
@@ -499,23 +500,51 @@ func (a *Account) IsExportServiceTracking(service string) bool {
tokens := strings.Split(service, tsep)
for subj, ea := range a.exports.services {
if isSubsetMatch(tokens, subj) && ea != nil && ea.latency != nil {
a.mu.RUnlock()
return true
}
}
a.mu.RUnlock()
return false
}
// NATSLatency represents the internal NATS latencies, including RTTs to clients.
type NATSLatency struct {
Requestor time.Duration `json:"requestor_rtt"`
Responder time.Duration `json:"responder_rtt"`
System time.Duration `json:"system_latency"`
}
// TotalTime is a helper function that totals the NATS latencies.
func (nl *NATSLatency) TotalTime() time.Duration {
return nl.Requestor + nl.Responder + nl.System
}
// ServiceLatency is the JSON message sent out in response to latency tracking for
// exported services.
type ServiceLatency struct {
AppName string `json:"app_name,omitempty"`
RequestStart time.Time `json:"request_start"`
ServiceLatency time.Duration `json:"service_latency"`
NATSLatency time.Duration `json:"nats_latency"`
NATSLatency NATSLatency `json:"nats_latency"`
TotalLatency time.Duration `json:"total_latency"`
}
// Used for transporting remote laytency measurements.
// merge combines two ServiceLatency measurements (requestor-side m1 and
// responder-side m2) into m1. Two samples occur when the requestor and
// responder are connected to different servers.
//
// m2 carries the accurate ServiceLatency and responder RTT; m1 carries the
// accurate TotalLatency. The difference between m1's service measurement and
// m2's (service + responder RTT) is attributed to NATS system latency.
func (m1 *ServiceLatency) merge(m2 *ServiceLatency) {
	// Compute system latency before overwriting m1.ServiceLatency below.
	system := m1.ServiceLatency - (m2.ServiceLatency + m2.NATSLatency.Responder)
	m1.AppName = m2.AppName
	m1.ServiceLatency = m2.ServiceLatency
	m1.NATSLatency.Responder = m2.NATSLatency.Responder
	m1.NATSLatency.System = system
}
// Used for transporting remote latency measurements.
type remoteLatency struct {
Account string `json:"account"`
ReqId string `json:"req_id"`
@@ -532,7 +561,6 @@ func (a *Account) sendTrackingLatency(si *serviceImport, requestor, responder *c
var (
reqClientRTT = requestor.getRTTValue()
natsRTT = reqClientRTT
respClientRTT time.Duration
appName string
)
@@ -541,7 +569,6 @@ func (a *Account) sendTrackingLatency(si *serviceImport, requestor, responder *c
if responder != nil && responder.kind == CLIENT {
respClientRTT = responder.getRTTValue()
natsRTT += respClientRTT
appName = responder.GetName()
}
@@ -552,8 +579,12 @@ func (a *Account) sendTrackingLatency(si *serviceImport, requestor, responder *c
AppName: appName,
RequestStart: reqStart,
ServiceLatency: serviceRTT - respClientRTT,
NATSLatency: natsRTT,
TotalLatency: reqClientRTT + serviceRTT,
NATSLatency: NATSLatency{
Requestor: reqClientRTT,
Responder: respClientRTT,
System: 0,
},
TotalLatency: reqClientRTT + serviceRTT,
}
// If we are expecting a remote measurement, store our sl here.
@@ -564,11 +595,8 @@ func (a *Account) sendTrackingLatency(si *serviceImport, requestor, responder *c
if expectRemoteM2 {
si.acc.mu.Lock()
if si.m1 != nil {
m2 := si.m1
m1 := &sl
m1.AppName = m2.AppName
m1.ServiceLatency = m2.ServiceLatency
m1.NATSLatency = m1.TotalLatency - m1.ServiceLatency
m1, m2 := &sl, si.m1
m1.merge(m2)
si.acc.mu.Unlock()
a.srv.sendInternalAccountMsg(a, si.latency.subject, m1)
return true

View File

@@ -2124,7 +2124,7 @@ var needFlush = struct{}{}
// deliverMsg will deliver a message to a matching subscription and its underlying client.
// We process all connection/client types. mh is the part that will be protocol/client specific.
func (c *client) deliverMsg(sub *subscription, mh, msg []byte) bool {
func (c *client) deliverMsg(sub *subscription, subject, mh, msg []byte) bool {
if sub.client == nil {
return false
}
@@ -2139,7 +2139,7 @@ func (c *client) deliverMsg(sub *subscription, mh, msg []byte) bool {
// Check if we have a subscribe deny clause. This will trigger us to check the subject
// for a match against the denied subjects.
if client.mperms != nil && client.checkDenySub(string(c.pa.subject)) {
if client.mperms != nil && client.checkDenySub(string(subject)) {
client.mu.Unlock()
return false
}
@@ -2204,7 +2204,7 @@ func (c *client) deliverMsg(sub *subscription, mh, msg []byte) bool {
if client.kind == SYSTEM {
s := client.srv
client.mu.Unlock()
s.deliverInternalMsg(sub, c.pa.subject, c.pa.reply, msg[:msgSize])
s.deliverInternalMsg(sub, subject, c.pa.reply, msg[:msgSize])
return true
}
@@ -2224,18 +2224,16 @@ func (c *client) deliverMsg(sub *subscription, mh, msg []byte) bool {
// Do a fast check here to see if we should be tracking this from a latency
// perspective. This will be for a request being received for an exported service.
// This needs to be from a non-client (otherwise tracking happens at requestor).
if c.kind != CLIENT && client.kind == CLIENT && len(c.pa.reply) > minReplyLen {
if client.kind == CLIENT && len(c.pa.reply) > minReplyLen {
// If we do not have a registered RTT queue that up now.
if client.rtt == 0 && client.flags.isSet(connectReceived) {
client.sendPing()
}
// FIXME(dlc) - We may need to optimize this.
if client.acc.IsExportServiceTracking(string(c.pa.subject)) {
// If we do not have a registered RTT queue that up now.
if client.rtt == 0 && c.flags.isSet(connectReceived) {
client.sendPing()
}
// We will have tagged this with a suffix ('.T') if we are tracking. This is
// needed from sampling. Not all will be tracked.
if isTrackedReply(c.pa.reply) {
client.trackRemoteReply(string(c.pa.reply))
}
// We will have tagged this with a suffix ('.T') if we are tracking. This is
// needed for sampling. Not all will be tracked.
if c.kind != CLIENT && client.acc.IsExportServiceTracking(string(subject)) && isTrackedReply(c.pa.reply) {
client.trackRemoteReply(string(c.pa.reply))
}
}
@@ -2531,8 +2529,8 @@ func (c *client) processInboundClientMsg(msg []byte) {
// Fill this in and send it off to the other side.
sl.AppName = c.opts.Name
sl.ServiceLatency = time.Since(sl.RequestStart) - rtt
sl.NATSLatency = rtt
sl.TotalLatency = sl.ServiceLatency + sl.NATSLatency
sl.NATSLatency.Responder = rtt
sl.TotalLatency = sl.ServiceLatency + rtt
lsub := remoteLatencySubjectForResponse(c.pa.subject)
c.srv.sendInternalAccountMsg(nil, lsub, &rl) // Send to SYS account
}
@@ -2624,7 +2622,6 @@ func (c *client) checkForImportServices(acc *Account, msg []byte) {
// We have a service import that we are tracking but have not established RTT
c.sendRTTPing()
}
// If this is a client or leaf connection and we are in gateway mode,
// we need to send RS+ to our local cluster and possibly to inbound
// GW connections for which we are in interest-only mode.
@@ -2757,7 +2754,7 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, subject,
}
// Normal delivery
mh := c.msgHeader(msgh[:si], sub, reply)
c.deliverMsg(sub, mh, msg)
c.deliverMsg(sub, subject, mh, msg)
}
// Set these up to optionally filter based on the queue lists.
@@ -2844,7 +2841,7 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, subject,
}
mh := c.msgHeader(msgh[:si], sub, reply)
if c.deliverMsg(sub, mh, msg) {
if c.deliverMsg(sub, subject, mh, msg) {
// Clear rsub
rsub = nil
if flags&pmrCollectQueueNames != 0 {
@@ -2908,7 +2905,7 @@ sendToRoutesOrLeafs:
}
mh = append(mh, c.pa.szb...)
mh = append(mh, _CRLF_...)
c.deliverMsg(rt.sub, mh, msg)
c.deliverMsg(rt.sub, subject, mh, msg)
}
return queues
}

View File

@@ -1033,7 +1033,7 @@ func (s *Server) remoteLatencyUpdate(sub *subscription, subject, _ string, msg [
lsub := si.latency.subject
acc.mu.RUnlock()
// So we have no processed the response tracking measurement yet.
// So we have not processed the response tracking measurement yet.
if m1 == nil {
acc.mu.Lock()
// Double check since could have slipped in.
@@ -1052,9 +1052,7 @@ func (s *Server) remoteLatencyUpdate(sub *subscription, subject, _ string, msg [
// M2 ServiceLatency is correct, so use that.
// M1 TotalLatency is correct, so use that.
// Will use those to back into NATS latency.
m1.AppName = m2.AppName
m1.ServiceLatency = m2.ServiceLatency
m1.NATSLatency = m1.TotalLatency - m1.ServiceLatency
m1.merge(&m2)
// Make sure we remove the entry here.
si.acc.removeServiceImport(si.from)

View File

@@ -2356,7 +2356,7 @@ func (c *client) sendMsgToGateways(acc *Account, msg, subject, reply []byte, qgr
sub.nm, sub.max = 0, 0
sub.client = gwc
sub.subject = c.pa.subject
c.deliverMsg(sub, mh, msg)
c.deliverMsg(sub, c.pa.subject, mh, msg)
}
// Done with subscription, put back to pool. We don't need
// to reset content since we explicitly set when using it.

View File

@@ -115,15 +115,18 @@ func checkServiceLatency(t *testing.T, sl server.ServiceLatency, start time.Time
if sl.TotalLatency < sl.ServiceLatency {
t.Fatalf("Bad total latency: %v", sl.ServiceLatency)
}
// We should have NATS latency here that is non-zero with real clients.
if sl.NATSLatency == 0 {
t.Fatalf("Expected non-zero NATS latency")
if sl.NATSLatency.Requestor == 0 {
t.Fatalf("Expected non-zero NATS Requestor latency")
}
if sl.NATSLatency.Responder == 0 {
t.Fatalf("Expected non-zero NATS Responder latency")
}
// Make sure they add up
if sl.TotalLatency != sl.ServiceLatency+sl.NATSLatency {
if sl.TotalLatency != sl.ServiceLatency+sl.NATSLatency.TotalTime() {
t.Fatalf("Numbers do not add up: %+v", sl)
}
}
func TestServiceLatencySingleServerConnect(t *testing.T) {
@@ -220,8 +223,8 @@ func TestServiceLatencyRemoteConnect(t *testing.T) {
// Lastly here, we need to make sure we are properly tracking the extra hops.
// We will make sure that NATS latency is close to what we see from the outside in terms of RTT.
if crtt := connRTT(nc) + connRTT(nc2); sl.NATSLatency < crtt {
t.Fatalf("Not tracking second measurement for NATS latency across servers: %v vs %v", sl.NATSLatency, crtt)
if crtt := connRTT(nc) + connRTT(nc2); sl.NATSLatency.TotalTime() < crtt {
t.Fatalf("Not tracking second measurement for NATS latency across servers: %v vs %v", sl.NATSLatency.TotalTime(), crtt)
}
// Gateway Requestor
@@ -244,8 +247,8 @@ func TestServiceLatencyRemoteConnect(t *testing.T) {
// Lastly here, we need to make sure we are properly tracking the extra hops.
// We will make sure that NATS latency is close to what we see from the outside in terms of RTT.
if crtt := connRTT(nc) + connRTT(nc2); sl.NATSLatency < crtt {
t.Fatalf("Not tracking second measurement for NATS latency across servers: %v vs %v", sl.NATSLatency, crtt)
if crtt := connRTT(nc) + connRTT(nc2); sl.NATSLatency.TotalTime() < crtt {
t.Fatalf("Not tracking second measurement for NATS latency across servers: %v vs %v", sl.NATSLatency.TotalTime(), crtt)
}
}