Refactor Federation Destination Queues (#2807)

This is a refactor of the federation destination queues.
It fixes a few things, namely:
- actually retry outgoing events with backoff behaviour
- obtain enough events from the database to fill messages as much as
possible
- minimize the amount of running goroutines
  - use pure timers for backoff
  - don't restart queue unless necessary
  - close the background task when backing off
- increase max EDUs in a transaction to match the spec
- clean up timers more aggressively to reduce memory usage
- add jitter to backoff timers to reduce resource spikes (see the
illustrative sketch below)
- add a bunch of tests (with real and fake databases) to ensure
everything is working
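
As context for the backoff changes above, the retry interval is now roughly
2**failures seconds scaled by a random jitter factor. The snippet below is
only an illustrative sketch of that calculation (the standalone
backoffDuration helper and the main function are hypothetical, not part of
this commit); the real implementation is ServerStatistics.duration in the
diff further down.

// Illustrative sketch of the jittered exponential backoff used by the
// federation destination queues. Not part of the commit itself.
package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

const (
	minJitterMultiplier = 0.8
	maxJitterMultiplier = 1.4
)

// backoffDuration is a hypothetical helper: roughly 2**count seconds,
// scaled by a random factor in [0.8, 1.4) so hosts do not all retry at once.
func backoffDuration(count uint32) time.Duration {
	jitter := rand.Float64()*(maxJitterMultiplier-minJitterMultiplier) + minJitterMultiplier
	return time.Millisecond * time.Duration(math.Exp2(float64(count))*jitter*1000)
}

func main() {
	for i := uint32(0); i < 5; i++ {
		fmt.Printf("failure %d: wait ~%s\n", i, backoffDuration(i))
	}
}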
devonh committed 2022-10-19 10:03:16 +00:00 (via GitHub)
parent 3aa92efaa3
commit 241d5c47df
8 changed files with 1410 additions and 202 deletions


@@ -2,6 +2,7 @@ package statistics
 import (
 	"math"
+	"math/rand"
 	"sync"
 	"time"
@ -20,12 +21,23 @@ type Statistics struct {
servers map[gomatrixserverlib.ServerName]*ServerStatistics
mutex sync.RWMutex
backoffTimers map[gomatrixserverlib.ServerName]*time.Timer
backoffMutex sync.RWMutex
// How many times should we tolerate consecutive failures before we
// just blacklist the host altogether? The backoff is exponential,
// so the max time here to attempt is 2**failures seconds.
FailuresUntilBlacklist uint32
}
func NewStatistics(db storage.Database, failuresUntilBlacklist uint32) Statistics {
return Statistics{
DB: db,
FailuresUntilBlacklist: failuresUntilBlacklist,
backoffTimers: make(map[gomatrixserverlib.ServerName]*time.Timer),
}
}
// ForServer returns server statistics for the given server name. If it
// does not exist, it will create empty statistics and return those.
func (s *Statistics) ForServer(serverName gomatrixserverlib.ServerName) *ServerStatistics {
@@ -45,7 +57,6 @@ func (s *Statistics) ForServer(serverName gomatrixserverlib.ServerName) *ServerStatistics {
 		server = &ServerStatistics{
 			statistics: s,
 			serverName: serverName,
-			interrupt: make(chan struct{}),
 		}
 		s.servers[serverName] = server
 		s.mutex.Unlock()
@@ -64,29 +75,43 @@ func (s *Statistics) ForServer(serverName gomatrixserverlib.ServerName) *ServerStatistics {
 // many times we failed etc. It also manages the backoff time and black-
 // listing a remote host if it remains uncooperative.
 type ServerStatistics struct {
-	statistics *Statistics //
-	serverName gomatrixserverlib.ServerName //
-	blacklisted atomic.Bool // is the node blacklisted
-	backoffStarted atomic.Bool // is the backoff started
-	backoffUntil atomic.Value // time.Time until this backoff interval ends
-	backoffCount atomic.Uint32 // number of times BackoffDuration has been called
-	interrupt chan struct{} // interrupts the backoff goroutine
-	successCounter atomic.Uint32 // how many times have we succeeded?
+	statistics *Statistics //
+	serverName gomatrixserverlib.ServerName //
+	blacklisted atomic.Bool // is the node blacklisted
+	backoffStarted atomic.Bool // is the backoff started
+	backoffUntil atomic.Value // time.Time until this backoff interval ends
+	backoffCount atomic.Uint32 // number of times BackoffDuration has been called
+	successCounter atomic.Uint32 // how many times have we succeeded?
+	backoffNotifier func() // notifies destination queue when backoff completes
+	notifierMutex sync.Mutex
 }
+const maxJitterMultiplier = 1.4
+const minJitterMultiplier = 0.8
 // duration returns how long the next backoff interval should be.
 func (s *ServerStatistics) duration(count uint32) time.Duration {
-	return time.Second * time.Duration(math.Exp2(float64(count)))
+	// Add some jitter to minimise the chance of having multiple backoffs
+	// ending at the same time.
+	jitter := rand.Float64()*(maxJitterMultiplier-minJitterMultiplier) + minJitterMultiplier
+	duration := time.Millisecond * time.Duration(math.Exp2(float64(count))*jitter*1000)
+	return duration
 }
 // cancel will interrupt the currently active backoff.
 func (s *ServerStatistics) cancel() {
 	s.blacklisted.Store(false)
 	s.backoffUntil.Store(time.Time{})
-	select {
-	case s.interrupt <- struct{}{}:
-	default:
-	}
+	s.ClearBackoff()
 }
+// AssignBackoffNotifier configures the channel to send to when
+// a backoff completes.
+func (s *ServerStatistics) AssignBackoffNotifier(notifier func()) {
+	s.notifierMutex.Lock()
+	defer s.notifierMutex.Unlock()
+	s.backoffNotifier = notifier
+}
 // Success updates the server statistics with a new successful
@@ -95,8 +120,8 @@ func (s *ServerStatistics) cancel() {
 // we will unblacklist it.
 func (s *ServerStatistics) Success() {
 	s.cancel()
-	s.successCounter.Inc()
 	s.backoffCount.Store(0)
+	s.successCounter.Inc()
 	if s.statistics.DB != nil {
 		if err := s.statistics.DB.RemoveServerFromBlacklist(s.serverName); err != nil {
 			logrus.WithError(err).Errorf("Failed to remove %q from blacklist", s.serverName)
@@ -105,13 +130,17 @@ func (s *ServerStatistics) Success() {
 }
 // Failure marks a failure and starts backing off if needed.
-// The next call to BackoffIfRequired will do the right thing
-// after this. It will return the time that the current failure
+// It will return the time that the current failure
 // will result in backoff waiting until, and a bool signalling
 // whether we have blacklisted and therefore to give up.
 func (s *ServerStatistics) Failure() (time.Time, bool) {
+	// Return immediately if we have blacklisted this node.
+	if s.blacklisted.Load() {
+		return time.Time{}, true
+	}
 	// If we aren't already backing off, this call will start
-	// a new backoff period. Increase the failure counter and
+	// a new backoff period, increase the failure counter and
 	// start a goroutine which will wait out the backoff and
 	// unset the backoffStarted flag when done.
 	if s.backoffStarted.CompareAndSwap(false, true) {
@@ -122,40 +151,48 @@ func (s *ServerStatistics) Failure() (time.Time, bool) {
 					logrus.WithError(err).Errorf("Failed to add %q to blacklist", s.serverName)
 				}
 			}
+			s.ClearBackoff()
 			return time.Time{}, true
 		}
-		go func() {
-			until, ok := s.backoffUntil.Load().(time.Time)
-			if ok && !until.IsZero() {
-				select {
-				case <-time.After(time.Until(until)):
-				case <-s.interrupt:
-				}
-				s.backoffStarted.Store(false)
-			}
-		}()
+		// We're starting a new back off so work out what the next interval
+		// will be.
+		count := s.backoffCount.Load()
+		until := time.Now().Add(s.duration(count))
+		s.backoffUntil.Store(until)
+		s.statistics.backoffMutex.Lock()
+		defer s.statistics.backoffMutex.Unlock()
+		s.statistics.backoffTimers[s.serverName] = time.AfterFunc(time.Until(until), s.backoffFinished)
 	}
-	// Check if we have blacklisted this node.
-	if s.blacklisted.Load() {
-		return time.Now(), true
-	}
+	return s.backoffUntil.Load().(time.Time), false
+}
-	// If we're already backing off and we haven't yet surpassed
-	// the deadline then return that. Repeated calls to Failure
-	// within a single backoff interval will have no side effects.
-	if until, ok := s.backoffUntil.Load().(time.Time); ok && !time.Now().After(until) {
-		return until, false
+// ClearBackoff stops the backoff timer for this destination if it is running
+// and removes the timer from the backoffTimers map.
+func (s *ServerStatistics) ClearBackoff() {
+	// If the timer is still running then stop it so it's memory is cleaned up sooner.
+	s.statistics.backoffMutex.Lock()
+	defer s.statistics.backoffMutex.Unlock()
+	if timer, ok := s.statistics.backoffTimers[s.serverName]; ok {
+		timer.Stop()
 	}
+	delete(s.statistics.backoffTimers, s.serverName)
-	// We're either backing off and have passed the deadline, or
-	// we aren't backing off, so work out what the next interval
-	// will be.
-	count := s.backoffCount.Load()
-	until := time.Now().Add(s.duration(count))
-	s.backoffUntil.Store(until)
-	return until, false
+	s.backoffStarted.Store(false)
 }
+// backoffFinished will clear the previous backoff and notify the destination queue.
+func (s *ServerStatistics) backoffFinished() {
+	s.ClearBackoff()
+	// Notify the destinationQueue if one is currently running.
+	s.notifierMutex.Lock()
+	defer s.notifierMutex.Unlock()
+	if s.backoffNotifier != nil {
+		s.backoffNotifier()
+	}
+}
 // BackoffInfo returns information about the current or previous backoff.
@@ -174,6 +211,12 @@ func (s *ServerStatistics) Blacklisted() bool {
 	return s.blacklisted.Load()
 }
+// RemoveBlacklist removes the blacklisted status from the server.
+func (s *ServerStatistics) RemoveBlacklist() {
+	s.cancel()
+	s.backoffCount.Store(0)
+}
 // SuccessCount returns the number of successful requests. This is
 // usually useful in constructing transaction IDs.
 func (s *ServerStatistics) SuccessCount() uint32 {
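
Outside this file, the destination queue consumes these statistics when
deciding whether to retry a send. The loop below is only a hypothetical
caller sketch of the Failure()/Success() contract shown above (the
sendWithBackoff helper and the import path are assumptions, not code from
this commit); the real queue waits on a timer and notifier rather than
sleeping.

// Hypothetical caller sketch: illustrates the Failure()/Success() contract
// of ServerStatistics. Not taken from this commit.
package example

import (
	"time"

	"github.com/matrix-org/dendrite/federationapi/statistics" // assumed path
)

func sendWithBackoff(stats *statistics.ServerStatistics, send func() error) {
	for {
		if err := send(); err == nil {
			stats.Success() // clears backoff state and resets the failure count
			return
		}
		until, blacklisted := stats.Failure()
		if blacklisted {
			// FailuresUntilBlacklist consecutive failures reached: give up.
			return
		}
		// The real destination queue is woken by a timer/notifier instead of
		// sleeping, but waiting until the returned deadline shows the intent.
		time.Sleep(time.Until(until))
	}
}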


@@ -7,9 +7,7 @@ import (
 )
 func TestBackoff(t *testing.T) {
-	stats := Statistics{
-		FailuresUntilBlacklist: 7,
-	}
+	stats := NewStatistics(nil, 7)
 	server := ServerStatistics{
 		statistics: &stats,
 		serverName: "test.com",
@@ -36,7 +34,7 @@ func TestBackoff(t *testing.T) {
 		// Get the duration.
 		_, blacklist := server.BackoffInfo()
-		duration := time.Until(until).Round(time.Second)
+		duration := time.Until(until)
 		// Unset the backoff, or otherwise our next call will think that
 		// there's a backoff in progress and return the same result.
@@ -57,8 +55,17 @@ func TestBackoff(t *testing.T) {
 		// Check if the duration is what we expect.
 		t.Logf("Backoff %d is for %s", i, duration)
-		if wanted := time.Second * time.Duration(math.Exp2(float64(i))); !blacklist && duration != wanted {
-			t.Fatalf("Backoff %d should have been %s but was %s", i, wanted, duration)
+		roundingAllowance := 0.01
+		minDuration := time.Millisecond * time.Duration(math.Exp2(float64(i))*minJitterMultiplier*1000-roundingAllowance)
+		maxDuration := time.Millisecond * time.Duration(math.Exp2(float64(i))*maxJitterMultiplier*1000+roundingAllowance)
+		var inJitterRange bool
+		if duration >= minDuration && duration <= maxDuration {
+			inJitterRange = true
+		} else {
+			inJitterRange = false
+		}
+		if !blacklist && !inJitterRange {
+			t.Fatalf("Backoff %d should have been between %s and %s but was %s", i, minDuration, maxDuration, duration)
 		}
 	}
 }
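
The queue-side wiring for the new notifier lives in files not shown in this
excerpt. As a rough, hypothetical sketch (the fakeQueue type, channel name,
and import path are illustrative assumptions), a consumer could register a
callback so that backoffFinished wakes its worker instead of a parked
goroutine:

// Hypothetical wiring sketch for AssignBackoffNotifier; not code from this commit.
package example

import (
	"github.com/matrix-org/dendrite/federationapi/statistics" // assumed path
	"github.com/matrix-org/gomatrixserverlib"
)

type fakeQueue struct {
	notify chan struct{} // the worker selects on this to wake up
}

func (q *fakeQueue) wireBackoff(stats *statistics.Statistics, server gomatrixserverlib.ServerName) {
	serverStats := stats.ForServer(server)
	serverStats.AssignBackoffNotifier(func() {
		// Non-blocking send: if a wake-up is already pending, drop this one.
		select {
		case q.notify <- struct{}{}:
		default:
		}
	})
}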