mirror of
https://github.com/hoernschen/dendrite.git
synced 2025-07-29 12:42:46 +00:00
Handle errors differently in the DeviceListUpdater
(#2695)
`If a device list update goes missing, the server resyncs on the next one` was failing because a previous test would receive a `waitTime` of 1h, resulting in the test timing out. This now tries to handle the returned errors differently, e.g. by using the default `waitTime` of 2s. Also doesn't try further users in the list, if one of the errors would cause a longer `waitTime`.
This commit is contained in:
parent
847032df36
commit
440eb0f3a2
4 changed files with 44 additions and 19 deletions
|
@ -19,9 +19,11 @@ import (
|
|||
"encoding/json"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"net"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/matrix-org/gomatrix"
|
||||
"github.com/matrix-org/gomatrixserverlib"
|
||||
"github.com/matrix-org/util"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
@ -388,6 +390,8 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
|
|||
return waitTime, true
|
||||
}
|
||||
failCount := 0
|
||||
|
||||
userLoop:
|
||||
for _, userID := range userIDs {
|
||||
if ctx.Err() != nil {
|
||||
// we've timed out, give up and go to the back of the queue to let another server be processed.
|
||||
|
@ -397,19 +401,35 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
|
|||
res, err := u.fedClient.GetUserDevices(ctx, serverName, userID)
|
||||
if err != nil {
|
||||
failCount += 1
|
||||
fcerr, ok := err.(*fedsenderapi.FederationClientError)
|
||||
if ok {
|
||||
if fcerr.RetryAfter > 0 {
|
||||
waitTime = fcerr.RetryAfter
|
||||
} else if fcerr.Blacklisted {
|
||||
switch e := err.(type) {
|
||||
case *fedsenderapi.FederationClientError:
|
||||
if e.RetryAfter > 0 {
|
||||
waitTime = e.RetryAfter
|
||||
break userLoop
|
||||
} else if e.Blacklisted {
|
||||
waitTime = time.Hour * 8
|
||||
} else {
|
||||
// For all other errors (DNS resolution, network etc.) wait 1 hour.
|
||||
waitTime = time.Hour
|
||||
break userLoop
|
||||
}
|
||||
} else {
|
||||
waitTime = time.Hour
|
||||
logger.WithError(err).WithField("user_id", userID).Debug("GetUserDevices returned unknown error type")
|
||||
case net.Error:
|
||||
// Use the default waitTime, if it's a timeout.
|
||||
// It probably doesn't make sense to try further users.
|
||||
if !e.Timeout() {
|
||||
waitTime = time.Minute * 10
|
||||
logrus.WithError(e).Error("GetUserDevices returned net.Error")
|
||||
break userLoop
|
||||
}
|
||||
case gomatrix.HTTPError:
|
||||
// The remote server returned an error, give it some time to recover
|
||||
if e.Code >= 500 {
|
||||
waitTime = time.Minute * 10
|
||||
logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError")
|
||||
break userLoop
|
||||
}
|
||||
default:
|
||||
// Something else failed
|
||||
waitTime = time.Minute * 10
|
||||
logger.WithError(err).WithField("user_id", userID).Debugf("GetUserDevices returned unknown error type: %T", err)
|
||||
break userLoop
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
@ -437,7 +457,11 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
|
|||
}
|
||||
}
|
||||
if failCount > 0 {
|
||||
logger.WithField("total", len(userIDs)).WithField("failed", failCount).WithField("wait", waitTime).Warn("Failed to query device keys for some users")
|
||||
logger.WithFields(logrus.Fields{
|
||||
"total": len(userIDs),
|
||||
"failed": failCount,
|
||||
"skipped": len(userIDs) - failCount,
|
||||
}).Warn("Failed to query device keys for some users")
|
||||
}
|
||||
for _, userID := range userIDs {
|
||||
// always clear the channel to unblock Update calls regardless of success/failure
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue