From 3bf5ae5ffef0ebc140f55320658d9b07bc58e848 Mon Sep 17 00:00:00 2001 From: Neil Alexander Date: Wed, 3 Aug 2022 17:37:27 +0100 Subject: [PATCH] Try more servers when calling `/state_ids` (#2610) * Try more servers when calling `/state_ids` * More logging * Maybe fix concurrent map write * Revert "Maybe fix concurrent map write" This reverts commit da0dbb836207a911afe77e6f6d63c4809669693c. * Enforce a limit of 20s per server, 5 mins total --- roomserver/internal/input/input_missing.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/roomserver/internal/input/input_missing.go b/roomserver/internal/input/input_missing.go index c78e5d79..0dd2b64c 100644 --- a/roomserver/internal/input/input_missing.go +++ b/roomserver/internal/input/input_missing.go @@ -326,8 +326,10 @@ func (t *missingStateReq) lookupStateAfterEvent(ctx context.Context, roomVersion return respState, true, nil } + logrus.WithContext(ctx).Warnf("State for event %s not available locally, falling back to federation (via %d servers)", eventID, len(t.servers)) respState, err := t.lookupStateBeforeEvent(ctx, roomVersion, roomID, eventID) if err != nil { + logrus.WithContext(ctx).WithError(err).Errorf("Failed to look up state before event %s", eventID) return nil, false, fmt.Errorf("t.lookupStateBeforeEvent: %w", err) } @@ -339,6 +341,7 @@ func (t *missingStateReq) lookupStateAfterEvent(ctx context.Context, roomVersion case nil: // do nothing default: + logrus.WithContext(ctx).WithError(err).Errorf("Failed to look up event %s", eventID) return nil, false, fmt.Errorf("t.lookupEvent: %w", err) } h = t.cacheAndReturn(h) @@ -662,9 +665,22 @@ func (t *missingStateReq) lookupMissingStateViaStateIDs(ctx context.Context, roo util.GetLogger(ctx).WithField("room_id", roomID).Infof("lookupMissingStateViaStateIDs %s", eventID) // fetch the state event IDs at the time of the event - stateIDs, err := t.federation.LookupStateIDs(ctx, t.origin, roomID, eventID) + var stateIDs gomatrixserverlib.RespStateIDs + var err error + count := 0 + totalctx, totalcancel := context.WithTimeout(ctx, time.Minute*5) + for _, serverName := range t.servers { + reqctx, reqcancel := context.WithTimeout(totalctx, time.Second*20) + stateIDs, err = t.federation.LookupStateIDs(reqctx, serverName, roomID, eventID) + reqcancel() + if err == nil { + break + } + count++ + } + totalcancel() if err != nil { - return nil, err + return nil, fmt.Errorf("t.federation.LookupStateIDs tried %d server(s), last error: %w", count, err) } // work out which auth/state IDs are missing wantIDs := append(stateIDs.StateEventIDs, stateIDs.AuthEventIDs...)