bugfix: E2EE device keys could sometimes not be sent to remote servers (#2466)

* Fix flakey sytest 'Local device key changes get to remote servers'

* Debug logs

* Remove internal/test and use /test only

Remove a lot of ancient code too.

* Use FederationRoomserverAPI in more places

* Use more interfaces in federationapi; begin adding regression test

* Linting

* Add regression test

* Unbreak tests

* ALL THE LOGS

* Fix a race condition which could cause events to not be sent to servers

If a new room event which rewrites state arrives, we remove all joined hosts
then re-calculate them. This wasn't done in a transaction so for a brief period
we would have no joined hosts. During this interim, key change events which arrive
would not be sent to destination servers. This would sporadically fail on sytest.

* Unbreak new tests

* Linting
This commit is contained in:
kegsay 2022-05-17 13:23:35 +01:00 committed by GitHub
parent cd82460513
commit 6de29c1cd2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
48 changed files with 566 additions and 618 deletions

View file

@ -25,13 +25,12 @@ import (
type Database interface {
gomatrixserverlib.KeyDatabase
UpdateRoom(ctx context.Context, roomID, oldEventID, newEventID string, addHosts []types.JoinedHost, removeHosts []string) (joinedHosts []types.JoinedHost, err error)
UpdateRoom(ctx context.Context, roomID string, addHosts []types.JoinedHost, removeHosts []string, purgeRoomFirst bool) (joinedHosts []types.JoinedHost, err error)
GetJoinedHosts(ctx context.Context, roomID string) ([]types.JoinedHost, error)
GetAllJoinedHosts(ctx context.Context) ([]gomatrixserverlib.ServerName, error)
// GetJoinedHostsForRooms returns the complete set of servers in the rooms given.
GetJoinedHostsForRooms(ctx context.Context, roomIDs []string, excludeSelf bool) ([]gomatrixserverlib.ServerName, error)
PurgeRoomState(ctx context.Context, roomID string) error
StoreJSON(ctx context.Context, js string) (*shared.Receipt, error)

View file

@ -24,6 +24,7 @@ import (
"github.com/matrix-org/dendrite/internal"
"github.com/matrix-org/dendrite/internal/sqlutil"
"github.com/matrix-org/gomatrixserverlib"
"github.com/sirupsen/logrus"
)
const joinedHostsSchema = `
@ -111,6 +112,7 @@ func (s *joinedHostsStatements) InsertJoinedHosts(
roomID, eventID string,
serverName gomatrixserverlib.ServerName,
) error {
logrus.Debugf("FederationJoinedHosts: INSERT %v %v %v", roomID, eventID, serverName)
stmt := sqlutil.TxStmt(txn, s.insertJoinedHostsStmt)
_, err := stmt.ExecContext(ctx, roomID, eventID, serverName)
return err
@ -119,6 +121,7 @@ func (s *joinedHostsStatements) InsertJoinedHosts(
func (s *joinedHostsStatements) DeleteJoinedHosts(
ctx context.Context, txn *sql.Tx, eventIDs []string,
) error {
logrus.Debugf("FederationJoinedHosts: DELETE WITH EVENTS %v", eventIDs)
stmt := sqlutil.TxStmt(txn, s.deleteJoinedHostsStmt)
_, err := stmt.ExecContext(ctx, pq.StringArray(eventIDs))
return err
@ -127,6 +130,7 @@ func (s *joinedHostsStatements) DeleteJoinedHosts(
func (s *joinedHostsStatements) DeleteJoinedHostsForRoom(
ctx context.Context, txn *sql.Tx, roomID string,
) error {
logrus.Debugf("FederationJoinedHosts: DELETE ALL IN ROOM %v", roomID)
stmt := sqlutil.TxStmt(txn, s.deleteJoinedHostsForRoomStmt)
_, err := stmt.ExecContext(ctx, roomID)
return err
@ -207,6 +211,7 @@ func joinedHostsFromStmt(
ServerName: gomatrixserverlib.ServerName(serverName),
})
}
logrus.Debugf("FederationJoinedHosts: SELECT %v => %+v", roomID, result)
return result, rows.Err()
}

View file

@ -63,11 +63,21 @@ func (r *Receipt) String() string {
// this isn't a duplicate message.
func (d *Database) UpdateRoom(
ctx context.Context,
roomID, oldEventID, newEventID string,
roomID string,
addHosts []types.JoinedHost,
removeHosts []string,
purgeRoomFirst bool,
) (joinedHosts []types.JoinedHost, err error) {
err = d.Writer.Do(d.DB, nil, func(txn *sql.Tx) error {
if purgeRoomFirst {
// If the event is a create event then we'll delete all of the existing
// data for the room. The only reason that a create event would be replayed
// to us in this way is if we're about to receive the entire room state.
if err = d.FederationJoinedHosts.DeleteJoinedHostsForRoom(ctx, txn, roomID); err != nil {
return fmt.Errorf("d.FederationJoinedHosts.DeleteJoinedHosts: %w", err)
}
}
joinedHosts, err = d.FederationJoinedHosts.SelectJoinedHostsWithTx(ctx, txn, roomID)
if err != nil {
return err
@ -138,20 +148,6 @@ func (d *Database) StoreJSON(
}, nil
}
func (d *Database) PurgeRoomState(
ctx context.Context, roomID string,
) error {
return d.Writer.Do(d.DB, nil, func(txn *sql.Tx) error {
// If the event is a create event then we'll delete all of the existing
// data for the room. The only reason that a create event would be replayed
// to us in this way is if we're about to receive the entire room state.
if err := d.FederationJoinedHosts.DeleteJoinedHostsForRoom(ctx, txn, roomID); err != nil {
return fmt.Errorf("d.FederationJoinedHosts.DeleteJoinedHosts: %w", err)
}
return nil
})
}
func (d *Database) AddServerToBlacklist(serverName gomatrixserverlib.ServerName) error {
return d.Writer.Do(d.DB, nil, func(txn *sql.Tx) error {
return d.FederationBlacklist.InsertBlacklist(context.TODO(), txn, serverName)