From 5dae70069b8406f7e9131d6a64b269af0bc6b6b7 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 22 Apr 2024 08:55:05 +0100 Subject: [PATCH 01/37] Clean the syncv3_snapshots table periodically Also cleans the transaction table periodically. Fixes https://github.com/matrix-org/sliding-sync/issues/372 On testing, this cuts db size to about 1/3 of its original size. --- state/storage.go | 81 ++++++++++++++++++++ state/storage_test.go | 170 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) diff --git a/state/storage.go b/state/storage.go index bf838237..0d0faa74 100644 --- a/state/storage.go +++ b/state/storage.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "strings" + "time" "golang.org/x/exp/slices" @@ -69,6 +70,8 @@ type Storage struct { ReceiptTable *ReceiptTable DB *sqlx.DB MaxTimelineLimit int + shutdownCh chan struct{} + shutdown bool } func NewStorage(postgresURI string) *Storage { @@ -104,6 +107,7 @@ func NewStorageWithDB(db *sqlx.DB, addPrometheusMetrics bool) *Storage { ReceiptTable: NewReceiptTable(db), DB: db, MaxTimelineLimit: 50, + shutdownCh: make(chan struct{}), } } @@ -758,6 +762,50 @@ func (s *Storage) LatestEventsInRooms(userID string, roomIDs []string, to int64, return result, err } +// Remove state snapshots which cannot be accessed by clients. The latest MaxTimelineEvents +// snapshots must be kept, +1 for the current state. This handles the worst case where all +// MaxTimelineEvents are state events and hence each event makes a new snapshot. We can safely +// delete all snapshots older than this, as it's not possible to reach this snapshot as the proxy +// does not handle historical state (deferring to the homeserver for that). +func (s *Storage) RemoveInaccessibleStateSnapshots() error { + numToKeep := s.MaxTimelineLimit + 1 + // Create a CTE which ranks each snapshot so we can figure out which snapshots to delete + // then execute the delete using the CTE. + // + // A per-room version of this query: + // WITH ranked_snapshots AS ( + // SELECT + // snapshot_id, + // room_id, + // ROW_NUMBER() OVER (PARTITION BY room_id ORDER BY snapshot_id DESC) AS row_num + // FROM syncv3_snapshots + // ) + // DELETE FROM syncv3_snapshots WHERE snapshot_id IN( + // SELECT snapshot_id FROM ranked_snapshots WHERE row_num > 51 AND room_id='!....' + // ); + awfulQuery := fmt.Sprintf(`WITH ranked_snapshots AS ( + SELECT + snapshot_id, + room_id, + ROW_NUMBER() OVER (PARTITION BY room_id ORDER BY snapshot_id DESC) AS row_num + FROM + syncv3_snapshots + ) + DELETE FROM syncv3_snapshots USING ranked_snapshots + WHERE syncv3_snapshots.snapshot_id = ranked_snapshots.snapshot_id + AND ranked_snapshots.row_num > %d;`, numToKeep) + + result, err := s.DB.Exec(awfulQuery) + if err != nil { + return fmt.Errorf("failed to RemoveInaccessibleStateSnapshots: Exec %s", err) + } + rowsAffected, err := result.RowsAffected() + if err == nil { + logger.Info().Int64("rows_affected", rowsAffected).Msg("RemoveInaccessibleStateSnapshots: deleted rows") + } + return nil +} + func (s *Storage) GetClosestPrevBatch(roomID string, eventNID int64) (prevBatch string) { var err error sqlutil.WithTransaction(s.DB, func(txn *sqlx.Tx) error { @@ -1024,6 +1072,34 @@ func (s *Storage) AllJoinedMembers(txn *sqlx.Tx, tempTableName string) (joinedMe return joinedMembers, metadata, nil } +func (s *Storage) Cleaner(n time.Duration) { +Loop: + for { + select { + case <-time.After(n): + now := time.Now() + boundaryTime := now.Add(-1 * n) + if n < time.Hour { + boundaryTime = now.Add(-1 * time.Hour) + } + logger.Info().Time("boundaryTime", boundaryTime).Msg("Cleaner running") + err := s.TransactionsTable.Clean(boundaryTime) + if err != nil { + logger.Warn().Err(err).Msg("failed to clean txn ID table") + sentry.CaptureException(err) + } + // we also want to clean up stale state snapshots which are inaccessible, to + // keep the size of the syncv3_snapshots table low. + if err = s.RemoveInaccessibleStateSnapshots(); err != nil { + logger.Warn().Err(err).Msg("failed to remove inaccessible state snapshots") + sentry.CaptureException(err) + } + case <-s.shutdownCh: + break Loop + } + } +} + func (s *Storage) LatestEventNIDInRooms(roomIDs []string, highestNID int64) (roomToNID map[string]int64, err error) { roomToNID = make(map[string]int64) err = sqlutil.WithTransaction(s.Accumulator.db, func(txn *sqlx.Tx) error { @@ -1113,6 +1189,11 @@ func (s *Storage) determineJoinedRoomsFromMemberships(membershipEvents []Event) } func (s *Storage) Teardown() { + if !s.shutdown { + s.shutdown = true + close(s.shutdownCh) + } + err := s.Accumulator.db.Close() if err != nil { panic("Storage.Teardown: " + err.Error()) diff --git a/state/storage_test.go b/state/storage_test.go index f11d5492..4cef726f 100644 --- a/state/storage_test.go +++ b/state/storage_test.go @@ -5,11 +5,13 @@ import ( "context" "encoding/json" "fmt" + "math/rand" "reflect" "sort" "testing" "time" + "github.com/matrix-org/gomatrixserverlib" "github.com/matrix-org/sliding-sync/sync2" "github.com/jmoiron/sqlx" @@ -913,6 +915,174 @@ func TestStorage_FetchMemberships(t *testing.T) { assertValue(t, "joins", leaves, []string{"@chris:test", "@david:test", "@glory:test", "@helen:test"}) } +type persistOpts struct { + withInitialEvents bool + numTimelineEvents int + ofWhichNumState int +} + +func mustPersistEvents(t *testing.T, roomID string, store *Storage, opts persistOpts) { + t.Helper() + var events []json.RawMessage + if opts.withInitialEvents { + events = createInitialEvents(t, userID) + } + numAddedStateEvents := 0 + for i := 0; i < opts.numTimelineEvents; i++ { + var ev json.RawMessage + if numAddedStateEvents < opts.ofWhichNumState { + numAddedStateEvents++ + ev = testutils.NewStateEvent(t, "some_kind_of_state", fmt.Sprintf("%d", rand.Int63()), userID, map[string]interface{}{ + "num": numAddedStateEvents, + }) + } else { + ev = testutils.NewEvent(t, "some_kind_of_message", userID, map[string]interface{}{ + "msg": "yep", + }) + } + events = append(events, ev) + } + mustAccumulate(t, store, roomID, events) +} + +func mustAccumulate(t *testing.T, store *Storage, roomID string, events []json.RawMessage) { + t.Helper() + _, err := store.Accumulate(userID, roomID, sync2.TimelineResponse{ + Events: events, + }) + if err != nil { + t.Fatalf("Failed to accumulate: %s", err) + } +} + +func mustHaveNumSnapshots(t *testing.T, db *sqlx.DB, roomID string, numSnapshots int) { + t.Helper() + var val int + err := db.QueryRow(`SELECT count(*) FROM syncv3_snapshots WHERE room_id=$1`, roomID).Scan(&val) + if err != nil { + t.Fatalf("mustHaveNumSnapshots: %s", err) + } + if val != numSnapshots { + t.Fatalf("mustHaveNumSnapshots: got %d want %d snapshots", val, numSnapshots) + } +} + +func mustNotError(t *testing.T, err error) { + t.Helper() + if err == nil { + return + } + t.Fatalf("err: %s", err) +} + +func TestRemoveInaccessibleStateSnapshots(t *testing.T) { + store := NewStorage(postgresConnectionString) + store.MaxTimelineLimit = 50 // we nuke if we have >50+1 snapshots + + roomOnlyMessages := "!TestRemoveInaccessibleStateSnapshots_roomOnlyMessages:localhost" + mustPersistEvents(t, roomOnlyMessages, store, persistOpts{ + withInitialEvents: true, + numTimelineEvents: 100, + ofWhichNumState: 0, + }) + roomOnlyState := "!TestRemoveInaccessibleStateSnapshots_roomOnlyState:localhost" + mustPersistEvents(t, roomOnlyState, store, persistOpts{ + withInitialEvents: true, + numTimelineEvents: 100, + ofWhichNumState: 100, + }) + roomPartialStateAndMessages := "!TestRemoveInaccessibleStateSnapshots_roomPartialStateAndMessages:localhost" + mustPersistEvents(t, roomPartialStateAndMessages, store, persistOpts{ + withInitialEvents: true, + numTimelineEvents: 100, + ofWhichNumState: 30, + }) + roomOverwriteState := "TestRemoveInaccessibleStateSnapshots_roomOverwriteState:localhost" + mustPersistEvents(t, roomOverwriteState, store, persistOpts{ + withInitialEvents: true, + }) + mustAccumulate(t, store, roomOverwriteState, []json.RawMessage{testutils.NewStateEvent(t, "overwrite", "", userID, map[string]interface{}{"val": 1})}) + mustAccumulate(t, store, roomOverwriteState, []json.RawMessage{testutils.NewStateEvent(t, "overwrite", "", userID, map[string]interface{}{"val": 2})}) + mustHaveNumSnapshots(t, store.DB, roomOnlyMessages, 4) // initial state only, one for each state event + mustHaveNumSnapshots(t, store.DB, roomOnlyState, 104) // initial state + 100 state events + mustHaveNumSnapshots(t, store.DB, roomPartialStateAndMessages, 34) // initial state + 30 state events + mustHaveNumSnapshots(t, store.DB, roomOverwriteState, 6) // initial state + 2 overwrite state events + mustNotError(t, store.RemoveInaccessibleStateSnapshots()) + mustHaveNumSnapshots(t, store.DB, roomOnlyMessages, 4) // it should not be touched as 4 < 51 + mustHaveNumSnapshots(t, store.DB, roomOnlyState, 51) // it should be capped at 51 + mustHaveNumSnapshots(t, store.DB, roomPartialStateAndMessages, 34) // it should not be touched as 34 < 51 + mustHaveNumSnapshots(t, store.DB, roomOverwriteState, 6) // it should not be touched as 6 < 51 + // calling it again does nothing + mustNotError(t, store.RemoveInaccessibleStateSnapshots()) + mustHaveNumSnapshots(t, store.DB, roomOnlyMessages, 4) + mustHaveNumSnapshots(t, store.DB, roomOnlyState, 51) + mustHaveNumSnapshots(t, store.DB, roomPartialStateAndMessages, 34) + mustHaveNumSnapshots(t, store.DB, roomOverwriteState, 6) // it should not be touched as 6 < 51 + // adding one extra state snapshot to each room and repeating RemoveInaccessibleStateSnapshots + mustPersistEvents(t, roomOnlyMessages, store, persistOpts{numTimelineEvents: 1, ofWhichNumState: 1}) + mustPersistEvents(t, roomOnlyState, store, persistOpts{numTimelineEvents: 1, ofWhichNumState: 1}) + mustPersistEvents(t, roomPartialStateAndMessages, store, persistOpts{numTimelineEvents: 1, ofWhichNumState: 1}) + mustNotError(t, store.RemoveInaccessibleStateSnapshots()) + mustHaveNumSnapshots(t, store.DB, roomOnlyMessages, 5) + mustHaveNumSnapshots(t, store.DB, roomOnlyState, 51) // still capped + mustHaveNumSnapshots(t, store.DB, roomPartialStateAndMessages, 35) + // adding 51 timeline events and repeating RemoveInaccessibleStateSnapshots does nothing + mustPersistEvents(t, roomOnlyMessages, store, persistOpts{numTimelineEvents: 51}) + mustPersistEvents(t, roomOnlyState, store, persistOpts{numTimelineEvents: 51}) + mustPersistEvents(t, roomPartialStateAndMessages, store, persistOpts{numTimelineEvents: 51}) + mustNotError(t, store.RemoveInaccessibleStateSnapshots()) + mustHaveNumSnapshots(t, store.DB, roomOnlyMessages, 5) + mustHaveNumSnapshots(t, store.DB, roomOnlyState, 51) + mustHaveNumSnapshots(t, store.DB, roomPartialStateAndMessages, 35) + + // overwrite 52 times and check the current state is 52 (shows we are keeping the right snapshots) + for i := 0; i < 52; i++ { + mustAccumulate(t, store, roomOverwriteState, []json.RawMessage{testutils.NewStateEvent(t, "overwrite", "", userID, map[string]interface{}{"val": 1 + i})}) + } + mustHaveNumSnapshots(t, store.DB, roomOverwriteState, 58) + mustNotError(t, store.RemoveInaccessibleStateSnapshots()) + mustHaveNumSnapshots(t, store.DB, roomOverwriteState, 51) + roomsTable := NewRoomsTable(store.DB) + mustNotError(t, sqlutil.WithTransaction(store.DB, func(txn *sqlx.Tx) error { + snapID, err := roomsTable.CurrentAfterSnapshotID(txn, roomOverwriteState) + if err != nil { + return err + } + state, err := store.StateSnapshot(snapID) + if err != nil { + return err + } + // find the 'overwrite' event and make sure the val is 52 + for _, ev := range state { + evv := gjson.ParseBytes(ev) + if evv.Get("type").Str != "overwrite" { + continue + } + if evv.Get("content.val").Int() != 52 { + return fmt.Errorf("val for overwrite state event was not 52: %v", evv.Raw) + } + } + return nil + })) +} + +func createInitialEvents(t *testing.T, creator string) []json.RawMessage { + t.Helper() + baseTimestamp := time.Now() + var pl gomatrixserverlib.PowerLevelContent + pl.Defaults() + pl.Users = map[string]int64{ + creator: 100, + } + // all with the same timestamp as they get made atomically + return []json.RawMessage{ + testutils.NewStateEvent(t, "m.room.create", "", creator, map[string]interface{}{"creator": creator}, testutils.WithTimestamp(baseTimestamp)), + testutils.NewJoinEvent(t, creator, testutils.WithTimestamp(baseTimestamp)), + testutils.NewStateEvent(t, "m.room.power_levels", "", creator, pl, testutils.WithTimestamp(baseTimestamp)), + testutils.NewStateEvent(t, "m.room.join_rules", "", creator, map[string]interface{}{"join_rule": "public"}, testutils.WithTimestamp(baseTimestamp)), + } +} + func cleanDB(t *testing.T) error { // make a fresh DB which is unpolluted from other tests db, close := connectToDB(t) From a1b5e48529aa08658fdd0d152875bbccc79bfdde Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 22 Apr 2024 08:59:23 +0100 Subject: [PATCH 02/37] Run the cleaner on startup --- cmd/syncv3/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 2b51bd11..f24e8c60 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -220,6 +220,7 @@ func main() { }) go h2.StartV2Pollers() + go h2.Store.Cleaner(time.Hour) if args[EnvOTLP] != "" { h3 = otelhttp.NewHandler(h3, "Sync") } From c4f2617a2d68454d964f61f271a158519dd6c0c2 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:20:07 +0100 Subject: [PATCH 03/37] Remove MSC4115 unsigned.membership field With integration tests. This is required because this field is scoped per-v2-user, but because the proxy deduplicates events it means we might see the membership value for a different user. We can't set this field correctly as we lack that information, so rather than lie to clients instead just delete the field. Fixes https://github.com/matrix-org/sliding-sync/issues/421 --- sync2/handler2/handler.go | 10 +++- tests-integration/lists_test.go | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index f336f30a..792437b8 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -21,6 +21,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" "github.com/tidwall/gjson" + "github.com/tidwall/sjson" ) var logger = zerolog.New(os.Stdout).With().Timestamp().Logger().Output(zerolog.ConsoleWriter{ @@ -270,8 +271,10 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID strin // Also remember events which were sent by this user but lack a transaction ID. eventIDsLackingTxns := make([]string, 0, len(timeline.Events)) - for _, e := range timeline.Events { - parsed := gjson.ParseBytes(e) + for i := range timeline.Events { + // Delete MSC4115 field as it isn't accurate when we reuse the same event for >1 user + timeline.Events[i], _ = sjson.DeleteBytes(timeline.Events[i], "unsigned.membership") + parsed := gjson.ParseBytes(timeline.Events[i]) eventID := parsed.Get("event_id").Str if txnID := parsed.Get("unsigned.transaction_id"); txnID.Exists() { @@ -371,6 +374,9 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID strin } func (h *Handler) Initialise(ctx context.Context, roomID string, state []json.RawMessage) error { + for i := range state { // Delete MSC4115 field as it isn't accurate when we reuse the same event for >1 user + state[i], _ = sjson.DeleteBytes(state[i], "unsigned.membership") + } res, err := h.Store.Initialise(roomID, state) if err != nil { logger.Err(err).Int("state", len(state)).Str("room", roomID).Msg("V2: failed to initialise room") diff --git a/tests-integration/lists_test.go b/tests-integration/lists_test.go index 85c6991a..32562bc1 100644 --- a/tests-integration/lists_test.go +++ b/tests-integration/lists_test.go @@ -9,6 +9,7 @@ import ( "github.com/matrix-org/sliding-sync/sync3" "github.com/matrix-org/sliding-sync/testutils" "github.com/matrix-org/sliding-sync/testutils/m" + "github.com/tidwall/sjson" ) func TestListsAsKeys(t *testing.T) { @@ -361,3 +362,89 @@ func TestBumpEventTypesOnStartup(t *testing.T) { )) } } + +func TestDeleteMSC4115Field(t *testing.T) { + rig := NewTestRig(t) + defer rig.Finish() + roomID := "!TestDeleteMSC4115Field:localhost" + rig.SetupV2RoomsForUser(t, alice, NoFlush, map[string]RoomDescriptor{ + roomID: {}, + }) + aliceToken := rig.Token(alice) + res := rig.V3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, 20}}, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 10, + RequiredState: [][2]string{{"m.room.name", "*"}}, + }, + }, + }, + }) + m.MatchResponse(t, res, m.MatchLists(map[string][]m.ListMatcher{ + "a": { + m.MatchV3Count(1), + }, + }), m.MatchRoomSubscription(roomID)) + + // ensure live events remove the field. + liveEvent := testutils.NewMessageEvent(t, alice, "live event", testutils.WithUnsigned(map[string]interface{}{ + "membership": "join", + })) + liveEventWithoutMembership := make(json.RawMessage, len(liveEvent)) + copy(liveEventWithoutMembership, liveEvent) + liveEventWithoutMembership, err := sjson.DeleteBytes(liveEventWithoutMembership, "unsigned.membership") + if err != nil { + t.Fatalf("failed to delete unsigned.membership field") + } + rig.FlushEvent(t, alice, roomID, liveEvent) + + res = rig.V3.mustDoV3RequestWithPos(t, aliceToken, res.Pos, sync3.Request{}) + m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchRoomTimelineMostRecent(1, []json.RawMessage{liveEventWithoutMembership}), + }, + })) + + // ensure state events remove the field. + stateEvent := testutils.NewStateEvent(t, "m.room.name", "", alice, map[string]interface{}{ + "name": "Room Name", + }, testutils.WithUnsigned(map[string]interface{}{ + "membership": "join", + })) + stateEventWithoutMembership := make(json.RawMessage, len(stateEvent)) + copy(stateEventWithoutMembership, stateEvent) + stateEventWithoutMembership, err = sjson.DeleteBytes(stateEventWithoutMembership, "unsigned.membership") + if err != nil { + t.Fatalf("failed to delete unsigned.membership field") + } + rig.V2.queueResponse(alice, sync2.SyncResponse{ + Rooms: sync2.SyncRoomsResponse{ + Join: v2JoinTimeline(roomEvents{ + roomID: roomID, + state: []json.RawMessage{stateEvent}, + events: []json.RawMessage{testutils.NewMessageEvent(t, alice, "dummy")}, + }), + }, + }) + rig.V2.waitUntilEmpty(t, alice) + + // sending v2 state invalidates the SS connection so start again pre-emptively. + res = rig.V3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{ + "a": { + Ranges: sync3.SliceRanges{{0, 20}}, + RoomSubscription: sync3.RoomSubscription{ + TimelineLimit: 10, + RequiredState: [][2]string{{"m.room.name", "*"}}, + }, + }, + }, + }) + m.MatchResponse(t, res, m.MatchRoomSubscriptionsStrict(map[string][]m.RoomMatcher{ + roomID: { + m.MatchRoomRequiredState([]json.RawMessage{stateEventWithoutMembership}), + }, + })) +} From 0af85096aaad7e876786cb4e77ec619e54dfea0b Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 25 Apr 2024 15:15:42 +0100 Subject: [PATCH 04/37] Check for unstable prefix in MSC4115 --- sync2/handler2/handler.go | 2 ++ tests-integration/lists_test.go | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 792437b8..dd03ab4e 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -274,6 +274,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID strin for i := range timeline.Events { // Delete MSC4115 field as it isn't accurate when we reuse the same event for >1 user timeline.Events[i], _ = sjson.DeleteBytes(timeline.Events[i], "unsigned.membership") + timeline.Events[i], _ = sjson.DeleteBytes(timeline.Events[i], `unsigned.io\.element\.msc4115\.membership`) parsed := gjson.ParseBytes(timeline.Events[i]) eventID := parsed.Get("event_id").Str @@ -376,6 +377,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID strin func (h *Handler) Initialise(ctx context.Context, roomID string, state []json.RawMessage) error { for i := range state { // Delete MSC4115 field as it isn't accurate when we reuse the same event for >1 user state[i], _ = sjson.DeleteBytes(state[i], "unsigned.membership") + state[i], _ = sjson.DeleteBytes(state[i], `unsigned.io\.element\.msc4115\.membership`) } res, err := h.Store.Initialise(roomID, state) if err != nil { diff --git a/tests-integration/lists_test.go b/tests-integration/lists_test.go index 32562bc1..c0ceae94 100644 --- a/tests-integration/lists_test.go +++ b/tests-integration/lists_test.go @@ -2,6 +2,7 @@ package syncv3 import ( "encoding/json" + "strings" "testing" "time" @@ -364,6 +365,15 @@ func TestBumpEventTypesOnStartup(t *testing.T) { } func TestDeleteMSC4115Field(t *testing.T) { + t.Run("stable prefix", func(t *testing.T) { + testDeleteMSC4115Field(t, "membership") + }) + t.Run("unstable prefix", func(t *testing.T) { + testDeleteMSC4115Field(t, "io.element.msc4115.membership") + }) +} + +func testDeleteMSC4115Field(t *testing.T, fieldName string) { rig := NewTestRig(t) defer rig.Finish() roomID := "!TestDeleteMSC4115Field:localhost" @@ -390,11 +400,11 @@ func TestDeleteMSC4115Field(t *testing.T) { // ensure live events remove the field. liveEvent := testutils.NewMessageEvent(t, alice, "live event", testutils.WithUnsigned(map[string]interface{}{ - "membership": "join", + fieldName: "join", })) liveEventWithoutMembership := make(json.RawMessage, len(liveEvent)) copy(liveEventWithoutMembership, liveEvent) - liveEventWithoutMembership, err := sjson.DeleteBytes(liveEventWithoutMembership, "unsigned.membership") + liveEventWithoutMembership, err := sjson.DeleteBytes(liveEventWithoutMembership, "unsigned."+strings.ReplaceAll(fieldName, ".", `\.`)) if err != nil { t.Fatalf("failed to delete unsigned.membership field") } @@ -411,11 +421,11 @@ func TestDeleteMSC4115Field(t *testing.T) { stateEvent := testutils.NewStateEvent(t, "m.room.name", "", alice, map[string]interface{}{ "name": "Room Name", }, testutils.WithUnsigned(map[string]interface{}{ - "membership": "join", + fieldName: "join", })) stateEventWithoutMembership := make(json.RawMessage, len(stateEvent)) copy(stateEventWithoutMembership, stateEvent) - stateEventWithoutMembership, err = sjson.DeleteBytes(stateEventWithoutMembership, "unsigned.membership") + stateEventWithoutMembership, err = sjson.DeleteBytes(stateEventWithoutMembership, "unsigned."+strings.ReplaceAll(fieldName, ".", `\.`)) if err != nil { t.Fatalf("failed to delete unsigned.membership field") } From 9620a624d793317dc640e4b6f43f41f0e7f88867 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 26 Apr 2024 12:16:39 +0100 Subject: [PATCH 05/37] Comment on escaped dots --- sync2/handler2/handler.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index dd03ab4e..40512489 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -274,6 +274,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID strin for i := range timeline.Events { // Delete MSC4115 field as it isn't accurate when we reuse the same event for >1 user timeline.Events[i], _ = sjson.DeleteBytes(timeline.Events[i], "unsigned.membership") + // escape .'s in the key name timeline.Events[i], _ = sjson.DeleteBytes(timeline.Events[i], `unsigned.io\.element\.msc4115\.membership`) parsed := gjson.ParseBytes(timeline.Events[i]) eventID := parsed.Get("event_id").Str @@ -377,6 +378,7 @@ func (h *Handler) Accumulate(ctx context.Context, userID, deviceID, roomID strin func (h *Handler) Initialise(ctx context.Context, roomID string, state []json.RawMessage) error { for i := range state { // Delete MSC4115 field as it isn't accurate when we reuse the same event for >1 user state[i], _ = sjson.DeleteBytes(state[i], "unsigned.membership") + // escape .'s in the key name state[i], _ = sjson.DeleteBytes(state[i], `unsigned.io\.element\.msc4115\.membership`) } res, err := h.Store.Initialise(roomID, state) From 66fd58bc4965d7f2549910174135cdbfbd0fa246 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:42:42 +0100 Subject: [PATCH 06/37] v0.99.16 --- cmd/syncv3/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index f24e8c60..6de80f7e 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -28,7 +28,7 @@ import ( var GitCommit string -const version = "0.99.15" +const version = "0.99.16" var ( flags = flag.NewFlagSet("goose", flag.ExitOnError) From 3e0162d70b6abe4574db3e302e8822c943cfb6eb Mon Sep 17 00:00:00 2001 From: Christoph Settgast Date: Fri, 26 Apr 2024 17:36:50 +0200 Subject: [PATCH 07/37] README: document prebuild binaries Now that there is a release which is statically built, advertise it in the README as well --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d4c9886..706a1b39 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ location /.well-known/matrix/client { ``` ### Running -There are two ways to run the proxy: +There are three ways to run the proxy: - Compiling from source: ``` $ CGO_ENABLED=0 go build ./cmd/syncv3 @@ -95,6 +95,12 @@ $ SYNCV3_SECRET=$(cat .secret) SYNCV3_SERVER="https://matrix-client.matrix.org" docker run --rm -e "SYNCV3_SERVER=https://matrix-client.matrix.org" -e "SYNCV3_SECRET=$(cat .secret)" -e "SYNCV3_BINDADDR=:8008" -e "SYNCV3_DB=user=$(whoami) dbname=syncv3 sslmode=disable host=host.docker.internal password='DATABASE_PASSWORD_HERE'" -p 8008:8008 ghcr.io/matrix-org/sliding-sync:latest ``` +- Precompiled binaries: + +Download the binary for your architcture (eg. `syncv3_linux_amd64` for 64-bit AMD/Intel) from https://github.com/matrix-org/sliding-sync/releases/latest +``` +$ SYNCV3_SECRET=$(cat .secret) SYNCV3_SERVER="https://matrix-client.matrix.org" SYNCV3_DB="user=$(whoami) dbname=syncv3 sslmode=disable password='DATABASE_PASSWORD_HERE'" SYNCV3_BINDADDR=0.0.0.0:8008 ./syncv3_linux_amd64 +``` Optionally also set `SYNCV3_TLS_CERT=path/to/cert.pem` and `SYNCV3_TLS_KEY=path/to/key.pem` to listen on HTTPS instead of HTTP. Make sure to tweak the `SYNCV3_DB` environment variable if the Postgres database isn't running on the host. From 718513a1cb0fa503d185567fe02249bbb6d2ddb7 Mon Sep 17 00:00:00 2001 From: Will Lewis <1543626+wrjlewis@users.noreply.github.com> Date: Wed, 1 May 2024 16:13:13 +0100 Subject: [PATCH 08/37] Add grafana chart for SS proxy --- grafana/README.md | 18 + grafana/sliding-sync.json | 2120 +++++++++++++++++++++++++++++++++++++ 2 files changed, 2138 insertions(+) create mode 100644 grafana/README.md create mode 100644 grafana/sliding-sync.json diff --git a/grafana/README.md b/grafana/README.md new file mode 100644 index 00000000..3981001a --- /dev/null +++ b/grafana/README.md @@ -0,0 +1,18 @@ +# Using the Sliding Sync Grafana dashboard + +**Set up Prometheus and Grafana.** Out of scope for this readme. Useful documentation about using Grafana with Prometheus: http://docs.grafana.org/features/datasources/prometheus/ + +**Configure the bind address** for prometheus metrics in sliding sync. For example: `SYNCV3_PROM=2112`. Metrics will be accessible at /metrics at this address. + +**Configure a new job in Prometheus** to scrape the sliding sync endpoint. + +For example by adding the following job to `prometheus.yml`, if your sliding sync is running locally and you have `SYNCV3_PROM` configured to port 2112: + +``` +# Sliding Sync + - job_name: "Sliding Sync" + static_configs: + - targets: ["localhost:2112"] +``` + +**Import the sliding sync dashboard into Grafana.** Download `sliding-sync.json`. Import it to Grafana and select the correct Prometheus datasource. http://docs.grafana.org/reference/export_import/ \ No newline at end of file diff --git a/grafana/sliding-sync.json b/grafana/sliding-sync.json new file mode 100644 index 00000000..639f9144 --- /dev/null +++ b/grafana/sliding-sync.json @@ -0,0 +1,2120 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.4.1" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 14, + "panels": [], + "title": "Sliding Sync API", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Actively syncing clients i.e the connection hasn't expired yet. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "conns" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Expired Connections (full buffer)" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 0, + 10 + ], + "fill": "dot" + } + }, + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Expired Connections (timed out)" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineWidth", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum(sliding_sync_api_num_active_conns)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Active Connections", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "increase(sliding_sync_api_expiry_conn_buffer_full[5m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "Expired Connections (full buffer)", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "increase(sliding_sync_api_expiry_conn_timed_out[5m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "Expired Connections (timed out)", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "# active sliding sync connections", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "sliding-sync-api-(.*)", + "renamePattern": "$1" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The number of devices blocked on an initial v2 sync. This number should never remain >0 for more than 15 minutes, unless there is a flood of new users to the system.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "conns" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 166, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sliding_sync_api_num_devices_pending_ensure_polling", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "EnsurePolling calls outstanding", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "sliding-sync-api-(.*)", + "renamePattern": "$1" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Excludes time waiting for live updates. Excludes initial requests.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "99%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 35 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "95%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "50%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "25%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "75%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Request rate" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "hertz" + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "90%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-orange", + "mode": "fixed" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Request rate" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 152, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "99%", + "range": true, + "refId": "99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "90%", + "range": true, + "refId": "90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "75%", + "range": true, + "refId": "75" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "50%", + "range": true, + "refId": "50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.25, sum by(le) (rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "25%", + "range": true, + "refId": "25" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(sliding_sync_api_process_duration_secs_count{initial=\"0\"}[$window_size]))", + "hide": false, + "instant": false, + "legendFormat": "Request rate", + "range": true, + "refId": "A" + } + ], + "title": "API processing time quantiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 157, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "builder", + "expr": "sum(rate(sliding_sync_api_setup_duration_secs_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request setup times", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Requests take more that 50s. This is our best proxy for \"how many people's requests are wedged\". Should be a flat 0.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 159, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(increase(sliding_sync_api_slow_requests[5m]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Slow requests", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "If this goes high, it may indicate poor DB performance when querying.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 10, + "legend": { + "show": true + }, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "dtdurations" + } + }, + "pluginVersion": "10.4.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(increase(sliding_sync_api_process_duration_secs_bucket{initial=\"1\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Time taken to process initial sliding sync requests", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "dtdurations", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Excludes live streaming blocks. If this goes high, it may indicate poor DB performance when querying.", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 15, + "legend": { + "show": true + }, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 3 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "dtdurations" + } + }, + "pluginVersion": "10.4.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(sliding_sync_api_process_duration_secs_bucket{initial=\"0\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Time taken to process changes (ranges/filter/sorting/etc) in sliding sync requests", + "tooltip": { + "show": true, + "showHistogram": false + }, + "tooltipDecimals": 3, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketSize": "", + "yAxis": { + "format": "dtdurations", + "logBase": 1, + "min": "0", + "show": true + }, + "yBucketBound": "auto" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 12, + "panels": [], + "title": "V2 Poller", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Number of /sync connections to upstream homeserver. Generally always goes up, unless users log out and invalidate the access_token being used.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pollers", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sliding_sync_poller_num_pollers{}", + "instant": false, + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "# v2 pollers", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The higher this is, the bigger the latency from sending events -> receiving events. Excludes initial /sync requests", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 8, + "legend": { + "show": false + }, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "dtdurations" + } + }, + "pluginVersion": "10.3.3", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(increase(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Time taken to process sync v2 responses", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "dtdurations", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "If this drops too low, this means pollers are blocked on something (DB conns, executor, etc)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + }, + { + "color": "green", + "value": 0.75 + } + ] + }, + "unit": "percentunit", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 165, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sliding_sync_poller_num_outstanding_sync_v2_reqs{}/sliding_sync_poller_num_pollers{}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Ratio of pollers waiting for v2 response", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "If this number drops to 0, this indicates something is blocking all pollers from doing work. This number will scale with the number of pollers on the process.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 167, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "rate(sliding_sync_poller_total_num_polls{}[5m])", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of poll loop iterations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "NB: Proxy requests a timeline limit of 50.\n\nTODO: make it clear how many syncs were limited", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 16, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "v2 syncs" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "timeline events", + "axisPlacement": "left", + "max": "50", + "reverse": false + } + }, + "pluginVersion": "10.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(sliding_sync_poller_timeline_size_bucket{limited=\"unlimited\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{limited}}", + "range": true, + "refId": "A" + } + ], + "title": "Timeline size of unlimited pollers", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Updates from v2 pollers sent to pubsub. Abnormal spikes could be from spam or lack of duplicate suppression in the proxy.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 50 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum by (payload_type) (rate(sliding_sync_poller_num_payloads{}[$window_size]))", + "legendFormat": "{{payload_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Payload Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Excludes time waiting for v2 sync requests. Excludes initial polls.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s", + "unitScale": true + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "99%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 35 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "95%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "50%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "25%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "75%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Poll rate" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "hertz" + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.showPoints", + "value": "always" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "90%" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "super-light-orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 153, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "99%", + "range": true, + "refId": "99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "90%", + "range": true, + "refId": "90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "75%", + "range": true, + "refId": "75" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "50%", + "range": true, + "refId": "50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.25, sum by(le) (rate(sliding_sync_poller_process_duration_secs_bucket{initial=\"0\"}[$window_size])))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "25%", + "range": true, + "refId": "25" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(sliding_sync_poller_process_duration_secs_count{initial=\"0\"}[$window_size]))", + "hide": false, + "legendFormat": "Poll rate", + "range": true, + "refId": "A" + } + ], + "title": "Poller processing time quantiles", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 100, + "auto_min": "30s", + "current": { + "selected": true, + "text": "auto", + "value": "$__auto_interval_window_size" + }, + "description": "Window to use for aggregating buckets/moving averages", + "hide": 0, + "label": "Window size", + "name": "window_size", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_window_size" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "2h", + "value": "2h" + } + ], + "query": "30s,1m,2m,5m,10m,15m,30m,1h,2h", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "c433c715-0878-4d85-877c-465ce5b8cac4" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Sliding Sync", + "uid": "slidingsync", + "version": 27, + "weekStart": "" + } \ No newline at end of file From 5816c60e33410d47523dd0e9c02a8b80b5d01a10 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Wed, 8 May 2024 14:26:24 +0100 Subject: [PATCH 09/37] Add SELECT .. FOR UPDATE clauses to DeviceDataTable As we are selecting... for updates. Without this, we can drop updates to the floor incorrectly. See https://github.com/matrix-org/sliding-sync/issues/430 --- state/device_data_table.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/state/device_data_table.go b/state/device_data_table.go index cd9fc34b..ceeef9a5 100644 --- a/state/device_data_table.go +++ b/state/device_data_table.go @@ -46,7 +46,7 @@ func NewDeviceDataTable(db *sqlx.DB) *DeviceDataTable { func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *internal.DeviceData, err error) { err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { var row DeviceDataRow - err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, userID, deviceID) + err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, userID, deviceID) if err != nil { if err == sql.ErrNoRows { // if there is no device data for this user, it's not an error. @@ -104,7 +104,7 @@ func (t *DeviceDataTable) Upsert(dd *internal.DeviceData) (err error) { err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { // select what already exists var row DeviceDataRow - err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, dd.UserID, dd.DeviceID) + err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, dd.UserID, dd.DeviceID) if err != nil && err != sql.ErrNoRows { return err } From c5b0afb2bebe5b2743a5b54dbcd406608ae78243 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 9 May 2024 12:36:18 +0100 Subject: [PATCH 10/37] bugfix: ensure we send back device list updates at the correct time Otherwise we can cause lost device list updates if the client resets the connection (as the device list update won't be in any in-memory cache). --- state/device_data_table.go | 3 ++ tests-integration/extensions_test.go | 42 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/state/device_data_table.go b/state/device_data_table.go index ceeef9a5..1ed3e870 100644 --- a/state/device_data_table.go +++ b/state/device_data_table.go @@ -70,6 +70,9 @@ func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *in if !swap { return nil // don't swap } + // the caller will only look at sent, so make sure what is new is now in sent + result.DeviceLists.Sent = result.DeviceLists.New + // swap over the fields writeBack := *result writeBack.DeviceLists.Sent = result.DeviceLists.New diff --git a/tests-integration/extensions_test.go b/tests-integration/extensions_test.go index 9d30d978..b2f271f7 100644 --- a/tests-integration/extensions_test.go +++ b/tests-integration/extensions_test.go @@ -193,6 +193,48 @@ func TestExtensionE2EE(t *testing.T) { if time.Since(start) >= (500 * time.Millisecond) { t.Fatalf("sync request did not return immediately with OTK counts") } + + // check that if we lose a device list update and restart from nothing, we see the same update + v2.queueResponse(alice, sync2.SyncResponse{ + DeviceLists: struct { + Changed []string `json:"changed,omitempty"` + Left []string `json:"left,omitempty"` + }{ + Changed: wantChanged, + Left: wantLeft, + }, + }) + v2.waitUntilEmpty(t, alice) + res = v3.mustDoV3RequestWithPos(t, aliceToken, res.Pos, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 10}, // doesn't matter + }, + }}, + // enable the E2EE extension + Extensions: extensions.Request{ + E2EE: &extensions.E2EERequest{ + Core: extensions.Core{Enabled: &boolTrue}, + }, + }, + }) + m.MatchResponse(t, res, m.MatchDeviceLists(wantChanged, wantLeft)) + // we actually lost this update: start again and we should see it. + res = v3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 10}, // doesn't matter + }, + }}, + // enable the E2EE extension + Extensions: extensions.Request{ + E2EE: &extensions.E2EERequest{ + Core: extensions.Core{Enabled: &boolTrue}, + }, + }, + }) + m.MatchResponse(t, res, m.MatchDeviceLists(wantChanged, wantLeft)) + } // Checks that to-device messages are passed from v2 to v3 From f564f2d774a5cec52203b4ee9e6151138885e120 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 9 May 2024 15:32:21 +0100 Subject: [PATCH 11/37] Redo the device data unit tests They were A) confusing and B) testing the wrong thing as they got refactored after the bad refactor of the source code.. --- state/device_data_table_test.go | 220 +++++++++++++++++++++----------- 1 file changed, 147 insertions(+), 73 deletions(-) diff --git a/state/device_data_table_test.go b/state/device_data_table_test.go index 00c53cd0..d099eda0 100644 --- a/state/device_data_table_test.go +++ b/state/device_data_table_test.go @@ -22,17 +22,20 @@ func assertDeviceData(t *testing.T, g, w internal.DeviceData) { assertVal(t, "FallbackKeyTypes", g.FallbackKeyTypes, w.FallbackKeyTypes) assertVal(t, "OTKCounts", g.OTKCounts, w.OTKCounts) assertVal(t, "ChangedBits", g.ChangedBits, w.ChangedBits) - assertVal(t, "DeviceLists", g.DeviceLists, w.DeviceLists) + if w.DeviceLists.Sent != nil { + assertVal(t, "DeviceLists.Sent", g.DeviceLists.Sent, w.DeviceLists.Sent) + } } -func TestDeviceDataTableSwaps(t *testing.T) { +// Tests OTKCounts and FallbackKeyTypes behaviour +func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { db, close := connectToDB(t) defer close() table := NewDeviceDataTable(db) - userID := "@bob" + userID := "@TestDeviceDataTableOTKCountAndFallbackKeyTypes" deviceID := "BOB" - // test accumulating deltas + // these are individual updates from Synapse from /sync v2 deltas := []internal.DeviceData{ { UserID: userID, @@ -46,9 +49,6 @@ func TestDeviceDataTableSwaps(t *testing.T) { UserID: userID, DeviceID: deviceID, FallbackKeyTypes: []string{"foobar"}, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"alice"}, nil), - }, }, { UserID: userID, @@ -60,16 +60,38 @@ func TestDeviceDataTableSwaps(t *testing.T) { { UserID: userID, DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"💣"}, nil), - }, }, } + + // apply them for _, dd := range deltas { err := table.Upsert(&dd) assertNoError(t, err) } + // read them without swap, it should have replaced them correctly. + // Because sync v2 sends the complete OTK count and complete fallback key types + // every time, we always use the latest values. Because we aren't swapping, repeated + // reads produce the same result. + for i := 0; i < 3; i++ { + got, err := table.Select(userID, deviceID, false) + mustNotError(t, err) + want := internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + OTKCounts: map[string]int{ + "foo": 99, + }, + FallbackKeyTypes: []string{"foobar"}, + } + want.SetFallbackKeysChanged() + want.SetOTKCountChanged() + assertDeviceData(t, *got, want) + } + // now we swap the data. This still returns the same values, but the changed bits are no longer set + // on subsequent reads. + got, err := table.Select(userID, deviceID, true) + mustNotError(t, err) want := internal.DeviceData{ UserID: userID, DeviceID: deviceID, @@ -77,68 +99,118 @@ func TestDeviceDataTableSwaps(t *testing.T) { "foo": 99, }, FallbackKeyTypes: []string{"foobar"}, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - Sent: map[string]int{}, - }, } want.SetFallbackKeysChanged() want.SetOTKCountChanged() - // check we can read-only select + assertDeviceData(t, *got, want) + + // subsequent read + got, err = table.Select(userID, deviceID, false) + mustNotError(t, err) + want = internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + OTKCounts: map[string]int{ + "foo": 99, + }, + FallbackKeyTypes: []string{"foobar"}, + } + assertDeviceData(t, *got, want) +} + +// Tests the DeviceLists field +func TestDeviceDataTableDeviceList(t *testing.T) { + db, close := connectToDB(t) + defer close() + table := NewDeviceDataTable(db) + userID := "@TestDeviceDataTableDeviceList" + deviceID := "BOB" + + // these are individual updates from Synapse from /sync v2 + deltas := []internal.DeviceData{ + { + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + New: internal.ToDeviceListChangesMap([]string{"alice"}, nil), + }, + }, + { + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + New: internal.ToDeviceListChangesMap([]string{"💣"}, nil), + }, + }, + } + // apply them + for _, dd := range deltas { + err := table.Upsert(&dd) + assertNoError(t, err) + } + + // check we can read-only select. This doesn't modify any fields. for i := 0; i < 3; i++ { got, err := table.Select(userID, deviceID, false) assertNoError(t, err) - assertDeviceData(t, *got, want) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.MapStringInt{}, // until we "swap" we don't consume the New entries + }, + }) } - // now swap-er-roo, at this point we still expect the "old" data, - // as it is the first time we swap + // now swap-er-roo, which shifts everything from New into Sent. got, err := table.Select(userID, deviceID, true) assertNoError(t, err) - assertDeviceData(t, *got, want) - - // changed bits were reset when we swapped - want2 := want - want2.DeviceLists = internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - New: map[string]int{}, - } - want2.ChangedBits = 0 - want.ChangedBits = 0 + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), + }, + }) // this is permanent, read-only views show this too. - // Since we have swapped previously, we now expect New to be empty - // and Sent to be set. Swap again to clear Sent. - got, err = table.Select(userID, deviceID, true) + got, err = table.Select(userID, deviceID, false) assertNoError(t, err) - assertDeviceData(t, *got, want2) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), + }, + }) // We now expect empty DeviceLists, as we swapped twice. - got, err = table.Select(userID, deviceID, false) + got, err = table.Select(userID, deviceID, true) assertNoError(t, err) - want3 := want2 - want3.DeviceLists = internal.DeviceLists{ - Sent: map[string]int{}, - New: map[string]int{}, - } - assertDeviceData(t, *got, want3) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.MapStringInt{}, + }, + }) // get back the original state - //err = table.DeleteDevice(userID, deviceID) assertNoError(t, err) for _, dd := range deltas { err = table.Upsert(&dd) assertNoError(t, err) } - want.SetFallbackKeysChanged() - want.SetOTKCountChanged() - got, err = table.Select(userID, deviceID, false) - assertNoError(t, err) - assertDeviceData(t, *got, want) - - // swap once then add once so both sent and new are populated - // Moves Alice and Bob to Sent - _, err = table.Select(userID, deviceID, true) + // Move original state to Sent by swapping + got, err = table.Select(userID, deviceID, true) assertNoError(t, err) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), + }, + }) + // Add new entries to New before acknowledging Sent err = table.Upsert(&internal.DeviceData{ UserID: userID, DeviceID: deviceID, @@ -148,20 +220,18 @@ func TestDeviceDataTableSwaps(t *testing.T) { }) assertNoError(t, err) - want.ChangedBits = 0 - - want4 := want - want4.DeviceLists = internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - New: internal.ToDeviceListChangesMap([]string{"💣"}, []string{"charlie"}), - } - // Without swapping, we expect Alice and Bob in Sent, and Bob and Charlie in New + // Reading without swapping does not move New->Sent, so returns the previous value got, err = table.Select(userID, deviceID, false) assertNoError(t, err) - assertDeviceData(t, *got, want4) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), + }, + }) - // another append then consume - // This results in dave to be added to New + // Append even more items to New err = table.Upsert(&internal.DeviceData{ UserID: userID, DeviceID: deviceID, @@ -170,24 +240,28 @@ func TestDeviceDataTableSwaps(t *testing.T) { }, }) assertNoError(t, err) + + // Now swap: all the combined items in New go into Sent got, err = table.Select(userID, deviceID, true) assertNoError(t, err) - want5 := want4 - want5.DeviceLists = internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - New: internal.ToDeviceListChangesMap([]string{"💣"}, []string{"charlie", "dave"}), - } - assertDeviceData(t, *got, want5) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.ToDeviceListChangesMap([]string{"💣", "dave"}, []string{"charlie", "dave"}), + }, + }) - // Swapping again clears New + // Swapping again clears Sent out, and since nothing is in New we get an empty list got, err = table.Select(userID, deviceID, true) assertNoError(t, err) - want5 = want4 - want5.DeviceLists = internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"💣"}, []string{"charlie", "dave"}), - New: map[string]int{}, - } - assertDeviceData(t, *got, want5) + assertDeviceData(t, *got, internal.DeviceData{ + UserID: userID, + DeviceID: deviceID, + DeviceLists: internal.DeviceLists{ + Sent: internal.MapStringInt{}, + }, + }) // delete everything, no data returned assertNoError(t, table.DeleteDevice(userID, deviceID)) From 39f0d220e03e6d9d5a2772329f988185d193f58b Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 9 May 2024 16:03:29 +0100 Subject: [PATCH 12/37] Maybe sign releases --- .github/workflows/docker.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 37448d56..b853182b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -21,7 +21,10 @@ jobs: contents: read packages: write security-events: write # To upload Trivy sarif files + id-token: write # needed for signing the images with GitHub OIDC Token steps: + - name: Install Cosign + uses: sigstore/cosign-installer@v3.3.0 - name: Checkout uses: actions/checkout@v3 - name: Set up QEMU @@ -62,6 +65,18 @@ jobs: ghcr.io/${{ env.GHCR_NAMESPACE }}/sliding-sync:latest ghcr.io/${{ env.GHCR_NAMESPACE }}/sliding-sync:${{ github.ref_name }} + - name: Sign the images with GitHub OIDC Token + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + DIGEST: ${{ steps.docker_build_sliding_sync_release.outputs.digest }} + TAGS: ghcr.io/${{ env.GHCR_NAMESPACE }}/sliding-sync:${{ github.ref_name }} + run: | + images="" + for tag in ${TAGS}; do + images+="${tag}@${DIGEST} " + done + cosign sign --yes ${images} + - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@master with: From 693587ef7e1c47cd04a667332ef133146132a713 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 10 May 2024 14:25:32 +0100 Subject: [PATCH 13/37] v0.99.17 --- cmd/syncv3/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 6de80f7e..70fb6bc4 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -28,7 +28,7 @@ import ( var GitCommit string -const version = "0.99.16" +const version = "0.99.17" var ( flags = flag.NewFlagSet("goose", flag.ExitOnError) From 62b181c3b6da8f2a0c5af3c593207d13153d2e81 Mon Sep 17 00:00:00 2001 From: Will Lewis <1543626+wrjlewis@users.noreply.github.com> Date: Thu, 16 May 2024 14:10:31 +0100 Subject: [PATCH 14/37] Apply suggestions from code review Co-authored-by: Kegan Dougal <7190048+kegsay@users.noreply.github.com> --- grafana/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grafana/README.md b/grafana/README.md index 3981001a..e8d2103f 100644 --- a/grafana/README.md +++ b/grafana/README.md @@ -2,11 +2,11 @@ **Set up Prometheus and Grafana.** Out of scope for this readme. Useful documentation about using Grafana with Prometheus: http://docs.grafana.org/features/datasources/prometheus/ -**Configure the bind address** for prometheus metrics in sliding sync. For example: `SYNCV3_PROM=2112`. Metrics will be accessible at /metrics at this address. +**Configure the bind address** for prometheus metrics in sliding sync. For example: `SYNCV3_PROM=:2112`. Metrics will be accessible at `/metrics` at this address. **Configure a new job in Prometheus** to scrape the sliding sync endpoint. -For example by adding the following job to `prometheus.yml`, if your sliding sync is running locally and you have `SYNCV3_PROM` configured to port 2112: +For example by adding the following job to `prometheus.yml`, if your sliding sync is running locally not in docker and you have `SYNCV3_PROM` configured to port 2112: ``` # Sliding Sync From 3ca77f2bc054092ffe48f6eb25ae6bbbe63176e7 Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Fri, 17 May 2024 10:26:09 +0200 Subject: [PATCH 15/37] Optimize getting the latest events in each room Signed-off-by: Till Faelligen <2353100+S7evinK@users.noreply.github.com> --- state/event_table.go | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index 985185ba..f8cd5a90 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -312,14 +312,35 @@ func (t *EventTable) UpdateBeforeSnapshotID(txn *sqlx.Tx, eventNID, snapID, repl return err } -// query the latest events in each of the room IDs given, using highestNID as the highest event. +// LatestEventInRooms queries the latest events in each of the room IDs given, using highestNID as the highest event. +// +// The following query does: +// +// 1. Create a list of the passed in roomIDs (`room_ids` CTE) +// 2. Fetches the highest event_nid before or equal to $2 for each room in room_ids (`max_ev_nid` CTE) +// 3. Fetches the latest events for each room using the data provided from room_ids and max_ev_nid (the `evs` LATERAL) func (t *EventTable) LatestEventInRooms(txn *sqlx.Tx, roomIDs []string, highestNID int64) (events []Event, err error) { - // the position (event nid) may be for a random different room, so we need to find the highest nid <= this position for this room err = txn.Select( &events, - `SELECT event_nid, room_id, event_replaces_nid, before_state_snapshot_id, event_type, state_key, event FROM syncv3_events - WHERE event_nid IN (SELECT max(event_nid) FROM syncv3_events WHERE event_nid <= $1 AND room_id = ANY($2) GROUP BY room_id)`, - highestNID, pq.StringArray(roomIDs), + ` +WITH room_ids AS ( + select unnest($1::text[]) AS room_id +), +max_ev_nid AS ( + SELECT * + FROM room_ids, + LATERAL ( + SELECT max(event_nid) FROM syncv3_events e WHERE e.room_id = room_ids.room_id AND event_nid <= $2 + ) AS x) +SELECT evs.* +FROM room_ids, + max_ev_nid, + LATERAL ( + SELECT event_nid, room_id, event_replaces_nid, before_state_snapshot_id, event_type, state_key, event + FROM syncv3_events e + WHERE e.event_nid = max_ev_nid.max AND room_ids.room_id = e.room_id + ) AS evs`, + pq.StringArray(roomIDs), highestNID, ) if err == sql.ErrNoRows { err = nil From 2cd9a81ab23f083db1228f3f31d6401b1c14c995 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 09:37:38 +0100 Subject: [PATCH 16/37] Add DeviceListTable Shift over unit tests from DeviceDataTable --- state/device_data_table.go | 6 +- state/device_data_table_test.go | 154 -------------------------------- state/device_list_table.go | 110 +++++++++++++++++++++++ state/device_list_table_test.go | 108 ++++++++++++++++++++++ 4 files changed, 222 insertions(+), 156 deletions(-) create mode 100644 state/device_list_table.go create mode 100644 state/device_list_table_test.go diff --git a/state/device_data_table.go b/state/device_data_table.go index 1ed3e870..2c5576d7 100644 --- a/state/device_data_table.go +++ b/state/device_data_table.go @@ -21,7 +21,8 @@ type DeviceDataRow struct { } type DeviceDataTable struct { - db *sqlx.DB + db *sqlx.DB + deviceListTable *DeviceListTable } func NewDeviceDataTable(db *sqlx.DB) *DeviceDataTable { @@ -37,7 +38,8 @@ func NewDeviceDataTable(db *sqlx.DB) *DeviceDataTable { ALTER TABLE syncv3_device_data SET (fillfactor = 90); `) return &DeviceDataTable{ - db: db, + db: db, + deviceListTable: NewDeviceListTable(db), } } diff --git a/state/device_data_table_test.go b/state/device_data_table_test.go index d099eda0..b4fe6ad0 100644 --- a/state/device_data_table_test.go +++ b/state/device_data_table_test.go @@ -118,160 +118,6 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { assertDeviceData(t, *got, want) } -// Tests the DeviceLists field -func TestDeviceDataTableDeviceList(t *testing.T) { - db, close := connectToDB(t) - defer close() - table := NewDeviceDataTable(db) - userID := "@TestDeviceDataTableDeviceList" - deviceID := "BOB" - - // these are individual updates from Synapse from /sync v2 - deltas := []internal.DeviceData{ - { - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"alice"}, nil), - }, - }, - { - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"💣"}, nil), - }, - }, - } - // apply them - for _, dd := range deltas { - err := table.Upsert(&dd) - assertNoError(t, err) - } - - // check we can read-only select. This doesn't modify any fields. - for i := 0; i < 3; i++ { - got, err := table.Select(userID, deviceID, false) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.MapStringInt{}, // until we "swap" we don't consume the New entries - }, - }) - } - // now swap-er-roo, which shifts everything from New into Sent. - got, err := table.Select(userID, deviceID, true) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - }, - }) - - // this is permanent, read-only views show this too. - got, err = table.Select(userID, deviceID, false) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - }, - }) - - // We now expect empty DeviceLists, as we swapped twice. - got, err = table.Select(userID, deviceID, true) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.MapStringInt{}, - }, - }) - - // get back the original state - assertNoError(t, err) - for _, dd := range deltas { - err = table.Upsert(&dd) - assertNoError(t, err) - } - // Move original state to Sent by swapping - got, err = table.Select(userID, deviceID, true) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - }, - }) - // Add new entries to New before acknowledging Sent - err = table.Upsert(&internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"💣"}, []string{"charlie"}), - }, - }) - assertNoError(t, err) - - // Reading without swapping does not move New->Sent, so returns the previous value - got, err = table.Select(userID, deviceID, false) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"alice", "💣"}, nil), - }, - }) - - // Append even more items to New - err = table.Upsert(&internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - New: internal.ToDeviceListChangesMap([]string{"dave"}, []string{"dave"}), - }, - }) - assertNoError(t, err) - - // Now swap: all the combined items in New go into Sent - got, err = table.Select(userID, deviceID, true) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.ToDeviceListChangesMap([]string{"💣", "dave"}, []string{"charlie", "dave"}), - }, - }) - - // Swapping again clears Sent out, and since nothing is in New we get an empty list - got, err = table.Select(userID, deviceID, true) - assertNoError(t, err) - assertDeviceData(t, *got, internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceLists: internal.DeviceLists{ - Sent: internal.MapStringInt{}, - }, - }) - - // delete everything, no data returned - assertNoError(t, table.DeleteDevice(userID, deviceID)) - got, err = table.Select(userID, deviceID, false) - assertNoError(t, err) - if got != nil { - t.Errorf("wanted no data, got %v", got) - } -} - func TestDeviceDataTableBitset(t *testing.T) { db, close := connectToDB(t) defer close() diff --git a/state/device_list_table.go b/state/device_list_table.go new file mode 100644 index 00000000..5baae563 --- /dev/null +++ b/state/device_list_table.go @@ -0,0 +1,110 @@ +package state + +import ( + "fmt" + + "github.com/getsentry/sentry-go" + "github.com/jmoiron/sqlx" + "github.com/matrix-org/sliding-sync/internal" + "github.com/matrix-org/sliding-sync/sqlutil" +) + +const ( + BucketNew = 1 + BucketSent = 2 +) + +type DeviceListTable struct { + db *sqlx.DB +} + +func NewDeviceListTable(db *sqlx.DB) *DeviceListTable { + db.MustExec(` + CREATE TABLE IF NOT EXISTS syncv3_device_list_updates ( + user_id TEXT NOT NULL, + device_id TEXT NOT NULL, + target_user_id TEXT NOT NULL, + target_state SMALLINT NOT NULL, + bucket SMALLINT NOT NULL, + UNIQUE(user_id, device_id, target_user_id, bucket) + ); + -- make an index so selecting all the rows is faster + CREATE INDEX IF NOT EXISTS syncv3_device_list_updates_bucket_idx ON syncv3_device_list_updates(user_id, device_id, bucket); + -- Set the fillfactor to 90%, to allow for HOT updates (e.g. we only + -- change the data, not anything indexed like the id) + ALTER TABLE syncv3_device_list_updates SET (fillfactor = 90); + `) + return &DeviceListTable{ + db: db, + } +} + +// Upsert new device list changes. +func (t *DeviceListTable) Upsert(userID, deviceID string, deviceListChanges map[string]int) (err error) { + err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { + for targetUserID, targetState := range deviceListChanges { + if targetState != internal.DeviceListChanged && targetState != internal.DeviceListLeft { + sentry.CaptureException(fmt.Errorf("DeviceListTable.Upsert invalid target_state: %d this is a programming error", targetState)) + continue + } + _, err = txn.Exec( + `INSERT INTO syncv3_device_list_updates(user_id, device_id, target_user_id, target_state, bucket) VALUES($1,$2,$3,$4,$5) + ON CONFLICT (user_id, device_id, target_user_id, bucket) DO UPDATE SET target_state=$4`, + userID, deviceID, targetUserID, targetState, BucketNew, + ) + if err != nil { + return err + } + } + return nil + }) + if err != nil { + sentry.CaptureException(err) + } + return +} + +// Select device list changes for this client. Returns a map of user_id => change enum. +func (t *DeviceListTable) Select(userID, deviceID string, swap bool) (result internal.MapStringInt, err error) { + err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { + if !swap { + // read only view, just return what we previously sent and don't do anything else. + result, err = t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketSent) + return err + } + + // delete the now acknowledged 'sent' data + _, err = txn.Exec(`DELETE FROM syncv3_device_list_updates WHERE user_id=$1 AND device_id=$2 AND bucket=$3`, userID, deviceID, BucketSent) + if err != nil { + return err + } + // grab any 'new' updates + result, err = t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketNew) + if err != nil { + return err + } + + // mark these 'new' updates as 'sent' + _, err = txn.Exec(`UPDATE syncv3_device_list_updates SET bucket=$1 WHERE user_id=$2 AND device_id=$3 AND bucket=$4`, BucketSent, userID, deviceID, BucketNew) + return err + }) + return +} + +func (t *DeviceListTable) selectDeviceListChangesInBucket(txn *sqlx.Tx, userID, deviceID string, bucket int) (result internal.MapStringInt, err error) { + rows, err := txn.Query(`SELECT target_user_id, target_state FROM syncv3_device_list_updates WHERE user_id=$1 AND device_id=$2 AND bucket=$3`, userID, deviceID, bucket) + if err != nil { + return nil, err + } + defer rows.Close() + result = make(internal.MapStringInt) + var targetUserID string + var targetState int + for rows.Next() { + if err := rows.Scan(&targetUserID, &targetState); err != nil { + return nil, err + } + result[targetUserID] = targetState + } + return result, rows.Err() +} diff --git a/state/device_list_table_test.go b/state/device_list_table_test.go new file mode 100644 index 00000000..79cf8438 --- /dev/null +++ b/state/device_list_table_test.go @@ -0,0 +1,108 @@ +package state + +import ( + "testing" + + "github.com/matrix-org/sliding-sync/internal" +) + +// Tests the DeviceLists table +func TestDeviceListTable(t *testing.T) { + db, close := connectToDB(t) + defer close() + table := NewDeviceListTable(db) + userID := "@TestDeviceListTable" + deviceID := "BOB" + + // these are individual updates from Synapse from /sync v2 + deltas := []internal.MapStringInt{ + { + "alice": internal.DeviceListChanged, + }, + { + "💣": internal.DeviceListChanged, + }, + } + // apply them + for _, dd := range deltas { + err := table.Upsert(userID, deviceID, dd) + assertNoError(t, err) + } + + // check we can read-only select. This doesn't modify any fields. + for i := 0; i < 3; i++ { + got, err := table.Select(userID, deviceID, false) + assertNoError(t, err) + // until we "swap" we don't consume the New entries + assertVal(t, "unexpected data on swapless select", got, internal.MapStringInt{}) + } + // now swap-er-roo, which shifts everything from New into Sent. + got, err := table.Select(userID, deviceID, true) + assertNoError(t, err) + assertVal(t, "did not select what was upserted on swap select", got, internal.MapStringInt{ + "alice": internal.DeviceListChanged, + "💣": internal.DeviceListChanged, + }) + + // this is permanent, read-only views show this too. + got, err = table.Select(userID, deviceID, false) + assertNoError(t, err) + assertVal(t, "swapless select did not return the same data as before", got, internal.MapStringInt{ + "alice": internal.DeviceListChanged, + "💣": internal.DeviceListChanged, + }) + + // We now expect empty DeviceLists, as we swapped twice. + got, err = table.Select(userID, deviceID, true) + assertNoError(t, err) + assertVal(t, "swap select did not return nothing", got, internal.MapStringInt{}) + + // get back the original state + assertNoError(t, err) + for _, dd := range deltas { + err = table.Upsert(userID, deviceID, dd) + assertNoError(t, err) + } + // Move original state to Sent by swapping + got, err = table.Select(userID, deviceID, true) + assertNoError(t, err) + assertVal(t, "did not select what was upserted on swap select", got, internal.MapStringInt{ + "alice": internal.DeviceListChanged, + "💣": internal.DeviceListChanged, + }) + // Add new entries to New before acknowledging Sent + err = table.Upsert(userID, deviceID, internal.MapStringInt{ + "💣": internal.DeviceListChanged, + "charlie": internal.DeviceListLeft, + }) + assertNoError(t, err) + + // Reading without swapping does not move New->Sent, so returns the previous value + got, err = table.Select(userID, deviceID, false) + assertNoError(t, err) + assertVal(t, "swapless select did not return the same data as before", got, internal.MapStringInt{ + "alice": internal.DeviceListChanged, + "💣": internal.DeviceListChanged, + }) + + // Append even more items to New + err = table.Upsert(userID, deviceID, internal.MapStringInt{ + "charlie": internal.DeviceListChanged, // we previously said "left" for charlie, so as "changed" is newer, we should see "changed" + "dave": internal.DeviceListLeft, + }) + assertNoError(t, err) + + // Now swap: all the combined items in New go into Sent + got, err = table.Select(userID, deviceID, true) + assertNoError(t, err) + assertVal(t, "swap select did not return combined new items", got, internal.MapStringInt{ + "💣": internal.DeviceListChanged, + "charlie": internal.DeviceListChanged, + "dave": internal.DeviceListLeft, + }) + + // Swapping again clears Sent out, and since nothing is in New we get an empty list + got, err = table.Select(userID, deviceID, true) + assertNoError(t, err) + assertVal(t, "swap select did not return combined new items", got, internal.MapStringInt{}) +} From 1ae5b5cb3d6d8628d6b85415fbce81ff57ce3c1a Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Fri, 17 May 2024 10:41:28 +0200 Subject: [PATCH 17/37] Also optimize LatestEventNIDInRooms --- state/event_table.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index f8cd5a90..de4af2cc 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -348,13 +348,25 @@ FROM room_ids, return } +// LatestEventNIDInRooms queries the latest events in each of the room IDs given, using highestNID as the highest event. +// +// The following query does: +// +// 1. Create a list of the passed in roomIDs (`room_ids` CTE) +// 2. Fetches the latest eventNIDs for each room using the data provided from room_ids (the `evs` LATERAL) func (t *EventTable) LatestEventNIDInRooms(txn *sqlx.Tx, roomIDs []string, highestNID int64) (roomToNID map[string]int64, err error) { - // the position (event nid) may be for a random different room, so we need to find the highest nid <= this position for this room var events []Event err = txn.Select( &events, - `SELECT event_nid, room_id FROM syncv3_events - WHERE event_nid IN (SELECT max(event_nid) FROM syncv3_events WHERE event_nid <= $1 AND room_id = ANY($2) GROUP BY room_id)`, + ` +WITH room_ids AS ( + select unnest($1) AS room_id +) +SELECT evs.event_nid, room_id +FROM room_ids, + LATERAL ( + SELECT max(event_nid) event_nid FROM syncv3_events e WHERE e.room_id = room_ids.room_id AND event_nid <= $2 + ) AS evs;`, highestNID, pq.StringArray(roomIDs), ) if err == sql.ErrNoRows { From 425ed4efbaea56d4917e8cd8628caf6257da9c04 Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Fri, 17 May 2024 11:04:59 +0200 Subject: [PATCH 18/37] Fix order of params and tell pq what $1 is --- state/event_table.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index de4af2cc..48d81c58 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -360,14 +360,14 @@ func (t *EventTable) LatestEventNIDInRooms(txn *sqlx.Tx, roomIDs []string, highe &events, ` WITH room_ids AS ( - select unnest($1) AS room_id + select unnest($1::text[]) AS room_id ) SELECT evs.event_nid, room_id FROM room_ids, LATERAL ( SELECT max(event_nid) event_nid FROM syncv3_events e WHERE e.room_id = room_ids.room_id AND event_nid <= $2 - ) AS evs;`, - highestNID, pq.StringArray(roomIDs), + ) AS evs WHERE event_nid IS NOT NULL;`, + pq.StringArray(roomIDs), highestNID, ) if err == sql.ErrNoRows { err = nil From b383ed0d82f2e0fa166aa6a4c4560a7d767ec8f2 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 13:45:14 +0100 Subject: [PATCH 19/37] Add migrations and refactor internal structs --- internal/device_data.go | 102 +++-------- state/device_data_table.go | 74 ++++---- state/device_data_table_test.go | 88 +++++----- state/device_list_table.go | 105 ++++++++---- state/device_list_table_test.go | 12 ++ .../20230802121023_device_data_jsonb_test.go | 5 +- .../20230814183302_cbor_device_data.go | 5 +- .../20230814183302_cbor_device_data_test.go | 80 ++++++++- .../20240517104423_device_list_table.go | 158 ++++++++++++++++++ .../20240517104423_device_list_table_test.go | 7 + sync2/handler2/handler.go | 13 +- sync3/extensions/e2ee.go | 7 +- 12 files changed, 439 insertions(+), 217 deletions(-) create mode 100644 state/migrations/20240517104423_device_list_table.go create mode 100644 state/migrations/20240517104423_device_list_table_test.go diff --git a/internal/device_data.go b/internal/device_data.go index 651fbf15..c5fe767b 100644 --- a/internal/device_data.go +++ b/internal/device_data.go @@ -1,9 +1,5 @@ package internal -import ( - "sync" -) - const ( bitOTKCount int = iota bitFallbackKeyTypes @@ -18,9 +14,22 @@ func isBitSet(n int, bit int) bool { return val > 0 } -// DeviceData contains useful data for this user's device. This list can be expanded without prompting -// schema changes. These values are upserted into the database and persisted forever. +// DeviceData contains useful data for this user's device. type DeviceData struct { + DeviceListChanges + DeviceKeyData + UserID string + DeviceID string +} + +// This is calculated from device_lists table +type DeviceListChanges struct { + DeviceListChanged []string + DeviceListLeft []string +} + +// This gets serialised as CBOR in device_data table +type DeviceKeyData struct { // Contains the latest device_one_time_keys_count values. // Set whenever this field arrives down the v2 poller, and it replaces what was previously there. OTKCounts MapStringInt `json:"otk"` @@ -28,95 +37,22 @@ type DeviceData struct { // Set whenever this field arrives down the v2 poller, and it replaces what was previously there. // If this is a nil slice this means no change. If this is an empty slice then this means the fallback key was used up. FallbackKeyTypes []string `json:"fallback"` - - DeviceLists DeviceLists `json:"dl"` - // bitset for which device data changes are present. They accumulate until they get swapped over // when they get reset ChangedBits int `json:"c"` - - UserID string - DeviceID string } -func (dd *DeviceData) SetOTKCountChanged() { +func (dd *DeviceKeyData) SetOTKCountChanged() { dd.ChangedBits = setBit(dd.ChangedBits, bitOTKCount) } -func (dd *DeviceData) SetFallbackKeysChanged() { +func (dd *DeviceKeyData) SetFallbackKeysChanged() { dd.ChangedBits = setBit(dd.ChangedBits, bitFallbackKeyTypes) } -func (dd *DeviceData) OTKCountChanged() bool { +func (dd *DeviceKeyData) OTKCountChanged() bool { return isBitSet(dd.ChangedBits, bitOTKCount) } -func (dd *DeviceData) FallbackKeysChanged() bool { +func (dd *DeviceKeyData) FallbackKeysChanged() bool { return isBitSet(dd.ChangedBits, bitFallbackKeyTypes) } - -type UserDeviceKey struct { - UserID string - DeviceID string -} - -type DeviceDataMap struct { - deviceDataMu *sync.Mutex - deviceDataMap map[UserDeviceKey]*DeviceData - Pos int64 -} - -func NewDeviceDataMap(startPos int64, devices []DeviceData) *DeviceDataMap { - ddm := &DeviceDataMap{ - deviceDataMu: &sync.Mutex{}, - deviceDataMap: make(map[UserDeviceKey]*DeviceData), - Pos: startPos, - } - for i, dd := range devices { - ddm.deviceDataMap[UserDeviceKey{ - UserID: dd.UserID, - DeviceID: dd.DeviceID, - }] = &devices[i] - } - return ddm -} - -func (d *DeviceDataMap) Get(userID, deviceID string) *DeviceData { - key := UserDeviceKey{ - UserID: userID, - DeviceID: deviceID, - } - d.deviceDataMu.Lock() - defer d.deviceDataMu.Unlock() - dd, ok := d.deviceDataMap[key] - if !ok { - return nil - } - return dd -} - -func (d *DeviceDataMap) Update(dd DeviceData) DeviceData { - key := UserDeviceKey{ - UserID: dd.UserID, - DeviceID: dd.DeviceID, - } - d.deviceDataMu.Lock() - defer d.deviceDataMu.Unlock() - existing, ok := d.deviceDataMap[key] - if !ok { - existing = &DeviceData{ - UserID: dd.UserID, - DeviceID: dd.DeviceID, - } - } - if dd.OTKCounts != nil { - existing.OTKCounts = dd.OTKCounts - } - if dd.FallbackKeyTypes != nil { - existing.FallbackKeyTypes = dd.FallbackKeyTypes - } - existing.DeviceLists = existing.DeviceLists.Combine(dd.DeviceLists) - - d.deviceDataMap[key] = existing - - return *existing -} diff --git a/state/device_data_table.go b/state/device_data_table.go index 2c5576d7..86388fee 100644 --- a/state/device_data_table.go +++ b/state/device_data_table.go @@ -15,9 +15,9 @@ type DeviceDataRow struct { ID int64 `db:"id"` UserID string `db:"user_id"` DeviceID string `db:"device_id"` - // This will contain internal.DeviceData serialised as JSON. It's stored in a single column as we don't + // This will contain internal.DeviceKeyData serialised as JSON. It's stored in a single column as we don't // need to perform searches on this data. - Data []byte `db:"data"` + KeyData []byte `db:"data"` } type DeviceDataTable struct { @@ -47,6 +47,7 @@ func NewDeviceDataTable(db *sqlx.DB) *DeviceDataTable { // This should only be called by the v3 HTTP APIs when servicing an E2EE extension request. func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *internal.DeviceData, err error) { err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { + // grab otk counts and fallback key types var row DeviceDataRow err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, userID, deviceID) if err != nil { @@ -56,32 +57,38 @@ func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *in } return err } + result = &internal.DeviceData{} + var keyData *internal.DeviceKeyData // unmarshal to swap - opts := cbor.DecOptions{ - MaxMapPairs: 1000000000, // 1 billion :( + if err = cbor.Unmarshal(row.KeyData, &keyData); err != nil { + return err } - decMode, err := opts.DecMode() + result.UserID = userID + result.DeviceID = deviceID + if keyData != nil { + result.DeviceKeyData = *keyData + } + + deviceListChanges, err := t.deviceListTable.SelectTx(txn, userID, deviceID, swap) if err != nil { return err } - if err = decMode.Unmarshal(row.Data, &result); err != nil { - return err + for targetUserID, targetState := range deviceListChanges { + switch targetState { + case internal.DeviceListChanged: + result.DeviceListChanged = append(result.DeviceListChanged, targetUserID) + case internal.DeviceListLeft: + result.DeviceListLeft = append(result.DeviceListLeft, targetUserID) + } } - result.UserID = userID - result.DeviceID = deviceID if !swap { return nil // don't swap } - // the caller will only look at sent, so make sure what is new is now in sent - result.DeviceLists.Sent = result.DeviceLists.New - // swap over the fields - writeBack := *result - writeBack.DeviceLists.Sent = result.DeviceLists.New - writeBack.DeviceLists.New = make(map[string]int) + writeBack := *keyData writeBack.ChangedBits = 0 - if reflect.DeepEqual(result, &writeBack) { + if reflect.DeepEqual(keyData, &writeBack) { // The update to the DB would be a no-op; don't bother with it. // This helps reduce write usage and the contention on the unique index for // the device_data table. @@ -99,14 +106,13 @@ func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *in return } -func (t *DeviceDataTable) DeleteDevice(userID, deviceID string) error { - _, err := t.db.Exec(`DELETE FROM syncv3_device_data WHERE user_id = $1 AND device_id = $2`, userID, deviceID) - return err -} - // Upsert combines what is in the database for this user|device with the partial entry `dd` -func (t *DeviceDataTable) Upsert(dd *internal.DeviceData) (err error) { +func (t *DeviceDataTable) Upsert(dd *internal.DeviceData, deviceListChanges map[string]int) (err error) { err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { + // Update device lists + if err = t.deviceListTable.UpsertTx(txn, dd.UserID, dd.DeviceID, deviceListChanges); err != nil { + return err + } // select what already exists var row DeviceDataRow err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, dd.UserID, dd.DeviceID) @@ -114,30 +120,22 @@ func (t *DeviceDataTable) Upsert(dd *internal.DeviceData) (err error) { return err } // unmarshal and combine - var tempDD internal.DeviceData - if len(row.Data) > 0 { - opts := cbor.DecOptions{ - MaxMapPairs: 1000000000, // 1 billion :( - } - decMode, err := opts.DecMode() - if err != nil { - return err - } - if err = decMode.Unmarshal(row.Data, &tempDD); err != nil { + var keyData internal.DeviceKeyData + if len(row.KeyData) > 0 { + if err = cbor.Unmarshal(row.KeyData, &keyData); err != nil { return err } } if dd.FallbackKeyTypes != nil { - tempDD.FallbackKeyTypes = dd.FallbackKeyTypes - tempDD.SetFallbackKeysChanged() + keyData.FallbackKeyTypes = dd.FallbackKeyTypes + keyData.SetFallbackKeysChanged() } if dd.OTKCounts != nil { - tempDD.OTKCounts = dd.OTKCounts - tempDD.SetOTKCountChanged() + keyData.OTKCounts = dd.OTKCounts + keyData.SetOTKCountChanged() } - tempDD.DeviceLists = tempDD.DeviceLists.Combine(dd.DeviceLists) - data, err := cbor.Marshal(tempDD) + data, err := cbor.Marshal(keyData) if err != nil { return err } diff --git a/state/device_data_table_test.go b/state/device_data_table_test.go index b4fe6ad0..eac73095 100644 --- a/state/device_data_table_test.go +++ b/state/device_data_table_test.go @@ -22,9 +22,6 @@ func assertDeviceData(t *testing.T, g, w internal.DeviceData) { assertVal(t, "FallbackKeyTypes", g.FallbackKeyTypes, w.FallbackKeyTypes) assertVal(t, "OTKCounts", g.OTKCounts, w.OTKCounts) assertVal(t, "ChangedBits", g.ChangedBits, w.ChangedBits) - if w.DeviceLists.Sent != nil { - assertVal(t, "DeviceLists.Sent", g.DeviceLists.Sent, w.DeviceLists.Sent) - } } // Tests OTKCounts and FallbackKeyTypes behaviour @@ -40,21 +37,27 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { { UserID: userID, DeviceID: deviceID, - OTKCounts: map[string]int{ - "foo": 100, - "bar": 92, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: map[string]int{ + "foo": 100, + "bar": 92, + }, }, }, { - UserID: userID, - DeviceID: deviceID, - FallbackKeyTypes: []string{"foobar"}, + UserID: userID, + DeviceID: deviceID, + DeviceKeyData: internal.DeviceKeyData{ + FallbackKeyTypes: []string{"foobar"}, + }, }, { UserID: userID, DeviceID: deviceID, - OTKCounts: map[string]int{ - "foo": 99, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: map[string]int{ + "foo": 99, + }, }, }, { @@ -65,7 +68,7 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { // apply them for _, dd := range deltas { - err := table.Upsert(&dd) + err := table.Upsert(&dd, nil) assertNoError(t, err) } @@ -79,10 +82,12 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { want := internal.DeviceData{ UserID: userID, DeviceID: deviceID, - OTKCounts: map[string]int{ - "foo": 99, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: map[string]int{ + "foo": 99, + }, + FallbackKeyTypes: []string{"foobar"}, }, - FallbackKeyTypes: []string{"foobar"}, } want.SetFallbackKeysChanged() want.SetOTKCountChanged() @@ -95,10 +100,12 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { want := internal.DeviceData{ UserID: userID, DeviceID: deviceID, - OTKCounts: map[string]int{ - "foo": 99, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: map[string]int{ + "foo": 99, + }, + FallbackKeyTypes: []string{"foobar"}, }, - FallbackKeyTypes: []string{"foobar"}, } want.SetFallbackKeysChanged() want.SetOTKCountChanged() @@ -110,10 +117,12 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { want = internal.DeviceData{ UserID: userID, DeviceID: deviceID, - OTKCounts: map[string]int{ - "foo": 99, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: map[string]int{ + "foo": 99, + }, + FallbackKeyTypes: []string{"foobar"}, }, - FallbackKeyTypes: []string{"foobar"}, } assertDeviceData(t, *got, want) } @@ -127,29 +136,32 @@ func TestDeviceDataTableBitset(t *testing.T) { otkUpdate := internal.DeviceData{ UserID: userID, DeviceID: deviceID, - OTKCounts: map[string]int{ - "foo": 100, - "bar": 92, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: map[string]int{ + "foo": 100, + "bar": 92, + }, }, - DeviceLists: internal.DeviceLists{New: map[string]int{}, Sent: map[string]int{}}, } fallbakKeyUpdate := internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - FallbackKeyTypes: []string{"foo", "bar"}, - DeviceLists: internal.DeviceLists{New: map[string]int{}, Sent: map[string]int{}}, + UserID: userID, + DeviceID: deviceID, + DeviceKeyData: internal.DeviceKeyData{ + FallbackKeyTypes: []string{"foo", "bar"}, + }, } bothUpdate := internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - FallbackKeyTypes: []string{"both"}, - OTKCounts: map[string]int{ - "both": 100, + UserID: userID, + DeviceID: deviceID, + DeviceKeyData: internal.DeviceKeyData{ + FallbackKeyTypes: []string{"both"}, + OTKCounts: map[string]int{ + "both": 100, + }, }, - DeviceLists: internal.DeviceLists{New: map[string]int{}, Sent: map[string]int{}}, } - err := table.Upsert(&otkUpdate) + err := table.Upsert(&otkUpdate, nil) assertNoError(t, err) got, err := table.Select(userID, deviceID, true) assertNoError(t, err) @@ -161,7 +173,7 @@ func TestDeviceDataTableBitset(t *testing.T) { otkUpdate.ChangedBits = 0 assertDeviceData(t, *got, otkUpdate) // now same for fallback keys, but we won't swap them so it should return those diffs - err = table.Upsert(&fallbakKeyUpdate) + err = table.Upsert(&fallbakKeyUpdate, nil) assertNoError(t, err) fallbakKeyUpdate.OTKCounts = otkUpdate.OTKCounts got, err = table.Select(userID, deviceID, false) @@ -173,7 +185,7 @@ func TestDeviceDataTableBitset(t *testing.T) { fallbakKeyUpdate.SetFallbackKeysChanged() assertDeviceData(t, *got, fallbakKeyUpdate) // updating both works - err = table.Upsert(&bothUpdate) + err = table.Upsert(&bothUpdate, nil) assertNoError(t, err) got, err = table.Select(userID, deviceID, true) assertNoError(t, err) diff --git a/state/device_list_table.go b/state/device_list_table.go index 5baae563..d1ab0b8c 100644 --- a/state/device_list_table.go +++ b/state/device_list_table.go @@ -14,6 +14,14 @@ const ( BucketSent = 2 ) +type DeviceListRow struct { + UserID string `db:"user_id"` + DeviceID string `db:"device_id"` + TargetUserID string `db:"target_user_id"` + TargetState int `db:"target_state"` + Bucket int `db:"bucket"` +} + type DeviceListTable struct { db *sqlx.DB } @@ -39,24 +47,9 @@ func NewDeviceListTable(db *sqlx.DB) *DeviceListTable { } } -// Upsert new device list changes. func (t *DeviceListTable) Upsert(userID, deviceID string, deviceListChanges map[string]int) (err error) { err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { - for targetUserID, targetState := range deviceListChanges { - if targetState != internal.DeviceListChanged && targetState != internal.DeviceListLeft { - sentry.CaptureException(fmt.Errorf("DeviceListTable.Upsert invalid target_state: %d this is a programming error", targetState)) - continue - } - _, err = txn.Exec( - `INSERT INTO syncv3_device_list_updates(user_id, device_id, target_user_id, target_state, bucket) VALUES($1,$2,$3,$4,$5) - ON CONFLICT (user_id, device_id, target_user_id, bucket) DO UPDATE SET target_state=$4`, - userID, deviceID, targetUserID, targetState, BucketNew, - ) - if err != nil { - return err - } - } - return nil + return t.UpsertTx(txn, userID, deviceID, deviceListChanges) }) if err != nil { sentry.CaptureException(err) @@ -64,33 +57,70 @@ func (t *DeviceListTable) Upsert(userID, deviceID string, deviceListChanges map[ return } -// Select device list changes for this client. Returns a map of user_id => change enum. -func (t *DeviceListTable) Select(userID, deviceID string, swap bool) (result internal.MapStringInt, err error) { - err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { - if !swap { - // read only view, just return what we previously sent and don't do anything else. - result, err = t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketSent) - return err - } - - // delete the now acknowledged 'sent' data - _, err = txn.Exec(`DELETE FROM syncv3_device_list_updates WHERE user_id=$1 AND device_id=$2 AND bucket=$3`, userID, deviceID, BucketSent) - if err != nil { - return err +// Upsert new device list changes. +func (t *DeviceListTable) UpsertTx(txn *sqlx.Tx, userID, deviceID string, deviceListChanges map[string]int) (err error) { + if len(deviceListChanges) == 0 { + return nil + } + var deviceListRows []DeviceListRow + for targetUserID, targetState := range deviceListChanges { + if targetState != internal.DeviceListChanged && targetState != internal.DeviceListLeft { + sentry.CaptureException(fmt.Errorf("DeviceListTable.Upsert invalid target_state: %d this is a programming error", targetState)) + continue } - // grab any 'new' updates - result, err = t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketNew) + deviceListRows = append(deviceListRows, DeviceListRow{ + UserID: userID, + DeviceID: deviceID, + TargetUserID: targetUserID, + TargetState: targetState, + Bucket: BucketNew, + }) + } + chunks := sqlutil.Chunkify(5, MaxPostgresParameters, DeviceListChunker(deviceListRows)) + for _, chunk := range chunks { + _, err := txn.NamedExec(` + INSERT INTO syncv3_device_list_updates(user_id, device_id, target_user_id, target_state, bucket) + VALUES(:user_id, :device_id, :target_user_id, :target_state, :bucket) + ON CONFLICT (user_id, device_id, target_user_id, bucket) DO UPDATE SET target_state = EXCLUDED.target_state`, chunk) if err != nil { return err } + } + return nil + return +} - // mark these 'new' updates as 'sent' - _, err = txn.Exec(`UPDATE syncv3_device_list_updates SET bucket=$1 WHERE user_id=$2 AND device_id=$3 AND bucket=$4`, BucketSent, userID, deviceID, BucketNew) +func (t *DeviceListTable) Select(userID, deviceID string, swap bool) (result internal.MapStringInt, err error) { + err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { + result, err = t.SelectTx(txn, userID, deviceID, swap) return err }) return } +// Select device list changes for this client. Returns a map of user_id => change enum. +func (t *DeviceListTable) SelectTx(txn *sqlx.Tx, userID, deviceID string, swap bool) (result internal.MapStringInt, err error) { + if !swap { + // read only view, just return what we previously sent and don't do anything else. + return t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketSent) + } + + // delete the now acknowledged 'sent' data + _, err = txn.Exec(`DELETE FROM syncv3_device_list_updates WHERE user_id=$1 AND device_id=$2 AND bucket=$3`, userID, deviceID, BucketSent) + if err != nil { + return nil, err + } + // grab any 'new' updates + result, err = t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketNew) + if err != nil { + return nil, err + } + + // mark these 'new' updates as 'sent' + _, err = txn.Exec(`UPDATE syncv3_device_list_updates SET bucket=$1 WHERE user_id=$2 AND device_id=$3 AND bucket=$4`, BucketSent, userID, deviceID, BucketNew) + return result, err +} + func (t *DeviceListTable) selectDeviceListChangesInBucket(txn *sqlx.Tx, userID, deviceID string, bucket int) (result internal.MapStringInt, err error) { rows, err := txn.Query(`SELECT target_user_id, target_state FROM syncv3_device_list_updates WHERE user_id=$1 AND device_id=$2 AND bucket=$3`, userID, deviceID, bucket) if err != nil { @@ -108,3 +138,12 @@ func (t *DeviceListTable) selectDeviceListChangesInBucket(txn *sqlx.Tx, userID, } return result, rows.Err() } + +type DeviceListChunker []DeviceListRow + +func (c DeviceListChunker) Len() int { + return len(c) +} +func (c DeviceListChunker) Subslice(i, j int) sqlutil.Chunker { + return c[i:j] +} diff --git a/state/device_list_table_test.go b/state/device_list_table_test.go index 79cf8438..e7c980fc 100644 --- a/state/device_list_table_test.go +++ b/state/device_list_table_test.go @@ -1,6 +1,7 @@ package state import ( + "fmt" "testing" "github.com/matrix-org/sliding-sync/internal" @@ -105,4 +106,15 @@ func TestDeviceListTable(t *testing.T) { got, err = table.Select(userID, deviceID, true) assertNoError(t, err) assertVal(t, "swap select did not return combined new items", got, internal.MapStringInt{}) + + // large updates work (chunker) + largeUpdate := internal.MapStringInt{} + for i := 0; i < 100000; i++ { + largeUpdate[fmt.Sprintf("user_%d", i)] = internal.DeviceListChanged + } + err = table.Upsert(userID, deviceID, largeUpdate) + assertNoError(t, err) + got, err = table.Select(userID, deviceID, true) + assertNoError(t, err) + assertVal(t, "swap select did not return large items", got, largeUpdate) } diff --git a/state/migrations/20230802121023_device_data_jsonb_test.go b/state/migrations/20230802121023_device_data_jsonb_test.go index eb6c5a8e..9a27bdcb 100644 --- a/state/migrations/20230802121023_device_data_jsonb_test.go +++ b/state/migrations/20230802121023_device_data_jsonb_test.go @@ -7,7 +7,6 @@ import ( "github.com/jmoiron/sqlx" _ "github.com/lib/pq" - "github.com/matrix-org/sliding-sync/internal" "github.com/matrix-org/sliding-sync/testutils" ) @@ -48,8 +47,8 @@ func TestJSONBMigration(t *testing.T) { defer tx.Commit() // insert some "invalid" data - dd := internal.DeviceData{ - DeviceLists: internal.DeviceLists{ + dd := OldDeviceData{ + DeviceLists: OldDeviceLists{ New: map[string]int{"@💣:localhost": 1}, Sent: map[string]int{}, }, diff --git a/state/migrations/20230814183302_cbor_device_data.go b/state/migrations/20230814183302_cbor_device_data.go index 02b7f2a4..b549d933 100644 --- a/state/migrations/20230814183302_cbor_device_data.go +++ b/state/migrations/20230814183302_cbor_device_data.go @@ -9,7 +9,6 @@ import ( "strings" "github.com/fxamacker/cbor/v2" - "github.com/matrix-org/sliding-sync/internal" "github.com/matrix-org/sliding-sync/sync2" "github.com/pressly/goose/v3" ) @@ -59,7 +58,7 @@ func upCborDeviceData(ctx context.Context, tx *sql.Tx) error { } for dd, jsonBytes := range deviceDatas { - var data internal.DeviceData + var data OldDeviceData if err := json.Unmarshal(jsonBytes, &data); err != nil { return fmt.Errorf("failed to unmarshal JSON: %v -> %v", string(jsonBytes), err) } @@ -115,7 +114,7 @@ func downCborDeviceData(ctx context.Context, tx *sql.Tx) error { } for dd, cborBytes := range deviceDatas { - var data internal.DeviceData + var data OldDeviceData if err := cbor.Unmarshal(cborBytes, &data); err != nil { return fmt.Errorf("failed to unmarshal CBOR: %v", err) } diff --git a/state/migrations/20230814183302_cbor_device_data_test.go b/state/migrations/20230814183302_cbor_device_data_test.go index 2537ee82..a6b3b884 100644 --- a/state/migrations/20230814183302_cbor_device_data_test.go +++ b/state/migrations/20230814183302_cbor_device_data_test.go @@ -2,13 +2,15 @@ package migrations import ( "context" + "database/sql" "encoding/json" "reflect" "testing" + "github.com/fxamacker/cbor/v2" + "github.com/jmoiron/sqlx" _ "github.com/lib/pq" - "github.com/matrix-org/sliding-sync/internal" - "github.com/matrix-org/sliding-sync/state" + "github.com/matrix-org/sliding-sync/sqlutil" ) func TestCBORBMigration(t *testing.T) { @@ -30,9 +32,9 @@ func TestCBORBMigration(t *testing.T) { t.Fatal(err) } - rowData := []internal.DeviceData{ + rowData := []OldDeviceData{ { - DeviceLists: internal.DeviceLists{ + DeviceLists: OldDeviceLists{ New: map[string]int{"@bob:localhost": 2}, Sent: map[string]int{}, }, @@ -43,7 +45,7 @@ func TestCBORBMigration(t *testing.T) { UserID: "@alice:localhost", }, { - DeviceLists: internal.DeviceLists{ + DeviceLists: OldDeviceLists{ New: map[string]int{"@💣:localhost": 1, "@bomb:localhost": 2}, Sent: map[string]int{"@sent:localhost": 1}, }, @@ -78,9 +80,8 @@ func TestCBORBMigration(t *testing.T) { tx.Commit() // ensure we can now select it - table := state.NewDeviceDataTable(db) for _, want := range rowData { - got, err := table.Select(want.UserID, want.DeviceID, false) + got, err := OldDeviceDataTableSelect(db, want.UserID, want.DeviceID, false) if err != nil { t.Fatal(err) } @@ -101,7 +102,7 @@ func TestCBORBMigration(t *testing.T) { // ensure it is what we originally inserted for _, want := range rowData { - var got internal.DeviceData + var got OldDeviceData var gotBytes []byte err = tx.QueryRow(`SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2`, want.UserID, want.DeviceID).Scan(&gotBytes) if err != nil { @@ -119,3 +120,66 @@ func TestCBORBMigration(t *testing.T) { tx.Commit() } + +type OldDeviceDataRow struct { + ID int64 `db:"id"` + UserID string `db:"user_id"` + DeviceID string `db:"device_id"` + // This will contain internal.DeviceData serialised as JSON. It's stored in a single column as we don't + // need to perform searches on this data. + Data []byte `db:"data"` +} + +func OldDeviceDataTableSelect(db *sqlx.DB, userID, deviceID string, swap bool) (result *OldDeviceData, err error) { + err = sqlutil.WithTransaction(db, func(txn *sqlx.Tx) error { + var row OldDeviceDataRow + err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, userID, deviceID) + if err != nil { + if err == sql.ErrNoRows { + // if there is no device data for this user, it's not an error. + return nil + } + return err + } + // unmarshal to swap + opts := cbor.DecOptions{ + MaxMapPairs: 1000000000, // 1 billion :( + } + decMode, err := opts.DecMode() + if err != nil { + return err + } + if err = decMode.Unmarshal(row.Data, &result); err != nil { + return err + } + result.UserID = userID + result.DeviceID = deviceID + if !swap { + return nil // don't swap + } + // the caller will only look at sent, so make sure what is new is now in sent + result.DeviceLists.Sent = result.DeviceLists.New + + // swap over the fields + writeBack := *result + writeBack.DeviceLists.Sent = result.DeviceLists.New + writeBack.DeviceLists.New = make(map[string]int) + writeBack.ChangedBits = 0 + + if reflect.DeepEqual(result, &writeBack) { + // The update to the DB would be a no-op; don't bother with it. + // This helps reduce write usage and the contention on the unique index for + // the device_data table. + return nil + } + // re-marshal and write + data, err := cbor.Marshal(writeBack) + if err != nil { + return err + } + + _, err = txn.Exec(`UPDATE syncv3_device_data SET data=$1 WHERE user_id=$2 AND device_id=$3`, data, userID, deviceID) + return err + }) + return +} diff --git a/state/migrations/20240517104423_device_list_table.go b/state/migrations/20240517104423_device_list_table.go new file mode 100644 index 00000000..3081bf43 --- /dev/null +++ b/state/migrations/20240517104423_device_list_table.go @@ -0,0 +1,158 @@ +package migrations + +import ( + "context" + "database/sql" + "time" + + "github.com/fxamacker/cbor/v2" + "github.com/lib/pq" + "github.com/matrix-org/sliding-sync/internal" + "github.com/matrix-org/sliding-sync/state" + "github.com/pressly/goose/v3" +) + +type OldDeviceData struct { + // Contains the latest device_one_time_keys_count values. + // Set whenever this field arrives down the v2 poller, and it replaces what was previously there. + OTKCounts internal.MapStringInt `json:"otk"` + // Contains the latest device_unused_fallback_key_types value + // Set whenever this field arrives down the v2 poller, and it replaces what was previously there. + // If this is a nil slice this means no change. If this is an empty slice then this means the fallback key was used up. + FallbackKeyTypes []string `json:"fallback"` + + DeviceLists OldDeviceLists `json:"dl"` + + // bitset for which device data changes are present. They accumulate until they get swapped over + // when they get reset + ChangedBits int `json:"c"` + + UserID string + DeviceID string +} + +type OldDeviceLists struct { + // map user_id -> DeviceList enum + New internal.MapStringInt `json:"n"` + Sent internal.MapStringInt `json:"s"` +} + +func init() { + goose.AddMigrationContext(upDeviceListTable, downDeviceListTable) +} + +func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { + // create the table. It's a bit gross we need to dupe the schema here, but this is the first migration to + // add a new table like this. + _, err := tx.Exec(` + CREATE TABLE IF NOT EXISTS syncv3_device_list_updates ( + user_id TEXT NOT NULL, + device_id TEXT NOT NULL, + target_user_id TEXT NOT NULL, + target_state SMALLINT NOT NULL, + bucket SMALLINT NOT NULL, + UNIQUE(user_id, device_id, target_user_id, bucket) + ); + -- make an index so selecting all the rows is faster + CREATE INDEX IF NOT EXISTS syncv3_device_list_updates_bucket_idx ON syncv3_device_list_updates(user_id, device_id, bucket); + -- Set the fillfactor to 90%, to allow for HOT updates (e.g. we only + -- change the data, not anything indexed like the id) + ALTER TABLE syncv3_device_list_updates SET (fillfactor = 90); + `) + if err != nil { + return err + } + + var count int + if err = tx.QueryRow(`SELECT count(*) FROM syncv3_device_data`).Scan(&count); err != nil { + return err + } + logger.Info().Int("count", count).Msg("transferring device list data for devices") + + // scan for existing CBOR (streaming as the CBOR can be large) and for each row: + rows, err := tx.Query(`SELECT user_id, device_id, data FROM syncv3_device_data`) + if err != nil { + return err + } + defer rows.Close() + var userID string + var deviceID string + var data []byte + // every N seconds log an update + updateFrequency := time.Second * 2 + lastUpdate := time.Now() + i := 0 + for rows.Next() { + i++ + if time.Since(lastUpdate) > updateFrequency { + logger.Info().Msgf("%d/%d process device list data", i, count) + lastUpdate = time.Now() + } + // * deserialise the CBOR + if err := rows.Scan(&userID, &deviceID, &data); err != nil { + return err + } + result, err := deserialiseCBOR(data) + if err != nil { + return err + } + + // * transfer the device lists to the new device lists table + // uses a bulk copy that lib/pq supports + stmt, err := tx.Prepare(pq.CopyIn("syncv3_device_list_updates", "user_id", "device_id", "target_user_id", "target_state", "bucket")) + if err != nil { + return err + } + for targetUser, targetState := range result.DeviceLists.New { + if _, err := stmt.Exec(userID, deviceID, targetUser, targetState, state.BucketNew); err != nil { + return err + } + } + for targetUser, targetState := range result.DeviceLists.Sent { + if _, err := stmt.Exec(userID, deviceID, targetUser, targetState, state.BucketSent); err != nil { + return err + } + } + if _, err = stmt.Exec(); err != nil { + return err + } + if err = stmt.Close(); err != nil { + return err + } + + // * delete the device lists from the CBOR and update + result.DeviceLists = OldDeviceLists{ + New: make(internal.MapStringInt), + Sent: make(internal.MapStringInt), + } + data, err := cbor.Marshal(result) + if err != nil { + return err + } + _, err = tx.Exec(`UPDATE syncv3_device_data SET data=$1 WHERE user_id=$2 AND device_id=$3`, data, userID, deviceID) + if err != nil { + return err + } + } + return rows.Err() +} + +func downDeviceListTable(ctx context.Context, tx *sql.Tx) error { + // no-op: we'll drop the device list updates but still work correctly as new/sent are still in the cbor but are empty + return nil +} + +func deserialiseCBOR(data []byte) (*OldDeviceData, error) { + opts := cbor.DecOptions{ + MaxMapPairs: 1000000000, // 1 billion :( + } + decMode, err := opts.DecMode() + if err != nil { + return nil, err + } + var result *OldDeviceData + if err = decMode.Unmarshal(data, &result); err != nil { + return nil, err + } + return result, nil +} diff --git a/state/migrations/20240517104423_device_list_table_test.go b/state/migrations/20240517104423_device_list_table_test.go new file mode 100644 index 00000000..75212d8d --- /dev/null +++ b/state/migrations/20240517104423_device_list_table_test.go @@ -0,0 +1,7 @@ +package migrations + +import "testing" + +func TestDeviceListTableMigration(t *testing.T) { + +} diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index 40512489..b8f29757 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -234,15 +234,14 @@ func (h *Handler) OnE2EEData(ctx context.Context, userID, deviceID string, otkCo defer wg.Done() // some of these fields may be set partialDD := internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - OTKCounts: otkCounts, - FallbackKeyTypes: fallbackKeyTypes, - DeviceLists: internal.DeviceLists{ - New: deviceListChanges, + UserID: userID, + DeviceID: deviceID, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: otkCounts, + FallbackKeyTypes: fallbackKeyTypes, }, } - err := h.Store.DeviceDataTable.Upsert(&partialDD) + err := h.Store.DeviceDataTable.Upsert(&partialDD, deviceListChanges) if err != nil { logger.Err(err).Str("user", userID).Msg("failed to upsert device data") internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err) diff --git a/sync3/extensions/e2ee.go b/sync3/extensions/e2ee.go index 91550457..01711d6a 100644 --- a/sync3/extensions/e2ee.go +++ b/sync3/extensions/e2ee.go @@ -70,11 +70,10 @@ func (r *E2EERequest) ProcessInitial(ctx context.Context, res *Response, extCtx extRes.OTKCounts = dd.OTKCounts hasUpdates = true } - changed, left := internal.DeviceListChangesArrays(dd.DeviceLists.Sent) - if len(changed) > 0 || len(left) > 0 { + if len(dd.DeviceListChanged) > 0 || len(dd.DeviceListLeft) > 0 { extRes.DeviceLists = &E2EEDeviceList{ - Changed: changed, - Left: left, + Changed: dd.DeviceListChanged, + Left: dd.DeviceListLeft, } hasUpdates = true } From b6f2f9d2730022ac4ac9d11b47065463df1996fd Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 14:48:08 +0100 Subject: [PATCH 20/37] Use a CURSOR --- .../20240517104423_device_list_table.go | 91 ++++++++++++------- .../20240517104423_device_list_table_test.go | 69 +++++++++++++- 2 files changed, 127 insertions(+), 33 deletions(-) diff --git a/state/migrations/20240517104423_device_list_table.go b/state/migrations/20240517104423_device_list_table.go index 3081bf43..477ac985 100644 --- a/state/migrations/20240517104423_device_list_table.go +++ b/state/migrations/20240517104423_device_list_table.go @@ -3,11 +3,13 @@ package migrations import ( "context" "database/sql" + "fmt" + "strings" "time" "github.com/fxamacker/cbor/v2" - "github.com/lib/pq" "github.com/matrix-org/sliding-sync/internal" + "github.com/matrix-org/sliding-sync/sqlutil" "github.com/matrix-org/sliding-sync/state" "github.com/pressly/goose/v3" ) @@ -52,13 +54,7 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { target_state SMALLINT NOT NULL, bucket SMALLINT NOT NULL, UNIQUE(user_id, device_id, target_user_id, bucket) - ); - -- make an index so selecting all the rows is faster - CREATE INDEX IF NOT EXISTS syncv3_device_list_updates_bucket_idx ON syncv3_device_list_updates(user_id, device_id, bucket); - -- Set the fillfactor to 90%, to allow for HOT updates (e.g. we only - -- change the data, not anything indexed like the id) - ALTER TABLE syncv3_device_list_updates SET (fillfactor = 90); - `) + );`) if err != nil { return err } @@ -69,12 +65,12 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { } logger.Info().Int("count", count).Msg("transferring device list data for devices") - // scan for existing CBOR (streaming as the CBOR can be large) and for each row: - rows, err := tx.Query(`SELECT user_id, device_id, data FROM syncv3_device_data`) + // scan for existing CBOR (streaming as the CBOR with cursors as it can be large) + _, err = tx.Exec(`DECLARE device_data_migration_cursor CURSOR FOR SELECT user_id, device_id, data FROM syncv3_device_data`) if err != nil { return err } - defer rows.Close() + defer tx.Exec("CLOSE device_data_migration_cursor") var userID string var deviceID string var data []byte @@ -82,42 +78,73 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { updateFrequency := time.Second * 2 lastUpdate := time.Now() i := 0 - for rows.Next() { + for { + // logging i++ if time.Since(lastUpdate) > updateFrequency { logger.Info().Msgf("%d/%d process device list data", i, count) lastUpdate = time.Now() } - // * deserialise the CBOR - if err := rows.Scan(&userID, &deviceID, &data); err != nil { + + if err := tx.QueryRow( + `FETCH NEXT FROM device_data_migration_cursor`, + ).Scan(&userID, &deviceID, &data); err != nil { + if err == sql.ErrNoRows { + // End of rows. + break + } return err } + + // * deserialise the CBOR result, err := deserialiseCBOR(data) if err != nil { return err } // * transfer the device lists to the new device lists table - // uses a bulk copy that lib/pq supports - stmt, err := tx.Prepare(pq.CopyIn("syncv3_device_list_updates", "user_id", "device_id", "target_user_id", "target_state", "bucket")) - if err != nil { - return err - } + var deviceListRows []state.DeviceListRow for targetUser, targetState := range result.DeviceLists.New { - if _, err := stmt.Exec(userID, deviceID, targetUser, targetState, state.BucketNew); err != nil { - return err - } + deviceListRows = append(deviceListRows, state.DeviceListRow{ + UserID: userID, + DeviceID: deviceID, + TargetUserID: targetUser, + TargetState: targetState, + Bucket: state.BucketNew, + }) } for targetUser, targetState := range result.DeviceLists.Sent { - if _, err := stmt.Exec(userID, deviceID, targetUser, targetState, state.BucketSent); err != nil { - return err - } - } - if _, err = stmt.Exec(); err != nil { - return err + deviceListRows = append(deviceListRows, state.DeviceListRow{ + UserID: userID, + DeviceID: deviceID, + TargetUserID: targetUser, + TargetState: targetState, + Bucket: state.BucketSent, + }) } - if err = stmt.Close(); err != nil { - return err + chunks := sqlutil.Chunkify(5, state.MaxPostgresParameters, state.DeviceListChunker(deviceListRows)) + for _, chunk := range chunks { + var placeholders []string + var vals []interface{} + listChunk := chunk.(state.DeviceListChunker) + for i, deviceListRow := range listChunk { + placeholders = append(placeholders, fmt.Sprintf("($%d,$%d,$%d,$%d,$%d)", + i*5+1, + i*5+2, + i*5+3, + i*5+4, + i*5+5, + )) + vals = append(vals, deviceListRow.UserID, deviceListRow.DeviceID, deviceListRow.TargetUserID, deviceListRow.TargetState, deviceListRow.Bucket) + } + query := fmt.Sprintf( + `INSERT INTO syncv3_device_list_updates(user_id, device_id, target_user_id, target_state, bucket) VALUES %s`, + strings.Join(placeholders, ","), + ) + _, err = tx.ExecContext(ctx, query, vals...) + if err != nil { + return fmt.Errorf("failed to bulk insert: %s", err) + } } // * delete the device lists from the CBOR and update @@ -129,12 +156,12 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { if err != nil { return err } - _, err = tx.Exec(`UPDATE syncv3_device_data SET data=$1 WHERE user_id=$2 AND device_id=$3`, data, userID, deviceID) + _, err = tx.ExecContext(ctx, `UPDATE syncv3_device_data SET data=$1 WHERE user_id=$2 AND device_id=$3`, data, userID, deviceID) if err != nil { return err } } - return rows.Err() + return nil } func downDeviceListTable(ctx context.Context, tx *sql.Tx) error { diff --git a/state/migrations/20240517104423_device_list_table_test.go b/state/migrations/20240517104423_device_list_table_test.go index 75212d8d..a9d53dda 100644 --- a/state/migrations/20240517104423_device_list_table_test.go +++ b/state/migrations/20240517104423_device_list_table_test.go @@ -1,7 +1,74 @@ package migrations -import "testing" +import ( + "context" + "testing" + + "github.com/fxamacker/cbor/v2" +) func TestDeviceListTableMigration(t *testing.T) { + ctx := context.Background() + db, close := connectToDB(t) + defer close() + + // Create the table in the old format (data = JSONB instead of BYTEA) + // and insert some data: we'll make sure that this data is preserved + // after migrating. + _, err := db.Exec(`CREATE TABLE IF NOT EXISTS syncv3_device_data ( + user_id TEXT NOT NULL, + device_id TEXT NOT NULL, + data BYTEA NOT NULL, + UNIQUE(user_id, device_id) + );`) + if err != nil { + t.Fatalf("failed to create table: %s", err) + } + + // insert old data + rowData := []OldDeviceData{ + { + DeviceLists: OldDeviceLists{ + New: map[string]int{"@bob:localhost": 2}, + Sent: map[string]int{}, + }, + ChangedBits: 2, + OTKCounts: map[string]int{"bar": 42}, + FallbackKeyTypes: []string{"narp"}, + DeviceID: "ALICE", + UserID: "@alice:localhost", + }, + { + DeviceLists: OldDeviceLists{ + New: map[string]int{"@💣:localhost": 1, "@bomb:localhost": 2}, + Sent: map[string]int{"@sent:localhost": 1}, + }, + OTKCounts: map[string]int{"foo": 100}, + FallbackKeyTypes: []string{"yep"}, + DeviceID: "BOB", + UserID: "@bob:localhost", + }, + } + for _, data := range rowData { + blob, err := cbor.Marshal(data) + if err != nil { + t.Fatal(err) + } + _, err = db.ExecContext(ctx, `INSERT INTO syncv3_device_data (user_id, device_id, data) VALUES ($1, $2, $3)`, data.UserID, data.DeviceID, blob) + if err != nil { + t.Fatal(err) + } + } + + // now migrate and ensure we didn't lose any data + tx, err := db.Begin() + if err != nil { + t.Fatal(err) + } + err = upDeviceListTable(ctx, tx) + if err != nil { + t.Fatal(err) + } + tx.Commit() } From fcd9b490f9ab9f022555565d2d8423558eb121da Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 15:20:18 +0100 Subject: [PATCH 21/37] Debug logging --- state/migrations/20240517104423_device_list_table.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/state/migrations/20240517104423_device_list_table.go b/state/migrations/20240517104423_device_list_table.go index 477ac985..1d427c0d 100644 --- a/state/migrations/20240517104423_device_list_table.go +++ b/state/migrations/20240517104423_device_list_table.go @@ -143,6 +143,8 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { ) _, err = tx.ExecContext(ctx, query, vals...) if err != nil { + fmt.Println(query) + fmt.Println(vals...) return fmt.Errorf("failed to bulk insert: %s", err) } } From 5028f93f83a2e1eb02966d2d7e9e2a7e4b7d8bf1 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 15:37:45 +0100 Subject: [PATCH 22/37] If there's no updates, don't insert anything --- state/migrations/20240517104423_device_list_table.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/state/migrations/20240517104423_device_list_table.go b/state/migrations/20240517104423_device_list_table.go index 1d427c0d..5e6d7712 100644 --- a/state/migrations/20240517104423_device_list_table.go +++ b/state/migrations/20240517104423_device_list_table.go @@ -122,6 +122,9 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { Bucket: state.BucketSent, }) } + if len(deviceListRows) == 0 { + continue + } chunks := sqlutil.Chunkify(5, state.MaxPostgresParameters, state.DeviceListChunker(deviceListRows)) for _, chunk := range chunks { var placeholders []string @@ -143,8 +146,6 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { ) _, err = tx.ExecContext(ctx, query, vals...) if err != nil { - fmt.Println(query) - fmt.Println(vals...) return fmt.Errorf("failed to bulk insert: %s", err) } } From af1f34861ea10b94db651b7e07f928285dd05934 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 16:09:02 +0100 Subject: [PATCH 23/37] Ensure txns are closed so we can wipe the db for other tests --- state/migrations/20231108122539_clear_stuck_invites_test.go | 1 + testutils/db.go | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/state/migrations/20231108122539_clear_stuck_invites_test.go b/state/migrations/20231108122539_clear_stuck_invites_test.go index 616fd62e..a4b4257d 100644 --- a/state/migrations/20231108122539_clear_stuck_invites_test.go +++ b/state/migrations/20231108122539_clear_stuck_invites_test.go @@ -151,6 +151,7 @@ func TestClearStuckInvites(t *testing.T) { if err != nil { t.Fatal(err) } + defer tx.Rollback() // users in room B (bob) and F (doris) should be reset. tokens, err := tokensTable.TokenForEachDevice(tx) diff --git a/testutils/db.go b/testutils/db.go index d4106aa1..af791021 100644 --- a/testutils/db.go +++ b/testutils/db.go @@ -1,11 +1,13 @@ package testutils import ( + "context" "database/sql" "fmt" "os" "os/exec" "os/user" + "time" ) var Quiet = false @@ -64,7 +66,9 @@ func PrepareDBConnectionString() (connStr string) { if err != nil { panic(err) } - _, err = db.Exec(`DROP SCHEMA public CASCADE; + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _, err = db.ExecContext(ctx, `DROP SCHEMA public CASCADE; CREATE SCHEMA public;`) if err != nil { panic(err) From 35c9fd4d95eadd5e082b18ac5debb4fc9936a9ed Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Fri, 17 May 2024 17:10:05 +0100 Subject: [PATCH 24/37] Remove spurious return --- state/device_list_table.go | 1 - 1 file changed, 1 deletion(-) diff --git a/state/device_list_table.go b/state/device_list_table.go index d1ab0b8c..889081e7 100644 --- a/state/device_list_table.go +++ b/state/device_list_table.go @@ -87,7 +87,6 @@ func (t *DeviceListTable) UpsertTx(txn *sqlx.Tx, userID, deviceID string, device } } return nil - return } func (t *DeviceListTable) Select(userID, deviceID string, swap bool) (result internal.MapStringInt, err error) { From fdbebaea68dfa3fb0b4e7c91dc260ee53251286d Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 20 May 2024 08:22:48 +0100 Subject: [PATCH 25/37] Some review comments; swap to UPDATE..RETURNING --- state/device_data_table.go | 16 ++++++++-------- state/device_data_table_test.go | 8 ++++---- state/device_list_table.go | 27 +++++++++++++++++++++------ sync2/handler2/handler.go | 14 ++++---------- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/state/device_data_table.go b/state/device_data_table.go index 86388fee..2c53295e 100644 --- a/state/device_data_table.go +++ b/state/device_data_table.go @@ -107,15 +107,15 @@ func (t *DeviceDataTable) Select(userID, deviceID string, swap bool) (result *in } // Upsert combines what is in the database for this user|device with the partial entry `dd` -func (t *DeviceDataTable) Upsert(dd *internal.DeviceData, deviceListChanges map[string]int) (err error) { +func (t *DeviceDataTable) Upsert(userID, deviceID string, keys internal.DeviceKeyData, deviceListChanges map[string]int) (err error) { err = sqlutil.WithTransaction(t.db, func(txn *sqlx.Tx) error { // Update device lists - if err = t.deviceListTable.UpsertTx(txn, dd.UserID, dd.DeviceID, deviceListChanges); err != nil { + if err = t.deviceListTable.UpsertTx(txn, userID, deviceID, deviceListChanges); err != nil { return err } // select what already exists var row DeviceDataRow - err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, dd.UserID, dd.DeviceID) + err = txn.Get(&row, `SELECT data FROM syncv3_device_data WHERE user_id=$1 AND device_id=$2 FOR UPDATE`, userID, deviceID) if err != nil && err != sql.ErrNoRows { return err } @@ -126,12 +126,12 @@ func (t *DeviceDataTable) Upsert(dd *internal.DeviceData, deviceListChanges map[ return err } } - if dd.FallbackKeyTypes != nil { - keyData.FallbackKeyTypes = dd.FallbackKeyTypes + if keys.FallbackKeyTypes != nil { + keyData.FallbackKeyTypes = keys.FallbackKeyTypes keyData.SetFallbackKeysChanged() } - if dd.OTKCounts != nil { - keyData.OTKCounts = dd.OTKCounts + if keys.OTKCounts != nil { + keyData.OTKCounts = keys.OTKCounts keyData.SetOTKCountChanged() } @@ -142,7 +142,7 @@ func (t *DeviceDataTable) Upsert(dd *internal.DeviceData, deviceListChanges map[ _, err = txn.Exec( `INSERT INTO syncv3_device_data(user_id, device_id, data) VALUES($1,$2,$3) ON CONFLICT (user_id, device_id) DO UPDATE SET data=$3`, - dd.UserID, dd.DeviceID, data, + userID, deviceID, data, ) return err }) diff --git a/state/device_data_table_test.go b/state/device_data_table_test.go index eac73095..39e2d05f 100644 --- a/state/device_data_table_test.go +++ b/state/device_data_table_test.go @@ -68,7 +68,7 @@ func TestDeviceDataTableOTKCountAndFallbackKeyTypes(t *testing.T) { // apply them for _, dd := range deltas { - err := table.Upsert(&dd, nil) + err := table.Upsert(dd.UserID, dd.DeviceID, dd.DeviceKeyData, nil) assertNoError(t, err) } @@ -161,7 +161,7 @@ func TestDeviceDataTableBitset(t *testing.T) { }, } - err := table.Upsert(&otkUpdate, nil) + err := table.Upsert(otkUpdate.UserID, otkUpdate.DeviceID, otkUpdate.DeviceKeyData, nil) assertNoError(t, err) got, err := table.Select(userID, deviceID, true) assertNoError(t, err) @@ -173,7 +173,7 @@ func TestDeviceDataTableBitset(t *testing.T) { otkUpdate.ChangedBits = 0 assertDeviceData(t, *got, otkUpdate) // now same for fallback keys, but we won't swap them so it should return those diffs - err = table.Upsert(&fallbakKeyUpdate, nil) + err = table.Upsert(fallbakKeyUpdate.UserID, fallbakKeyUpdate.DeviceID, fallbakKeyUpdate.DeviceKeyData, nil) assertNoError(t, err) fallbakKeyUpdate.OTKCounts = otkUpdate.OTKCounts got, err = table.Select(userID, deviceID, false) @@ -185,7 +185,7 @@ func TestDeviceDataTableBitset(t *testing.T) { fallbakKeyUpdate.SetFallbackKeysChanged() assertDeviceData(t, *got, fallbakKeyUpdate) // updating both works - err = table.Upsert(&bothUpdate, nil) + err = table.Upsert(bothUpdate.UserID, bothUpdate.DeviceID, bothUpdate.DeviceKeyData, nil) assertNoError(t, err) got, err = table.Select(userID, deviceID, true) assertNoError(t, err) diff --git a/state/device_list_table.go b/state/device_list_table.go index 889081e7..350a6066 100644 --- a/state/device_list_table.go +++ b/state/device_list_table.go @@ -109,15 +109,30 @@ func (t *DeviceListTable) SelectTx(txn *sqlx.Tx, userID, deviceID string, swap b if err != nil { return nil, err } - // grab any 'new' updates - result, err = t.selectDeviceListChangesInBucket(txn, userID, deviceID, BucketNew) + // grab any 'new' updates and atomically mark these as 'sent'. + // NB: we must not SELECT then UPDATE, because a 'new' row could be inserted after the SELECT and before the UPDATE, which + // would then be incorrectly moved to 'sent' without being returned to the client, dropping the data. This happens because + // the default transaction level is 'read committed', which /allows/ nonrepeatable reads which is: + // > A transaction re-reads data it has previously read and finds that data has been modified by another transaction (that committed since the initial read). + // We could change the isolation level but this incurs extra performance costs in addition to serialisation errors which + // need to be handled. It's easier to just use UPDATE .. RETURNING. Note that we don't require UPDATE .. RETURNING to be + // atomic in any way, it's just that we need to guarantee each things SELECTed is also UPDATEd (so in the scenario above, + // we don't care if the SELECT includes or excludes the 'new' row, but if it is SELECTed it MUST be UPDATEd). + rows, err := txn.Query(`UPDATE syncv3_device_list_updates SET bucket=$1 WHERE user_id=$2 AND device_id=$3 AND bucket=$4 RETURNING target_user_id, target_state`, BucketSent, userID, deviceID, BucketNew) if err != nil { return nil, err } - - // mark these 'new' updates as 'sent' - _, err = txn.Exec(`UPDATE syncv3_device_list_updates SET bucket=$1 WHERE user_id=$2 AND device_id=$3 AND bucket=$4`, BucketSent, userID, deviceID, BucketNew) - return result, err + defer rows.Close() + result = make(internal.MapStringInt) + var targetUserID string + var targetState int + for rows.Next() { + if err := rows.Scan(&targetUserID, &targetState); err != nil { + return nil, err + } + result[targetUserID] = targetState + } + return result, rows.Err() } func (t *DeviceListTable) selectDeviceListChangesInBucket(txn *sqlx.Tx, userID, deviceID string, bucket int) (result internal.MapStringInt, err error) { diff --git a/sync2/handler2/handler.go b/sync2/handler2/handler.go index b8f29757..7b6ecf5b 100644 --- a/sync2/handler2/handler.go +++ b/sync2/handler2/handler.go @@ -232,16 +232,10 @@ func (h *Handler) OnE2EEData(ctx context.Context, userID, deviceID string, otkCo wg.Add(1) h.e2eeWorkerPool.Queue(func() { defer wg.Done() - // some of these fields may be set - partialDD := internal.DeviceData{ - UserID: userID, - DeviceID: deviceID, - DeviceKeyData: internal.DeviceKeyData{ - OTKCounts: otkCounts, - FallbackKeyTypes: fallbackKeyTypes, - }, - } - err := h.Store.DeviceDataTable.Upsert(&partialDD, deviceListChanges) + err := h.Store.DeviceDataTable.Upsert(userID, deviceID, internal.DeviceKeyData{ + OTKCounts: otkCounts, + FallbackKeyTypes: fallbackKeyTypes, + }, deviceListChanges) if err != nil { logger.Err(err).Str("user", userID).Msg("failed to upsert device data") internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err) From 3fc49bd4ea405cfe49628d67a6afeae227efaa13 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 20 May 2024 08:37:09 +0100 Subject: [PATCH 26/37] Migration review comments --- .../20240517104423_device_list_table.go | 6 +- .../20240517104423_device_list_table_test.go | 85 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/state/migrations/20240517104423_device_list_table.go b/state/migrations/20240517104423_device_list_table.go index 5e6d7712..315b79c2 100644 --- a/state/migrations/20240517104423_device_list_table.go +++ b/state/migrations/20240517104423_device_list_table.go @@ -168,8 +168,10 @@ func upDeviceListTable(ctx context.Context, tx *sql.Tx) error { } func downDeviceListTable(ctx context.Context, tx *sql.Tx) error { - // no-op: we'll drop the device list updates but still work correctly as new/sent are still in the cbor but are empty - return nil + // no-op: we'll drop the device list updates but still work correctly as new/sent are still in the cbor but are empty. + // This will lose some device list updates. + _, err := tx.Exec(`DROP TABLE IF EXISTS syncv3_device_list_updates`) + return err } func deserialiseCBOR(data []byte) (*OldDeviceData, error) { diff --git a/state/migrations/20240517104423_device_list_table_test.go b/state/migrations/20240517104423_device_list_table_test.go index a9d53dda..88571cf4 100644 --- a/state/migrations/20240517104423_device_list_table_test.go +++ b/state/migrations/20240517104423_device_list_table_test.go @@ -2,9 +2,12 @@ package migrations import ( "context" + "reflect" "testing" "github.com/fxamacker/cbor/v2" + "github.com/matrix-org/sliding-sync/internal" + "github.com/matrix-org/sliding-sync/state" ) func TestDeviceListTableMigration(t *testing.T) { @@ -71,4 +74,86 @@ func TestDeviceListTableMigration(t *testing.T) { } tx.Commit() + wantSents := []internal.DeviceData{ + { + UserID: "@alice:localhost", + DeviceID: "ALICE", + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: internal.MapStringInt{ + "bar": 42, + }, + FallbackKeyTypes: []string{"narp"}, + ChangedBits: 2, + }, + }, + { + UserID: "@bob:localhost", + DeviceID: "BOB", + DeviceListChanges: internal.DeviceListChanges{ + DeviceListChanged: []string{"@sent:localhost"}, + }, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: internal.MapStringInt{ + "foo": 100, + }, + FallbackKeyTypes: []string{"yep"}, + }, + }, + } + + table := state.NewDeviceDataTable(db) + for _, wantSent := range wantSents { + gotSent, err := table.Select(wantSent.UserID, wantSent.DeviceID, false) + if err != nil { + t.Fatal(err) + } + assertVal(t, "'sent' data was corrupted during the migration", *gotSent, wantSent) + } + + wantNews := []internal.DeviceData{ + { + UserID: "@alice:localhost", + DeviceID: "ALICE", + DeviceListChanges: internal.DeviceListChanges{ + DeviceListLeft: []string{"@bob:localhost"}, + }, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: internal.MapStringInt{ + "bar": 42, + }, + FallbackKeyTypes: []string{"narp"}, + ChangedBits: 2, + }, + }, + { + UserID: "@bob:localhost", + DeviceID: "BOB", + DeviceListChanges: internal.DeviceListChanges{ + DeviceListChanged: []string{"@💣:localhost"}, + DeviceListLeft: []string{"@bomb:localhost"}, + }, + DeviceKeyData: internal.DeviceKeyData{ + OTKCounts: internal.MapStringInt{ + "foo": 100, + }, + FallbackKeyTypes: []string{"yep"}, + }, + }, + } + + for _, wantNew := range wantNews { + gotNew, err := table.Select(wantNew.UserID, wantNew.DeviceID, true) + if err != nil { + t.Fatal(err) + } + assertVal(t, "'new' data was corrupted during the migration", *gotNew, wantNew) + } + +} + +func assertVal(t *testing.T, msg string, got, want interface{}) { + t.Helper() + if !reflect.DeepEqual(got, want) { + t.Errorf("%s: got\n%#v\nwant\n%#v", msg, got, want) + } } From 45f3e012dbbef2ece64eac5367edef794427f787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Nieto=20Tal=C3=B3?= Date: Mon, 20 May 2024 11:57:24 +0200 Subject: [PATCH 27/37] fix: urlencode since opaque string Since/next batch is an opaque string and might need to be urlencoded before being sent to the server. Signed-off-by: Guillem Nieto --- sync2/client.go | 2 +- sync2/client_test.go | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sync2/client.go b/sync2/client.go index 7d47f611..75d42e2d 100644 --- a/sync2/client.go +++ b/sync2/client.go @@ -152,7 +152,7 @@ func (v *HTTPClient) createSyncURL(since string, isFirst, toDeviceOnly bool) str qps += "timeout=30000" } if since != "" { - qps += "&since=" + since + qps += "&since=" + url.QueryEscape(since) } // Set presence to offline, this potentially reduces CPU load on upstream homeservers diff --git a/sync2/client_test.go b/sync2/client_test.go index f8d82d09..ae6fbcec 100644 --- a/sync2/client_test.go +++ b/sync2/client_test.go @@ -65,6 +65,12 @@ func TestSyncURL(t *testing.T) { toDeviceOnly: true, wantURL: wantBaseURL + `?timeout=0&since=112233&set_presence=offline&filter=` + url.QueryEscape(`{"presence":{"not_types":["*"]},"room":{"rooms":[],"timeline":{"limit":50}}}`), }, + { + since: "112233#145", + isFirst: true, + toDeviceOnly: true, + wantURL: wantBaseURL + `?timeout=0&since=112233%23145&set_presence=offline&filter=` + url.QueryEscape(`{"presence":{"not_types":["*"]},"room":{"rooms":[],"timeline":{"limit":50}}}`), + }, } for i, tc := range testCases { gotURL := client.createSyncURL(tc.since, tc.isFirst, tc.toDeviceOnly) From e295c31ab722325945192645db28f5182359e0f3 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 20 May 2024 13:57:24 +0100 Subject: [PATCH 28/37] Ensure null is not sent when we mean [] --- sync3/extensions/e2ee.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sync3/extensions/e2ee.go b/sync3/extensions/e2ee.go index 01711d6a..1dbcad45 100644 --- a/sync3/extensions/e2ee.go +++ b/sync3/extensions/e2ee.go @@ -70,6 +70,12 @@ func (r *E2EERequest) ProcessInitial(ctx context.Context, res *Response, extCtx extRes.OTKCounts = dd.OTKCounts hasUpdates = true } + if dd.DeviceListChanged == nil { + dd.DeviceListChanged = make([]string, 0) + } + if dd.DeviceListLeft == nil { + dd.DeviceListLeft = make([]string, 0) + } if len(dd.DeviceListChanged) > 0 || len(dd.DeviceListLeft) > 0 { extRes.DeviceLists = &E2EEDeviceList{ Changed: dd.DeviceListChanged, From 451f6fb9c877af0b79fa18b6640438c594448b3f Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Mon, 20 May 2024 14:02:29 +0100 Subject: [PATCH 29/37] Add regression test --- tests-integration/extensions_test.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests-integration/extensions_test.go b/tests-integration/extensions_test.go index b2f271f7..ed25a074 100644 --- a/tests-integration/extensions_test.go +++ b/tests-integration/extensions_test.go @@ -235,6 +235,32 @@ func TestExtensionE2EE(t *testing.T) { }) m.MatchResponse(t, res, m.MatchDeviceLists(wantChanged, wantLeft)) + // check that empty lists aren't serialised as null + v2.queueResponse(alice, sync2.SyncResponse{ + DeviceLists: struct { + Changed []string `json:"changed,omitempty"` + Left []string `json:"left,omitempty"` + }{ + Changed: wantChanged, + }, + }) + v2.waitUntilEmpty(t, alice) + res = v3.mustDoV3RequestWithPos(t, aliceToken, res.Pos, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 10}, // doesn't matter + }, + }}, + // enable the E2EE extension + Extensions: extensions.Request{ + E2EE: &extensions.E2EERequest{ + Core: extensions.Core{Enabled: &boolTrue}, + }, + }, + }) + if res.Extensions.E2EE.DeviceLists.Left == nil { + t.Errorf("left array should be [] not null") + } } // Checks that to-device messages are passed from v2 to v3 From 9eee30b152e4b8f4bcd9e25e9ce46a0ab2bf0ce6 Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Tue, 21 May 2024 15:27:15 +0200 Subject: [PATCH 30/37] Add a materialized view for event_types and use it in an updated query --- state/event_table.go | 33 +++++++++++++++++++++------------ state/storage.go | 11 ++++++++++- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index 48d81c58..bc7e59e3 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -141,6 +141,11 @@ func NewEventTable(db *sqlx.DB) *EventTable { CREATE INDEX IF NOT EXISTS syncv3_nid_room_state_idx ON syncv3_events(room_id, event_nid, is_state); CREATE UNIQUE INDEX IF NOT EXISTS syncv3_events_room_event_nid_type_skey_idx ON syncv3_events(event_nid, event_type, state_key); + + -- Create a materialized view for event_types (used on startup to get the latest events in each room) + CREATE MATERIALIZED VIEW IF NOT EXISTS event_types as + SELECT DISTINCT event_type + FROM syncv3_events; `) return &EventTable{db} } @@ -441,18 +446,22 @@ func (t *EventTable) SelectLatestEventsBetween(txn *sqlx.Tx, roomID string, lowe func (t *EventTable) selectLatestEventByTypeInAllRooms(txn *sqlx.Tx) ([]Event, error) { result := []Event{} - // TODO: this query ends up doing a sequential scan on the events table. We have - // an index on (event_type, room_id, event_nid) so I'm a little surprised that PG - // decides to do so. Can we do something better here? Ideas: - // - Find a better query for selecting the newest event of each type in a room. - // - At present we only care about the _timestamps_ of these events. Perhaps we - // could store those in the DB (and even in an index) as a column and select - // those, to avoid having to parse the event bodies. - // - We could have the application maintain a `latest_events` table so that the - // rows can be directly read. Assuming a mostly-static set of event types, reads - // are then linear in the number of rooms. - rows, err := txn.Query( - `SELECT room_id, event_nid, event FROM syncv3_events WHERE event_nid in (SELECT MAX(event_nid) FROM syncv3_events GROUP BY room_id, event_type)`, + // What the following query does: + // 1. Gets all event types from a materialized view (updated on startup in `PrepareSnapshot`) as the `event_types` CTE + // 2. Gets all rooms as the `room_ids` CTE + // 3. Gets the latest event_nid for each event_type and room as the `max_by_ev_type` CTE + // 4. Queries the required data using the event_nids provided by the `max_by_ev_type` CTE + rows, err := txn.Query(` +WITH event_types AS ( + SELECT * FROM event_types +), room_ids AS ( + SELECT DISTINCT room_id FROM syncv3_rooms +), max_by_ev_type AS ( + SELECT m.max FROM event_types, room_ids, + LATERAL ( SELECT max(event_nid) as max FROM syncv3_events e WHERE e.room_id = room_ids.room_id AND e.event_type = event_types.event_type ) AS m +) +SELECT room_id, event_nid, event FROM syncv3_events, max_by_ev_type WHERE event_nid = max_by_ev_type.max +`, ) if err != nil { return nil, err diff --git a/state/storage.go b/state/storage.go index 0d0faa74..148f76aa 100644 --- a/state/storage.go +++ b/state/storage.go @@ -171,7 +171,16 @@ func (s *Storage) PrepareSnapshot(txn *sqlx.Tx) (tableName string, err error) { `SELECT UNNEST(membership_events) AS membership_nid INTO TEMP ` + tempTableName + ` FROM syncv3_snapshots JOIN syncv3_rooms ON syncv3_snapshots.snapshot_id = syncv3_rooms.current_snapshot_id`, ) - return tempTableName, err + if err != nil { + return "", err + } + // Refresh the materialized view, so getting latest events by type per room + // can use a fresh view of the event_types. + _, err = txn.Exec("REFRESH MATERIALIZED VIEW event_types;") + if err != nil { + return "", err + } + return tempTableName, nil } // GlobalSnapshot snapshots the entire database for the purposes of initialising From ce15a2800cd7710b81b41a1b62561da9419c6509 Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Tue, 21 May 2024 16:16:17 +0200 Subject: [PATCH 31/37] Remove materialized view and use a recursive CTE instead to get unique event_types --- state/event_table.go | 16 +++++++++------- state/storage.go | 9 --------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/state/event_table.go b/state/event_table.go index bc7e59e3..33c5c6c5 100644 --- a/state/event_table.go +++ b/state/event_table.go @@ -141,11 +141,6 @@ func NewEventTable(db *sqlx.DB) *EventTable { CREATE INDEX IF NOT EXISTS syncv3_nid_room_state_idx ON syncv3_events(room_id, event_nid, is_state); CREATE UNIQUE INDEX IF NOT EXISTS syncv3_events_room_event_nid_type_skey_idx ON syncv3_events(event_nid, event_type, state_key); - - -- Create a materialized view for event_types (used on startup to get the latest events in each room) - CREATE MATERIALIZED VIEW IF NOT EXISTS event_types as - SELECT DISTINCT event_type - FROM syncv3_events; `) return &EventTable{db} } @@ -447,13 +442,20 @@ func (t *EventTable) SelectLatestEventsBetween(txn *sqlx.Tx, roomID string, lowe func (t *EventTable) selectLatestEventByTypeInAllRooms(txn *sqlx.Tx) ([]Event, error) { result := []Event{} // What the following query does: - // 1. Gets all event types from a materialized view (updated on startup in `PrepareSnapshot`) as the `event_types` CTE + // 1. Gets all event types from a recursive CTE as the `event_types` CTE // 2. Gets all rooms as the `room_ids` CTE // 3. Gets the latest event_nid for each event_type and room as the `max_by_ev_type` CTE // 4. Queries the required data using the event_nids provided by the `max_by_ev_type` CTE rows, err := txn.Query(` WITH event_types AS ( - SELECT * FROM event_types + WITH RECURSIVE t AS ( + (SELECT event_type FROM syncv3_events ORDER BY event_type LIMIT 1) -- parentheses required + UNION ALL + SELECT (SELECT event_type FROM syncv3_events WHERE event_type > t.event_type ORDER BY event_type LIMIT 1) + FROM t + WHERE t.event_type IS NOT NULL + ) + SELECT event_type FROM t WHERE event_type IS NOT NULL ), room_ids AS ( SELECT DISTINCT room_id FROM syncv3_rooms ), max_by_ev_type AS ( diff --git a/state/storage.go b/state/storage.go index 148f76aa..d8cb2e1c 100644 --- a/state/storage.go +++ b/state/storage.go @@ -171,15 +171,6 @@ func (s *Storage) PrepareSnapshot(txn *sqlx.Tx) (tableName string, err error) { `SELECT UNNEST(membership_events) AS membership_nid INTO TEMP ` + tempTableName + ` FROM syncv3_snapshots JOIN syncv3_rooms ON syncv3_snapshots.snapshot_id = syncv3_rooms.current_snapshot_id`, ) - if err != nil { - return "", err - } - // Refresh the materialized view, so getting latest events by type per room - // can use a fresh view of the event_types. - _, err = txn.Exec("REFRESH MATERIALIZED VIEW event_types;") - if err != nil { - return "", err - } return tempTableName, nil } From 4b70d7d55fa74c54fe4ebd3aa3be9087009ffe06 Mon Sep 17 00:00:00 2001 From: Till Faelligen <2353100+S7evinK@users.noreply.github.com> Date: Tue, 21 May 2024 16:17:06 +0200 Subject: [PATCH 32/37] Return err and not nil --- state/storage.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/state/storage.go b/state/storage.go index d8cb2e1c..0d0faa74 100644 --- a/state/storage.go +++ b/state/storage.go @@ -171,7 +171,7 @@ func (s *Storage) PrepareSnapshot(txn *sqlx.Tx) (tableName string, err error) { `SELECT UNNEST(membership_events) AS membership_nid INTO TEMP ` + tempTableName + ` FROM syncv3_snapshots JOIN syncv3_rooms ON syncv3_snapshots.snapshot_id = syncv3_rooms.current_snapshot_id`, ) - return tempTableName, nil + return tempTableName, err } // GlobalSnapshot snapshots the entire database for the purposes of initialising From ea3baf23514a18c44cda8d4ae9211d207056d114 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Thu, 23 May 2024 11:07:56 +0100 Subject: [PATCH 33/37] v0.99.18 --- cmd/syncv3/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/syncv3/main.go b/cmd/syncv3/main.go index 70fb6bc4..6c81ab2b 100644 --- a/cmd/syncv3/main.go +++ b/cmd/syncv3/main.go @@ -28,7 +28,7 @@ import ( var GitCommit string -const version = "0.99.17" +const version = "0.99.18" var ( flags = flag.NewFlagSet("goose", flag.ExitOnError) From 3c2275a935c357affc73510139a500938a633c22 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 4 Jun 2024 15:26:58 +0100 Subject: [PATCH 34/37] Protect against dropped databases Helps fix the worst of https://github.com/matrix-org/sliding-sync/issues/448 --- sync3/extensions/todevice.go | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/sync3/extensions/todevice.go b/sync3/extensions/todevice.go index 951177b3..430c3da5 100644 --- a/sync3/extensions/todevice.go +++ b/sync3/extensions/todevice.go @@ -65,6 +65,30 @@ func (r *ToDeviceRequest) ProcessInitial(ctx context.Context, res *Response, ext r.Limit = 100 // default to 100 } l := logger.With().Str("user", extCtx.UserID).Str("device", extCtx.DeviceID).Logger() + + mapMu.Lock() + lastSentPos, exists := deviceIDToSinceDebugOnly[extCtx.DeviceID] + internal.Logf(ctx, "to_device", "since=%v limit=%v last_sent=%v", r.Since, r.Limit, lastSentPos) + isFirstRequest := !exists + mapMu.Unlock() + + // If this is the first time we've seen this device ID since starting up, ignore the client-provided 'since' + // value. This is done to protect against dropped postgres sequences. Consider: + // - 5 to-device messages arrive for Alice + // - Alice requests all messages, gets them and acks them so since=5, and the nextval() sequence is 6. + // - the server admin drops the DB and starts over again. The DB sequence starts back at 1. + // - 2 to-device messages arrive for Alice + // - Alice requests messages from since=5. No messages are returned as the 2 new messages have a lower sequence number. + // - Even worse, those 2 messages are deleted because sending since=5 ACKNOWLEDGES all messages <=5. + // By ignoring the first `since` on startup, we effectively force the client into sending since=0. In this scenario, + // it will then A) not delete anything as since=0 acknowledges nothing, B) return the 2 to-device events. + // + // The cost to this is that it is possible to send duplicate to-device events if the server restarts before the client + // has time to send the ACK to the server. This isn't fatal as clients do suppress duplicate to-device events. + if isFirstRequest { + r.Since = "" + } + var from int64 var err error if r.Since != "" { @@ -82,10 +106,7 @@ func (r *ToDeviceRequest) ProcessInitial(ctx context.Context, res *Response, ext internal.GetSentryHubFromContextOrDefault(ctx).CaptureException(err) } } - mapMu.Lock() - lastSentPos := deviceIDToSinceDebugOnly[extCtx.DeviceID] - internal.Logf(ctx, "to_device", "since=%v limit=%v last_sent=%v", r.Since, r.Limit, lastSentPos) - mapMu.Unlock() + if from < lastSentPos { // we told the client about a newer position, but yet they are using an older position, yell loudly l.Warn().Int64("last_sent", lastSentPos).Int64("recv", from).Bool("initial", extCtx.IsInitial).Msg( From 91a1a56f6441b91a285a3c54607da238325e43f9 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:57:57 +0100 Subject: [PATCH 35/37] Add test --- tests-integration/extensions_test.go | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests-integration/extensions_test.go b/tests-integration/extensions_test.go index ed25a074..f51000e0 100644 --- a/tests-integration/extensions_test.go +++ b/tests-integration/extensions_test.go @@ -443,6 +443,46 @@ func TestExtensionToDevice(t *testing.T) { m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(0)), m.MatchToDeviceMessages(newToDeviceMsgs)) } +// Test that if you sync with a very very high numbered since value, we return lower numbered entries. +// This guards against dropped databases. +func TestExtensionToDeviceSequence(t *testing.T) { + pqString := testutils.PrepareDBConnectionString() + // setup code + v2 := runTestV2Server(t) + v3 := runTestServer(t, v2, pqString) + defer v2.close() + defer v3.close() + alice := "@TestExtensionToDeviceSequence_alice:localhost" + aliceToken := "ALICE_BEARER_TOKEN_TestExtensionToDeviceSequence" + v2.addAccount(t, alice, aliceToken) + toDeviceMsgs := []json.RawMessage{ + json.RawMessage(`{"sender":"alice","type":"something","content":{"foo":"1"}}`), + json.RawMessage(`{"sender":"alice","type":"something","content":{"foo":"2"}}`), + json.RawMessage(`{"sender":"alice","type":"something","content":{"foo":"3"}}`), + json.RawMessage(`{"sender":"alice","type":"something","content":{"foo":"4"}}`), + } + v2.queueResponse(alice, sync2.SyncResponse{ + ToDevice: sync2.EventsResponse{ + Events: toDeviceMsgs, + }, + }) + + res := v3.mustDoV3Request(t, aliceToken, sync3.Request{ + Lists: map[string]sync3.RequestList{"a": { + Ranges: sync3.SliceRanges{ + [2]int64{0, 10}, // doesn't matter + }, + }}, + Extensions: extensions.Request{ + ToDevice: &extensions.ToDeviceRequest{ + Core: extensions.Core{Enabled: &boolTrue}, + Since: "999999", + }, + }, + }) + m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(0)), m.MatchToDeviceMessages(toDeviceMsgs)) +} + // tests that the account data extension works: // 1- check global account data is sent on first connection // 2- check global account data updates are proxied through From 17cb5f0376b8a1e17fafc4c972374d5d566d1077 Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 4 Jun 2024 17:00:31 +0100 Subject: [PATCH 36/37] Check the returned next_batch --- tests-integration/extensions_test.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests-integration/extensions_test.go b/tests-integration/extensions_test.go index f51000e0..d6159065 100644 --- a/tests-integration/extensions_test.go +++ b/tests-integration/extensions_test.go @@ -2,6 +2,7 @@ package syncv3 import ( "encoding/json" + "fmt" "testing" "time" @@ -467,6 +468,7 @@ func TestExtensionToDeviceSequence(t *testing.T) { }, }) + hiSince := "999999" res := v3.mustDoV3Request(t, aliceToken, sync3.Request{ Lists: map[string]sync3.RequestList{"a": { Ranges: sync3.SliceRanges{ @@ -476,11 +478,17 @@ func TestExtensionToDeviceSequence(t *testing.T) { Extensions: extensions.Request{ ToDevice: &extensions.ToDeviceRequest{ Core: extensions.Core{Enabled: &boolTrue}, - Since: "999999", + Since: hiSince, }, }, }) - m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(0)), m.MatchToDeviceMessages(toDeviceMsgs)) + m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(0)), m.MatchToDeviceMessages(toDeviceMsgs), func(res *sync3.Response) error { + // ensure that we return a lower numbered since token + if res.Extensions.ToDevice.NextBatch == hiSince { + return fmt.Errorf("next_batch got %v wanted lower", hiSince) + } + return nil + }) } // tests that the account data extension works: From 7ea0b711db377c9a67933233fd20b76130ccc3da Mon Sep 17 00:00:00 2001 From: Kegan Dougal <7190048+kegsay@users.noreply.github.com> Date: Tue, 4 Jun 2024 17:22:43 +0100 Subject: [PATCH 37/37] Check for lower --- tests-integration/extensions_test.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests-integration/extensions_test.go b/tests-integration/extensions_test.go index d6159065..26a5480f 100644 --- a/tests-integration/extensions_test.go +++ b/tests-integration/extensions_test.go @@ -3,6 +3,7 @@ package syncv3 import ( "encoding/json" "fmt" + "strconv" "testing" "time" @@ -468,7 +469,7 @@ func TestExtensionToDeviceSequence(t *testing.T) { }, }) - hiSince := "999999" + hiSince := 999999 res := v3.mustDoV3Request(t, aliceToken, sync3.Request{ Lists: map[string]sync3.RequestList{"a": { Ranges: sync3.SliceRanges{ @@ -478,14 +479,18 @@ func TestExtensionToDeviceSequence(t *testing.T) { Extensions: extensions.Request{ ToDevice: &extensions.ToDeviceRequest{ Core: extensions.Core{Enabled: &boolTrue}, - Since: hiSince, + Since: fmt.Sprintf("%d", hiSince), }, }, }) m.MatchResponse(t, res, m.MatchList("a", m.MatchV3Count(0)), m.MatchToDeviceMessages(toDeviceMsgs), func(res *sync3.Response) error { // ensure that we return a lower numbered since token - if res.Extensions.ToDevice.NextBatch == hiSince { - return fmt.Errorf("next_batch got %v wanted lower", hiSince) + got, err := strconv.Atoi(res.Extensions.ToDevice.NextBatch) + if err != nil { + return err + } + if got >= hiSince { + return fmt.Errorf("next_batch got %v wanted lower than %v", got, hiSince) } return nil })