diff --git a/federationapi/storage/shared/storage_edus.go b/federationapi/storage/shared/storage_edus.go index ce9632ed..e0c740c1 100644 --- a/federationapi/storage/shared/storage_edus.go +++ b/federationapi/storage/shared/storage_edus.go @@ -53,9 +53,9 @@ func (d *Database) AssociateEDUWithDestination( // Keep EDUs for at least x minutes before deleting them expiresAt = gomatrixserverlib.AsTimestamp(time.Now().Add(duration)) } - // We forcibly set m.direct_to_device events to 0, as we always want them - // to be delivered. (required for E2EE) - if eduType == gomatrixserverlib.MDirectToDevice { + // We forcibly set m.direct_to_device and m.device_list_update events + // to 0, as we always want them to be delivered. (required for E2EE) + if eduType == gomatrixserverlib.MDirectToDevice || eduType == gomatrixserverlib.MDeviceListUpdate { expiresAt = 0 } return d.Writer.Do(d.DB, nil, func(txn *sql.Tx) error { diff --git a/internal/httputil/http.go b/internal/httputil/http.go index 1e07ee33..ad26de51 100644 --- a/internal/httputil/http.go +++ b/internal/httputil/http.go @@ -24,7 +24,7 @@ import ( "net/url" "strings" - opentracing "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go" "github.com/opentracing/opentracing-go/ext" ) @@ -81,8 +81,8 @@ func PostJSON[reqtype, restype any, errtype error]( return fmt.Errorf("HTTP %d from %s (no response body)", res.StatusCode, apiURL) } var reserr errtype - if err = json.Unmarshal(body, reserr); err != nil { - return fmt.Errorf("HTTP %d from %s", res.StatusCode, apiURL) + if err = json.Unmarshal(body, &reserr); err != nil { + return fmt.Errorf("HTTP %d from %s - %w", res.StatusCode, apiURL, err) } return reserr } diff --git a/keyserver/internal/device_list_update.go b/keyserver/internal/device_list_update.go index 304b67b2..8530f907 100644 --- a/keyserver/internal/device_list_update.go +++ b/keyserver/internal/device_list_update.go @@ -19,9 +19,11 @@ import ( "encoding/json" "fmt" "hash/fnv" + "net" "sync" "time" + "github.com/matrix-org/gomatrix" "github.com/matrix-org/gomatrixserverlib" "github.com/matrix-org/util" "github.com/prometheus/client_golang/prometheus" @@ -388,6 +390,8 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam return waitTime, true } failCount := 0 + +userLoop: for _, userID := range userIDs { if ctx.Err() != nil { // we've timed out, give up and go to the back of the queue to let another server be processed. @@ -397,19 +401,35 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam res, err := u.fedClient.GetUserDevices(ctx, serverName, userID) if err != nil { failCount += 1 - fcerr, ok := err.(*fedsenderapi.FederationClientError) - if ok { - if fcerr.RetryAfter > 0 { - waitTime = fcerr.RetryAfter - } else if fcerr.Blacklisted { + switch e := err.(type) { + case *fedsenderapi.FederationClientError: + if e.RetryAfter > 0 { + waitTime = e.RetryAfter + break userLoop + } else if e.Blacklisted { waitTime = time.Hour * 8 - } else { - // For all other errors (DNS resolution, network etc.) wait 1 hour. - waitTime = time.Hour + break userLoop } - } else { - waitTime = time.Hour - logger.WithError(err).WithField("user_id", userID).Debug("GetUserDevices returned unknown error type") + case net.Error: + // Use the default waitTime, if it's a timeout. + // It probably doesn't make sense to try further users. + if !e.Timeout() { + waitTime = time.Minute * 10 + logrus.WithError(e).Error("GetUserDevices returned net.Error") + break userLoop + } + case gomatrix.HTTPError: + // The remote server returned an error, give it some time to recover + if e.Code >= 500 { + waitTime = time.Minute * 10 + logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError") + break userLoop + } + default: + // Something else failed + waitTime = time.Minute * 10 + logger.WithError(err).WithField("user_id", userID).Debugf("GetUserDevices returned unknown error type: %T", err) + break userLoop } continue } @@ -437,7 +457,11 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam } } if failCount > 0 { - logger.WithField("total", len(userIDs)).WithField("failed", failCount).WithField("wait", waitTime).Warn("Failed to query device keys for some users") + logger.WithFields(logrus.Fields{ + "total": len(userIDs), + "failed": failCount, + "skipped": len(userIDs) - failCount, + }).Warn("Failed to query device keys for some users") } for _, userID := range userIDs { // always clear the channel to unblock Update calls regardless of success/failure diff --git a/sytest-whitelist b/sytest-whitelist index 5c8896b9..fc10bd3c 100644 --- a/sytest-whitelist +++ b/sytest-whitelist @@ -741,4 +741,5 @@ Newly joined room includes presence in incremental sync User in private room doesn't appear in user directory User joining then leaving public room appears and dissappears from directory User in remote room doesn't appear in user directory after server left room -User in shared private room does appear in user directory until leave \ No newline at end of file +User in shared private room does appear in user directory until leave +If a device list update goes missing, the server resyncs on the next one \ No newline at end of file