From 42a82091a8932fd92e36ba41c644c2058ab7dce6 Mon Sep 17 00:00:00 2001 From: Till <2353100+S7evinK@users.noreply.github.com> Date: Thu, 8 Sep 2022 12:03:44 +0200 Subject: [PATCH] Fix issue with stale device lists (#2702) We were only sending the last entry to the worker, so most likely missed updates. --- federationapi/inthttp/server.go | 13 +++++++++++-- keyserver/internal/device_list_update.go | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/federationapi/inthttp/server.go b/federationapi/inthttp/server.go index 58ea9ddc..7aa0e480 100644 --- a/federationapi/inthttp/server.go +++ b/federationapi/inthttp/server.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "net/http" + "net/url" "github.com/gorilla/mux" "github.com/matrix-org/gomatrix" @@ -235,9 +236,17 @@ func federationClientError(err error) error { return &api.FederationClientError{ Code: ferr.Code, } - default: + case *url.Error: // e.g. certificate error, unable to connect return &api.FederationClientError{ - Err: err.Error(), + Err: ferr.Error(), + Code: 400, + } + default: + // We don't know what exactly failed, but we probably don't + // want to retry the request immediately in the device list updater + return &api.FederationClientError{ + Err: err.Error(), + Code: 400, } } } diff --git a/keyserver/internal/device_list_update.go b/keyserver/internal/device_list_update.go index 41534fe8..525f8a99 100644 --- a/keyserver/internal/device_list_update.go +++ b/keyserver/internal/device_list_update.go @@ -167,6 +167,7 @@ func (u *DeviceListUpdater) Start() error { step = (time.Second * 120) / time.Duration(max) } for _, userID := range staleLists { + userID := userID // otherwise we are only sending the last entry time.AfterFunc(offset, func() { u.notifyWorkers(userID) }) @@ -396,11 +397,19 @@ userLoop: if ctx.Err() != nil { // we've timed out, give up and go to the back of the queue to let another server be processed. failCount += 1 + waitTime = time.Minute * 10 break } res, err := u.fedClient.GetUserDevices(ctx, serverName, userID) if err != nil { failCount += 1 + select { + case <-ctx.Done(): + // we've timed out, give up and go to the back of the queue to let another server be processed. + waitTime = time.Minute * 10 + break userLoop + default: + } switch e := err.(type) { case *fedsenderapi.FederationClientError: if e.RetryAfter > 0 { @@ -419,7 +428,7 @@ userLoop: // It probably doesn't make sense to try further users. if !e.Timeout() { waitTime = time.Minute * 10 - logrus.WithError(e).Error("GetUserDevices returned net.Error") + logger.WithError(e).Error("GetUserDevices returned net.Error") break userLoop } case gomatrix.HTTPError: @@ -427,7 +436,7 @@ userLoop: // This is to avoid spamming remote servers, which may not be Matrix servers anymore. if e.Code >= 300 { waitTime = time.Hour - logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError") + logger.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError") break userLoop } default: