diff --git a/blscosi/protocol/protocol.go b/blscosi/protocol/protocol.go index c9c792b327..8b7e740e09 100644 --- a/blscosi/protocol/protocol.go +++ b/blscosi/protocol/protocol.go @@ -99,13 +99,12 @@ func DefaultThreshold(n int) int { } // DefaultSubLeaders returns the number of sub-leaders, which is the -// cube-root of the number of nodes. +// square root of the number of nodes - 1. func DefaultSubLeaders(nodes int) int { - // As `math.Pow` calculates `8 ** (1/3) < 2`, - // we add 0.0001 for the rounding error. - // This works for up to 57 ** 3 = 185'193, which should be enough - // nodes. - return int(math.Pow(float64(nodes), 1.0/3.0) + 0.0001) + if nodes == 1 { + return 1 + } + return int(math.Pow(float64(nodes-1), 1.0/2.0)) } // NewBlsCosi method is used to define the blscosi protocol. diff --git a/blscosi/protocol/protocol_test.go b/blscosi/protocol/protocol_test.go index ac9922baea..1726c15425 100644 --- a/blscosi/protocol/protocol_test.go +++ b/blscosi/protocol/protocol_test.go @@ -192,7 +192,7 @@ func TestProtocol_FailingLeaves_25_9(t *testing.T) { func TestDefaultSubLeaders(t *testing.T) { require.Equal(t, DefaultSubLeaders(1), 1) for subleaders := 2; subleaders < 58; subleaders++ { - nodes := int(math.Pow(float64(subleaders), 3.0)) + nodes := int(math.Pow(float64(subleaders), 2.0)) + 1 require.Equal(t, DefaultSubLeaders(nodes-1), subleaders-1) require.Equal(t, DefaultSubLeaders(nodes), subleaders) } diff --git a/blscosi/protocol/sub_protocol.go b/blscosi/protocol/sub_protocol.go index 64195fae14..a0ed1642d1 100644 --- a/blscosi/protocol/sub_protocol.go +++ b/blscosi/protocol/sub_protocol.go @@ -4,6 +4,7 @@ import ( "crypto/rand" "errors" "fmt" + "golang.org/x/xerrors" "sync" "time" @@ -190,36 +191,42 @@ func (p *SubBlsCosi) dispatchRoot() error { return nil } - // Only one child anyway - err := p.SendToChildren(&Announcement{ - Msg: p.Msg, - Data: p.Data, - Timeout: p.Timeout, - Threshold: p.Threshold, - }) - if err != nil { - // Only log what happened so we can try to finish the protocol - // (e.g. one child is offline) - log.Warnf("Error when broadcasting to children: %s", err.Error()) - } + subLeaderActive := make(chan error, 1) + // Because SendToChildren blocks on some firewalls instead of returning + // an error, this call is put in a go-routine. + go func() { + subLeaderActive <- p.SendToChildren(&Announcement{ + Msg: p.Msg, + Data: p.Data, + Timeout: p.Timeout, + Threshold: p.Threshold, + }) + }() - select { - case <-p.closeChan: - return nil - case reply := <-p.ChannelResponse: - if reply.Equal(p.Root().Children[0]) { - // Transfer the response to the parent protocol - p.subResponse <- reply + for { + select { + case err := <-subLeaderActive: + if err != nil { + p.subleaderNotResponding <- true + return xerrors.Errorf("Couldn't contact subleader: %v", err) + } + case <-p.closeChan: + return nil + case reply := <-p.ChannelResponse: + if reply.Equal(p.Root().Children[0]) { + // Transfer the response to the parent protocol + p.subResponse <- reply + } + return nil + case <-time.After(p.Timeout): + // It might be only the subleader then we send a notification + // to let the parent protocol take actions + log.Warnf("%s: timed out while waiting for subleader response while %s", + p.ServerIdentity(), p.Tree().Dump()) + p.subleaderNotResponding <- true + return nil } - case <-time.After(p.Timeout): - // It might be only the subleader then we send a notification - // to let the parent protocol take actions - log.Warnf("%s: timed out while waiting for subleader response while %s", - p.ServerIdentity(), p.Tree().Dump()) - p.subleaderNotResponding <- true } - - return nil } // dispatchSubLeader takes care of synchronizing the children @@ -238,9 +245,16 @@ func (p *SubBlsCosi) dispatchSubLeader() error { return err } - errs := p.SendToChildrenInParallel(a) - if len(errs) > 0 { - log.Error(errs) + if len(p.Children()) > 0 { + for _, node := range p.Children() { + go func(node *onet.TreeNode) { + err := p.SendTo(node, a) + if err != nil { + log.Warnf("Error while sending to leaf %s: %v", + node.Name(), err) + } + }(node) + } } responses := make(ResponseMap) @@ -264,9 +278,7 @@ func (p *SubBlsCosi) dispatchSubLeader() error { // we need to timeout the children faster than the root timeout to let it // know the subleader is alive, but some children are failing timeout := time.After(p.Timeout / 2) - // If an error happens when sending the announcement, we can assume there - // will be a timeout from this node - done := len(errs) + done := 0 for done < len(p.Children()) { select { case <-p.closeChan: diff --git a/byzcoinx/byzcoinx_test.go b/byzcoinx/byzcoinx_test.go index 5263e90079..8a45e0df02 100644 --- a/byzcoinx/byzcoinx_test.go +++ b/byzcoinx/byzcoinx_test.go @@ -200,7 +200,7 @@ func runProtocol(t *testing.T, nbrHosts int, nbrFault int, refuseIndex int, prot log.Lvl3("Added counter", counters.size()-1, refuseIndex) require.True(t, int(math.Pow(float64(bftCosiProto.nSubtrees), - 3.0)) <= nbrHosts) + 2.0)) <= nbrHosts) // kill the leafs first nbrFault = min(nbrFault, len(servers)) diff --git a/messaging/propagate.go b/messaging/propagate.go index 6d1a6d0a4d..1f9b65da87 100644 --- a/messaging/propagate.go +++ b/messaging/propagate.go @@ -5,7 +5,6 @@ import ( "fmt" "go.dedis.ch/cothority/v3/blscosi/protocol" "reflect" - "strings" "sync" "time" @@ -173,6 +172,7 @@ func (p *Propagate) Dispatch() error { var gotSendData bool var errs []error subtreeCount := p.TreeNode().SubtreeCount() + errsChan := make(chan error, subtreeCount) for process { p.Lock() @@ -208,15 +208,18 @@ func (p *Propagate) Dispatch() error { process = false } log.Lvl3(p.ServerIdentity(), "Sending to children", p.Children()) - if errs = p.SendToChildrenInParallel(&msg.PropagateSendData); len(errs) != 0 { - errsStr := make([]string, len(errs)) - for i, e := range errs { - errsStr[i] = e.Error() - } - log.Lvl2("Error while sending to children:", errsStr) - if len(errs) > p.allowedFailures { - return errors.New(strings.Join(errsStr, "\n")) - } + + // Just blindly send to the children - we don't care if they receive it or + // not. If they don't receive it, they will complain later. + for _, c := range p.Children() { + go func(tn *onet.TreeNode) { + err := p.SendTo(tn, &msg.PropagateSendData) + if err != nil { + log.Warnf("Error while sending to child %s: %v", + tn.Name(), err) + errsChan <- err + } + }(c) } case <-p.ChannelReply: if !gotSendData { @@ -230,6 +233,11 @@ func (p *Propagate) Dispatch() error { return err } } + errsChan <- nil + case err := <-errsChan: + if err != nil { + errs = append(errs, err) + } // Only wait for the number of children that successfully received our message. if received == subtreeCount-len(errs) && received >= subtreeCount-p.allowedFailures { process = false