Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DONT MERGE - testing release 1.3.4 patch #2184

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
GOEXPERIMENT=${GOEXPERIMENT} \
make alloy

FROM public.ecr.aws/ubuntu/ubuntu:mantic
FROM public.ecr.aws/ubuntu/ubuntu:noble

# Username and uid for alloy user
ARG UID=473
Expand Down
32 changes: 23 additions & 9 deletions internal/runtime/internal/controller/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ import (
"time"

"github.com/go-kit/log"
"github.com/grafana/dskit/backoff"
"github.com/hashicorp/go-multierror"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"

"github.com/grafana/alloy/internal/featuregate"
"github.com/grafana/alloy/internal/runtime/internal/dag"
"github.com/grafana/alloy/internal/runtime/internal/worker"
Expand All @@ -18,11 +24,6 @@ import (
"github.com/grafana/alloy/internal/service"
"github.com/grafana/alloy/syntax/ast"
"github.com/grafana/alloy/syntax/diag"
"github.com/grafana/dskit/backoff"
"github.com/hashicorp/go-multierror"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)

// The Loader builds and evaluates ComponentNodes from Alloy blocks.
Expand Down Expand Up @@ -92,10 +93,11 @@ func NewLoader(opts LoaderOptions) *Loader {
componentNodeManager: NewComponentNodeManager(globals, reg),

// This is a reasonable default which should work for most cases. If a component is completely stuck, we would
// retry and log an error every 10 seconds, at most.
// retry and log an error every 10 seconds, at most. We give up after some time to prevent lasting deadlocks.
backoffConfig: backoff.Config{
MinBackoff: 1 * time.Millisecond,
MaxBackoff: 10 * time.Second,
MaxRetries: 20, // Give up after 20 attempts - it could be a deadlock instead of an overload.
},

graph: &dag.Graph{},
Expand Down Expand Up @@ -744,19 +746,31 @@ func (l *Loader) EvaluateDependants(ctx context.Context, updatedNodes []*QueuedN
l.concurrentEvalFn(nodeRef, dependantCtx, tracer, parentRef)
})
if err != nil {
level.Error(l.log).Log(
"msg", "failed to submit node for evaluation - Alloy is likely overloaded "+
"and cannot keep up with evaluating components - will retry",
level.Warn(l.log).Log(
"msg", "failed to submit node for evaluation - will retry",
"err", err,
"node_id", n.NodeID(),
"originator_id", parent.Node.NodeID(),
"retries", retryBackoff.NumRetries(),
)
// When backing off, release the mutex in case the evaluation needs to interact with the loader itself.
l.mut.RUnlock()
retryBackoff.Wait()
l.mut.RLock()
} else {
break
}
}
if err != nil && !retryBackoff.Ongoing() {
level.Error(l.log).Log(
"msg", "retry attempts exhausted when submitting node for evaluation to the worker pool - "+
"this could be a deadlock, performance bottleneck or severe overload leading to goroutine starvation",
"err", err,
"node_id", n.NodeID(),
"originator_id", parent.Node.NodeID(),
"retries", retryBackoff.NumRetries(),
)
}
span.SetAttributes(attribute.Int("retries", retryBackoff.NumRetries()))
if err != nil {
span.SetStatus(codes.Error, err.Error())
Expand Down
4 changes: 2 additions & 2 deletions internal/runtime/internal/controller/node_config_import.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ func (cn *ImportConfigNode) setContentHealth(t component.HealthType, msg string)
// 4. Health reported from the source.
// 5. Health reported from the nested imports.
func (cn *ImportConfigNode) CurrentHealth() component.Health {
cn.healthMut.RLock()
defer cn.healthMut.RUnlock()
cn.mut.RLock()
defer cn.mut.RUnlock()
cn.healthMut.RLock()
defer cn.healthMut.RUnlock()

health := component.LeastHealthy(
cn.runHealth,
Expand Down