Skip to content

Commit

Permalink
Agent reported state as "406" when modules are in backoff. (Azure#3244)
Browse files Browse the repository at this point in the history
To reproduce

1.    create a deployment which has a module which fails to launch, like an incorrect image name, or some weird mount Docker doesn't like.
2.    Look at the reported status of the edge runtime - If you are lucky, you will see the real error.
3.    Let the runtime run for long enough that the "OrderedRetryPlanRunner" puts the module on backoff.
4.    Look at the reported status of the edge runtime - You won't see the real error message from the failure - you will see a "406 -- The device is offline or not sending status reports"

The root cause is that every reconcile starts with status=Unknown(406), and when a module is in backoff, the status doesn't update.

So it goes:
reconcile loop 1: module command fails, error is reported
reconcile loop 2: module is in backoff now, Unknown is reported
...
reconcile loop n: module command is attempted again, error is reported
reconcile loop n+1: module is in backoff again, Unknown is reported.

User is more likely to see "406 -- The device is offline or not sending status reports" which is not cool.
This has been seen before, one example: Azure#2066

The fix here is to keep track of status in Agent and report current status until it changes (either via a new error
  • Loading branch information
darobs authored Jul 22, 2020
1 parent 1fb21ff commit ee8ad66
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 9 deletions.
19 changes: 10 additions & 9 deletions edge-agent/src/Microsoft.Azure.Devices.Edge.Agent.Core/Agent.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public class Agent
readonly IAvailabilityMetric availabilityMetric;
IEnvironment environment;
DeploymentConfigInfo currentConfig;
DeploymentStatus status;

public Agent(
IConfigSource configSource,
Expand All @@ -59,6 +60,7 @@ public Agent(
this.environment = this.environmentProvider.Create(this.currentConfig.DeploymentConfig);
this.encryptionProvider = Preconditions.CheckNotNull(encryptionProvider, nameof(encryptionProvider));
this.availabilityMetric = Preconditions.CheckNotNull(availabilityMetric, nameof(availabilityMetric));
this.status = DeploymentStatus.Unknown;
Events.AgentCreated();
}

Expand Down Expand Up @@ -110,7 +112,6 @@ await deploymentConfigInfoJson.ForEachAsync(

public async Task ReconcileAsync(CancellationToken token)
{
DeploymentStatus status = DeploymentStatus.Unknown;
ModuleSet moduleSetToReport = null;
using (await this.reconcileLock.LockAsync(token))
{
Expand All @@ -127,7 +128,7 @@ public async Task ReconcileAsync(CancellationToken token)
DeploymentConfig deploymentConfig = deploymentConfigInfo.DeploymentConfig;
if (deploymentConfig.Equals(DeploymentConfig.Empty))
{
status = DeploymentStatus.Success;
this.status = DeploymentStatus.Success;
}
else
{
Expand All @@ -145,7 +146,7 @@ public async Task ReconcileAsync(CancellationToken token)

if (plan.IsEmpty)
{
status = DeploymentStatus.Success;
this.status = DeploymentStatus.Success;
}
else
{
Expand All @@ -155,7 +156,7 @@ public async Task ReconcileAsync(CancellationToken token)
await this.UpdateCurrentConfig(deploymentConfigInfo);
if (result)
{
status = DeploymentStatus.Success;
this.status = DeploymentStatus.Success;
}
}
catch (Exception ex) when (!ex.IsFatal())
Expand All @@ -172,28 +173,28 @@ public async Task ReconcileAsync(CancellationToken token)
switch (ex)
{
case ConfigEmptyException _:
status = new DeploymentStatus(DeploymentStatusCode.ConfigEmptyError, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.ConfigEmptyError, ex.Message);
Events.EmptyConfig(ex);
break;

case InvalidSchemaVersionException _:
status = new DeploymentStatus(DeploymentStatusCode.InvalidSchemaVersion, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.InvalidSchemaVersion, ex.Message);
Events.InvalidSchemaVersion(ex);
break;

case ConfigFormatException _:
status = new DeploymentStatus(DeploymentStatusCode.ConfigFormatError, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.ConfigFormatError, ex.Message);
Events.InvalidConfigFormat(ex);
break;

default:
status = new DeploymentStatus(DeploymentStatusCode.Failed, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.Failed, ex.Message);
Events.UnknownFailure(ex);
break;
}
}

await this.reporter.ReportAsync(token, moduleSetToReport, await this.environment.GetRuntimeInfoAsync(), this.currentConfig.Version, status);
await this.reporter.ReportAsync(token, moduleSetToReport, await this.environment.GetRuntimeInfoAsync(), this.currentConfig.Version, this.status);
Events.FinishedReconcile();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,121 @@ public async Task CurrentIsNotNullBecauseDesiredThrew()
mockPlanRunner.Verify(r => r.ExecuteAsync(1, It.IsAny<Plan>(), token), Times.Never);
}

[Fact]
public async void ReconcileAsyncExecuteAsyncIncompleteDefaulsUnknown()
{
var desiredModule = new TestModule("desired", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, Constants.DefaultPriority, new ConfigurationInfo("1"), null);
var currentModule = new TestModule("current", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, Constants.DefaultPriority, new ConfigurationInfo("1"), null);
var commandList = new List<ICommand>
{
new Mock<ICommand>().Object,
new Mock<ICommand>().Object,
};
var testPlan = new Plan(commandList);
var token = default(CancellationToken);
var runtimeInfo = Mock.Of<IRuntimeInfo>();
var deploymentConfig = new DeploymentConfig("1.0", runtimeInfo, new SystemModules(null, null), new Dictionary<string, IModule> { ["desired"] = desiredModule });
var deploymentConfigInfo = new DeploymentConfigInfo(0, deploymentConfig);
ModuleSet desiredSet = deploymentConfig.GetModuleSet();
ModuleSet currentSet = ModuleSet.Create(currentModule);

var mockConfigSource = new Mock<IConfigSource>();
var mockEnvironment = new Mock<IEnvironment>();
var mockPlanner = new Mock<IPlanner>();
var mockPlanRunner = new Mock<IPlanRunner>();
var mockReporter = new Mock<IReporter>();
var mockModuleIdentityLifecycleManager = new Mock<IModuleIdentityLifecycleManager>();
var configStore = Mock.Of<IEntityStore<string, string>>();
var mockEnvironmentProvider = Mock.Of<IEnvironmentProvider>(m => m.Create(It.IsAny<DeploymentConfig>()) == mockEnvironment.Object);
var serde = Mock.Of<ISerde<DeploymentConfigInfo>>();
var encryptionDecryptionProvider = Mock.Of<IEncryptionProvider>();
var availabilityMetric = Mock.Of<IAvailabilityMetric>();

mockConfigSource.Setup(cs => cs.GetDeploymentConfigInfoAsync())
.ReturnsAsync(deploymentConfigInfo);
mockEnvironment.Setup(env => env.GetModulesAsync(token))
.ReturnsAsync(currentSet);
mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(desiredSet, currentSet))
.ReturnsAsync(ImmutableDictionary<string, IModuleIdentity>.Empty);
mockPlanner.Setup(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty))
.Returns(Task.FromResult(testPlan));
mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(It.IsAny<ModuleSet>(), currentSet))
.Returns(Task.FromResult((IImmutableDictionary<string, IModuleIdentity>)ImmutableDictionary<string, IModuleIdentity>.Empty));
mockReporter.Setup(r => r.ReportAsync(token, It.IsAny<ModuleSet>(), It.IsAny<IRuntimeInfo>(), It.IsAny<long>(), DeploymentStatus.Unknown))
.Returns(Task.CompletedTask);
mockPlanRunner.SetupSequence(m => m.ExecuteAsync(It.IsAny<long>(), It.IsAny<Plan>(), It.IsAny<CancellationToken>()))
.ReturnsAsync(false);
var agent = new Agent(mockConfigSource.Object, mockEnvironmentProvider, mockPlanner.Object, mockPlanRunner.Object, mockReporter.Object, mockModuleIdentityLifecycleManager.Object, configStore, DeploymentConfigInfo.Empty, serde, encryptionDecryptionProvider, availabilityMetric);

await agent.ReconcileAsync(token);

mockEnvironment.Verify(env => env.GetModulesAsync(token), Times.Once());
mockPlanner.Verify(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty), Times.Once());
mockPlanRunner.VerifyAll();
mockReporter.VerifyAll();
}

[Fact]
public async void ReconcileAsyncExecuteAsyncIncompleteReportsLastState()
{
var desiredModule = new TestModule("desired", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, Constants.DefaultPriority, new ConfigurationInfo("1"), null);
var currentModule = new TestModule("current", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, Constants.DefaultPriority, new ConfigurationInfo("1"), null);
var commandList = new List<ICommand>
{
new Mock<ICommand>().Object,
new Mock<ICommand>().Object,
};
var testPlan = new Plan(commandList);
var token = default(CancellationToken);
var runtimeInfo = Mock.Of<IRuntimeInfo>();
var deploymentConfig = new DeploymentConfig("1.0", runtimeInfo, new SystemModules(null, null), new Dictionary<string, IModule> { ["desired"] = desiredModule });
var deploymentConfigInfo = new DeploymentConfigInfo(0, deploymentConfig);
ModuleSet desiredSet = deploymentConfig.GetModuleSet();
ModuleSet currentSet = ModuleSet.Create(currentModule);
var statuses = new List<DeploymentStatusCode>();

var mockConfigSource = new Mock<IConfigSource>();
var mockEnvironment = new Mock<IEnvironment>();
var mockPlanner = new Mock<IPlanner>();
var mockPlanRunner = new Mock<IPlanRunner>();
var mockReporter = new Mock<IReporter>();
var mockModuleIdentityLifecycleManager = new Mock<IModuleIdentityLifecycleManager>();
var configStore = Mock.Of<IEntityStore<string, string>>();
var mockEnvironmentProvider = Mock.Of<IEnvironmentProvider>(m => m.Create(It.IsAny<DeploymentConfig>()) == mockEnvironment.Object);
var serde = Mock.Of<ISerde<DeploymentConfigInfo>>();
var encryptionDecryptionProvider = Mock.Of<IEncryptionProvider>();
var availabilityMetric = Mock.Of<IAvailabilityMetric>();

mockConfigSource.Setup(cs => cs.GetDeploymentConfigInfoAsync())
.ReturnsAsync(deploymentConfigInfo);
mockEnvironment.Setup(env => env.GetModulesAsync(token))
.ReturnsAsync(currentSet);
mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(desiredSet, currentSet))
.ReturnsAsync(ImmutableDictionary<string, IModuleIdentity>.Empty);
mockPlanner.Setup(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty))
.Returns(Task.FromResult(testPlan));
mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(It.IsAny<ModuleSet>(), currentSet))
.Returns(Task.FromResult((IImmutableDictionary<string, IModuleIdentity>)ImmutableDictionary<string, IModuleIdentity>.Empty));
mockReporter.Setup(r => r.ReportAsync(token, It.IsAny<ModuleSet>(), It.IsAny<IRuntimeInfo>(), It.IsAny<long>(), It.IsAny<DeploymentStatus>()))
.Callback((CancellationToken _t, ModuleSet _m, IRuntimeInfo _r, long _v, DeploymentStatus d) => statuses.Add(d.Code))
.Returns(Task.CompletedTask);
// First call represents a command failure, 2nd call represents an execution backoff.
mockPlanRunner.SetupSequence(m => m.ExecuteAsync(It.IsAny<long>(), It.IsAny<Plan>(), It.IsAny<CancellationToken>()))
.ThrowsAsync(new Exception("generic exception"))
.ReturnsAsync(false);
var agent = new Agent(mockConfigSource.Object, mockEnvironmentProvider, mockPlanner.Object, mockPlanRunner.Object, mockReporter.Object, mockModuleIdentityLifecycleManager.Object, configStore, DeploymentConfigInfo.Empty, serde, encryptionDecryptionProvider, availabilityMetric);

await agent.ReconcileAsync(token);
await agent.ReconcileAsync(token);

mockEnvironment.Verify(env => env.GetModulesAsync(token), Times.Exactly(2));
mockPlanner.Verify(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty), Times.Exactly(2));
mockPlanRunner.VerifyAll();
mockReporter.Verify(r => r.ReportAsync(token, It.IsAny<ModuleSet>(), It.IsAny<IRuntimeInfo>(), It.IsAny<long>(), It.IsAny<DeploymentStatus>()), Times.Exactly(2));
Assert.Equal(DeploymentStatusCode.Failed, statuses[0]);
Assert.Equal(DeploymentStatusCode.Failed, statuses[1]);
}

[Fact]
public async Task ReportShutdownAsyncConfigTest()
{
Expand Down

0 comments on commit ee8ad66

Please sign in to comment.