Skip to content

Commit

Permalink
Merge pull request #314 from alphagov/too-many-updates
Browse files Browse the repository at this point in the history
Implement retrying for transient DE errors
  • Loading branch information
csutter authored Aug 20, 2024
2 parents 29b7967 + 001499b commit dcce01b
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 125 deletions.
22 changes: 20 additions & 2 deletions app/services/discovery_engine/sync/operation.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
module DiscoveryEngine::Sync
class Operation
MAX_RETRIES_ON_ERROR = 3
WAIT_ON_ERROR = 3.seconds

def initialize(type, content_id, payload_version: nil, client: nil)
@type = type
@content_id = content_id
@payload_version = payload_version
@client = client || ::Google::Cloud::DiscoveryEngine.document_service(version: :v1)
@attempt = 1
end

private

attr_reader :type, :content_id, :payload_version, :client
attr_reader :type, :content_id, :payload_version, :client, :attempt

def sync
lock.acquire
Expand All @@ -26,8 +30,22 @@ def sync
log(Logger::Severity::INFO, "Ignored as newer version already synced")
end
rescue Google::Cloud::Error => e
if attempt < MAX_RETRIES_ON_ERROR
increment_counter("retry")
log(
Logger::Severity::WARN,
"Failed attempt #{attempt} to #{type} document (#{e.message}), retrying",
)
@attempt += 1
Kernel.sleep(WAIT_ON_ERROR)
retry
end

increment_counter("error")
log(Logger::Severity::ERROR, "Failed to #{type} document due to an error (#{e.message})")
log(
Logger::Severity::ERROR,
"Failed on attempt #{attempt} to #{type} document (#{e.message}), giving up",
)
GovukError.notify(e)
ensure
lock.release
Expand Down
80 changes: 29 additions & 51 deletions spec/services/discovery_engine/sync/delete_spec.rb
Original file line number Diff line number Diff line change
@@ -1,77 +1,44 @@
require_relative "shared_examples"

RSpec.describe DiscoveryEngine::Sync::Delete do
let(:client) { double("DocumentService::Client", delete_document: nil) }
let(:logger) { double("Logger", add: nil) }
subject(:sync) { described_class.new("some_content_id", payload_version: "1", client:) }

let(:lock) { instance_double(Coordination::DocumentLock, acquire: true, release: true) }
include_context "with sync context"

let(:version_cache) { instance_double(Coordination::DocumentVersionCache, sync_required?: sync_required, set_as_latest_synced_version: nil) }
let(:sync_required) { true }

before do
allow(Rails).to receive(:logger).and_return(logger)
allow(Rails.configuration).to receive(:discovery_engine_datastore_branch).and_return("branch")
allow(GovukError).to receive(:notify)

allow(Coordination::DocumentLock).to receive(:new).with("some_content_id").and_return(lock)
allow(Coordination::DocumentVersionCache).to receive(:new)
.with("some_content_id", payload_version: "1").and_return(version_cache)
end

context "when the delete succeeds" do
before do
described_class.new("some_content_id", payload_version: "1", client:).call
sync.call
end

it "deletes the document" do
expect(client).to have_received(:delete_document)
.with(name: "branch/documents/some_content_id")
end

it "sets the new latest remote version" do
expect(version_cache).to have_received(:set_as_latest_synced_version)
end

it "logs the delete operation" do
expect(logger).to have_received(:add).with(
Logger::Severity::INFO,
"[DiscoveryEngine::Sync::Delete] Successful delete content_id:some_content_id payload_version:1",
)
end

it "acquires and releases the lock" do
expect(lock).to have_received(:acquire)
expect(lock).to have_received(:release)
end
it_behaves_like "a successful sync operation", "delete"
end

context "when the incoming document doesn't require syncing" do
let(:sync_required) { false }

before do
described_class.new("some_content_id", payload_version: "1", client:).call
sync.call
end

it "does not delete the document" do
expect(client).not_to have_received(:delete_document)
end

it "does not set the remote version" do
expect(version_cache).not_to have_received(:set_as_latest_synced_version)
end

it "logs that the document hasn't been deleted" do
expect(logger).to have_received(:add).with(
Logger::Severity::INFO,
"[DiscoveryEngine::Sync::Delete] Ignored as newer version already synced content_id:some_content_id payload_version:1",
)
end
it_behaves_like "a noop sync operation"
end

context "when locking the document fails" do
before do
allow(lock).to receive(:acquire).and_return(false)

described_class.new("some_content_id", payload_version: "1", client:).call
sync.call
end

it "deletes the document regardless" do
Expand All @@ -85,7 +52,7 @@
before do
allow(client).to receive(:delete_document).and_raise(err)

described_class.new("some_content_id", payload_version: "1", client:).call
sync.call
end

it "logs the failure" do
Expand All @@ -100,24 +67,35 @@
end
end

context "when the delete fails for another reason" do
context "when failing consistently for the maximum number of attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
allow(client).to receive(:delete_document).and_raise(err)

described_class.new("some_content_id", payload_version: "1", client:).call
sync.call
end

it "logs the failure" do
expect(logger).to have_received(:add).with(
Logger::Severity::ERROR,
"[DiscoveryEngine::Sync::Delete] Failed to delete document due to an error (Something went wrong) content_id:some_content_id payload_version:1",
it_behaves_like "a failed sync operation after the maximum attempts", "delete"
end

context "when failing transiently but succeeding within the maximum attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
allow(client).to receive(:delete_document).and_invoke(
->(_) { raise err },
->(_) { raise err },
->(_) { double(name: "document-name") },
)

sync.call
end

it "send the error to Sentry" do
expect(GovukError).to have_received(:notify).with(err)
it "tries three times" do
expect(client).to have_received(:delete_document).exactly(3).times
end

it_behaves_like "a sync operation that eventually succeeds", "delete"
end
end
110 changes: 38 additions & 72 deletions spec/services/discovery_engine/sync/put_spec.rb
Original file line number Diff line number Diff line change
@@ -1,35 +1,25 @@
RSpec.describe DiscoveryEngine::Sync::Put do
let(:client) { double("DocumentService::Client", update_document: nil) }
let(:logger) { double("Logger", add: nil) }

let(:lock) { instance_double(Coordination::DocumentLock, acquire: true, release: true) }
let(:version_cache) { instance_double(Coordination::DocumentVersionCache, sync_required?: sync_required, set_as_latest_synced_version: nil) }
let(:sync_required) { true }
require_relative "shared_examples"

before do
allow(Rails).to receive(:logger).and_return(logger)
allow(Rails.configuration).to receive(:discovery_engine_datastore_branch).and_return("branch")
allow(GovukError).to receive(:notify)
RSpec.describe DiscoveryEngine::Sync::Put do
subject(:sync) do
described_class.new(
"some_content_id",
{ foo: "bar" },
content: "some content",
payload_version: "1",
client:,
)
end

allow(Coordination::DocumentLock).to receive(:new).with("some_content_id").and_return(lock)
include_context "with sync context"

allow(Coordination::DocumentVersionCache).to receive(:new)
.with("some_content_id", payload_version: "1").and_return(version_cache)
end
let(:sync_required) { true }

context "when updating the document succeeds" do
before do
allow(client).to receive(:update_document).and_return(
double(name: "document-name"),
)
allow(client).to receive(:update_document).and_return(double(name: "document-name"))

described_class.new(
"some_content_id",
{ foo: "bar" },
content: "some content",
payload_version: "1",
client:,
).call
sync.call
end

it "updates the document" do
Expand All @@ -47,88 +37,64 @@
)
end

it "acquires and releases the lock" do
expect(lock).to have_received(:acquire)
expect(lock).to have_received(:release)
end

it "sets the new latest remote version" do
expect(version_cache).to have_received(:set_as_latest_synced_version)
end

it "logs the put operation" do
expect(logger).to have_received(:add).with(
Logger::Severity::INFO,
"[DiscoveryEngine::Sync::Put] Successful put content_id:some_content_id payload_version:1",
)
end
it_behaves_like "a successful sync operation", "put"
end

context "when the incoming document doesn't need syncing" do
let(:sync_required) { false }

before do
described_class.new(
"some_content_id",
{ foo: "bar" },
content: "some content",
payload_version: "1",
client:,
).call
sync.call
end

it "does not update the document" do
expect(client).not_to have_received(:update_document)
end

it "does not set the remote version" do
expect(version_cache).not_to have_received(:set_as_latest_synced_version)
end

it "logs that the document hasn't been updated" do
expect(logger).to have_received(:add).with(
Logger::Severity::INFO,
"[DiscoveryEngine::Sync::Put] Ignored as newer version already synced content_id:some_content_id payload_version:1",
)
end
it_behaves_like "a noop sync operation"
end

context "when locking the document fails" do
before do
allow(lock).to receive(:acquire).and_return(false)

described_class.new(
"some_content_id",
{ foo: "bar" },
content: "some content",
payload_version: "1",
client:,
).call
sync.call
end

it "updates the document regardless" do
expect(client).to have_received(:update_document)
end
end

context "when updating the document fails" do
context "when failing consistently for the maximum number of attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
allow(client).to receive(:update_document).and_raise(err)

described_class.new("some_content_id", {}, payload_version: "1", client:).call
sync.call
end

it "logs the failure" do
expect(logger).to have_received(:add).with(
Logger::Severity::ERROR,
"[DiscoveryEngine::Sync::Put] Failed to put document due to an error (Something went wrong) content_id:some_content_id payload_version:1",
it_behaves_like "a failed sync operation after the maximum attempts", "put"
end

context "when failing transiently but succeeding within the maximum attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
allow(client).to receive(:update_document).and_invoke(
->(_) { raise err },
->(_) { raise err },
->(_) { double(name: "document-name") },
)

sync.call
end

it "sends the error to Sentry" do
expect(GovukError).to have_received(:notify).with(err)
it "tries three times" do
expect(client).to have_received(:update_document).exactly(3).times
end

it_behaves_like "a sync operation that eventually succeeds", "put"
end
end
Loading

0 comments on commit dcce01b

Please sign in to comment.