diff --git a/.vscode/launch.json b/.vscode/launch.json index 751c7cb4..680f7f55 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,6 +1,15 @@ { "version": "0.2.0", "configurations": [ + { + "name": "Connect to server", + "type": "go", + "request": "attach", + "mode": "remote", + "remotePath": "${workspaceFolder}", + "port": 2345, + "host": "127.0.0.1" + }, { "name": "Launch Package", "type": "go", diff --git a/README.md b/README.md index de79ed44..23f495e5 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,9 @@ The following checks are available: 3. [DNS check](#check-dns) - `dns`: The `sparrow` is able to perform DNS resolution checks to monitor domain name system performance and reliability. The check has the ability to target specific domains or IPs for monitoring. +4. [Traceroute Check](#check-traceroute) - `traceroute`: The `sparrow` is able to perform traceroute checks to monitor + the network path to a target. The check has the ability to target specific domains or IPs for monitoring. + Each check is designed to provide comprehensive insights into the various aspects of network and service health, ensuring robust monitoring and quick detection of potential issues. @@ -435,6 +438,41 @@ dns: - Description: Histogram of response times for DNS checks - Labelled with `target` +### Check: Traceroute + +| Field | Type | Description | +| ----------------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `interval` | `duration` | Interval to perform the Traceroute check. | +| `timeout` | `duration` | Timeout for every hop. | +| `retries` | `integer` | Number of times to retry the traceroute for a target, if it fails. | +| `maxHops` | `integer` | Maximum number of hops to try before giving up. | +| `targets` | `list of objects` | List of targets to traceroute to. 
| +| `targets[].addr` | `string` | The address of the target to traceroute to. Can be an IP address or a DNS name. | +| `targets[].port` | `uint16` | The port of the target to traceroute to. Default is 80. | + +#### Example configuration + +```yaml + traceroute: + interval: 5s + timeout: 3s + retries: 3 + maxHops: 8 + targets: + - addr: 8.8.8.8 + port: 53 + - addr: www.google.com + port: 80 +``` + +#### Required Capabilities +To use this check, sparrow needs to be run with the `CAP_NET_RAW` capability or elevated privileges to be able to send raw packets. +Using the `CAP_NET_RAW` capability is recommended over running sparrow with `sudo`. + +```bash +sudo setcap 'cap_net_raw=ep' sparrow +``` + ## API The `sparrow` exposes an API for accessing the results of various checks. Each check registers its own endpoint diff --git a/go.mod b/go.mod index 1e3f9e2e..09669822 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,13 @@ require ( gopkg.in/yaml.v3 v3.0.1 ) +require ( + github.com/aeden/traceroute v0.0.0-20210211061815-03f5f7cb7908 + github.com/google/go-cmp v0.6.0 +) + +require github.com/mitchellh/mapstructure v1.5.0 // indirect + require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect @@ -28,7 +35,6 @@ require ( github.com/magiconair/properties v1.8.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect - github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect github.com/perimeterx/marshmallow v1.1.5 // indirect diff --git a/go.sum b/go.sum index 7c135d54..b1c89f63 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/aeden/traceroute v0.0.0-20210211061815-03f5f7cb7908 h1:6suDyKbvZ5r2G/gblQLV9Cdv7rdqNlUxsRXpLOF0rKM= +github.com/aeden/traceroute v0.0.0-20210211061815-03f5f7cb7908/go.mod h1:HPBB/4vaPt7NcN9l72/+IwsmDVQsa6AWM6ZDKJCLB9U=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= @@ -27,8 +29,8 @@ github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaS github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= diff --git a/pkg/checks/runtime/config.go b/pkg/checks/runtime/config.go index fd92c806..d922441f 100644 --- a/pkg/checks/runtime/config.go +++ b/pkg/checks/runtime/config.go @@ -25,15 +25,17 @@ import ( "github.com/caas-team/sparrow/pkg/checks/dns" "github.com/caas-team/sparrow/pkg/checks/health" "github.com/caas-team/sparrow/pkg/checks/latency" + "github.com/caas-team/sparrow/pkg/checks/traceroute" ) // Config holds the runtime configuration // for the various checks // the sparrow supports type Config struct { - Health *health.Config `yaml:"health" json:"health"` - Latency *latency.Config `yaml:"latency" json:"latency"` - Dns *dns.Config `yaml:"dns" json:"dns"` + Health *health.Config `yaml:"health" json:"health"` + Latency *latency.Config `yaml:"latency" json:"latency"` + Dns 
*dns.Config `yaml:"dns" json:"dns"` + Traceroute *traceroute.Config `yaml:"traceroute" json:"traceroute"` } // Empty returns true if no checks are configured @@ -63,6 +65,9 @@ func (c Config) Iter() []checks.Runtime { if c.Dns != nil { configs = append(configs, c.Dns) } + if c.Traceroute != nil { + configs = append(configs, c.Traceroute) + } return configs } @@ -78,6 +83,9 @@ func (c Config) size() int { if c.HasDNSCheck() { size++ } + if c.HasTracerouteCheck() { + size++ + } return size } @@ -96,6 +104,11 @@ func (c Config) HasDNSCheck() bool { return c.Dns != nil } +// HasTracerouteCheck returns true if a traceroute check is configured, i.e. the Traceroute field is not nil +func (c Config) HasTracerouteCheck() bool { + return c.Traceroute != nil +} + // HasCheck returns true if the check has a check with the given name configured func (c Config) HasCheck(name string) bool { switch name { @@ -105,6 +118,8 @@ func (c Config) HasCheck(name string) bool { return c.HasLatencyCheck() case dns.CheckName: return c.HasDNSCheck() + case traceroute.CheckName: + return c.HasTracerouteCheck() default: return false } @@ -125,6 +140,10 @@ func (c Config) For(name string) checks.Runtime { if c.HasDNSCheck() { return c.Dns } + case traceroute.CheckName: + if c.HasTracerouteCheck() { + return c.Traceroute + } } return nil } diff --git a/pkg/checks/traceroute/config.go b/pkg/checks/traceroute/config.go new file mode 100644 index 00000000..f489e10d --- /dev/null +++ b/pkg/checks/traceroute/config.go @@ -0,0 +1,51 @@ +package traceroute + +import ( + "fmt" + "net" + "net/url" + "time" + + "github.com/caas-team/sparrow/pkg/checks" +) + +// Config is the configuration for the traceroute check +type Config struct { + // Targets is a list of targets to traceroute to + Targets []Target `json:"targets" yaml:"targets" mapstructure:"targets"` + // Retries is the number of times to retry the traceroute for a target, if it fails + Retries int `json:"retries" yaml:"retries" mapstructure:"retries"` + // MaxHops is the
maximum number of hops to try before giving up + MaxHops int `json:"maxHops" yaml:"maxHops" mapstructure:"maxHops"` + // Interval is the time to wait between check iterations + Interval time.Duration `json:"interval" yaml:"interval" mapstructure:"interval"` + // Timeout is the maximum time to wait for a response from a hop + Timeout time.Duration `json:"timeout" yaml:"timeout" mapstructure:"timeout"` +} + +// For returns the name of the check this configuration belongs to +func (c *Config) For() string { + return CheckName +} + +// Validate checks the configuration and returns a checks.ErrInvalidConfig for the first invalid field, or nil +// Timeout and Interval must be greater than 0, Retries and MaxHops must not be negative +// and every target address must be non-empty and either an IP address or accepted by url.Parse +func (c *Config) Validate() error { + if c.Timeout <= 0 { + return checks.ErrInvalidConfig{CheckName: CheckName, Field: "traceroute.timeout", Reason: "must be greater than 0"} + } + if c.Interval <= 0 { + return checks.ErrInvalidConfig{CheckName: CheckName, Field: "traceroute.interval", Reason: "must be greater than 0"} + } + if c.Retries < 0 { + return checks.ErrInvalidConfig{CheckName: CheckName, Field: "traceroute.retries", Reason: "must not be negative"} + } + if c.MaxHops < 0 { + return checks.ErrInvalidConfig{CheckName: CheckName, Field: "traceroute.maxHops", Reason: "must not be negative"} + } + + for i, t := range c.Targets { + // url.Parse accepts the empty string, so an empty address has to be rejected explicitly + if t.Addr == "" { + return checks.ErrInvalidConfig{CheckName: CheckName, Field: fmt.Sprintf("traceroute.targets[%d].addr", i), Reason: "must not be empty"} + } + + if net.ParseIP(t.Addr) != nil { + continue + } + + if _, err := url.Parse(t.Addr); err != nil { + return checks.ErrInvalidConfig{CheckName: CheckName, Field: fmt.Sprintf("traceroute.targets[%d].addr", i), Reason: "invalid url or ip"} + } + } + return nil +} diff --git a/pkg/checks/traceroute/traceroute.go b/pkg/checks/traceroute/traceroute.go new file mode 100644 index 00000000..f2543e7c --- /dev/null +++ b/pkg/checks/traceroute/traceroute.go @@ -0,0 +1,199 @@ +package traceroute + +import ( + "context" + "sync" + "time" + + "github.com/aeden/traceroute" + "github.com/caas-team/sparrow/internal/logger" + "github.com/caas-team/sparrow/pkg/checks" + "github.com/getkin/kin-openapi/openapi3" + "github.com/prometheus/client_golang/prometheus" +) + +var _ checks.Check = (*Traceroute)(nil) + +// CheckName is the name of the traceroute check +const CheckName = "traceroute" + +// Target is a single destination to traceroute to +type Target struct { + // The address of the target to traceroute to.
Can be a DNS name or an IP address + Addr string `json:"addr" yaml:"addr" mapstructure:"addr"` + // The port to traceroute to + Port uint16 `json:"port" yaml:"port" mapstructure:"port"` +} + +// NewCheck creates a traceroute check with an empty config that uses newTraceroute to trace targets +func NewCheck() checks.Check { + return &Traceroute{ + config: Config{}, + traceroute: newTraceroute, + CheckBase: checks.CheckBase{ + Mu: sync.Mutex{}, + DoneChan: make(chan struct{}), + }, + } +} + +// Traceroute is the traceroute check +type Traceroute struct { + checks.CheckBase + // config is the current configuration of the check + config Config + // traceroute is the function called to trace a single target + traceroute tracerouteFactory +} + +// tracerouteFactory is the signature of the function that performs a single traceroute to dest +// newTraceroute, the implementation used by NewCheck, treats timeout as milliseconds +type tracerouteFactory func(dest string, port, timeout, retries, maxHops int) (traceroute.TracerouteResult, error) + +// newTraceroute copies its arguments into traceroute.TracerouteOptions and calls traceroute.Traceroute for dest +func newTraceroute(dest string, port, timeout, retries, maxHops int) (traceroute.TracerouteResult, error) { + opts := &traceroute.TracerouteOptions{} + opts.SetTimeoutMs(timeout) + opts.SetRetries(retries) + opts.SetMaxHops(maxHops) + opts.SetPort(port) + return traceroute.Traceroute(dest, opts) +} + +// result is the data reported for a single target +type result struct { + // The number of hops recorded for the target; check sets it to the length of the hop list returned by the traceroute + NumHops int + // The path taken to the destination + Hops []hop +} + +// hop is a single entry of the path: the reported host, the elapsed time and whether the hop succeeded +type hop struct { + Addr string + Latency time.Duration + Success bool +} + +// Run runs the check in a loop sending results to the provided channel +// It returns ctx.Err() when ctx is done and nil when a value is received from DoneChan +func (tr *Traceroute) Run(ctx context.Context, cResult chan checks.ResultDTO) error { + ctx, cancel := logger.NewContextWithLogger(ctx) + defer cancel() + log := logger.FromContext(ctx) + log.Info("Starting traceroute check", "interval", tr.config.Interval.String()) + + for { + select { + case <-ctx.Done(): + log.Error("Context canceled", "error", ctx.Err()) + return ctx.Err() + case <-tr.DoneChan: + return nil + // The interval is read again on every iteration, so a changed config applies to the next wait + case <-time.After(tr.config.Interval): + res := tr.check(ctx) + + // This send is not guarded by ctx or DoneChan; it blocks until cResult accepts the value + cResult <- checks.ResultDTO{ + Name: tr.Name(), + Result: &checks.Result{ + Data: res, + Timestamp: time.Now(), + }, + } + log.Debug("Successfully finished traceroute check run") + } + } +} + +// GetConfig returns the current configuration of the check +func (tr *Traceroute) GetConfig()
checks.Runtime { + tr.Mu.Lock() + defer tr.Mu.Unlock() + return &tr.config +} + +// check traces all configured targets concurrently and returns one result per target address +// A traceroute error is only logged; the target is then reported with an empty list of hops +func (tr *Traceroute) check(ctx context.Context) map[string]result { + log := logger.FromContext(ctx) + + // SetConfig can replace the config while the check is running, + // so take a copy under the lock and only use that copy below + tr.Mu.Lock() + cfg := tr.config + tr.Mu.Unlock() + + type internalResult struct { + addr string + res result + } + + var wg sync.WaitGroup + // Buffered with one slot per target, so no worker ever blocks on sending its result + cResult := make(chan internalResult, len(cfg.Targets)) + + for _, t := range cfg.Targets { + wg.Add(1) + go func(t Target) { + defer wg.Done() + l := log.With("target", t.Addr) + l.Debug("Running traceroute") + start := time.Now() + trace, err := tr.traceroute(t.Addr, int(t.Port), int(cfg.Timeout/time.Millisecond), cfg.Retries, cfg.MaxHops) + duration := time.Since(start) + if err != nil { + l.Error("Error running traceroute", "error", err) + } + + l.Debug("Ran traceroute", "result", trace, "duration", duration) + + r := result{ + NumHops: len(trace.Hops), + // Kept non-nil so that a target without hops still carries an empty list + Hops: make([]hop, 0, len(trace.Hops)), + } + + for _, h := range trace.Hops { + r.Hops = append(r.Hops, hop{ + Addr: h.Host, + Latency: h.ElapsedTime, + Success: h.Success, + }) + } + cResult <- internalResult{addr: t.Addr, res: r} + }(t) + } + + log.Debug("Waiting for traceroute checks to finish") + + // Close the channel once every worker is done, which ends the range loop below + go func() { + wg.Wait() + close(cResult) + }() + + res := make(map[string]result, len(cfg.Targets)) + for r := range cResult { + res[r.addr] = r.res + } + + log.Debug("All traceroute checks finished") + + return res +} + +// Shutdown is called once when the check is unregistered or sparrow shuts down +// It only closes DoneChan: a receive on a closed channel succeeds immediately, so Run returns +// and Shutdown cannot hang when Run is not (or no longer) receiving +func (tr *Traceroute) Shutdown(_ context.Context) error { + close(tr.DoneChan) + return nil +} + +// SetConfig is called once when the check is registered +// This is also called while the check is running, if the remote config is updated +// This should return an error if the config is invalid +func (tr *Traceroute) SetConfig(cfg checks.Runtime) error { + if cfg, ok := cfg.(*Config); ok { + tr.Mu.Lock() + defer tr.Mu.Unlock() + tr.config = *cfg + return nil + } + + return checks.ErrConfigMismatch{ + Expected: CheckName, +
Current: cfg.For(), + } +} + +// Schema returns an openapi3.SchemaRef of the result type returned by the check +func (tr *Traceroute) Schema() (*openapi3.SchemaRef, error) { + return checks.OpenapiFromPerfData[map[string]result](map[string]result{}) +} + +// GetMetricCollectors allows the check to provide prometheus metric collectors +// The traceroute check returns an empty list +func (tr *Traceroute) GetMetricCollectors() []prometheus.Collector { + return []prometheus.Collector{} +} + +// Name returns the name of the check +func (tr *Traceroute) Name() string { + return CheckName +} diff --git a/pkg/checks/traceroute/traceroute_test.go b/pkg/checks/traceroute/traceroute_test.go new file mode 100644 index 00000000..608e1c78 --- /dev/null +++ b/pkg/checks/traceroute/traceroute_test.go @@ -0,0 +1,139 @@ +package traceroute + +import ( + "context" + "net" + "sync" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + + "github.com/aeden/traceroute" + "github.com/caas-team/sparrow/pkg/checks" +) + +// TestCheck runs check with stubbed traceroute functions and compares the returned map with the expected results +func TestCheck(t *testing.T) { + cases := []struct { + name string + c *Traceroute + want map[string]result + }{ + { + name: "Success 5 hops", + c: newForTest(success(5), []string{"8.8.8.8"}), + want: map[string]result{ + "8.8.8.8": { + NumHops: 5, + Hops: []hop{ + {Addr: "0.0.0.0", Latency: 0 * time.Second, Success: false}, + {Addr: "0.0.0.1", Latency: 1 * time.Second, Success: false}, + {Addr: "0.0.0.2", Latency: 2 * time.Second, Success: false}, + {Addr: "0.0.0.3", Latency: 3 * time.Second, Success: false}, + {Addr: "google-public-dns-a.google.com", Latency: 69 * time.Second, Success: true}, + }, + }, + }, + }, + { + name: "Traceroute internal error fails silently", + c: newForTest(returnError(&net.DNSError{Err: "no such host", Name: "google.com", IsNotFound: true}), []string{"google.com"}), + want: map[string]result{ + "google.com": {Hops: []hop{}}, + }, + }, + } + + for _, c := range cases { + res := c.c.check(context.Background()) + + if !cmp.Equal(res, c.want) { + diff := cmp.Diff(res, c.want) +
t.Errorf("unexpected result: +want -got\n%s", diff) + } + } +} + +// newForTest builds a Traceroute that uses f to trace and has one Target per entry of targets +// Only the address of each target is set; all other config fields keep their zero value +func newForTest(f tracerouteFactory, targets []string) *Traceroute { + t := make([]Target, len(targets)) + for i, target := range targets { + t[i] = Target{Addr: target} + } + return &Traceroute{ + config: Config{ + Targets: t, + }, + traceroute: f, + CheckBase: checks.CheckBase{ + Mu: sync.Mutex{}, + DoneChan: make(chan struct{}), + }, + } +} + +// success produces a tracerouteFactory that returns a traceroute result with nHops hops +// The first nHops-1 hops are unsuccessful and use ipFromInt(i) as host, the last hop is the successful one +// nHops must be at least 1, otherwise indexing the last hop panics +func success(nHops int) tracerouteFactory { + return func(dest string, port, timeout, retries, maxHops int) (traceroute.TracerouteResult, error) { + hops := make([]traceroute.TracerouteHop, nHops) + for i := 0; i < nHops-1; i++ { + hops[i] = traceroute.TracerouteHop{ + Success: false, + N: nHops, + Host: ipFromInt(i), + ElapsedTime: time.Duration(i) * time.Second, + TTL: i, + } + } + hops[nHops-1] = traceroute.TracerouteHop{ + Success: true, + Address: [4]byte{8, 8, 8, 8}, + N: nHops, + Host: "google-public-dns-a.google.com", + ElapsedTime: 69 * time.Second, + TTL: nHops, + } + + return traceroute.TracerouteResult{ + DestinationAddress: hops[nHops-1].Address, + Hops: hops, + }, nil + } +} + +// returnError produces a tracerouteFactory that always returns an empty result together with err +func returnError(err error) tracerouteFactory { + return func(dest string, port, timeout, retries, maxHops int) (traceroute.TracerouteResult, error) { + return traceroute.TracerouteResult{}, err + } +} + +// ipFromInt takes in an int and builds an IP address from it +// Only the lowest 32 bits of i are used, one byte per octet +// Example: +// ipFromInt(300) -> 0.0.1.44 +func ipFromInt(i int) string { + b1 := i >> 24 & 0xFF + b2 := i >> 16 & 0xFF + b3 := i >> 8 & 0xFF + b4 := i & 0xFF + + return net.IPv4(byte(b1), byte(b2), byte(b3), byte(b4)).String() +} + +// TestIpFromInt checks ipFromInt against a few known values +func TestIpFromInt(t *testing.T) { + cases := []struct { + In int + Expected string + }{ + {In: 300, Expected: "0.0.1.44"}, + {In: 0, Expected: "0.0.0.0"}, + // Bits above the lowest 32 are ignored by ipFromInt; note that this constant does not fit into a 32-bit int + {In: (1 << 33) - 1, Expected: "255.255.255.255"}, + } + + for _, c := range cases { + t.Run("ipFromInt", func(t *testing.T) { +
actual := ipFromInt(c.In) + if c.Expected != actual { + t.Errorf("expected: %v, actual: %v", c.Expected, actual) + } + }) + } +} diff --git a/pkg/factory/factory.go b/pkg/factory/factory.go index 05e46d35..c65e6571 100644 --- a/pkg/factory/factory.go +++ b/pkg/factory/factory.go @@ -26,6 +26,7 @@ import ( "github.com/caas-team/sparrow/pkg/checks/health" "github.com/caas-team/sparrow/pkg/checks/latency" "github.com/caas-team/sparrow/pkg/checks/runtime" + "github.com/caas-team/sparrow/pkg/checks/traceroute" ) // newCheck creates a new check instance from the given name @@ -61,7 +62,8 @@ func NewChecksFromConfig(cfg runtime.Config) (map[string]checks.Check, error) { // registry is a convenience map to create new checks var registry = map[string]func() checks.Check{ - health.CheckName: health.NewCheck, - latency.CheckName: latency.NewCheck, - dns.CheckName: dns.NewCheck, + health.CheckName: health.NewCheck, + latency.CheckName: latency.NewCheck, + dns.CheckName: dns.NewCheck, + traceroute.CheckName: traceroute.NewCheck, } diff --git a/scripts/debug-elevated.sh b/scripts/debug-elevated.sh new file mode 100644 index 00000000..e3af8168 --- /dev/null +++ b/scripts/debug-elevated.sh @@ -0,0 +1,13 @@ +# Description: Debug the application with elevated privileges +# This is only necessary when debugging issues with the traceroute check, +# as it requires elevated privileges +# to create a raw socket +# +# Usage: +# 1. Create a config for debugging in .tmp/config.yaml and a .tmp/runtime.yaml +# +# 2. Run the following command from the root of the project +# ./scripts/debug-elevated.sh +# +# 3. Attach with the "Connect to server" configuration from .vscode/launch.json (dlv listens on 127.0.0.1:2345) +go build -gcflags '-N -l' -o .tmp/app ./ && sudo dlv exec --headless --listen=127.0.0.1:2345 --api-version=2 .tmp/app -- run --config .tmp/config.yaml