diff --git a/src/stirling/source_connectors/socket_tracer/bcc_bpf/socket_trace.c b/src/stirling/source_connectors/socket_tracer/bcc_bpf/socket_trace.c
index ba5df955dde..f51260d2108 100644
--- a/src/stirling/source_connectors/socket_tracer/bcc_bpf/socket_trace.c
+++ b/src/stirling/source_connectors/socket_tracer/bcc_bpf/socket_trace.c
@@ -35,8 +35,8 @@
 #include "src/stirling/upid/upid.h"
 
 // This keeps instruction count below BPF's limit of 4096 per probe.
-#define LOOP_LIMIT 42
 #define PROTOCOL_VEC_LIMIT 3
+#define MAX_FILLER_SIZE (1 * 1024 * 1024)  // 1MiB, taken from socket_trace.hpp
 
 const int32_t kInvalidFD = -1;
 
@@ -476,6 +476,10 @@ static __inline void perf_submit_buf(struct pt_regs* ctx, const enum traffic_dir
   } else if (buf_size_minus_1 < 0x7fffffff) {
     // If-statement condition above is only required to prevent clang from optimizing
     // away the `if (amount_copied > 0)` below.
+
+    // Here we truncate an iovec to MAX_MSG_SIZE (30KiB), then in user space we add a filler
+    // event if msg_size (the size of this iovec) > msg_buf_size. If this difference exceeds our
+    // filler max of 1MiB, we push an event with a gap to the data stream buffer.
     bpf_probe_read(&event->msg, MAX_MSG_SIZE, buf);
     amount_copied = MAX_MSG_SIZE;
   }
@@ -483,6 +487,23 @@ static __inline void perf_submit_buf(struct pt_regs* ctx, const enum traffic_dir
   // If-statement is redundant, but is required to keep the 4.14 verifier happy.
   if (amount_copied > 0) {
     event->attr.msg_buf_size = amount_copied;
+    // bytes_missed should be 0 if we didn't truncate amount_copied to MAX_MSG_SIZE above.
+    // Note that perf_submit_buf won't correctly set bytes_missed for perf_submit_iovecs
+    // when bytes_remaining > iov_size and we've reached the loop limit,
+    // because it takes only the size of the current iovec into account,
+    // and not the bytes remaining across all iovecs, which we drop due to the loop limit.
+    // In those cases we rely on the value set in perf_submit_iovecs.
+
+    // For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
+    // limit.
+    if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
+      if (event->attr.incomplete_chunk != kExceededLoopLimit) {
+        event->attr.bytes_missed = event->attr.msg_size - event->attr.msg_buf_size;
+      }
+      if (event->attr.bytes_missed > 0 && event->attr.incomplete_chunk == kFullyFormed) {
+        event->attr.incomplete_chunk = kUnknownGapReason;
+      }
+    }
     socket_data_events.perf_submit(ctx, event, sizeof(event->attr) + amount_copied);
   }
 }
@@ -493,12 +514,23 @@ static __inline void perf_submit_wrapper(struct pt_regs* ctx,
                                          struct socket_data_event_t* event) {
   int bytes_sent = 0;
   unsigned int i;
-
+  event->attr.incomplete_chunk = kFullyFormed;
+  event->attr.bytes_missed = 0;
 #pragma unroll
   for (i = 0; i < CHUNK_LIMIT; ++i) {
     const int bytes_remaining = buf_size - bytes_sent;
     const size_t current_size =
         (bytes_remaining > MAX_MSG_SIZE && (i != CHUNK_LIMIT - 1)) ? MAX_MSG_SIZE : bytes_remaining;
+    // For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
+    // limit.
+    if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
+      // Check if we have reached the chunk limit, but there are bytes left to capture beyond our
+      // max msg size.
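+      // (On the final iteration, the ternary above caps current_size at bytes_remaining rather
+      // than MAX_MSG_SIZE, so current_size > MAX_MSG_SIZE here implies leftover data that
+      // perf_submit_buf will truncate.)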
+      const bool chunks_not_fully_captured = i == CHUNK_LIMIT - 1 && current_size > MAX_MSG_SIZE;
+      if (chunks_not_fully_captured) {
+        event->attr.incomplete_chunk = kExceededChunkLimitAndMaxMsgSize;
+      }
+    }
     perf_submit_buf(ctx, direction, buf + bytes_sent, current_size, conn_info, event);
     bytes_sent += current_size;
 
@@ -516,15 +548,41 @@ static __inline void perf_submit_iovecs(struct pt_regs* ctx,
 // array order. That means they read or fill iov[0], then iov[1], and so on. They return the total
 // size of the written or read data. Therefore, when loop through the buffers, both the number of
 // buffers and the total size need to be checked. More details can be found on their man pages.
+  event->attr.incomplete_chunk = kFullyFormed;
+  event->attr.bytes_missed = 0;
   int bytes_sent = 0;
+  unsigned int i;
 #pragma unroll
-  for (int i = 0; i < LOOP_LIMIT && i < iovlen && bytes_sent < total_size; ++i) {
+  for (i = 0; i < LOOP_LIMIT && i < iovlen && bytes_sent < total_size; ++i) {
     struct iovec iov_cpy;
     BPF_PROBE_READ_VAR(iov_cpy, &iov[i]);
-
+    // Total bytes we have left to copy across all iovecs.
     const int bytes_remaining = total_size - bytes_sent;
+    // Bytes contained in this iovec (either the bytes we have left or the size of the iovec,
+    // whichever is smaller). This can be > MAX_MSG_SIZE and is then truncated in perf_submit_buf.
     const size_t iov_size = min_size_t(iov_cpy.iov_len, bytes_remaining);
+    // For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
+    // limit.
+    if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
+      // We have reached the loop limit, but there are iovecs left to capture.
+      const bool iovec_not_fully_captured = i == LOOP_LIMIT - 1 && i + 1 < iovlen;
+      // This iov exceeds MAX_MSG_SIZE and will be truncated in perf_submit_buf.
+      const bool iov_size_exceeds_max_msg_size = iov_size > MAX_MSG_SIZE;
+
+      if (iovec_not_fully_captured && iov_size_exceeds_max_msg_size) {
+        event->attr.incomplete_chunk = kExceededLoopLimitAndMaxMsgSize;
+      } else if (iovec_not_fully_captured) {
+        event->attr.incomplete_chunk = kExceededLoopLimit;
+        // perf_submit_buf won't correctly set bytes_missed for perf_submit_iovecs
+        // if bytes_remaining > iov_size and we've reached the loop limit,
+        // because it takes only the size of the current iovec into account;
+        // see min(iov_len, bytes_remaining) above.
+        event->attr.bytes_missed = bytes_remaining - iov_size;
+      } else if (iov_size_exceeds_max_msg_size) {
+        event->attr.incomplete_chunk = kIovSizeExceededMaxMsgSize;
+      }
+    }
     // TODO(oazizi/yzhao): Should switch this to go through perf_submit_wrapper.
     // We don't have the BPF instruction count to do so right now.
     perf_submit_buf(ctx, direction, iov_cpy.iov_base, iov_size, conn_info, event);
@@ -533,9 +591,6 @@ static __inline void perf_submit_iovecs(struct pt_regs* ctx,
     // Move the position for the next event.
     event->attr.pos += iov_size;
   }
-
-  // TODO(oazizi): If there is data left after the loop limit, we should still report the remainder
-  // with a data-less event.
 }
 
 /***********************************************************
@@ -887,6 +942,22 @@ static __inline void process_syscall_sendfile(struct pt_regs* ctx, uint64_t id,
   }
 
   event->attr.pos = conn_info->wr_bytes;
+  // For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
+  // limit.
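+  // Note: sendfile(2) moves data between file descriptors inside the kernel, so the payload
+  // never passes through a user-space buffer that we could copy here.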
+  if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
+    // Technically we drop all the data and just send the gap event; user space fills the gap
+    // with \0 bytes, up to 1MiB.
+    if (bytes_count > MAX_FILLER_SIZE) {
+      // If we exceed the max filler size (1MiB), we'll create a gap in the data stream buffer.
+      event->attr.incomplete_chunk = kSendFileExceededMaxFillerSize;
+    } else {
+      // If we don't exceed the max filler size for this sendfile, we record a complete
+      // kSendFile to keep track of it in our metrics. If filler is enabled (lazy parsing off),
+      // we will allocate a filler event in user space to fill the gap left by an empty sendfile.
+      event->attr.incomplete_chunk = kSendFile;
+    }
+    event->attr.bytes_missed = bytes_count;
+  }
   event->attr.msg_size = bytes_count;
   event->attr.msg_buf_size = 0;
   socket_data_events.perf_submit(ctx, event, sizeof(event->attr));
diff --git a/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.h b/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.h
index 8fe90b6230d..14fc411fb73 100644
--- a/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.h
+++ b/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.h
@@ -129,6 +129,7 @@ struct close_event_t {
 // This applies to messages that are over MAX_MSG_SIZE,
 // and effectively makes the maximum message size to be CHUNK_LIMIT*MAX_MSG_SIZE.
 #define CHUNK_LIMIT 4
+#define LOOP_LIMIT 42
 
 // Unique ID to all syscalls and a few other notable functions.
 // This applies to events sent to user-space.
@@ -162,6 +163,29 @@ enum source_function_t {
   kSSLRead,
 };
 
+// Keeps track of the reasons for missed data from BPF, resulting in
+// a gap in the data stream buffer (which we sometimes fill with null bytes).
+enum chunk_t {
+  kFullyFormed = 0,
+  // perf_submit_iovecs
+  kExceededLoopLimit = 1,
+  kIovSizeExceededMaxMsgSize = 2,
+  kExceededLoopLimitAndMaxMsgSize = 3,
+  // perf_submit_wrapper
+  kExceededChunkLimitAndMaxMsgSize = 4,
+  // process_syscall_sendfile
+  kSendFile = 5,
+  kSendFileExceededMaxFillerSize = 6,
+  // Filler event (populated in socket_trace.hpp) with size bytes_missed.
+  // TODO(@benkilimnik): Eventually we should remove the filler event
+  // and use lazy parsing across the board.
+  kFiller = 7,
+  // The gap we tried to fill was larger than the max filler size
+  // (kMaxFilledSizeBytes, currently 1MiB).
+  kIncompleteFiller = 8,
+  kHeaderEvent = 9,  // no gap
+  kUnknownGapReason = 10,
+};
+
 struct socket_data_event_t {
   // We split attributes into a separate struct, because BPF gets upset if you do lots of
   // size arithmetic. This makes it so that it's attributes followed by message.
@@ -195,8 +219,9 @@ struct socket_data_event_t {
   // Note that write/send have separate sequences than read/recv.
   uint64_t pos;
 
-  // The size of the original message. We use this to truncate msg field to minimize the amount
-  // of data being transferred.
+  // The size of the original message (or a chunk of a message if iovlen > 1,
+  // since each perf_submit passes one iovec as an event). We use
+  // this to truncate the msg field to minimize the amount of data being transferred.
   uint32_t msg_size;
 
   // The amount of data actually being sent to user space. This may be less than msg_size if
   // data was truncated, or if the data was not sent at all
   // (e.g. if the connection data tracking has been disabled).
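+  // If data is missing, bytes_missed below records the gap size and incomplete_chunk
+  // records the reason.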
   uint32_t msg_buf_size;
 
+  // Bytes we could not capture (the gap size in the data stream buffer).
+  // Currently tracks cases where we exceed CHUNK_LIMIT or LOOP_LIMIT, or truncate in such
+  // a way that we create a gap. Should be 0 if the incomplete_chunk enum is kFullyFormed.
+  uint32_t bytes_missed;
+
+  // Reason for the incomplete chunk, if present.
+  enum chunk_t incomplete_chunk;
+
   // Whether to prepend length header to the buffer for messages first inferred as Kafka. MySQL
   // may also use this in this future.
   // See infer_kafka_message in protocol_inference.h for details.
diff --git a/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.hpp b/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.hpp
index 372581ef150..ce7e0058e42 100644
--- a/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.hpp
+++ b/src/stirling/source_connectors/socket_tracer/bcc_bpf_intf/socket_trace.hpp
@@ -35,12 +35,22 @@
 // The file name is kept identical to its BPF counterpart as well.
 
 inline std::string ToString(const socket_data_event_t::attr_t& attr) {
-  return absl::Substitute(
+  // absl::Substitute handles at most 10 arguments after the format string,
+  // so we append bytes_missed and incomplete_chunk separately.
+  std::string base_str = absl::Substitute(
       "[ts=$0 conn_id=$1 protocol=$2 role=$3 dir=$4 ssl=$5 source_fn=$6 pos=$7 size=$8 "
-      "buf_size=$9]",
+      "buf_size=$9",
       attr.timestamp_ns, ToString(attr.conn_id), magic_enum::enum_name(attr.protocol),
       magic_enum::enum_name(attr.role), magic_enum::enum_name(attr.direction), attr.ssl,
       magic_enum::enum_name(attr.source_fn), attr.pos, attr.msg_size, attr.msg_buf_size);
+
+  std::string second_part = absl::Substitute(" bytes_missed=$0 incomplete_chunk=$1]",
+                                             attr.bytes_missed,
+                                             magic_enum::enum_name(attr.incomplete_chunk));
+
+  // second_part already begins with a separating space, so concatenate directly.
+  return absl::StrCat(base_str, second_part);
 }
 
 inline std::string ToString(const close_event_t& event) {
@@ -104,6 +114,8 @@ struct SocketDataEvent {
       header_event_ptr->attr.pos = attr.pos - kHeaderBufSize;
       header_event_ptr->attr.msg_buf_size = kHeaderBufSize;
       header_event_ptr->attr.msg_size = kHeaderBufSize;
+      header_event_ptr->attr.incomplete_chunk = kHeaderEvent;
+      header_event_ptr->attr.bytes_missed = 0;
 
       // Take the length_header from the original, fix byte ordering, and place
       // into length_header of the header_event.
@@ -124,7 +136,7 @@ struct SocketDataEvent {
   // For events that which couldn't transfer all its data, we have two options:
   // 1) A missing event.
   // 2) A filler event.
-  // A desired filler event is indicated by a msg_size > msg_buf_size when creating the BPF event.
+  // A desired filler event is indicated by bytes_missed > 0 when creating the BPF event.
   //
   // A filler event is used in particular for sendfile data.
   // We need a better long-term solution for this,
@@ -134,28 +146,51 @@ struct SocketDataEvent {
 
   DCHECK_GE(attr.msg_size, attr.msg_buf_size);
 
-  if (attr.msg_size > attr.msg_buf_size) {
-    VLOG(1) << "Adding filler to event";
+  // Note that msg_size - msg_buf_size != bytes_missed in the case where we exceed LOOP_LIMIT
+  // in perf_submit_iovecs, because one call to perf_submit_buf takes only the size of the
+  // current iovec into account, omitting the rest of the iovecs, which could not be submitted.
+  // As a result, we need to use bytes_missed to determine the size of the filler event.
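+  // For example, if the loop limit cuts off three 1KiB iovecs, the final submitted event has
+  // msg_size == msg_buf_size (its own iovec was copied in full), yet 3KiB never reached user
+  // space; only bytes_missed reflects that gap.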
+
+  // For kernels < 5.1, we cannot track the bytes missed in socket_trace.c properly, and thus
+  // preserve the previous behavior of encoding the bytes missed via msg_size.
+  // If our loop and chunk limits are at most 42 and 4, then we know that we can
+  // stay below the verifier instruction limit for kernels < 5.1.
+  if (LOOP_LIMIT <= 42 && CHUNK_LIMIT <= 4) {
+    if (attr.msg_size > attr.msg_buf_size) {
+      DCHECK_EQ(attr.bytes_missed, 0);
+      attr.bytes_missed = attr.msg_size - attr.msg_buf_size;
+    }
+  }
+  if (attr.bytes_missed > 0) {
+    VLOG(1) << absl::Substitute("Adding filler event for incomplete_chunk: $0, bytes_missed: $1",
+                                magic_enum::enum_name(attr.incomplete_chunk), attr.bytes_missed);
     // Limit the size so we don't have huge allocations.
     constexpr uint32_t kMaxFilledSizeBytes = 1 * 1024 * 1024;
     static char kZeros[kMaxFilledSizeBytes] = {0};
 
-    size_t filler_size = attr.msg_size - attr.msg_buf_size;
+    filler_event_ptr = std::make_unique<SocketDataEvent>();
+    filler_event_ptr->attr = attr;
+    size_t filler_size = attr.bytes_missed;
     if (filler_size > kMaxFilledSizeBytes) {
       VLOG(1) << absl::Substitute("Truncating filler event: $0->$1", filler_size,
                                   kMaxFilledSizeBytes);
       filler_size = kMaxFilledSizeBytes;
+      // Incomplete even after the filler (bytes_missed > 1MiB).
+      filler_event_ptr->attr.incomplete_chunk = kIncompleteFiller;
+      filler_event_ptr->attr.bytes_missed -= kMaxFilledSizeBytes;
+    } else {
+      // We encode the filler size in bytes_missed for filler events that completely plug a gap
+      // (chunk_t kFiller), so we can track them in our metrics. (In reality, bytes missed is 0,
+      // since the filler plugs the gap.) In all other circumstances bytes_missed represents the
+      // size of the gap.
+      filler_event_ptr->attr.incomplete_chunk = kFiller;
     }
 
-    filler_event_ptr = std::make_unique<SocketDataEvent>();
-    filler_event_ptr->attr = attr;
     filler_event_ptr->attr.pos = attr.pos + attr.msg_buf_size;
     filler_event_ptr->attr.msg_buf_size = filler_size;
     filler_event_ptr->attr.msg_size = filler_size;
    filler_event_ptr->msg = std::string_view(kZeros, filler_size);
 
     // We've created the filler event, so adjust the original event accordingly.
+    DCHECK_LE(filler_size, attr.bytes_missed);
     attr.msg_size = attr.msg_buf_size;
   }
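
A minimal sketch of the user-space filler policy implemented above, for review purposes.
DecideFiller and the trimmed-down enum are hypothetical illustrations (not part of the patch);
the 1MiB cap mirrors kMaxFilledSizeBytes in socket_trace.hpp.

  #include <cstdint>
  #include <cstdio>

  enum chunk_t { kFullyFormed = 0, kFiller = 7, kIncompleteFiller = 8 };

  struct FillerDecision {
    chunk_t chunk;          // what the filler event would be tagged as
    uint32_t filler_size;   // bytes of '\0' filler to emit
    uint32_t residual_gap;  // bytes still missing after the filler
  };

  // Plug up to 1MiB of a gap with zeros; anything beyond that remains a gap
  // in the data stream buffer.
  FillerDecision DecideFiller(uint32_t bytes_missed) {
    constexpr uint32_t kMaxFilledSizeBytes = 1 * 1024 * 1024;
    if (bytes_missed == 0) return {kFullyFormed, 0, 0};
    if (bytes_missed > kMaxFilledSizeBytes) {
      return {kIncompleteFiller, kMaxFilledSizeBytes, bytes_missed - kMaxFilledSizeBytes};
    }
    return {kFiller, bytes_missed, 0};
  }

  int main() {
    // A 3MiB sendfile gap: expect kIncompleteFiller with a 1MiB filler and a 2MiB residual gap.
    FillerDecision d = DecideFiller(3 * 1024 * 1024);
    std::printf("chunk=%d filler=%u residual=%u\n", static_cast<int>(d.chunk),
                static_cast<unsigned>(d.filler_size), static_cast<unsigned>(d.residual_gap));
    return 0;
  }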