From 9ffcca70601fbd44b904bb971efffd6124a04e77 Mon Sep 17 00:00:00 2001
From: Shivam Raikundalia
Date: Tue, 3 Sep 2024 23:46:38 +0000
Subject: [PATCH] [Profiler] Handle Tensor Sizes/Strides Parsing Error (#134862)

Summary:
Currently some jobs are encountering the following trace, P1539415198.
This suggests that when we are parsing through tensors the path is prone
to encountering an invalid address. This is possibly occurring because
for some reason the sizes() and strides() of a Tensor seem to not be of
the same dimensions. We assume such when iterating through the shapes to
get the IValue generator. When browsing some of the tensor
implementations, I found that some of the size and stride paths are
different which could be the cause of this issue. Regardless, the
profiler should be flexible enough to handle such issues without
bringing down the whole main thread.

If the crashes still persist, it will still give us a data point as to
where they are occurring and we can rule out the strides/sizes as the
culprit.

Test Plan: This change doesn't break anything in the happy path, just
makes sure the bad path is not exited abruptly. We should use this in
order to debug what the events are having mismatching dimensions between
sizes and strides.

Differential Revision: D62008788

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134862
Approved by: https://github.com/aaronenyeshi
---
Review notes (free-form area; not part of the commit message):

* RawTensorMetadataBase constructor: the new sizes/strides assertion now
  checks `==`. TORCH_INTERNAL_ASSERT_DEBUG_ONLY takes the condition that
  is expected to hold (compare the `<= max` assertion just above it), so
  the previous `!=` reported "mismatching sizes and strides" exactly when
  the two ranks agreed and stayed silent when they differed.
* decode_tensor: each iterator is checked with exhausted() before it is
  dereferenced. On exhaustion a warning is logged and the partially
  filled sizes/strides vectors are returned instead of reading past the
  recorded data.
* RawTensorMetadataBase::dim_ is split into size_dim_ and stride_dim_ so
  the two loops in decode_tensor use independent counts. The Python "dim"
  attribute keeps its name and is bound to size_dim_.
* NOTE(review): template arguments (`<uint32_t>`, `<int64_t>`,
  `<py::object>`) appeared to have been stripped from this patch text and
  were restored; confirm them against the target tree before applying.

 torch/csrc/profiler/collection.cpp  | 30 +++++++++++++++++++++++++----
 torch/csrc/profiler/collection.h    |  3 ++-
 torch/csrc/profiler/python/init.cpp |  2 +-
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 3bfd0a4b8f3d9b..a4f53a5e10f99f 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -33,11 +33,18 @@ RawTensorMetadataBase::RawTensorMetadataBase(const at::Tensor& t)
     : data_{t.has_storage() ? t.storage().data() : nullptr},
       dtype_{t.scalar_type()},
       layout_{t.layout()},
-      dim_{static_cast<uint32_t>(t.sizes().size())} {
+      size_dim_{static_cast<uint32_t>(t.sizes().size())},
+      stride_dim_{static_cast<uint32_t>(t.strides().size())} {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       t.sizes().size() <= std::numeric_limits<uint32_t>::max(),
       "Cannot profile Tensors of size > uint32 max. Got dim: ",
       t.sizes().size());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      t.sizes().size() == t.strides().size(),
+      "Tensor has mismatching sizes and strides. Sizes: ",
+      t.sizes().size(),
+      " Strides: ",
+      t.strides().size());
 }
 
 RawTensorMetadata::RawTensorMetadata(const at::Tensor& t)
@@ -181,14 +188,29 @@ auto InputOutputEncoder::getIValueGenerator(const IOType& io_type) {
           ivals_it = ivalues_.begin(),
           io_type]() mutable {
     auto decode_tensor = [&]() -> TensorMetadata {
-      const auto& raw_metadata = *tensor_metadata_it++;
       std::vector<int64_t> sizes;
       std::vector<int64_t> strides;
-      for (C10_UNUSED const auto _ : c10::irange(raw_metadata.dim_)) {
+      if (tensor_metadata_it.exhausted()) {
+        LOG(WARNING)
+            << "Tensor metadata exhausted prematurely. Reported shapes may be inaccurate!";
+        return {RawTensorMetadata(), sizes, strides};
+      }
+      const auto& raw_metadata = *tensor_metadata_it++;
+      for (C10_UNUSED const auto _ : c10::irange(raw_metadata.size_dim_)) {
+        if (tensor_size_strides_it.exhausted()) {
+          LOG(WARNING)
+              << "Expected Tensor Size mismatch with raw Tensor metadata. Reported shapes may be inaccurate!";
+          return {raw_metadata, sizes, strides};
+        }
         sizes.push_back(*tensor_size_strides_it++);
       }
       if (raw_metadata.layout_ == at::kStrided) {
-        for (C10_UNUSED const auto _ : c10::irange(raw_metadata.dim_)) {
+        for (C10_UNUSED const auto _ : c10::irange(raw_metadata.stride_dim_)) {
+          if (tensor_size_strides_it.exhausted()) {
+            LOG(WARNING)
+                << "Expected Tensor Strides mismatch with raw Tensor metadata. Reported shapes may be inaccurate!";
+            return {raw_metadata, sizes, strides};
+          }
           strides.push_back(*tensor_size_strides_it++);
         }
       }
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index 716fdb910c01ea..e2bb603c387cc5 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -47,7 +47,8 @@ struct TORCH_API RawTensorMetadataBase {
   StorageImplData data_;
   c10::ScalarType dtype_{c10::ScalarType::Undefined};
   c10::Layout layout_{c10::Layout::Strided};
-  uint32_t dim_{0};
+  uint32_t size_dim_{0};
+  uint32_t stride_dim_{0};
 };
 
 // Collected during profiling.
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 25f93a2663dfb5..661646920632e2 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -441,7 +441,7 @@ void initPythonBindings(PyObject* module) {
             return py::reinterpret_borrow<py::object>(
                 torch::autograd::utils::wrap(metadata.dtype_));
           })
-      .def_readonly("dim", &TensorMetadata::dim_)
+      .def_readonly("dim", &TensorMetadata::size_dim_)
       .def_readonly("sizes", &TensorMetadata::sizes_)
       .def_readonly("strides", &TensorMetadata::strides_);
 