From c9f0732c1f2bd6747cd473d4ad5172f02f7ff0b7 Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Sun, 3 May 2020 02:40:54 -0700 Subject: [PATCH 1/6] 0.17.4RC --- 3rdparty/half_float/half.hpp | 23 ++++++----- CMakeLists.txt | 2 +- Makefile | 2 +- README.md | 2 +- cmake/Dependencies.cmake | 3 +- cmake/Misc.cmake | 2 +- examples/web_demo/requirements.txt | 2 +- include/caffe/blob.hpp | 49 +++++++++++++++++++--- include/caffe/common.hpp | 34 +++------------- include/caffe/filler.hpp | 8 ++-- include/caffe/internal_thread.hpp | 27 +++++++----- include/caffe/parallel.hpp | 20 +++++---- include/caffe/solver.hpp | 5 ++- include/caffe/tensor.hpp | 2 +- python/caffe/io.py | 2 +- python/caffe/test/test_classification.py | 4 +- python/requirements.txt | 2 +- src/caffe/batch_transformer.cpp | 10 ++++- src/caffe/blob.cpp | 21 ++++++---- src/caffe/common.cpp | 14 ++++--- src/caffe/data_reader.cpp | 10 ++++- src/caffe/internal_thread.cpp | 52 +++++++++++++----------- src/caffe/layers/base_conv_layer.cpp | 2 +- src/caffe/layers/base_data_layer.cpp | 8 +++- src/caffe/layers/cudnn_conv_layer.cpp | 4 +- src/caffe/layers/inner_product_layer.cpp | 2 +- src/caffe/parallel.cpp | 32 ++++++++------- src/caffe/test/test_internal_thread.cpp | 6 +-- src/caffe/util/cudnn.cpp | 35 ---------------- src/caffe/util/gpu_memory.cpp | 8 ++-- src/caffe/util/math_functions.cpp | 6 +++ tools/caffe.cpp | 15 +++---- 32 files changed, 223 insertions(+), 191 deletions(-) diff --git a/3rdparty/half_float/half.hpp b/3rdparty/half_float/half.hpp index c477bbb4e7e..8a43f713fd8 100644 --- a/3rdparty/half_float/half.hpp +++ b/3rdparty/half_float/half.hpp @@ -349,7 +349,7 @@ namespace half_float struct binary_t {}; /// Tag for binary construction. - HALF_CONSTEXPR_CONST binary_t binary = binary_t(); + static HALF_CONSTEXPR_CONST binary_t binary = binary_t(); /// Temporary half-precision expression. /// This class represents a half-precision expression which just stores a single-precision value internally. @@ -1172,7 +1172,8 @@ namespace half_float /// Constructor. /// \param bits binary representation to set half to CAFFE_UTIL_HD - HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) : data_(bits) {} + HALF_CONSTEXPR half(detail::binary_t, unsigned int bits) HALF_NOEXCEPT + : data_(static_cast(bits)) {} /// Internal binary representation detail::uint16 data_; @@ -3067,32 +3068,32 @@ namespace std static HALF_CONSTEXPR_CONST int max_exponent10 = 4; /// Smallest positive normal value. - static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); } + static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x0400}; } /// Smallest finite value. - static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); } + static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0xFBFF}; } /// Largest finite value. - static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); } + static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7BFF}; } /// Difference between one and next representable value. 
- static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); } + static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x1400}; } /// Maximum rounding error. static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW - { return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); } + { return half_float::half{half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00}; } /// Positive infinity. - static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); } + static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7C00}; } /// Quiet NaN. - static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); } + static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7FFF}; } /// Signalling NaN. - static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); } + static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7DFF}; } /// Smallest positive subnormal value. - static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); } + static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x0001}; } }; diff --git a/CMakeLists.txt b/CMakeLists.txt index 818818a5587..f77e45af379 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ endif() caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) -set(python_version "2" CACHE STRING "Specify which Python version to use") +set(python_version "3" CACHE STRING "Specify which Python version to use") caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE) caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) diff --git a/Makefile b/Makefile index 2b0ad1aeba4..6bc07ffdf62 100644 --- a/Makefile +++ b/Makefile @@ -179,7 +179,7 @@ CUDA_LIB_DIR := # add /lib64 only if it exists ifneq ("$(wildcard $(CUDA_DIR)/lib64)","") CUDA_LIB_DIR += $(CUDA_DIR)/lib64 - CUDA_LIB_DIR += /usr/lib/nvidia-410 /usr/lib/nvidia-418 /usr/lib/nvidia-396 /usr/lib/nvidia-390 /usr/lib/nvidia-387 /usr/lib/nvidia-384 /usr/lib/nvidia-381 /usr/lib/nvidia-375 /usr/lib/nvidia-367 /usr/lib/nvidia-361 /usr/lib/nvidia-352 + CUDA_LIB_DIR += /usr/lib/nvidia-440 /usr/lib/nvidia-410 /usr/lib/nvidia-418 /usr/lib/nvidia-396 /usr/lib/nvidia-390 /usr/lib/nvidia-387 /usr/lib/nvidia-384 /usr/lib/nvidia-381 /usr/lib/nvidia-375 /usr/lib/nvidia-367 /usr/lib/nvidia-361 /usr/lib/nvidia-352 endif CUDA_LIB_DIR += $(CUDA_DIR)/lib diff --git a/README.md b/README.md index 888d5853059..bcfe94fa15b 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,6 @@ to your PR. Libturbojpeg library is used since 0.16.5. It has a packaging bug. 
Please execute the following (required for Makefile, optional for CMake): ``` -sudo apt-get install libturbojpeg +sudo apt-get install libturbojpeg libturbojpeg0-dev sudo ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so ``` \ No newline at end of file diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 6d685fdcf4c..d9e73753aee 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -108,7 +108,8 @@ if(BUILD_python) boost_python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR} PATHS ${LIBDIR}) if ("${Boost_PYTHON_FOUND}" STREQUAL "Boost_PYTHON_FOUND-NOTFOUND") - message(SEND_ERROR "Could NOT find Boost Python Library") + find_package(Boost 1.65 COMPONENTS "python${python_version}") + set(Boost_PYTHON_FOUND ${Boost_PYTHON${python_version}_FOUND}) else() message(STATUS "Found Boost Python Library ${Boost_PYTHON_FOUND}") list(APPEND Caffe_LINKER_LIBS ${Boost_PYTHON_FOUND}) diff --git a/cmake/Misc.cmake b/cmake/Misc.cmake index 9dd2609b36a..88c0f6b8c65 100644 --- a/cmake/Misc.cmake +++ b/cmake/Misc.cmake @@ -29,7 +29,7 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) endif() # ---[ RPATH settings -set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Use link paths for shared library rpath") +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON CACHE BOOLEAN "Use link paths for shared library rpath") set(CMAKE_MACOSX_RPATH TRUE) list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_systtem_dir) diff --git a/examples/web_demo/requirements.txt b/examples/web_demo/requirements.txt index 937ac1835b6..478694aff7f 100644 --- a/examples/web_demo/requirements.txt +++ b/examples/web_demo/requirements.txt @@ -3,5 +3,5 @@ flask tornado numpy pandas -pillow=>6.2.0 +pillow>=6.2.0 pyyaml diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 2241396bf13..6235272f64f 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -54,8 +54,14 @@ class Blob { explicit Blob(Type dtype) : Blob(dtype, dtype) {} - public: - virtual ~Blob() {} +public: + virtual ~Blob() = default; + + enum class RESHAPE_MODE: int { + RESHAPE_DIFF = -1, + RESHAPE_BOTH = 0, + RESHAPE_DATA = 1 + }; /// @brief Deprecated; use Reshape(const vector& shape). void Reshape(const int num, const int channels, const int height, const int width); @@ -75,8 +81,8 @@ class Blob { * an error; either Net::Forward or Net::Reshape need to be called to * propagate the new input shape to higher layers. */ - void Reshape(const vector& shape); - void Reshape(const BlobShape& shape); + void Reshape(const vector& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH); + void Reshape(const BlobShape& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH); void ReshapeLike(const Blob* other) { Reshape(other->shape()); @@ -111,9 +117,9 @@ class Blob { /// @brief Creates an instance of a Blob with given type Dtype and given shape. 
template - static shared_ptr create(const vector& shape) { + static shared_ptr create(const vector& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH) { shared_ptr ptr = create(); - ptr->Reshape(shape); + ptr->Reshape(shape, mode); return ptr; } @@ -303,6 +309,7 @@ class Blob { template void set_cpu_data(Dtype* data) { CHECK_NOTNULL(data); + ensure_data_count(); convert_data(tp()); CHECK(is_type(data_type())); data_tensor_->mutable_synced_mem()->set_cpu_data(data); @@ -311,6 +318,7 @@ class Blob { template void set_cpu_diff(Dtype* diff) { CHECK_NOTNULL(diff); + ensure_diff_count(); convert_diff(tp()); CHECK(is_type(diff_type())); diff_tensor_->mutable_synced_mem()->set_cpu_data(diff); @@ -318,18 +326,21 @@ class Blob { template const Dtype* cpu_data() const { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->synced_mem()->cpu_data()); } template const Dtype* cpu_diff() const { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->synced_mem()->cpu_data()); } template Dtype* mutable_cpu_data_c(bool copy_from_gpu) { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->mutable_synced_mem()->mutable_cpu_data(copy_from_gpu)); } @@ -341,6 +352,7 @@ class Blob { template Dtype* mutable_cpu_diff_c(bool copy_from_gpu) { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->mutable_synced_mem()->mutable_cpu_data(copy_from_gpu)); } @@ -390,36 +402,43 @@ class Blob { /// @brief Compute the sum of absolute values (L1 norm) of the diff. float asum_diff(int group = 0) const { + ensure_diff_count(); return diff_tensor_->asum(group); } /// @brief Compute the sum of squares (L2 norm squared) of the data. float sumsq_data(int group = 0) const { + ensure_data_count(); return data_tensor_->sumsq(group); } /// @brief Compute the sum of squares (L2 norm squared) of the diff. float sumsq_diff(int group = 0) const { + ensure_diff_count(); return diff_tensor_->sumsq(group); } /// @brief Scale the blob data by a constant factor. void scale_data(float scale, void* handle = nullptr) { + ensure_data_count(); data_tensor_->scale(scale, handle); } /// @brief Scale the blob diff by a constant factor. void scale_diff(float scale, void* handle = nullptr) { + ensure_diff_count(); diff_tensor_->scale(scale, handle); } /// @brief Set all the blob's data elements to a value. void set_data(float value) { + ensure_data_count(); data_tensor_->set(value); } /// @brief Set all the blob's diff elements to a value. 
void set_diff(float value) { + ensure_diff_count(); diff_tensor_->set(value); } @@ -480,28 +499,33 @@ class Blob { void set_gpu_data(void* data) { CHECK_NOTNULL(data); + ensure_data_count(); data_tensor_->mutable_synced_mem()->set_gpu_data(data); } void set_gpu_diff(void* diff) { CHECK_NOTNULL(diff); + ensure_diff_count(); diff_tensor_->mutable_synced_mem()->set_gpu_data(diff); } template const Dtype* gpu_data() const { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->synced_mem()->gpu_data()); } template const Dtype* gpu_diff() const { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->synced_mem()->gpu_data()); } template Dtype* mutable_gpu_data_c(bool copy_from_cpu) { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->mutable_synced_mem()->mutable_gpu_data(copy_from_cpu)); } @@ -513,6 +537,7 @@ class Blob { template Dtype* mutable_gpu_diff_c(bool copy_from_cpu) { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->mutable_synced_mem()->mutable_gpu_data(copy_from_cpu)); } @@ -573,6 +598,18 @@ class Blob { diff_tensor_->convert(new_diff_type); } + void ensure_data_count() const { + if (data_tensor_->is_empty() && !diff_tensor_->is_empty()) { + data_tensor_->Reshape(diff_tensor_->count_); + } + } + + void ensure_diff_count() const { + if (diff_tensor_->is_empty() && !data_tensor_->is_empty()) { + diff_tensor_->Reshape(data_tensor_->count_); + } + } + static float at(int offset, Type dtype, const void* data); static void cpu_axpy(int count, Type dtype, float alpha, const void* X, void* Y); static void gpu_axpy(int count, Type dtype, float alpha, const void* X, void* Y); diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index faa20cd0eb4..17e14aba617 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -159,9 +159,6 @@ class Caffe { static shared_ptr thread_pstream(int group = 0) { return Get().pstream(group); } - static shared_ptr short_term_cublas_phandle() { - return make_shared(); - } #ifdef USE_CUDNN static cudnnHandle_t cudnn_handle(int group) { return Get().th_cudnn_handle(group); @@ -232,14 +229,14 @@ class Caffe { /// All physical devices regardless of usage static int device_count(); // Parallel training info - static int solver_count() { + static size_t solver_count() { return solver_count_; } - /// NUmber of physical devices being used + // Number of physical devices being used static int device_in_use_per_host_count() { return (int)gpus_.size(); } - static void set_solver_count(int val) { + static void set_solver_count(size_t val) { if (solver_count_ != val) { std::lock_guard lock(caffe_mutex_); solver_count_ = val; @@ -260,27 +257,6 @@ class Caffe { static const std::vector& gpus() { return gpus_; } - static const std::string& caffe_version() { - return props().caffe_version(); - } - static const std::string& cudnn_version() { - return props().cudnn_version(); - } - static const std::string& cublas_version() { - return props().cublas_version(); - } - static const std::string& cuda_version() { - return props().cuda_version(); - } - static const std::string& cuda_driver_version() { - return props().cuda_driver_version(); - } - static std::string start_time() { - return props().start_time(); - } - static std::time_t init_time() { - return props().init_time(); - } static std::string time_from_init(); static int device_capability(int device) { return props().device_capability(device); @@ -325,7 +301,7 @@ class Caffe { // For example, if user runs 
`caffe train -gpu=1,0,3` then it has to be set to 1. static int root_device_; static Brew mode_; - static int solver_count_; + static size_t solver_count_; static std::vector gpus_; static int thread_count_; static int restored_iter_; @@ -362,9 +338,9 @@ class Caffe { class Properties { friend class Caffe; - public: Properties(); + public: const std::string& caffe_version() const { return caffe_version_; } diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 11fdd8dee5d..bb40b10100b 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -282,10 +282,10 @@ class XavierFiller : public Filler { CHECK(blob->count()); int fan_in = blob->count() / blob->num(); int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in + float n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); + n = (fan_in + fan_out) / 2.F; } else if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_FAN_OUT) { n = fan_out; @@ -334,10 +334,10 @@ class XavierStaticFiller : public Filler { xavier_static_filler_data_.resize(blob_count); int fan_in = blob_count / blob->num(); int fan_out = blob_count / blob->channels(); - Dtype n = fan_in; // default to fan_in + float n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); + n = (fan_in + fan_out) / 2.F; } else if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_FAN_OUT) { n = fan_out; diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index eba359cebef..a1e8b03346f 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -18,8 +18,8 @@ namespace caffe { */ class InternalThread { public: - InternalThread(int target_device, size_t rank_, size_t threads, bool delayed); - virtual ~InternalThread() {} + InternalThread(int target_device, size_t rank_, size_t threads, bool delayed, const std::string& name); + virtual ~InternalThread() = default; /** * Caffe's thread local state will be initialized using the current @@ -40,12 +40,16 @@ class InternalThread { void StopInternalThread(bool wait_all = true); void WaitAll(); - bool is_started(int id = 0) const { - return threads_[id].joinable(); + const string& get_name() const { + return name_; + } + + bool is_started(int child_id = 0) const { + return children_[child_id].joinable(); } size_t threads_num() const { - return threads_.size(); + return children_.size(); } void go() { @@ -62,19 +66,20 @@ class InternalThread { with the code you want your thread to run. */ virtual void InternalThreadEntry() {} - virtual void InternalThreadEntryN(size_t id) {} + virtual void InternalThreadEntryN(size_t child_id) {} /* Should be tested when running loops to exit when requested. 
*/ - bool must_stop(int id) { - return threads_[id].interruption_requested(); + bool must_stop(size_t child_id) { + return children_[child_id].interruption_requested(); } private: - void entry(int thread_id, int device, Caffe::Brew mode, uint64_t rand_seed, + void entry(size_t thread_id, int device, Caffe::Brew mode, uint64_t rand_seed, size_t rank, bool set_cpu_affinity); - - vector threads_; + std::uint32_t lwp_id_, lwp_id_parent_; + vector children_; vector> delay_flags_; + const std::string name_; }; } // namespace caffe diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index 279a52fe57a..d8cdf4cf439 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -62,10 +62,6 @@ class P2PManager { } #endif - void bar_wait(int b) { - bar_[b]->wait(); - } - static void Init(int *argc, char ***argv); static int global_rank() { @@ -80,7 +76,10 @@ class P2PManager { return host_name_; } - protected: + static unique_ptr solve_bar_; + static unique_ptr solved_bar_; + +protected: const size_t nranks_; vector> syncs_; shared_ptr> shared_; @@ -88,8 +87,6 @@ class P2PManager { #ifdef USE_NCCL ncclUniqueId nccl_id_; #endif - static unique_ptr bar_[3]; - static int global_rank_; static int global_count_; static char host_name_[_POSIX_HOST_NAME_MAX + 1]; @@ -101,14 +98,19 @@ class P2PSync : public Solver::Callback, public InternalThread { public: P2PSync(P2PManager* mgr, shared_ptr root_solver, int rank, int nranks, const SolverParameter& param); - virtual ~P2PSync(); + ~P2PSync(); // Divide the batch size by the number of solvers static unsigned int divide_batch_size(NetParameter* net); void allreduce(int param_id) override; void allreduce_bucket(size_t count, void* bucket, Type type) override; - void soft_barrier(int b) override; + void solve_barrier() override { + P2PManager::solve_bar_->wait(); + } + void solved_barrier() override { + P2PManager::solved_bar_->wait(); + } void cancel_all() override; void saveTestResults(float loss, const vector& scores) override; void aggregateTestResults(float* loss, vector* scores) override; diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 66652845934..9b865e4ea29 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -107,7 +107,8 @@ class Solver { public: virtual void allreduce(int param_id) = 0; virtual void allreduce_bucket(size_t count, void* bucket, Type type) = 0; - virtual void soft_barrier(int b) = 0; + virtual void solve_barrier() = 0; + virtual void solved_barrier() = 0; virtual void cancel_all() = 0; virtual void saveTestResults(float loss, const vector& scores) = 0; virtual void aggregateTestResults(float* loss, vector* scores) = 0; @@ -197,7 +198,7 @@ class Solver { void callback_soft_barrier() { if (callback_ != nullptr) { - callback_->soft_barrier(0); + callback_->solve_barrier(); } } diff --git a/include/caffe/tensor.hpp b/include/caffe/tensor.hpp index 975f1bc314a..61cab872b1c 100644 --- a/include/caffe/tensor.hpp +++ b/include/caffe/tensor.hpp @@ -20,7 +20,7 @@ class Tensor { public: explicit Tensor(Type type); - ~Tensor() {} + ~Tensor() = default; std::string to_string(int indent) const; diff --git a/python/caffe/io.py b/python/caffe/io.py index dcb24cc31f4..fc20c010df7 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -300,7 +300,7 @@ def load_image(filename, color=True): of size (H x W x 3) in RGB or of size (H x W x 1) in grayscale. 
""" - img = skimage.img_as_float(skimage.io.imread(filename, as_grey=not color)).astype(np.float32) + img = skimage.img_as_float(skimage.io.imread(filename, as_gray=not color)).astype(np.float32) if img.ndim == 2: img = img[:, :, np.newaxis] if color: diff --git a/python/caffe/test/test_classification.py b/python/caffe/test/test_classification.py index 823d6e28c9c..b19b859d5b2 100644 --- a/python/caffe/test/test_classification.py +++ b/python/caffe/test/test_classification.py @@ -10,7 +10,6 @@ from glob import glob from google.protobuf import text_format from PIL import Image -import scipy.misc # os.environ['GLOG_minloglevel'] = '2' # Suppress most caffe output import caffe @@ -80,9 +79,8 @@ def load_image(path, height, width, mode='RGB'): """ image = Image.open(path) image = image.convert(mode) - image = np.array(image) # squash - image = scipy.misc.imresize(image, (height, width), 'bilinear') + image = np.array(image.resize((height, width), Image.BILINEAR)) return image def forward_pass(images, net, transformer, batch_size=None): diff --git a/python/requirements.txt b/python/requirements.txt index 90964988645..d09f6e680f4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -13,6 +13,6 @@ pydotplus python-dateutil>=2 python-gflags>=2.0 pyyaml>=3.10 -six>=1.5.2,<=1.10.0 +six>=1.5.2,<=1.11.0 image>=1.5.16 jupyterlab diff --git a/src/caffe/batch_transformer.cpp b/src/caffe/batch_transformer.cpp index 97f0114af86..1033808dcbf 100644 --- a/src/caffe/batch_transformer.cpp +++ b/src/caffe/batch_transformer.cpp @@ -2,10 +2,16 @@ namespace caffe { +std::string bt_name(size_t rank, size_t queues_num) { + std::ostringstream os; + os << "BatchTransformer of rank " << rank << ", queues " << queues_num; + return os.str(); +} + template -BatchTransformer::BatchTransformer(int target_device, size_t rank_, +BatchTransformer::BatchTransformer(int target_device, size_t rank, size_t queues_num, const TransformationParameter& transform_param, bool gpu_transform) : - InternalThread(target_device, rank_, 1, false), + InternalThread(target_device, rank, 1, false, bt_name(rank, queues_num)), queues_num_(queues_num), next_batch_queue_(0UL), transform_param_(transform_param), diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 9c7f869cca3..86259dd2629 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -32,7 +32,7 @@ void Blob::Reshape(const int n) { Reshape(shape); } -void Blob::Reshape(const vector& shape) { +void Blob::Reshape(const vector& shape, RESHAPE_MODE mode) { CHECK_LE(shape.size(), kMaxBlobAxes); CHECK(data_tensor_); CHECK(diff_tensor_); @@ -51,19 +51,23 @@ void Blob::Reshape(const vector& shape) { shape_[i] = shape[i]; shape_data[i] = shape[i]; } - data_tensor_->Reshape(count_); - diff_tensor_->Reshape(count_); - CHECK(is_current_data_valid()); - CHECK(is_current_diff_valid()); + if (mode == RESHAPE_MODE::RESHAPE_DATA || mode == RESHAPE_MODE::RESHAPE_BOTH) { + data_tensor_->Reshape(count_); + CHECK(is_current_data_valid()); + } + if (mode == RESHAPE_MODE::RESHAPE_DIFF || mode == RESHAPE_MODE::RESHAPE_BOTH) { + diff_tensor_->Reshape(count_); + CHECK(is_current_diff_valid()); + } } -void Blob::Reshape(const BlobShape& shape) { +void Blob::Reshape(const BlobShape& shape, RESHAPE_MODE mode) { CHECK_LE(shape.dim_size(), kMaxBlobAxes); vector shape_vec(shape.dim_size()); for (int i = 0; i < shape.dim_size(); ++i) { shape_vec[i] = shape.dim(i); } - Reshape(shape_vec); + Reshape(shape_vec, mode); } const int* Blob::gpu_shape() const { @@ -73,6 +77,7 @@ const int* 
Blob::gpu_shape() const { void Blob::ShareData(const Blob& other) { CHECK_NE(this, &other); + other.ensure_data_count(); // CHECK(!other.IsSharedDataCycled()); if (data_tensor_.get() == other.data_tensor_.get()) { CHECK_EQ(data_shared_with_, &other); @@ -87,6 +92,7 @@ void Blob::ShareData(const Blob& other) { void Blob::ShareDiff(const Blob& other) { CHECK_NE(this, &other); + other.ensure_diff_count(); /// CHECK(!other.IsSharedDiffCycled()); if (diff_tensor_.get() == other.diff_tensor_.get()) { CHECK_EQ(diff_shared_with_, &other); @@ -409,6 +415,7 @@ void Blob::ToProto(BlobProto* proto, bool store_in_old_format, bool write_diff) return; } CHECK(is_current_data_valid()); + ensure_diff_count(); CHECK(is_current_diff_valid()); Type dt = data_type(); proto->clear_shape(); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index efe6d21c36e..d0bdb6b9a87 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -17,7 +17,7 @@ namespace caffe { // Must be set before brewing Caffe::Brew Caffe::mode_ = Caffe::GPU; -int Caffe::solver_count_ = 1; +size_t Caffe::solver_count_ = 1; std::vector Caffe::gpus_; int Caffe::root_device_ = -1; int Caffe::thread_count_ = 0; @@ -109,13 +109,14 @@ void Caffe::set_restored_iter(int val) { } void GlobalInit(int* pargc, char*** pargv) { - P2PManager::Init(pargc, pargv); // Google flags. ::gflags::ParseCommandLineFlags(pargc, pargv, true); // Google logging. ::google::InitGoogleLogging(*(pargv)[0]); // Provide a backtrace on segfault. ::google::InstallFailureSignalHandler(); + + P2PManager::Init(pargc, pargv); } int Caffe::device_count() { @@ -466,7 +467,7 @@ Caffe::Properties::Properties() : for (int gpu = 0; gpu < compute_capabilities_.size(); ++gpu) { CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpus[gpu])); compute_capabilities_[gpu] = device_prop.major * 100 + device_prop.minor; - DLOG(INFO) << "GPU " << gpus[gpu] << " '" << device_prop.name + LOG(INFO) << "GPU " << gpus[gpu] << " '" << device_prop.name << "' has compute capability " << device_prop.major << "." 
<< device_prop.minor; } #ifdef USE_CUDNN @@ -474,9 +475,10 @@ Caffe::Properties::Properties() : #else cudnn_version_ = "USE_CUDNN is not defined"; #endif - shared_ptr phandle = Caffe::short_term_cublas_phandle(); + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); int cublas_version = 0; - CUBLAS_CHECK(cublasGetVersion(phandle->get(), &cublas_version)); + CUBLAS_CHECK(cublasGetVersion(handle, &cublas_version)); cublas_version_ = std::to_string(cublas_version); int cuda_version = 0; @@ -492,7 +494,7 @@ std::string Caffe::time_from_init() { std::ostringstream os; os.unsetf(std::ios_base::floatfield); os.precision(4); - double span = std::difftime(std::time(NULL), init_time()); + double span = std::difftime(std::time(NULL), props().init_time()); const double mn = 60.; const double hr = 3600.; if (span < mn) { diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp index 456eae3ec96..d7ce0f495b3 100644 --- a/src/caffe/data_reader.cpp +++ b/src/caffe/data_reader.cpp @@ -14,6 +14,13 @@ std::mutex DataReader::db_mutex_; template std::mutex DataReader::DataCache::cache_mutex_{}; +std::string dr_name(size_t rank, size_t parser_threads_num, size_t transf_threads_num) { + std::ostringstream os; + os << "DataReader of local solver rank " << rank + << ", parser threads " << parser_threads_num << ", transf threads " << transf_threads_num; + return os.str(); +} + template DataReader::DataReader(const LayerParameter& param, size_t local_solver_count, @@ -27,7 +34,8 @@ DataReader::DataReader(const LayerParameter& param, bool shuffle, bool epoch_count_required) : InternalThread(Caffe::device(), - local_solver_rank, sample_only ? 1U : parser_threads_num, false), + local_solver_rank, sample_only ? 1U : parser_threads_num, false, + dr_name(local_solver_rank, parser_threads_num, transf_threads_num)), parser_threads_num_(threads_num()), transf_threads_num_(sample_only ? 
1U : transf_threads_num), queues_num_(parser_threads_num_ * transf_threads_num_), diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 688e9b591c3..b30b2fa3bec 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -7,11 +7,16 @@ namespace caffe { -InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed) +InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed, const std::string& name) : target_device_(target_device), rank_(rank), - threads_(threads), - delay_flags_(threads, make_shared(!delayed)) {} + lwp_id_(0), + lwp_id_parent_(caffe::lwp_id()), + children_(threads), + delay_flags_(threads, make_shared(!delayed)), + name_(name) { + LOG(INFO) << "InternalThread " << lwp_id_parent_ << ": " << name; +} void InternalThread::StartInternalThread(bool set_cpu_affinity, uint64_t random_seed) { CHECK(!is_started()) << "Threads should persist and not be restarted."; @@ -19,15 +24,15 @@ void InternalThread::StartInternalThread(bool set_cpu_affinity, uint64_t random_ #ifdef USE_MPI "{" << P2PManager::global_rank() << "} " #endif - "Starting " << threads_.size() << " internal thread(s) on device " << target_device_; + "Starting " << children_.size() << " internal thread(s) on device " << target_device_; Caffe::Brew mode = Caffe::mode(); if (mode == Caffe::GPU) { CHECK_GE(target_device_, 0); } try { - for (size_t id = 0; id < threads_.size(); ++id) { - threads_[id] = boost::thread(&InternalThread::entry, this, id, target_device_, mode, - random_seed, rank_, set_cpu_affinity); + for (size_t child_id = 0; child_id < children_.size(); ++child_id) { + children_[child_id] = boost::thread(&InternalThread::entry, this, child_id, + target_device_, mode, random_seed, rank_, set_cpu_affinity); } } catch (std::exception& e) { LOG(FATAL) << "Thread exception: " << e.what(); @@ -44,15 +49,15 @@ void InternalThread::RestartAllThreads(size_t new_threads, bool delayed, bool se if (mode == Caffe::GPU) { CHECK_GE(target_device_, 0); } - threads_.clear(); + children_.clear(); delay_flags_.clear(); - threads_.resize(new_threads); + children_.resize(new_threads); delay_flags_.resize(new_threads); try { - for (size_t id = 0; id < new_threads; ++id) { - CHECK(!is_started(id)); - delay_flags_[id] = make_shared(!delayed); - threads_[id] = boost::thread(&InternalThread::entry, this, id, + for (size_t child_id = 0; child_id < new_threads; ++child_id) { + CHECK(!is_started(child_id)); + delay_flags_[child_id] = make_shared(!delayed); + children_[child_id] = boost::thread(&InternalThread::entry, this, child_id, target_device_, mode, random_seed, rank_, set_cpu_affinity); } } catch (std::exception& e) { @@ -60,9 +65,10 @@ void InternalThread::RestartAllThreads(size_t new_threads, bool delayed, bool se } } -void InternalThread::entry(int thread_id, int device, Caffe::Brew mode, uint64_t random_seed, +void InternalThread::entry(size_t child_id, int device, Caffe::Brew mode, uint64_t random_seed, size_t rank, bool set_cpu_affinity) { - delay_flags_[thread_id]->wait(); + lwp_id_ = caffe::lwp_id(); + delay_flags_[child_id]->wait(); if (mode == Caffe::GPU) { CHECK_GE(device, 0); } @@ -82,17 +88,17 @@ void InternalThread::entry(int thread_id, int device, Caffe::Brew mode, uint64_t nvml::setCpuAffinity(device); #endif } - if (threads_.size() == 1) { + if (children_.size() == 1) { InternalThreadEntry(); } else { - InternalThreadEntryN(thread_id); + InternalThreadEntryN(child_id); } } void 
InternalThread::StopInternalThread(bool wait_all) { - for (size_t id = 0; id < threads_.size(); ++id) { - if (is_started(id)) { - threads_[id].interrupt(); + for (size_t child_id = 0; child_id < children_.size(); ++child_id) { + if (is_started(child_id)) { + children_[child_id].interrupt(); } } if (wait_all) { @@ -102,9 +108,9 @@ void InternalThread::StopInternalThread(bool wait_all) { void InternalThread::WaitAll() { try { - for (size_t id = 0; id < threads_.size(); ++id) { - if (is_started(id)) { - threads_[id].join(); + for (size_t child_id = 0; child_id < children_.size(); ++child_id) { + if (is_started(child_id)) { + children_[child_id].join(); } } } catch (boost::thread_interrupted&) { diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index c223a3c5a8b..8c68928cf60 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -150,7 +150,7 @@ void BaseConvolutionLayer::LayerSetUp(const vector& bottom, } // Initialize and fill the weights: // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0] = Blob::create(weight_shape); + this->blobs_[0] = Blob::create(weight_shape, Blob::RESHAPE_MODE::RESHAPE_DATA); shared_ptr> weight_filler( GetFiller(this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 0b9a76a61de..f84a7828ade 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -56,11 +56,17 @@ void BaseDataLayer::LayerSetUp(const vector& bottom, DataLayerSetUp(bottom, top); } +std::string bpdl_name(size_t rank) { + std::ostringstream os; + os << "BasePrefetchingDataLayer of local solver rank " << rank; + return os.str(); +} + template BasePrefetchingDataLayer::BasePrefetchingDataLayer(const LayerParameter& param, size_t solver_rank) : BaseDataLayer(param, threads(param)), - InternalThread(Caffe::device(), solver_rank, threads(param), false), + InternalThread(Caffe::device(), solver_rank, threads(param), false, bpdl_name(solver_rank)), auto_mode_(Caffe::mode() == Caffe::GPU && this->phase_ == TRAIN && auto_mode(param)), parsers_num_(parser_threads(param)), transf_num_(threads(param)), diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 49795c93a1d..891b14f4b6c 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -308,6 +308,7 @@ template void CuDNNConvolutionLayer::Reshape( const vector& bottom, const vector& top) { // Check whether cached descriptors have been initialized. + bool sizes_changed = false; if (initialized_cached_descs_) { // Check whether bottom and conv descriptors have changed, // which then requires a new reshape and set algo. @@ -316,6 +317,7 @@ void CuDNNConvolutionLayer::Reshape( IsConvDescChanged(bottom, true) || (this->phase_ == TRAIN && IsConvDescChanged(bottom, false))) { use_reshape_ = true; + sizes_changed = true; } else { // When no reshape is needed, setting algo may be still needed // (for example, if we are at iteration 1). 
@@ -427,7 +429,7 @@ void CuDNNConvolutionLayer::Reshape( align_up<8>(this->weight_offset_ * tsize(tpmax()))); } - if (fwd_count_ == 0UL) { + if (sizes_changed || fwd_count_ == 0UL) { AllocateWorkspace(bottom.size()); } // Ask cuDNN to find the best algorithm diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 7fa4d7c4e65..f0cb1994f09 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -37,7 +37,7 @@ InnerProductLayer::LayerSetUp(const vector& bottom, const v weight_shape[1] = K_; } // CPU filler always 32 bits - this->blobs_[0] = Blob::create(weight_shape); + this->blobs_[0] = Blob::create(weight_shape, Blob::RESHAPE_MODE::RESHAPE_DATA); shared_ptr> weight_filler( GetFiller(this->layer_param_.inner_product_param().weight_filler())); diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp index 75b44a323ae..ad2561cd6a1 100644 --- a/src/caffe/parallel.cpp +++ b/src/caffe/parallel.cpp @@ -19,7 +19,8 @@ namespace caffe { int P2PManager::global_rank_ = 0; int P2PManager::global_count_ = 1; char P2PManager::host_name_[_POSIX_HOST_NAME_MAX + 1]; -unique_ptr P2PManager::bar_[3]; +unique_ptr P2PManager::solve_bar_; +unique_ptr P2PManager::solved_bar_; void P2PManager::Init(int *argc, char ***argv) { #ifdef USE_MPI @@ -58,9 +59,8 @@ P2PManager::P2PManager(shared_ptr root_solver, nranks_(nranks), syncs_(devices), root_solver_(root_solver) { - bar_[0].reset(new boost::barrier(devices)); - bar_[1].reset(new boost::barrier(devices)); - bar_[2].reset(new boost::barrier(devices)); + solve_bar_.reset(new boost::barrier(devices)); + solved_bar_.reset(new boost::barrier(devices)); #ifndef USE_NCCL LOG(FATAL) << "USE_NCCL must be specified for multi-GPU mode"; #else @@ -156,9 +156,16 @@ void P2PManager::cancel_all(P2PSync* killed) { } } +std::string p2p_sync_name(bool is_root, int rank, int nranks) { + std::ostringstream os; + os << (is_root ? "ROOT " : "") << "P2PSync of rank " << rank << " out of " << nranks; + return os.str(); +} + P2PSync::P2PSync(P2PManager* mgr, shared_ptr root_solver, int rank, int nranks, const SolverParameter& solver_param) - : InternalThread(solver_param.device_id(), rank, 1, false), + : InternalThread(solver_param.device_id(), rank, 1, false, + p2p_sync_name(!root_solver, rank, nranks)), mgr_(mgr), rank_(rank), nranks_(nranks), @@ -188,7 +195,7 @@ void P2PSync::InternalThreadEntry() { solver_.swap(root_solver_); solver_->root_add_callback(this); } - soft_barrier(0); + solve_barrier(); if (!root) { Caffe::set_root_solver(false); solver_.reset(caffe::SolverRegistry::CreateSolver(solver_param_, root_solver_.get(), rank_)); @@ -214,12 +221,12 @@ void P2PSync::InternalThreadEntry() { rank_)); NCCL_CHECK(ncclGroupEnd()); #else - soft_barrier(0); + solve_barrier(); NCCL_CHECK(ncclCommInitRank(&nccl_comm_, nranks_, mgr_->nccl_id(), rank_)); - soft_barrier(0); + solve_barrier(); #endif #endif @@ -239,13 +246,8 @@ void P2PSync::InternalThreadEntry() { solver_->Solve(); - soft_barrier(2); - DLOG(INFO) << " Leaving P2PSync thread " << lwp_id(); -} - -void P2PSync::soft_barrier(int b) { - // CPU barrier to avoid busy-polling on the GPU. 
- mgr_->bar_wait(b); + solved_barrier(); + LOG(INFO) << " Leaving P2PSync thread " << lwp_id() << " '" << this->get_name() << "'"; } void P2PSync::cancel_all() { diff --git a/src/caffe/test/test_internal_thread.cpp b/src/caffe/test/test_internal_thread.cpp index f08f72c1de2..b447026b712 100644 --- a/src/caffe/test/test_internal_thread.cpp +++ b/src/caffe/test/test_internal_thread.cpp @@ -12,7 +12,7 @@ namespace caffe { class InternalThreadTest : public ::testing::Test {}; TEST_F(InternalThreadTest, TestStartAndExit) { - InternalThread thread(Caffe::device(), 0U, 1, false); + InternalThread thread(Caffe::device(), 0U, 1, false, "TestStartAndExit"); EXPECT_FALSE(thread.is_started()); thread.StartInternalThread(); EXPECT_TRUE(thread.is_started()); @@ -26,7 +26,7 @@ class TestThreadA : public InternalThread { } public: TestThreadA(int device = Caffe::device()) - : InternalThread(device, 0U, 1, false) {} + : InternalThread(device, 0U, 1, false, "TestThreadA") {} }; class TestThreadB : public InternalThread { @@ -35,7 +35,7 @@ class TestThreadB : public InternalThread { } public: TestThreadB(int device = Caffe::device()) - : InternalThread(device, 0U, 1, false) {} + : InternalThread(device, 0U, 1, false, "TestThreadB") {} }; TEST_F(InternalThreadTest, TestRandomSeed) { diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp index 3a488b72a58..b008e6f4d1e 100644 --- a/src/caffe/util/cudnn.cpp +++ b/src/caffe/util/cudnn.cpp @@ -1,41 +1,6 @@ #ifdef USE_CUDNN #include "caffe/util/cudnn.hpp" -const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; -#if CUDNN_VERSION_MIN(6, 0, 1) - case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: - return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; -#endif - default: - break; - } - return "Unknown cudnn status"; -} - - namespace caffe { namespace cudnn { diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index d48c2b95031..6bce127d0cb 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -55,16 +55,16 @@ bool GPUMemory::Workspace::try_reserve(size_t size, int device) { } GPUMemory::Manager::Manager() : debug_(false), initialized_(false) { - const int count = Caffe::device_count(); - dev_info_.resize(count); - update_thresholds_.resize(count); } - void GPUMemory::Manager::init(const vector& gpus, bool debug) { if (initialized_) { return; } + const int count = Caffe::device_count(); + dev_info_.resize(count); + update_thresholds_.resize(count); + bool debug_env = getenv("DEBUG_GPU_MEM") != 0; debug_ = debug || debug_env; try { diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 525cf156139..71d94458fb4 100644 --- a/src/caffe/util/math_functions.cpp +++ 
b/src/caffe/util/math_functions.cpp @@ -369,6 +369,12 @@ template void caffe_rng_uniform(int n, Ftype a, Ftype b, Blob* blob) { CHECK_GE(n, 0); CHECK_LE(a, b); + + if (Caffe::mode() == Caffe::GPU && n >= 1000000) { + caffe_gpu_rng_uniform(n, a, b, blob->mutable_gpu_data_c(false)); + return; + } + boost::uniform_real random_distribution(a, caffe_nextafter(b)); boost::variate_generator > variate_generator(caffe_rng(), random_distribution); diff --git a/tools/caffe.cpp b/tools/caffe.cpp index f42c1022b01..6cc1cfec4cb 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -698,13 +698,14 @@ int main(int argc, char** argv) { get_gpus(&gpus); Caffe::SetDevice(gpus.size() > 0 ? gpus[0] : 0); Caffe::set_gpus(gpus); - - LOG(INFO) << "This is NVCaffe " << Caffe::caffe_version() - << " started at " << Caffe::start_time(); - LOG(INFO) << "CuDNN version: " << Caffe::cudnn_version(); - LOG(INFO) << "CuBLAS version: " << Caffe::cublas_version(); - LOG(INFO) << "CUDA version: " << Caffe::cuda_version(); - LOG(INFO) << "CUDA driver version: " << Caffe::cuda_driver_version(); + Caffe::Properties& props = Caffe::props(); + + LOG(INFO) << "This is NVCaffe " << props.caffe_version() + << " started at " << props.start_time(); + LOG(INFO) << "CuDNN version: " << props.cudnn_version(); + LOG(INFO) << "CuBLAS version: " << props.cublas_version(); + LOG(INFO) << "CUDA version: " << props.cuda_version(); + LOG(INFO) << "CUDA driver version: " << props.cuda_driver_version(); LOG(INFO) << "Arguments: " << os.str(); if (argc == 2) { From e3ffb453f5f83144e15f42b939e14119f326366f Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Sun, 3 May 2020 23:59:19 -0700 Subject: [PATCH 2/6] 0.17.4RC --- include/caffe/blob.hpp | 5 +++-- include/caffe/internal_thread.hpp | 3 ++- include/caffe/parallel.hpp | 2 +- python/requirements.txt | 3 ++- src/caffe/internal_thread.cpp | 3 ++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 6235272f64f..6efdc6701c3 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -54,7 +54,7 @@ class Blob { explicit Blob(Type dtype) : Blob(dtype, dtype) {} -public: + public: virtual ~Blob() = default; enum class RESHAPE_MODE: int { @@ -117,7 +117,8 @@ class Blob { /// @brief Creates an instance of a Blob with given type Dtype and given shape. 
template - static shared_ptr create(const vector& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH) { + static shared_ptr create(const vector& shape, + RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH) { shared_ptr ptr = create(); ptr->Reshape(shape, mode); return ptr; diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index a1e8b03346f..cdb65253d3e 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -18,7 +18,8 @@ namespace caffe { */ class InternalThread { public: - InternalThread(int target_device, size_t rank_, size_t threads, bool delayed, const std::string& name); + InternalThread(int target_device, size_t rank_, size_t threads, bool delayed, + const std::string& name); virtual ~InternalThread() = default; /** diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index d8cdf4cf439..bd633313610 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -79,7 +79,7 @@ class P2PManager { static unique_ptr solve_bar_; static unique_ptr solved_bar_; -protected: + protected: const size_t nranks_; vector> syncs_; shared_ptr> shared_; diff --git a/python/requirements.txt b/python/requirements.txt index d09f6e680f4..260e91635b4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,6 @@ Cython>=0.19.2 -numpy==1.14.5 +gfortran +numpy>=1.18.1 scipy>=0.13.3,<=0.17.0 scikit-image>=0.9.3 matplotlib>=1.5.2 diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index b30b2fa3bec..e7d12ee4cb0 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -7,7 +7,8 @@ namespace caffe { -InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed, const std::string& name) +InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed, + const std::string& name) : target_device_(target_device), rank_(rank), lwp_id_(0), From f936ed65183a2d25337f5ac4223e6ff31249dc65 Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 4 May 2020 01:51:48 -0700 Subject: [PATCH 3/6] 0.17.4RC - Travis & Python 3 --- .travis.yml | 14 +++++++------- Makefile.config.example | 12 ++++++------ scripts/travis/defaults.sh | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6478c866904..ee1ee2f0b9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -dist: trusty +dist: bionic sudo: required language: cpp @@ -14,14 +14,14 @@ env: # WITH_PYTHON3: false # WITH_CUDA: const true since v0.17 # WITH_CUDNN: false - - BUILD_NAME="default-make" -# - BUILD_NAME="python3-make" WITH_PYTHON3=true - - BUILD_NAME="cudnn-make" WITH_CUDNN=true +# - BUILD_NAME="default-make" + - BUILD_NAME="python3-make" WITH_PYTHON3=true + - BUILD_NAME="python3-cudnn-make" WITH_PYTHON3=true WITH_CUDNN=true - - BUILD_NAME="default-cmake" WITH_CMAKE=true +# - BUILD_NAME="default-cmake" WITH_CMAKE=true - BUILD_NAME="python3-cmake" WITH_CMAKE=true WITH_PYTHON3=true - - BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDNN=true - - BUILD_NAME="cudnn-python3-cmake" WITH_CMAKE=true WITH_CUDNN=true WITH_PYTHON3=true +# - BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDNN=true + - BUILD_NAME="python3-cudnn-cmake" WITH_CMAKE=true WITH_CUDNN=true WITH_PYTHON3=true cache: apt: true diff --git a/Makefile.config.example b/Makefile.config.example index 43ef562002e..0b7ba1ccc88 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -59,8 +59,8 @@ BLAS_LIB := /opt/OpenBLAS/lib/ # 
NOTE: this is required only if you will compile the python interface. # We need to be able to find Python.h and numpy/arrayobject.h. -PYTHON_INCLUDE := /usr/include/python2.7 \ - /usr/lib/python2.7/dist-packages/numpy/core/include +#PYTHON_INCLUDE := /usr/include/python2.7 \ +# /usr/lib/python2.7/dist-packages/numpy/core/include # Anaconda Python distribution is quite popular. Include path: # Verify anaconda location, sometimes it's in root. # ANACONDA_HOME := $(HOME)/anaconda @@ -69,9 +69,9 @@ PYTHON_INCLUDE := /usr/include/python2.7 \ # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ # Uncomment to use Python 3 (default is Python 2) -# PYTHON_LIBRARIES := boost_python3 python3.5m -# PYTHON_INCLUDE := /usr/include/python3.5m \ -# /usr/lib/python3.5/dist-packages/numpy/core/include +PYTHON_LIBRARIES := boost_python3 python3.6m +PYTHON_INCLUDE := /usr/include/python3.6m \ + /usr/lib/python3.6/dist-packages/numpy/core/include # We need to be able to find libpythonX.X.so or .dylib. PYTHON_LIB := /usr/lib @@ -82,7 +82,7 @@ PYTHON_LIB := /usr/lib # PYTHON_LIB += $(shell brew --prefix numpy)/lib # Uncomment to support layers written in Python (will link against Python libs) -# WITH_PYTHON_LAYER := 1 +WITH_PYTHON_LAYER := 1 # Whatever else you find you need goes here. INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial diff --git a/scripts/travis/defaults.sh b/scripts/travis/defaults.sh index 5d546ce4fbd..1ef35bf5a17 100755 --- a/scripts/travis/defaults.sh +++ b/scripts/travis/defaults.sh @@ -4,6 +4,6 @@ set -e WITH_CMAKE=${WITH_CMAKE:-false} -WITH_PYTHON3=${WITH_PYTHON3:-false} +WITH_PYTHON3=${WITH_PYTHON3:-true} WITH_CUDA=${WITH_CUDA:-true} WITH_CUDNN=${WITH_CUDNN:-false} From 7501483f83199d4870b2a565e889950325477f0c Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 4 May 2020 02:24:26 -0700 Subject: [PATCH 4/6] 0.17.4RC - Travis & Python 3 --- scripts/travis/configure-make.sh | 4 ++-- scripts/travis/install-deps.sh | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/scripts/travis/configure-make.sh b/scripts/travis/configure-make.sh index f5a0be1cf81..2bdcb74746d 100644 --- a/scripts/travis/configure-make.sh +++ b/scripts/travis/configure-make.sh @@ -17,8 +17,8 @@ LINE "WITH_PYTHON_LAYER := 1" if $WITH_PYTHON3 ; then # TODO(lukeyeager) this path is currently disabled because of test errors like: # ImportError: dynamic module does not define init function (PyInit__caffe) - LINE "PYTHON_LIBRARIES := python3.4m boost_python-py34" - LINE "PYTHON_INCLUDE := /usr/include/python3.4 /usr/lib/python3/dist-packages/numpy/core/include" + LINE "PYTHON_LIBRARIES := python3.6m boost_python-py36" + LINE "PYTHON_INCLUDE := /usr/include/python3.6 /usr/lib/python3/dist-packages/numpy/core/include" LINE "INCLUDE_DIRS := \$(INCLUDE_DIRS) \$(PYTHON_INCLUDE)" fi diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 0d7caf045dd..c346fef6fdd 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -19,6 +19,7 @@ apt-get install -y --no-install-recommends \ libhdf5-serial-dev \ libopenblas-dev \ libturbojpeg \ + libturbojpeg0-dev \ python-virtualenv \ wget @@ -82,16 +83,16 @@ fi if $WITH_CUDA ; then # install repo packages - CUDA_REPO_PKG=cuda-repo-ubuntu1404_8.0.61-1_amd64.deb - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/$CUDA_REPO_PKG + CUDA_REPO_PKG=cuda-10-2_10.2.89-1_amd64.deb + wget 
http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/$CUDA_REPO_PKG dpkg -i $CUDA_REPO_PKG rm $CUDA_REPO_PKG if $WITH_CUDNN ; then - ML_REPO_PKG=libcudnn7_7.0.5.15-1+cuda8.0_amd64.deb - ML_REPO_PKGD=libcudnn7-dev_7.0.5.15-1+cuda8.0_amd64.deb - wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKG - wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKGD + ML_REPO_PKG=libcudnn7_7.6.5.32-1+cuda10.2_amd64.deb + ML_REPO_PKGD=libcudnn7-dev_7.6.5.32-1+cuda10.2_amd64.deb + wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/$ML_REPO_PKG + wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/$ML_REPO_PKGD dpkg -i $ML_REPO_PKG $ML_REPO_PKGD fi @@ -99,8 +100,8 @@ if $WITH_CUDA ; then apt-get -y update # install packages - CUDA_PKG_VERSION="8-0" - CUDA_VERSION="8.0" + CUDA_PKG_VERSION="10-2" + CUDA_VERSION="10.2" apt-get install -y --no-install-recommends \ cuda-core-$CUDA_PKG_VERSION \ cuda-cudart-dev-$CUDA_PKG_VERSION \ @@ -110,8 +111,8 @@ if $WITH_CUDA ; then # manually create CUDA symlink ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda - if $WITH_CUDNN ; then - apt-get install -y --no-install-recommends libcudnn7 libcudnn7-dev - fi +# if $WITH_CUDNN ; then +# apt-get install -y --no-install-recommends libcudnn7 libcudnn7-dev +# fi fi From 60c9409482197cc7bdd7d0d48ff66122ef05ad3e Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 4 May 2020 02:34:11 -0700 Subject: [PATCH 5/6] 0.17.4RC - Travis & Python 3 --- scripts/travis/install-deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index c346fef6fdd..5a520d44a94 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -24,7 +24,7 @@ apt-get install -y --no-install-recommends \ wget # package bug WAR: -ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so +#ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so if $WITH_CMAKE ; then apt-get install -y --no-install-recommends cmake From 4c7481c50e3fbe909f316d24b05a0e78f193a90b Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Fri, 8 Jan 2021 22:38:11 -0800 Subject: [PATCH 6/6] 0.17.4 --- CMakeLists.txt | 4 +- Makefile | 10 ++- Makefile.config.example | 6 +- cmake/Cuda.cmake | 2 +- include/caffe/macros.hpp | 2 +- include/caffe/util/io.hpp | 3 + src/caffe/layers/cudnn_conv_layer.cpp | 81 +++++++++++++++---- src/caffe/layers/cudnn_deconv_layer.cpp | 100 ++++++++---------------- src/caffe/util/bbox_util.cu | 7 +- 9 files changed, 118 insertions(+), 97 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f77e45af379..51d7587be33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ include(cmake/Dependencies.cmake) # ---[ Flags if(UNIX OR APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++14") endif() caffe_set_caffe_link() @@ -114,7 +114,7 @@ add_custom_target(lint COMMAND ${CMAKE_COMMAND} -P ${PROJECT_SOURCE_DIR}/cmake/l # ---[ pytest target if(BUILD_python) add_custom_target(pytest COMMAND python${python_version} -m unittest discover -s caffe/test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/python ) - add_dependencies(pytest pycaffe) +# add_dependencies(pytest pycaffe) endif() # ---[ Configuration summary 
diff --git a/Makefile b/Makefile index b08c91b8d68..5983a473d60 100644 --- a/Makefile +++ b/Makefile @@ -27,25 +27,27 @@ endif THIRDPARTY_DIR := ./3rdparty # All of the directories containing code. -SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \ +SRC_DIRS := $(shell find src python tools examples -type d -exec bash -c "find {} -maxdepth 1 \ \( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print 2>/dev/null) + # The target shared library name LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX) LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 DYNAMIC_VERSION_MINOR := 17 -DYNAMIC_VERSION_REVISION := 3 +DYNAMIC_VERSION_REVISION := 4 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT) COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) -# NVCaffe requires C++ 11 -COMMON_FLAGS += -std=c++11 +# NVCaffe requires C++ 14 +COMMON_FLAGS += -std=c++14 COMMON_FLAGS += -DCUDA_NO_HALF + ############################## # Get all source files ############################## diff --git a/Makefile.config.example b/Makefile.config.example index a8c6a3f383e..1d0147b7bc1 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -69,9 +69,9 @@ BLAS_LIB := /opt/OpenBLAS/lib/ # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ # Uncomment to use Python 3 (default is Python 2) -PYTHON_LIBRARIES := boost_python3 python3.6m -PYTHON_INCLUDE := /usr/include/python3.6m \ - /usr/lib/python3.6/dist-packages/numpy/core/include +PYTHON_LIBRARIES := boost_python38 python3.8 +PYTHON_INCLUDE := /usr/include/python3.8 \ + /usr/lib/python3.8/dist-packages/numpy/core/include # We need to be able to find libpythonX.X.so or .dylib. 
PYTHON_LIB := /usr/lib diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 0e755d05282..dbc36a278c9 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -254,7 +254,7 @@ if(USE_CUDNN) endif() if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS -std=c++11;-Xcompiler;-fPIC) + list(APPEND CUDA_NVCC_FLAGS -std=c++14;-Xcompiler;-fPIC) endif() if(APPLE) diff --git a/include/caffe/macros.hpp b/include/caffe/macros.hpp index 9d24c985659..6816f26da0c 100644 --- a/include/caffe/macros.hpp +++ b/include/caffe/macros.hpp @@ -7,7 +7,7 @@ ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) #endif -#if BOOST_VERSION >= 106100 +#if (BOOST_VERSION >= 106100) && !defined(BOOST_CUDA_VERSION) // error: class "boost::common_type" has no member "type" #define BOOST_NO_CXX11_VARIADIC_TEMPLATES #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__) diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index ec98f5a43c0..e01a3918411 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -323,11 +323,14 @@ void FloatCVMatToBuf(const cv::Mat& cv_img, size_t buf_len, Dtype* buf, bool rep LOG(FATAL) << "Image depth is not supported"; } } else { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" if (cv_img.depth() == CV_32F && tp() == FLOAT) { std::memcpy(buf, cv_img.ptr(0), img_size * sizeof(float)); // NOLINT(caffe/alt_fn) } else if (cv_img.depth() == CV_64F && tp() == DOUBLE) { std::memcpy(buf, cv_img.ptr(0), img_size * sizeof(double)); // NOLINT(caffe/alt_fn) } else { +#pragma GCC diagnostic pop if (cv_img.depth() == CV_8U) { for (size_t i = 0UL; i < img_size; ++i) { buf[i] = static_cast<Dtype>(cv_img.ptr(0)[i]); diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 891b14f4b6c..d8784e38c8a 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -469,29 +469,80 @@ template <typename Ftype, typename Btype> void CuDNNConvolutionLayer<Ftype, Btype>::GetConvAlgo(const vector<Blob*>& bottom, const vector<Blob*>& top, const size_t workspace_bytes, int pad_h, int pad_w, int stride_h, int stride_w) { + const size_t limit_per_group = align_down<8>(workspace_bytes / ws_groups()); + int returnedAlgoCount = 0; for (int i = 0; i < bottom.size(); ++i) { - // Get backward data algorithm (if not set by user) - if (user_algos_override_[1] < 0) { - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), - bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &bwd_data_algo_[i])); - } // Get forward algorithm (if not set by user) if (user_algos_override_[0] < 0) { + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionFwdAlgoPerf_t> perfResults(count); + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(Caffe::cudnn_handle(0), fwd_bottom_descs_[i], fwd_filter_desc_, fwd_conv_descs_[i], fwd_top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &fwd_algo_[i])); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); + count, &returnedAlgoCount, &perfResults.front())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <=
limit_per_group) { + fwd_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find forward algorithm with memory limit " + << limit_per_group << " bytes per group"; + } + } + // Get backward data algorithm (if not set by user) + if (user_algos_override_[1] < 0) { + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perfResults(count); + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), + bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], + count, &returnedAlgoCount, &perfResults.front())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + bwd_data_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find backward data algorithm with memory limit " + << limit_per_group << " bytes per group"; + } } // Get backward filter algorithm (if not set by user) if (user_algos_override_[2] < 0) { - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perfResults(count); + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), bwd_bottom_descs_[i], bwd_top_descs_[i], bwd_conv_filter_descs_[i], bwd_filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i])); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); + count, &returnedAlgoCount, &perfResults.front())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + bwd_filter_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find backward filter algorithm with memory limit " + << limit_per_group << " bytes per group"; + } } LOG(INFO) << Phase_Name(this->phase_) << " Conv Algos by Get* (F,BD,BF) for layer '" << this->name() diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp index 6834dfda3b6..6cf3898a502 100644 --- a/src/caffe/layers/cudnn_deconv_layer.cpp +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ -110,10 +110,6 @@ void CuDNNDeconvolutionLayer::Reshape( const int stride_h = stride_data[0]; const int stride_w = stride_data[1]; - // Specify workspace limit for kernels directly until we have a - // planning strategy and a rewrite of Caffe's GPU memory mangagement - size_t workspace_limit_bytes = 8*1024*1024; - for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], this->num_, @@ -135,96 +131,62 @@ void CuDNNDeconvolutionLayer::Reshape( 1); cudnn::setConvolutionDesc(forward_math_, conv_descs_[i], -// top_descs_[i], -// filter_desc_, pad_h, pad_w, stride_h, stride_w, 1, 1); + int returnedAlgoCount = 0; + cudnnConvolutionFwdAlgoPerf_t perfFResults; // choose forward and backward algorithms + workspace(s) - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( - handle_[0], - top_descs_[i], - filter_desc_, - conv_descs_[i], - bottom_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, -
&fwd_algo_[i])); - - // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is - // buggy. Thus, if this algo was chosen, choose winograd instead. If - // winograd is not supported or workspace is larger than threshold, choose - // implicit_gemm instead. -// if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { -// size_t winograd_workspace_size; -// cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize( -// handle_[0], -// top_descs_[i], -// filter_desc_, -// conv_descs_[i], -// bottom_descs_[i], -// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, -// &winograd_workspace_size); -// if (status != CUDNN_STATUS_SUCCESS || -// winograd_workspace_size >= workspace_limit_bytes) { -// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; -// } else { -// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; -// } -// } - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( handle_[0], top_descs_[i], filter_desc_, conv_descs_[i], bottom_descs_[i], - fwd_algo_[i], - &(workspace_fwd_sizes_[i]))); + 1, + &returnedAlgoCount, + &perfFResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + fwd_algo_[i] = perfFResults.algo; + workspace_fwd_sizes_[i] = perfFResults.memory; // choose backward algorithm for filter - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnConvolutionBwdFilterAlgoPerf_t perfBFResults; + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( handle_[0], top_descs_[i], bottom_descs_[i], conv_descs_[i], filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &bwd_filter_algo_[i])); - - // get workspace for backwards filter algorithm - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle_[0], - top_descs_[i], - bottom_descs_[i], - conv_descs_[i], - filter_desc_, - bwd_filter_algo_[i], - &workspace_bwd_filter_sizes_[i])); + 1, + &returnedAlgoCount, + &perfBFResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bwd_filter_algo_[i] = perfBFResults.algo; + workspace_bwd_filter_sizes_[i] = perfBFResults.memory; // choose backward algo for data - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( - handle_[0], - filter_desc_, - bottom_descs_[i], - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &bwd_data_algo_[i])); - - // get workspace size - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnConvolutionBwdDataAlgoPerf_t perfBDResults; + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( handle_[0], filter_desc_, bottom_descs_[i], conv_descs_[i], top_descs_[i], - bwd_data_algo_[i], - &workspace_bwd_data_sizes_[i])); + 1, + &returnedAlgoCount, + &perfBDResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bwd_data_algo_[i] = perfBDResults.algo; + workspace_bwd_data_sizes_[i] = perfBDResults.memory; } // reduce over all workspace sizes to get a maximum to allocate / reallocate diff --git a/src/caffe/util/bbox_util.cu b/src/caffe/util/bbox_util.cu index 3d7c8d7783b..798a4d3b128 100644 --- a/src/caffe/util/bbox_util.cu +++ b/src/caffe/util/bbox_util.cu @@ -3,8 +3,11 @@ #include #include -#include "thrust/functional.h" -#include "thrust/sort.h" +#if !defined(THRUST_IGNORE_CUB_VERSION_CHECK) +#define THRUST_IGNORE_CUB_VERSION_CHECK 1 +#endif +#include <thrust/functional.h> +#include <thrust/sort.h> #include "caffe/common.hpp"
#include "caffe/util/bbox_util.hpp"