From c9f0732c1f2bd6747cd473d4ad5172f02f7ff0b7 Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Sun, 3 May 2020 02:40:54 -0700 Subject: [PATCH 1/6] 0.17.4RC --- 3rdparty/half_float/half.hpp | 23 ++++++----- CMakeLists.txt | 2 +- Makefile | 2 +- README.md | 2 +- cmake/Dependencies.cmake | 3 +- cmake/Misc.cmake | 2 +- examples/web_demo/requirements.txt | 2 +- include/caffe/blob.hpp | 49 +++++++++++++++++++--- include/caffe/common.hpp | 34 +++------------- include/caffe/filler.hpp | 8 ++-- include/caffe/internal_thread.hpp | 27 +++++++----- include/caffe/parallel.hpp | 20 +++++---- include/caffe/solver.hpp | 5 ++- include/caffe/tensor.hpp | 2 +- python/caffe/io.py | 2 +- python/caffe/test/test_classification.py | 4 +- python/requirements.txt | 2 +- src/caffe/batch_transformer.cpp | 10 ++++- src/caffe/blob.cpp | 21 ++++++---- src/caffe/common.cpp | 14 ++++--- src/caffe/data_reader.cpp | 10 ++++- src/caffe/internal_thread.cpp | 52 +++++++++++++----------- src/caffe/layers/base_conv_layer.cpp | 2 +- src/caffe/layers/base_data_layer.cpp | 8 +++- src/caffe/layers/cudnn_conv_layer.cpp | 4 +- src/caffe/layers/inner_product_layer.cpp | 2 +- src/caffe/parallel.cpp | 32 ++++++++------- src/caffe/test/test_internal_thread.cpp | 6 +-- src/caffe/util/cudnn.cpp | 35 ---------------- src/caffe/util/gpu_memory.cpp | 8 ++-- src/caffe/util/math_functions.cpp | 6 +++ tools/caffe.cpp | 15 +++---- 32 files changed, 223 insertions(+), 191 deletions(-) diff --git a/3rdparty/half_float/half.hpp b/3rdparty/half_float/half.hpp index c477bbb4e7e..8a43f713fd8 100644 --- a/3rdparty/half_float/half.hpp +++ b/3rdparty/half_float/half.hpp @@ -349,7 +349,7 @@ namespace half_float struct binary_t {}; /// Tag for binary construction. - HALF_CONSTEXPR_CONST binary_t binary = binary_t(); + static HALF_CONSTEXPR_CONST binary_t binary = binary_t(); /// Temporary half-precision expression. /// This class represents a half-precision expression which just stores a single-precision value internally. @@ -1172,7 +1172,8 @@ namespace half_float /// Constructor. /// \param bits binary representation to set half to CAFFE_UTIL_HD - HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) : data_(bits) {} + HALF_CONSTEXPR half(detail::binary_t, unsigned int bits) HALF_NOEXCEPT + : data_(static_cast(bits)) {} /// Internal binary representation detail::uint16 data_; @@ -3067,32 +3068,32 @@ namespace std static HALF_CONSTEXPR_CONST int max_exponent10 = 4; /// Smallest positive normal value. - static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); } + static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x0400}; } /// Smallest finite value. - static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); } + static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0xFBFF}; } /// Largest finite value. - static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); } + static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7BFF}; } /// Difference between one and next representable value. 
- static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); } + static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x1400}; } /// Maximum rounding error. static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW - { return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); } + { return half_float::half{half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00}; } /// Positive infinity. - static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); } + static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7C00}; } /// Quiet NaN. - static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); } + static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7FFF}; } /// Signalling NaN. - static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); } + static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x7DFF}; } /// Smallest positive subnormal value. - static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); } + static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half{half_float::detail::binary, 0x0001}; } }; diff --git a/CMakeLists.txt b/CMakeLists.txt index 818818a5587..f77e45af379 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ endif() caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) -set(python_version "2" CACHE STRING "Specify which Python version to use") +set(python_version "3" CACHE STRING "Specify which Python version to use") caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE) caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) diff --git a/Makefile b/Makefile index 2b0ad1aeba4..6bc07ffdf62 100644 --- a/Makefile +++ b/Makefile @@ -179,7 +179,7 @@ CUDA_LIB_DIR := # add /lib64 only if it exists ifneq ("$(wildcard $(CUDA_DIR)/lib64)","") CUDA_LIB_DIR += $(CUDA_DIR)/lib64 - CUDA_LIB_DIR += /usr/lib/nvidia-410 /usr/lib/nvidia-418 /usr/lib/nvidia-396 /usr/lib/nvidia-390 /usr/lib/nvidia-387 /usr/lib/nvidia-384 /usr/lib/nvidia-381 /usr/lib/nvidia-375 /usr/lib/nvidia-367 /usr/lib/nvidia-361 /usr/lib/nvidia-352 + CUDA_LIB_DIR += /usr/lib/nvidia-440 /usr/lib/nvidia-410 /usr/lib/nvidia-418 /usr/lib/nvidia-396 /usr/lib/nvidia-390 /usr/lib/nvidia-387 /usr/lib/nvidia-384 /usr/lib/nvidia-381 /usr/lib/nvidia-375 /usr/lib/nvidia-367 /usr/lib/nvidia-361 /usr/lib/nvidia-352 endif CUDA_LIB_DIR += $(CUDA_DIR)/lib diff --git a/README.md b/README.md index 888d5853059..bcfe94fa15b 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,6 @@ to your PR. Libturbojpeg library is used since 0.16.5. It has a packaging bug. 
Please execute the following (required for Makefile, optional for CMake): ``` -sudo apt-get install libturbojpeg +sudo apt-get install libturbojpeg libturbojpeg0-dev sudo ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so ``` \ No newline at end of file diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 6d685fdcf4c..d9e73753aee 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -108,7 +108,8 @@ if(BUILD_python) boost_python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR} PATHS ${LIBDIR}) if ("${Boost_PYTHON_FOUND}" STREQUAL "Boost_PYTHON_FOUND-NOTFOUND") - message(SEND_ERROR "Could NOT find Boost Python Library") + find_package(Boost 1.65 COMPONENTS "python${python_version}") + set(Boost_PYTHON_FOUND ${Boost_PYTHON${python_version}_FOUND}) else() message(STATUS "Found Boost Python Library ${Boost_PYTHON_FOUND}") list(APPEND Caffe_LINKER_LIBS ${Boost_PYTHON_FOUND}) diff --git a/cmake/Misc.cmake b/cmake/Misc.cmake index 9dd2609b36a..88c0f6b8c65 100644 --- a/cmake/Misc.cmake +++ b/cmake/Misc.cmake @@ -29,7 +29,7 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) endif() # ---[ RPATH settings -set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Use link paths for shared library rpath") +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON CACHE BOOLEAN "Use link paths for shared library rpath") set(CMAKE_MACOSX_RPATH TRUE) list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_systtem_dir) diff --git a/examples/web_demo/requirements.txt b/examples/web_demo/requirements.txt index 937ac1835b6..478694aff7f 100644 --- a/examples/web_demo/requirements.txt +++ b/examples/web_demo/requirements.txt @@ -3,5 +3,5 @@ flask tornado numpy pandas -pillow=>6.2.0 +pillow>=6.2.0 pyyaml diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 2241396bf13..6235272f64f 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -54,8 +54,14 @@ class Blob { explicit Blob(Type dtype) : Blob(dtype, dtype) {} - public: - virtual ~Blob() {} +public: + virtual ~Blob() = default; + + enum class RESHAPE_MODE: int { + RESHAPE_DIFF = -1, + RESHAPE_BOTH = 0, + RESHAPE_DATA = 1 + }; /// @brief Deprecated; use Reshape(const vector& shape). void Reshape(const int num, const int channels, const int height, const int width); @@ -75,8 +81,8 @@ class Blob { * an error; either Net::Forward or Net::Reshape need to be called to * propagate the new input shape to higher layers. */ - void Reshape(const vector& shape); - void Reshape(const BlobShape& shape); + void Reshape(const vector& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH); + void Reshape(const BlobShape& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH); void ReshapeLike(const Blob* other) { Reshape(other->shape()); @@ -111,9 +117,9 @@ class Blob { /// @brief Creates an instance of a Blob with given type Dtype and given shape. 
template - static shared_ptr create(const vector& shape) { + static shared_ptr create(const vector& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH) { shared_ptr ptr = create(); - ptr->Reshape(shape); + ptr->Reshape(shape, mode); return ptr; } @@ -303,6 +309,7 @@ class Blob { template void set_cpu_data(Dtype* data) { CHECK_NOTNULL(data); + ensure_data_count(); convert_data(tp()); CHECK(is_type(data_type())); data_tensor_->mutable_synced_mem()->set_cpu_data(data); @@ -311,6 +318,7 @@ class Blob { template void set_cpu_diff(Dtype* diff) { CHECK_NOTNULL(diff); + ensure_diff_count(); convert_diff(tp()); CHECK(is_type(diff_type())); diff_tensor_->mutable_synced_mem()->set_cpu_data(diff); @@ -318,18 +326,21 @@ class Blob { template const Dtype* cpu_data() const { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->synced_mem()->cpu_data()); } template const Dtype* cpu_diff() const { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->synced_mem()->cpu_data()); } template Dtype* mutable_cpu_data_c(bool copy_from_gpu) { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->mutable_synced_mem()->mutable_cpu_data(copy_from_gpu)); } @@ -341,6 +352,7 @@ class Blob { template Dtype* mutable_cpu_diff_c(bool copy_from_gpu) { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->mutable_synced_mem()->mutable_cpu_data(copy_from_gpu)); } @@ -390,36 +402,43 @@ class Blob { /// @brief Compute the sum of absolute values (L1 norm) of the diff. float asum_diff(int group = 0) const { + ensure_diff_count(); return diff_tensor_->asum(group); } /// @brief Compute the sum of squares (L2 norm squared) of the data. float sumsq_data(int group = 0) const { + ensure_data_count(); return data_tensor_->sumsq(group); } /// @brief Compute the sum of squares (L2 norm squared) of the diff. float sumsq_diff(int group = 0) const { + ensure_diff_count(); return diff_tensor_->sumsq(group); } /// @brief Scale the blob data by a constant factor. void scale_data(float scale, void* handle = nullptr) { + ensure_data_count(); data_tensor_->scale(scale, handle); } /// @brief Scale the blob diff by a constant factor. void scale_diff(float scale, void* handle = nullptr) { + ensure_diff_count(); diff_tensor_->scale(scale, handle); } /// @brief Set all the blob's data elements to a value. void set_data(float value) { + ensure_data_count(); data_tensor_->set(value); } /// @brief Set all the blob's diff elements to a value. 
void set_diff(float value) { + ensure_diff_count(); diff_tensor_->set(value); } @@ -480,28 +499,33 @@ class Blob { void set_gpu_data(void* data) { CHECK_NOTNULL(data); + ensure_data_count(); data_tensor_->mutable_synced_mem()->set_gpu_data(data); } void set_gpu_diff(void* diff) { CHECK_NOTNULL(diff); + ensure_diff_count(); diff_tensor_->mutable_synced_mem()->set_gpu_data(diff); } template const Dtype* gpu_data() const { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->synced_mem()->gpu_data()); } template const Dtype* gpu_diff() const { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->synced_mem()->gpu_data()); } template Dtype* mutable_gpu_data_c(bool copy_from_cpu) { + ensure_data_count(); convert_data(tp()); return static_cast(data_tensor_->mutable_synced_mem()->mutable_gpu_data(copy_from_cpu)); } @@ -513,6 +537,7 @@ class Blob { template Dtype* mutable_gpu_diff_c(bool copy_from_cpu) { + ensure_diff_count(); convert_diff(tp()); return static_cast(diff_tensor_->mutable_synced_mem()->mutable_gpu_data(copy_from_cpu)); } @@ -573,6 +598,18 @@ class Blob { diff_tensor_->convert(new_diff_type); } + void ensure_data_count() const { + if (data_tensor_->is_empty() && !diff_tensor_->is_empty()) { + data_tensor_->Reshape(diff_tensor_->count_); + } + } + + void ensure_diff_count() const { + if (diff_tensor_->is_empty() && !data_tensor_->is_empty()) { + diff_tensor_->Reshape(data_tensor_->count_); + } + } + static float at(int offset, Type dtype, const void* data); static void cpu_axpy(int count, Type dtype, float alpha, const void* X, void* Y); static void gpu_axpy(int count, Type dtype, float alpha, const void* X, void* Y); diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index faa20cd0eb4..17e14aba617 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -159,9 +159,6 @@ class Caffe { static shared_ptr thread_pstream(int group = 0) { return Get().pstream(group); } - static shared_ptr short_term_cublas_phandle() { - return make_shared(); - } #ifdef USE_CUDNN static cudnnHandle_t cudnn_handle(int group) { return Get().th_cudnn_handle(group); @@ -232,14 +229,14 @@ class Caffe { /// All physical devices regardless of usage static int device_count(); // Parallel training info - static int solver_count() { + static size_t solver_count() { return solver_count_; } - /// NUmber of physical devices being used + // Number of physical devices being used static int device_in_use_per_host_count() { return (int)gpus_.size(); } - static void set_solver_count(int val) { + static void set_solver_count(size_t val) { if (solver_count_ != val) { std::lock_guard lock(caffe_mutex_); solver_count_ = val; @@ -260,27 +257,6 @@ class Caffe { static const std::vector& gpus() { return gpus_; } - static const std::string& caffe_version() { - return props().caffe_version(); - } - static const std::string& cudnn_version() { - return props().cudnn_version(); - } - static const std::string& cublas_version() { - return props().cublas_version(); - } - static const std::string& cuda_version() { - return props().cuda_version(); - } - static const std::string& cuda_driver_version() { - return props().cuda_driver_version(); - } - static std::string start_time() { - return props().start_time(); - } - static std::time_t init_time() { - return props().init_time(); - } static std::string time_from_init(); static int device_capability(int device) { return props().device_capability(device); @@ -325,7 +301,7 @@ class Caffe { // For example, if user runs 
`caffe train -gpu=1,0,3` then it has to be set to 1. static int root_device_; static Brew mode_; - static int solver_count_; + static size_t solver_count_; static std::vector gpus_; static int thread_count_; static int restored_iter_; @@ -362,9 +338,9 @@ class Caffe { class Properties { friend class Caffe; - public: Properties(); + public: const std::string& caffe_version() const { return caffe_version_; } diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 11fdd8dee5d..bb40b10100b 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -282,10 +282,10 @@ class XavierFiller : public Filler { CHECK(blob->count()); int fan_in = blob->count() / blob->num(); int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in + float n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); + n = (fan_in + fan_out) / 2.F; } else if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_FAN_OUT) { n = fan_out; @@ -334,10 +334,10 @@ class XavierStaticFiller : public Filler { xavier_static_filler_data_.resize(blob_count); int fan_in = blob_count / blob->num(); int fan_out = blob_count / blob->channels(); - Dtype n = fan_in; // default to fan_in + float n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); + n = (fan_in + fan_out) / 2.F; } else if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_FAN_OUT) { n = fan_out; diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index eba359cebef..a1e8b03346f 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -18,8 +18,8 @@ namespace caffe { */ class InternalThread { public: - InternalThread(int target_device, size_t rank_, size_t threads, bool delayed); - virtual ~InternalThread() {} + InternalThread(int target_device, size_t rank_, size_t threads, bool delayed, const std::string& name); + virtual ~InternalThread() = default; /** * Caffe's thread local state will be initialized using the current @@ -40,12 +40,16 @@ class InternalThread { void StopInternalThread(bool wait_all = true); void WaitAll(); - bool is_started(int id = 0) const { - return threads_[id].joinable(); + const string& get_name() const { + return name_; + } + + bool is_started(int child_id = 0) const { + return children_[child_id].joinable(); } size_t threads_num() const { - return threads_.size(); + return children_.size(); } void go() { @@ -62,19 +66,20 @@ class InternalThread { with the code you want your thread to run. */ virtual void InternalThreadEntry() {} - virtual void InternalThreadEntryN(size_t id) {} + virtual void InternalThreadEntryN(size_t child_id) {} /* Should be tested when running loops to exit when requested. 
*/ - bool must_stop(int id) { - return threads_[id].interruption_requested(); + bool must_stop(size_t child_id) { + return children_[child_id].interruption_requested(); } private: - void entry(int thread_id, int device, Caffe::Brew mode, uint64_t rand_seed, + void entry(size_t thread_id, int device, Caffe::Brew mode, uint64_t rand_seed, size_t rank, bool set_cpu_affinity); - - vector threads_; + std::uint32_t lwp_id_, lwp_id_parent_; + vector children_; vector> delay_flags_; + const std::string name_; }; } // namespace caffe diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index 279a52fe57a..d8cdf4cf439 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -62,10 +62,6 @@ class P2PManager { } #endif - void bar_wait(int b) { - bar_[b]->wait(); - } - static void Init(int *argc, char ***argv); static int global_rank() { @@ -80,7 +76,10 @@ class P2PManager { return host_name_; } - protected: + static unique_ptr solve_bar_; + static unique_ptr solved_bar_; + +protected: const size_t nranks_; vector> syncs_; shared_ptr> shared_; @@ -88,8 +87,6 @@ class P2PManager { #ifdef USE_NCCL ncclUniqueId nccl_id_; #endif - static unique_ptr bar_[3]; - static int global_rank_; static int global_count_; static char host_name_[_POSIX_HOST_NAME_MAX + 1]; @@ -101,14 +98,19 @@ class P2PSync : public Solver::Callback, public InternalThread { public: P2PSync(P2PManager* mgr, shared_ptr root_solver, int rank, int nranks, const SolverParameter& param); - virtual ~P2PSync(); + ~P2PSync(); // Divide the batch size by the number of solvers static unsigned int divide_batch_size(NetParameter* net); void allreduce(int param_id) override; void allreduce_bucket(size_t count, void* bucket, Type type) override; - void soft_barrier(int b) override; + void solve_barrier() override { + P2PManager::solve_bar_->wait(); + } + void solved_barrier() override { + P2PManager::solved_bar_->wait(); + } void cancel_all() override; void saveTestResults(float loss, const vector& scores) override; void aggregateTestResults(float* loss, vector* scores) override; diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 66652845934..9b865e4ea29 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -107,7 +107,8 @@ class Solver { public: virtual void allreduce(int param_id) = 0; virtual void allreduce_bucket(size_t count, void* bucket, Type type) = 0; - virtual void soft_barrier(int b) = 0; + virtual void solve_barrier() = 0; + virtual void solved_barrier() = 0; virtual void cancel_all() = 0; virtual void saveTestResults(float loss, const vector& scores) = 0; virtual void aggregateTestResults(float* loss, vector* scores) = 0; @@ -197,7 +198,7 @@ class Solver { void callback_soft_barrier() { if (callback_ != nullptr) { - callback_->soft_barrier(0); + callback_->solve_barrier(); } } diff --git a/include/caffe/tensor.hpp b/include/caffe/tensor.hpp index 975f1bc314a..61cab872b1c 100644 --- a/include/caffe/tensor.hpp +++ b/include/caffe/tensor.hpp @@ -20,7 +20,7 @@ class Tensor { public: explicit Tensor(Type type); - ~Tensor() {} + ~Tensor() = default; std::string to_string(int indent) const; diff --git a/python/caffe/io.py b/python/caffe/io.py index dcb24cc31f4..fc20c010df7 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -300,7 +300,7 @@ def load_image(filename, color=True): of size (H x W x 3) in RGB or of size (H x W x 1) in grayscale. 
""" - img = skimage.img_as_float(skimage.io.imread(filename, as_grey=not color)).astype(np.float32) + img = skimage.img_as_float(skimage.io.imread(filename, as_gray=not color)).astype(np.float32) if img.ndim == 2: img = img[:, :, np.newaxis] if color: diff --git a/python/caffe/test/test_classification.py b/python/caffe/test/test_classification.py index 823d6e28c9c..b19b859d5b2 100644 --- a/python/caffe/test/test_classification.py +++ b/python/caffe/test/test_classification.py @@ -10,7 +10,6 @@ from glob import glob from google.protobuf import text_format from PIL import Image -import scipy.misc # os.environ['GLOG_minloglevel'] = '2' # Suppress most caffe output import caffe @@ -80,9 +79,8 @@ def load_image(path, height, width, mode='RGB'): """ image = Image.open(path) image = image.convert(mode) - image = np.array(image) # squash - image = scipy.misc.imresize(image, (height, width), 'bilinear') + image = np.array(image.resize((height, width), Image.BILINEAR)) return image def forward_pass(images, net, transformer, batch_size=None): diff --git a/python/requirements.txt b/python/requirements.txt index 90964988645..d09f6e680f4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -13,6 +13,6 @@ pydotplus python-dateutil>=2 python-gflags>=2.0 pyyaml>=3.10 -six>=1.5.2,<=1.10.0 +six>=1.5.2,<=1.11.0 image>=1.5.16 jupyterlab diff --git a/src/caffe/batch_transformer.cpp b/src/caffe/batch_transformer.cpp index 97f0114af86..1033808dcbf 100644 --- a/src/caffe/batch_transformer.cpp +++ b/src/caffe/batch_transformer.cpp @@ -2,10 +2,16 @@ namespace caffe { +std::string bt_name(size_t rank, size_t queues_num) { + std::ostringstream os; + os << "BatchTransformer of rank " << rank << ", queues " << queues_num; + return os.str(); +} + template -BatchTransformer::BatchTransformer(int target_device, size_t rank_, +BatchTransformer::BatchTransformer(int target_device, size_t rank, size_t queues_num, const TransformationParameter& transform_param, bool gpu_transform) : - InternalThread(target_device, rank_, 1, false), + InternalThread(target_device, rank, 1, false, bt_name(rank, queues_num)), queues_num_(queues_num), next_batch_queue_(0UL), transform_param_(transform_param), diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 9c7f869cca3..86259dd2629 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -32,7 +32,7 @@ void Blob::Reshape(const int n) { Reshape(shape); } -void Blob::Reshape(const vector& shape) { +void Blob::Reshape(const vector& shape, RESHAPE_MODE mode) { CHECK_LE(shape.size(), kMaxBlobAxes); CHECK(data_tensor_); CHECK(diff_tensor_); @@ -51,19 +51,23 @@ void Blob::Reshape(const vector& shape) { shape_[i] = shape[i]; shape_data[i] = shape[i]; } - data_tensor_->Reshape(count_); - diff_tensor_->Reshape(count_); - CHECK(is_current_data_valid()); - CHECK(is_current_diff_valid()); + if (mode == RESHAPE_MODE::RESHAPE_DATA || mode == RESHAPE_MODE::RESHAPE_BOTH) { + data_tensor_->Reshape(count_); + CHECK(is_current_data_valid()); + } + if (mode == RESHAPE_MODE::RESHAPE_DIFF || mode == RESHAPE_MODE::RESHAPE_BOTH) { + diff_tensor_->Reshape(count_); + CHECK(is_current_diff_valid()); + } } -void Blob::Reshape(const BlobShape& shape) { +void Blob::Reshape(const BlobShape& shape, RESHAPE_MODE mode) { CHECK_LE(shape.dim_size(), kMaxBlobAxes); vector shape_vec(shape.dim_size()); for (int i = 0; i < shape.dim_size(); ++i) { shape_vec[i] = shape.dim(i); } - Reshape(shape_vec); + Reshape(shape_vec, mode); } const int* Blob::gpu_shape() const { @@ -73,6 +77,7 @@ const int* 
Blob::gpu_shape() const { void Blob::ShareData(const Blob& other) { CHECK_NE(this, &other); + other.ensure_data_count(); // CHECK(!other.IsSharedDataCycled()); if (data_tensor_.get() == other.data_tensor_.get()) { CHECK_EQ(data_shared_with_, &other); @@ -87,6 +92,7 @@ void Blob::ShareData(const Blob& other) { void Blob::ShareDiff(const Blob& other) { CHECK_NE(this, &other); + other.ensure_diff_count(); /// CHECK(!other.IsSharedDiffCycled()); if (diff_tensor_.get() == other.diff_tensor_.get()) { CHECK_EQ(diff_shared_with_, &other); @@ -409,6 +415,7 @@ void Blob::ToProto(BlobProto* proto, bool store_in_old_format, bool write_diff) return; } CHECK(is_current_data_valid()); + ensure_diff_count(); CHECK(is_current_diff_valid()); Type dt = data_type(); proto->clear_shape(); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index efe6d21c36e..d0bdb6b9a87 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -17,7 +17,7 @@ namespace caffe { // Must be set before brewing Caffe::Brew Caffe::mode_ = Caffe::GPU; -int Caffe::solver_count_ = 1; +size_t Caffe::solver_count_ = 1; std::vector Caffe::gpus_; int Caffe::root_device_ = -1; int Caffe::thread_count_ = 0; @@ -109,13 +109,14 @@ void Caffe::set_restored_iter(int val) { } void GlobalInit(int* pargc, char*** pargv) { - P2PManager::Init(pargc, pargv); // Google flags. ::gflags::ParseCommandLineFlags(pargc, pargv, true); // Google logging. ::google::InitGoogleLogging(*(pargv)[0]); // Provide a backtrace on segfault. ::google::InstallFailureSignalHandler(); + + P2PManager::Init(pargc, pargv); } int Caffe::device_count() { @@ -466,7 +467,7 @@ Caffe::Properties::Properties() : for (int gpu = 0; gpu < compute_capabilities_.size(); ++gpu) { CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpus[gpu])); compute_capabilities_[gpu] = device_prop.major * 100 + device_prop.minor; - DLOG(INFO) << "GPU " << gpus[gpu] << " '" << device_prop.name + LOG(INFO) << "GPU " << gpus[gpu] << " '" << device_prop.name << "' has compute capability " << device_prop.major << "." 
<< device_prop.minor; } #ifdef USE_CUDNN @@ -474,9 +475,10 @@ Caffe::Properties::Properties() : #else cudnn_version_ = "USE_CUDNN is not defined"; #endif - shared_ptr phandle = Caffe::short_term_cublas_phandle(); + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); int cublas_version = 0; - CUBLAS_CHECK(cublasGetVersion(phandle->get(), &cublas_version)); + CUBLAS_CHECK(cublasGetVersion(handle, &cublas_version)); cublas_version_ = std::to_string(cublas_version); int cuda_version = 0; @@ -492,7 +494,7 @@ std::string Caffe::time_from_init() { std::ostringstream os; os.unsetf(std::ios_base::floatfield); os.precision(4); - double span = std::difftime(std::time(NULL), init_time()); + double span = std::difftime(std::time(NULL), props().init_time()); const double mn = 60.; const double hr = 3600.; if (span < mn) { diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp index 456eae3ec96..d7ce0f495b3 100644 --- a/src/caffe/data_reader.cpp +++ b/src/caffe/data_reader.cpp @@ -14,6 +14,13 @@ std::mutex DataReader::db_mutex_; template std::mutex DataReader::DataCache::cache_mutex_{}; +std::string dr_name(size_t rank, size_t parser_threads_num, size_t transf_threads_num) { + std::ostringstream os; + os << "DataReader of local solver rank " << rank + << ", parser threads " << parser_threads_num << ", transf threads " << transf_threads_num; + return os.str(); +} + template DataReader::DataReader(const LayerParameter& param, size_t local_solver_count, @@ -27,7 +34,8 @@ DataReader::DataReader(const LayerParameter& param, bool shuffle, bool epoch_count_required) : InternalThread(Caffe::device(), - local_solver_rank, sample_only ? 1U : parser_threads_num, false), + local_solver_rank, sample_only ? 1U : parser_threads_num, false, + dr_name(local_solver_rank, parser_threads_num, transf_threads_num)), parser_threads_num_(threads_num()), transf_threads_num_(sample_only ? 
1U : transf_threads_num), queues_num_(parser_threads_num_ * transf_threads_num_), diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 688e9b591c3..b30b2fa3bec 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -7,11 +7,16 @@ namespace caffe { -InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed) +InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed, const std::string& name) : target_device_(target_device), rank_(rank), - threads_(threads), - delay_flags_(threads, make_shared(!delayed)) {} + lwp_id_(0), + lwp_id_parent_(caffe::lwp_id()), + children_(threads), + delay_flags_(threads, make_shared(!delayed)), + name_(name) { + LOG(INFO) << "InternalThread " << lwp_id_parent_ << ": " << name; +} void InternalThread::StartInternalThread(bool set_cpu_affinity, uint64_t random_seed) { CHECK(!is_started()) << "Threads should persist and not be restarted."; @@ -19,15 +24,15 @@ void InternalThread::StartInternalThread(bool set_cpu_affinity, uint64_t random_ #ifdef USE_MPI "{" << P2PManager::global_rank() << "} " #endif - "Starting " << threads_.size() << " internal thread(s) on device " << target_device_; + "Starting " << children_.size() << " internal thread(s) on device " << target_device_; Caffe::Brew mode = Caffe::mode(); if (mode == Caffe::GPU) { CHECK_GE(target_device_, 0); } try { - for (size_t id = 0; id < threads_.size(); ++id) { - threads_[id] = boost::thread(&InternalThread::entry, this, id, target_device_, mode, - random_seed, rank_, set_cpu_affinity); + for (size_t child_id = 0; child_id < children_.size(); ++child_id) { + children_[child_id] = boost::thread(&InternalThread::entry, this, child_id, + target_device_, mode, random_seed, rank_, set_cpu_affinity); } } catch (std::exception& e) { LOG(FATAL) << "Thread exception: " << e.what(); @@ -44,15 +49,15 @@ void InternalThread::RestartAllThreads(size_t new_threads, bool delayed, bool se if (mode == Caffe::GPU) { CHECK_GE(target_device_, 0); } - threads_.clear(); + children_.clear(); delay_flags_.clear(); - threads_.resize(new_threads); + children_.resize(new_threads); delay_flags_.resize(new_threads); try { - for (size_t id = 0; id < new_threads; ++id) { - CHECK(!is_started(id)); - delay_flags_[id] = make_shared(!delayed); - threads_[id] = boost::thread(&InternalThread::entry, this, id, + for (size_t child_id = 0; child_id < new_threads; ++child_id) { + CHECK(!is_started(child_id)); + delay_flags_[child_id] = make_shared(!delayed); + children_[child_id] = boost::thread(&InternalThread::entry, this, child_id, target_device_, mode, random_seed, rank_, set_cpu_affinity); } } catch (std::exception& e) { @@ -60,9 +65,10 @@ void InternalThread::RestartAllThreads(size_t new_threads, bool delayed, bool se } } -void InternalThread::entry(int thread_id, int device, Caffe::Brew mode, uint64_t random_seed, +void InternalThread::entry(size_t child_id, int device, Caffe::Brew mode, uint64_t random_seed, size_t rank, bool set_cpu_affinity) { - delay_flags_[thread_id]->wait(); + lwp_id_ = caffe::lwp_id(); + delay_flags_[child_id]->wait(); if (mode == Caffe::GPU) { CHECK_GE(device, 0); } @@ -82,17 +88,17 @@ void InternalThread::entry(int thread_id, int device, Caffe::Brew mode, uint64_t nvml::setCpuAffinity(device); #endif } - if (threads_.size() == 1) { + if (children_.size() == 1) { InternalThreadEntry(); } else { - InternalThreadEntryN(thread_id); + InternalThreadEntryN(child_id); } } void 
InternalThread::StopInternalThread(bool wait_all) { - for (size_t id = 0; id < threads_.size(); ++id) { - if (is_started(id)) { - threads_[id].interrupt(); + for (size_t child_id = 0; child_id < children_.size(); ++child_id) { + if (is_started(child_id)) { + children_[child_id].interrupt(); } } if (wait_all) { @@ -102,9 +108,9 @@ void InternalThread::StopInternalThread(bool wait_all) { void InternalThread::WaitAll() { try { - for (size_t id = 0; id < threads_.size(); ++id) { - if (is_started(id)) { - threads_[id].join(); + for (size_t child_id = 0; child_id < children_.size(); ++child_id) { + if (is_started(child_id)) { + children_[child_id].join(); } } } catch (boost::thread_interrupted&) { diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index c223a3c5a8b..8c68928cf60 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -150,7 +150,7 @@ void BaseConvolutionLayer::LayerSetUp(const vector& bottom, } // Initialize and fill the weights: // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0] = Blob::create(weight_shape); + this->blobs_[0] = Blob::create(weight_shape, Blob::RESHAPE_MODE::RESHAPE_DATA); shared_ptr> weight_filler( GetFiller(this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 0b9a76a61de..f84a7828ade 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -56,11 +56,17 @@ void BaseDataLayer::LayerSetUp(const vector& bottom, DataLayerSetUp(bottom, top); } +std::string bpdl_name(size_t rank) { + std::ostringstream os; + os << "BasePrefetchingDataLayer of local solver rank " << rank; + return os.str(); +} + template BasePrefetchingDataLayer::BasePrefetchingDataLayer(const LayerParameter& param, size_t solver_rank) : BaseDataLayer(param, threads(param)), - InternalThread(Caffe::device(), solver_rank, threads(param), false), + InternalThread(Caffe::device(), solver_rank, threads(param), false, bpdl_name(solver_rank)), auto_mode_(Caffe::mode() == Caffe::GPU && this->phase_ == TRAIN && auto_mode(param)), parsers_num_(parser_threads(param)), transf_num_(threads(param)), diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 49795c93a1d..891b14f4b6c 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -308,6 +308,7 @@ template void CuDNNConvolutionLayer::Reshape( const vector& bottom, const vector& top) { // Check whether cached descriptors have been initialized. + bool sizes_changed = false; if (initialized_cached_descs_) { // Check whether bottom and conv descriptors have changed, // which then requires a new reshape and set algo. @@ -316,6 +317,7 @@ void CuDNNConvolutionLayer::Reshape( IsConvDescChanged(bottom, true) || (this->phase_ == TRAIN && IsConvDescChanged(bottom, false))) { use_reshape_ = true; + sizes_changed = true; } else { // When no reshape is needed, setting algo may be still needed // (for example, if we are at iteration 1). 
@@ -427,7 +429,7 @@ void CuDNNConvolutionLayer::Reshape( align_up<8>(this->weight_offset_ * tsize(tpmax()))); } - if (fwd_count_ == 0UL) { + if (sizes_changed || fwd_count_ == 0UL) { AllocateWorkspace(bottom.size()); } // Ask cuDNN to find the best algorithm diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 7fa4d7c4e65..f0cb1994f09 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -37,7 +37,7 @@ InnerProductLayer::LayerSetUp(const vector& bottom, const v weight_shape[1] = K_; } // CPU filler always 32 bits - this->blobs_[0] = Blob::create(weight_shape); + this->blobs_[0] = Blob::create(weight_shape, Blob::RESHAPE_MODE::RESHAPE_DATA); shared_ptr> weight_filler( GetFiller(this->layer_param_.inner_product_param().weight_filler())); diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp index 75b44a323ae..ad2561cd6a1 100644 --- a/src/caffe/parallel.cpp +++ b/src/caffe/parallel.cpp @@ -19,7 +19,8 @@ namespace caffe { int P2PManager::global_rank_ = 0; int P2PManager::global_count_ = 1; char P2PManager::host_name_[_POSIX_HOST_NAME_MAX + 1]; -unique_ptr P2PManager::bar_[3]; +unique_ptr P2PManager::solve_bar_; +unique_ptr P2PManager::solved_bar_; void P2PManager::Init(int *argc, char ***argv) { #ifdef USE_MPI @@ -58,9 +59,8 @@ P2PManager::P2PManager(shared_ptr root_solver, nranks_(nranks), syncs_(devices), root_solver_(root_solver) { - bar_[0].reset(new boost::barrier(devices)); - bar_[1].reset(new boost::barrier(devices)); - bar_[2].reset(new boost::barrier(devices)); + solve_bar_.reset(new boost::barrier(devices)); + solved_bar_.reset(new boost::barrier(devices)); #ifndef USE_NCCL LOG(FATAL) << "USE_NCCL must be specified for multi-GPU mode"; #else @@ -156,9 +156,16 @@ void P2PManager::cancel_all(P2PSync* killed) { } } +std::string p2p_sync_name(bool is_root, int rank, int nranks) { + std::ostringstream os; + os << (is_root ? "ROOT " : "") << "P2PSync of rank " << rank << " out of " << nranks; + return os.str(); +} + P2PSync::P2PSync(P2PManager* mgr, shared_ptr root_solver, int rank, int nranks, const SolverParameter& solver_param) - : InternalThread(solver_param.device_id(), rank, 1, false), + : InternalThread(solver_param.device_id(), rank, 1, false, + p2p_sync_name(!root_solver, rank, nranks)), mgr_(mgr), rank_(rank), nranks_(nranks), @@ -188,7 +195,7 @@ void P2PSync::InternalThreadEntry() { solver_.swap(root_solver_); solver_->root_add_callback(this); } - soft_barrier(0); + solve_barrier(); if (!root) { Caffe::set_root_solver(false); solver_.reset(caffe::SolverRegistry::CreateSolver(solver_param_, root_solver_.get(), rank_)); @@ -214,12 +221,12 @@ void P2PSync::InternalThreadEntry() { rank_)); NCCL_CHECK(ncclGroupEnd()); #else - soft_barrier(0); + solve_barrier(); NCCL_CHECK(ncclCommInitRank(&nccl_comm_, nranks_, mgr_->nccl_id(), rank_)); - soft_barrier(0); + solve_barrier(); #endif #endif @@ -239,13 +246,8 @@ void P2PSync::InternalThreadEntry() { solver_->Solve(); - soft_barrier(2); - DLOG(INFO) << " Leaving P2PSync thread " << lwp_id(); -} - -void P2PSync::soft_barrier(int b) { - // CPU barrier to avoid busy-polling on the GPU. 
- mgr_->bar_wait(b); + solved_barrier(); + LOG(INFO) << " Leaving P2PSync thread " << lwp_id() << " '" << this->get_name() << "'"; } void P2PSync::cancel_all() { diff --git a/src/caffe/test/test_internal_thread.cpp b/src/caffe/test/test_internal_thread.cpp index f08f72c1de2..b447026b712 100644 --- a/src/caffe/test/test_internal_thread.cpp +++ b/src/caffe/test/test_internal_thread.cpp @@ -12,7 +12,7 @@ namespace caffe { class InternalThreadTest : public ::testing::Test {}; TEST_F(InternalThreadTest, TestStartAndExit) { - InternalThread thread(Caffe::device(), 0U, 1, false); + InternalThread thread(Caffe::device(), 0U, 1, false, "TestStartAndExit"); EXPECT_FALSE(thread.is_started()); thread.StartInternalThread(); EXPECT_TRUE(thread.is_started()); @@ -26,7 +26,7 @@ class TestThreadA : public InternalThread { } public: TestThreadA(int device = Caffe::device()) - : InternalThread(device, 0U, 1, false) {} + : InternalThread(device, 0U, 1, false, "TestThreadA") {} }; class TestThreadB : public InternalThread { @@ -35,7 +35,7 @@ class TestThreadB : public InternalThread { } public: TestThreadB(int device = Caffe::device()) - : InternalThread(device, 0U, 1, false) {} + : InternalThread(device, 0U, 1, false, "TestThreadB") {} }; TEST_F(InternalThreadTest, TestRandomSeed) { diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp index 3a488b72a58..b008e6f4d1e 100644 --- a/src/caffe/util/cudnn.cpp +++ b/src/caffe/util/cudnn.cpp @@ -1,41 +1,6 @@ #ifdef USE_CUDNN #include "caffe/util/cudnn.hpp" -const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; -#if CUDNN_VERSION_MIN(6, 0, 1) - case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: - return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; -#endif - default: - break; - } - return "Unknown cudnn status"; -} - - namespace caffe { namespace cudnn { diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index d48c2b95031..6bce127d0cb 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -55,16 +55,16 @@ bool GPUMemory::Workspace::try_reserve(size_t size, int device) { } GPUMemory::Manager::Manager() : debug_(false), initialized_(false) { - const int count = Caffe::device_count(); - dev_info_.resize(count); - update_thresholds_.resize(count); } - void GPUMemory::Manager::init(const vector& gpus, bool debug) { if (initialized_) { return; } + const int count = Caffe::device_count(); + dev_info_.resize(count); + update_thresholds_.resize(count); + bool debug_env = getenv("DEBUG_GPU_MEM") != 0; debug_ = debug || debug_env; try { diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 525cf156139..71d94458fb4 100644 --- a/src/caffe/util/math_functions.cpp +++ 
b/src/caffe/util/math_functions.cpp @@ -369,6 +369,12 @@ template void caffe_rng_uniform(int n, Ftype a, Ftype b, Blob* blob) { CHECK_GE(n, 0); CHECK_LE(a, b); + + if (Caffe::mode() == Caffe::GPU && n >= 1000000) { + caffe_gpu_rng_uniform(n, a, b, blob->mutable_gpu_data_c(false)); + return; + } + boost::uniform_real random_distribution(a, caffe_nextafter(b)); boost::variate_generator > variate_generator(caffe_rng(), random_distribution); diff --git a/tools/caffe.cpp b/tools/caffe.cpp index f42c1022b01..6cc1cfec4cb 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -698,13 +698,14 @@ int main(int argc, char** argv) { get_gpus(&gpus); Caffe::SetDevice(gpus.size() > 0 ? gpus[0] : 0); Caffe::set_gpus(gpus); - - LOG(INFO) << "This is NVCaffe " << Caffe::caffe_version() - << " started at " << Caffe::start_time(); - LOG(INFO) << "CuDNN version: " << Caffe::cudnn_version(); - LOG(INFO) << "CuBLAS version: " << Caffe::cublas_version(); - LOG(INFO) << "CUDA version: " << Caffe::cuda_version(); - LOG(INFO) << "CUDA driver version: " << Caffe::cuda_driver_version(); + Caffe::Properties& props = Caffe::props(); + + LOG(INFO) << "This is NVCaffe " << props.caffe_version() + << " started at " << props.start_time(); + LOG(INFO) << "CuDNN version: " << props.cudnn_version(); + LOG(INFO) << "CuBLAS version: " << props.cublas_version(); + LOG(INFO) << "CUDA version: " << props.cuda_version(); + LOG(INFO) << "CUDA driver version: " << props.cuda_driver_version(); LOG(INFO) << "Arguments: " << os.str(); if (argc == 2) { From e3ffb453f5f83144e15f42b939e14119f326366f Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Sun, 3 May 2020 23:59:19 -0700 Subject: [PATCH 2/6] 0.17.4RC --- include/caffe/blob.hpp | 5 +++-- include/caffe/internal_thread.hpp | 3 ++- include/caffe/parallel.hpp | 2 +- python/requirements.txt | 3 ++- src/caffe/internal_thread.cpp | 3 ++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 6235272f64f..6efdc6701c3 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -54,7 +54,7 @@ class Blob { explicit Blob(Type dtype) : Blob(dtype, dtype) {} -public: + public: virtual ~Blob() = default; enum class RESHAPE_MODE: int { @@ -117,7 +117,8 @@ class Blob { /// @brief Creates an instance of a Blob with given type Dtype and given shape. 
template - static shared_ptr create(const vector& shape, RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH) { + static shared_ptr create(const vector& shape, + RESHAPE_MODE mode = RESHAPE_MODE::RESHAPE_BOTH) { shared_ptr ptr = create(); ptr->Reshape(shape, mode); return ptr; diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index a1e8b03346f..cdb65253d3e 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -18,7 +18,8 @@ namespace caffe { */ class InternalThread { public: - InternalThread(int target_device, size_t rank_, size_t threads, bool delayed, const std::string& name); + InternalThread(int target_device, size_t rank_, size_t threads, bool delayed, + const std::string& name); virtual ~InternalThread() = default; /** diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index d8cdf4cf439..bd633313610 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -79,7 +79,7 @@ class P2PManager { static unique_ptr solve_bar_; static unique_ptr solved_bar_; -protected: + protected: const size_t nranks_; vector> syncs_; shared_ptr> shared_; diff --git a/python/requirements.txt b/python/requirements.txt index d09f6e680f4..260e91635b4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,6 @@ Cython>=0.19.2 -numpy==1.14.5 +gfortran +numpy>=1.18.1 scipy>=0.13.3,<=0.17.0 scikit-image>=0.9.3 matplotlib>=1.5.2 diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index b30b2fa3bec..e7d12ee4cb0 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -7,7 +7,8 @@ namespace caffe { -InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed, const std::string& name) +InternalThread::InternalThread(int target_device, size_t rank, size_t threads, bool delayed, + const std::string& name) : target_device_(target_device), rank_(rank), lwp_id_(0), From f936ed65183a2d25337f5ac4223e6ff31249dc65 Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 4 May 2020 01:51:48 -0700 Subject: [PATCH 3/6] 0.17.4RC - Travis & Python 3 --- .travis.yml | 14 +++++++------- Makefile.config.example | 12 ++++++------ scripts/travis/defaults.sh | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6478c866904..ee1ee2f0b9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -dist: trusty +dist: bionic sudo: required language: cpp @@ -14,14 +14,14 @@ env: # WITH_PYTHON3: false # WITH_CUDA: const true since v0.17 # WITH_CUDNN: false - - BUILD_NAME="default-make" -# - BUILD_NAME="python3-make" WITH_PYTHON3=true - - BUILD_NAME="cudnn-make" WITH_CUDNN=true +# - BUILD_NAME="default-make" + - BUILD_NAME="python3-make" WITH_PYTHON3=true + - BUILD_NAME="python3-cudnn-make" WITH_PYTHON3=true WITH_CUDNN=true - - BUILD_NAME="default-cmake" WITH_CMAKE=true +# - BUILD_NAME="default-cmake" WITH_CMAKE=true - BUILD_NAME="python3-cmake" WITH_CMAKE=true WITH_PYTHON3=true - - BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDNN=true - - BUILD_NAME="cudnn-python3-cmake" WITH_CMAKE=true WITH_CUDNN=true WITH_PYTHON3=true +# - BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDNN=true + - BUILD_NAME="python3-cudnn-cmake" WITH_CMAKE=true WITH_CUDNN=true WITH_PYTHON3=true cache: apt: true diff --git a/Makefile.config.example b/Makefile.config.example index 43ef562002e..0b7ba1ccc88 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -59,8 +59,8 @@ BLAS_LIB := /opt/OpenBLAS/lib/ # 
NOTE: this is required only if you will compile the python interface. # We need to be able to find Python.h and numpy/arrayobject.h. -PYTHON_INCLUDE := /usr/include/python2.7 \ - /usr/lib/python2.7/dist-packages/numpy/core/include +#PYTHON_INCLUDE := /usr/include/python2.7 \ +# /usr/lib/python2.7/dist-packages/numpy/core/include # Anaconda Python distribution is quite popular. Include path: # Verify anaconda location, sometimes it's in root. # ANACONDA_HOME := $(HOME)/anaconda @@ -69,9 +69,9 @@ PYTHON_INCLUDE := /usr/include/python2.7 \ # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ # Uncomment to use Python 3 (default is Python 2) -# PYTHON_LIBRARIES := boost_python3 python3.5m -# PYTHON_INCLUDE := /usr/include/python3.5m \ -# /usr/lib/python3.5/dist-packages/numpy/core/include +PYTHON_LIBRARIES := boost_python3 python3.6m +PYTHON_INCLUDE := /usr/include/python3.6m \ + /usr/lib/python3.6/dist-packages/numpy/core/include # We need to be able to find libpythonX.X.so or .dylib. PYTHON_LIB := /usr/lib @@ -82,7 +82,7 @@ PYTHON_LIB := /usr/lib # PYTHON_LIB += $(shell brew --prefix numpy)/lib # Uncomment to support layers written in Python (will link against Python libs) -# WITH_PYTHON_LAYER := 1 +WITH_PYTHON_LAYER := 1 # Whatever else you find you need goes here. INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial diff --git a/scripts/travis/defaults.sh b/scripts/travis/defaults.sh index 5d546ce4fbd..1ef35bf5a17 100755 --- a/scripts/travis/defaults.sh +++ b/scripts/travis/defaults.sh @@ -4,6 +4,6 @@ set -e WITH_CMAKE=${WITH_CMAKE:-false} -WITH_PYTHON3=${WITH_PYTHON3:-false} +WITH_PYTHON3=${WITH_PYTHON3:-true} WITH_CUDA=${WITH_CUDA:-true} WITH_CUDNN=${WITH_CUDNN:-false} From 7501483f83199d4870b2a565e889950325477f0c Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 4 May 2020 02:24:26 -0700 Subject: [PATCH 4/6] 0.17.4RC - Travis & Python 3 --- scripts/travis/configure-make.sh | 4 ++-- scripts/travis/install-deps.sh | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/scripts/travis/configure-make.sh b/scripts/travis/configure-make.sh index f5a0be1cf81..2bdcb74746d 100644 --- a/scripts/travis/configure-make.sh +++ b/scripts/travis/configure-make.sh @@ -17,8 +17,8 @@ LINE "WITH_PYTHON_LAYER := 1" if $WITH_PYTHON3 ; then # TODO(lukeyeager) this path is currently disabled because of test errors like: # ImportError: dynamic module does not define init function (PyInit__caffe) - LINE "PYTHON_LIBRARIES := python3.4m boost_python-py34" - LINE "PYTHON_INCLUDE := /usr/include/python3.4 /usr/lib/python3/dist-packages/numpy/core/include" + LINE "PYTHON_LIBRARIES := python3.6m boost_python-py36" + LINE "PYTHON_INCLUDE := /usr/include/python3.6 /usr/lib/python3/dist-packages/numpy/core/include" LINE "INCLUDE_DIRS := \$(INCLUDE_DIRS) \$(PYTHON_INCLUDE)" fi diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 0d7caf045dd..c346fef6fdd 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -19,6 +19,7 @@ apt-get install -y --no-install-recommends \ libhdf5-serial-dev \ libopenblas-dev \ libturbojpeg \ + libturbojpeg0-dev \ python-virtualenv \ wget @@ -82,16 +83,16 @@ fi if $WITH_CUDA ; then # install repo packages - CUDA_REPO_PKG=cuda-repo-ubuntu1404_8.0.61-1_amd64.deb - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/$CUDA_REPO_PKG + CUDA_REPO_PKG=cuda-10-2_10.2.89-1_amd64.deb + wget 
http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/$CUDA_REPO_PKG dpkg -i $CUDA_REPO_PKG rm $CUDA_REPO_PKG if $WITH_CUDNN ; then - ML_REPO_PKG=libcudnn7_7.0.5.15-1+cuda8.0_amd64.deb - ML_REPO_PKGD=libcudnn7-dev_7.0.5.15-1+cuda8.0_amd64.deb - wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKG - wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKGD + ML_REPO_PKG=libcudnn7_7.6.5.32-1+cuda10.2_amd64.deb + ML_REPO_PKGD=libcudnn7-dev_7.6.5.32-1+cuda10.2_amd64.deb + wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/$ML_REPO_PKG + wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/$ML_REPO_PKGD dpkg -i $ML_REPO_PKG $ML_REPO_PKGD fi @@ -99,8 +100,8 @@ if $WITH_CUDA ; then apt-get -y update # install packages - CUDA_PKG_VERSION="8-0" - CUDA_VERSION="8.0" + CUDA_PKG_VERSION="10-2" + CUDA_VERSION="10.2" apt-get install -y --no-install-recommends \ cuda-core-$CUDA_PKG_VERSION \ cuda-cudart-dev-$CUDA_PKG_VERSION \ @@ -110,8 +111,8 @@ if $WITH_CUDA ; then # manually create CUDA symlink ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda - if $WITH_CUDNN ; then - apt-get install -y --no-install-recommends libcudnn7 libcudnn7-dev - fi +# if $WITH_CUDNN ; then +# apt-get install -y --no-install-recommends libcudnn7 libcudnn7-dev +# fi fi From 60c9409482197cc7bdd7d0d48ff66122ef05ad3e Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Mon, 4 May 2020 02:34:11 -0700 Subject: [PATCH 5/6] 0.17.4RC - Travis & Python 3 --- scripts/travis/install-deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index c346fef6fdd..5a520d44a94 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -24,7 +24,7 @@ apt-get install -y --no-install-recommends \ wget # package bug WAR: -ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so +#ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so if $WITH_CMAKE ; then apt-get install -y --no-install-recommends cmake From 4c7481c50e3fbe909f316d24b05a0e78f193a90b Mon Sep 17 00:00:00 2001 From: Sergei Nikolaev Date: Fri, 8 Jan 2021 22:38:11 -0800 Subject: [PATCH 6/6] 0.17.4 --- CMakeLists.txt | 4 +- Makefile | 10 ++- Makefile.config.example | 6 +- cmake/Cuda.cmake | 2 +- include/caffe/macros.hpp | 2 +- include/caffe/util/io.hpp | 3 + src/caffe/layers/cudnn_conv_layer.cpp | 81 +++++++++++++++---- src/caffe/layers/cudnn_deconv_layer.cpp | 100 ++++++++---------------- src/caffe/util/bbox_util.cu | 7 +- 9 files changed, 118 insertions(+), 97 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f77e45af379..51d7587be33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ include(cmake/Dependencies.cmake) # ---[ Flags if(UNIX OR APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++14") endif() caffe_set_caffe_link() @@ -114,7 +114,7 @@ add_custom_target(lint COMMAND ${CMAKE_COMMAND} -P ${PROJECT_SOURCE_DIR}/cmake/l # ---[ pytest target if(BUILD_python) add_custom_target(pytest COMMAND python${python_version} -m unittest discover -s caffe/test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/python ) - add_dependencies(pytest pycaffe) +# add_dependencies(pytest pycaffe) endif() # ---[ Configuration summary 
diff --git a/Makefile b/Makefile index b08c91b8d68..5983a473d60 100644 --- a/Makefile +++ b/Makefile @@ -27,25 +27,27 @@ endif THIRDPARTY_DIR := ./3rdparty # All of the directories containing code. -SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \ +SRC_DIRS := $(shell find src python tools examples -type d -exec bash -c "find {} -maxdepth 1 \ \( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print 2>/dev/null) + # The target shared library name LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX) LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 DYNAMIC_VERSION_MINOR := 17 -DYNAMIC_VERSION_REVISION := 3 +DYNAMIC_VERSION_REVISION := 4 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT) COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) -# NVCaffe requires C++ 11 -COMMON_FLAGS += -std=c++11 +# NVCaffe requires C++ 14 +COMMON_FLAGS += -std=c++14 COMMON_FLAGS += -DCUDA_NO_HALF + ############################## # Get all source files ############################## diff --git a/Makefile.config.example b/Makefile.config.example index a8c6a3f383e..1d0147b7bc1 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -69,9 +69,9 @@ BLAS_LIB := /opt/OpenBLAS/lib/ # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ # Uncomment to use Python 3 (default is Python 2) -PYTHON_LIBRARIES := boost_python3 python3.6m -PYTHON_INCLUDE := /usr/include/python3.6m \ - /usr/lib/python3.6/dist-packages/numpy/core/include +PYTHON_LIBRARIES := boost_python38 python3.8 +PYTHON_INCLUDE := /usr/include/python3.8 \ + /usr/lib/python3.8/dist-packages/numpy/core/include # We need to be able to find libpythonX.X.so or .dylib. 
PYTHON_LIB := /usr/lib diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 0e755d05282..dbc36a278c9 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -254,7 +254,7 @@ if(USE_CUDNN) endif() if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS -std=c++11;-Xcompiler;-fPIC) + list(APPEND CUDA_NVCC_FLAGS -std=c++14;-Xcompiler;-fPIC) endif() if(APPLE) diff --git a/include/caffe/macros.hpp b/include/caffe/macros.hpp index 9d24c985659..6816f26da0c 100644 --- a/include/caffe/macros.hpp +++ b/include/caffe/macros.hpp @@ -7,7 +7,7 @@ ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) #endif -#if BOOST_VERSION >= 106100 +#if (BOOST_VERSION >= 106100) && !defined(BOOST_CUDA_VERSION) // error: class "boost::common_type" has no member "type" #define BOOST_NO_CXX11_VARIADIC_TEMPLATES #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__) diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index ec98f5a43c0..e01a3918411 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -323,11 +323,14 @@ void FloatCVMatToBuf(const cv::Mat& cv_img, size_t buf_len, Dtype* buf, bool rep LOG(FATAL) << "Image depth is not supported"; } } else { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" if (cv_img.depth() == CV_32F && tp() == FLOAT) { std::memcpy(buf, cv_img.ptr(0), img_size * sizeof(float)); // NOLINT(caffe/alt_fn) } else if (cv_img.depth() == CV_64F && tp() == DOUBLE) { std::memcpy(buf, cv_img.ptr(0), img_size * sizeof(double)); // NOLINT(caffe/alt_fn) } else { +#pragma GCC diagnostic pop if (cv_img.depth() == CV_8U) { for (size_t i = 0UL; i < img_size; ++i) { buf[i] = static_cast<Dtype>(cv_img.ptr(0)[i]); diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 891b14f4b6c..d8784e38c8a 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -469,29 +469,80 @@ template <typename Ftype, typename Btype> void CuDNNConvolutionLayer<Ftype, Btype>::GetConvAlgo(const vector<Blob*>& bottom, const vector<Blob*>& top, const size_t workspace_bytes, int pad_h, int pad_w, int stride_h, int stride_w) { + const size_t limit_per_group = align_down<8>(workspace_bytes / ws_groups()); + int returnedAlgoCount = 0; for (int i = 0; i < bottom.size(); ++i) { - // Get backward data algorithm (if not set by user) - if (user_algos_override_[1] < 0) { - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), - bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &bwd_data_algo_[i])); - } // Get forward algorithm (if not set by user) if (user_algos_override_[0] < 0) { + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionFwdAlgoPerf_t> perfResults(count); + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(Caffe::cudnn_handle(0), fwd_bottom_descs_[i], fwd_filter_desc_, fwd_conv_descs_[i], fwd_top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &fwd_algo_[i])); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); + count, &returnedAlgoCount, &perfResults.front())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <=
limit_per_group) { + fwd_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find forward algorithm with memory limit " + << limit_per_group << " bytes per group"; + } + } + // Get backward data algorithm (if not set by user) + if (user_algos_override_[1] < 0) { + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perfResults(count); + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), + bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], + count, &returnedAlgoCount, &perfResults.front())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + bwd_data_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find backward data algorithm with memory limit " + << limit_per_group << " bytes per group"; + } } // Get backward filter algorithm (if not set by user) if (user_algos_override_[2] < 0) { - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perfResults(count); + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), bwd_bottom_descs_[i], bwd_top_descs_[i], bwd_conv_filter_descs_[i], bwd_filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i])); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); + count, &returnedAlgoCount, &perfResults.front())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + bwd_filter_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find backward filter algorithm with memory limit " + << limit_per_group << " bytes per group"; + } } LOG(INFO) << Phase_Name(this->phase_) << " Conv Algos by Get* (F,BD,BF) for layer '" << this->name() diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp index 6834dfda3b6..6cf3898a502 100644 --- a/src/caffe/layers/cudnn_deconv_layer.cpp +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ -110,10 +110,6 @@ void CuDNNDeconvolutionLayer::Reshape( const int stride_h = stride_data[0]; const int stride_w = stride_data[1]; - // Specify workspace limit for kernels directly until we have a - // planning strategy and a rewrite of Caffe's GPU memory mangagement - size_t workspace_limit_bytes = 8*1024*1024; - for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], this->num_, @@ -135,96 +131,62 @@ void CuDNNDeconvolutionLayer::Reshape( 1); cudnn::setConvolutionDesc(forward_math_, conv_descs_[i], -// top_descs_[i], -// filter_desc_, pad_h, pad_w, stride_h, stride_w, 1, 1); + int returnedAlgoCount = 0; + cudnnConvolutionFwdAlgoPerf_t perfFResults; // choose forward and backward algorithms + workspace(s) - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( - handle_[0], - top_descs_[i], - filter_desc_, - conv_descs_[i], - bottom_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, -
&fwd_algo_[i])); - - // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is - // buggy. Thus, if this algo was chosen, choose winograd instead. If - // winograd is not supported or workspace is larger than threshold, choose - // implicit_gemm instead. -// if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { -// size_t winograd_workspace_size; -// cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize( -// handle_[0], -// top_descs_[i], -// filter_desc_, -// conv_descs_[i], -// bottom_descs_[i], -// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, -// &winograd_workspace_size); -// if (status != CUDNN_STATUS_SUCCESS || -// winograd_workspace_size >= workspace_limit_bytes) { -// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; -// } else { -// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; -// } -// } - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( handle_[0], top_descs_[i], filter_desc_, conv_descs_[i], bottom_descs_[i], - fwd_algo_[i], - &(workspace_fwd_sizes_[i]))); + 1, + &returnedAlgoCount, + &perfFResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + fwd_algo_[i] = perfFResults.algo; + workspace_fwd_sizes_[i] = perfFResults.memory; // choose backward algorithm for filter - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnConvolutionBwdFilterAlgoPerf_t perfBFResults; + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( handle_[0], top_descs_[i], bottom_descs_[i], conv_descs_[i], filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &bwd_filter_algo_[i])); - - // get workspace for backwards filter algorithm - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle_[0], - top_descs_[i], - bottom_descs_[i], - conv_descs_[i], - filter_desc_, - bwd_filter_algo_[i], - &workspace_bwd_filter_sizes_[i])); + 1, + &returnedAlgoCount, + &perfBFResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bwd_filter_algo_[i] = perfBFResults.algo; + workspace_bwd_filter_sizes_[i] = perfBFResults.memory; // choose backward algo for data - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( - handle_[0], - filter_desc_, - bottom_descs_[i], - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &bwd_data_algo_[i])); - - // get workspace size - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnConvolutionBwdDataAlgoPerf_t perfBDResults; + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( handle_[0], filter_desc_, bottom_descs_[i], conv_descs_[i], top_descs_[i], - bwd_data_algo_[i], - &workspace_bwd_data_sizes_[i])); + 1, + &returnedAlgoCount, + &perfBDResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bwd_data_algo_[i] = perfBDResults.algo; + workspace_bwd_data_sizes_[i] = perfBDResults.memory; } // reduce over all workspace sizes to get a maximum to allocate / reallocate diff --git a/src/caffe/util/bbox_util.cu b/src/caffe/util/bbox_util.cu index 3d7c8d7783b..798a4d3b128 100644 --- a/src/caffe/util/bbox_util.cu +++ b/src/caffe/util/bbox_util.cu @@ -3,8 +3,11 @@ #include #include -#include "thrust/functional.h" -#include "thrust/sort.h" +#if !defined(THRUST_IGNORE_CUB_VERSION_CHECK) +#define THRUST_IGNORE_CUB_VERSION_CHECK 1 +#endif +#include <thrust/functional.h> +#include <thrust/sort.h> #include "caffe/common.hpp"
#include "caffe/util/bbox_util.hpp"