Skip to content

Commit

Permalink
0.17.4
Browse files Browse the repository at this point in the history
  • Loading branch information
drnikolaev committed Jan 9, 2021
1 parent 85e10c0 commit 4c7481c
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 97 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ include(cmake/Dependencies.cmake)

# ---[ Flags
if(UNIX OR APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++14")
endif()

caffe_set_caffe_link()
Expand Down Expand Up @@ -114,7 +114,7 @@ add_custom_target(lint COMMAND ${CMAKE_COMMAND} -P ${PROJECT_SOURCE_DIR}/cmake/l
# ---[ pytest target
if(BUILD_python)
add_custom_target(pytest COMMAND python${python_version} -m unittest discover -s caffe/test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/python )
add_dependencies(pytest pycaffe)
# add_dependencies(pytest pycaffe)
endif()

# ---[ Configuration summary
Expand Down
10 changes: 6 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,27 @@ endif
THIRDPARTY_DIR := ./3rdparty

# All of the directories containing code.
SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \
SRC_DIRS := $(shell find src python tools examples -type d -exec bash -c "find {} -maxdepth 1 \
\( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print 2>/dev/null)


# The target shared library name
LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX)
LIB_BUILD_DIR := $(BUILD_DIR)/lib
STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
DYNAMIC_VERSION_MAJOR := 0
DYNAMIC_VERSION_MINOR := 17
DYNAMIC_VERSION_REVISION := 3
DYNAMIC_VERSION_REVISION := 4
DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR)
DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION)
DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT)
COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
# NVCaffe requires C++ 11
COMMON_FLAGS += -std=c++11
# NVCaffe requires C++ 14
COMMON_FLAGS += -std=c++14
COMMON_FLAGS += -DCUDA_NO_HALF


##############################
# Get all source files
##############################
Expand Down
6 changes: 3 additions & 3 deletions Makefile.config.example
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ BLAS_LIB := /opt/OpenBLAS/lib/
# $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \

# Uncomment to use Python 3 (default is Python 2)
PYTHON_LIBRARIES := boost_python3 python3.6m
PYTHON_INCLUDE := /usr/include/python3.6m \
/usr/lib/python3.6/dist-packages/numpy/core/include
PYTHON_LIBRARIES := boost_python38 python3.8
PYTHON_INCLUDE := /usr/include/python3.8 \
/usr/lib/python3.8/dist-packages/numpy/core/include

# We need to be able to find libpythonX.X.so or .dylib.
PYTHON_LIB := /usr/lib
Expand Down
2 changes: 1 addition & 1 deletion cmake/Cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ if(USE_CUDNN)
endif()

if(UNIX OR APPLE)
list(APPEND CUDA_NVCC_FLAGS -std=c++11;-Xcompiler;-fPIC)
list(APPEND CUDA_NVCC_FLAGS -std=c++14;-Xcompiler;-fPIC)
endif()

if(APPLE)
Expand Down
2 changes: 1 addition & 1 deletion include/caffe/macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#endif

#if BOOST_VERSION >= 106100
#if (BOOST_VERSION >= 106100) && !defined(BOOST_CUDA_VERSION)
// error: class "boost::common_type<long, long>" has no member "type"
#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__)
Expand Down
3 changes: 3 additions & 0 deletions include/caffe/util/io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,14 @@ void FloatCVMatToBuf(const cv::Mat& cv_img, size_t buf_len, Dtype* buf, bool rep
LOG(FATAL) << "Image depth is not supported";
}
} else {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
if (cv_img.depth() == CV_32F && tp<Dtype>() == FLOAT) {
std::memcpy(buf, cv_img.ptr<float>(0), img_size * sizeof(float)); // NOLINT(caffe/alt_fn)
} else if (cv_img.depth() == CV_64F && tp<Dtype>() == DOUBLE) {
std::memcpy(buf, cv_img.ptr<double>(0), img_size * sizeof(double)); // NOLINT(caffe/alt_fn)
} else {
#pragma GCC diagnostic pop
if (cv_img.depth() == CV_8U) {
for (size_t i = 0UL; i < img_size; ++i) {
buf[i] = static_cast<Dtype>(cv_img.ptr<unsigned char>(0)[i]);
Expand Down
81 changes: 66 additions & 15 deletions src/caffe/layers/cudnn_conv_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,29 +469,80 @@ template <typename Ftype, typename Btype>
void CuDNNConvolutionLayer<Ftype, Btype>::GetConvAlgo(const vector<Blob*>& bottom,
const vector<Blob*>& top, const size_t workspace_bytes, int pad_h, int pad_w,
int stride_h, int stride_w) {
const size_t limit_per_group = align_down<8>(workspace_bytes / ws_groups());
int returnedAlgoCount = 0;
for (int i = 0; i < bottom.size(); ++i) {
// Get backward data algorithm (if not set by user)
if (user_algos_override_[1] < 0) {
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0),
bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
align_down<8>(workspace_bytes / ws_groups()), &bwd_data_algo_[i]));
}
// Get forward algorithm (if not set by user)
if (user_algos_override_[0] < 0) {
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(0),
int count = 0;
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(Caffe::cudnn_handle(0), &count));
std::vector<cudnnConvolutionFwdAlgoPerf_t> perfResults;
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(Caffe::cudnn_handle(0),
fwd_bottom_descs_[i], fwd_filter_desc_, fwd_conv_descs_[i], fwd_top_descs_[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
align_down<8>(workspace_bytes / ws_groups()), &fwd_algo_[i]));
CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0)));
count, &returnedAlgoCount, &perfResults.front()));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bool found = false;
for (int a = 0; a < returnedAlgoCount; ++a) {
if (perfResults[a].memory <= limit_per_group) {
fwd_algo_[i] = perfResults[a].algo;
found = true;
break;
}
}
if (!found) {
LOG(FATAL) << "Can't find forward algorithm with memory limit "
<< limit_per_group << " bytes per group";
}
}
// Get backward data algorithm (if not set by user)
if (user_algos_override_[1] < 0) {
int count = 0;
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(Caffe::cudnn_handle(0), &count));
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perfResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0),
bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i],
count, &returnedAlgoCount, &perfResults.front()));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bool found = false;
for (int a = 0; a < returnedAlgoCount; ++a) {
if (perfResults[a].memory <= limit_per_group) {
bwd_data_algo_[i] = perfResults[a].algo;
found = true;
break;
}
}
if (!found) {
LOG(FATAL) << "Can't find backward data algorithm with memory limit "
<< limit_per_group << " bytes per group";
}
}
// Get backward filter algorithm (if not set by user)
if (user_algos_override_[2] < 0) {
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0),
int count = 0;
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(Caffe::cudnn_handle(0), &count));
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perfResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0),
bwd_bottom_descs_[i], bwd_top_descs_[i], bwd_conv_filter_descs_[i], bwd_filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
align_down<8>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i]));
CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0)));
count, &returnedAlgoCount, &perfResults.front()));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bool found = false;
for (int a = 0; a < returnedAlgoCount; ++a) {
if (perfResults[a].memory <= limit_per_group) {
bwd_filter_algo_[i] = perfResults[a].algo;
found = true;
break;
}
}
if (!found) {
LOG(FATAL) << "Can't find backward filter algorithm with memory limit "
<< limit_per_group << " bytes per group";
}
}
LOG(INFO) << Phase_Name(this->phase_)
<< " Conv Algos by Get* (F,BD,BF) for layer '" << this->name()
Expand Down
100 changes: 31 additions & 69 deletions src/caffe/layers/cudnn_deconv_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ void CuDNNDeconvolutionLayer<Ftype, Btype>::Reshape(
const int stride_h = stride_data[0];
const int stride_w = stride_data[1];

// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory management
size_t workspace_limit_bytes = 8*1024*1024;

for (int i = 0; i < bottom.size(); i++) {
cudnn::setTensor4dDesc<Btype>(&bottom_descs_[i],
this->num_,
Expand All @@ -135,96 +131,62 @@ void CuDNNDeconvolutionLayer<Ftype, Btype>::Reshape(
1);
cudnn::setConvolutionDesc(forward_math_,
conv_descs_[i],
// top_descs_[i],
// filter_desc_,
pad_h,
pad_w,
stride_h,
stride_w, 1, 1);

int returnedAlgoCount = 0;
cudnnConvolutionFwdAlgoPerf_t perfFResults;
// choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
handle_[0],
top_descs_[i],
filter_desc_,
conv_descs_[i],
bottom_descs_[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&fwd_algo_[i]));

// We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
// buggy. Thus, if this algo was chosen, choose winograd instead. If
// winograd is not supported or workspace is larger than threshold, choose
// implicit_gemm instead.
// if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
// size_t winograd_workspace_size;
// cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
// handle_[0],
// top_descs_[i],
// filter_desc_,
// conv_descs_[i],
// bottom_descs_[i],
// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
// &winograd_workspace_size);
// if (status != CUDNN_STATUS_SUCCESS ||
// winograd_workspace_size >= workspace_limit_bytes) {
// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// } else {
// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
// }
// }

CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
handle_[0],
top_descs_[i],
filter_desc_,
conv_descs_[i],
bottom_descs_[i],
fwd_algo_[i],
&(workspace_fwd_sizes_[i])));
1,
&returnedAlgoCount,
&perfFResults));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
fwd_algo_[i] = perfFResults.algo;
workspace_fwd_sizes_[i] = perfFResults.memory;

// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
cudnnConvolutionBwdFilterAlgoPerf_t perfBFResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
handle_[0],
top_descs_[i],
bottom_descs_[i],
conv_descs_[i],
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&bwd_filter_algo_[i]));

// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle_[0],
top_descs_[i],
bottom_descs_[i],
conv_descs_[i],
filter_desc_,
bwd_filter_algo_[i],
&workspace_bwd_filter_sizes_[i]));
1,
&returnedAlgoCount,
&perfBFResults));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bwd_filter_algo_[i] = perfBFResults.algo;
workspace_bwd_filter_sizes_[i] = perfBFResults.memory;

// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
handle_[0],
filter_desc_,
bottom_descs_[i],
conv_descs_[i],
top_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&bwd_data_algo_[i]));

// get workspace size
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnConvolutionBwdDataAlgoPerf_t perfBDResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
handle_[0],
filter_desc_,
bottom_descs_[i],
conv_descs_[i],
top_descs_[i],
bwd_data_algo_[i],
&workspace_bwd_data_sizes_[i]));
1,
&returnedAlgoCount,
&perfBDResults));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bwd_data_algo_[i] = perfBDResults.algo;
workspace_bwd_data_sizes_[i] = perfBDResults.memory;
}

// reduce over all workspace sizes to get a maximum to allocate / reallocate
Expand Down
7 changes: 5 additions & 2 deletions src/caffe/util/bbox_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
#include <map>
#include <vector>

#include "thrust/functional.h"
#include "thrust/sort.h"
#if !defined(THRUST_IGNORE_CUB_VERSION_CHECK)
#define THRUST_IGNORE_CUB_VERSION_CHECK 1
#endif
#include <thrust/functional.h>
#include <thrust/sort.h>

#include "caffe/common.hpp"
#include "caffe/util/bbox_util.hpp"
Expand Down

0 comments on commit 4c7481c

Please sign in to comment.