diff --git a/CMakeLists.txt b/CMakeLists.txt index f77e45af379..51d7587be33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ include(cmake/Dependencies.cmake) # ---[ Flags if(UNIX OR APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++14") endif() caffe_set_caffe_link() @@ -114,7 +114,7 @@ add_custom_target(lint COMMAND ${CMAKE_COMMAND} -P ${PROJECT_SOURCE_DIR}/cmake/l # ---[ pytest target if(BUILD_python) add_custom_target(pytest COMMAND python${python_version} -m unittest discover -s caffe/test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/python ) - add_dependencies(pytest pycaffe) +# add_dependencies(pytest pycaffe) endif() # ---[ Configuration summary diff --git a/Makefile b/Makefile index b08c91b8d68..5983a473d60 100644 --- a/Makefile +++ b/Makefile @@ -27,25 +27,27 @@ endif THIRDPARTY_DIR := ./3rdparty # All of the directories containing code. -SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \ +SRC_DIRS := $(shell find src python tools examples -type d -exec bash -c "find {} -maxdepth 1 \ \( -name '*.cpp' -o -name '*.proto' \) | grep -q ." 
\; -print 2>/dev/null) + # The target shared library name LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX) LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 0 DYNAMIC_VERSION_MINOR := 17 -DYNAMIC_VERSION_REVISION := 3 +DYNAMIC_VERSION_REVISION := 4 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION) DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT) COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) -# NVCaffe requires C++ 11 -COMMON_FLAGS += -std=c++11 +# NVCaffe requires C++ 14 +COMMON_FLAGS += -std=c++14 COMMON_FLAGS += -DCUDA_NO_HALF + ############################## # Get all source files ############################## diff --git a/Makefile.config.example b/Makefile.config.example index a8c6a3f383e..1d0147b7bc1 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -69,9 +69,9 @@ BLAS_LIB := /opt/OpenBLAS/lib/ # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ # Uncomment to use Python 3 (default is Python 2) -PYTHON_LIBRARIES := boost_python3 python3.6m -PYTHON_INCLUDE := /usr/include/python3.6m \ - /usr/lib/python3.6/dist-packages/numpy/core/include +PYTHON_LIBRARIES := boost_python38 python3.8 +PYTHON_INCLUDE := /usr/include/python3.8 \ + /usr/lib/python3.8/dist-packages/numpy/core/include # We need to be able to find libpythonX.X.so or .dylib. 
PYTHON_LIB := /usr/lib diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 0e755d05282..dbc36a278c9 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -254,7 +254,7 @@ if(USE_CUDNN) endif() if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS -std=c++11;-Xcompiler;-fPIC) + list(APPEND CUDA_NVCC_FLAGS -std=c++14;-Xcompiler;-fPIC) endif() if(APPLE) diff --git a/include/caffe/macros.hpp b/include/caffe/macros.hpp index 9d24c985659..6816f26da0c 100644 --- a/include/caffe/macros.hpp +++ b/include/caffe/macros.hpp @@ -7,7 +7,7 @@ ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) #endif -#if BOOST_VERSION >= 106100 +#if (BOOST_VERSION >= 106100) && !defined(BOOST_CUDA_VERSION) // error: class "boost::common_type" has no member "type" #define BOOST_NO_CXX11_VARIADIC_TEMPLATES #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__) diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index ec98f5a43c0..e01a3918411 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -323,11 +323,14 @@ void FloatCVMatToBuf(const cv::Mat& cv_img, size_t buf_len, Dtype* buf, bool rep LOG(FATAL) << "Image depth is not supported"; } } else { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" if (cv_img.depth() == CV_32F && tp() == FLOAT) { std::memcpy(buf, cv_img.ptr(0), img_size * sizeof(float)); // NOLINT(caffe/alt_fn) } else if (cv_img.depth() == CV_64F && tp() == DOUBLE) { std::memcpy(buf, cv_img.ptr(0), img_size * sizeof(double)); // NOLINT(caffe/alt_fn) } else { +#pragma GCC diagnostic pop if (cv_img.depth() == CV_8U) { for (size_t i = 0UL; i < img_size; ++i) { buf[i] = static_cast(cv_img.ptr(0)[i]); diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 891b14f4b6c..d8784e38c8a 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -469,29 +469,80 @@ template void 
CuDNNConvolutionLayer::GetConvAlgo(const vector& bottom, const vector& top, const size_t workspace_bytes, int pad_h, int pad_w, int stride_h, int stride_w) { + const size_t limit_per_group = align_down<8>(workspace_bytes / ws_groups()); + int returnedAlgoCount = 0; for (int i = 0; i < bottom.size(); ++i) { - // Get backward data algorithm (if not set by user) - if (user_algos_override_[1] < 0) { - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), - bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &bwd_data_algo_[i])); - } // Get forward algorithm (if not set by user) if (user_algos_override_[0] < 0) { - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(0), + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionFwdAlgoPerf_t> perfResults(count);  // Find* output buffer + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(Caffe::cudnn_handle(0), fwd_bottom_descs_[i], fwd_filter_desc_, fwd_conv_descs_[i], fwd_top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &fwd_algo_[i])); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); + count, &returnedAlgoCount, perfResults.data())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + fwd_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find forward algorithm with memory limit " + << limit_per_group << " bytes per group"; + } + } + // Get backward data algorithm (if not set by user) + if (user_algos_override_[1] < 0) { + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); +
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perfResults(count);  // Find* output buffer + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0), + bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i], + count, &returnedAlgoCount, perfResults.data())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + bwd_data_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find backward data algorithm with memory limit " + << limit_per_group << " bytes per group"; + } } // Get backward filter algorithm (if not set by user) if (user_algos_override_[2] < 0) { - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), + int count = 0; + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(Caffe::cudnn_handle(0), &count)); + std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perfResults(count);  // Find* output buffer + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0), bwd_bottom_descs_[i], bwd_top_descs_[i], bwd_conv_filter_descs_[i], bwd_filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - align_down<8>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i])); - CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0))); + count, &returnedAlgoCount, perfResults.data())); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bool found = false; + for (int a = 0; a < returnedAlgoCount; ++a) { + if (perfResults[a].memory <= limit_per_group) { + bwd_filter_algo_[i] = perfResults[a].algo; + found = true; + break; + } + } + if (!found) { + LOG(FATAL) << "Can't find backward filter algorithm with memory limit " + << limit_per_group << " bytes per group"; + } } LOG(INFO) << Phase_Name(this->phase_) << " Conv Algos by Get* (F,BD,BF) for layer '" << this->name() diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp
b/src/caffe/layers/cudnn_deconv_layer.cpp index 6834dfda3b6..6cf3898a502 100644 --- a/src/caffe/layers/cudnn_deconv_layer.cpp +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ -110,10 +110,6 @@ void CuDNNDeconvolutionLayer::Reshape( const int stride_h = stride_data[0]; const int stride_w = stride_data[1]; - // Specify workspace limit for kernels directly until we have a - // planning strategy and a rewrite of Caffe's GPU memory mangagement - size_t workspace_limit_bytes = 8*1024*1024; - for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], this->num_, @@ -135,96 +131,62 @@ void CuDNNDeconvolutionLayer::Reshape( 1); cudnn::setConvolutionDesc(forward_math_, conv_descs_[i], -// top_descs_[i], -// filter_desc_, pad_h, pad_w, stride_h, stride_w, 1, 1); + int returnedAlgoCount = 0; + cudnnConvolutionFwdAlgoPerf_t perfFResults; // choose forward and backward algorithms + workspace(s) - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( - handle_[0], - top_descs_[i], - filter_desc_, - conv_descs_[i], - bottom_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &fwd_algo_[i])); - - // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is - // buggy. Thus, if this algo was chosen, choose winograd instead. If - // winograd is not supported or workspace is larger than threshold, choose - // implicit_gemm instead. 
-// if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { -// size_t winograd_workspace_size; -// cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize( -// handle_[0], -// top_descs_[i], -// filter_desc_, -// conv_descs_[i], -// bottom_descs_[i], -// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, -// &winograd_workspace_size); -// if (status != CUDNN_STATUS_SUCCESS || -// winograd_workspace_size >= workspace_limit_bytes) { -// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; -// } else { -// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; -// } -// } - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( handle_[0], top_descs_[i], filter_desc_, conv_descs_[i], bottom_descs_[i], - fwd_algo_[i], - &(workspace_fwd_sizes_[i]))); + 1, + &returnedAlgoCount, + &perfFResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + fwd_algo_[i] = perfFResults.algo; + workspace_fwd_sizes_[i] = perfFResults.memory; // choose backward algorithm for filter - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnConvolutionBwdFilterAlgoPerf_t perfBFResults; + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( handle_[0], top_descs_[i], bottom_descs_[i], conv_descs_[i], filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &bwd_filter_algo_[i])); - - // get workspace for backwards filter algorithm - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle_[0], - top_descs_[i], - bottom_descs_[i], - conv_descs_[i], - filter_desc_, - bwd_filter_algo_[i], - &workspace_bwd_filter_sizes_[i])); + 1, + &returnedAlgoCount, + &perfBFResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bwd_filter_algo_[i] = perfBFResults.algo; + workspace_bwd_filter_sizes_[i] = perfBFResults.memory; // choose backward algo for data - 
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( - handle_[0], - filter_desc_, - bottom_descs_[i], - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &bwd_data_algo_[i])); - - // get workspace size - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnConvolutionBwdDataAlgoPerf_t perfBDResults; + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( handle_[0], filter_desc_, bottom_descs_[i], conv_descs_[i], top_descs_[i], - bwd_data_algo_[i], - &workspace_bwd_data_sizes_[i])); + 1, + &returnedAlgoCount, + &perfBDResults)); + if (returnedAlgoCount < 1) { + LOG(FATAL) << returnedAlgoCount << " algorithms returned"; + } + bwd_data_algo_[i] = perfBDResults.algo; + workspace_bwd_data_sizes_[i] = perfBDResults.memory; } // reduce over all workspace sizes to get a maximum to allocate / reallocate diff --git a/src/caffe/util/bbox_util.cu b/src/caffe/util/bbox_util.cu index 3d7c8d7783b..798a4d3b128 100644 --- a/src/caffe/util/bbox_util.cu +++ b/src/caffe/util/bbox_util.cu @@ -3,8 +3,11 @@ #include #include -#include "thrust/functional.h" -#include "thrust/sort.h" +#if !defined(THRUST_IGNORE_CUB_VERSION_CHECK) +#define THRUST_IGNORE_CUB_VERSION_CHECK 1 +#endif +#include +#include #include "caffe/common.hpp" #include "caffe/util/bbox_util.hpp"