Skip to content

Commit

Permalink
0.17.4
Browse files Browse the repository at this point in the history
  • Loading branch information
drnikolaev committed Jan 9, 2021
1 parent 85e10c0 commit 4c7481c
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 97 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ include(cmake/Dependencies.cmake)

# ---[ Flags
if(UNIX OR APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++14")
endif()

caffe_set_caffe_link()
Expand Down Expand Up @@ -114,7 +114,7 @@ add_custom_target(lint COMMAND ${CMAKE_COMMAND} -P ${PROJECT_SOURCE_DIR}/cmake/l
# ---[ pytest target
if(BUILD_python)
add_custom_target(pytest COMMAND python${python_version} -m unittest discover -s caffe/test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/python )
add_dependencies(pytest pycaffe)
# add_dependencies(pytest pycaffe)
endif()

# ---[ Configuration summary
Expand Down
10 changes: 6 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,27 @@ endif
THIRDPARTY_DIR := ./3rdparty

# All of the directories containing code.
SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \
SRC_DIRS := $(shell find src python tools examples -type d -exec bash -c "find {} -maxdepth 1 \
\( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print 2>/dev/null)


# The target shared library name
LIBRARY_NAME := $(PROJECT)$(LIBRARY_NAME_SUFFIX)
LIB_BUILD_DIR := $(BUILD_DIR)/lib
STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
DYNAMIC_VERSION_MAJOR := 0
DYNAMIC_VERSION_MINOR := 17
DYNAMIC_VERSION_REVISION := 3
DYNAMIC_VERSION_REVISION := 4
DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR)
DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION)
DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT)
COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
# NVCaffe requires C++ 11
COMMON_FLAGS += -std=c++11
# NVCaffe requires C++ 14
COMMON_FLAGS += -std=c++14
COMMON_FLAGS += -DCUDA_NO_HALF


##############################
# Get all source files
##############################
Expand Down
6 changes: 3 additions & 3 deletions Makefile.config.example
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ BLAS_LIB := /opt/OpenBLAS/lib/
# $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \

# Uncomment to use Python 3 (default is Python 2)
PYTHON_LIBRARIES := boost_python3 python3.6m
PYTHON_INCLUDE := /usr/include/python3.6m \
/usr/lib/python3.6/dist-packages/numpy/core/include
PYTHON_LIBRARIES := boost_python38 python3.8
PYTHON_INCLUDE := /usr/include/python3.8 \
/usr/lib/python3.8/dist-packages/numpy/core/include

# We need to be able to find libpythonX.X.so or .dylib.
PYTHON_LIB := /usr/lib
Expand Down
2 changes: 1 addition & 1 deletion cmake/Cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ if(USE_CUDNN)
endif()

if(UNIX OR APPLE)
list(APPEND CUDA_NVCC_FLAGS -std=c++11;-Xcompiler;-fPIC)
list(APPEND CUDA_NVCC_FLAGS -std=c++14;-Xcompiler;-fPIC)
endif()

if(APPLE)
Expand Down
2 changes: 1 addition & 1 deletion include/caffe/macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#endif

#if BOOST_VERSION >= 106100
#if (BOOST_VERSION >= 106100) && !defined(BOOST_CUDA_VERSION)
// error: class "boost::common_type<long, long>" has no member "type"
#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) && defined(__CUDACC_VER_BUILD__)
Expand Down
3 changes: 3 additions & 0 deletions include/caffe/util/io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,14 @@ void FloatCVMatToBuf(const cv::Mat& cv_img, size_t buf_len, Dtype* buf, bool rep
LOG(FATAL) << "Image depth is not supported";
}
} else {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wclass-memaccess"
if (cv_img.depth() == CV_32F && tp<Dtype>() == FLOAT) {
std::memcpy(buf, cv_img.ptr<float>(0), img_size * sizeof(float)); // NOLINT(caffe/alt_fn)
} else if (cv_img.depth() == CV_64F && tp<Dtype>() == DOUBLE) {
std::memcpy(buf, cv_img.ptr<double>(0), img_size * sizeof(double)); // NOLINT(caffe/alt_fn)
} else {
#pragma GCC diagnostic pop
if (cv_img.depth() == CV_8U) {
for (size_t i = 0UL; i < img_size; ++i) {
buf[i] = static_cast<Dtype>(cv_img.ptr<unsigned char>(0)[i]);
Expand Down
81 changes: 66 additions & 15 deletions src/caffe/layers/cudnn_conv_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,29 +469,80 @@ template <typename Ftype, typename Btype>
void CuDNNConvolutionLayer<Ftype, Btype>::GetConvAlgo(const vector<Blob*>& bottom,
const vector<Blob*>& top, const size_t workspace_bytes, int pad_h, int pad_w,
int stride_h, int stride_w) {
const size_t limit_per_group = align_down<8>(workspace_bytes / ws_groups());
int returnedAlgoCount = 0;
for (int i = 0; i < bottom.size(); ++i) {
// Get backward data algorithm (if not set by user)
if (user_algos_override_[1] < 0) {
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0),
bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
align_down<8>(workspace_bytes / ws_groups()), &bwd_data_algo_[i]));
}
// Get forward algorithm (if not set by user)
if (user_algos_override_[0] < 0) {
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(0),
int count = 0;
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(Caffe::cudnn_handle(0), &count));
std::vector<cudnnConvolutionFwdAlgoPerf_t> perfResults;
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(Caffe::cudnn_handle(0),
fwd_bottom_descs_[i], fwd_filter_desc_, fwd_conv_descs_[i], fwd_top_descs_[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
align_down<8>(workspace_bytes / ws_groups()), &fwd_algo_[i]));
CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0)));
count, &returnedAlgoCount, &perfResults.front()));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bool found = false;
for (int a = 0; a < returnedAlgoCount; ++a) {
if (perfResults[a].memory <= limit_per_group) {
fwd_algo_[i] = perfResults[a].algo;
found = true;
break;
}
}
if (!found) {
LOG(FATAL) << "Can't find forward algorithm with memory limit "
<< limit_per_group << " bytes per group";
}
}
// Get backward data algorithm (if not set by user)
if (user_algos_override_[1] < 0) {
int count = 0;
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(Caffe::cudnn_handle(0), &count));
std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perfResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(Caffe::cudnn_handle(0),
bwd_filter_desc_, bwd_top_descs_[i], bwd_conv_data_descs_[i], bwd_bottom_descs_[i],
count, &returnedAlgoCount, &perfResults.front()));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bool found = false;
for (int a = 0; a < returnedAlgoCount; ++a) {
if (perfResults[a].memory <= limit_per_group) {
bwd_data_algo_[i] = perfResults[a].algo;
found = true;
break;
}
}
if (!found) {
LOG(FATAL) << "Can't find backward data algorithm with memory limit "
<< limit_per_group << " bytes per group";
}
}
// Get backward filter algorithm (if not set by user)
if (user_algos_override_[2] < 0) {
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0),
int count = 0;
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(Caffe::cudnn_handle(0), &count));
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perfResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(Caffe::cudnn_handle(0),
bwd_bottom_descs_[i], bwd_top_descs_[i], bwd_conv_filter_descs_[i], bwd_filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
align_down<8>(workspace_bytes / ws_groups()), &bwd_filter_algo_[i]));
CUDA_CHECK(cudaStreamSynchronize(Caffe::thread_stream(0)));
count, &returnedAlgoCount, &perfResults.front()));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bool found = false;
for (int a = 0; a < returnedAlgoCount; ++a) {
if (perfResults[a].memory <= limit_per_group) {
bwd_filter_algo_[i] = perfResults[a].algo;
found = true;
break;
}
}
if (!found) {
LOG(FATAL) << "Can't find backward filter algorithm with memory limit "
<< limit_per_group << " bytes per group";
}
}
LOG(INFO) << Phase_Name(this->phase_)
<< " Conv Algos by Get* (F,BD,BF) for layer '" << this->name()
Expand Down
100 changes: 31 additions & 69 deletions src/caffe/layers/cudnn_deconv_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ void CuDNNDeconvolutionLayer<Ftype, Btype>::Reshape(
const int stride_h = stride_data[0];
const int stride_w = stride_data[1];

// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory management
size_t workspace_limit_bytes = 8*1024*1024;

for (int i = 0; i < bottom.size(); i++) {
cudnn::setTensor4dDesc<Btype>(&bottom_descs_[i],
this->num_,
Expand All @@ -135,96 +131,62 @@ void CuDNNDeconvolutionLayer<Ftype, Btype>::Reshape(
1);
cudnn::setConvolutionDesc(forward_math_,
conv_descs_[i],
// top_descs_[i],
// filter_desc_,
pad_h,
pad_w,
stride_h,
stride_w, 1, 1);

int returnedAlgoCount = 0;
cudnnConvolutionFwdAlgoPerf_t perfFResults;
// choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
handle_[0],
top_descs_[i],
filter_desc_,
conv_descs_[i],
bottom_descs_[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&fwd_algo_[i]));

// We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
// buggy. Thus, if this algo was chosen, choose winograd instead. If
// winograd is not supported or workspace is larger than threshold, choose
// implicit_gemm instead.
// if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
// size_t winograd_workspace_size;
// cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
// handle_[0],
// top_descs_[i],
// filter_desc_,
// conv_descs_[i],
// bottom_descs_[i],
// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
// &winograd_workspace_size);
// if (status != CUDNN_STATUS_SUCCESS ||
// winograd_workspace_size >= workspace_limit_bytes) {
// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// } else {
// fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
// }
// }

CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
handle_[0],
top_descs_[i],
filter_desc_,
conv_descs_[i],
bottom_descs_[i],
fwd_algo_[i],
&(workspace_fwd_sizes_[i])));
1,
&returnedAlgoCount,
&perfFResults));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
fwd_algo_[i] = perfFResults.algo;
workspace_fwd_sizes_[i] = perfFResults.memory;

// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
cudnnConvolutionBwdFilterAlgoPerf_t perfBFResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
handle_[0],
top_descs_[i],
bottom_descs_[i],
conv_descs_[i],
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&bwd_filter_algo_[i]));

// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle_[0],
top_descs_[i],
bottom_descs_[i],
conv_descs_[i],
filter_desc_,
bwd_filter_algo_[i],
&workspace_bwd_filter_sizes_[i]));
1,
&returnedAlgoCount,
&perfBFResults));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bwd_filter_algo_[i] = perfBFResults.algo;
workspace_bwd_filter_sizes_[i] = perfBFResults.memory;

// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
handle_[0],
filter_desc_,
bottom_descs_[i],
conv_descs_[i],
top_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&bwd_data_algo_[i]));

// get workspace size
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnConvolutionBwdDataAlgoPerf_t perfBDResults;
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
handle_[0],
filter_desc_,
bottom_descs_[i],
conv_descs_[i],
top_descs_[i],
bwd_data_algo_[i],
&workspace_bwd_data_sizes_[i]));
1,
&returnedAlgoCount,
&perfBDResults));
if (returnedAlgoCount < 1) {
LOG(FATAL) << returnedAlgoCount << " algorithms returned";
}
bwd_data_algo_[i] = perfBDResults.algo;
workspace_bwd_data_sizes_[i] = perfBDResults.memory;
}

// reduce over all workspace sizes to get a maximum to allocate / reallocate
Expand Down
7 changes: 5 additions & 2 deletions src/caffe/util/bbox_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
#include <map>
#include <vector>

#include "thrust/functional.h"
#include "thrust/sort.h"
#if !defined(THRUST_IGNORE_CUB_VERSION_CHECK)
#define THRUST_IGNORE_CUB_VERSION_CHECK 1
#endif
#include <thrust/functional.h>
#include <thrust/sort.h>

#include "caffe/common.hpp"
#include "caffe/util/bbox_util.hpp"
Expand Down

0 comments on commit 4c7481c

Please sign in to comment.