diff --git a/backends/gcu/CMakeLists.txt b/backends/gcu/CMakeLists.txt
index 7f1b17ee3..bc3edafc7 100644
--- a/backends/gcu/CMakeLists.txt
+++ b/backends/gcu/CMakeLists.txt
@@ -40,6 +40,7 @@ include(external/topscc)
 include_directories(${CMAKE_SOURCE_DIR})
 include_directories(/opt/tops/include)
+include_directories(${PADDLE_INC_DIR}/build)
 
 option(WITH_KERNELS "compile with custom kernels" ON)
 option(WITH_TESTING "compile with unit testing" OFF)
diff --git a/backends/gcu/custom_engine/custom_engine_interface.cc b/backends/gcu/custom_engine/custom_engine_interface.cc
index 199366164..0971c7f26 100644
--- a/backends/gcu/custom_engine/custom_engine_interface.cc
+++ b/backends/gcu/custom_engine/custom_engine_interface.cc
@@ -15,8 +15,13 @@
 #include "custom_engine/custom_engine_interface.h"
 #include "custom_engine/custom_engine_op.h"
+#include "custom_engine/gcu_engine.h"
+#include "custom_engine/gcu_engine_compiler.h"
+#include "paddle/fluid/framework/new_executor/instruction/custom_engine_instruction.h"
+#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
+#include "paddle/fluid/pir/utils/pd_op_to_kernel_utils.h"
 #include "paddle/pir/include/core/builder.h"
 #include "paddle/pir/include/core/builtin_attribute.h"
 #include "paddle/pir/include/core/builtin_dialect.h"
@@ -27,28 +32,308 @@
 #include "paddle/pir/include/core/op_trait.h"
 #include "paddle/pir/include/core/operation_utils.h"
 
-namespace paddle {
-namespace dialect {
+namespace {
+using DenseTensorType = pir::DenseTensorType;
+using AllocatedDenseTensorType = paddle::dialect::AllocatedDenseTensorType;
+using SelectedRowsType = paddle::dialect::SelectedRowsType;
+using AllocatedSelectedRowsType = paddle::dialect::AllocatedSelectedRowsType;
+using DenseTensorArrayType = paddle::dialect::DenseTensorArrayType;
+using AllocatedDenseTensorArrayType =
+    paddle::dialect::AllocatedDenseTensorArrayType;
+using SparseCooTensorType = paddle::dialect::SparseCooTensorType;
+using SparseCsrTensorType = paddle::dialect::SparseCsrTensorType;
 
-void RegisterCustomEngineOp() {
-  pir::IrContext *ctx = pir::IrContext::Instance();
+template <typename IrType, typename IrType2>
+static pir::Type CreateType(pir::Type type,
+                            const phi::Place& place,
+                            pir::Type out_dtype,
+                            pir::IrContext* ctx) {
+  auto input_type = type.dyn_cast<IrType>();
+  return IrType2::get(ctx,
+                      place,
+                      out_dtype,
+                      input_type.dims(),
+                      input_type.data_layout(),
+                      input_type.lod(),
+                      input_type.offset());
+}
+
+static pir::Type BuildOutputType(pir::Type type,
+                                 const phi::Place& place,
+                                 pir::IrContext* ctx) {
+  if (type.isa<DenseTensorType>()) {
+    auto out_dtype = type.dyn_cast<DenseTensorType>().dtype();
+    return CreateType<DenseTensorType, AllocatedDenseTensorType>(
+        type, place, out_dtype, ctx);
+  } else if (type.isa<SelectedRowsType>()) {
+    auto out_dtype = type.dyn_cast<SelectedRowsType>().dtype();
+    return CreateType<SelectedRowsType, AllocatedSelectedRowsType>(
+        type, place, out_dtype, ctx);
+  } else if (type.isa<DenseTensorArrayType>()) {
+    auto array_type = type.dyn_cast<DenseTensorArrayType>();
+    return AllocatedDenseTensorArrayType::get(ctx,
+                                              place,
+                                              array_type.dtype(),
+                                              array_type.dims(),
+                                              array_type.data_layout());
+  } else {
+    PADDLE_THROW(common::errors::Unimplemented(
+        "BuildOutputType only support DenseTensorType, SelectedRowsType, "
+        "and DenseTensorArrayType"));
+  }
+}
+
+void PushBackOutputTypes(pir::IrContext* ctx,
+                         pir::Operation* op_item,
+                         const pir::Type& origin_type,
+                         const phi::Place& out_place,
+                         const phi::KernelKey& kernel_key,
+                         std::vector<pir::Type>* op_output_types) {
+  auto result_type = origin_type;
+  if (!result_type) {
+    op_output_types->push_back(result_type);
+  } else if (result_type.isa<DenseTensorType>() ||
+             result_type.isa<SelectedRowsType>() ||
+             result_type.isa<DenseTensorArrayType>() ||
+             result_type.isa<SparseCooTensorType>() ||
+             result_type.isa<SparseCsrTensorType>()) {
+    op_output_types->push_back(BuildOutputType(result_type, out_place, ctx));
+  } else if (result_type.isa<pir::VectorType>()) {
+    std::vector<pir::Type> vec_inner_types;
+    auto base_types = result_type.dyn_cast<pir::VectorType>().data();
+    for (auto& base_type : base_types) {
+      if (base_type) {
+        if (base_type.isa<DenseTensorType>() ||
+            base_type.isa<SelectedRowsType>()) {
+          vec_inner_types.push_back(
+              BuildOutputType(base_type, out_place, ctx));
+        } else {
+          PADDLE_THROW(common::errors::Unimplemented(
+              "only support dense tensor and selected rows in vector type "
+              "for now"));
+        }
+      } else {
+        // NOTE(phlrain): the kernel does not support a nullptr in output
+        pir::Type fp32_dtype = pir::Float32Type::get(ctx);
+        phi::DDim dims = {};
+        phi::DataLayout data_layout = phi::DataLayout::NCHW;
+        phi::LegacyLoD lod = {{}};
+        size_t offset = 0;
+        auto dense_tensor_dtype = DenseTensorType::get(
+            ctx, fp32_dtype, dims, data_layout, lod, offset);
+        auto allocated_dense_tensor_dtype =
+            AllocatedDenseTensorType::get(ctx, out_place, dense_tensor_dtype);
+        vec_inner_types.push_back(allocated_dense_tensor_dtype);
+      }
+    }
+
+    pir::Type t1 = pir::VectorType::get(ctx, vec_inner_types);
+    op_output_types->push_back(t1);
+  } else {
+    PADDLE_THROW(common::errors::Unimplemented(
+        "Result type only support DenseTensorType, SelectedRowsType, "
+        "SparseCooTensorType, SparseCsrTensorType and "
+        "VectorType"));
+  }
+}
+}  // namespace
+
+C_Status RegisterCustomEngineOp() {
+  pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<pir::BuiltinDialect>();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-  pir::Dialect *custom_engine_dialect =
+  pir::Dialect* custom_engine_dialect =
       ctx->GetOrRegisterDialect<paddle::dialect::CustomEngineDialect>();
   PADDLE_ENFORCE_NOT_NULL(custom_engine_dialect,
                           "Failed to register CustomEngineDialect.");
   ctx->RegisterOpInfo(custom_engine_dialect,
-                      pir::TypeId::get<paddle::dialect::CustomEngineOp>(),
-                      paddle::dialect::CustomEngineOp::name(),
-                      paddle::dialect::CustomEngineOp::interface_set(),
-                      paddle::dialect::CustomEngineOp::GetTraitSet(),
-                      paddle::dialect::CustomEngineOp::attributes_num,
-                      paddle::dialect::CustomEngineOp::attributes_name,
-                      paddle::dialect::CustomEngineOp::VerifySigInvariants,
-                      paddle::dialect::CustomEngineOp::VerifyRegionInvariants);
+                      pir::TypeId::get<custom_engine::CustomEngineOp>(),
+                      custom_engine::CustomEngineOp::name(),
+                      custom_engine::CustomEngineOp::interface_set(),
+                      custom_engine::CustomEngineOp::GetTraitSet(),
+                      custom_engine::CustomEngineOp::attributes_num,
+                      custom_engine::CustomEngineOp::attributes_name,
+                      custom_engine::CustomEngineOp::VerifySigInvariants,
+                      custom_engine::CustomEngineOp::VerifyRegionInvariants);
   VLOG(3) << "Register CustomEngineOp successfully.";
+  return C_SUCCESS;
+}
+
+C_Status CustomEngineOpLower(C_CustomEngineLowerParams* lower_param) {
+  VLOG(3) << "Enter CustomEngineOpLower.";
+  // get lower params
+  pir::IrContext* ctx =
+      reinterpret_cast<pir::IrContext*>(lower_param->ir_context);
+  pir::Operation* op_item =
+      reinterpret_cast<pir::Operation*>(lower_param->operation);
+  phi::KernelKey* kernel_key =
+      reinterpret_cast<phi::KernelKey*>(lower_param->kernel_key);
+  phi::Place* place = reinterpret_cast<phi::Place*>(lower_param->place);
+  std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair =
+      reinterpret_cast<std::unordered_map<pir::Operation*, pir::Operation*>*>(
+          lower_param->map_op_pair);
+  std::unordered_map<pir::Value, pir::Value>* map_value_pair =
+      reinterpret_cast<std::unordered_map<pir::Value, pir::Value>*>(
+          lower_param->map_value_pair);
+  pir::Block* block = reinterpret_cast<pir::Block*>(lower_param->block);
+
+  // Prepare output types
+  std::vector<pir::Type> op_output_types;
+
+  for (size_t i = 0; i < op_item->num_results(); ++i) {
+    phi::Place out_place = phi::TransToPhiPlace(kernel_key->backend());
+    PushBackOutputTypes(ctx,
+                        op_item,
+                        op_item->result(i).type(),
+                        out_place,
+                        *kernel_key,
+                        &op_output_types);
+  }
+
+  // Prepare input
+  std::vector<pir::Value> vec_inputs;
+
+  for (size_t i = 0; i < op_item->num_operands(); ++i) {
+    auto cur_in = op_item->operand_source(i);
+    PADDLE_ENFORCE_EQ(
+        map_value_pair->count(cur_in),
+        true,
+        common::errors::PreconditionNotMet(
+            "[%d]'s input of [%s] op MUST in map pair", i, op_item->name()));
+
+    auto new_in = map_value_pair->at(cur_in);
+
+    vec_inputs.push_back(new_in);
+  }
+
+  // Prepare attr
+  std::unordered_map<std::string, pir::Attribute> op_attribute;
+  auto op_attr_map = op_item->attributes();
+  for (auto& map_item : op_attr_map) {
+    op_attribute.emplace(map_item.first, map_item.second);
+  }
+  op_attribute["op_name"] = pir::StrAttribute::get(ctx, op_item->name());
+
+  pir::OpInfo custom_engine_op_info =
+      ctx->GetRegisteredOpInfo(custom_engine::CustomEngineOp::name());
+
+  pir::Operation* op = pir::Operation::Create(
+      vec_inputs, op_attribute, op_output_types, custom_engine_op_info, 1);
+  op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id()));
+  VLOG(3) << "CustomEngineOpLower create custom_engine_op";
+
+  VLOG(3) << "CustomEngineOpLower get op_item subgraph block.";
+  pir::Region& op_item_region = op_item->region(0);
+  PADDLE_ENFORCE_EQ(
+      op_item_region.empty(),
+      false,
+      ::common::errors::Unavailable(
+          "Required CustomEngineOp's region must not be empty."));
+  pir::Block* sub_graph_block = &(op_item_region.front());
+
+  VLOG(3) << "CustomEngineOpLower set new op subgraph block.";
+  pir::Region& region = op->region(0);
+  if (region.empty()) {
+    region.emplace_back();
+  }
+  pir::Block* op_block = &(region.front());
+
+  // process subgraph block
+  paddle::dialect::ProcessBlock(
+      *place, sub_graph_block, op_block, ctx, map_op_pair, map_value_pair);
+
+  if (VLOG_IS_ON(3)) {
+    std::stringstream ss;
+    ss << "CustomEngineOpLower new op:";
+    op->Print(ss);
+    VLOG(3) << ss.str();
+  }
+
+  (*map_op_pair)[op_item] = op;
+
+  // only deal with single output
+  if (op_item->num_results() > 0) {
+    for (size_t i = 0; i < op_item->num_results(); ++i) {
+      (*map_value_pair)[op_item->result(i)] = op->result(i);
+    }
+  }
+  block->push_back(op);
+  VLOG(3) << "CustomEngineOpLower successfully.";
+  return C_SUCCESS;
 }
 
-} // namespace dialect
-} // namespace paddle
+C_Status GraphEngineBuild(C_CustomEngineInstruction instruction) {
+  VLOG(3) << "Enter GraphEngineBuild.";
+  paddle::framework::CustomEngineInstruction* instruction_ =
+      reinterpret_cast<paddle::framework::CustomEngineInstruction*>(
+          instruction);
+  pir::Operation* op = instruction_->Operation();
+  const phi::KernelContext& kernel_context = instruction_->KernelContext();
+  phi::KernelContext kernel_ctx = kernel_context;
+  auto engine_inputs = instruction_->GetEngineInputs();
+  auto engine_outputs = instruction_->GetEngineOutputs();
+  auto engine_value_to_tensors = instruction_->GetEngineValueToTensors();
+  auto engine_value_to_var_names = instruction_->GetEngineValueToVarNames();
+
+  // NOTES: The memory is managed by CustomEngineInstruction, and we provide a
+  // release interface here.
+  custom_engine::GCUEngine* gcu_engine = new custom_engine::GCUEngine();
+  auto gcu_engine_deleter = [](void* ptr) {
+    custom_engine::GCUEngine* gcu_engine =
+        static_cast<custom_engine::GCUEngine*>(ptr);
+    if (gcu_engine != nullptr) {
+      delete gcu_engine;
+    } else {
+      PADDLE_THROW(phi::errors::PreconditionNotMet("gcu_engine is nullptr"));
+    }
+  };
+
+  std::string engine_key =
+      "GCUEngine_" +
+      std::to_string(reinterpret_cast<std::uintptr_t>(instruction));
+  custom_engine::GCUEngineCompiler gcu_compiler(kernel_ctx,
+                                                op,
+                                                engine_inputs,
+                                                engine_outputs,
+                                                engine_value_to_tensors,
+                                                engine_value_to_var_names,
+                                                engine_key);
+  gcu_compiler.Compile(gcu_engine);
+
+  instruction_->SetCustomEngine(reinterpret_cast<void*>(gcu_engine));
+  instruction_->SetCustomEngineDeleter(gcu_engine_deleter);
+  VLOG(3) << "GraphEngineBuild successfully.";
+
+  return C_SUCCESS;
+}
+
+C_Status GraphEngineExecute(C_CustomEngineInstruction instruction) {
+  VLOG(3) << "Enter GraphEngineExecute.";
+  paddle::framework::CustomEngineInstruction* instruction_ =
+      reinterpret_cast<paddle::framework::CustomEngineInstruction*>(
+          instruction);
+  custom_engine::GCUEngine* gcu_engine =
+      reinterpret_cast<custom_engine::GCUEngine*>(
+          instruction_->CustomEngine());
+  PADDLE_ENFORCE_NOT_NULL(gcu_engine, "GCUEngine is nullptr.");
+
+  auto* dev_ctx =
+      static_cast<phi::CustomContext*>(phi::DeviceContextPool::Instance().Get(
+          instruction_->DeviceContext().GetPlace()));
+
+  gcu_engine->Run(*dev_ctx);
+  VLOG(3) << "GraphEngineExecute successfully.";
+  return C_SUCCESS;
+}
+
+void InitPluginCustomEngine(CustomEngineParams* params) {
+  memset(reinterpret_cast<void*>(params->interface),
+         0,
+         sizeof(C_CustomEngineInterface));
+
+  params->interface->register_custom_engine_op = RegisterCustomEngineOp;
+  params->interface->graph_engine_build = GraphEngineBuild;
+  params->interface->graph_engine_execute = GraphEngineExecute;
+  params->interface->custom_engine_op_lower = CustomEngineOpLower;
+}
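For orientation, the callbacks wired above are consumed by the framework side roughly as sketched below. This is an illustration inferred from the assignments in InitPluginCustomEngine; the authoritative declarations live in paddle/fluid/custom_engine/custom_engine_ext.h.

// Hypothetical call pattern (framework side); field names mirror those
// assigned in InitPluginCustomEngine above.
C_CustomEngineInterface iface;
CustomEngineParams params;
params.interface = &iface;
InitPluginCustomEngine(&params);        // plugin fills the callback table
iface.register_custom_engine_op();      // once, when dialects are set up
// per fused subgraph during pd_op -> kernel lowering:
//   iface.custom_engine_op_lower(&lower_params);
// per instruction at build / run time:
//   iface.graph_engine_build(instruction);
//   iface.graph_engine_execute(instruction);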
diff --git a/backends/gcu/custom_engine/custom_engine_interface.h b/backends/gcu/custom_engine/custom_engine_interface.h
index 2229a8071..b0e1151d7 100644
--- a/backends/gcu/custom_engine/custom_engine_interface.h
+++ b/backends/gcu/custom_engine/custom_engine_interface.h
@@ -13,13 +13,20 @@
 // limitations under the License.
 
 #pragma once
+#include "paddle/fluid/custom_engine/custom_engine_ext.h"
 #include "paddle/phi/extension.h"
 
-namespace paddle {
-namespace dialect {
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-void InitPluginCustomEngine(void*);
-void RegisterCustomEngineOp();
+C_Status RegisterCustomEngineOp();
+C_Status CustomEngineOpLower(C_CustomEngineLowerParams* lower_param);
+C_Status GraphEngineBuild(C_CustomEngineInstruction instruction);
+C_Status GraphEngineExecute(C_CustomEngineInstruction instruction);
 
-} // namespace dialect
-} // namespace paddle
+void InitPluginCustomEngine(CustomEngineParams* params);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/backends/gcu/custom_engine/custom_engine_op.cc b/backends/gcu/custom_engine/custom_engine_op.cc
index 89dd934fe..ff762d821 100644
--- a/backends/gcu/custom_engine/custom_engine_op.cc
+++ b/backends/gcu/custom_engine/custom_engine_op.cc
@@ -16,20 +16,19 @@
 
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 
-namespace paddle {
-namespace dialect {
-
+namespace custom_engine {
 const char *CustomEngineOp::attributes_name[2] = {"input_names",
                                                   "output_names"};
 
 OpInfoTuple CustomEngineOp::GetOpInfo() {
-  std::vector<paddle::dialect::OpInputInfo> inputs = {
-      OpInputInfo("x",
-                  "pir::VectorType",
-                  false,
-                  false,
-                  false,
-                  false)};
+  std::vector<paddle::dialect::OpInputInfo> inputs = {
+      paddle::dialect::OpInputInfo(
+          "x",
+          "pir::VectorType",
+          false,
+          false,
+          false,
+          false)};
 
   std::vector<paddle::dialect::OpAttributeInfo> attributes = {
       paddle::dialect::OpAttributeInfo(
@@ -38,13 +37,14 @@ OpInfoTuple CustomEngineOp::GetOpInfo() {
           "output_names", "pir::ArrayAttribute", "")};
 
   std::vector<paddle::dialect::OpOutputInfo> outputs = {
-      OpOutputInfo("out",
-                   "pir::VectorType",
-                   false,
-                   false)};
+      paddle::dialect::OpOutputInfo(
+          "out",
+          "pir::VectorType",
+          false,
+          false)};
 
   paddle::dialect::OpRunTimeInfo run_time_info =
-      OpRunTimeInfo("", {""}, "", {""}, {}, {}, {}, {});
+      paddle::dialect::OpRunTimeInfo("", {}, "", {}, {}, {}, {}, {});
 
   return std::make_tuple(
       inputs, attributes, outputs, run_time_info, "gcu_engine_op");
@@ -98,7 +98,7 @@ void CustomEngineOp::Build(pir::Builder &builder,  // NOLINT
     } else {
       out_types.emplace_back(pir::DenseTensorType::get(
           pir::IrContext::Instance(),
-          TransToIrDataType(outputs_dtype[i]),
+          paddle::dialect::TransToIrDataType(outputs_dtype[i]),
           phi::DDim(outputs_shape[i].data(), outputs_shape[i].size()),
           phi::DataLayout::kNCHW,
           phi::LoD(),
@@ -110,6 +110,35 @@ void CustomEngineOp::Build(pir::Builder &builder,  // NOLINT
 
   argument_outputs.emplace_back(out_vector_type);
   argument.AddOutputs(argument_outputs.begin(), argument_outputs.end());
+  argument.AddRegion(nullptr);
+  pir::PassStopGradientsDefaultly(argument);
+}
+
+void CustomEngineOp::Build(pir::Builder &builder,             // NOLINT
+                           pir::OperationArgument &argument,  // NOLINT
+                           pir::Value x,
+                           const std::vector<std::string> &input_names,
+                           const std::vector<std::string> &output_names,
+                           const std::vector<pir::Type> &outputs_type) {
+  VLOG(3) << "Start building CustomEngineOp";
+
+  VLOG(3) << "Builder construction inputs";
+  std::vector<pir::Value> argument_inputs = {x};
+  argument.AddInputs(argument_inputs);
+
+  VLOG(3) << "Builder construction attributes";
+
+  ADD_VEC_ATTRIBUTE(pir::StrAttribute, input_names);
+  ADD_VEC_ATTRIBUTE(pir::StrAttribute, output_names);
+
+  VLOG(3) << "Builder construction outputs";
+  pir::Type out_vector_type =
+      pir::VectorType::get(pir::IrContext::Instance(), outputs_type);
+  argument.AddOutput(out_vector_type);
+  argument.AddRegion(nullptr);
   pir::PassStopGradientsDefaultly(argument);
 }
 
@@ -165,7 +194,22 @@ void CustomEngineOp::VerifySig() {
   VLOG(3) << "End Verifying for: CustomEngineOp.";
 }
 
-} // namespace dialect
-} // namespace paddle
+pir::Block *CustomEngineOp::block() {
+  pir::Region &region = (*this)->region(0);
+  if (region.empty()) region.emplace_back();
+  return &region.front();
+}
+
+pir::Block *CustomEngineOp::block() const {
+  pir::Region &region = (*this)->region(0);
+  PADDLE_ENFORCE_EQ(
+      region.empty(),
+      false,
+      ::common::errors::Unavailable(
+          "Required CustomEngineOp's region must not be empty."));
+  return &region.front();
+}
+
+}  // namespace custom_engine
 
-IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomEngineOp)
+IR_DEFINE_EXPLICIT_TYPE_ID(custom_engine::CustomEngineOp)
diff --git a/backends/gcu/custom_engine/custom_engine_op.h b/backends/gcu/custom_engine/custom_engine_op.h
index 0b2dfbb6b..0034e20b5 100644
--- a/backends/gcu/custom_engine/custom_engine_op.h
+++ b/backends/gcu/custom_engine/custom_engine_op.h
@@ -48,9 +48,7 @@
   }                                                                  \
 }  // namespace pir
 
-namespace paddle {
-namespace dialect {
-
+namespace custom_engine {
 class CustomEngineOp
     : public pir::Op<CustomEngineOp, paddle::dialect::OpYamlInfoInterface> {
  public:
@@ -68,13 +66,22 @@ class CustomEngineOp
                     std::vector<std::vector<int64_t>> outputs_shape,
                     std::vector<phi::DataType> outputs_dtype);
 
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value x,
+                    const std::vector<std::string> &input_names,
+                    const std::vector<std::string> &output_names,
+                    const std::vector<pir::Type> &outputs_type);
+
   void VerifySig();
 
+  pir::Block *block();
+  pir::Block *block() const;
+
   pir::Value x() { return operand_source(0); }
   pir::Value out() { return result(0); }
 };
 
-} // namespace dialect
-} // namespace paddle
+}  // namespace custom_engine
 
-IR_DECLARE_EXPLICIT_PLUGIN_TYPE_ID(paddle::dialect::CustomEngineOp)
+IR_DECLARE_EXPLICIT_PLUGIN_TYPE_ID(custom_engine::CustomEngineOp)
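The new Build overload takes precomputed pir::Type outputs, which is what the lowering path needs. A minimal construction sketch; the builder, block, and types here are illustrative, not part of this patch:

// Hypothetical usage of the second Build overload.
pir::IrContext *ctx = pir::IrContext::Instance();
pir::Builder builder(ctx, block);  // 'block' is an existing pir::Block*
std::vector<pir::Type> out_types = {/* e.g. pir::DenseTensorType::get(...) */};
auto engine_op = builder.Build<custom_engine::CustomEngineOp>(
    x,                                 // pir::Value, typically a builtin.combine result
    std::vector<std::string>{"x0"},    // input_names
    std::vector<std::string>{"out0"},  // output_names
    out_types);                        // one entry per engine output
pir::Value out = engine_op.out();     // vector-typed result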
+ +#include "custom_engine/gcu_engine.h" + +namespace custom_engine { + +GCUEngine::GCUEngine(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor) + : engine_key_(engine_key), + tops_exec_(tops_exec), + tensor_args_(tensor_args), + return_tensor_(return_tensor) { + PADDLE_ENFORCE_NOT_NULL( + tops_exec_, + phi::errors::InvalidArgument("Expect executable is not null.")); +} + +void GCUEngine::Init(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor) { + engine_key_ = engine_key; + tops_exec_ = tops_exec; + tensor_args_ = tensor_args; + return_tensor_ = return_tensor; + executor_ = std::make_shared( + tops_exec, tensor_args, return_tensor); +} + +void GCUEngine::Run(const phi::CustomContext &dev_ctx) { + VLOG(3) << "=== GCUEngine Run ==="; + executor_->Run(dev_ctx); +} + +} // namespace custom_engine diff --git a/backends/gcu/custom_engine/gcu_engine.h b/backends/gcu/custom_engine/gcu_engine.h new file mode 100644 index 000000000..29751f46c --- /dev/null +++ b/backends/gcu/custom_engine/gcu_engine.h @@ -0,0 +1,59 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "custom_engine/gcu_engine_executor.h" +#include "custom_engine/ir_translator/utils/utils.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace custom_engine { + +class GCUEngine { + public: + GCUEngine() = default; + GCUEngine(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor); + ~GCUEngine() { + if (tops_exec_ != nullptr) { + RT_CHECK(topsDestroyExecutable(tops_exec_)); + tops_exec_ = nullptr; + VLOG(3) << "Release topsExecutable and destory GCUEngine " << engine_key_; + } + } + + void Init(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor); + + void Run(const phi::CustomContext &dev_ctx); + + private: + std::string engine_key_; + topsExecutable_t tops_exec_ = nullptr; + std::vector tensor_args_; + std::vector return_tensor_; + std::shared_ptr executor_ = nullptr; +}; + +} // namespace custom_engine diff --git a/backends/gcu/custom_engine/gcu_engine_compiler.cc b/backends/gcu/custom_engine/gcu_engine_compiler.cc new file mode 100644 index 000000000..450cd8b1d --- /dev/null +++ b/backends/gcu/custom_engine/gcu_engine_compiler.cc @@ -0,0 +1,372 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/backends/gcu/custom_engine/gcu_engine_compiler.cc b/backends/gcu/custom_engine/gcu_engine_compiler.cc
new file mode 100644
index 000000000..450cd8b1d
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_compiler.cc
@@ -0,0 +1,372 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "custom_engine/gcu_engine_compiler.h"
+
+#include "custom_engine/ir_translator/translator_registry.h"
+#include "paddle/common/flags.h"
+#include "paddle/pir/include/core/builtin_attribute.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
+
+COMMON_DECLARE_bool(print_ir);
+
+namespace custom_engine {
+class GCUEngineCompiler::GCUEngineCompilerImpl {
+ public:
+  GCUEngineCompilerImpl(
+      const phi::KernelContext& kernel_context,
+      pir::Operation* op,
+      const std::vector<pir::Value>& engine_inputs,
+      const std::vector<pir::Value>& engine_outputs,
+      const std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>&
+          engine_value_to_tensors,
+      const std::unordered_map<pir::Value, std::vector<std::string>>&
+          engine_value_to_var_names,
+      const std::string& engine_key)
+      : kernel_context_(kernel_context),
+        op_(op),
+        engine_inputs_(engine_inputs),
+        engine_outputs_(engine_outputs),
+        engine_value_to_tensors_(engine_value_to_tensors),
+        engine_value_to_var_names_(engine_value_to_var_names),
+        engine_key_(engine_key) {
+    Init();
+  }
+
+  ~GCUEngineCompilerImpl() {}
+
+  void Init();
+
+  void Compile(GCUEngine* gcu_engine);
+
+ private:
+  void CreateInputs();
+  void MapInnerOutputValues(const pir::Operation* yield_op);
+  void SetGraphOutputs();
+  void ConvertGraph();
+
+  phi::KernelContext kernel_context_;
+  pir::Operation* op_;  // Not owned
+
+  std::string engine_key_;
+
+  std::vector<pir::Value> engine_inputs_;
+  std::vector<pir::Value> engine_outputs_;
+  std::vector<pir::Value> engine_inner_outputs_;
+  std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>
+      engine_value_to_tensors_;
+  std::unordered_map<pir::Value, std::vector<std::string>>
+      engine_value_to_var_names_;
+
+  pir::Block* block_;
+  std::vector<phi::DenseTensor*> inputs_;
+  std::vector<phi::DenseTensor*> outputs_;
+
+  // for GCU graph
+  GcuBuilderPtr builder_ = nullptr;
+  std::unordered_map<phi::DenseTensor*, GcuOpPtr> gcu_op_cache_;
+};
+
+GCUEngineCompiler::GCUEngineCompiler(
+    const phi::KernelContext& kernel_context,
+    pir::Operation* op,
+    const std::vector<pir::Value>& engine_inputs,
+    const std::vector<pir::Value>& engine_outputs,
+    const std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>&
+        engine_value_to_tensors,
+    const std::unordered_map<pir::Value, std::vector<std::string>>&
+        engine_value_to_var_names,
+    const std::string& engine_key) {
+  impl_ = std::make_shared<GCUEngineCompilerImpl>(kernel_context,
+                                                  op,
+                                                  engine_inputs,
+                                                  engine_outputs,
+                                                  engine_value_to_tensors,
+                                                  engine_value_to_var_names,
+                                                  engine_key);
+}
+
+void GCUEngineCompiler::Compile(GCUEngine* gcu_engine) {
+  impl_->Compile(gcu_engine);
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::Init() {
+  pir::Region& region = op_->region(0);
+  PADDLE_ENFORCE_EQ(
+      region.empty(),
+      false,
+      ::common::errors::Unavailable(
+          "Required CustomEngineOp's region must not be empty."));
+  block_ = &(region.front());
+
+  for (size_t i = 0; i < engine_inputs_.size(); ++i) {
+    PADDLE_ENFORCE_GT(engine_value_to_tensors_.count(engine_inputs_[i]),
+                      0,
+                      common::errors::PreconditionNotMet(
+                          "Input[%zu] is not in value map", i));
+    inputs_.emplace_back(engine_value_to_tensors_.at(engine_inputs_[i]).at(0));
+  }
+
+  for (size_t i = 0; i < engine_outputs_.size(); ++i) {
+    PADDLE_ENFORCE_GT(engine_value_to_tensors_.count(engine_outputs_[i]),
+                      0,
+                      common::errors::PreconditionNotMet(
+                          "Output[%zu] is not in value map", i));
+    outputs_.emplace_back(
+        engine_value_to_tensors_.at(engine_outputs_[i]).at(0));
+  }
+
+  builder_ = std::make_shared<GcuBuilder>();
+  PADDLE_ENFORCE_NOT_NULL(
+      builder_, "Failed to create gcu builder for %s.", engine_key_.c_str());
+  builder_->SetShapeInference(true);
+
+  VLOG(3) << "GCUEngineCompiler Init successfully.";
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::Compile(GCUEngine* gcu_engine) {
+  PADDLE_ENFORCE_NOT_NULL(gcu_engine,
+                          "The return GCUEngine memory is not allocated.");
+  VLOG(3) << "Compile for " << engine_key_;
+  ConvertGraph();
+
+  auto hlir_module = builder_->GetModule();
+  VLOG(3) << "Compiler begin to CompileHLIR for " << engine_key_;
+  topsExecutable_t tops_executable =
+      custom_engine::CompileTopsExecutable(hlir_module);
+  VLOG(3) << "Compiler CompileHLIR end for " << engine_key_;
+  gcu_engine->Init(engine_key_, tops_executable, inputs_, outputs_);
+  VLOG(3) << "Generate GCUEngine for " << engine_key_;
+  return;
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::CreateInputs() {
+  for (size_t i = 0; i < engine_inputs_.size(); ++i) {
+    auto tensor = engine_value_to_tensors_.at(engine_inputs_[i]).at(0);
+
+    auto ptype = custom_engine::ConvertFromPhiDataType(tensor->dtype());
+    std::vector<int64_t> dims = common::vectorize(tensor->dims());
+    builder::Type input_type(dims, ptype);
+    gcu_op_cache_[tensor] =
+        std::make_shared<GcuOp>(builder_->CreateInput(input_type));
+    VLOG(6) << "Create gcu builder input[" << i
+            << "]: " << engine_value_to_var_names_.at(engine_inputs_[i]).at(0);
+  }
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::MapInnerOutputValues(
+    const pir::Operation* yield_op) {
+  size_t input_num = yield_op->num_operands();
+  VLOG(6) << "MapInnerOutputValues for yield op:" << yield_op->name()
+          << ", input num:" << input_num;
+  PADDLE_ENFORCE_EQ(input_num,
+                    engine_outputs_.size(),
+                    common::errors::PreconditionNotMet(
+                        "Output num check failed, expect:%zu, but get:%zu",
+                        engine_outputs_.size(),
+                        input_num));
+  for (size_t i = 0; i < input_num; ++i) {
+    auto value = yield_op->operand_source(i);
+    PADDLE_ENFORCE_GT(
+        engine_value_to_tensors_.count(value),
+        0,
+        common::errors::PreconditionNotMet(
+            "Input[%zu] value of yield is not in engine_value_to_tensors_",
+            i));
+    PADDLE_ENFORCE_GT(
+        engine_value_to_var_names_.count(value),
+        0,
+        common::errors::PreconditionNotMet(
+            "Input[%zu] value of yield is not in engine_value_to_var_names_",
+            i));
+
+    engine_inner_outputs_.emplace_back(value);
+  }
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::SetGraphOutputs() {
+  std::vector<GcuOp> graph_outputs;
+  for (size_t i = 0; i < engine_inner_outputs_.size(); ++i) {
+    auto tensors = engine_value_to_tensors_.at(engine_inner_outputs_[i]);
+    PADDLE_ENFORCE_EQ(tensors.size(),
+                      1,
+                      common::errors::PreconditionNotMet(
+                          "Only support one tensor now, but get %zu, "
+                          "output_index:%zu",
+                          tensors.size(),
+                          i));
+
+    auto tensor = tensors.at(0);
+    auto inner_value_name =
+        engine_value_to_var_names_.at(engine_inner_outputs_[i]).at(0);
+    auto external_value_name =
+        engine_value_to_var_names_.at(engine_outputs_[i]).at(0);
+    PADDLE_ENFORCE_GT(
+        gcu_op_cache_.count(tensor),
+        0,
+        common::errors::PreconditionNotMet(
+            "Output[%zu] is not generated in gcu_op map, value name:%s",
+            i,
+            inner_value_name.c_str()));
+    graph_outputs.emplace_back(*(gcu_op_cache_.at(tensor)));
+
+    // set output shapes
+    auto gcu_shape = gcu_op_cache_.at(tensor)->GetType().GetShape();
+    tensor->Resize(common::make_ddim(gcu_shape));
+    outputs_[i]->Resize(common::make_ddim(gcu_shape));
+    VLOG(6) << "Found gcu builder output[" << i << "]: " << inner_value_name
+            << ", external var name:" << external_value_name
+            << ", dims:" << tensor->dims();
+  }
+  builder_->SetOutput(graph_outputs);
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::ConvertGraph() {
+  VLOG(3) << "ConvertGraph for " << engine_key_;
+  if (FLAGS_print_ir) {
+    std::cout << "IR Before conversion = " << *block_ << std::endl;
+  }
+
+  VLOG(3) << "Create inputs node for " << engine_key_;
+  CreateInputs();
+  // builder_->Dump();
+
+  VLOG(3) << "Convert calc ops for " << engine_key_;
+  // NOTES: Consider the subgraph to be topologically sorted.
+  std::list<pir::Operation*> graph_ops = block_->ops();
+  for (const auto* op : graph_ops) {
+    if (op->isa<pir::YieldOp>()) {
+      MapInnerOutputValues(op);
+      continue;
+    }
+    std::string op_name = op->name();
+    auto op_attributes = op->attributes();
+    if (op->HasAttribute("op_name")) {
+      op_name = op->attribute<pir::StrAttribute>("op_name").AsString();
+    }
+
+    OpTranslateFunc convert_func =
+        TranslatorRegistry::Instance().Get(OpTranslateFuncKey(op_name));
+    PADDLE_ENFORCE_NOT_NULL(convert_func);
+
+    // inputs
+    std::vector<std::vector<GcuOpPtr>> input_gcu_ops;
+    size_t input_num = op->num_operands();
+    VLOG(6) << "Get input_gcu_ops for " << op_name << ", num:" << input_num;
+    for (size_t i = 0; i < input_num; ++i) {
+      auto value = op->operand_source(i);
+      PADDLE_ENFORCE_GT(
+          engine_value_to_tensors_.count(value),
+          0,
+          common::errors::PreconditionNotMet(
+              "Input[%zu] value is not in engine_value_to_tensors_", i));
+      PADDLE_ENFORCE_GT(
+          engine_value_to_var_names_.count(value),
+          0,
+          common::errors::PreconditionNotMet(
+              "Input[%zu] value is not in engine_value_to_var_names_", i));
+
+      std::vector<GcuOpPtr> gcu_ops;
+      auto tensors = engine_value_to_tensors_.at(value);
+      auto var_names = engine_value_to_var_names_.at(value);
+
+      for (size_t n = 0; n < tensors.size(); ++n) {
+        PADDLE_ENFORCE_GT(
+            gcu_op_cache_.count(tensors[n]),
+            0,
+            common::errors::PreconditionNotMet(
+                "Input[%zu][%zu] is not generated in gcu_op map, name: %s",
+                i,
+                n,
+                var_names.at(n)));
+        gcu_ops.emplace_back(gcu_op_cache_.at(tensors[n]));
+        VLOG(6) << "op_name:" << op_name << ", inputs[" << i << "][" << n
+                << "], var name:" << var_names.at(n);
+      }
+      input_gcu_ops.emplace_back(gcu_ops);
+    }
+
+    // convert
+    VLOG(6) << "Start to convert for " << op_name;
+    GcuOpPtr gcu_op = convert_func(builder_, op, input_gcu_ops);
+    VLOG(6) << "End of conversion for " << op_name;
+
+    bool is_tuple_out = gcu_op->GetType().IsTuple();
+    if (is_tuple_out) {
+      size_t gcu_output_num = gcu_op->GetType().GetTupleSize();
+      size_t output_num = op->num_results();
+      PADDLE_ENFORCE_EQ(
+          gcu_output_num,
+          output_num,
+          common::errors::PreconditionNotMet("Output num check failed, op: %s",
+                                             op_name.c_str()));
+
+      for (size_t i = 0; i < output_num; ++i) {
+        auto out_value = op->result(i);
+        auto tensors = engine_value_to_tensors_.at(out_value);
+        PADDLE_ENFORCE_EQ(tensors.size(),
+                          1,
+                          common::errors::PreconditionNotMet(
+                              "Only support one tensor now, but get %zu, op: "
+                              "%s, output_index:%zu",
+                              tensors.size(),
+                              op_name.c_str(),
+                              i));
+
+        auto tensor = tensors.at(0);
+        gcu_op_cache_[tensor] =
+            std::make_shared<GcuOp>(builder::GetTupleElement(*gcu_op, i));
+        VLOG(6) << "Output GetTupleElement for " << op_name
+                << ", output index:" << i
+                << ", name:" << engine_value_to_var_names_.at(out_value).at(0);
+      }
+    } else {
+      if (op->num_results() == 1) {
+        auto out_value = op->result(0);
+        auto tensors = engine_value_to_tensors_.at(out_value);
+        PADDLE_ENFORCE_EQ(tensors.size(),
+                          1,
+                          common::errors::PreconditionNotMet(
+                              "Output num should be one, but get %zu, op: %s",
+                              tensors.size(),
+                              op_name.c_str()));
+        gcu_op_cache_[tensors.at(0)] = gcu_op;
+        VLOG(6) << "Output set for " << op_name
+                << ", name:" << engine_value_to_var_names_.at(out_value).at(0);
+      } else {
+        VLOG(6) << "Op " << op_name << " does not have any output value.";
+      }
+    }
+  }  // end of for (const auto* op : graph_ops)
+
+  // outputs
+  SetGraphOutputs();
+  if (FLAGS_print_ir) {
+    std::cout << "IR After conversion = " << std::endl;
+    builder_->Dump();
+  }
+}
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/gcu_engine_compiler.h b/backends/gcu/custom_engine/gcu_engine_compiler.h
new file mode 100644
index 000000000..35a9a0438
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_compiler.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include <vector>
+
+#include "custom_engine/gcu_engine.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+#include "paddle/phi/core/kernel_context.h"
+#include "paddle/pir/include/core/operation.h"
+
+namespace custom_engine {
+
+class GCUEngineCompiler {
+ public:
+  GCUEngineCompiler(
+      const phi::KernelContext& kernel_context,
+      pir::Operation* op,
+      const std::vector<pir::Value>& engine_inputs,
+      const std::vector<pir::Value>& engine_outputs,
+      const std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>&
+          engine_value_to_tensors,
+      const std::unordered_map<pir::Value, std::vector<std::string>>&
+          engine_value_to_var_names,
+      const std::string& engine_key = "GCUEngineCompiler_default");
+  ~GCUEngineCompiler() {}
+
+  // NOTES: Compile() fills a GCUEngine allocated by the caller; the caller
+  // owns and releases that memory.
+  void Compile(GCUEngine* gcu_engine);
+
+ private:
+  class GCUEngineCompilerImpl;
+  std::shared_ptr<GCUEngineCompilerImpl> impl_ = nullptr;
+};
+
+}  // namespace custom_engine
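ConvertGraph assumes the region holds a single, topologically sorted block whose terminator is cf.yield. Roughly, the expected subgraph shape (illustrative PIR, not an actual dump):

// ^block inside the CustomEngineOp region:
//   %0 = "pd_op.matmul"(%arg0, %arg1)  // each op dispatches to its registered translator
//   %1 = "pd_op.add"(%0, %arg2)
//   "cf.yield"(%1)                     // yield operands become engine_inner_outputs_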
diff --git a/backends/gcu/custom_engine/gcu_engine_executor.cc b/backends/gcu/custom_engine/gcu_engine_executor.cc
new file mode 100644
index 000000000..9ede59fd5
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_executor.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "custom_engine/gcu_engine_executor.h"
+
+namespace custom_engine {
+void GCUEngineExecutor::Init() {
+  tensor_args_device_.resize(tensor_args_.size());
+}
+
+void GCUEngineExecutor::Run(const phi::CustomContext &dev_ctx) {
+  VLOG(3) << "=== GCUEngineExecutor Run ===";
+  std::vector<void *> dev_inputs;
+  dev_inputs.reserve(tensor_args_.size());
+  std::vector<void *> dev_outputs;
+  dev_outputs.resize(return_tensor_.size());
+
+  for (size_t i = 0; i < tensor_args_.size(); ++i) {
+    auto input = tensor_args_[i];
+    PADDLE_ENFORCE_NE(
+        input, nullptr, phi::errors::InvalidArgument("inputs is null"));
+
+    if (input->initialized()) {
+      phi::DenseTensor *tensor = &(tensor_args_device_[i]);
+      if (input->place().GetType() != phi::AllocationType::CUSTOM) {
+        custom_kernel::TensorCopy(dev_ctx, *input, false, tensor);
+      } else {
+        *tensor = *input;
+      }
+      auto device_tensor = tensor_args_device_[i];
+      dev_inputs.emplace_back(device_tensor.data());
+      VLOG(6) << "GCUEngineExecutor::Run, Inputs[" << i
+              << "] addr:" << device_tensor.data() << ", capacity is "
+              << device_tensor.capacity() << ", type:" << device_tensor.dtype()
+              << ", place:" << device_tensor.place()
+              << ", ddim:" << device_tensor.dims().to_str();
+    } else {
+      VLOG(6) << "GCUEngineExecutor::Run, inputs[" << i
+              << "] is not initialized.";
+    }
+  }
+
+  for (size_t i = 0; i < return_tensor_.size(); ++i) {
+    auto *tensor = return_tensor_[i];
+    PADDLE_ENFORCE_NE(
+        tensor, nullptr, phi::errors::InvalidArgument("outputs is null"));
+    dev_ctx.Alloc(tensor, tensor->dtype());
+    dev_outputs[i] = tensor->data();
+
+    VLOG(6) << "GCUEngineExecutor::Run, outputs[" << i
+            << "] addr:" << tensor->data() << ", capacity is "
+            << tensor->capacity() << ", type:" << tensor->dtype()
+            << ", place:" << tensor->place()
+            << ", ddim:" << tensor->dims().to_str();
+  }
+
+  auto tops_stream = static_cast<topsStream_t>(dev_ctx.stream());
+  VLOG(6) << "GCUEngineExecutor Run on stream:" << tops_stream
+          << ", tops_exec_:" << tops_exec_;
+
+  static double total_time_cost = 0;
+  static int32_t exec_count = 0;
+  auto start_time = custom_kernel::GetCurrentTimestap();
+
+  RT_CHECK(topsLaunchExecutable(tops_exec_,
+                                nullptr,
+                                dev_inputs.data(),
+                                dev_inputs.size(),
+                                nullptr,
+                                nullptr,
+                                dev_outputs.data(),
+                                dev_outputs.size(),
+                                nullptr,
+                                nullptr,
+                                tops_stream));
+
+  if (VLOG_IS_ON(6)) {
+    auto time_cost = custom_kernel::GetTimeCostInMs(
+        start_time, custom_kernel::GetCurrentTimestap());
+    total_time_cost += time_cost;
+
+    VLOG(6) << "exec_count: " << ++exec_count << ", time_cost: " << time_cost
+            << ", total_time_cost: " << total_time_cost;
+  }
+  return;
+}
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/gcu_engine_executor.h b/backends/gcu/custom_engine/gcu_engine_executor.h
new file mode 100644
index 000000000..ddadba01a
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_executor.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include <vector>
+
+#include "custom_engine/ir_translator/utils/utils.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace custom_engine {
+
+class GCUEngineExecutor {
+ public:
+  GCUEngineExecutor(topsExecutable_t tops_exec,
+                    const std::vector<phi::DenseTensor *> &tensor_args,
+                    const std::vector<phi::DenseTensor *> &return_tensor)
+      : tops_exec_(tops_exec),
+        tensor_args_(tensor_args),
+        return_tensor_(return_tensor) {
+    Init();
+  }
+  ~GCUEngineExecutor() {}
+
+  void Init();
+
+  void Run(const phi::CustomContext &dev_ctx);
+
+ private:
+  topsExecutable_t tops_exec_ = nullptr;  // Not owned
+  std::vector<phi::DenseTensor *> tensor_args_;
+  std::vector<phi::DenseTensor *> return_tensor_;
+
+  std::vector<phi::DenseTensor> tensor_args_device_;
+};
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/ir_translator/operators/activation_ops.cc b/backends/gcu/custom_engine/ir_translator/operators/activation_ops.cc
new file mode 100644
index 000000000..80267a109
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/activation_ops.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateAbs(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  auto input = *(gcu_op_inputs[0][0]);
+  return std::make_shared<GcuOp>(builder::Abs(input));
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_abs, custom_engine::TranslateAbs)
diff --git a/backends/gcu/custom_engine/ir_translator/operators/binary_ops.cc b/backends/gcu/custom_engine/ir_translator/operators/binary_ops.cc
new file mode 100644
index 000000000..275b7e68a
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/binary_ops.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+#define DEFINE_BINARY_TRANS_FUNC(func)                                       \
+  static GcuOpPtr TranslateBinaryOps##func(                                  \
+      GcuBuilderPtr gcu_builder,                                             \
+      const pir::Operation *op,                                              \
+      const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {             \
+    PADDLE_ENFORCE_EQ(gcu_op_inputs.size(),                                  \
+                      2,                                                     \
+                      common::errors::PreconditionNotMet(                    \
+                          "Input op num check failed, op: %s, num:%zu",      \
+                          std::string(#func).c_str(),                        \
+                          gcu_op_inputs.size()));                            \
+    auto lhs = *(gcu_op_inputs[0][0]);                                       \
+    auto rhs = *(gcu_op_inputs[1][0]);                                       \
+    return std::make_shared<GcuOp>(builder::func(lhs, rhs));                 \
+  }
+
+DEFINE_BINARY_TRANS_FUNC(Add)
+DEFINE_BINARY_TRANS_FUNC(Sub)
+DEFINE_BINARY_TRANS_FUNC(Mul)
+DEFINE_BINARY_TRANS_FUNC(Div)
+DEFINE_BINARY_TRANS_FUNC(Greater)
+DEFINE_BINARY_TRANS_FUNC(GreaterEqual)
+DEFINE_BINARY_TRANS_FUNC(Less)
+DEFINE_BINARY_TRANS_FUNC(LessEqual)
+
+#undef DEFINE_BINARY_TRANS_FUNC
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_add, custom_engine::TranslateBinaryOpsAdd)
+REGISTER_OP_TRANSLATOR(pd_op_add_, custom_engine::TranslateBinaryOpsAdd)
+REGISTER_OP_TRANSLATOR(pd_op_subtract, custom_engine::TranslateBinaryOpsSub)
+REGISTER_OP_TRANSLATOR(pd_op_multiply, custom_engine::TranslateBinaryOpsMul)
+REGISTER_OP_TRANSLATOR(pd_op_divide, custom_engine::TranslateBinaryOpsDiv)
+REGISTER_OP_TRANSLATOR(pd_op_greater_than,
+                       custom_engine::TranslateBinaryOpsGreater)
+REGISTER_OP_TRANSLATOR(pd_op_greater_equal,
+                       custom_engine::TranslateBinaryOpsGreaterEqual)
+REGISTER_OP_TRANSLATOR(pd_op_less_than, custom_engine::TranslateBinaryOpsLess)
+REGISTER_OP_TRANSLATOR(pd_op_less_equal,
+                       custom_engine::TranslateBinaryOpsLessEqual)
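For reference, DEFINE_BINARY_TRANS_FUNC(Add) expands to roughly the following; the in-place variant pd_op_add_ reuses the same translator since the GCU graph is functional:

// Expansion sketch of DEFINE_BINARY_TRANS_FUNC(Add):
static GcuOpPtr TranslateBinaryOpsAdd(
    GcuBuilderPtr gcu_builder,
    const pir::Operation *op,
    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
  // input-count check elided
  auto lhs = *(gcu_op_inputs[0][0]);
  auto rhs = *(gcu_op_inputs[1][0]);
  return std::make_shared<GcuOp>(builder::Add(lhs, rhs));
}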
diff --git a/backends/gcu/custom_engine/ir_translator/operators/full.cc b/backends/gcu/custom_engine/ir_translator/operators/full.cc
new file mode 100644
index 000000000..a3f4b3528
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/full.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateFull(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  const auto &attrs = op->attributes();
+
+  auto shape_array =
+      attrs.at("shape").dyn_cast<pir::ArrayAttribute>().AsVector();
+  std::vector<int64_t> shape;
+  if (shape_array.size() > 0) {
+    PADDLE_ENFORCE_EQ(shape_array[0].isa<pir::Int64Attribute>(),
+                      true,
+                      common::errors::Unimplemented(
+                          "the 0th element MUST be pir::Int64Attribute"));
+    for (size_t i = 0; i < shape_array.size(); ++i) {
+      shape.emplace_back(
+          shape_array[i].dyn_cast<pir::Int64Attribute>().data());
+    }
+  }
+  double value = attrs.at("value").dyn_cast<pir::DoubleAttribute>().data();
+  phi::DataType dtype =
+      attrs.at("dtype").dyn_cast<paddle::dialect::DataTypeAttribute>().data();
+  auto ptype = custom_engine::ConvertFromPhiDataType(dtype);
+  auto result =
+      builder::Const(gcu_builder, value, builder::Type(shape, ptype));
+  return std::make_shared<GcuOp>(result);
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_full, custom_engine::TranslateFull)
+REGISTER_OP_TRANSLATOR(pd_op_full_, custom_engine::TranslateFull)
diff --git a/backends/gcu/custom_engine/ir_translator/operators/matmul.cc b/backends/gcu/custom_engine/ir_translator/operators/matmul.cc
new file mode 100644
index 000000000..016f0c6d0
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/matmul.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateMatmul(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  PADDLE_ENFORCE_EQ(
+      gcu_op_inputs.size(),
+      2,
+      common::errors::PreconditionNotMet(
+          "Input op num check failed, get num:%zu", gcu_op_inputs.size()));
+  auto X = *(gcu_op_inputs[0][0]);
+  auto Y = *(gcu_op_inputs[1][0]);
+
+  auto x_shape = X.GetType().GetShape();
+  auto y_shape = Y.GetType().GetShape();
+
+  bool trans_x =
+      op->attribute("transpose_x").dyn_cast<pir::BoolAttribute>().data();
+  bool trans_y =
+      op->attribute("transpose_y").dyn_cast<pir::BoolAttribute>().data();
+
+  int64_t x_rank = x_shape.size();
+  int64_t y_rank = y_shape.size();
+  int64_t max_rank = std::max(x_rank, y_rank);
+  int64_t rank_diff = std::abs(x_rank - y_rank);
+  auto ptype = X.GetType().GetPrimitiveType();
+  int64_t batch_dim;
+
+  if (x_rank > y_rank) {
+    if (trans_x || y_rank == 1) {
+      std::vector<int64_t> broadcast_dims;
+      std::vector<int64_t> bc_shape;
+      if (y_rank == 1) {
+        for (int64_t i = 0; i < rank_diff - 1; i++) {
+          bc_shape.emplace_back(x_shape[i]);
+        }
+        bc_shape.emplace_back(y_shape[0]);
+        bc_shape.emplace_back(1);
+        broadcast_dims.emplace_back(rank_diff - 1);
+      } else {
+        for (int64_t i = 0; i < rank_diff; i++) {
+          bc_shape.emplace_back(x_shape[i]);
+        }
+        for (int64_t i = 0; i < y_rank; i++) {
+          bc_shape.emplace_back(y_shape[i]);
+        }
+        int iter = 0;
+        for (int64_t i = 0; i < x_rank; ++i) {
+          if (i < rank_diff) {
+            ++iter;
+          } else {
+            broadcast_dims.emplace_back(i);
+          }
+        }
+      }
+      builder::Type type(bc_shape, ptype);
+      Y = builder::BroadcastInDim(Y, broadcast_dims, type);
+    }
+    if (y_rank == 1) {
+      batch_dim = rank_diff - 1;
+    } else {
+      batch_dim = rank_diff;
+    }
+
+  } else if (x_rank < y_rank) {
+    std::vector<int64_t> broadcast_dims;
+    std::vector<int64_t> bc_shape;
+    if (x_rank == 1) {
+      for (int64_t i = 0; i < rank_diff - 1; i++) {
+        bc_shape.emplace_back(y_shape[i]);
+      }
+      bc_shape.emplace_back(1);
+      bc_shape.emplace_back(x_shape[0]);
+      broadcast_dims.emplace_back(rank_diff);
+    } else {
+      for (int64_t i = 0; i < rank_diff; i++) {
+        bc_shape.emplace_back(y_shape[i]);
+      }
+      for (int64_t i = 0; i < x_rank; i++) {
+        bc_shape.emplace_back(x_shape[i]);
+      }
+      int iter = 0;
+      for (int64_t i = 0; i < y_rank; ++i) {
+        if (i < rank_diff) {
+          ++iter;
+        } else {
+          broadcast_dims.emplace_back(i);
+        }
+      }
+    }
+    builder::Type type(bc_shape, ptype);
+    X = builder::BroadcastInDim(X, broadcast_dims, type);
+    if (x_rank == 1) {
+      batch_dim = rank_diff - 1;
+    } else {
+      batch_dim = rank_diff;
+    }
+
+  } else {
+    batch_dim = max_rank - 2;
+    if (x_rank == y_rank && x_rank > 3) {
+      auto x_brd_shape = x_shape;
+      auto y_brd_shape = y_shape;
+      std::vector<int64_t> x_brd_dims, y_brd_dims;
+      for (int64_t i = 0; i < x_rank - 2; ++i) {
+        x_brd_shape[i] = x_shape[i] > y_shape[i] ? x_shape[i] : y_shape[i];
+        y_brd_shape[i] = x_shape[i] > y_shape[i] ? x_shape[i] : y_shape[i];
+      }
+      x_brd_dims.resize(x_rank);
+      y_brd_dims.resize(y_rank);
+      std::iota(x_brd_dims.begin(), x_brd_dims.end(), 0);
+      std::iota(y_brd_dims.begin(), y_brd_dims.end(), 0);
+      if (x_brd_shape != x_shape) {
+        X = builder::BroadcastInDim(
+            X, x_brd_dims, builder::Type(x_brd_shape, ptype));
+      }
+      if (y_brd_shape != y_shape) {
+        Y = builder::BroadcastInDim(
+            Y, y_brd_dims, builder::Type(y_brd_shape, ptype));
+      }
+    }
+  }
+
+  builder::DotDimensionNumbers dims_attr;
+  std::vector<int64_t> lhs_batching_dimensions = {};
+  std::vector<int64_t> rhs_batching_dimensions = {};
+  std::vector<int64_t> lhs_contracting_dimensions = {};
+  std::vector<int64_t> rhs_contracting_dimensions = {};
+  if (x_rank == 1 && y_rank == 1) {
+    lhs_contracting_dimensions.emplace_back(0);
+    rhs_contracting_dimensions.emplace_back(0);
+  } else if (x_rank <= y_rank || trans_x || y_rank == 1) {
+    for (int64_t i = 0; i < max_rank - 1; ++i) {
+      if (i < batch_dim) {
+        lhs_batching_dimensions.emplace_back(i);
+        rhs_batching_dimensions.emplace_back(i);
+      } else {
+        if (trans_x && x_rank != 1) {
+          lhs_contracting_dimensions.emplace_back(i);
+        } else {
+          lhs_contracting_dimensions.emplace_back(i + 1);
+        }
+        if (trans_y && y_rank != 1) {
+          rhs_contracting_dimensions.emplace_back(i + 1);
+        } else {
+          rhs_contracting_dimensions.emplace_back(i);
+        }
+      }
+    }
+  } else {
+    lhs_contracting_dimensions.emplace_back(x_rank - 1);
+    if (y_rank != 1) {
+      if (trans_y) {
+        rhs_contracting_dimensions.emplace_back(y_rank - 1);
+      } else {
+        rhs_contracting_dimensions.emplace_back(y_rank - 2);
+      }
+    } else {
+      rhs_contracting_dimensions.emplace_back(0);
+    }
+  }
+
+  dims_attr.set_lhs_batching_dimensions(lhs_batching_dimensions);
+  dims_attr.set_rhs_batching_dimensions(rhs_batching_dimensions);
+  dims_attr.set_lhs_contracting_dimensions(lhs_contracting_dimensions);
+  dims_attr.set_rhs_contracting_dimensions(rhs_contracting_dimensions);
+  std::vector<const char *> precision_config = {};
+  auto dot = builder::DotGeneral(X, Y, dims_attr, precision_config);
+  dot.SetAttribute("op_type", builder::Attribute("DotInference"));
+  if (x_rank == 1 && y_rank == 1) {
+    auto type = dot.GetType().GetPrimitiveType();
+    std::vector<int64_t> new_shape;
+    new_shape.push_back(1);
+    builder::Type output_type(new_shape, type);
+    dot = builder::Reshape(dot, output_type);
+  } else if (y_rank == 1) {
+    auto shape = dot.GetType().GetShape();
+    auto type = dot.GetType().GetPrimitiveType();
+    std::vector<int64_t> new_shape;
+    for (size_t i = 0; i < shape.size() - 1; i++) {
+      new_shape.push_back(shape[i]);
+    }
+    builder::Type output_type(new_shape, type);
+    dot = builder::Reshape(dot, output_type);
+  } else if (x_rank == 1) {
+    auto shape = dot.GetType().GetShape();
+    auto type = dot.GetType().GetPrimitiveType();
+    std::vector<int64_t> new_shape;
+    for (size_t i = 0; i < shape.size(); i++) {
+      if (i != shape.size() - 2) {
+        new_shape.push_back(shape[i]);
+      }
+    }
+    builder::Type output_type(new_shape, type);
+    dot = builder::Reshape(dot, output_type);
+  }
+
+  auto result = std::make_shared<GcuOp>(dot);
+
+  return result;
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_matmul, custom_engine::TranslateMatmul)
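A worked case for the dimension numbers above (no transposes): X is [B, M, K] (rank 3), Y is [K, N] (rank 2). Then rank_diff = 1, the broadcast branch is skipped, batch_dim = 1, and the final else branch yields:

//   lhs_contracting_dimensions = {2}   // K axis of X
//   rhs_contracting_dimensions = {0}   // K axis of Y
//   no batching dimensions
// so builder::DotGeneral computes [B, M, K] x [K, N] -> [B, M, N].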
diff --git a/backends/gcu/custom_engine/ir_translator/operators/scale.cc b/backends/gcu/custom_engine/ir_translator/operators/scale.cc
new file mode 100644
index 000000000..c5aa18bec
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/scale.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateScale(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  auto input = gcu_op_inputs[0][0];
+  const auto &attrs = op->attributes();
+  bool bias_after_scale =
+      attrs.at("bias_after_scale").dyn_cast<pir::BoolAttribute>().data();
+
+  builder::Op scale_op;
+  if (gcu_op_inputs.size() == 2) {  // with scale tensor
+    scale_op = *(gcu_op_inputs[1][0]);
+  } else {
+    float scale = attrs.at("scale").dyn_cast<::pir::FloatAttribute>().data();
+    scale_op = builder::FullLike(*input, scale);
+  }
+  float bias = attrs.at("bias").dyn_cast<pir::FloatAttribute>().data();
+  auto bias_op = builder::FullLike(*input, bias);
+  if (bias_after_scale) {
+    return std::make_shared<GcuOp>((*input) * scale_op + bias_op);
+  } else {
+    return std::make_shared<GcuOp>(((*input) + bias_op) * scale_op);
+  }
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_scale, custom_engine::TranslateScale)
+REGISTER_OP_TRANSLATOR(pd_op_scale_, custom_engine::TranslateScale)
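The translator reproduces the paddle scale semantics:

//   bias_after_scale == true:  out = x * scale + bias
//   bias_after_scale == false: out = (x + bias) * scale

Both scale and bias are materialized with builder::FullLike so the elementwise operator overloads on builder::Op apply directly.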
+ +#include + +#include "custom_engine/ir_translator/translator_registry.h" + +namespace custom_engine { + +static GcuOpPtr TranslateYield( + GcuBuilderPtr gcu_builder, + const pir::Operation *op, + const std::vector> &gcu_op_inputs) { + size_t output_num = gcu_op_inputs.size(); + if (output_num > 1) { + std::vector outputs; + for (size_t i = 0; i < output_num; ++i) { + outputs.emplace_back(*(gcu_op_inputs[i][0])); + } + builder::Op result = builder::Tuple(outputs); + return std::make_shared(result); + } else if (output_num == 1) { + return gcu_op_inputs[0][0]; + } else { + PADDLE_THROW(common::errors::PreconditionNotMet("Not support now.")); + } +} + +} // namespace custom_engine + +REGISTER_OP_TRANSLATOR(cf_yield, custom_engine::TranslateYield) diff --git a/backends/gcu/custom_engine/ir_translator/translator_registry.h b/backends/gcu/custom_engine/ir_translator/translator_registry.h new file mode 100644 index 000000000..d45e3c588 --- /dev/null +++ b/backends/gcu/custom_engine/ir_translator/translator_registry.h @@ -0,0 +1,131 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "custom_engine/ir_translator/utils/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/include/core/operation.h" + +namespace custom_engine { + +using OpTranslateFunc = std::function> &map_inputs)>; + +static inline std::string OpTranslateFuncKey(const std::string &op_name) { + std::regex pattern("\\."); + std::string result = std::regex_replace(op_name, pattern, "_"); + return result; +} + +class TranslatorRegistry { + public: + static TranslatorRegistry &Instance() { + static TranslatorRegistry g_op_translator_registry_instance; + return g_op_translator_registry_instance; + } + + bool Has(const std::string &op_name) const { + return translator_map_.find(op_name) != translator_map_.end(); + } + + void Insert(const std::string &op_name, + const OpTranslateFunc &op_trans_func) { + PADDLE_ENFORCE_NE( + Has(op_name), + true, + common::errors::InvalidArgument( + "OpTranslateFunc of %s has been registered.", op_name)); + translator_map_.insert({op_name, op_trans_func}); + std::cout << "TranslatorRegistry insert " << op_name << std::endl; + } + + OpTranslateFunc Get(const std::string &op_name) const { + PADDLE_ENFORCE_EQ( + Has(op_name), + true, + common::errors::InvalidArgument( + "OpTranslateFunc of %s has not been registered.", op_name)); + return translator_map_.at(op_name); + } + + private: + TranslatorRegistry() = default; + std::unordered_map translator_map_; + + TranslatorRegistry(const TranslatorRegistry &) = delete; + TranslatorRegistry(TranslatorRegistry &&) = delete; + TranslatorRegistry &operator=(const TranslatorRegistry &) = delete; + TranslatorRegistry &operator=(TranslatorRegistry &&) = delete; 
+};
+
+class OpTranslatorRegistrar {
+ public:
+  // Registration happens in the constructor of a global registrar variable.
+  // If the code that links against this library never references that
+  // variable, the linker may strip it from the generated binary, and the
+  // translator would silently never be registered. To avoid such removal, we
+  // add Touch to all registrar classes and make the USE_OP_TRANSLATOR macro
+  // call this method. So, as long as the calling code uses
+  // USE_OP_TRANSLATOR, the global registrar variable won't be removed by the
+  // linker.
+  void Touch() {}
+  OpTranslatorRegistrar(const char *op_name,
+                        const OpTranslateFunc &op_trans_func) {
+    TranslatorRegistry::Instance().Insert(op_name, op_trans_func);
+  }
+};
+
+#define STATIC_ASSERT_TRANSLATOR_GLOBAL_NAMESPACE(uniq_name, msg)             \
+  struct __test_translator_global_namespace_##uniq_name##__ {};               \
+  static_assert(                                                              \
+      std::is_same<::__test_translator_global_namespace_##uniq_name##__,      \
+                   __test_translator_global_namespace_##uniq_name##__>::value, \
+      msg)
+
+// Register a new op_trans_func that can be applied on the operator.
+#define REGISTER_OP_TRANSLATOR(op_name, op_trans_func)                \
+  STATIC_ASSERT_TRANSLATOR_GLOBAL_NAMESPACE(                          \
+      __reg_op_translator__##op_name,                                 \
+      "REGISTER_OP_TRANSLATOR must be called in global namespace");   \
+  static custom_engine::OpTranslatorRegistrar                         \
+      __op_translator_registrar_##op_name##__(#op_name, op_trans_func); \
+  int TouchOpTranslatorRegistrar_##op_name() {                        \
+    __op_translator_registrar_##op_name##__.Touch();                  \
+    return 0;                                                         \
+  }
+
+#define USE_OP_TRANSLATOR(op_name)                                  \
+  STATIC_ASSERT_TRANSLATOR_GLOBAL_NAMESPACE(                        \
+      __use_op_translator_itself_##op_name,                         \
+      "USE_OP_TRANSLATOR must be called in global namespace");      \
+  extern int TouchOpTranslatorRegistrar_##op_name();                \
+  static int use_op_translator_itself_##op_name##_ UNUSED =         \
+      TouchOpTranslatorRegistrar_##op_name()
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/ir_translator/utils/utils.cc b/backends/gcu/custom_engine/ir_translator/utils/utils.cc
new file mode 100644
index 000000000..fd45265e7
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/utils/utils.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
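For context on how the registry above is meant to be consumed, here is a minimal sketch (not part of this patch): a backend defines a translate function matching `OpTranslateFunc` and wires it up with the macros. The op name `pd_op_relu` and the HLIR op `builder::Relu` are illustrative assumptions; only the `OpTranslateFunc` signature, `REGISTER_OP_TRANSLATOR`, and `USE_OP_TRANSLATOR` come from this diff.

```cpp
// Hypothetical translator for a "pd_op.relu" op; builder::Relu is assumed.
#include "custom_engine/ir_translator/translator_registry.h"

namespace custom_engine {

static GcuOpPtr TranslateRelu(
    GcuBuilderPtr gcu_builder,
    const pir::Operation *op,
    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
  // Each inner vector holds the GCU ops produced for one pir operand.
  GcuOp input = *(gcu_op_inputs[0][0]);
  return std::make_shared<GcuOp>(builder::Relu(input));
}

}  // namespace custom_engine

// The macro argument must match OpTranslateFuncKey(op->name()),
// i.e. "pd_op.relu" becomes "pd_op_relu".
REGISTER_OP_TRANSLATOR(pd_op_relu, custom_engine::TranslateRelu)

// In a translation unit that must guarantee the registrar is linked in:
// USE_OP_TRANSLATOR(pd_op_relu);
```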
+
+#include "custom_engine/ir_translator/utils/utils.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "gcu/tops_graph_compiler/tops_graph_compiler.h"
+#include "gcu/tops_graph_compiler/tops_graph_compiler_option.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace custom_engine {
+namespace {
+std::vector<std::string> TargetOptionSplit(const std::string& s,
+                                           char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(s);
+  while (std::getline(tokenStream, token, delimiter)) {
+    std::size_t first_non_space = token.find_first_not_of(" \t\n\r");
+    std::size_t last_non_space = token.find_last_not_of(" \t\n\r");
+    if (first_non_space == std::string::npos ||
+        last_non_space == std::string::npos) {
+      continue;
+    }
+    // Trim surrounding whitespace before collecting the token.
+    token =
+        token.substr(first_non_space, last_non_space - first_non_space + 1);
+    if (!token.empty()) tokens.push_back(token);
+  }
+  return tokens;
+}
+}  // namespace
+
+GcuPrimitiveType ConvertFromPhiDataType(const phi::DataType& type) {
+  switch (type) {
+    case phi::DataType::BOOL:
+      return builder::PrimitiveType::PRED();
+    case phi::DataType::INT8:
+      return builder::PrimitiveType::S8();
+    case phi::DataType::INT16:
+      return builder::PrimitiveType::S16();
+    case phi::DataType::INT32:
+      return builder::PrimitiveType::S32();
+    case phi::DataType::INT64:
+      return builder::PrimitiveType::S64();
+    case phi::DataType::FLOAT16:
+      return builder::PrimitiveType::F16();
+    case phi::DataType::FLOAT32:
+      return builder::PrimitiveType::F32();
+    case phi::DataType::FLOAT64:
+      return builder::PrimitiveType::F64();
+    case phi::DataType::UINT8:
+      return builder::PrimitiveType::U8();
+    case phi::DataType::UINT16:
+      return builder::PrimitiveType::U16();
+    case phi::DataType::UINT32:
+      return builder::PrimitiveType::U32();
+    case phi::DataType::UINT64:
+      return builder::PrimitiveType::U64();
+
+    default:
+      return builder::PrimitiveType::NONE();
+  }
+}
+
+std::vector<std::string> GetTopsCompileOptions() {
+  std::vector<std::string> opts;
+
+  auto target_name = custom_kernel::GetTargetName();
+  // std::string hlir_options = "hlir-codegen-pipeline";
+  std::string hlir_options = "tops-hlir-pipeline";
+
+  // add target options
+  constexpr int options_len = 1024;
+  char target_options[options_len];  // NOLINT
+  TOPSGRAPH_CHECK(
+      topsgraphInitOptions(target_name.c_str(), target_options, options_len));
+
+  std::string target_opt_s = std::string(target_options);
+  char delimiter = '-';
+  auto target_opt_vec = TargetOptionSplit(target_opt_s, delimiter);
+  for (const auto& it : target_opt_vec) {
+    auto temp_opt = "-" + it;
+    opts.emplace_back(temp_opt);
+  }
+  opts.emplace_back(std::string("-hlir=") + hlir_options);
+  // opts.emplace_back(
+  //     std::string("-codegen=codegen-gcu-pipeline{enable-memory-reuse=true}"));
+  // opts.emplace_back(std::string("-output=codegen"));
+
+  if (VLOG_IS_ON(3)) {
+    std::stringstream ss;
+    ss << "compile options: ";
+    for (const auto& it : opts) {
+      ss << it << " ";
+    }
+    VLOG(3) << ss.str();
+  }
+
+  return opts;
+}
+
+topsExecutable_t CompileTopsExecutable(
+    const std::shared_ptr<hlir::Module>& module) {
+  std::vector<const char*> options;
+  auto compile_options = GetTopsCompileOptions();
+  for (auto& option : compile_options) {
+    options.push_back(option.c_str());
+  }
+
+  // create program and compile
+  topsgraphProgram program;
+  TOPSGRAPH_CHECK(topsgraphCreateProgramFromModule(&program, module.get()));
+  TOPSGRAPH_CHECK(
+      topsgraphCompileProgram(program, options.size(), options.data()));
+
+  // get binary size and binary data
+  uint64_t binary_size = 0;
+  TOPSGRAPH_CHECK(topsgraphGetBinSize(program, &binary_size));
+  std::unique_ptr<char[]> binary(new char[binary_size]);
+  TOPSGRAPH_CHECK(topsgraphGetBin(program, binary.get()));
+
+  // destroy the program once the binary has been extracted
+  topsgraphDestroyProgram(&program);
+
+  topsExecutable_t exe;
+  RT_CHECK(topsCreateExecutable(&exe, binary.get(), binary_size));
+
+  return exe;
+}
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/ir_translator/utils/utils.h b/backends/gcu/custom_engine/ir_translator/utils/utils.h
new file mode 100644
index 000000000..a3637812b
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/utils/utils.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/utils.h"
+#include "gcu/hlir_builder/hlir_builder.h"
+#include "paddle/phi/common/data_type.h"
+
+using GcuOp = ::builder::Op;
+using GcuOpPtr = std::shared_ptr<GcuOp>;
+using GcuPrimitiveType = builder::PrimitiveType;
+using GcuType = builder::Type;
+// using GcuShape = std::vector<int64_t>;
+using GcuBuilder = builder::Builder;
+using GcuBuilderPtr = std::shared_ptr<GcuBuilder>;
+using GcuGraphPtr = std::shared_ptr<hlir::Module>;
+// using GcuOpDescPtr = std::shared_ptr<GcuOpDesc>;
+
+namespace custom_engine {
+GcuPrimitiveType ConvertFromPhiDataType(const phi::DataType& type);
+
+std::vector<std::string> GetTopsCompileOptions();
+topsExecutable_t CompileTopsExecutable(
+    const std::shared_ptr<hlir::Module>& module);
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_op/test_for_custom_engine_op.cc b/backends/gcu/custom_op/test_for_custom_engine_op.cc
index e40e59bf9..be69ff3c1 100644
--- a/backends/gcu/custom_op/test_for_custom_engine_op.cc
+++ b/backends/gcu/custom_op/test_for_custom_engine_op.cc
@@ -54,7 +54,7 @@ void TestCustomEngineOp() {
       std::vector<pir::Value>{const_op1.result(0), const_op2.result(0)});
 
   pir::OpInfo custom_engine_op_info =
-      ctx->GetRegisteredOpInfo(paddle::dialect::CustomEngineOp::name());
+      ctx->GetRegisteredOpInfo(custom_engine::CustomEngineOp::name());
 
   std::vector<pir::Type> out_types;
   out_types.emplace_back(
@@ -90,7 +90,7 @@ void TestCustomEngineOp() {
 
   builder.Insert(op1);
 
-  auto op2 = builder.Build<paddle::dialect::CustomEngineOp>(
+  auto op2 = builder.Build<custom_engine::CustomEngineOp>(
       buildin_combine_op.result(0),
       std::vector<std::string>{"input_0", "input_1"},
      std::vector<std::string>{"output_0"},
@@ -110,7 +110,7 @@ void TestCustomEngineOp() {
 }
 
 void RunTestCustomEngineOp() {
-  paddle::dialect::RegisterCustomEngineOp();
+  (void)RegisterCustomEngineOp();
   TestCustomEngineOp();
 }
 }  // namespace
diff --git a/backends/gcu/paddle_gcu_export.map b/backends/gcu/paddle_gcu_export.map
index 1b410925a..975b9be19 100644
--- a/backends/gcu/paddle_gcu_export.map
+++ b/backends/gcu/paddle_gcu_export.map
@@ -57,5 +57,6 @@ PADDLE_GCU_1.0 {
     XcclGroupEnd;
     XcclSend;
     XcclRecv;
+    InitPluginCustomEngine;
   local:
     *;
 };
diff --git a/backends/gcu/passes/gcu_custom_passes.h b/backends/gcu/passes/gcu_custom_passes.h
index 3871a21dd..6609dff66 100644
--- a/backends/gcu/passes/gcu_custom_passes.h
+++ b/backends/gcu/passes/gcu_custom_passes.h
@@ -17,3 +17,6 @@
 #include "paddle/pir/include/pass/pass_registry.h"
 
 USE_PIR_PASS(addn_replace_pass);
+USE_PIR_PASS(gcu_op_marker_pass);
+USE_PIR_PASS(gcu_sub_graph_extract_pass);
+USE_PIR_PASS(gcu_replace_with_engine_op_pass);
diff --git a/backends/gcu/passes/gcu_op_marker_pass.cc b/backends/gcu/passes/gcu_op_marker_pass.cc
new file mode 100644
index 000000000..ac9cc034c
--- /dev/null
+++ b/backends/gcu/passes/gcu_op_marker_pass.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
+#include "paddle/pir/include/core/builtin_attribute.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+namespace {
+
+// inline auto kCanRunGcuAttr = paddle::dialect::kCanRunGcuAttr;
+inline const char kCanRunGcuAttr[] = "__l_gcu__";
+
+#define DEFINE_GENERAL_PATTERN(OpName, OpType)                            \
+  class OpName##OpPattern : public pir::OpRewritePattern<OpType> {        \
+   public:                                                                \
+    using pir::OpRewritePattern<OpType>::OpRewritePattern;                \
+    bool MatchAndRewrite(OpType op,                                       \
+                         pir::PatternRewriter &rewriter) const override { \
+      if (op->HasAttribute(kCanRunGcuAttr) &&                             \
+          op->attribute<pir::BoolAttribute>(kCanRunGcuAttr).data()) {     \
+        return false;                                                     \
+      }                                                                   \
+      op->set_attribute(kCanRunGcuAttr, rewriter.bool_attr(true));        \
+      return true;                                                        \
+    }                                                                     \
+  };
+
+DEFINE_GENERAL_PATTERN(Matmul, paddle::dialect::MatmulOp)
+DEFINE_GENERAL_PATTERN(Add, paddle::dialect::AddOp)
+DEFINE_GENERAL_PATTERN(Abs, paddle::dialect::AbsOp)
+DEFINE_GENERAL_PATTERN(Full, paddle::dialect::FullOp)
+DEFINE_GENERAL_PATTERN(Scale, paddle::dialect::ScaleOp)
+
+class GcuOpMarkerPass : public pir::PatternRewritePass {
+ public:
+  GcuOpMarkerPass() : pir::PatternRewritePass("gcu_op_marker_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+
+#define ADD_PATTERN(OpName) \
+  ps.Add(std::make_unique<OpName##OpPattern>(context));
+    ADD_PATTERN(Matmul)
+    ADD_PATTERN(Add)
+    ADD_PATTERN(Abs)
+    ADD_PATTERN(Full)
+    ADD_PATTERN(Scale)
+#undef ADD_PATTERN
+
+    return ps;
+  }
+};
+}  // namespace
+
+namespace pir {
+std::unique_ptr<Pass> CreateGcuOpMarkerPass() {
+  return std::make_unique<GcuOpMarkerPass>();
+}
+}  // namespace pir
+
+REGISTER_IR_PASS(gcu_op_marker_pass, GcuOpMarkerPass);
diff --git a/backends/gcu/passes/gcu_replace_with_engine_op_pass.cc b/backends/gcu/passes/gcu_replace_with_engine_op_pass.cc
new file mode 100644
index 000000000..3dafea263
--- /dev/null
+++ b/backends/gcu/passes/gcu_replace_with_engine_op_pass.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include <iostream>
+#include <list>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "custom_engine/custom_engine_op.h"
+#include "paddle/common/flags.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/pir/include/core/builder.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+COMMON_DECLARE_bool(print_ir);
+namespace {
+using OpListType = std::list<pir::Operation*>;
+
+std::vector<pir::Value> AnalysisOutputs(
+    const OpListType& group_ops) {  // NOLINT
+  // Get outputs via the use-def chain
+  std::unordered_set<pir::Operation*> op_set(group_ops.begin(),
+                                             group_ops.end());
+
+  std::vector<pir::Value> outputs;
+  for (auto* op : group_ops) {
+    for (size_t i = 0; i < op->num_results(); ++i) {
+      auto result = op->result(i);
+
+      for (auto use_iter = result.use_begin(); use_iter != result.use_end();
+           ++use_iter) {
+        if (!op_set.count(use_iter->owner())) {
+          outputs.push_back(result);
+          break;
+        }
+      }
+    }
+  }
+
+  // NOTE: If no value is used outside the group, we mark the last op's
+  // results as outputs. But keep in mind that this is risky.
+  if (outputs.size() == 0) {
+    for (size_t i = 0; i < group_ops.back()->num_results(); ++i) {
+      outputs.push_back(group_ops.back()->result(i));
+    }
+  }
+
+  return outputs;
+}
+
+std::vector<pir::Value> AnalysisInputs(const OpListType& group_ops) {  // NOLINT
+  std::unordered_set<pir::Value> visited_values;
+  std::vector<pir::Value> group_inputs;
+  std::unordered_set<pir::Operation*> ops_set(group_ops.begin(),
+                                              group_ops.end());
+
+  // count all op's input Value
+  for (auto* op : group_ops) {
+    for (auto& value : op->operands_source()) {
+      if (!value || !value.type() || ops_set.count(value.defining_op()))
+        continue;
+      if (visited_values.count(value)) continue;
+      // if the input value's owner op is not in the op set, it is one of the
+      // group's inputs
+      visited_values.insert(value);
+      group_inputs.push_back(value);
+    }
+  }
+  return group_inputs;
+}
+
+class ReplaceWithCustomEngineOpPattern
+    : public pir::OpRewritePattern<pir::GroupOp> {
+ public:
+  using pir::OpRewritePattern<pir::GroupOp>::OpRewritePattern;
+
+  bool MatchAndRewrite(
+      pir::GroupOp op,
+      pir::PatternRewriter& rewriter) const override {  // NOLINT
+    pir::Block* block = op.block();
+
+    if (FLAGS_print_ir) {
+      std::cout
+          << "ReplaceWithCustomEngineOpPattern MatchAndRewrite before IR = "
+          << *(op->GetParent()) << std::endl;
+    }
+
+    OpListType group_ops = block->ops();
+
+    const std::vector<pir::Value> inputs = AnalysisInputs(group_ops);
+    const std::vector<pir::Value> outputs = op->results();
+
+    // attrs
+    std::vector<std::string> input_names;
+    std::vector<std::string> output_names;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      std::string input_name = "graph_input_" + std::to_string(i) + "_op_" +
+                               std::to_string(inputs[i].defining_op()->id());
+      input_names.emplace_back(input_name);
+    }
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      std::string output_name = "graph_output_" + std::to_string(i) + "_op_" +
+                                std::to_string(outputs[i].defining_op()->id());
+      output_names.emplace_back(output_name);
+    }
+
+    std::vector<pir::Type> output_types;
+    for (auto& value : outputs) {
+      output_types.emplace_back(value.type());
+    }
+
+    auto buildin_combine_op = rewriter.Build<pir::CombineOp>(inputs);
+
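+    // The rewrite is a three-op sandwich around the matched GroupOp:
+    // CombineOp packs the group inputs into a single vector value, the
+    // CustomEngineOp consumes it (and later receives the moved inner ops),
+    // and a SplitOp unpacks the engine results for the original users.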
+    custom_engine::CustomEngineOp custom_engine_op =
+        rewriter.Build<custom_engine::CustomEngineOp>(
+            buildin_combine_op.out(), input_names, output_names, output_types);
+
+    auto out_split_op = rewriter.Build<pir::SplitOp>(custom_engine_op.out());
+    std::vector<pir::Value> new_outputs = out_split_op.outputs();
+
+    if (FLAGS_print_ir) {
+      std::cout << "custom_engine_op name: " << custom_engine_op.name()
+                << std::endl;
+      std::cout << "ReplaceWithCustomEngineOpPattern MatchAndRewrite mid IR = "
+                << *(op->GetParent()) << std::endl;
+    }
+
+    for (auto inner_op : group_ops) {
+      inner_op->MoveTo(custom_engine_op.block(),
+                       custom_engine_op.block()->end());
+    }
+    rewriter.ReplaceOp(op, new_outputs);
+
+    if (FLAGS_print_ir) {
+      std::cout
+          << "ReplaceWithCustomEngineOpPattern MatchAndRewrite after IR = "
+          << *(op->GetParent()) << std::endl;
+    }
+
+    return true;
+  }
+};
+
+class GcuReplaceWithCustomEngineOpPass : public pir::PatternRewritePass {
+ public:
+  GcuReplaceWithCustomEngineOpPass()
+      : pir::PatternRewritePass("gcu_replace_with_engine_op_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(std::make_unique<ReplaceWithCustomEngineOpPattern>(context));
+    return ps;
+  }
+};
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateGcuReplaceWithCustomEngineOpPass() {
+  return std::make_unique<GcuReplaceWithCustomEngineOpPass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(gcu_replace_with_engine_op_pass,
+                 GcuReplaceWithCustomEngineOpPass);
diff --git a/backends/gcu/passes/gcu_sub_graph_extract_pass.cc b/backends/gcu/passes/gcu_sub_graph_extract_pass.cc
new file mode 100644
index 000000000..26870a845
--- /dev/null
+++ b/backends/gcu/passes/gcu_sub_graph_extract_pass.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
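How the three passes compose: marking, clustering, then engine-op substitution. The sketch below is illustrative, not code from this patch; the `Create*` functions are defined in the pass files of this diff, while the `pir::PassManager` usage follows the upstream PIR API and is an assumption here.

```cpp
// Hedged sketch: chaining the GCU passes added by this patch.
#include "paddle/pir/include/core/program.h"
#include "paddle/pir/include/pass/pass_manager.h"

namespace pir {
// Defined in gcu_op_marker_pass.cc, gcu_sub_graph_extract_pass.cc and
// gcu_replace_with_engine_op_pass.cc; re-declared here for the sketch.
std::unique_ptr<Pass> CreateGcuOpMarkerPass();
std::unique_ptr<Pass> CreateGcuSubGraphExtractPass();
std::unique_ptr<Pass> CreateGcuReplaceWithCustomEngineOpPass();
}  // namespace pir

void ApplyGcuEnginePasses(pir::Program* program) {
  pir::PassManager pm(pir::IrContext::Instance());
  pm.AddPass(pir::CreateGcuOpMarkerPass());                   // 1. tag "__l_gcu__" ops
  pm.AddPass(pir::CreateGcuSubGraphExtractPass());            // 2. cluster into GroupOps
  pm.AddPass(pir::CreateGcuReplaceWithCustomEngineOpPass());  // 3. lower to CustomEngineOp
  pm.Run(program);
}
```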
+
+#include <memory>
+
+#include <iostream>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/common/flags.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/pir/include/core/builder.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+// #include "passes/sub_graph_detector.h"
+#include "paddle/fluid/pir/utils/sub_graph_detector.h"
+
+// COMMON_DECLARE_int32(gcu_min_group_size);
+PHI_DEFINE_EXPORTED_int32(
+    gcu_min_group_size,
+    1,
+    "when the gcu subgraph size is not larger than `gcu_min_group_size`, the "
+    "group will fall back to the original graph.");
+
+COMMON_DECLARE_bool(print_ir);
+
+namespace {
+using GroupOpsVec = std::vector<pir::Operation*>;
+inline const char kCanRunGcuAttr[] = "__l_gcu__";
+
+bool IsSupportedByGCU(const pir::Operation& op) {
+  if (op.HasAttribute(kCanRunGcuAttr) &&
+      op.attribute<pir::BoolAttribute>(kCanRunGcuAttr).data()) {
+    return true;
+  }
+  return false;
+}
+
+class GcuSubGraphExtractPass : public pir::Pass {
+ public:
+  GcuSubGraphExtractPass() : pir::Pass("gcu_sub_graph_extract_pass", 2) {}
+
+  void Run(pir::Operation* op) override {
+    auto module_op = op->dyn_cast<pir::ModuleOp>();
+    PADDLE_ENFORCE_NOT_NULL(
+        module_op,
+        common::errors::InvalidArgument(
+            "sub_graph_extract_pass should run on a module op."));
+    auto& block = module_op.block();
+
+    if (FLAGS_print_ir) {
+      std::cout << "GcuSubGraphExtractPass before IR = " << block << std::endl;
+    }
+
+    std::vector<GroupOpsVec> groups =
+        pir::DetectSubGraphs(&block, IsSupportedByGCU);
+    AddStatistics(groups.size());
+    for (auto& group_ops : groups) {
+      if (group_ops.size() < static_cast<size_t>(FLAGS_gcu_min_group_size)) {
+        VLOG(0) << "current group_ops.size(): " << group_ops.size()
+                << ", less than min_group_size: "
+                << static_cast<size_t>(FLAGS_gcu_min_group_size)
+                << ", will fall back to the original paddle graph";
+        continue;
+      }
+      VLOG(0) << "current group_ops.size(): " << group_ops.size()
+              << ", greater than or equal to min_group_size: "
+              << static_cast<size_t>(FLAGS_gcu_min_group_size)
+              << ", will be lowered to a GCU graph";
+      pir::ReplaceWithGroupOp(&block, group_ops);
+    }
+    if (FLAGS_print_ir) {
+      std::cout << "GcuSubGraphExtractPass after IR = " << block << std::endl;
+    }
+  }
+
+  bool CanApplyOn(pir::Operation* op) const override {
+    return op->isa<pir::ModuleOp>() && op->num_regions() > 0;
+  }
+};
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateGcuSubGraphExtractPass() {
+  return std::make_unique<GcuSubGraphExtractPass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(gcu_sub_graph_extract_pass, GcuSubGraphExtractPass);
diff --git a/backends/gcu/passes/sub_graph_detector.cc b/backends/gcu/passes/sub_graph_detector.cc
new file mode 100644
index 000000000..1d2f22112
--- /dev/null
+++ b/backends/gcu/passes/sub_graph_detector.cc
@@ -0,0 +1,894 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
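The pass above delegates to the fluid implementation (`pir::DetectSubGraphs`), while this patch also ships a local copy in `custom_pass`. A minimal sketch of how the local detector declared in `passes/sub_graph_detector.h` would be driven (illustrative only; the classifier mirrors the marker-pass attribute check):

```cpp
// Hedged sketch: collect GCU-eligible clusters with the local detector.
#include "passes/sub_graph_detector.h"
#include "paddle/pir/include/core/builtin_attribute.h"

std::vector<custom_pass::GroupOpsVec> CollectGcuGroups(pir::Block* block) {
  // An op may enter a subgraph only if the marker pass tagged it.
  custom_pass::OpClassifier classifier = [](const pir::Operation& op) {
    return op.HasAttribute("__l_gcu__") &&
           op.attribute<pir::BoolAttribute>("__l_gcu__").data();
  };
  return custom_pass::DetectSubGraphs(block, classifier);
}
```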
+
+#include "passes/sub_graph_detector.h"
+
+#include <algorithm>
+#include <atomic>
+#include <climits>
+#include <optional>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
+#include "paddle/pir/include/core/builder.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_dialect.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
+// #include "paddle/pir/include/pass/pass.h"
+// #include "paddle/pir/include/pass/pass_registry.h"
+#include "custom_engine/custom_engine_op.h"
+#include "paddle/common/flags.h"
+
+namespace custom_pass {
+std::vector<pir::Operation*> InverselyTopologicalSort(pir::Block* block) {
+  std::vector<pir::Operation*> sort_ops;
+  std::unordered_map<pir::Operation*, int> pending_count;
+  // step 1: initialize pending_count for each defined op
+  for (auto& op : *block) {
+    if (pending_count.find(&op) == pending_count.end()) {
+      pending_count[&op] = 0;
+    }
+    for (auto operand : GetUsedExternalValue(op)) {
+      if (!operand || !operand.defining_op()) {
+        continue;
+      }
+      auto* defined_op = operand.defining_op();
+      if (pending_count.find(defined_op) != pending_count.end()) {
+        ++pending_count[defined_op];
+      } else {
+        pending_count[defined_op] = 1;
+      }
+    }
+  }
+
+  std::queue<pir::Operation*> queue;
+  for (auto& op : *block) {
+    if (pending_count[&op] == 0) {
+      queue.push(&op);
+    }
+  }
+
+  while (!queue.empty()) {
+    auto* op = queue.front();
+    queue.pop();
+    sort_ops.push_back(op);
+    for (auto operand : GetUsedExternalValue(*op)) {
+      if (!operand || !operand.defining_op()) {
+        continue;
+      }
+      auto* defined_op = operand.defining_op();
+      --pending_count[defined_op];
+      if (defined_op && pending_count[defined_op] == 0 &&
+          defined_op->GetParent() == block) {
+        queue.push(defined_op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      block->size(),
+      sort_ops.size(),
+      common::errors::InvalidArgument("sort_ops.size() must be equal to "
+                                      "block.size(), but received %d != %d",
+                                      block->size(),
+                                      sort_ops.size()));
+
+  return sort_ops;
+}
+
+std::vector<pir::Operation*> GetProducerOpsReverseSort(
+    pir::Operation* op,
+    const std::unordered_map<pir::Operation*, size_t>& op2index) {
+  std::unordered_set<pir::Operation*> producers;
+
+  std::vector<pir::Operation*> vec_res;
+  for (auto operand : GetUsedExternalValue(*op)) {
+    if (!operand || !operand.defining_op()) {
+      continue;
+    }
+    auto* source_op = operand.defining_op();
+    if (source_op && !producers.count(source_op) &&
+        source_op->GetParent() == op->GetParent()) {
+      producers.insert(source_op);
+      PADDLE_ENFORCE(op2index.count(source_op),
+                     common::errors::PreconditionNotMet(
+                         "source op MUST be in the op2index map"));
+      vec_res.emplace_back(source_op);
+    }
+  }
+
+  std::sort(vec_res.begin(),
+            vec_res.end(),
+            [&op2index](pir::Operation* a, pir::Operation* b) {
+              return op2index.at(a) > op2index.at(b);
+            });
+
+  return vec_res;
+}
+
+std::vector<pir::Operation*> GetProducerOps(pir::Operation* op) {
+  std::vector<pir::Operation*> producers;
+
+  for (auto operand : GetUsedExternalValue(*op)) {
+    if (!operand || !operand.defining_op()) {
+      continue;
+    }
+    auto* source_op = operand.defining_op();
+    if (source_op && source_op->GetParent() == op->GetParent()) {
+      producers.push_back(source_op);
+    }
+  }
+  return producers;
+}
+
+std::vector<pir::Operation*> GetConsumerOps(
+    pir::Operation* op,
+    const std::unordered_map<pir::Operation*, size_t>& op2index) {
+  std::vector<pir::Operation*> consumers;
+
+  for (auto& result : op->results()) {
+    for (auto it = result.use_begin(); it != result.use_end(); ++it) {
+      auto parent_op = it->owner();
+      while (parent_op) {
+        if (op2index.count(parent_op)) {
+          consumers.push_back(parent_op);
+          break;
+        }
+        parent_op = parent_op->GetParentOp();
+      }
+    }
+  }
+  return consumers;
+}
+
+static std::string OpsDebugStr(std::vector<pir::Operation*> ops) {
+  std::stringstream ss;
+  pir::IrPrinter printer(ss);
+  for (const auto* op : ops) {
+    printer.PrintOperation(*op);
+    ss << "{" << op->id() << "}\n";
+  }
+  return ss.str();
+}
+
+struct SubGraph : public std::enable_shared_from_this<SubGraph> {
+  using SubGraphPtr = std::shared_ptr<SubGraph>;
+  SubGraph() = delete;
+  SubGraph(pir::Operation* op, int index, bool subst)
+      : substitute(subst), topo_index(index), id(UniqueId()) {
+    ops.push_back(op);
+  }
+
+  void Merge(const SubGraphPtr& other);
+
+  static size_t UniqueId() {
+    static std::atomic<size_t> counter{0};
+    return counter++;
+  }
+
+  template <typename V>
+  static std::string JointName(const V& subgraphs) {
+    std::stringstream ss;
+    for (const auto& subgraph : subgraphs) {
+      ss << subgraph->name() << ", ";
+    }
+    auto str = ss.str();
+    return str.empty() ? str : str.substr(0, str.size() - 2);
+  }
+
+  std::string DebugStr(bool print_ops = false) const {
+    std::stringstream ss;
+    ss << "=========================================\n";
+    ss << name() << " (substitute=" << substitute << ", "
+       << "index=" << topo_index << ", "
+       << "size=" << ops.size() << ")\n";
+    if (print_ops) ss << OpsDebugStr(ops);
+    ss << "upstream: " << JointName(upstreams);
+    ss << "\ndownstream: " << JointName(downstreams);
+    return ss.str();
+  }
+
+  std::string name() const {
+    return std::string("Subgraph_") + std::to_string(id);
+  }
+
+  struct compare {
+    bool operator()(const SubGraphPtr& lhs, const SubGraphPtr& rhs) const {
+      // sort by reverse order of topo id
+      return lhs->id > rhs->id;
+    }
+  };
+
+  std::vector<pir::Operation*> ops;
+  std::set<SubGraphPtr, compare> upstreams;
+  std::set<SubGraphPtr, compare> downstreams;
+
+  bool substitute;  // whether this subgraph can be merged
+  int topo_index;
+  size_t id;
+};
+using SubGraphPtr = std::shared_ptr<SubGraph>;
+
+void SubGraph::Merge(const SubGraphPtr& other) {
+  // Merge the other subgraph into this subgraph:
+  // inherit its upstreams/downstreams and ops
+  SubGraphPtr self = shared_from_this();
+  for (const auto& upstream : other->upstreams) {
+    if (upstream == self) continue;
+    upstream->downstreams.erase(other);
+    upstream->downstreams.insert(self);
+    upstreams.insert(upstream);
+  }
+  for (const auto& downstream : other->downstreams) {
+    if (downstream == self) continue;
+    downstream->upstreams.erase(other);
+    downstream->upstreams.insert(self);
+    downstreams.insert(downstream);
+  }
+  upstreams.erase(other);
+  downstreams.erase(other);
+  ops.insert(ops.begin(), other->ops.begin(), other->ops.end());
+}
+
+bool HasSinkRoute(const SubGraphPtr& source, const SubGraphPtr& target) {
+  if (source == target) return true;
+  std::unordered_set<SubGraphPtr> visited;
+  std::queue<SubGraphPtr> queue;
+  queue.push(source);
+  visited.insert(source);
+  while (!queue.empty()) {
+    SubGraphPtr cur = queue.front();
+    queue.pop();
+    if (cur == target) return true;
+    if (cur->topo_index > target->topo_index) continue;
+    for (const auto& subgraph : cur->downstreams) {
+      if (visited.count(subgraph)) continue;
+      queue.push(subgraph);
+      visited.insert(subgraph);
+    }
+  }
+  return false;
+}
+
+bool HasLiftRoute(const SubGraphPtr& source, const SubGraphPtr& target) {
+  if (source == target) return true;
+  std::unordered_set<SubGraphPtr> visited;
+  std::queue<SubGraphPtr> queue;
+  queue.push(source);
+  visited.insert(source);
+  while (!queue.empty()) {
+    SubGraphPtr cur = queue.front();
+    queue.pop();
+    if (cur == target) return true;
+    // Prune on the current node's index (mirrors HasSinkRoute above).
+    if (cur->topo_index < target->topo_index) continue;
+    for (const auto& subgraph : cur->upstreams) {
+      if (visited.count(subgraph)) continue;
+      queue.push(subgraph);
+      visited.insert(subgraph);
+    }
+  }
+  return false;
+}
+
+bool HasRoute(const SubGraphPtr& up, const SubGraphPtr& down) {
+  return HasSinkRoute(up, down) || HasLiftRoute(down, up);
+}
+
+bool CanFuseUpstream2Downstream(const SubGraphPtr& upstream,
+                                const SubGraphPtr& downstream) {
+  PADDLE_ENFORCE(upstream->downstreams.count(downstream) &&
+                     downstream->upstreams.count(upstream),
+                 ::common::errors::InvalidArgument(
+                     "Subgraphs to be fused must have a direct relationship."));
+  auto up_downstreams = upstream->downstreams;
+  up_downstreams.erase(downstream);
+  auto down_upstreams = downstream->upstreams;
+  down_upstreams.erase(upstream);
+  if (up_downstreams.empty() || down_upstreams.empty()) return true;
+  for (const auto& subgraph : up_downstreams) {
+    if (HasSinkRoute(subgraph, downstream)) return false;
+  }
+  for (const auto& subgraph : down_upstreams) {
+    if (HasLiftRoute(subgraph, upstream)) return false;
+  }
+  return true;
+}
+
+std::optional<std::string> DetectCirclesInSubgraphs(
+    const std::vector<SubGraphPtr>& subgraph_list) {
+  std::set<SubGraphPtr, SubGraph::compare> subgraph_set(subgraph_list.begin(),
+                                                        subgraph_list.end());
+  std::unordered_map<SubGraphPtr, size_t> in_degree;
+  std::unordered_map<SubGraphPtr, size_t> out_degree;
+  for (const auto& subgraph : subgraph_set) {
+    in_degree[subgraph] = subgraph->upstreams.size();
+    out_degree[subgraph] = subgraph->downstreams.size();
+  }
+  // Recursively remove nodes with in_degree or out_degree = 0
+  bool erase_flag = true;
+  while (erase_flag) {
+    erase_flag = false;
+    for (const auto& subgraph : subgraph_list) {
+      if (subgraph_set.count(subgraph) == 0) continue;
+      if (in_degree[subgraph] == 0) {
+        for (const auto& downstream : subgraph->downstreams) {
+          in_degree[downstream]--;
+        }
+        subgraph_set.erase(subgraph);
+        erase_flag = true;
+        continue;
+      }
+      if (out_degree[subgraph] == 0) {
+        for (const auto& upstream : subgraph->upstreams) {
+          out_degree[upstream]--;
+        }
+        subgraph_set.erase(subgraph);
+        erase_flag = true;
+        continue;
+      }
+    }
+  }
+  if (subgraph_set.empty()) return std::nullopt;
+  // If subgraph_set is not empty, there are circles in the subgraphs.
+  auto circle_size = subgraph_set.size();
+  std::stringstream ss;
+  ss << "Circles detected in subgraphs (size=" << circle_size << "): \n";
+  for (const auto& subgraph : subgraph_set) {
+    ss << subgraph->DebugStr() << "\n";
+  }
+  return std::make_optional(ss.str());
+}
+
+class SubgraphDetector {
+ public:
+  SubgraphDetector(pir::Block* block, const OpClassifier& classifier);
+
+  void SubgraphFusion();
+
+  std::vector<GroupOpsVec> BuildGroups();
+
+ private:
+  void ReorderIndexOfSubgraphs();
+
+  void MergeSource2Target(const SubGraphPtr& source, const SubGraphPtr& target);
+
+  SubGraphPtr GetOpSubgraph(pir::Operation* op) {
+    PADDLE_ENFORCE(
+        op2subgraph_.count(op),
+        ::common::errors::InvalidArgument(
+            "Can not find op in op2subgraph_: \n%s", OpsDebugStr({op})));
+    return op2subgraph_.at(op);
+  }
+
+  std::unordered_map<pir::Operation*, size_t> op2index_;
+  std::vector<pir::Operation*> sort_ops_;
+  std::unordered_map<pir::Operation*, SubGraphPtr> op2subgraph_;
+  std::unordered_set<int> subgraph_index_set_;
+};
+
+void SubgraphDetector::ReorderIndexOfSubgraphs() {
+  // After merging subgraphs with a direct relation, brother subgraphs with an
+  // indirect relation may no longer be detected by index order, so the
+  // indices of all subgraphs need to be recomputed.
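+  // Illustration: if B (index 5) absorbs its direct producer A (index 3)
+  // while an unrelated subgraph C holds index 4, the index-based pruning in
+  // HasSinkRoute/HasLiftRoute can draw wrong conclusions about C relative to
+  // the merged subgraph; the BFS below assigns fresh, consistent indices.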
+  std::queue<SubGraphPtr> queue;
+  std::unordered_map<SubGraphPtr, size_t> in_degree;
+  for (auto it = sort_ops_.rbegin(); it != sort_ops_.rend(); ++it) {
+    auto subgraph = GetOpSubgraph(*it);
+    if (in_degree.count(subgraph)) continue;
+    in_degree[subgraph] = subgraph->upstreams.size();
+    if (in_degree[subgraph] == 0) queue.push(subgraph);
+  }
+  int index = 0;
+  while (!queue.empty()) {
+    auto subgraph = queue.front();
+    queue.pop();
+    subgraph->topo_index = index++;
+    for (const auto& downstream : subgraph->downstreams) {
+      in_degree[downstream]--;
+      if (in_degree[downstream] == 0) queue.push(downstream);
+    }
+  }
+}
+
+void SubgraphDetector::MergeSource2Target(const SubGraphPtr& source,
+                                          const SubGraphPtr& target) {
+  VLOG(6) << "Merge source: " << source->DebugStr();
+  VLOG(6) << "Merge target: " << target->DebugStr();
+  target->Merge(source);
+  int max_index = std::max(source->topo_index, target->topo_index);
+  int min_index = std::min(source->topo_index, target->topo_index);
+  auto merged = target;
+  // Check whether the merged subgraph and its related subgraphs
+  // still satisfy the topological order condition.
+  int upstream_max_index = -1, downstream_min_index = INT_MAX;
+  for (const auto& upstream : merged->upstreams) {
+    upstream_max_index = std::max(upstream->topo_index, upstream_max_index);
+  }
+  for (const auto& downstream : merged->downstreams) {
+    downstream_min_index =
+        std::min(downstream->topo_index, downstream_min_index);
+  }
+  // 1. If the topological order still holds after merging, just use max_index.
+  VLOG(6) << "Check if the topological order holds after merging";
+  if (min_index > upstream_max_index && max_index < downstream_min_index) {
+    merged->topo_index = max_index;
+    subgraph_index_set_.erase(min_index);
+    return;
+  }
+  // 2. Otherwise, look for an index between upstream_max_index and
+  //    downstream_min_index that is not yet in subgraph_index_set_.
+  VLOG(6) << "Try to find a valid index not in subgraph_index_set_";
+  for (int i = upstream_max_index + 1; i < downstream_min_index; ++i) {
+    if (!subgraph_index_set_.count(i)) {
+      merged->topo_index = i;
+      subgraph_index_set_.erase(min_index);
+      subgraph_index_set_.erase(max_index);
+      subgraph_index_set_.insert(i);
+      return;
+    }
+  }
+  // 3. If no valid index can be found, reorder the topo index of all
+  //    subgraphs.
+  VLOG(6) << "Reorder topo index of all subgraphs";
+  ReorderIndexOfSubgraphs();
+}
+
+SubgraphDetector::SubgraphDetector(pir::Block* block,
+                                   const OpClassifier& classifier) {
+  // init sort_ops_ in reverse topo order
+  sort_ops_ = InverselyTopologicalSort(block);
+  // init op2index_ in topo order
+  size_t index = 0;
+  for (auto& op : *block) {
+    op2index_[&op] = index++;
+  }
+  // construct subgraphs and upstream/downstream relations
+  std::vector<SubGraphPtr> subgraph_list;
+  for (const auto& op : sort_ops_) {
+    bool substitute = classifier(*op);
+    auto subgraph = std::make_shared<SubGraph>(op, op2index_[op], substitute);
+    op2subgraph_[op] = subgraph;
+    subgraph_index_set_.insert(op2index_[op]);
+    subgraph_list.push_back(subgraph);
+  }
+  for (const auto& op : sort_ops_) {
+    auto subgraph = op2subgraph_[op];
+    for (const auto& producer : GetProducerOps(op)) {
+      if (!op2subgraph_.count(producer)) continue;
+      subgraph->upstreams.insert(op2subgraph_[producer]);
+      op2subgraph_[producer]->downstreams.insert(subgraph);
+    }
+    for (const auto& consumer : GetConsumerOps(op, op2index_)) {
+      if (!op2subgraph_.count(consumer)) continue;
+      subgraph->downstreams.insert(op2subgraph_[consumer]);
+      op2subgraph_[consumer]->upstreams.insert(subgraph);
+    }
+  }
+  VLOG(6) << "Subgraphs before building groups: ";
+  for (const auto& subgraph : subgraph_list) {
+    VLOG(6) << subgraph->DebugStr(true);
+  }
+  auto circle_info = DetectCirclesInSubgraphs(subgraph_list);
+  if (circle_info) {
+    PADDLE_THROW(::common::errors::PreconditionNotMet(
+        "Before building groups: %s", circle_info.value()));
+  }
+}
+
+void SubgraphDetector::SubgraphFusion() {
+  // Two subgraphs can be merged only if there is no route between them other
+  // than their direct connection (brother subgraphs must have no route at
+  // all); otherwise a circle would be formed after merging them.
+  VLOG(4) << "Merge subgraphs with direct relation";
+  for (const auto& op : sort_ops_) {
+    auto downstream = GetOpSubgraph(op);
+    if (!downstream->substitute) continue;
+    for (const auto& producer : GetProducerOpsReverseSort(op, op2index_)) {
+      auto upstream = GetOpSubgraph(producer);
+      if (upstream == downstream || !upstream->substitute) continue;
+      if (CanFuseUpstream2Downstream(upstream, downstream)) {
+        MergeSource2Target(upstream, downstream);
+        for (auto upstream_op : upstream->ops) {
+          op2subgraph_[upstream_op] = downstream;
+        }
+        VLOG(6) << "Merged subgraph: " << downstream->DebugStr();
+      }
+    }
+  }
+
+  VLOG(4) << "Merge brother subgraphs with the same upstream";
+  for (const auto& op : sort_ops_) {
+    auto subgraph = GetOpSubgraph(op);
+    if (!subgraph->substitute) continue;
+    for (auto producer : GetProducerOpsReverseSort(op, op2index_)) {
+      if (GetOpSubgraph(producer) == subgraph) continue;
+      for (auto consumer : GetConsumerOps(producer, op2index_)) {
+        auto brother = GetOpSubgraph(consumer);
+        if (brother == subgraph || !brother->substitute) continue;
+        if (!HasRoute(subgraph, brother) && !HasRoute(brother, subgraph)) {
+          MergeSource2Target(brother, subgraph);
+          for (auto brother_op : brother->ops) {
+            op2subgraph_[brother_op] = subgraph;
+          }
+          VLOG(6) << "Merged subgraph: " << subgraph->DebugStr();
+        }
+      }
+    }
+  }
+}
+
+std::vector<GroupOpsVec> SubgraphDetector::BuildGroups() {
+  // 1. Get the subgraph list in topo order
+  std::unordered_set<SubGraphPtr> subgraph_set;
+  std::vector<SubGraphPtr> subgraph_list;
+  for (const auto& op : sort_ops_) {
+    SubGraphPtr subgraph = GetOpSubgraph(op);
+    if (subgraph_set.count(subgraph)) continue;
+    subgraph_set.insert(subgraph);
+    subgraph_list.push_back(subgraph);
+  }
+  std::reverse(subgraph_list.begin(), subgraph_list.end());
+  VLOG(6) << "Subgraphs after building groups: ";
+  for (const auto& subgraph : subgraph_list) {
+    VLOG(6) << subgraph->DebugStr(true);
+  }
+  auto circle_info = DetectCirclesInSubgraphs(subgraph_list);
+  if (circle_info) {
+    PADDLE_THROW(::common::errors::PreconditionNotMet(
+        "After building groups: %s", circle_info.value()));
+  }
+
+  // 2. Build group ops from the subgraphs that can be substituted
+  std::vector<GroupOpsVec> groups;
+  for (const auto& subgraph : subgraph_list) {
+    if (!subgraph->substitute) {
+      continue;
+    }
+    // sort group ops by natural increasing index.
+    std::vector<pir::Operation*> group_ops(subgraph->ops.begin(),
+                                           subgraph->ops.end());
+    std::sort(group_ops.begin(),
+              group_ops.end(),
+              [this](pir::Operation* a, pir::Operation* b) {
+                return this->op2index_.at(a) < this->op2index_.at(b);
+              });
+    groups.push_back(group_ops);
+  }
+  return groups;
+}
+
+std::vector<GroupOpsVec> DetectSubGraphs(pir::Block* block,
+                                         const OpClassifier& classifier) {
+  auto subgraph_detector = SubgraphDetector(block, classifier);
+  subgraph_detector.SubgraphFusion();
+  return subgraph_detector.BuildGroups();
+}
+
+std::vector<pir::Value> AnalysisOutputs(
+    const GroupOpsVec& group_ops) {  // NOLINT
+  // Get outputs via the use-def chain
+  std::unordered_set<pir::Operation*> op_set(group_ops.begin(),
+                                             group_ops.end());
+
+  std::vector<pir::Value> outputs;
+  for (auto* op : group_ops) {
+    for (size_t i = 0; i < op->num_results(); ++i) {
+      auto result = op->result(i);
+
+      for (auto use_iter = result.use_begin(); use_iter != result.use_end();
+           ++use_iter) {
+        if (!op_set.count(use_iter->owner())) {
+          outputs.push_back(result);
+          break;
+        }
+      }
+    }
+  }
+
+  // NOTE: If no value is used outside the group, we mark the last op's
+  // results as outputs. But keep in mind that this is risky.
+  if (outputs.size() == 0) {
+    for (size_t i = 0; i < group_ops.back()->num_results(); ++i) {
+      outputs.push_back(group_ops.back()->result(i));
+    }
+  }
+
+  return outputs;
+}
+
+std::vector<pir::Value> AnalysisInputs(
+    const GroupOpsVec& group_ops) {  // NOLINT
+  std::unordered_set<pir::Value> visited_values;
+  std::vector<pir::Value> group_inputs;
+  std::unordered_set<pir::Operation*> ops_set(group_ops.begin(),
+                                              group_ops.end());
+
+  // count all op's input Value
+  for (auto* op : group_ops) {
+    for (auto& value : op->operands_source()) {
+      if (!value || !value.type() || ops_set.count(value.defining_op()))
+        continue;
+      if (visited_values.count(value)) continue;
+      // if the input value's owner op is not in the op set, it is one of the
+      // group's inputs
+      visited_values.insert(value);
+      group_inputs.push_back(value);
+    }
+  }
+  return group_inputs;
+}
+
+namespace {
+
+struct IncrementalOrder {
+  bool operator()(const pir::Operation* lhs, const pir::Operation* rhs) const {
+    PADDLE_ENFORCE_EQ(lhs->GetParent() == rhs->GetParent(),
+                      true,
+                      common::errors::PreconditionNotMet(
+                          "lhs and rhs should have the same parent block."));
+    auto lhs_iter = lhs->operator pir::Block::ConstIterator();
+    auto rhs_iter = rhs->operator pir::Block::ConstIterator();
+    auto end_iter = lhs->GetParent()->end();
+    while (lhs_iter != end_iter) {
+      lhs_iter++;
+      if (lhs_iter == rhs_iter) return true;
+      if (lhs_iter == end_iter) return false;
+    }
+    PADDLE_ENFORCE_EQ(
+        false,
+        true,
+        common::errors::InvalidArgument("rhs is not reachable from lhs."));
+    return false;
+  }
+};
+
+std::unordered_set<pir::Operation*> GetUpstreamOpsAfterPosition(
+    const pir::Operation* position_op,
+    const pir::Block* block,
+    pir::Operation* op,
+    std::unordered_set<pir::Operation*>* visited_ops) {
+  std::unordered_set<pir::Operation*> ops;
+  const auto& IsInBlock = [](const pir::Operation* src_op,
+                             const pir::Block* block) {
+    for (auto& item : *block) {
+      if (src_op->id() == item.id()) return true;
+    }
+    return false;
+  };
+  std::vector<pir::Value> op_inputs = GetUsedExternalValue(*op);
+  for (auto value : op_inputs) {
+    if (!value || !value.defining_op()) continue;
+    pir::Operation* defining_op = value.defining_op();
+    if (visited_ops->count(defining_op)) continue;
+    visited_ops->insert(defining_op);
+    if (!IsInBlock(defining_op, block)) continue;
+    if (IncrementalOrder()(defining_op, position_op)) continue;
+
+    ops.insert(defining_op);
+    auto recursive_ops = GetUpstreamOpsAfterPosition(
+        position_op, block, defining_op, visited_ops);
+    ops.insert(recursive_ops.begin(), recursive_ops.end());
+  }
+  return ops;
+}
+}  // namespace
+
+void MoveUpstreamOpBeforeGroup(const GroupOpsVec& group_ops,
+                               pir::Block* block,
+                               pir::Operation* insert_point_op) {
+  const auto moved_ops = [&]() {
+    std::set<pir::Operation*, IncrementalOrder> ops_set;
+    std::unordered_set<pir::Operation*> visited_ops;
+    for (auto& op : group_ops) {
+      auto upstream_ops =
+          GetUpstreamOpsAfterPosition(insert_point_op, block, op, &visited_ops);
+      ops_set.insert(upstream_ops.begin(), upstream_ops.end());
+    }
+    return ops_set;
+  }();
+
+  for (auto& op : moved_ops) {
+    if (op == insert_point_op) continue;
+    VLOG(4) << "Move " << op->id() << " " << op->name() << " before "
+            << insert_point_op->id() << " " << insert_point_op->name();
+    op->MoveTo(block, insert_point_op->operator pir::Block::Iterator());
+  }
+}
+
+pir::Operation* FindInsertPoint(const GroupOpsVec& group_ops,
+                                const std::vector<pir::Value>& outputs) {
+  // Regard the last op as the insert position when no downstream op sits
+  // between the group_ops.
+  pir::Operation* first_op = group_ops.front();
+  pir::Operation* insert_point_op = group_ops.back();
+  auto order_info =
+      [&]() -> std::unordered_map<pir::Operation*, int64_t> {
+    std::unordered_map<pir::Operation*, int64_t> map;
+    // assign each op in the block an increasing position index.
+    auto block = insert_point_op->GetParent();
+    int64_t order = 0;
+    for (auto& op : *block) {
+      map[&op] = order++;
+    }
+    return map;
+  }();
+
+  for (auto* op : group_ops) {
+    if (order_info.at(op) > order_info.at(insert_point_op)) {
+      insert_point_op = op;
+    }
+    if (order_info.at(op) < order_info.at(first_op)) {
+      first_op = op;
+    }
+  }
+
+  auto begin = first_op->operator pir::Block::ConstIterator();
+  auto end = ++(insert_point_op->operator pir::Block::ConstIterator());
+  const std::unordered_set<pir::Value> outputs_set(outputs.begin(),
+                                                   outputs.end());
+  const std::unordered_set<const pir::Operation*> group_ops_set(
+      group_ops.begin(), group_ops.end());
+
+  const auto& IsDownstreamOp = [&](const pir::Operation* op) -> bool {
+    if (group_ops_set.find(op) != group_ops_set.end()) return false;
+    for (auto& value : GetUsedExternalValue(*op)) {
+      if (outputs_set.find(value) != outputs_set.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+  // Find the first downstream op as the final insert position.
+  for (; begin != end; ++begin) {
+    if (IsDownstreamOp(begin)) {
+      insert_point_op = begin;
+      break;
+    }
+  }
+  return insert_point_op;
+}
+
+void ReplaceWithGroupOp(pir::Block* block,
+                        const GroupOpsVec& group_ops) {  // NOLINT
+  ::pir::IrContext* ctx = ::pir::IrContext::Instance();
+  // #ifdef PADDLE_WITH_CINN
+  //   ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>();
+  // #endif
+  // #ifdef PADDLE_WITH_DNNL
+  //   ctx->GetOrRegisterDialect<paddle::dialect::OneDNNOperatorDialect>();
+  // #endif
+  ::pir::Builder builder = ::pir::Builder(ctx, block);
+  const std::vector<pir::Value> outputs =
+      custom_pass::AnalysisOutputs(group_ops);
+
+  // step 1: Analyze the outputs, move upstream ops if needed, and insert the
+  // group op before insert_point.
+  auto* insert_point = custom_pass::FindInsertPoint(group_ops, outputs);
+  custom_pass::MoveUpstreamOpBeforeGroup(group_ops, block, insert_point);
+  builder.set_insertion_point(insert_point);
+  VLOG(6) << "Insert GroupOp after " << insert_point->name();
+
+// step 2: Replace the old op with GroupOp.
+#ifdef PADDLE_WITH_CINN
+
+  auto new_group_op = [&]() -> cinn::dialect::GroupOp {
+    std::vector<pir::Type> output_types;
+    for (auto& value : outputs) output_types.emplace_back(value.type());
+
+    auto group_op = builder.Build<cinn::dialect::GroupOp>(output_types);
+    for (auto op : group_ops) {
+      op->MoveTo(group_op.block(), group_op.block()->end());
+    }
+    return group_op;
+  }();
+#else
+  auto new_group_op = [&]() -> pir::GroupOp {
+    std::vector<pir::Type> output_types;
+    for (auto& value : outputs) output_types.emplace_back(value.type());
+
+    auto group_op = builder.Build<pir::GroupOp>(output_types);
+    for (auto op : group_ops) {
+      op->MoveTo(group_op.block(), group_op.block()->end());
+    }
+    return group_op;
+  }();
+#endif
+
+  // step 3: Replace outputs of inner ops
+  const std::vector<pir::Value> group_outs = new_group_op->results();
+  std::unordered_set<pir::Operation*> inner_ops(group_ops.begin(),
+                                                group_ops.end());
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    outputs[i].ReplaceUsesWithIf(group_outs[i],
+                                 [&inner_ops](pir::OpOperand op) {
+                                   return !inner_ops.count(op.owner());
+                                 });
+  }
+
+  // step 4: Insert YieldOp for outputs
+  builder.SetInsertionPointToBlockEnd(new_group_op.block());
+  builder.Build<::pir::YieldOp>(outputs);
+}
+
+// void ReplaceWithCustomEngineOp(pir::Block* block,
+//                                const GroupOpsVec& group_ops) {
+//   ::pir::IrContext* ctx = ::pir::IrContext::Instance();
+//   ::pir::Builder builder = ::pir::Builder(ctx, block);
+//   const std::vector<pir::Value> outputs =
+//       custom_pass::AnalysisOutputs(group_ops);
+//   const std::vector<pir::Value> inputs =
+//       custom_pass::AnalysisInputs(group_ops);
+
+//   // step 1: Analysis and insert group op before insert_point.
+//   auto* insert_point = custom_pass::FindInsertPoint(group_ops, outputs);
+//   custom_pass::MoveUpstreamOpBeforeGroup(group_ops, block, insert_point);
+//   builder.set_insertion_point(insert_point);
+//   VLOG(6) << "Insert GroupOp after " << insert_point->name();
+
+//   // attrs
+//   std::vector<pir::Attribute> input_names;
+//   std::vector<pir::Attribute> output_names;
+//   for (size_t i = 0; i < inputs.size(); ++i) {
+//     std::string input_name = "graph_input_" + std::to_string(i) + "_op_" +
+//                              std::to_string(inputs[i].defining_op()->id());
+//     input_names.emplace_back(pir::StrAttribute::get(ctx, input_name));
+//   }
+//   for (size_t i = 0; i < outputs.size(); ++i) {
+//     std::string output_name = "graph_output_" + std::to_string(i) + "_op_" +
+//                               std::to_string(outputs[i].defining_op()->id());
+//     output_names.emplace_back(pir::StrAttribute::get(ctx, output_name));
+//   }
+//   pir::AttributeMap attribute_map;
+//   attribute_map.insert(
+//       {"input_names", pir::ArrayAttribute::get(ctx, input_names)});
+//   attribute_map.insert(
+//       {"output_names", pir::ArrayAttribute::get(ctx, output_names)});
+
+//   // step 2: Replace the old op with CustomEngineOp.
+//   auto custom_engine_op = [&]() -> custom_engine::CustomEngineOp* {
+//     std::vector<pir::Type> output_types;
+//     for (auto& value : outputs) output_types.emplace_back(value.type());
+//     pir::OpInfo custom_engine_op_info =
+//         ctx->GetRegisteredOpInfo(custom_engine::CustomEngineOp::name());
+
+//     pir::Operation* engine_op = pir::Operation::Create(
+//         inputs, attribute_map, output_types, custom_engine_op_info);
+
+//     builder.Insert(engine_op);
+
+//     // auto engine_op = builder.Build<custom_engine::CustomEngineOp>(
+//     //     inputs, attribute_map, output_types, custom_engine_op_info);
+//     for (auto op : group_ops) {
+//       op->MoveTo(engine_op->block(), engine_op->block()->end());
+//     }
+//     return engine_op;
+//   }();
+
+//   // step 3: Replace outputs of inner ops
+//   const std::vector<pir::Value> group_outs = custom_engine_op->results();
+//   std::unordered_set<pir::Operation*> inner_ops(group_ops.begin(),
+//                                                 group_ops.end());
+//   for (size_t i = 0; i < outputs.size(); ++i) {
+//     outputs[i].ReplaceUsesWithIf(group_outs[i],
+//                                  [&inner_ops](pir::OpOperand op) {
+//                                    return !inner_ops.count(op.owner());
+//                                  });
+//   }
+
+//   // step 4: Insert YieldOp for outputs
+//   builder.SetInsertionPointToBlockEnd(custom_engine_op.block());
+//   builder.Build<::pir::YieldOp>(outputs);
+// }
+
+}  // namespace custom_pass
diff --git a/backends/gcu/passes/sub_graph_detector.h b/backends/gcu/passes/sub_graph_detector.h
new file mode 100644
index 000000000..98040583e
--- /dev/null
+++ b/backends/gcu/passes/sub_graph_detector.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <list>
+#include <memory>
+#include <queue>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/pir/include/core/builder.h"
+
+namespace custom_pass {
+using OpClassifier = std::function<bool(const pir::Operation&)>;
+using GroupOpsVec = std::vector<pir::Operation*>;
+
+std::vector<GroupOpsVec> DetectSubGraphs(pir::Block* block,
+                                         const OpClassifier& classifier);
+
+std::vector<pir::Value> AnalysisOutputs(const GroupOpsVec& group_ops);
+std::vector<pir::Value> AnalysisInputs(const GroupOpsVec& group_ops);
+void ReplaceWithGroupOp(pir::Block* block, const GroupOpsVec& group_ops);
+void ReplaceWithCustomEngineOp(pir::Block* block,
+                               const GroupOpsVec& group_ops);
+
+pir::Operation* FindInsertPoint(const GroupOpsVec& group_ops,
+                                const std::vector<pir::Value>& outputs);
+void MoveUpstreamOpBeforeGroup(const GroupOpsVec& group_ops,
+                               pir::Block* block,
+                               pir::Operation* insert_point_op);
+
+}  // namespace custom_pass
diff --git a/backends/gcu/runtime/runtime.h b/backends/gcu/runtime/runtime.h
index a31b5e8d8..145cc8442 100644
--- a/backends/gcu/runtime/runtime.h
+++ b/backends/gcu/runtime/runtime.h
@@ -40,6 +40,7 @@
 #define ECCL_CHECK(func) CHECK_COMMON(func, ecclSuccess)
 #define TOPSATEN_CHECK(func) CHECK_COMMON(func, TOPSATEN_STATUS_SUCCESS)
 #define TOPSOP_CHECK(func) CHECK_COMMON(func, TOPSOP_STATUS_SUCCESS)
+#define TOPSGRAPH_CHECK(func) CHECK_COMMON(func, TOPS_GRAPH_SUCCESS)
 
 #ifdef __cplusplus
 extern "C" {
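To close the loop on how the pieces compose at runtime, here is a hedged sketch (not code from this patch): a GCU graph built with the hlir builder is compiled into a `topsExecutable_t` through `CompileTopsExecutable`, with every tops_graph_compiler call guarded by the new `TOPSGRAPH_CHECK` macro. The builder calls `CreateInput`, `SetOutput`, and `GetModule` are assumptions about the `hlir_builder` API; only `GcuBuilder`, `GcuOp`, and `CompileTopsExecutable` come from this diff.

```cpp
// Hedged sketch: build a trivial identity graph and compile it.
#include "custom_engine/ir_translator/utils/utils.h"

topsExecutable_t CompileIdentityGraph() {
  GcuBuilderPtr builder = std::make_shared<GcuBuilder>();
  // Declare one F32 input of shape [2, 3] and return it unchanged.
  // builder::Type / CreateInput / SetOutput are assumed hlir_builder APIs.
  builder::Type input_type({2, 3}, builder::PrimitiveType::F32());
  GcuOp input = builder->CreateInput(input_type);
  builder->SetOutput({input});
  // CompileTopsExecutable wraps topsgraphCreateProgramFromModule,
  // topsgraphCompileProgram and topsgraphGetBin, each verified by
  // TOPSGRAPH_CHECK against TOPS_GRAPH_SUCCESS.
  return custom_engine::CompileTopsExecutable(builder->GetModule());
}
```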