diff --git a/backends/gcu/CMakeLists.txt b/backends/gcu/CMakeLists.txt
index 7f1b17ee3..bc3edafc7 100644
--- a/backends/gcu/CMakeLists.txt
+++ b/backends/gcu/CMakeLists.txt
@@ -40,6 +40,7 @@ include(external/topscc)
 include_directories(${CMAKE_SOURCE_DIR})
 include_directories(/opt/tops/include)
+include_directories(${PADDLE_INC_DIR}/build)
 
 option(WITH_KERNELS "compile with custom kernels" ON)
 option(WITH_TESTING "compile with unit testing" OFF)
diff --git a/backends/gcu/custom_engine/custom_engine_interface.cc b/backends/gcu/custom_engine/custom_engine_interface.cc
index 199366164..0971c7f26 100644
--- a/backends/gcu/custom_engine/custom_engine_interface.cc
+++ b/backends/gcu/custom_engine/custom_engine_interface.cc
@@ -15,8 +15,13 @@
 #include "custom_engine/custom_engine_interface.h"
 #include "custom_engine/custom_engine_op.h"
+#include "custom_engine/gcu_engine.h"
+#include "custom_engine/gcu_engine_compiler.h"
+#include "paddle/fluid/framework/new_executor/instruction/custom_engine_instruction.h"
+#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
+#include "paddle/fluid/pir/utils/pd_op_to_kernel_utils.h"
 #include "paddle/pir/include/core/builder.h"
 #include "paddle/pir/include/core/builtin_attribute.h"
 #include "paddle/pir/include/core/builtin_dialect.h"
@@ -27,28 +32,308 @@
 #include "paddle/pir/include/core/op_trait.h"
 #include "paddle/pir/include/core/operation_utils.h"
 
-namespace paddle {
-namespace dialect {
+namespace {
+using DenseTensorType = pir::DenseTensorType;
+using AllocatedDenseTensorType = paddle::dialect::AllocatedDenseTensorType;
+using SelectedRowsType = paddle::dialect::SelectedRowsType;
+using AllocatedSelectedRowsType = paddle::dialect::AllocatedSelectedRowsType;
+using DenseTensorArrayType = paddle::dialect::DenseTensorArrayType;
+using AllocatedDenseTensorArrayType =
+    paddle::dialect::AllocatedDenseTensorArrayType;
+using SparseCooTensorType = paddle::dialect::SparseCooTensorType;
+using SparseCsrTensorType = paddle::dialect::SparseCsrTensorType;
 
-void RegisterCustomEngineOp() {
-  pir::IrContext *ctx = pir::IrContext::Instance();
+template <typename IrType, typename IrType2>
+static pir::Type CreateType(pir::Type type,
+                            const phi::Place& place,
+                            pir::Type out_dtype,
+                            pir::IrContext* ctx) {
+  auto input_type = type.dyn_cast<IrType>();
+  return IrType2::get(ctx,
+                      place,
+                      out_dtype,
+                      input_type.dims(),
+                      input_type.data_layout(),
+                      input_type.lod(),
+                      input_type.offset());
+}
+
+static pir::Type BuildOutputType(pir::Type type,
+                                 const phi::Place& place,
+                                 pir::IrContext* ctx) {
+  if (type.isa<DenseTensorType>()) {
+    auto out_dtype = type.dyn_cast<DenseTensorType>().dtype();
+    return CreateType<DenseTensorType, AllocatedDenseTensorType>(
+        type, place, out_dtype, ctx);
+  } else if (type.isa<SelectedRowsType>()) {
+    auto out_dtype = type.dyn_cast<SelectedRowsType>().dtype();
+    return CreateType<SelectedRowsType, AllocatedSelectedRowsType>(
+        type, place, out_dtype, ctx);
+  } else if (type.isa<DenseTensorArrayType>()) {
+    auto array_type = type.dyn_cast<DenseTensorArrayType>();
+    return AllocatedDenseTensorArrayType::get(ctx,
+                                              place,
+                                              array_type.dtype(),
+                                              array_type.dims(),
+                                              array_type.data_layout());
+  } else {
+    PADDLE_THROW(common::errors::Unimplemented(
+        "BuildOutputType only support DenseTensorType, SelectedRowsType, "
+        "and DenseTensorArrayType"));
+  }
+}
+
+void PushBackOutputTypes(pir::IrContext* ctx,
+                         pir::Operation* op_item,
+                         const pir::Type& origin_type,
+                         const phi::Place& out_place,
+                         const phi::KernelKey& kernel_key,
+                         std::vector<pir::Type>* op_output_types) {
+  auto result_type = origin_type;
+  if (!result_type) {
+    op_output_types->push_back(result_type);
+  } else if (result_type.isa<DenseTensorType>() ||
+             result_type.isa<SelectedRowsType>() ||
+             result_type.isa<DenseTensorArrayType>() ||
+             result_type.isa<SparseCooTensorType>() ||
+             result_type.isa<SparseCsrTensorType>()) {
+    op_output_types->push_back(BuildOutputType(result_type, out_place, ctx));
+  } else if (result_type.isa<pir::VectorType>()) {
+    std::vector<pir::Type> vec_inner_types;
+    auto base_types = result_type.dyn_cast<pir::VectorType>().data();
+    for (auto& base_type : base_types) {
+      if (base_type) {
+        if (base_type.isa<DenseTensorType>() ||
+            base_type.isa<SelectedRowsType>()) {
+          vec_inner_types.push_back(
+              BuildOutputType(base_type, out_place, ctx));
+        } else {
+          PADDLE_THROW(common::errors::Unimplemented(
+              "only support dense tensor and selected rows in vector type "
+              "for now"));
+        }
+      } else {
+        // NOTE(phlrain): the kernel does not support a nullptr in output
+        pir::Type fp32_dtype = pir::Float32Type::get(ctx);
+        phi::DDim dims = {};
+        phi::DataLayout data_layout = phi::DataLayout::NCHW;
+        phi::LegacyLoD lod = {{}};
+        size_t offset = 0;
+        auto dense_tensor_dtype = DenseTensorType::get(
+            ctx, fp32_dtype, dims, data_layout, lod, offset);
+        auto allocated_dense_tensor_dtype =
+            AllocatedDenseTensorType::get(ctx, out_place, dense_tensor_dtype);
+        vec_inner_types.push_back(allocated_dense_tensor_dtype);
+      }
+    }
+
+    pir::Type t1 = pir::VectorType::get(ctx, vec_inner_types);
+    op_output_types->push_back(t1);
+  } else {
+    PADDLE_THROW(common::errors::Unimplemented(
+        "Result type only support DenseTensorType, SelectedRowsType, "
+        "SparseCooTensorType, SparseCsrTensorType and "
+        "VectorType"));
+  }
+}
+}  // namespace
+
+C_Status RegisterCustomEngineOp() {
+  pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<pir::BuiltinDialect>();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-  pir::Dialect *custom_engine_dialect =
+  pir::Dialect* custom_engine_dialect =
       ctx->GetOrRegisterDialect<paddle::dialect::CustomEngineDialect>();
   PADDLE_ENFORCE_NOT_NULL(custom_engine_dialect,
                           "Failed to register CustomEngineDialect.");
   ctx->RegisterOpInfo(custom_engine_dialect,
-                      pir::TypeId::get<paddle::dialect::CustomEngineOp>(),
-                      paddle::dialect::CustomEngineOp::name(),
-                      paddle::dialect::CustomEngineOp::interface_set(),
-                      paddle::dialect::CustomEngineOp::GetTraitSet(),
-                      paddle::dialect::CustomEngineOp::attributes_num,
-                      paddle::dialect::CustomEngineOp::attributes_name,
-                      paddle::dialect::CustomEngineOp::VerifySigInvariants,
-                      paddle::dialect::CustomEngineOp::VerifyRegionInvariants);
+                      pir::TypeId::get<custom_engine::CustomEngineOp>(),
+                      custom_engine::CustomEngineOp::name(),
+                      custom_engine::CustomEngineOp::interface_set(),
+                      custom_engine::CustomEngineOp::GetTraitSet(),
+                      custom_engine::CustomEngineOp::attributes_num,
+                      custom_engine::CustomEngineOp::attributes_name,
+                      custom_engine::CustomEngineOp::VerifySigInvariants,
+                      custom_engine::CustomEngineOp::VerifyRegionInvariants);
   VLOG(3) << "Register CustomEngineOp successfully.";
+  return C_SUCCESS;
+}
+
+C_Status CustomEngineOpLower(C_CustomEngineLowerParams* lower_param) {
+  VLOG(3) << "Enter CustomEngineOpLower.";
+  // get lower params
+  pir::IrContext* ctx =
+      reinterpret_cast<pir::IrContext*>(lower_param->ir_context);
+  pir::Operation* op_item =
+      reinterpret_cast<pir::Operation*>(lower_param->operation);
+  phi::KernelKey* kernel_key =
+      reinterpret_cast<phi::KernelKey*>(lower_param->kernel_key);
+  phi::Place* place = reinterpret_cast<phi::Place*>(lower_param->place);
+  std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair =
+      reinterpret_cast<std::unordered_map<pir::Operation*, pir::Operation*>*>(
+          lower_param->map_op_pair);
+  std::unordered_map<pir::Value, pir::Value>* map_value_pair =
+      reinterpret_cast<std::unordered_map<pir::Value, pir::Value>*>(
+          lower_param->map_value_pair);
+  pir::Block* block = reinterpret_cast<pir::Block*>(lower_param->block);
+
+  // Prepare output types
+  std::vector<pir::Type> op_output_types;
+
+  for (size_t i = 0; i < op_item->num_results(); ++i) {
+    phi::Place out_place = phi::TransToPhiPlace(kernel_key->backend());
+    PushBackOutputTypes(ctx,
+                        op_item,
+                        op_item->result(i).type(),
+                        out_place,
+                        *kernel_key,
+                        &op_output_types);
+  }
+
+  // Prepare input
+  std::vector<pir::Value> vec_inputs;
+
+  for (size_t i = 0; i < op_item->num_operands(); ++i) {
+    auto cur_in = op_item->operand_source(i);
+    PADDLE_ENFORCE_EQ(
+        map_value_pair->count(cur_in),
+        true,
+        common::errors::PreconditionNotMet(
+            "[%d]'s input of [%s] op MUST in map pair", i, op_item->name()));
+
+    auto new_in = map_value_pair->at(cur_in);
+
+    vec_inputs.push_back(new_in);
+  }
+
+  // Prepare attr
+  std::unordered_map<std::string, pir::Attribute> op_attribute;
+  auto op_attr_map = op_item->attributes();
+  for (auto& map_item : op_attr_map) {
+    op_attribute.emplace(map_item.first, map_item.second);
+  }
+  op_attribute["op_name"] = pir::StrAttribute::get(ctx, op_item->name());
+
+  pir::OpInfo custom_engine_op_info =
+      ctx->GetRegisteredOpInfo(custom_engine::CustomEngineOp::name());
+
+  pir::Operation* op = pir::Operation::Create(
+      vec_inputs, op_attribute, op_output_types, custom_engine_op_info, 1);
+  op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id()));
+  VLOG(3) << "CustomEngineOpLower create custom_engine_op";
+
+  VLOG(3) << "CustomEngineOpLower get op_item subgraph block.";
+  pir::Region& op_item_region = op_item->region(0);
+  PADDLE_ENFORCE_EQ(
+      op_item_region.empty(),
+      false,
+      ::common::errors::Unavailable(
+          "Required CustomEngineOp's region must not be empty."));
+  pir::Block* sub_graph_block = &(op_item_region.front());
+
+  VLOG(3) << "CustomEngineOpLower set new op subgraph block.";
+  pir::Region& region = op->region(0);
+  if (region.empty()) {
+    region.emplace_back();
+  }
+  pir::Block* op_block = &(region.front());
+
+  // process subgraph block
+  paddle::dialect::ProcessBlock(
+      *place, sub_graph_block, op_block, ctx, map_op_pair, map_value_pair);
+
+  if (VLOG_IS_ON(3)) {
+    std::stringstream ss;
+    ss << "CustomEngineOpLower new op:";
+    op->Print(ss);
+    VLOG(3) << ss.str();
+  }
+
+  (*map_op_pair)[op_item] = op;
+
+  // only deal with single output
+  if (op_item->num_results() > 0) {
+    for (size_t i = 0; i < op_item->num_results(); ++i) {
+      (*map_value_pair)[op_item->result(i)] = op->result(i);
+    }
+  }
+  block->push_back(op);
+  VLOG(3) << "CustomEngineOpLower successfully.";
+  return C_SUCCESS;
 }
 
-} // namespace dialect
-} // namespace paddle
+C_Status GraphEngineBuild(C_CustomEngineInstruction instruction) {
+  VLOG(3) << "Enter GraphEngineBuild.";
+  paddle::framework::CustomEngineInstruction* instruction_ =
+      reinterpret_cast<paddle::framework::CustomEngineInstruction*>(
+          instruction);
+  pir::Operation* op = instruction_->Operation();
+  const phi::KernelContext& kernel_context = instruction_->KernelContext();
+  phi::KernelContext kernel_ctx = kernel_context;
+  auto engine_inputs = instruction_->GetEngineInputs();
+  auto engine_outputs = instruction_->GetEngineOutputs();
+  auto engine_value_to_tensors = instruction_->GetEngineValueToTensors();
+  auto engine_value_to_var_names = instruction_->GetEngineValueToVarNames();
+
+  // NOTES: The memory is managed by CustomEngineInstruction, and we provide a
+  // release interface here.
+  custom_engine::GCUEngine* gcu_engine = new custom_engine::GCUEngine();
+  auto gcu_engine_deleter = [](void* ptr) {
+    custom_engine::GCUEngine* gcu_engine =
+        static_cast<custom_engine::GCUEngine*>(ptr);
+    if (gcu_engine != nullptr) {
+      delete gcu_engine;
+    } else {
+      PADDLE_THROW(phi::errors::PreconditionNotMet("gcu_engine is nullptr"));
+    }
+  };
+
+  std::string engine_key =
+      "GCUEngine_" +
+      std::to_string(reinterpret_cast<std::uintptr_t>(instruction));
+  custom_engine::GCUEngineCompiler gcu_compiler(kernel_ctx,
+                                                op,
+                                                engine_inputs,
+                                                engine_outputs,
+                                                engine_value_to_tensors,
+                                                engine_value_to_var_names,
+                                                engine_key);
+  gcu_compiler.Compile(gcu_engine);
+
+  instruction_->SetCustomEngine(reinterpret_cast<void*>(gcu_engine));
+  instruction_->SetCustomEngineDeleter(gcu_engine_deleter);
+  VLOG(3) << "GraphEngineBuild successfully.";
+
+  return C_SUCCESS;
+}
+
+C_Status GraphEngineExecute(C_CustomEngineInstruction instruction) {
+  VLOG(3) << "Enter GraphEngineExecute.";
+  paddle::framework::CustomEngineInstruction* instruction_ =
+      reinterpret_cast<paddle::framework::CustomEngineInstruction*>(
+          instruction);
+  custom_engine::GCUEngine* gcu_engine =
+      reinterpret_cast<custom_engine::GCUEngine*>(
+          instruction_->CustomEngine());
+  PADDLE_ENFORCE_NOT_NULL(gcu_engine, "GCUEngine is nullptr.");
+
+  auto* dev_ctx =
+      static_cast<phi::CustomContext*>(phi::DeviceContextPool::Instance().Get(
+          instruction_->DeviceContext().GetPlace()));
+
+  gcu_engine->Run(*dev_ctx);
+  VLOG(3) << "GraphEngineExecute successfully.";
+  return C_SUCCESS;
+}
+
+void InitPluginCustomEngine(CustomEngineParams* params) {
+  memset(reinterpret_cast<void*>(params->interface),
+         0,
+         sizeof(C_CustomEngineInterface));
+
+  params->interface->register_custom_engine_op = RegisterCustomEngineOp;
+  params->interface->graph_engine_build = GraphEngineBuild;
+  params->interface->graph_engine_execute = GraphEngineExecute;
+  params->interface->custom_engine_op_lower = CustomEngineOpLower;
+}
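For orientation, the callbacks wired above are consumed by the framework side roughly as sketched below. This is an illustration inferred from the assignments in InitPluginCustomEngine; the authoritative declarations live in paddle/fluid/custom_engine/custom_engine_ext.h.

// Hypothetical call pattern (framework side); field names mirror those
// assigned in InitPluginCustomEngine above.
C_CustomEngineInterface iface;
CustomEngineParams params;
params.interface = &iface;
InitPluginCustomEngine(&params);        // plugin fills the callback table
iface.register_custom_engine_op();      // once, when dialects are set up
// per fused subgraph during pd_op -> kernel lowering:
//   iface.custom_engine_op_lower(&lower_params);
// per instruction at build / run time:
//   iface.graph_engine_build(instruction);
//   iface.graph_engine_execute(instruction);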
diff --git a/backends/gcu/custom_engine/custom_engine_interface.h b/backends/gcu/custom_engine/custom_engine_interface.h
index 2229a8071..b0e1151d7 100644
--- a/backends/gcu/custom_engine/custom_engine_interface.h
+++ b/backends/gcu/custom_engine/custom_engine_interface.h
@@ -13,13 +13,20 @@
 // limitations under the License.
 
 #pragma once
+#include "paddle/fluid/custom_engine/custom_engine_ext.h"
 #include "paddle/phi/extension.h"
 
-namespace paddle {
-namespace dialect {
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-void InitPluginCustomEngine(void*);
-void RegisterCustomEngineOp();
+C_Status RegisterCustomEngineOp();
+C_Status CustomEngineOpLower(C_CustomEngineLowerParams* lower_param);
+C_Status GraphEngineBuild(C_CustomEngineInstruction instruction);
+C_Status GraphEngineExecute(C_CustomEngineInstruction instruction);
 
-} // namespace dialect
-} // namespace paddle
+void InitPluginCustomEngine(CustomEngineParams* params);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/backends/gcu/custom_engine/custom_engine_op.cc b/backends/gcu/custom_engine/custom_engine_op.cc
index 89dd934fe..ff762d821 100644
--- a/backends/gcu/custom_engine/custom_engine_op.cc
+++ b/backends/gcu/custom_engine/custom_engine_op.cc
@@ -16,20 +16,19 @@
 
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 
-namespace paddle {
-namespace dialect {
-
+namespace custom_engine {
 const char *CustomEngineOp::attributes_name[2] = {"input_names",
                                                   "output_names"};
 
 OpInfoTuple CustomEngineOp::GetOpInfo() {
-  std::vector<paddle::dialect::OpInputInfo> inputs = {
-      OpInputInfo("x",
-                  "pir::VectorType",
-                  false,
-                  false,
-                  false,
-                  false)};
+  std::vector<paddle::dialect::OpInputInfo> inputs = {
+      paddle::dialect::OpInputInfo(
+          "x",
+          "pir::VectorType",
+          false,
+          false,
+          false,
+          false)};
 
   std::vector<paddle::dialect::OpAttributeInfo> attributes = {
       paddle::dialect::OpAttributeInfo(
@@ -38,13 +37,14 @@ OpInfoTuple CustomEngineOp::GetOpInfo() {
           "output_names", "pir::ArrayAttribute", "")};
 
   std::vector<paddle::dialect::OpOutputInfo> outputs = {
-      OpOutputInfo("out",
-                   "pir::VectorType",
-                   false,
-                   false)};
+      paddle::dialect::OpOutputInfo(
+          "out",
+          "pir::VectorType",
+          false,
+          false)};
 
   paddle::dialect::OpRunTimeInfo run_time_info =
-      OpRunTimeInfo("", {""}, "", {""}, {}, {}, {}, {});
+      paddle::dialect::OpRunTimeInfo("", {}, "", {}, {}, {}, {}, {});
 
   return std::make_tuple(
       inputs, attributes, outputs, run_time_info, "gcu_engine_op");
@@ -98,7 +98,7 @@ void CustomEngineOp::Build(pir::Builder &builder,  // NOLINT
     } else {
       out_types.emplace_back(pir::DenseTensorType::get(
           pir::IrContext::Instance(),
-          TransToIrDataType(outputs_dtype[i]),
+          paddle::dialect::TransToIrDataType(outputs_dtype[i]),
           phi::DDim(outputs_shape[i].data(), outputs_shape[i].size()),
           phi::DataLayout::kNCHW,
           phi::LoD(),
@@ -110,6 +110,35 @@ void CustomEngineOp::Build(pir::Builder &builder,  // NOLINT
 
   argument_outputs.emplace_back(out_vector_type);
   argument.AddOutputs(argument_outputs.begin(), argument_outputs.end());
+  argument.AddRegion(nullptr);
+  pir::PassStopGradientsDefaultly(argument);
+}
+
+void CustomEngineOp::Build(pir::Builder &builder,             // NOLINT
+                           pir::OperationArgument &argument,  // NOLINT
+                           pir::Value x,
+                           const std::vector<std::string> &input_names,
+                           const std::vector<std::string> &output_names,
+                           const std::vector<pir::Type> &outputs_type) {
+  VLOG(3) << "Start building CustomEngineOp";
+
+  VLOG(3) << "Builder construction inputs";
+  std::vector<pir::Value> argument_inputs = {x};
+  argument.AddInputs(argument_inputs);
+
+  VLOG(3) << "Builder construction attributes";
+
+  ADD_VEC_ATTRIBUTE(pir::StrAttribute, input_names);
+  ADD_VEC_ATTRIBUTE(pir::StrAttribute, output_names);
+
+  VLOG(3) << "Builder construction outputs";
+  pir::Type out_vector_type =
+      pir::VectorType::get(pir::IrContext::Instance(), outputs_type);
+  argument.AddOutput(out_vector_type);
+  argument.AddRegion(nullptr);
   pir::PassStopGradientsDefaultly(argument);
 }
 
@@ -165,7 +194,22 @@ void CustomEngineOp::VerifySig() {
   VLOG(3) << "End Verifying for: CustomEngineOp.";
 }
 
-} // namespace dialect
-} // namespace paddle
+pir::Block *CustomEngineOp::block() {
+  pir::Region &region = (*this)->region(0);
+  if (region.empty()) region.emplace_back();
+  return &region.front();
+}
+
+pir::Block *CustomEngineOp::block() const {
+  pir::Region &region = (*this)->region(0);
+  PADDLE_ENFORCE_EQ(
+      region.empty(),
+      false,
+      ::common::errors::Unavailable(
+          "Required CustomEngineOp's region must not be empty."));
+  return &region.front();
+}
+
+}  // namespace custom_engine
 
-IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomEngineOp)
+IR_DEFINE_EXPLICIT_TYPE_ID(custom_engine::CustomEngineOp)
diff --git a/backends/gcu/custom_engine/custom_engine_op.h b/backends/gcu/custom_engine/custom_engine_op.h
index 0b2dfbb6b..0034e20b5 100644
--- a/backends/gcu/custom_engine/custom_engine_op.h
+++ b/backends/gcu/custom_engine/custom_engine_op.h
@@ -48,9 +48,7 @@
   }                                                                  \
 }  // namespace pir
 
-namespace paddle {
-namespace dialect {
-
+namespace custom_engine {
 class CustomEngineOp
     : public pir::Op<CustomEngineOp, paddle::dialect::OpYamlInfoInterface> {
  public:
@@ -68,13 +66,22 @@ class CustomEngineOp
                     std::vector<std::vector<int64_t>> outputs_shape,
                     std::vector<phi::DataType> outputs_dtype);
 
+  static void Build(pir::Builder &builder,             // NOLINT
+                    pir::OperationArgument &argument,  // NOLINT
+                    pir::Value x,
+                    const std::vector<std::string> &input_names,
+                    const std::vector<std::string> &output_names,
+                    const std::vector<pir::Type> &outputs_type);
+
   void VerifySig();
 
+  pir::Block *block();
+  pir::Block *block() const;
+
   pir::Value x() { return operand_source(0); }
   pir::Value out() { return result(0); }
 };
 
-} // namespace dialect
-} // namespace paddle
+}  // namespace custom_engine
 
-IR_DECLARE_EXPLICIT_PLUGIN_TYPE_ID(paddle::dialect::CustomEngineOp)
+IR_DECLARE_EXPLICIT_PLUGIN_TYPE_ID(custom_engine::CustomEngineOp)
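The new Build overload takes precomputed pir::Type outputs, which is what the lowering path needs. A minimal construction sketch; the builder, block, and types here are illustrative, not part of this patch:

// Hypothetical usage of the second Build overload.
pir::IrContext *ctx = pir::IrContext::Instance();
pir::Builder builder(ctx, block);  // 'block' is an existing pir::Block*
std::vector<pir::Type> out_types = {/* e.g. pir::DenseTensorType::get(...) */};
auto engine_op = builder.Build<custom_engine::CustomEngineOp>(
    x,                                 // pir::Value, typically a builtin.combine result
    std::vector<std::string>{"x0"},    // input_names
    std::vector<std::string>{"out0"},  // output_names
    out_types);                        // one entry per engine output
pir::Value out = engine_op.out();     // vector-typed result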
+ +#include "custom_engine/gcu_engine.h" + +namespace custom_engine { + +GCUEngine::GCUEngine(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor) + : engine_key_(engine_key), + tops_exec_(tops_exec), + tensor_args_(tensor_args), + return_tensor_(return_tensor) { + PADDLE_ENFORCE_NOT_NULL( + tops_exec_, + phi::errors::InvalidArgument("Expect executable is not null.")); +} + +void GCUEngine::Init(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor) { + engine_key_ = engine_key; + tops_exec_ = tops_exec; + tensor_args_ = tensor_args; + return_tensor_ = return_tensor; + executor_ = std::make_shared( + tops_exec, tensor_args, return_tensor); +} + +void GCUEngine::Run(const phi::CustomContext &dev_ctx) { + VLOG(3) << "=== GCUEngine Run ==="; + executor_->Run(dev_ctx); +} + +} // namespace custom_engine diff --git a/backends/gcu/custom_engine/gcu_engine.h b/backends/gcu/custom_engine/gcu_engine.h new file mode 100644 index 000000000..29751f46c --- /dev/null +++ b/backends/gcu/custom_engine/gcu_engine.h @@ -0,0 +1,59 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "custom_engine/gcu_engine_executor.h" +#include "custom_engine/ir_translator/utils/utils.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace custom_engine { + +class GCUEngine { + public: + GCUEngine() = default; + GCUEngine(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor); + ~GCUEngine() { + if (tops_exec_ != nullptr) { + RT_CHECK(topsDestroyExecutable(tops_exec_)); + tops_exec_ = nullptr; + VLOG(3) << "Release topsExecutable and destory GCUEngine " << engine_key_; + } + } + + void Init(const std::string &engine_key, + topsExecutable_t tops_exec, + const std::vector &tensor_args, + const std::vector &return_tensor); + + void Run(const phi::CustomContext &dev_ctx); + + private: + std::string engine_key_; + topsExecutable_t tops_exec_ = nullptr; + std::vector tensor_args_; + std::vector return_tensor_; + std::shared_ptr executor_ = nullptr; +}; + +} // namespace custom_engine diff --git a/backends/gcu/custom_engine/gcu_engine_compiler.cc b/backends/gcu/custom_engine/gcu_engine_compiler.cc new file mode 100644 index 000000000..450cd8b1d --- /dev/null +++ b/backends/gcu/custom_engine/gcu_engine_compiler.cc @@ -0,0 +1,372 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/backends/gcu/custom_engine/gcu_engine_compiler.cc b/backends/gcu/custom_engine/gcu_engine_compiler.cc
new file mode 100644
index 000000000..450cd8b1d
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_compiler.cc
@@ -0,0 +1,372 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "custom_engine/gcu_engine_compiler.h"
+
+#include "custom_engine/ir_translator/translator_registry.h"
+#include "paddle/common/flags.h"
+#include "paddle/pir/include/core/builtin_attribute.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
+
+COMMON_DECLARE_bool(print_ir);
+
+namespace custom_engine {
+class GCUEngineCompiler::GCUEngineCompilerImpl {
+ public:
+  GCUEngineCompilerImpl(
+      const phi::KernelContext& kernel_context,
+      pir::Operation* op,
+      const std::vector<pir::Value>& engine_inputs,
+      const std::vector<pir::Value>& engine_outputs,
+      const std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>&
+          engine_value_to_tensors,
+      const std::unordered_map<pir::Value, std::vector<std::string>>&
+          engine_value_to_var_names,
+      const std::string& engine_key)
+      : kernel_context_(kernel_context),
+        op_(op),
+        engine_inputs_(engine_inputs),
+        engine_outputs_(engine_outputs),
+        engine_value_to_tensors_(engine_value_to_tensors),
+        engine_value_to_var_names_(engine_value_to_var_names),
+        engine_key_(engine_key) {
+    Init();
+  }
+
+  ~GCUEngineCompilerImpl() {}
+
+  void Init();
+
+  void Compile(GCUEngine* gcu_engine);
+
+ private:
+  void CreateInputs();
+  void MapInnerOutputValues(const pir::Operation* yield_op);
+  void SetGraphOutputs();
+  void ConvertGraph();
+
+  phi::KernelContext kernel_context_;
+  pir::Operation* op_;  // Not owned
+
+  std::string engine_key_;
+
+  std::vector<pir::Value> engine_inputs_;
+  std::vector<pir::Value> engine_outputs_;
+  std::vector<pir::Value> engine_inner_outputs_;
+  std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>
+      engine_value_to_tensors_;
+  std::unordered_map<pir::Value, std::vector<std::string>>
+      engine_value_to_var_names_;
+
+  pir::Block* block_;
+  std::vector<phi::DenseTensor*> inputs_;
+  std::vector<phi::DenseTensor*> outputs_;
+
+  // for GCU graph
+  GcuBuilderPtr builder_ = nullptr;
+  std::unordered_map<phi::DenseTensor*, GcuOpPtr> gcu_op_cache_;
+};
+
+GCUEngineCompiler::GCUEngineCompiler(
+    const phi::KernelContext& kernel_context,
+    pir::Operation* op,
+    const std::vector<pir::Value>& engine_inputs,
+    const std::vector<pir::Value>& engine_outputs,
+    const std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>&
+        engine_value_to_tensors,
+    const std::unordered_map<pir::Value, std::vector<std::string>>&
+        engine_value_to_var_names,
+    const std::string& engine_key) {
+  impl_ = std::make_shared<GCUEngineCompilerImpl>(kernel_context,
+                                                  op,
+                                                  engine_inputs,
+                                                  engine_outputs,
+                                                  engine_value_to_tensors,
+                                                  engine_value_to_var_names,
+                                                  engine_key);
+}
+
+void GCUEngineCompiler::Compile(GCUEngine* gcu_engine) {
+  impl_->Compile(gcu_engine);
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::Init() {
+  pir::Region& region = op_->region(0);
+  PADDLE_ENFORCE_EQ(
+      region.empty(),
+      false,
+      ::common::errors::Unavailable(
+          "Required CustomEngineOp's region must not be empty."));
+  block_ = &(region.front());
+
+  for (size_t i = 0; i < engine_inputs_.size(); ++i) {
+    PADDLE_ENFORCE_GT(engine_value_to_tensors_.count(engine_inputs_[i]),
+                      0,
+                      common::errors::PreconditionNotMet(
+                          "Input[%zu] is not in value map", i));
+    inputs_.emplace_back(engine_value_to_tensors_.at(engine_inputs_[i]).at(0));
+  }
+
+  for (size_t i = 0; i < engine_outputs_.size(); ++i) {
+    PADDLE_ENFORCE_GT(engine_value_to_tensors_.count(engine_outputs_[i]),
+                      0,
+                      common::errors::PreconditionNotMet(
+                          "Output[%zu] is not in value map", i));
+    outputs_.emplace_back(
+        engine_value_to_tensors_.at(engine_outputs_[i]).at(0));
+  }
+
+  builder_ = std::make_shared<GcuBuilder>();
+  PADDLE_ENFORCE_NOT_NULL(
+      builder_, "Failed to create gcu builder for %s.", engine_key_.c_str());
+  builder_->SetShapeInference(true);
+
+  VLOG(3) << "GCUEngineCompiler Init successfully.";
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::Compile(GCUEngine* gcu_engine) {
+  PADDLE_ENFORCE_NOT_NULL(gcu_engine,
+                          "The return GCUEngine memory is not allocated.");
+  VLOG(3) << "Compile for " << engine_key_;
+  ConvertGraph();
+
+  auto hlir_module = builder_->GetModule();
+  VLOG(3) << "Compiler begin to CompileHLIR for " << engine_key_;
+  topsExecutable_t tops_executable =
+      custom_engine::CompileTopsExecutable(hlir_module);
+  VLOG(3) << "Compiler CompileHLIR end for " << engine_key_;
+  gcu_engine->Init(engine_key_, tops_executable, inputs_, outputs_);
+  VLOG(3) << "Generate GCUEngine for " << engine_key_;
+  return;
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::CreateInputs() {
+  for (size_t i = 0; i < engine_inputs_.size(); ++i) {
+    auto tensor = engine_value_to_tensors_.at(engine_inputs_[i]).at(0);
+
+    auto ptype = custom_engine::ConvertFromPhiDataType(tensor->dtype());
+    std::vector<int64_t> dims = common::vectorize(tensor->dims());
+    builder::Type input_type(dims, ptype);
+    gcu_op_cache_[tensor] =
+        std::make_shared<GcuOp>(builder_->CreateInput(input_type));
+    VLOG(6) << "Create gcu builder input[" << i
+            << "]: " << engine_value_to_var_names_.at(engine_inputs_[i]).at(0);
+  }
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::MapInnerOutputValues(
+    const pir::Operation* yield_op) {
+  size_t input_num = yield_op->num_operands();
+  VLOG(6) << "MapInnerOutputValues for yield op:" << yield_op->name()
+          << ", input num:" << input_num;
+  PADDLE_ENFORCE_EQ(input_num,
+                    engine_outputs_.size(),
+                    common::errors::PreconditionNotMet(
+                        "Output num check failed, expect:%zu, but get:%zu",
+                        engine_outputs_.size(),
+                        input_num));
+  for (size_t i = 0; i < input_num; ++i) {
+    auto value = yield_op->operand_source(i);
+    PADDLE_ENFORCE_GT(
+        engine_value_to_tensors_.count(value),
+        0,
+        common::errors::PreconditionNotMet(
+            "Input[%zu] value of yield is not in engine_value_to_tensors_",
+            i));
+    PADDLE_ENFORCE_GT(
+        engine_value_to_var_names_.count(value),
+        0,
+        common::errors::PreconditionNotMet(
+            "Input[%zu] value of yield is not in engine_value_to_var_names_",
+            i));
+
+    engine_inner_outputs_.emplace_back(value);
+  }
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::SetGraphOutputs() {
+  std::vector<GcuOp> graph_outputs;
+  for (size_t i = 0; i < engine_inner_outputs_.size(); ++i) {
+    auto tensors = engine_value_to_tensors_.at(engine_inner_outputs_[i]);
+    PADDLE_ENFORCE_EQ(tensors.size(),
+                      1,
+                      common::errors::PreconditionNotMet(
+                          "Only support one tensor now, but get %zu, "
+                          "output_index:%zu",
+                          tensors.size(),
+                          i));
+
+    auto tensor = tensors.at(0);
+    auto inner_value_name =
+        engine_value_to_var_names_.at(engine_inner_outputs_[i]).at(0);
+    auto external_value_name =
+        engine_value_to_var_names_.at(engine_outputs_[i]).at(0);
+    PADDLE_ENFORCE_GT(
+        gcu_op_cache_.count(tensor),
+        0,
+        common::errors::PreconditionNotMet(
+            "Output[%zu] is not generated in gcu_op map, value name:%s",
+            i,
+            inner_value_name.c_str()));
+    graph_outputs.emplace_back(*(gcu_op_cache_.at(tensor)));
+
+    // set output shapes
+    auto gcu_shape = gcu_op_cache_.at(tensor)->GetType().GetShape();
+    tensor->Resize(common::make_ddim(gcu_shape));
+    outputs_[i]->Resize(common::make_ddim(gcu_shape));
+    VLOG(6) << "Found gcu builder output[" << i << "]: " << inner_value_name
+            << ", external var name:" << external_value_name
+            << ", dims:" << tensor->dims();
+  }
+  builder_->SetOutput(graph_outputs);
+}
+
+void GCUEngineCompiler::GCUEngineCompilerImpl::ConvertGraph() {
+  VLOG(3) << "ConvertGraph for " << engine_key_;
+  if (FLAGS_print_ir) {
+    std::cout << "IR Before conversion = " << *block_ << std::endl;
+  }
+
+  VLOG(3) << "Create inputs node for " << engine_key_;
+  CreateInputs();
+  // builder_->Dump();
+
+  VLOG(3) << "Convert calc ops for " << engine_key_;
+  // NOTES: Consider the subgraph to be topologically sorted.
+  std::list<pir::Operation*> graph_ops = block_->ops();
+  for (const auto* op : graph_ops) {
+    if (op->isa<pir::YieldOp>()) {
+      MapInnerOutputValues(op);
+      continue;
+    }
+    std::string op_name = op->name();
+    auto op_attributes = op->attributes();
+    if (op->HasAttribute("op_name")) {
+      op_name = op->attribute<pir::StrAttribute>("op_name").AsString();
+    }
+
+    OpTranslateFunc convert_func =
+        TranslatorRegistry::Instance().Get(OpTranslateFuncKey(op_name));
+    PADDLE_ENFORCE_NOT_NULL(convert_func);
+
+    // inputs
+    std::vector<std::vector<GcuOpPtr>> input_gcu_ops;
+    size_t input_num = op->num_operands();
+    VLOG(6) << "Get input_gcu_ops for " << op_name << ", num:" << input_num;
+    for (size_t i = 0; i < input_num; ++i) {
+      auto value = op->operand_source(i);
+      PADDLE_ENFORCE_GT(
+          engine_value_to_tensors_.count(value),
+          0,
+          common::errors::PreconditionNotMet(
+              "Input[%zu] value is not in engine_value_to_tensors_", i));
+      PADDLE_ENFORCE_GT(
+          engine_value_to_var_names_.count(value),
+          0,
+          common::errors::PreconditionNotMet(
+              "Input[%zu] value is not in engine_value_to_var_names_", i));
+
+      std::vector<GcuOpPtr> gcu_ops;
+      auto tensors = engine_value_to_tensors_.at(value);
+      auto var_names = engine_value_to_var_names_.at(value);
+
+      for (size_t n = 0; n < tensors.size(); ++n) {
+        PADDLE_ENFORCE_GT(
+            gcu_op_cache_.count(tensors[n]),
+            0,
+            common::errors::PreconditionNotMet(
+                "Input[%zu][%zu] is not generated in gcu_op map, name: %s",
+                i,
+                n,
+                var_names.at(n)));
+        gcu_ops.emplace_back(gcu_op_cache_.at(tensors[n]));
+        VLOG(6) << "op_name:" << op_name << ", inputs[" << i << "][" << n
+                << "], var name:" << var_names.at(n);
+      }
+      input_gcu_ops.emplace_back(gcu_ops);
+    }
+
+    // convert
+    VLOG(6) << "Start to convert for " << op_name;
+    GcuOpPtr gcu_op = convert_func(builder_, op, input_gcu_ops);
+    VLOG(6) << "End of conversion for " << op_name;
+
+    bool is_tuple_out = gcu_op->GetType().IsTuple();
+    if (is_tuple_out) {
+      size_t gcu_output_num = gcu_op->GetType().GetTupleSize();
+      size_t output_num = op->num_results();
+      PADDLE_ENFORCE_EQ(
+          gcu_output_num,
+          output_num,
+          common::errors::PreconditionNotMet("Output num check failed, op: %s",
+                                             op_name.c_str()));
+
+      for (size_t i = 0; i < output_num; ++i) {
+        auto out_value = op->result(i);
+        auto tensors = engine_value_to_tensors_.at(out_value);
+        PADDLE_ENFORCE_EQ(tensors.size(),
+                          1,
+                          common::errors::PreconditionNotMet(
+                              "Only support one tensor now, but get %zu, op: "
+                              "%s, output_index:%zu",
+                              tensors.size(),
+                              op_name.c_str(),
+                              i));
+
+        auto tensor = tensors.at(0);
+        gcu_op_cache_[tensor] =
+            std::make_shared<GcuOp>(builder::GetTupleElement(*gcu_op, i));
+        VLOG(6) << "Output GetTupleElement for " << op_name
+                << ", output index:" << i
+                << ", name:" << engine_value_to_var_names_.at(out_value).at(0);
+      }
+    } else {
+      if (op->num_results() == 1) {
+        auto out_value = op->result(0);
+        auto tensors = engine_value_to_tensors_.at(out_value);
+        PADDLE_ENFORCE_EQ(tensors.size(),
+                          1,
+                          common::errors::PreconditionNotMet(
+                              "Output num should be one, but get %zu, op: %s",
+                              tensors.size(),
+                              op_name.c_str()));
+        gcu_op_cache_[tensors.at(0)] = gcu_op;
+        VLOG(6) << "Output set for " << op_name
+                << ", name:" << engine_value_to_var_names_.at(out_value).at(0);
+      } else {
+        VLOG(6) << "Op " << op_name << " does not have any output value.";
+      }
+    }
+  }  // end of for (const auto* op : graph_ops)
+
+  // outputs
+  SetGraphOutputs();
+  if (FLAGS_print_ir) {
+    std::cout << "IR After conversion = " << std::endl;
+    builder_->Dump();
+  }
+}
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/gcu_engine_compiler.h b/backends/gcu/custom_engine/gcu_engine_compiler.h
new file mode 100644
index 000000000..35a9a0438
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_compiler.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include <vector>
+
+#include "custom_engine/gcu_engine.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+#include "paddle/phi/core/kernel_context.h"
+#include "paddle/pir/include/core/operation.h"
+
+namespace custom_engine {
+
+class GCUEngineCompiler {
+ public:
+  GCUEngineCompiler(
+      const phi::KernelContext& kernel_context,
+      pir::Operation* op,
+      const std::vector<pir::Value>& engine_inputs,
+      const std::vector<pir::Value>& engine_outputs,
+      const std::unordered_map<pir::Value, std::vector<phi::DenseTensor*>>&
+          engine_value_to_tensors,
+      const std::unordered_map<pir::Value, std::vector<std::string>>&
+          engine_value_to_var_names,
+      const std::string& engine_key = "GCUEngineCompiler_default");
+  ~GCUEngineCompiler() {}
+
+  // NOTES: Compile() fills a GCUEngine allocated by the caller; the caller
+  // owns and releases that memory.
+  void Compile(GCUEngine* gcu_engine);
+
+ private:
+  class GCUEngineCompilerImpl;
+  std::shared_ptr<GCUEngineCompilerImpl> impl_ = nullptr;
+};
+
+}  // namespace custom_engine
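ConvertGraph assumes the region holds a single, topologically sorted block whose terminator is cf.yield. Roughly, the expected subgraph shape (illustrative PIR, not an actual dump):

// ^block inside the CustomEngineOp region:
//   %0 = "pd_op.matmul"(%arg0, %arg1)  // each op dispatches to its registered translator
//   %1 = "pd_op.add"(%0, %arg2)
//   "cf.yield"(%1)                     // yield operands become engine_inner_outputs_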
diff --git a/backends/gcu/custom_engine/gcu_engine_executor.cc b/backends/gcu/custom_engine/gcu_engine_executor.cc
new file mode 100644
index 000000000..9ede59fd5
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_executor.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "custom_engine/gcu_engine_executor.h"
+
+namespace custom_engine {
+void GCUEngineExecutor::Init() {
+  tensor_args_device_.resize(tensor_args_.size());
+}
+
+void GCUEngineExecutor::Run(const phi::CustomContext &dev_ctx) {
+  VLOG(3) << "=== GCUEngineExecutor Run ===";
+  std::vector<void *> dev_inputs;
+  dev_inputs.reserve(tensor_args_.size());
+  std::vector<void *> dev_outputs;
+  dev_outputs.resize(return_tensor_.size());
+
+  for (size_t i = 0; i < tensor_args_.size(); ++i) {
+    auto input = tensor_args_[i];
+    PADDLE_ENFORCE_NE(
+        input, nullptr, phi::errors::InvalidArgument("inputs is null"));
+
+    if (input->initialized()) {
+      phi::DenseTensor *tensor = &(tensor_args_device_[i]);
+      if (input->place().GetType() != phi::AllocationType::CUSTOM) {
+        custom_kernel::TensorCopy(dev_ctx, *input, false, tensor);
+      } else {
+        *tensor = *input;
+      }
+      auto device_tensor = tensor_args_device_[i];
+      dev_inputs.emplace_back(device_tensor.data());
+      VLOG(6) << "GCUEngineExecutor::Run, Inputs[" << i
+              << "] addr:" << device_tensor.data() << ", capacity is "
+              << device_tensor.capacity() << ", type:" << device_tensor.dtype()
+              << ", place:" << device_tensor.place()
+              << ", ddim:" << device_tensor.dims().to_str();
+    } else {
+      VLOG(6) << "GCUEngineExecutor::Run, inputs[" << i
+              << "] is not initialized.";
+    }
+  }
+
+  for (size_t i = 0; i < return_tensor_.size(); ++i) {
+    auto *tensor = return_tensor_[i];
+    PADDLE_ENFORCE_NE(
+        tensor, nullptr, phi::errors::InvalidArgument("outputs is null"));
+    dev_ctx.Alloc(tensor, tensor->dtype());
+    dev_outputs[i] = tensor->data();
+
+    VLOG(6) << "GCUEngineExecutor::Run, outputs[" << i
+            << "] addr:" << tensor->data() << ", capacity is "
+            << tensor->capacity() << ", type:" << tensor->dtype()
+            << ", place:" << tensor->place()
+            << ", ddim:" << tensor->dims().to_str();
+  }
+
+  auto tops_stream = static_cast<topsStream_t>(dev_ctx.stream());
+  VLOG(6) << "GCUEngineExecutor Run on stream:" << tops_stream
+          << ", tops_exec_:" << tops_exec_;
+
+  static double total_time_cost = 0;
+  static int32_t exec_count = 0;
+  auto start_time = custom_kernel::GetCurrentTimestap();
+
+  RT_CHECK(topsLaunchExecutable(tops_exec_,
+                                nullptr,
+                                dev_inputs.data(),
+                                dev_inputs.size(),
+                                nullptr,
+                                nullptr,
+                                dev_outputs.data(),
+                                dev_outputs.size(),
+                                nullptr,
+                                nullptr,
+                                tops_stream));
+
+  if (VLOG_IS_ON(6)) {
+    auto time_cost = custom_kernel::GetTimeCostInMs(
+        start_time, custom_kernel::GetCurrentTimestap());
+    total_time_cost += time_cost;
+
+    VLOG(6) << "exec_count: " << ++exec_count << ", time_cost: " << time_cost
+            << ", total_time_cost: " << total_time_cost;
+  }
+  return;
+}
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/gcu_engine_executor.h b/backends/gcu/custom_engine/gcu_engine_executor.h
new file mode 100644
index 000000000..ddadba01a
--- /dev/null
+++ b/backends/gcu/custom_engine/gcu_engine_executor.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include <vector>
+
+#include "custom_engine/ir_translator/utils/utils.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace custom_engine {
+
+class GCUEngineExecutor {
+ public:
+  GCUEngineExecutor(topsExecutable_t tops_exec,
+                    const std::vector<phi::DenseTensor *> &tensor_args,
+                    const std::vector<phi::DenseTensor *> &return_tensor)
+      : tops_exec_(tops_exec),
+        tensor_args_(tensor_args),
+        return_tensor_(return_tensor) {
+    Init();
+  }
+  ~GCUEngineExecutor() {}
+
+  void Init();
+
+  void Run(const phi::CustomContext &dev_ctx);
+
+ private:
+  topsExecutable_t tops_exec_ = nullptr;  // Not owned
+  std::vector<phi::DenseTensor *> tensor_args_;
+  std::vector<phi::DenseTensor *> return_tensor_;
+
+  std::vector<phi::DenseTensor> tensor_args_device_;
+};
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/ir_translator/operators/activation_ops.cc b/backends/gcu/custom_engine/ir_translator/operators/activation_ops.cc
new file mode 100644
index 000000000..80267a109
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/activation_ops.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateAbs(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  auto input = *(gcu_op_inputs[0][0]);
+  return std::make_shared<GcuOp>(builder::Abs(input));
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_abs, custom_engine::TranslateAbs)
diff --git a/backends/gcu/custom_engine/ir_translator/operators/binary_ops.cc b/backends/gcu/custom_engine/ir_translator/operators/binary_ops.cc
new file mode 100644
index 000000000..275b7e68a
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/binary_ops.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+#define DEFINE_BINARY_TRANS_FUNC(func)                                       \
+  static GcuOpPtr TranslateBinaryOps##func(                                  \
+      GcuBuilderPtr gcu_builder,                                             \
+      const pir::Operation *op,                                              \
+      const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {             \
+    PADDLE_ENFORCE_EQ(gcu_op_inputs.size(),                                  \
+                      2,                                                     \
+                      common::errors::PreconditionNotMet(                    \
+                          "Input op num check failed, op: %s, num:%zu",      \
+                          std::string(#func).c_str(),                        \
+                          gcu_op_inputs.size()));                            \
+    auto lhs = *(gcu_op_inputs[0][0]);                                       \
+    auto rhs = *(gcu_op_inputs[1][0]);                                       \
+    return std::make_shared<GcuOp>(builder::func(lhs, rhs));                 \
+  }
+
+DEFINE_BINARY_TRANS_FUNC(Add)
+DEFINE_BINARY_TRANS_FUNC(Sub)
+DEFINE_BINARY_TRANS_FUNC(Mul)
+DEFINE_BINARY_TRANS_FUNC(Div)
+DEFINE_BINARY_TRANS_FUNC(Greater)
+DEFINE_BINARY_TRANS_FUNC(GreaterEqual)
+DEFINE_BINARY_TRANS_FUNC(Less)
+DEFINE_BINARY_TRANS_FUNC(LessEqual)
+
+#undef DEFINE_BINARY_TRANS_FUNC
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_add, custom_engine::TranslateBinaryOpsAdd)
+REGISTER_OP_TRANSLATOR(pd_op_add_, custom_engine::TranslateBinaryOpsAdd)
+REGISTER_OP_TRANSLATOR(pd_op_subtract, custom_engine::TranslateBinaryOpsSub)
+REGISTER_OP_TRANSLATOR(pd_op_multiply, custom_engine::TranslateBinaryOpsMul)
+REGISTER_OP_TRANSLATOR(pd_op_divide, custom_engine::TranslateBinaryOpsDiv)
+REGISTER_OP_TRANSLATOR(pd_op_greater_than,
+                       custom_engine::TranslateBinaryOpsGreater)
+REGISTER_OP_TRANSLATOR(pd_op_greater_equal,
+                       custom_engine::TranslateBinaryOpsGreaterEqual)
+REGISTER_OP_TRANSLATOR(pd_op_less_than, custom_engine::TranslateBinaryOpsLess)
+REGISTER_OP_TRANSLATOR(pd_op_less_equal,
+                       custom_engine::TranslateBinaryOpsLessEqual)
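For reference, DEFINE_BINARY_TRANS_FUNC(Add) expands to roughly the following; the in-place variant pd_op_add_ reuses the same translator since the GCU graph is functional:

// Expansion sketch of DEFINE_BINARY_TRANS_FUNC(Add):
static GcuOpPtr TranslateBinaryOpsAdd(
    GcuBuilderPtr gcu_builder,
    const pir::Operation *op,
    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
  // input-count check elided
  auto lhs = *(gcu_op_inputs[0][0]);
  auto rhs = *(gcu_op_inputs[1][0]);
  return std::make_shared<GcuOp>(builder::Add(lhs, rhs));
}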
diff --git a/backends/gcu/custom_engine/ir_translator/operators/full.cc b/backends/gcu/custom_engine/ir_translator/operators/full.cc
new file mode 100644
index 000000000..a3f4b3528
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/full.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateFull(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  const auto &attrs = op->attributes();
+
+  auto shape_array =
+      attrs.at("shape").dyn_cast<pir::ArrayAttribute>().AsVector();
+  std::vector<int64_t> shape;
+  if (shape_array.size() > 0) {
+    PADDLE_ENFORCE_EQ(shape_array[0].isa<pir::Int64Attribute>(),
+                      true,
+                      common::errors::Unimplemented(
+                          "the 0th element MUST be pir::Int64Attribute"));
+    for (size_t i = 0; i < shape_array.size(); ++i) {
+      shape.emplace_back(
+          shape_array[i].dyn_cast<pir::Int64Attribute>().data());
+    }
+  }
+  double value = attrs.at("value").dyn_cast<pir::DoubleAttribute>().data();
+  phi::DataType dtype =
+      attrs.at("dtype").dyn_cast<paddle::dialect::DataTypeAttribute>().data();
+  auto ptype = custom_engine::ConvertFromPhiDataType(dtype);
+  auto result =
+      builder::Const(gcu_builder, value, builder::Type(shape, ptype));
+  return std::make_shared<GcuOp>(result);
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_full, custom_engine::TranslateFull)
+REGISTER_OP_TRANSLATOR(pd_op_full_, custom_engine::TranslateFull)
diff --git a/backends/gcu/custom_engine/ir_translator/operators/matmul.cc b/backends/gcu/custom_engine/ir_translator/operators/matmul.cc
new file mode 100644
index 000000000..016f0c6d0
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/matmul.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateMatmul(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  PADDLE_ENFORCE_EQ(
+      gcu_op_inputs.size(),
+      2,
+      common::errors::PreconditionNotMet(
+          "Input op num check failed, get num:%zu", gcu_op_inputs.size()));
+  auto X = *(gcu_op_inputs[0][0]);
+  auto Y = *(gcu_op_inputs[1][0]);
+
+  auto x_shape = X.GetType().GetShape();
+  auto y_shape = Y.GetType().GetShape();
+
+  bool trans_x =
+      op->attribute("transpose_x").dyn_cast<pir::BoolAttribute>().data();
+  bool trans_y =
+      op->attribute("transpose_y").dyn_cast<pir::BoolAttribute>().data();
+
+  int64_t x_rank = x_shape.size();
+  int64_t y_rank = y_shape.size();
+  int64_t max_rank = std::max(x_rank, y_rank);
+  int64_t rank_diff = std::abs(x_rank - y_rank);
+  auto ptype = X.GetType().GetPrimitiveType();
+  int64_t batch_dim;
+
+  if (x_rank > y_rank) {
+    if (trans_x || y_rank == 1) {
+      std::vector<int64_t> broadcast_dims;
+      std::vector<int64_t> bc_shape;
+      if (y_rank == 1) {
+        for (int64_t i = 0; i < rank_diff - 1; i++) {
+          bc_shape.emplace_back(x_shape[i]);
+        }
+        bc_shape.emplace_back(y_shape[0]);
+        bc_shape.emplace_back(1);
+        broadcast_dims.emplace_back(rank_diff - 1);
+      } else {
+        for (int64_t i = 0; i < rank_diff; i++) {
+          bc_shape.emplace_back(x_shape[i]);
+        }
+        for (int64_t i = 0; i < y_rank; i++) {
+          bc_shape.emplace_back(y_shape[i]);
+        }
+        int iter = 0;
+        for (int64_t i = 0; i < x_rank; ++i) {
+          if (i < rank_diff) {
+            ++iter;
+          } else {
+            broadcast_dims.emplace_back(i);
+          }
+        }
+      }
+      builder::Type type(bc_shape, ptype);
+      Y = builder::BroadcastInDim(Y, broadcast_dims, type);
+    }
+    if (y_rank == 1) {
+      batch_dim = rank_diff - 1;
+    } else {
+      batch_dim = rank_diff;
+    }
+
+  } else if (x_rank < y_rank) {
+    std::vector<int64_t> broadcast_dims;
+    std::vector<int64_t> bc_shape;
+    if (x_rank == 1) {
+      for (int64_t i = 0; i < rank_diff - 1; i++) {
+        bc_shape.emplace_back(y_shape[i]);
+      }
+      bc_shape.emplace_back(1);
+      bc_shape.emplace_back(x_shape[0]);
+      broadcast_dims.emplace_back(rank_diff);
+    } else {
+      for (int64_t i = 0; i < rank_diff; i++) {
+        bc_shape.emplace_back(y_shape[i]);
+      }
+      for (int64_t i = 0; i < x_rank; i++) {
+        bc_shape.emplace_back(x_shape[i]);
+      }
+      int iter = 0;
+      for (int64_t i = 0; i < y_rank; ++i) {
+        if (i < rank_diff) {
+          ++iter;
+        } else {
+          broadcast_dims.emplace_back(i);
+        }
+      }
+    }
+    builder::Type type(bc_shape, ptype);
+    X = builder::BroadcastInDim(X, broadcast_dims, type);
+    if (x_rank == 1) {
+      batch_dim = rank_diff - 1;
+    } else {
+      batch_dim = rank_diff;
+    }
+
+  } else {
+    batch_dim = max_rank - 2;
+    if (x_rank == y_rank && x_rank > 3) {
+      auto x_brd_shape = x_shape;
+      auto y_brd_shape = y_shape;
+      std::vector<int64_t> x_brd_dims, y_brd_dims;
+      for (int64_t i = 0; i < x_rank - 2; ++i) {
+        x_brd_shape[i] = x_shape[i] > y_shape[i] ? x_shape[i] : y_shape[i];
+        y_brd_shape[i] = x_shape[i] > y_shape[i] ? x_shape[i] : y_shape[i];
+      }
+      x_brd_dims.resize(x_rank);
+      y_brd_dims.resize(y_rank);
+      std::iota(x_brd_dims.begin(), x_brd_dims.end(), 0);
+      std::iota(y_brd_dims.begin(), y_brd_dims.end(), 0);
+      if (x_brd_shape != x_shape) {
+        X = builder::BroadcastInDim(
+            X, x_brd_dims, builder::Type(x_brd_shape, ptype));
+      }
+      if (y_brd_shape != y_shape) {
+        Y = builder::BroadcastInDim(
+            Y, y_brd_dims, builder::Type(y_brd_shape, ptype));
+      }
+    }
+  }
+
+  builder::DotDimensionNumbers dims_attr;
+  std::vector<int64_t> lhs_batching_dimensions = {};
+  std::vector<int64_t> rhs_batching_dimensions = {};
+  std::vector<int64_t> lhs_contracting_dimensions = {};
+  std::vector<int64_t> rhs_contracting_dimensions = {};
+  if (x_rank == 1 && y_rank == 1) {
+    lhs_contracting_dimensions.emplace_back(0);
+    rhs_contracting_dimensions.emplace_back(0);
+  } else if (x_rank <= y_rank || trans_x || y_rank == 1) {
+    for (int64_t i = 0; i < max_rank - 1; ++i) {
+      if (i < batch_dim) {
+        lhs_batching_dimensions.emplace_back(i);
+        rhs_batching_dimensions.emplace_back(i);
+      } else {
+        if (trans_x && x_rank != 1) {
+          lhs_contracting_dimensions.emplace_back(i);
+        } else {
+          lhs_contracting_dimensions.emplace_back(i + 1);
+        }
+        if (trans_y && y_rank != 1) {
+          rhs_contracting_dimensions.emplace_back(i + 1);
+        } else {
+          rhs_contracting_dimensions.emplace_back(i);
+        }
+      }
+    }
+  } else {
+    lhs_contracting_dimensions.emplace_back(x_rank - 1);
+    if (y_rank != 1) {
+      if (trans_y) {
+        rhs_contracting_dimensions.emplace_back(y_rank - 1);
+      } else {
+        rhs_contracting_dimensions.emplace_back(y_rank - 2);
+      }
+    } else {
+      rhs_contracting_dimensions.emplace_back(0);
+    }
+  }
+
+  dims_attr.set_lhs_batching_dimensions(lhs_batching_dimensions);
+  dims_attr.set_rhs_batching_dimensions(rhs_batching_dimensions);
+  dims_attr.set_lhs_contracting_dimensions(lhs_contracting_dimensions);
+  dims_attr.set_rhs_contracting_dimensions(rhs_contracting_dimensions);
+  std::vector<const char *> precision_config = {};
+  auto dot = builder::DotGeneral(X, Y, dims_attr, precision_config);
+  dot.SetAttribute("op_type", builder::Attribute("DotInference"));
+  if (x_rank == 1 && y_rank == 1) {
+    auto type = dot.GetType().GetPrimitiveType();
+    std::vector<int64_t> new_shape;
+    new_shape.push_back(1);
+    builder::Type output_type(new_shape, type);
+    dot = builder::Reshape(dot, output_type);
+  } else if (y_rank == 1) {
+    auto shape = dot.GetType().GetShape();
+    auto type = dot.GetType().GetPrimitiveType();
+    std::vector<int64_t> new_shape;
+    for (size_t i = 0; i < shape.size() - 1; i++) {
+      new_shape.push_back(shape[i]);
+    }
+    builder::Type output_type(new_shape, type);
+    dot = builder::Reshape(dot, output_type);
+  } else if (x_rank == 1) {
+    auto shape = dot.GetType().GetShape();
+    auto type = dot.GetType().GetPrimitiveType();
+    std::vector<int64_t> new_shape;
+    for (size_t i = 0; i < shape.size(); i++) {
+      if (i != shape.size() - 2) {
+        new_shape.push_back(shape[i]);
+      }
+    }
+    builder::Type output_type(new_shape, type);
+    dot = builder::Reshape(dot, output_type);
+  }
+
+  auto result = std::make_shared<GcuOp>(dot);
+
+  return result;
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_matmul, custom_engine::TranslateMatmul)
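A worked case for the dimension numbers above (no transposes): X is [B, M, K] (rank 3), Y is [K, N] (rank 2). Then rank_diff = 1, the broadcast branch is skipped, batch_dim = 1, and the final else branch yields:

//   lhs_contracting_dimensions = {2}   // K axis of X
//   rhs_contracting_dimensions = {0}   // K axis of Y
//   no batching dimensions
// so builder::DotGeneral computes [B, M, K] x [K, N] -> [B, M, N].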
diff --git a/backends/gcu/custom_engine/ir_translator/operators/scale.cc b/backends/gcu/custom_engine/ir_translator/operators/scale.cc
new file mode 100644
index 000000000..c5aa18bec
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/operators/scale.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "custom_engine/ir_translator/translator_registry.h"
+
+namespace custom_engine {
+
+static GcuOpPtr TranslateScale(
+    GcuBuilderPtr gcu_builder,
+    const pir::Operation *op,
+    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
+  auto input = gcu_op_inputs[0][0];
+  const auto &attrs = op->attributes();
+  bool bias_after_scale =
+      attrs.at("bias_after_scale").dyn_cast<pir::BoolAttribute>().data();
+
+  builder::Op scale_op;
+  if (gcu_op_inputs.size() == 2) {  // with scale tensor
+    scale_op = *(gcu_op_inputs[1][0]);
+  } else {
+    float scale = attrs.at("scale").dyn_cast<::pir::FloatAttribute>().data();
+    scale_op = builder::FullLike(*input, scale);
+  }
+  float bias = attrs.at("bias").dyn_cast<pir::FloatAttribute>().data();
+  auto bias_op = builder::FullLike(*input, bias);
+  if (bias_after_scale) {
+    return std::make_shared<GcuOp>((*input) * scale_op + bias_op);
+  } else {
+    return std::make_shared<GcuOp>(((*input) + bias_op) * scale_op);
+  }
+}
+
+}  // namespace custom_engine
+
+REGISTER_OP_TRANSLATOR(pd_op_scale, custom_engine::TranslateScale)
+REGISTER_OP_TRANSLATOR(pd_op_scale_, custom_engine::TranslateScale)
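The translator reproduces the paddle scale semantics:

//   bias_after_scale == true:  out = x * scale + bias
//   bias_after_scale == false: out = (x + bias) * scale

Both scale and bias are materialized with builder::FullLike so the elementwise operator overloads on builder::Op apply directly.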
+ +#include + +#include "custom_engine/ir_translator/translator_registry.h" + +namespace custom_engine { + +static GcuOpPtr TranslateYield( + GcuBuilderPtr gcu_builder, + const pir::Operation *op, + const std::vector> &gcu_op_inputs) { + size_t output_num = gcu_op_inputs.size(); + if (output_num > 1) { + std::vector outputs; + for (size_t i = 0; i < output_num; ++i) { + outputs.emplace_back(*(gcu_op_inputs[i][0])); + } + builder::Op result = builder::Tuple(outputs); + return std::make_shared(result); + } else if (output_num == 1) { + return gcu_op_inputs[0][0]; + } else { + PADDLE_THROW(common::errors::PreconditionNotMet("Not support now.")); + } +} + +} // namespace custom_engine + +REGISTER_OP_TRANSLATOR(cf_yield, custom_engine::TranslateYield) diff --git a/backends/gcu/custom_engine/ir_translator/translator_registry.h b/backends/gcu/custom_engine/ir_translator/translator_registry.h new file mode 100644 index 000000000..d45e3c588 --- /dev/null +++ b/backends/gcu/custom_engine/ir_translator/translator_registry.h @@ -0,0 +1,131 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "custom_engine/ir_translator/utils/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/include/core/operation.h" + +namespace custom_engine { + +using OpTranslateFunc = std::function> &map_inputs)>; + +static inline std::string OpTranslateFuncKey(const std::string &op_name) { + std::regex pattern("\\."); + std::string result = std::regex_replace(op_name, pattern, "_"); + return result; +} + +class TranslatorRegistry { + public: + static TranslatorRegistry &Instance() { + static TranslatorRegistry g_op_translator_registry_instance; + return g_op_translator_registry_instance; + } + + bool Has(const std::string &op_name) const { + return translator_map_.find(op_name) != translator_map_.end(); + } + + void Insert(const std::string &op_name, + const OpTranslateFunc &op_trans_func) { + PADDLE_ENFORCE_NE( + Has(op_name), + true, + common::errors::InvalidArgument( + "OpTranslateFunc of %s has been registered.", op_name)); + translator_map_.insert({op_name, op_trans_func}); + std::cout << "TranslatorRegistry insert " << op_name << std::endl; + } + + OpTranslateFunc Get(const std::string &op_name) const { + PADDLE_ENFORCE_EQ( + Has(op_name), + true, + common::errors::InvalidArgument( + "OpTranslateFunc of %s has not been registered.", op_name)); + return translator_map_.at(op_name); + } + + private: + TranslatorRegistry() = default; + std::unordered_map translator_map_; + + TranslatorRegistry(const TranslatorRegistry &) = delete; + TranslatorRegistry(TranslatorRegistry &&) = delete; + TranslatorRegistry &operator=(const TranslatorRegistry &) = delete; + TranslatorRegistry &operator=(TranslatorRegistry &&) = delete; 
+};
+
+class OpTranslatorRegistrar {
+ public:
+  // Registration happens in the constructor of a global registrar variable.
+  // If the code that links against this library never references that
+  // variable, the linker may strip it from the generated binary, and the
+  // translator would silently never be registered. To avoid such removal, we
+  // add Touch to all registrar classes and make the USE_OP_TRANSLATOR macro
+  // call this method. So, as long as the calling code uses
+  // USE_OP_TRANSLATOR, the global registrar variable won't be removed by the
+  // linker.
+  void Touch() {}
+  OpTranslatorRegistrar(const char *op_name,
+                        const OpTranslateFunc &op_trans_func) {
+    TranslatorRegistry::Instance().Insert(op_name, op_trans_func);
+  }
+};
+
+#define STATIC_ASSERT_TRANSLATOR_GLOBAL_NAMESPACE(uniq_name, msg)             \
+  struct __test_translator_global_namespace_##uniq_name##__ {};               \
+  static_assert(                                                              \
+      std::is_same<::__test_translator_global_namespace_##uniq_name##__,      \
+                   __test_translator_global_namespace_##uniq_name##__>::value, \
+      msg)
+
+// Register a new op_trans_func that can be applied on the operator.
+#define REGISTER_OP_TRANSLATOR(op_name, op_trans_func)                \
+  STATIC_ASSERT_TRANSLATOR_GLOBAL_NAMESPACE(                          \
+      __reg_op_translator__##op_name,                                 \
+      "REGISTER_OP_TRANSLATOR must be called in global namespace");   \
+  static custom_engine::OpTranslatorRegistrar                         \
+      __op_translator_registrar_##op_name##__(#op_name, op_trans_func); \
+  int TouchOpTranslatorRegistrar_##op_name() {                        \
+    __op_translator_registrar_##op_name##__.Touch();                  \
+    return 0;                                                         \
+  }
+
+#define USE_OP_TRANSLATOR(op_name)                                  \
+  STATIC_ASSERT_TRANSLATOR_GLOBAL_NAMESPACE(                        \
+      __use_op_translator_itself_##op_name,                         \
+      "USE_OP_TRANSLATOR must be called in global namespace");      \
+  extern int TouchOpTranslatorRegistrar_##op_name();                \
+  static int use_op_translator_itself_##op_name##_ UNUSED =         \
+      TouchOpTranslatorRegistrar_##op_name()
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/ir_translator/utils/utils.cc b/backends/gcu/custom_engine/ir_translator/utils/utils.cc
new file mode 100644
index 000000000..fd45265e7
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/utils/utils.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
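For context on how the registry above is meant to be consumed, here is a minimal sketch (not part of this patch): a backend defines a translate function matching `OpTranslateFunc` and wires it up with the macros. The op name `pd_op_relu` and the HLIR op `builder::Relu` are illustrative assumptions; only the `OpTranslateFunc` signature, `REGISTER_OP_TRANSLATOR`, and `USE_OP_TRANSLATOR` come from this diff.

```cpp
// Hypothetical translator for a "pd_op.relu" op; builder::Relu is assumed.
#include "custom_engine/ir_translator/translator_registry.h"

namespace custom_engine {

static GcuOpPtr TranslateRelu(
    GcuBuilderPtr gcu_builder,
    const pir::Operation *op,
    const std::vector<std::vector<GcuOpPtr>> &gcu_op_inputs) {
  // Each inner vector holds the GCU ops produced for one pir operand.
  GcuOp input = *(gcu_op_inputs[0][0]);
  return std::make_shared<GcuOp>(builder::Relu(input));
}

}  // namespace custom_engine

// The macro argument must match OpTranslateFuncKey(op->name()),
// i.e. "pd_op.relu" becomes "pd_op_relu".
REGISTER_OP_TRANSLATOR(pd_op_relu, custom_engine::TranslateRelu)

// In a translation unit that must guarantee the registrar is linked in:
// USE_OP_TRANSLATOR(pd_op_relu);
```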
+
+#include "custom_engine/ir_translator/utils/utils.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "gcu/tops_graph_compiler/tops_graph_compiler.h"
+#include "gcu/tops_graph_compiler/tops_graph_compiler_option.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace custom_engine {
+namespace {
+std::vector<std::string> TargetOptionSplit(const std::string& s,
+                                           char delimiter) {
+  std::vector<std::string> tokens;
+  std::string token;
+  std::istringstream tokenStream(s);
+  while (std::getline(tokenStream, token, delimiter)) {
+    std::size_t first_non_space = token.find_first_not_of(" \t\n\r");
+    std::size_t last_non_space = token.find_last_not_of(" \t\n\r");
+    if (first_non_space == std::string::npos ||
+        last_non_space == std::string::npos) {
+      continue;
+    }
+    // Trim surrounding whitespace before collecting the token.
+    token =
+        token.substr(first_non_space, last_non_space - first_non_space + 1);
+    if (!token.empty()) tokens.push_back(token);
+  }
+  return tokens;
+}
+}  // namespace
+
+GcuPrimitiveType ConvertFromPhiDataType(const phi::DataType& type) {
+  switch (type) {
+    case phi::DataType::BOOL:
+      return builder::PrimitiveType::PRED();
+    case phi::DataType::INT8:
+      return builder::PrimitiveType::S8();
+    case phi::DataType::INT16:
+      return builder::PrimitiveType::S16();
+    case phi::DataType::INT32:
+      return builder::PrimitiveType::S32();
+    case phi::DataType::INT64:
+      return builder::PrimitiveType::S64();
+    case phi::DataType::FLOAT16:
+      return builder::PrimitiveType::F16();
+    case phi::DataType::FLOAT32:
+      return builder::PrimitiveType::F32();
+    case phi::DataType::FLOAT64:
+      return builder::PrimitiveType::F64();
+    case phi::DataType::UINT8:
+      return builder::PrimitiveType::U8();
+    case phi::DataType::UINT16:
+      return builder::PrimitiveType::U16();
+    case phi::DataType::UINT32:
+      return builder::PrimitiveType::U32();
+    case phi::DataType::UINT64:
+      return builder::PrimitiveType::U64();
+
+    default:
+      return builder::PrimitiveType::NONE();
+  }
+}
+
+std::vector<std::string> GetTopsCompileOptions() {
+  std::vector<std::string> opts;
+
+  auto target_name = custom_kernel::GetTargetName();
+  // std::string hlir_options = "hlir-codegen-pipeline";
+  std::string hlir_options = "tops-hlir-pipeline";
+
+  // add target options
+  constexpr int options_len = 1024;
+  char target_options[options_len];  // NOLINT
+  TOPSGRAPH_CHECK(
+      topsgraphInitOptions(target_name.c_str(), target_options, options_len));
+
+  std::string target_opt_s = std::string(target_options);
+  char delimiter = '-';
+  auto target_opt_vec = TargetOptionSplit(target_opt_s, delimiter);
+  for (const auto& it : target_opt_vec) {
+    auto temp_opt = "-" + it;
+    opts.emplace_back(temp_opt);
+  }
+  opts.emplace_back(std::string("-hlir=") + hlir_options);
+  // opts.emplace_back(
+  //     std::string("-codegen=codegen-gcu-pipeline{enable-memory-reuse=true}"));
+  // opts.emplace_back(std::string("-output=codegen"));
+
+  if (VLOG_IS_ON(3)) {
+    std::stringstream ss;
+    ss << "compile options: ";
+    for (const auto& it : opts) {
+      ss << it << " ";
+    }
+    VLOG(3) << ss.str();
+  }
+
+  return opts;
+}
+
+topsExecutable_t CompileTopsExecutable(
+    const std::shared_ptr<hlir::Module>& module) {
+  std::vector<const char*> options;
+  auto compile_options = GetTopsCompileOptions();
+  for (auto& option : compile_options) {
+    options.push_back(option.c_str());
+  }
+
+  // create program and compile
+  topsgraphProgram program;
+  TOPSGRAPH_CHECK(topsgraphCreateProgramFromModule(&program, module.get()));
+  TOPSGRAPH_CHECK(
+      topsgraphCompileProgram(program, options.size(), options.data()));
+
+  // get binary size and binary data
+  uint64_t binary_size = 0;
+  TOPSGRAPH_CHECK(topsgraphGetBinSize(program, &binary_size));
+  std::unique_ptr<char[]> binary(new char[binary_size]);
+  TOPSGRAPH_CHECK(topsgraphGetBin(program, binary.get()));
+
+  // destroy the program once the binary has been extracted
+  topsgraphDestroyProgram(&program);
+
+  topsExecutable_t exe;
+  RT_CHECK(topsCreateExecutable(&exe, binary.get(), binary_size));
+
+  return exe;
+}
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_engine/ir_translator/utils/utils.h b/backends/gcu/custom_engine/ir_translator/utils/utils.h
new file mode 100644
index 000000000..a3637812b
--- /dev/null
+++ b/backends/gcu/custom_engine/ir_translator/utils/utils.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/utils.h"
+#include "gcu/hlir_builder/hlir_builder.h"
+#include "paddle/phi/common/data_type.h"
+
+using GcuOp = ::builder::Op;
+using GcuOpPtr = std::shared_ptr<GcuOp>;
+using GcuPrimitiveType = builder::PrimitiveType;
+using GcuType = builder::Type;
+// using GcuShape = std::vector<int64_t>;
+using GcuBuilder = builder::Builder;
+using GcuBuilderPtr = std::shared_ptr<GcuBuilder>;
+using GcuGraphPtr = std::shared_ptr<hlir::Module>;
+// using GcuOpDescPtr = std::shared_ptr<GcuOpDesc>;
+
+namespace custom_engine {
+GcuPrimitiveType ConvertFromPhiDataType(const phi::DataType& type);
+
+std::vector<std::string> GetTopsCompileOptions();
+topsExecutable_t CompileTopsExecutable(
+    const std::shared_ptr<hlir::Module>& module);
+
+}  // namespace custom_engine
diff --git a/backends/gcu/custom_op/test_for_custom_engine_op.cc b/backends/gcu/custom_op/test_for_custom_engine_op.cc
index e40e59bf9..be69ff3c1 100644
--- a/backends/gcu/custom_op/test_for_custom_engine_op.cc
+++ b/backends/gcu/custom_op/test_for_custom_engine_op.cc
@@ -54,7 +54,7 @@ void TestCustomEngineOp() {
       std::vector<pir::Value>{const_op1.result(0), const_op2.result(0)});
 
   pir::OpInfo custom_engine_op_info =
-      ctx->GetRegisteredOpInfo(paddle::dialect::CustomEngineOp::name());
+      ctx->GetRegisteredOpInfo(custom_engine::CustomEngineOp::name());
 
   std::vector<pir::Type> out_types;
   out_types.emplace_back(
@@ -90,7 +90,7 @@ void TestCustomEngineOp() {
 
   builder.Insert(op1);
 
-  auto op2 = builder.Build<paddle::dialect::CustomEngineOp>(
+  auto op2 = builder.Build<custom_engine::CustomEngineOp>(
       buildin_combine_op.result(0),
       std::vector<std::string>{"input_0", "input_1"},
      std::vector<std::string>{"output_0"},
@@ -110,7 +110,7 @@ void TestCustomEngineOp() {
 }
 
 void RunTestCustomEngineOp() {
-  paddle::dialect::RegisterCustomEngineOp();
+  (void)RegisterCustomEngineOp();
   TestCustomEngineOp();
 }
 }  // namespace
diff --git a/backends/gcu/paddle_gcu_export.map b/backends/gcu/paddle_gcu_export.map
index 1b410925a..975b9be19 100644
--- a/backends/gcu/paddle_gcu_export.map
+++ b/backends/gcu/paddle_gcu_export.map
@@ -57,5 +57,6 @@ PADDLE_GCU_1.0 {
     XcclGroupEnd;
     XcclSend;
     XcclRecv;
+    InitPluginCustomEngine;
   local:
     *;
 };
diff --git a/backends/gcu/passes/gcu_custom_passes.h b/backends/gcu/passes/gcu_custom_passes.h
index 3871a21dd..6609dff66 100644
--- a/backends/gcu/passes/gcu_custom_passes.h
+++ b/backends/gcu/passes/gcu_custom_passes.h
@@ -17,3 +17,6 @@
 #include "paddle/pir/include/pass/pass_registry.h"
 
 USE_PIR_PASS(addn_replace_pass);
+USE_PIR_PASS(gcu_op_marker_pass);
+USE_PIR_PASS(gcu_sub_graph_extract_pass);
+USE_PIR_PASS(gcu_replace_with_engine_op_pass);
diff --git a/backends/gcu/passes/gcu_op_marker_pass.cc b/backends/gcu/passes/gcu_op_marker_pass.cc
new file mode 100644
index 000000000..ac9cc034c
--- /dev/null
+++ b/backends/gcu/passes/gcu_op_marker_pass.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
+#include "paddle/pir/include/core/builtin_attribute.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+namespace {
+
+// inline auto kCanRunGcuAttr = paddle::dialect::kCanRunGcuAttr;
+inline const char kCanRunGcuAttr[] = "__l_gcu__";
+
+#define DEFINE_GENERAL_PATTERN(OpName, OpType)                            \
+  class OpName##OpPattern : public pir::OpRewritePattern<OpType> {        \
+   public:                                                                \
+    using pir::OpRewritePattern<OpType>::OpRewritePattern;                \
+    bool MatchAndRewrite(OpType op,                                       \
+                         pir::PatternRewriter &rewriter) const override { \
+      if (op->HasAttribute(kCanRunGcuAttr) &&                             \
+          op->attribute<pir::BoolAttribute>(kCanRunGcuAttr).data()) {     \
+        return false;                                                     \
+      }                                                                   \
+      op->set_attribute(kCanRunGcuAttr, rewriter.bool_attr(true));        \
+      return true;                                                        \
+    }                                                                     \
+  };
+
+DEFINE_GENERAL_PATTERN(Matmul, paddle::dialect::MatmulOp)
+DEFINE_GENERAL_PATTERN(Add, paddle::dialect::AddOp)
+DEFINE_GENERAL_PATTERN(Abs, paddle::dialect::AbsOp)
+DEFINE_GENERAL_PATTERN(Full, paddle::dialect::FullOp)
+DEFINE_GENERAL_PATTERN(Scale, paddle::dialect::ScaleOp)
+
+class GcuOpMarkerPass : public pir::PatternRewritePass {
+ public:
+  GcuOpMarkerPass() : pir::PatternRewritePass("gcu_op_marker_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    pir::RewritePatternSet ps(context);
+
+#define ADD_PATTERN(OpName) \
+  ps.Add(std::make_unique<OpName##OpPattern>(context));
+    ADD_PATTERN(Matmul)
+    ADD_PATTERN(Add)
+    ADD_PATTERN(Abs)
+    ADD_PATTERN(Full)
+    ADD_PATTERN(Scale)
+#undef ADD_PATTERN
+
+    return ps;
+  }
+};
+}  // namespace
+
+namespace pir {
+std::unique_ptr<Pass> CreateGcuOpMarkerPass() {
+  return std::make_unique<GcuOpMarkerPass>();
+}
+}  // namespace pir
+
+REGISTER_IR_PASS(gcu_op_marker_pass, GcuOpMarkerPass);
diff --git a/backends/gcu/passes/gcu_replace_with_engine_op_pass.cc b/backends/gcu/passes/gcu_replace_with_engine_op_pass.cc
new file mode 100644
index 000000000..3dafea263
--- /dev/null
+++ b/backends/gcu/passes/gcu_replace_with_engine_op_pass.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include <iostream>
+#include <list>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "custom_engine/custom_engine_op.h"
+#include "paddle/common/flags.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/pir/include/core/builder.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+COMMON_DECLARE_bool(print_ir);
+namespace {
+using OpListType = std::list<pir::Operation*>;
+
+std::vector<pir::Value> AnalysisOutputs(
+    const OpListType& group_ops) {  // NOLINT
+  // Get outputs via the use-def chain
+  std::unordered_set<pir::Operation*> op_set(group_ops.begin(),
+                                             group_ops.end());
+
+  std::vector<pir::Value> outputs;
+  for (auto* op : group_ops) {
+    for (size_t i = 0; i < op->num_results(); ++i) {
+      auto result = op->result(i);
+
+      for (auto use_iter = result.use_begin(); use_iter != result.use_end();
+           ++use_iter) {
+        if (!op_set.count(use_iter->owner())) {
+          outputs.push_back(result);
+          break;
+        }
+      }
+    }
+  }
+
+  // NOTE: If no value is used outside the group, we mark the last op's
+  // results as outputs. But keep in mind that this is risky.
+  if (outputs.size() == 0) {
+    for (size_t i = 0; i < group_ops.back()->num_results(); ++i) {
+      outputs.push_back(group_ops.back()->result(i));
+    }
+  }
+
+  return outputs;
+}
+
+std::vector<pir::Value> AnalysisInputs(const OpListType& group_ops) {  // NOLINT
+  std::unordered_set<pir::Value> visited_values;
+  std::vector<pir::Value> group_inputs;
+  std::unordered_set<pir::Operation*> ops_set(group_ops.begin(),
+                                              group_ops.end());
+
+  // count all op's input Value
+  for (auto* op : group_ops) {
+    for (auto& value : op->operands_source()) {
+      if (!value || !value.type() || ops_set.count(value.defining_op()))
+        continue;
+      if (visited_values.count(value)) continue;
+      // if the input value's owner op is not in the op set, it is one of the
+      // group's inputs
+      visited_values.insert(value);
+      group_inputs.push_back(value);
+    }
+  }
+  return group_inputs;
+}
+
+class ReplaceWithCustomEngineOpPattern
+    : public pir::OpRewritePattern<pir::GroupOp> {
+ public:
+  using pir::OpRewritePattern<pir::GroupOp>::OpRewritePattern;
+
+  bool MatchAndRewrite(
+      pir::GroupOp op,
+      pir::PatternRewriter& rewriter) const override {  // NOLINT
+    pir::Block* block = op.block();
+
+    if (FLAGS_print_ir) {
+      std::cout
+          << "ReplaceWithCustomEngineOpPattern MatchAndRewrite before IR = "
+          << *(op->GetParent()) << std::endl;
+    }
+
+    OpListType group_ops = block->ops();
+
+    const std::vector<pir::Value> inputs = AnalysisInputs(group_ops);
+    const std::vector<pir::Value> outputs = op->results();
+
+    // attrs
+    std::vector<std::string> input_names;
+    std::vector<std::string> output_names;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      std::string input_name = "graph_input_" + std::to_string(i) + "_op_" +
+                               std::to_string(inputs[i].defining_op()->id());
+      input_names.emplace_back(input_name);
+    }
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      std::string output_name = "graph_output_" + std::to_string(i) + "_op_" +
+                                std::to_string(outputs[i].defining_op()->id());
+      output_names.emplace_back(output_name);
+    }
+
+    std::vector<pir::Type> output_types;
+    for (auto& value : outputs) {
+      output_types.emplace_back(value.type());
+    }
+
+    auto buildin_combine_op = rewriter.Build<pir::CombineOp>(inputs);
+
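+    // The rewrite is a three-op sandwich around the matched GroupOp:
+    // CombineOp packs the group inputs into a single vector value, the
+    // CustomEngineOp consumes it (and later receives the moved inner ops),
+    // and a SplitOp unpacks the engine results for the original users.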
+    custom_engine::CustomEngineOp custom_engine_op =
+        rewriter.Build<custom_engine::CustomEngineOp>(
+            buildin_combine_op.out(), input_names, output_names, output_types);
+
+    auto out_split_op = rewriter.Build<pir::SplitOp>(custom_engine_op.out());
+    std::vector<pir::Value> new_outputs = out_split_op.outputs();
+
+    if (FLAGS_print_ir) {
+      std::cout << "custom_engine_op name: " << custom_engine_op.name()
+                << std::endl;
+      std::cout << "ReplaceWithCustomEngineOpPattern MatchAndRewrite mid IR = "
+                << *(op->GetParent()) << std::endl;
+    }
+
+    for (auto inner_op : group_ops) {
+      inner_op->MoveTo(custom_engine_op.block(),
+                       custom_engine_op.block()->end());
+    }
+    rewriter.ReplaceOp(op, new_outputs);
+
+    if (FLAGS_print_ir) {
+      std::cout
+          << "ReplaceWithCustomEngineOpPattern MatchAndRewrite after IR = "
+          << *(op->GetParent()) << std::endl;
+    }
+
+    return true;
+  }
+};
+
+class GcuReplaceWithCustomEngineOpPass : public pir::PatternRewritePass {
+ public:
+  GcuReplaceWithCustomEngineOpPass()
+      : pir::PatternRewritePass("gcu_replace_with_engine_op_pass", 2) {}
+
+  pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override {
+    pir::RewritePatternSet ps(context);
+    ps.Add(std::make_unique<ReplaceWithCustomEngineOpPattern>(context));
+    return ps;
+  }
+};
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateGcuReplaceWithCustomEngineOpPass() {
+  return std::make_unique<GcuReplaceWithCustomEngineOpPass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(gcu_replace_with_engine_op_pass,
+                 GcuReplaceWithCustomEngineOpPass);
diff --git a/backends/gcu/passes/gcu_sub_graph_extract_pass.cc b/backends/gcu/passes/gcu_sub_graph_extract_pass.cc
new file mode 100644
index 000000000..26870a845
--- /dev/null
+++ b/backends/gcu/passes/gcu_sub_graph_extract_pass.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
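How the three passes compose: marking, clustering, then engine-op substitution. The sketch below is illustrative, not code from this patch; the `Create*` functions are defined in the pass files of this diff, while the `pir::PassManager` usage follows the upstream PIR API and is an assumption here.

```cpp
// Hedged sketch: chaining the GCU passes added by this patch.
#include "paddle/pir/include/core/program.h"
#include "paddle/pir/include/pass/pass_manager.h"

namespace pir {
// Defined in gcu_op_marker_pass.cc, gcu_sub_graph_extract_pass.cc and
// gcu_replace_with_engine_op_pass.cc; re-declared here for the sketch.
std::unique_ptr<Pass> CreateGcuOpMarkerPass();
std::unique_ptr<Pass> CreateGcuSubGraphExtractPass();
std::unique_ptr<Pass> CreateGcuReplaceWithCustomEngineOpPass();
}  // namespace pir

void ApplyGcuEnginePasses(pir::Program* program) {
  pir::PassManager pm(pir::IrContext::Instance());
  pm.AddPass(pir::CreateGcuOpMarkerPass());                   // 1. tag "__l_gcu__" ops
  pm.AddPass(pir::CreateGcuSubGraphExtractPass());            // 2. cluster into GroupOps
  pm.AddPass(pir::CreateGcuReplaceWithCustomEngineOpPass());  // 3. lower to CustomEngineOp
  pm.Run(program);
}
```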
+
+#include <memory>
+
+#include <iostream>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/common/flags.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/pir/include/core/builder.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
+
+// #include "passes/sub_graph_detector.h"
+#include "paddle/fluid/pir/utils/sub_graph_detector.h"
+
+// COMMON_DECLARE_int32(gcu_min_group_size);
+PHI_DEFINE_EXPORTED_int32(
+    gcu_min_group_size,
+    1,
+    "when the gcu subgraph size is not larger than `gcu_min_group_size`, the "
+    "group will fall back to the original graph.");
+
+COMMON_DECLARE_bool(print_ir);
+
+namespace {
+using GroupOpsVec = std::vector<pir::Operation*>;
+inline const char kCanRunGcuAttr[] = "__l_gcu__";
+
+bool IsSupportedByGCU(const pir::Operation& op) {
+  if (op.HasAttribute(kCanRunGcuAttr) &&
+      op.attribute<pir::BoolAttribute>(kCanRunGcuAttr).data()) {
+    return true;
+  }
+  return false;
+}
+
+class GcuSubGraphExtractPass : public pir::Pass {
+ public:
+  GcuSubGraphExtractPass() : pir::Pass("gcu_sub_graph_extract_pass", 2) {}
+
+  void Run(pir::Operation* op) override {
+    auto module_op = op->dyn_cast<pir::ModuleOp>();
+    PADDLE_ENFORCE_NOT_NULL(
+        module_op,
+        common::errors::InvalidArgument(
+            "sub_graph_extract_pass should run on a module op."));
+    auto& block = module_op.block();
+
+    if (FLAGS_print_ir) {
+      std::cout << "GcuSubGraphExtractPass before IR = " << block << std::endl;
+    }
+
+    std::vector<GroupOpsVec> groups =
+        pir::DetectSubGraphs(&block, IsSupportedByGCU);
+    AddStatistics(groups.size());
+    for (auto& group_ops : groups) {
+      if (group_ops.size() < static_cast<size_t>(FLAGS_gcu_min_group_size)) {
+        VLOG(0) << "current group_ops.size(): " << group_ops.size()
+                << ", less than min_group_size: "
+                << static_cast<size_t>(FLAGS_gcu_min_group_size)
+                << ", will fall back to the original paddle graph";
+        continue;
+      }
+      VLOG(0) << "current group_ops.size(): " << group_ops.size()
+              << ", greater than or equal to min_group_size: "
+              << static_cast<size_t>(FLAGS_gcu_min_group_size)
+              << ", will be lowered to a GCU graph";
+      pir::ReplaceWithGroupOp(&block, group_ops);
+    }
+    if (FLAGS_print_ir) {
+      std::cout << "GcuSubGraphExtractPass after IR = " << block << std::endl;
+    }
+  }
+
+  bool CanApplyOn(pir::Operation* op) const override {
+    return op->isa<pir::ModuleOp>() && op->num_regions() > 0;
+  }
+};
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateGcuSubGraphExtractPass() {
+  return std::make_unique<GcuSubGraphExtractPass>();
+}
+
+}  // namespace pir
+
+REGISTER_IR_PASS(gcu_sub_graph_extract_pass, GcuSubGraphExtractPass);
diff --git a/backends/gcu/passes/sub_graph_detector.cc b/backends/gcu/passes/sub_graph_detector.cc
new file mode 100644
index 000000000..1d2f22112
--- /dev/null
+++ b/backends/gcu/passes/sub_graph_detector.cc
@@ -0,0 +1,894 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
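The pass above delegates to the fluid implementation (`pir::DetectSubGraphs`), while this patch also ships a local copy in `custom_pass`. A minimal sketch of how the local detector declared in `passes/sub_graph_detector.h` would be driven (illustrative only; the classifier mirrors the marker-pass attribute check):

```cpp
// Hedged sketch: collect GCU-eligible clusters with the local detector.
#include "passes/sub_graph_detector.h"
#include "paddle/pir/include/core/builtin_attribute.h"

std::vector<custom_pass::GroupOpsVec> CollectGcuGroups(pir::Block* block) {
  // An op may enter a subgraph only if the marker pass tagged it.
  custom_pass::OpClassifier classifier = [](const pir::Operation& op) {
    return op.HasAttribute("__l_gcu__") &&
           op.attribute<pir::BoolAttribute>("__l_gcu__").data();
  };
  return custom_pass::DetectSubGraphs(block, classifier);
}
```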
+
+#include "passes/sub_graph_detector.h"
+
+#include <algorithm>
+#include <atomic>
+#include <climits>
+#include <optional>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
+#include "paddle/pir/include/core/builder.h"
+#include "paddle/pir/include/core/builtin_op.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_dialect.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
+// #include "paddle/pir/include/pass/pass.h"
+// #include "paddle/pir/include/pass/pass_registry.h"
+#include "custom_engine/custom_engine_op.h"
+#include "paddle/common/flags.h"
+
+namespace custom_pass {
+std::vector<pir::Operation*> InverselyTopologicalSort(pir::Block* block) {
+  std::vector<pir::Operation*> sort_ops;
+  std::unordered_map<pir::Operation*, int> pending_count;
+  // step 1: initialize pending_count for each defined op
+  for (auto& op : *block) {
+    if (pending_count.find(&op) == pending_count.end()) {
+      pending_count[&op] = 0;
+    }
+    for (auto operand : GetUsedExternalValue(op)) {
+      if (!operand || !operand.defining_op()) {
+        continue;
+      }
+      auto* defined_op = operand.defining_op();
+      if (pending_count.find(defined_op) != pending_count.end()) {
+        ++pending_count[defined_op];
+      } else {
+        pending_count[defined_op] = 1;
+      }
+    }
+  }
+
+  std::queue<pir::Operation*> queue;
+  for (auto& op : *block) {
+    if (pending_count[&op] == 0) {
+      queue.push(&op);
+    }
+  }
+
+  while (!queue.empty()) {
+    auto* op = queue.front();
+    queue.pop();
+    sort_ops.push_back(op);
+    for (auto operand : GetUsedExternalValue(*op)) {
+      if (!operand || !operand.defining_op()) {
+        continue;
+      }
+      auto* defined_op = operand.defining_op();
+      --pending_count[defined_op];
+      if (defined_op && pending_count[defined_op] == 0 &&
+          defined_op->GetParent() == block) {
+        queue.push(defined_op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      block->size(),
+      sort_ops.size(),
+      common::errors::InvalidArgument("sort_ops.size() must be equal to "
+                                      "block.size(), but received %d != %d",
+                                      block->size(),
+                                      sort_ops.size()));
+
+  return sort_ops;
+}
+
+std::vector<pir::Operation*> GetProducerOpsReverseSort(
+    pir::Operation* op,
+    const std::unordered_map<pir::Operation*, size_t>& op2index) {
+  std::unordered_set<pir::Operation*> producers;
+
+  std::vector<pir::Operation*> vec_res;
+  for (auto operand : GetUsedExternalValue(*op)) {
+    if (!operand || !operand.defining_op()) {
+      continue;
+    }
+    auto* source_op = operand.defining_op();
+    if (source_op && !producers.count(source_op) &&
+        source_op->GetParent() == op->GetParent()) {
+      producers.insert(source_op);
+      PADDLE_ENFORCE(op2index.count(source_op),
+                     common::errors::PreconditionNotMet(
+                         "source op MUST be in the op2index map"));
+      vec_res.emplace_back(source_op);
+    }
+  }
+
+  std::sort(vec_res.begin(),
+            vec_res.end(),
+            [&op2index](pir::Operation* a, pir::Operation* b) {
+              return op2index.at(a) > op2index.at(b);
+            });
+
+  return vec_res;
+}
+
+std::vector<pir::Operation*> GetProducerOps(pir::Operation* op) {
+  std::vector<pir::Operation*> producers;
+
+  for (auto operand : GetUsedExternalValue(*op)) {
+    if (!operand || !operand.defining_op()) {
+      continue;
+    }
+    auto* source_op = operand.defining_op();
+    if (source_op && source_op->GetParent() == op->GetParent()) {
+      producers.push_back(source_op);
+    }
+  }
+  return producers;
+}
+
+std::vector<pir::Operation*> GetConsumerOps(
+    pir::Operation* op,
+    const std::unordered_map<pir::Operation*, size_t>& op2index) {
+  std::vector<pir::Operation*> consumers;
+
+  for (auto& result : op->results()) {
+    for (auto it = result.use_begin(); it != result.use_end(); ++it) {
+      auto parent_op = it->owner();
+      while (parent_op) {
+        if (op2index.count(parent_op)) {
+          consumers.push_back(parent_op);
+          break;
+        }
+        parent_op = parent_op->GetParentOp();
+      }
+    }
+  }
+  return consumers;
+}
+
+static std::string OpsDebugStr(std::vector<pir::Operation*> ops) {
+  std::stringstream ss;
+  pir::IrPrinter printer(ss);
+  for (const auto* op : ops) {
+    printer.PrintOperation(*op);
+    ss << "{" << op->id() << "}\n";
+  }
+  return ss.str();
+}
+
+struct SubGraph : public std::enable_shared_from_this<SubGraph> {
+  using SubGraphPtr = std::shared_ptr<SubGraph>;
+  SubGraph() = delete;
+  SubGraph(pir::Operation* op, int index, bool subst)
+      : substitute(subst), topo_index(index), id(UniqueId()) {
+    ops.push_back(op);
+  }
+
+  void Merge(const SubGraphPtr& other);
+
+  static size_t UniqueId() {
+    static std::atomic<size_t> counter{0};
+    return counter++;
+  }
+
+  template <typename V>
+  static std::string JointName(const V& subgraphs) {
+    std::stringstream ss;
+    for (const auto& subgraph : subgraphs) {
+      ss << subgraph->name() << ", ";
+    }
+    auto str = ss.str();
+    return str.empty() ? str : str.substr(0, str.size() - 2);
+  }
+
+  std::string DebugStr(bool print_ops = false) const {
+    std::stringstream ss;
+    ss << "=========================================\n";
+    ss << name() << " (substitute=" << substitute << ", "
+       << "index=" << topo_index << ", "
+       << "size=" << ops.size() << ")\n";
+    if (print_ops) ss << OpsDebugStr(ops);
+    ss << "upstream: " << JointName(upstreams);
+    ss << "\ndownstream: " << JointName(downstreams);
+    return ss.str();
+  }
+
+  std::string name() const {
+    return std::string("Subgraph_") + std::to_string(id);
+  }
+
+  struct compare {
+    bool operator()(const SubGraphPtr& lhs, const SubGraphPtr& rhs) const {
+      // sort by reverse order of topo id
+      return lhs->id > rhs->id;
+    }
+  };
+
+  std::vector<pir::Operation*> ops;
+  std::set<SubGraphPtr, compare> upstreams;
+  std::set<SubGraphPtr, compare> downstreams;
+
+  bool substitute;  // whether this subgraph can be merged
+  int topo_index;
+  size_t id;
+};
+using SubGraphPtr = std::shared_ptr<SubGraph>;
+
+void SubGraph::Merge(const SubGraphPtr& other) {
+  // Merge the other subgraph into this subgraph:
+  // inherit its upstreams/downstreams and ops
+  SubGraphPtr self = shared_from_this();
+  for (const auto& upstream : other->upstreams) {
+    if (upstream == self) continue;
+    upstream->downstreams.erase(other);
+    upstream->downstreams.insert(self);
+    upstreams.insert(upstream);
+  }
+  for (const auto& downstream : other->downstreams) {
+    if (downstream == self) continue;
+    downstream->upstreams.erase(other);
+    downstream->upstreams.insert(self);
+    downstreams.insert(downstream);
+  }
+  upstreams.erase(other);
+  downstreams.erase(other);
+  ops.insert(ops.begin(), other->ops.begin(), other->ops.end());
+}
+
+bool HasSinkRoute(const SubGraphPtr& source, const SubGraphPtr& target) {
+  if (source == target) return true;
+  std::unordered_set<SubGraphPtr> visited;
+  std::queue<SubGraphPtr> queue;
+  queue.push(source);
+  visited.insert(source);
+  while (!queue.empty()) {
+    SubGraphPtr cur = queue.front();
+    queue.pop();
+    if (cur == target) return true;
+    if (cur->topo_index > target->topo_index) continue;
+    for (const auto& subgraph : cur->downstreams) {
+      if (visited.count(subgraph)) continue;
+      queue.push(subgraph);
+      visited.insert(subgraph);
+    }
+  }
+  return false;
+}
+
+bool HasLiftRoute(const SubGraphPtr& source, const SubGraphPtr& target) {
+  if (source == target) return true;
+  std::unordered_set<SubGraphPtr> visited;
+  std::queue<SubGraphPtr> queue;
+  queue.push(source);
+  visited.insert(source);
+  while (!queue.empty()) {
+    SubGraphPtr cur = queue.front();
+    queue.pop();
+    if (cur == target) return true;
+    // Prune on the current node's index (mirrors HasSinkRoute above).
+    if (cur->topo_index < target->topo_index) continue;
+    for (const auto& subgraph : cur->upstreams) {
+      if (visited.count(subgraph)) continue;
+      queue.push(subgraph);
+      visited.insert(subgraph);
+    }
+  }
+  return false;
+}
+
+bool HasRoute(const SubGraphPtr& up, const SubGraphPtr& down) {
+  return HasSinkRoute(up, down) || HasLiftRoute(down, up);
+}
+
+bool CanFuseUpstream2Downstream(const SubGraphPtr& upstream,
+                                const SubGraphPtr& downstream) {
+  PADDLE_ENFORCE(upstream->downstreams.count(downstream) &&
+                     downstream->upstreams.count(upstream),
+                 ::common::errors::InvalidArgument(
+                     "Subgraphs to be fused must have a direct relationship."));
+  auto up_downstreams = upstream->downstreams;
+  up_downstreams.erase(downstream);
+  auto down_upstreams = downstream->upstreams;
+  down_upstreams.erase(upstream);
+  if (up_downstreams.empty() || down_upstreams.empty()) return true;
+  for (const auto& subgraph : up_downstreams) {
+    if (HasSinkRoute(subgraph, downstream)) return false;
+  }
+  for (const auto& subgraph : down_upstreams) {
+    if (HasLiftRoute(subgraph, upstream)) return false;
+  }
+  return true;
+}
+
+std::optional<std::string> DetectCirclesInSubgraphs(
+    const std::vector<SubGraphPtr>& subgraph_list) {
+  std::set<SubGraphPtr, SubGraph::compare> subgraph_set(subgraph_list.begin(),
+                                                        subgraph_list.end());
+  std::unordered_map<SubGraphPtr, size_t> in_degree;
+  std::unordered_map<SubGraphPtr, size_t> out_degree;
+  for (const auto& subgraph : subgraph_set) {
+    in_degree[subgraph] = subgraph->upstreams.size();
+    out_degree[subgraph] = subgraph->downstreams.size();
+  }
+  // Recursively remove nodes with in_degree or out_degree = 0
+  bool erase_flag = true;
+  while (erase_flag) {
+    erase_flag = false;
+    for (const auto& subgraph : subgraph_list) {
+      if (subgraph_set.count(subgraph) == 0) continue;
+      if (in_degree[subgraph] == 0) {
+        for (const auto& downstream : subgraph->downstreams) {
+          in_degree[downstream]--;
+        }
+        subgraph_set.erase(subgraph);
+        erase_flag = true;
+        continue;
+      }
+      if (out_degree[subgraph] == 0) {
+        for (const auto& upstream : subgraph->upstreams) {
+          out_degree[upstream]--;
+        }
+        subgraph_set.erase(subgraph);
+        erase_flag = true;
+        continue;
+      }
+    }
+  }
+  if (subgraph_set.empty()) return std::nullopt;
+  // If subgraph_set is not empty, there are circles in the subgraphs.
+  auto circle_size = subgraph_set.size();
+  std::stringstream ss;
+  ss << "Circles detected in subgraphs (size=" << circle_size << "): \n";
+  for (const auto& subgraph : subgraph_set) {
+    ss << subgraph->DebugStr() << "\n";
+  }
+  return std::make_optional(ss.str());
+}
+
+class SubgraphDetector {
+ public:
+  SubgraphDetector(pir::Block* block, const OpClassifier& classifier);
+
+  void SubgraphFusion();
+
+  std::vector<GroupOpsVec> BuildGroups();
+
+ private:
+  void ReorderIndexOfSubgraphs();
+
+  void MergeSource2Target(const SubGraphPtr& source, const SubGraphPtr& target);
+
+  SubGraphPtr GetOpSubgraph(pir::Operation* op) {
+    PADDLE_ENFORCE(
+        op2subgraph_.count(op),
+        ::common::errors::InvalidArgument(
+            "Can not find op in op2subgraph_: \n%s", OpsDebugStr({op})));
+    return op2subgraph_.at(op);
+  }
+
+  std::unordered_map<pir::Operation*, size_t> op2index_;
+  std::vector<pir::Operation*> sort_ops_;
+  std::unordered_map<pir::Operation*, SubGraphPtr> op2subgraph_;
+  std::unordered_set<int> subgraph_index_set_;
+};
+
+void SubgraphDetector::ReorderIndexOfSubgraphs() {
+  // After merging subgraphs with a direct relation, brother subgraphs with an
+  // indirect relation may no longer be detected by index order, so the
+  // indices of all subgraphs need to be recomputed.
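+  // Illustration: if B (index 5) absorbs its direct producer A (index 3)
+  // while an unrelated subgraph C holds index 4, the index-based pruning in
+  // HasSinkRoute/HasLiftRoute can draw wrong conclusions about C relative to
+  // the merged subgraph; the BFS below assigns fresh, consistent indices.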
+  std::queue<SubGraphPtr> queue;
+  std::unordered_map<SubGraphPtr, size_t> in_degree;
+  for (auto it = sort_ops_.rbegin(); it != sort_ops_.rend(); ++it) {
+    auto subgraph = GetOpSubgraph(*it);
+    if (in_degree.count(subgraph)) continue;
+    in_degree[subgraph] = subgraph->upstreams.size();
+    if (in_degree[subgraph] == 0) queue.push(subgraph);
+  }
+  int index = 0;
+  while (!queue.empty()) {
+    auto subgraph = queue.front();
+    queue.pop();
+    subgraph->topo_index = index++;
+    for (const auto& downstream : subgraph->downstreams) {
+      in_degree[downstream]--;
+      if (in_degree[downstream] == 0) queue.push(downstream);
+    }
+  }
+}
+
+void SubgraphDetector::MergeSource2Target(const SubGraphPtr& source,
+                                          const SubGraphPtr& target) {
+  VLOG(6) << "Merge source: " << source->DebugStr();
+  VLOG(6) << "Merge target: " << target->DebugStr();
+  target->Merge(source);
+  int max_index = std::max(source->topo_index, target->topo_index);
+  int min_index = std::min(source->topo_index, target->topo_index);
+  auto merged = target;
+  // Check whether the merged subgraph and its related subgraphs
+  // still satisfy the topological order condition.
+  int upstream_max_index = -1, downstream_min_index = INT_MAX;
+  for (const auto& upstream : merged->upstreams) {
+    upstream_max_index = std::max(upstream->topo_index, upstream_max_index);
+  }
+  for (const auto& downstream : merged->downstreams) {
+    downstream_min_index =
+        std::min(downstream->topo_index, downstream_min_index);
+  }
+  // 1. If the topological order still holds after merging, just use max_index.
+  VLOG(6) << "Check if the topological order holds after merging";
+  if (min_index > upstream_max_index && max_index < downstream_min_index) {
+    merged->topo_index = max_index;
+    subgraph_index_set_.erase(min_index);
+    return;
+  }
+  // 2. Otherwise, look for an index between upstream_max_index and
+  //    downstream_min_index that is not yet in subgraph_index_set_.
+  VLOG(6) << "Try to find a valid index not in subgraph_index_set_";
+  for (int i = upstream_max_index + 1; i < downstream_min_index; ++i) {
+    if (!subgraph_index_set_.count(i)) {
+      merged->topo_index = i;
+      subgraph_index_set_.erase(min_index);
+      subgraph_index_set_.erase(max_index);
+      subgraph_index_set_.insert(i);
+      return;
+    }
+  }
+  // 3. If no valid index can be found, reorder the topo index of all
+  //    subgraphs.
+  VLOG(6) << "Reorder topo index of all subgraphs";
+  ReorderIndexOfSubgraphs();
+}
+
+SubgraphDetector::SubgraphDetector(pir::Block* block,
+                                   const OpClassifier& classifier) {
+  // init sort_ops_ in reverse topo order
+  sort_ops_ = InverselyTopologicalSort(block);
+  // init op2index_ in topo order
+  size_t index = 0;
+  for (auto& op : *block) {
+    op2index_[&op] = index++;
+  }
+  // construct subgraphs and upstream/downstream relations
+  std::vector<SubGraphPtr> subgraph_list;
+  for (const auto& op : sort_ops_) {
+    bool substitute = classifier(*op);
+    auto subgraph = std::make_shared<SubGraph>(op, op2index_[op], substitute);
+    op2subgraph_[op] = subgraph;
+    subgraph_index_set_.insert(op2index_[op]);
+    subgraph_list.push_back(subgraph);
+  }
+  for (const auto& op : sort_ops_) {
+    auto subgraph = op2subgraph_[op];
+    for (const auto& producer : GetProducerOps(op)) {
+      if (!op2subgraph_.count(producer)) continue;
+      subgraph->upstreams.insert(op2subgraph_[producer]);
+      op2subgraph_[producer]->downstreams.insert(subgraph);
+    }
+    for (const auto& consumer : GetConsumerOps(op, op2index_)) {
+      if (!op2subgraph_.count(consumer)) continue;
+      subgraph->downstreams.insert(op2subgraph_[consumer]);
+      op2subgraph_[consumer]->upstreams.insert(subgraph);
+    }
+  }
+  VLOG(6) << "Subgraphs before building groups: ";
+  for (const auto& subgraph : subgraph_list) {
+    VLOG(6) << subgraph->DebugStr(true);
+  }
+  auto circle_info = DetectCirclesInSubgraphs(subgraph_list);
+  if (circle_info) {
+    PADDLE_THROW(::common::errors::PreconditionNotMet(
+        "Before building groups: %s", circle_info.value()));
+  }
+}
+
+void SubgraphDetector::SubgraphFusion() {
+  // Two subgraphs can be merged only if there is no route between them other
+  // than their direct connection (brother subgraphs must have no route at
+  // all); otherwise a circle would be formed after merging them.
+  VLOG(4) << "Merge subgraphs with direct relation";
+  for (const auto& op : sort_ops_) {
+    auto downstream = GetOpSubgraph(op);
+    if (!downstream->substitute) continue;
+    for (const auto& producer : GetProducerOpsReverseSort(op, op2index_)) {
+      auto upstream = GetOpSubgraph(producer);
+      if (upstream == downstream || !upstream->substitute) continue;
+      if (CanFuseUpstream2Downstream(upstream, downstream)) {
+        MergeSource2Target(upstream, downstream);
+        for (auto upstream_op : upstream->ops) {
+          op2subgraph_[upstream_op] = downstream;
+        }
+        VLOG(6) << "Merged subgraph: " << downstream->DebugStr();
+      }
+    }
+  }
+
+  VLOG(4) << "Merge brother subgraphs with the same upstream";
+  for (const auto& op : sort_ops_) {
+    auto subgraph = GetOpSubgraph(op);
+    if (!subgraph->substitute) continue;
+    for (auto producer : GetProducerOpsReverseSort(op, op2index_)) {
+      if (GetOpSubgraph(producer) == subgraph) continue;
+      for (auto consumer : GetConsumerOps(producer, op2index_)) {
+        auto brother = GetOpSubgraph(consumer);
+        if (brother == subgraph || !brother->substitute) continue;
+        if (!HasRoute(subgraph, brother) && !HasRoute(brother, subgraph)) {
+          MergeSource2Target(brother, subgraph);
+          for (auto brother_op : brother->ops) {
+            op2subgraph_[brother_op] = subgraph;
+          }
+          VLOG(6) << "Merged subgraph: " << subgraph->DebugStr();
+        }
+      }
+    }
+  }
+}
+
+std::vector<GroupOpsVec> SubgraphDetector::BuildGroups() {
+  // 1. Get the subgraph list in topo order
+  std::unordered_set<SubGraphPtr> subgraph_set;
+  std::vector<SubGraphPtr> subgraph_list;
+  for (const auto& op : sort_ops_) {
+    SubGraphPtr subgraph = GetOpSubgraph(op);
+    if (subgraph_set.count(subgraph)) continue;
+    subgraph_set.insert(subgraph);
+    subgraph_list.push_back(subgraph);
+  }
+  std::reverse(subgraph_list.begin(), subgraph_list.end());
+  VLOG(6) << "Subgraphs after building groups: ";
+  for (const auto& subgraph : subgraph_list) {
+    VLOG(6) << subgraph->DebugStr(true);
+  }
+  auto circle_info = DetectCirclesInSubgraphs(subgraph_list);
+  if (circle_info) {
+    PADDLE_THROW(::common::errors::PreconditionNotMet(
+        "After building groups: %s", circle_info.value()));
+  }
+
+  // 2. Build group ops from the subgraphs that can be substituted
+  std::vector<GroupOpsVec> groups;
+  for (const auto& subgraph : subgraph_list) {
+    if (!subgraph->substitute) {
+      continue;
+    }
+    // sort group ops by natural increasing index.
+    std::vector<pir::Operation*> group_ops(subgraph->ops.begin(),
+                                           subgraph->ops.end());
+    std::sort(group_ops.begin(),
+              group_ops.end(),
+              [this](pir::Operation* a, pir::Operation* b) {
+                return this->op2index_.at(a) < this->op2index_.at(b);
+              });
+    groups.push_back(group_ops);
+  }
+  return groups;
+}
+
+std::vector<GroupOpsVec> DetectSubGraphs(pir::Block* block,
+                                         const OpClassifier& classifier) {
+  auto subgraph_detector = SubgraphDetector(block, classifier);
+  subgraph_detector.SubgraphFusion();
+  return subgraph_detector.BuildGroups();
+}
+
+std::vector<pir::Value> AnalysisOutputs(
+    const GroupOpsVec& group_ops) {  // NOLINT
+  // Get outputs via the use-def chain
+  std::unordered_set<pir::Operation*> op_set(group_ops.begin(),
+                                             group_ops.end());
+
+  std::vector<pir::Value> outputs;
+  for (auto* op : group_ops) {
+    for (size_t i = 0; i < op->num_results(); ++i) {
+      auto result = op->result(i);
+
+      for (auto use_iter = result.use_begin(); use_iter != result.use_end();
+           ++use_iter) {
+        if (!op_set.count(use_iter->owner())) {
+          outputs.push_back(result);
+          break;
+        }
+      }
+    }
+  }
+
+  // NOTE: If no value is used outside the group, we mark the last op's
+  // results as outputs. But keep in mind that this is risky.
+  if (outputs.size() == 0) {
+    for (size_t i = 0; i < group_ops.back()->num_results(); ++i) {
+      outputs.push_back(group_ops.back()->result(i));
+    }
+  }
+
+  return outputs;
+}
+
+std::vector<pir::Value> AnalysisInputs(
+    const GroupOpsVec& group_ops) {  // NOLINT
+  std::unordered_set<pir::Value> visited_values;
+  std::vector<pir::Value> group_inputs;
+  std::unordered_set<pir::Operation*> ops_set(group_ops.begin(),
+                                              group_ops.end());
+
+  // count all op's input Value
+  for (auto* op : group_ops) {
+    for (auto& value : op->operands_source()) {
+      if (!value || !value.type() || ops_set.count(value.defining_op()))
+        continue;
+      if (visited_values.count(value)) continue;
+      // if the input value's owner op is not in the op set, it is one of the
+      // group's inputs
+      visited_values.insert(value);
+      group_inputs.push_back(value);
+    }
+  }
+  return group_inputs;
+}
+
+namespace {
+
+struct IncrementalOrder {
+  bool operator()(const pir::Operation* lhs, const pir::Operation* rhs) const {
+    PADDLE_ENFORCE_EQ(lhs->GetParent() == rhs->GetParent(),
+                      true,
+                      common::errors::PreconditionNotMet(
+                          "lhs and rhs should have the same parent block."));
+    auto lhs_iter = lhs->operator pir::Block::ConstIterator();
+    auto rhs_iter = rhs->operator pir::Block::ConstIterator();
+    auto end_iter = lhs->GetParent()->end();
+    while (lhs_iter != end_iter) {
+      lhs_iter++;
+      if (lhs_iter == rhs_iter) return true;
+      if (lhs_iter == end_iter) return false;
+    }
+    PADDLE_ENFORCE_EQ(
+        false,
+        true,
+        common::errors::InvalidArgument("rhs is not reachable from lhs."));
+    return false;
+  }
+};
+
+std::unordered_set<pir::Operation*> GetUpstreamOpsAfterPosition(
+    const pir::Operation* position_op,
+    const pir::Block* block,
+    pir::Operation* op,
+    std::unordered_set<pir::Operation*>* visited_ops) {
+  std::unordered_set<pir::Operation*> ops;
+  const auto& IsInBlock = [](const pir::Operation* src_op,
+                             const pir::Block* block) {
+    for (auto& item : *block) {
+      if (src_op->id() == item.id()) return true;
+    }
+    return false;
+  };
+  std::vector<pir::Value> op_inputs = GetUsedExternalValue(*op);
+  for (auto value : op_inputs) {
+    if (!value || !value.defining_op()) continue;
+    pir::Operation* defining_op = value.defining_op();
+    if (visited_ops->count(defining_op)) continue;
+    visited_ops->insert(defining_op);
+    if (!IsInBlock(defining_op, block)) continue;
+    if (IncrementalOrder()(defining_op, position_op)) continue;
+
+    ops.insert(defining_op);
+    auto recursive_ops = GetUpstreamOpsAfterPosition(
+        position_op, block, defining_op, visited_ops);
+    ops.insert(recursive_ops.begin(), recursive_ops.end());
+  }
+  return ops;
+}
+}  // namespace
+
+void MoveUpstreamOpBeforeGroup(const GroupOpsVec& group_ops,
+                               pir::Block* block,
+                               pir::Operation* insert_point_op) {
+  const auto moved_ops = [&]() {
+    std::set<pir::Operation*, IncrementalOrder> ops_set;
+    std::unordered_set<pir::Operation*> visited_ops;
+    for (auto& op : group_ops) {
+      auto upstream_ops =
+          GetUpstreamOpsAfterPosition(insert_point_op, block, op, &visited_ops);
+      ops_set.insert(upstream_ops.begin(), upstream_ops.end());
+    }
+    return ops_set;
+  }();
+
+  for (auto& op : moved_ops) {
+    if (op == insert_point_op) continue;
+    VLOG(4) << "Move " << op->id() << " " << op->name() << " before "
+            << insert_point_op->id() << " " << insert_point_op->name();
+    op->MoveTo(block, insert_point_op->operator pir::Block::Iterator());
+  }
+}
+
+pir::Operation* FindInsertPoint(const GroupOpsVec& group_ops,
+                                const std::vector<pir::Value>& outputs) {
+  // Regard the last op as the insert position when no downstream op sits
+  // between the group_ops.
+  pir::Operation* first_op = group_ops.front();
+  pir::Operation* insert_point_op = group_ops.back();
+  auto order_info =
+      [&]() -> std::unordered_map<pir::Operation*, int64_t> {
+    std::unordered_map<pir::Operation*, int64_t> map;
+    // assign each op in the block an increasing position index.
+    auto block = insert_point_op->GetParent();
+    int64_t order = 0;
+    for (auto& op : *block) {
+      map[&op] = order++;
+    }
+    return map;
+  }();
+
+  for (auto* op : group_ops) {
+    if (order_info.at(op) > order_info.at(insert_point_op)) {
+      insert_point_op = op;
+    }
+    if (order_info.at(op) < order_info.at(first_op)) {
+      first_op = op;
+    }
+  }
+
+  auto begin = first_op->operator pir::Block::ConstIterator();
+  auto end = ++(insert_point_op->operator pir::Block::ConstIterator());
+  const std::unordered_set<pir::Value> outputs_set(outputs.begin(),
+                                                   outputs.end());
+  const std::unordered_set<const pir::Operation*> group_ops_set(
+      group_ops.begin(), group_ops.end());
+
+  const auto& IsDownstreamOp = [&](const pir::Operation* op) -> bool {
+    if (group_ops_set.find(op) != group_ops_set.end()) return false;
+    for (auto& value : GetUsedExternalValue(*op)) {
+      if (outputs_set.find(value) != outputs_set.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+  // Find the first downstream op as the final insert position.
+  for (; begin != end; ++begin) {
+    if (IsDownstreamOp(begin)) {
+      insert_point_op = begin;
+      break;
+    }
+  }
+  return insert_point_op;
+}
+
+void ReplaceWithGroupOp(pir::Block* block,
+                        const GroupOpsVec& group_ops) {  // NOLINT
+  ::pir::IrContext* ctx = ::pir::IrContext::Instance();
+  // #ifdef PADDLE_WITH_CINN
+  //   ctx->GetOrRegisterDialect<cinn::dialect::OperatorDialect>();
+  // #endif
+  // #ifdef PADDLE_WITH_DNNL
+  //   ctx->GetOrRegisterDialect<paddle::dialect::OneDNNOperatorDialect>();
+  // #endif
+  ::pir::Builder builder = ::pir::Builder(ctx, block);
+  const std::vector<pir::Value> outputs =
+      custom_pass::AnalysisOutputs(group_ops);
+
+  // step 1: Analyze the outputs, move upstream ops if needed, and insert the
+  // group op before insert_point.
+  auto* insert_point = custom_pass::FindInsertPoint(group_ops, outputs);
+  custom_pass::MoveUpstreamOpBeforeGroup(group_ops, block, insert_point);
+  builder.set_insertion_point(insert_point);
+  VLOG(6) << "Insert GroupOp after " << insert_point->name();
+
+// step 2: Replace the old op with GroupOp.
+#ifdef PADDLE_WITH_CINN
+
+  auto new_group_op = [&]() -> cinn::dialect::GroupOp {
+    std::vector<pir::Type> output_types;
+    for (auto& value : outputs) output_types.emplace_back(value.type());
+
+    auto group_op = builder.Build<cinn::dialect::GroupOp>(output_types);
+    for (auto op : group_ops) {
+      op->MoveTo(group_op.block(), group_op.block()->end());
+    }
+    return group_op;
+  }();
+#else
+  auto new_group_op = [&]() -> pir::GroupOp {
+    std::vector<pir::Type> output_types;
+    for (auto& value : outputs) output_types.emplace_back(value.type());
+
+    auto group_op = builder.Build<pir::GroupOp>(output_types);
+    for (auto op : group_ops) {
+      op->MoveTo(group_op.block(), group_op.block()->end());
+    }
+    return group_op;
+  }();
+#endif
+
+  // step 3: Replace outputs of inner ops
+  const std::vector<pir::Value> group_outs = new_group_op->results();
+  std::unordered_set<pir::Operation*> inner_ops(group_ops.begin(),
+                                                group_ops.end());
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    outputs[i].ReplaceUsesWithIf(group_outs[i],
+                                 [&inner_ops](pir::OpOperand op) {
+                                   return !inner_ops.count(op.owner());
+                                 });
+  }
+
+  // step 4: Insert YieldOp for outputs
+  builder.SetInsertionPointToBlockEnd(new_group_op.block());
+  builder.Build<::pir::YieldOp>(outputs);
+}
+
+// void ReplaceWithCustomEngineOp(pir::Block* block,
+//                                const GroupOpsVec& group_ops) {
+//   ::pir::IrContext* ctx = ::pir::IrContext::Instance();
+//   ::pir::Builder builder = ::pir::Builder(ctx, block);
+//   const std::vector<pir::Value> outputs =
+//       custom_pass::AnalysisOutputs(group_ops);
+//   const std::vector<pir::Value> inputs =
+//       custom_pass::AnalysisInputs(group_ops);
+
+//   // step 1: Analysis and insert group op before insert_point.
+//   auto* insert_point = custom_pass::FindInsertPoint(group_ops, outputs);
+//   custom_pass::MoveUpstreamOpBeforeGroup(group_ops, block, insert_point);
+//   builder.set_insertion_point(insert_point);
+//   VLOG(6) << "Insert GroupOp after " << insert_point->name();
+
+//   // attrs
+//   std::vector<pir::Attribute> input_names;
+//   std::vector<pir::Attribute> output_names;
+//   for (size_t i = 0; i < inputs.size(); ++i) {
+//     std::string input_name = "graph_input_" + std::to_string(i) + "_op_" +
+//                              std::to_string(inputs[i].defining_op()->id());
+//     input_names.emplace_back(pir::StrAttribute::get(ctx, input_name));
+//   }
+//   for (size_t i = 0; i < outputs.size(); ++i) {
+//     std::string output_name = "graph_output_" + std::to_string(i) + "_op_" +
+//                               std::to_string(outputs[i].defining_op()->id());
+//     output_names.emplace_back(pir::StrAttribute::get(ctx, output_name));
+//   }
+//   pir::AttributeMap attribute_map;
+//   attribute_map.insert(
+//       {"input_names", pir::ArrayAttribute::get(ctx, input_names)});
+//   attribute_map.insert(
+//       {"output_names", pir::ArrayAttribute::get(ctx, output_names)});
+
+//   // step 2: Replace the old op with CustomEngineOp.
+//   auto custom_engine_op = [&]() -> custom_engine::CustomEngineOp* {
+//     std::vector<pir::Type> output_types;
+//     for (auto& value : outputs) output_types.emplace_back(value.type());
+//     pir::OpInfo custom_engine_op_info =
+//         ctx->GetRegisteredOpInfo(custom_engine::CustomEngineOp::name());
+
+//     pir::Operation* engine_op = pir::Operation::Create(
+//         inputs, attribute_map, output_types, custom_engine_op_info);
+
+//     builder.Insert(engine_op);
+
+//     // auto engine_op = builder.Build<custom_engine::CustomEngineOp>(
+//     //     inputs, attribute_map, output_types, custom_engine_op_info);
+//     for (auto op : group_ops) {
+//       op->MoveTo(engine_op->block(), engine_op->block()->end());
+//     }
+//     return engine_op;
+//   }();
+
+//   // step 3: Replace outputs of inner ops
+//   const std::vector<pir::Value> group_outs = custom_engine_op->results();
+//   std::unordered_set<pir::Operation*> inner_ops(group_ops.begin(),
+//                                                 group_ops.end());
+//   for (size_t i = 0; i < outputs.size(); ++i) {
+//     outputs[i].ReplaceUsesWithIf(group_outs[i],
+//                                  [&inner_ops](pir::OpOperand op) {
+//                                    return !inner_ops.count(op.owner());
+//                                  });
+//   }
+
+//   // step 4: Insert YieldOp for outputs
+//   builder.SetInsertionPointToBlockEnd(custom_engine_op.block());
+//   builder.Build<::pir::YieldOp>(outputs);
+// }
+
+}  // namespace custom_pass
diff --git a/backends/gcu/passes/sub_graph_detector.h b/backends/gcu/passes/sub_graph_detector.h
new file mode 100644
index 000000000..98040583e
--- /dev/null
+++ b/backends/gcu/passes/sub_graph_detector.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <list>
+#include <memory>
+#include <queue>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/pir/include/core/builder.h"
+
+namespace custom_pass {
+using OpClassifier = std::function<bool(const pir::Operation&)>;
+using GroupOpsVec = std::vector<pir::Operation*>;
+
+std::vector<GroupOpsVec> DetectSubGraphs(pir::Block* block,
+                                         const OpClassifier& classifier);
+
+std::vector<pir::Value> AnalysisOutputs(const GroupOpsVec& group_ops);
+std::vector<pir::Value> AnalysisInputs(const GroupOpsVec& group_ops);
+void ReplaceWithGroupOp(pir::Block* block, const GroupOpsVec& group_ops);
+void ReplaceWithCustomEngineOp(pir::Block* block,
+                               const GroupOpsVec& group_ops);
+
+pir::Operation* FindInsertPoint(const GroupOpsVec& group_ops,
+                                const std::vector<pir::Value>& outputs);
+void MoveUpstreamOpBeforeGroup(const GroupOpsVec& group_ops,
+                               pir::Block* block,
+                               pir::Operation* insert_point_op);
+
+}  // namespace custom_pass
diff --git a/backends/gcu/runtime/runtime.h b/backends/gcu/runtime/runtime.h
index a31b5e8d8..145cc8442 100644
--- a/backends/gcu/runtime/runtime.h
+++ b/backends/gcu/runtime/runtime.h
@@ -40,6 +40,7 @@
 #define ECCL_CHECK(func) CHECK_COMMON(func, ecclSuccess)
 #define TOPSATEN_CHECK(func) CHECK_COMMON(func, TOPSATEN_STATUS_SUCCESS)
 #define TOPSOP_CHECK(func) CHECK_COMMON(func, TOPSOP_STATUS_SUCCESS)
+#define TOPSGRAPH_CHECK(func) CHECK_COMMON(func, TOPS_GRAPH_SUCCESS)
 
 #ifdef __cplusplus
 extern "C" {
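To close the loop on how the pieces compose at runtime, here is a hedged sketch (not code from this patch): a GCU graph built with the hlir builder is compiled into a `topsExecutable_t` through `CompileTopsExecutable`, with every tops_graph_compiler call guarded by the new `TOPSGRAPH_CHECK` macro. The builder calls `CreateInput`, `SetOutput`, and `GetModule` are assumptions about the `hlir_builder` API; only `GcuBuilder`, `GcuOp`, and `CompileTopsExecutable` come from this diff.

```cpp
// Hedged sketch: build a trivial identity graph and compile it.
#include "custom_engine/ir_translator/utils/utils.h"

topsExecutable_t CompileIdentityGraph() {
  GcuBuilderPtr builder = std::make_shared<GcuBuilder>();
  // Declare one F32 input of shape [2, 3] and return it unchanged.
  // builder::Type / CreateInput / SetOutput are assumed hlir_builder APIs.
  builder::Type input_type({2, 3}, builder::PrimitiveType::F32());
  GcuOp input = builder->CreateInput(input_type);
  builder->SetOutput({input});
  // CompileTopsExecutable wraps topsgraphCreateProgramFromModule,
  // topsgraphCompileProgram and topsgraphGetBin, each verified by
  // TOPSGRAPH_CHECK against TOPS_GRAPH_SUCCESS.
  return custom_engine::CompileTopsExecutable(builder->GetModule());
}
```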