From 1443a1f5021d534da962fd7894d3823dfa044142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=91=E5=90=AF=E8=88=AA?= <597323109@qq.com> Date: Tue, 14 Jan 2025 09:38:34 +0800 Subject: [PATCH] Feature/opt boxing (#1282) * add opt boxing * add filter * pass ut * don't allow gather reduce with reshape * fix reshape typeinfer bug * add full search space * fix compile bug * remove fold boxing * add latency in cpu options * fix bug * fix build * Apply code-format changes * fix build * fix build * fix ci * fix lstm * fix cycle * fix ci * fix boxing type infer * fix dotnet test --------- Co-authored-by: zhen8838 --- .../CodeGen/CPU/CSourceBuiltn.cs | 9 +- .../CPU/KernelCSourceConvertVisitor.cs | 9 +- .../CodeGen/CPU/KernelUtility.cs | 2 +- .../CPU/Templates/topo_aware_runtime.cshtml | 66 +- .../Evaluator/CPU/Boxing.cs | 141 ++- .../Evaluator/CPU/CPUModule.cs | 1 + .../Evaluator/CPU/ForceBoxing.cs | 98 ++ .../Evaluator/CPU/Im2col.cs | 2 +- .../Evaluator/CPU/PackedReduce.cs | 5 +- modules/Nncase.Modules.CPU/IR/CPU/Boxing.cs | 4 +- .../Nncase.Modules.CPU/IR/CPU/ForceBoxing.cs | 29 + .../Nncase.Modules.CPU/IR/CPU/Functional.cs | 9 +- .../Passes/Distributed/AutoDistributed.cs | 1117 +++++++++++------ ...tributedScheme.cs => DistributedSchema.cs} | 2 +- .../Passes/Rules/CPU/FoldMatmulReduce.cs | 12 +- .../Passes/Rules/CPU/PackRule.cs | 4 +- .../Passes/Tile/KernelToTIRVisitor.cs | 3 + .../Nncase.Modules.CPU/Targets/CPUTarget.cs | 7 +- .../Targets/CPUTargetOptions.cs | 16 +- .../Targets/CPUTargetOptionsCommand.cs | 24 +- .../Utilities/PackUtility.cs | 124 -- modules/Nncase.Modules.CPU/packages.lock.json | 3 +- .../Nncase.Modules.StackVM/packages.lock.json | 3 +- python/_nncase.pyi | 2 + python/nncase/native/ffi.cpp | 8 + src/Native/include/nncase/compiler.h | 24 + src/Nncase.Cli/packages.lock.json | 3 +- src/Nncase.Compiler/Compiler.cs | 2 +- src/Nncase.Compiler/Interop/CApi.cs | 16 + src/Nncase.Compiler/packages.lock.json | 3 +- src/Nncase.Core/CostModel/Cost.cs | 2 + src/Nncase.Core/DistributedType.cs | 14 +- src/Nncase.Core/Enum/ReduceOp.cs | 3 + src/Nncase.Core/IR/Math/Reduce.cs | 4 +- src/Nncase.Core/IR/Tensors/Functional.cs | 4 +- src/Nncase.Core/IR/Tensors/GetItem.cs | 2 +- src/Nncase.Core/ITarget.cs | 4 + .../Utilities/DistributedUtility.cs | 37 - src/Nncase.Core/Utilities/IRUtility.cs | 131 ++ src/Nncase.EGraph/Nncase.EGraph.csproj | 1 + src/Nncase.EGraph/Passes/EGraphExtractor.cs | 521 +------- src/Nncase.EGraph/packages.lock.json | 14 + src/Nncase.Evaluator/Math/Binary.cs | 8 +- src/Nncase.Evaluator/Math/Compare.cs | 6 +- src/Nncase.Evaluator/Math/MatMul.cs | 4 +- src/Nncase.Evaluator/Math/Reduce.cs | 18 +- src/Nncase.Evaluator/Math/ReduceArg.cs | 2 +- src/Nncase.Evaluator/Math/Unary.cs | 2 +- src/Nncase.Evaluator/NN/Activations.cs | 6 +- src/Nncase.Evaluator/NN/Conv2D.cs | 2 +- src/Nncase.Evaluator/Tensors/Cast.cs | 2 +- src/Nncase.Evaluator/Tensors/Concat.cs | 2 +- src/Nncase.Evaluator/Tensors/Reshape.cs | 134 +- src/Nncase.Graph/Graphs/HyperGraph.cs | 526 ++++++++ .../Rules/Neutral/CombineTranspose.cs | 38 +- .../Rules/Neutral/CombineUnary.cs | 2 +- src/Nncase.Passes/packages.lock.json | 3 +- src/Nncase.Quantization/packages.lock.json | 3 +- .../Schedule/TilingUtilities.cs | 2 +- src/Nncase.Schedule/packages.lock.json | 3 +- src/Nncase.Studio/packages.lock.json | 3 +- src/Nncase.Targets/packages.lock.json | 3 +- .../packages.lock.json | 3 +- .../UnitTestIRUtilities.cs} | 6 +- ...eScheme.cs => UnitTestDistributeSchema.cs} | 9 +- .../UnitTestDistributedTypeInfer.cs | 116 ++ .../UnitTestDistributedUtilities.cs | 24 - .../Rewrite/Fusion/UnitTestGraphPartition.cs | 10 +- src/Nncase.Tests/Rewrite/RewriteBase.cs | 40 +- .../Rewrite/UnitTestDataFlowRewrite.cs | 66 + .../Rewrite/UnitTestEGraphRewriteFactory.cs | 3 +- .../Rules/Neutral/UnitTestCombineTranspose.cs | 2 +- .../Rules/Neutral/UnitTestCombineUnary.cs | 2 +- .../Targets/UnitTestCPUKernels.cs | 271 ++-- src/Nncase.Tests/packages.lock.json | 3 +- 75 files changed, 2368 insertions(+), 1441 deletions(-) create mode 100644 modules/Nncase.Modules.CPU/Evaluator/CPU/ForceBoxing.cs create mode 100644 modules/Nncase.Modules.CPU/IR/CPU/ForceBoxing.cs rename modules/Nncase.Modules.CPU/Passes/Distributed/{DistributedScheme.cs => DistributedSchema.cs} (75%) create mode 100644 src/Nncase.Core/Utilities/IRUtility.cs create mode 100644 src/Nncase.Graph/Graphs/HyperGraph.cs rename src/Nncase.Tests/{Rules/Packing/PackUtilityTest.cs => Core/UnitTestIRUtilities.cs} (93%) rename src/Nncase.Tests/Distributed/{UnitTestDistributeScheme.cs => UnitTestDistributeSchema.cs} (87%) create mode 100644 src/Nncase.Tests/Distributed/UnitTestDistributedTypeInfer.cs delete mode 100644 src/Nncase.Tests/Distributed/UnitTestDistributedUtilities.cs diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs index 5037fc6e3..260c963d5 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs @@ -47,13 +47,8 @@ public static class CSourceBuiltn public static string TopoAwareRuntimeDef(CpuTargetOptions options, ulong dataAlign, ulong collective_pool_size) { - if (options.Hierarchies[0].Any(i => i > 1)) - { - var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/topo_aware_runtime.cshtml", new CpuTargetOptionsModel(options, dataAlign, collective_pool_size)).Result; - return content; - } - - return string.Empty; + var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/topo_aware_runtime.cshtml", new CpuTargetOptionsModel(options, dataAlign, collective_pool_size)).Result; + return content; } public static string TopologyDef(CpuTargetOptions options) diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs index 7b69500ca..4213a0112 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs @@ -437,7 +437,7 @@ protected override CSymbol VisitCall(Call expr) break; case TIR.Memcopy copy: - IndentScope.Writer.Write($"tensor_copy({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name});\n"); + IndentScope.Writer.Write($"tensor_copy({VisitBuffer(args[1], local: true).Name}, {VisitBuffer(args[0], local: true).Name});\n"); break; case TIR.CPU.Gather gather: IndentScope.Writer.Write($"gather<{gather.Axis}>({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name}, {VisitBuffer(args[2], local: true).Name});\n"); @@ -505,10 +505,11 @@ protected override CSymbol VisitCall(Call expr) break; case TIR.CPU.GatherReduceScatter grs: { - if (grs.InType.NdSBP.Any(s => s is SBPPartialSum)) + if (grs.InType.NdSBP.Any(s => s is SBPPartial)) { - var reduceKind = "tar::reduce_kind::" + string.Join("_", grs.InType.NdSBP.Select((s, i) => (s is SBPPartialSum ? "r" : string.Empty) + TargetOptions.HierarchyNames[i])); - IndentScope.Writer.IndWrite($"tac::tensor_reduce_sync({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name});\n"); + var sbpPartial = (SBPPartial)grs.InType.NdSBP.Where(s => s is SBPPartial).Distinct().First(); + var reduceKind = "tar::reduce_kind::" + string.Join("_", grs.InType.NdSBP.Select((s, i) => (s is SBPPartial ? "r" : string.Empty) + TargetOptions.HierarchyNames[i])); + IndentScope.Writer.IndWrite($"tac::tensor_reduce_sync({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name});\n"); } else { diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelUtility.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelUtility.cs index 84c249313..080be9d5c 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelUtility.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelUtility.cs @@ -80,7 +80,7 @@ public static string DistributedToC(DistributedType distributedType) } } - var implicitPolicy = ndSBP.Any(x => x is SBPPartialSum) ? "P" : "B"; + var implicitPolicy = ndSBP.Any(x => x is SBPPartial) ? "P" : "B"; sb.Append($">, {implicitPolicy}"); for (int axis = 0; axis < distributedType.TensorType.Shape.Rank; axis++) diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml index edfe0e290..ad6fb71d9 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/topo_aware_runtime.cshtml @@ -102,7 +102,7 @@ template class group_heirarchy_getter; @:}; } -template