This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Update release v03 (#1528)
* op unittest for cbrt/ceil/cholesky/concat/constant/fill_constant (#1495)

* op unittest for cbrt

* op unittest for ceil

* op unittest for cholesky

* op unittest for concat

* op unittest for constant

* add 4d test for constant op

* fix ci

* op unittest for fill_constant

* op unittest for fill_constant

* refine

* fix(schedule): fix SimpleComputeAt primitive (#1504)

* Fix reduce cast schedule bug (#1512)

* fix(fuse): fix reduce cast schedule bug

* test(fuse): add unittest for reduce_cast subgroup

* Refactor some op tests and fix bugs (#1515)

* Add depthwise_conv2d op test

* Refactor log op test

* Refactor round op test and fix bugs

* Only test depthwise_conv2d in cuda_cudnn

* op unittest for repeat/arange/reverse/elementwise_add_grad/flip (#1514)

* op unittest for repeat op

* add repeat frontend

* op unittest for repeat

* op unittest for arange

* op unittest for reverse

* format & remove old add op test

* op unittest for flip && remove redundant flip implementation

* remove test_add_op_new.py

* update reverse

* Refactor some op tests and fix bugs (#1513)

* Refactor op isclose test

* Refactor op logical_right_shift and add more dtypes support

* Refactor pow op test and fix bugs

* Refactor lookup_table op test

* Add logical_right_shift host function proto

* Improve isclose test case

* Fixed jitify commit to prevent header file conflicts (#1522)

* Fixed jitify commit to prevent header file conflicts

* Set random seed for debug floor_divide

* Avoid oom error

* Just for debug ci

* Fix floor_divide error when input dtype is int

* Fix bugs and add more tests for floor_divide

* Experimental PR for the first OP to clean old schedule (#1524)

---------

Co-authored-by: zzk0 <[email protected]>
Co-authored-by: Fisher <[email protected]>
Co-authored-by: Huihuang Zheng <[email protected]>
4 people authored Jun 19, 2023
1 parent df023dd commit 416712a
Showing 43 changed files with 3,150 additions and 1,408 deletions.
17 changes: 1 addition & 16 deletions cinn/frontend/net_builder.cc
@@ -246,17 +246,6 @@ Placeholder NetBuilder::CreateInput(const Variable& var) {
return Placeholder(var);
}

Variable NetBuilder::FillConstant(
const std::vector<int>& shape, float value, const std::string& name, const std::string& dtype, bool force_cpu) {
auto out =
CustomInstr("fill_constant", {}, {{"shape", shape}, {"value", value}, {"dtype", dtype}, {"force_cpu", force_cpu}})
.front();
if (!name.empty()) {
out.set_id(cinn::utils::TransValidVarName(name));
}
return out;
}

Variable NetBuilder::FillConstant(const std::vector<int>& shape,
const std::string& str_value,
const std::string& name,
@@ -827,11 +816,7 @@ Variable NetBuilder::Arange(const float start, const float stop, const float ste
}

Variable NetBuilder::Flip(const Variable& operand, const std::vector<int>& axes) {
Instruction instr("flip", {operand});
instr.SetAttr("axes", axes);
InferShape(instr);
AppendInstruction(instr);
return instr.GetOutput(0);
return CustomInstr("reverse", {operand}, {{"axis", utils::GetPositiveAxes(axes, operand->shape.size())}}).front();
}

Variable NetBuilder::Matmul(const Variable& x, const Variable& y, bool trans_x, bool trans_y, float alpha) {
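The net_builder.cc change above reimplements `Flip` as a thin wrapper over the existing `reverse` instruction, after normalizing the axes to non-negative values. A minimal standalone sketch of that behaviour (plain C++; `GetPositiveAxes` and the flip loop below are illustrative stand-ins, not the CINN implementation):

```cpp
// flip_as_reverse.cc -- standalone illustration, not CINN code.
// Flipping along a set of axes is the same as reversing the index on each
// listed axis, once negative axes are mapped into [0, rank).
#include <algorithm>
#include <cassert>
#include <vector>

// Analogue of utils::GetPositiveAxes: map negative axes into [0, rank).
std::vector<int> GetPositiveAxes(const std::vector<int>& axes, int rank) {
  std::vector<int> out;
  for (int a : axes) out.push_back(a < 0 ? a + rank : a);
  return out;
}

// Reverse (flip) a row-major [C, H, W] buffer along the listed axes.
std::vector<float> Flip3D(const std::vector<float>& in, int C, int H, int W,
                          std::vector<int> axes) {
  axes = GetPositiveAxes(axes, /*rank=*/3);
  auto on = [&](int axis) {
    return std::find(axes.begin(), axes.end(), axis) != axes.end();
  };
  std::vector<float> out(in.size());
  for (int c = 0; c < C; ++c)
    for (int h = 0; h < H; ++h)
      for (int w = 0; w < W; ++w) {
        int sc = on(0) ? C - 1 - c : c;
        int sh = on(1) ? H - 1 - h : h;
        int sw = on(2) ? W - 1 - w : w;
        out[(c * H + h) * W + w] = in[(sc * H + sh) * W + sw];
      }
  return out;
}

int main() {
  std::vector<float> x = {0, 1, 2, 3, 4, 5, 6, 7};  // shape [2, 2, 2]
  auto y = Flip3D(x, 2, 2, 2, {-3});                // -3 normalizes to axis 0
  assert(y[0] == 4 && y[7] == 3);                   // planes along axis 0 swapped
  return 0;
}
```

The removed `program_execute_flip` test in net_builder_test.cc below checked the same input/output relation.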
23 changes: 18 additions & 5 deletions cinn/frontend/net_builder.h
@@ -350,7 +350,7 @@ class NetBuilder {
const std::string& id_hint = "");

/**
* @brief Create constant tensor with the specific value/vector and type, the type is infered from value.
* @brief Create constant tensor with the specific value/vector and type
* @param value The constant value to be set.
* @param name The name of output variable.
* @return The result variable.
@@ -408,11 +408,21 @@
* @param force_cpu Whether the variable should force placed in cpu, default in device memory. Default is false.
* @return The result variable.
*/
template <typename T = float>
Variable FillConstant(const cinn::utils::ShapeType& shape,
float value,
T value,
const std::string& name,
const std::string& dtype,
bool force_cpu = false);
bool force_cpu = false) {
auto out =
CustomInstr(
"fill_constant", {}, {{"shape", shape}, {"value", value}, {"dtype", dtype}, {"force_cpu", force_cpu}})
.front();
if (!name.empty()) {
out.set_id(cinn::utils::TransValidVarName(name));
}
return out;
}

/**
* @brief The op return a variable with the specific string value, shape and type.
@@ -442,7 +452,7 @@
T value,
const std::string& name = "",
bool force_cpu = false) {
return FillConstant(shape, static_cast<float>(value), name, common::Type2Str(common::type_of<T>()), force_cpu);
return FillConstant<T>(shape, value, name, common::Type2Str(common::type_of<T>()), force_cpu);
}

/**
@@ -891,7 +901,10 @@
const std::string& padding_algorithm = "EXPLICIT");

/**
* This API flipes the Variable x along the given axis.
* @brief This API reverse the Variable x along the given axis.
* @param x An N-D variable.
* @param axis Specify the axis to operate on the input reverse.
* @return A reversed variable with the same data type as x.
*/
Variable Flip(const Variable& operand, const std::vector<int>& axes);

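The net_builder.h diff above turns `FillConstant` into a template on the value type, so the convenience overload now forwards `value` as `T` instead of `static_cast<float>(value)`. One effect worth noting (my inference, not stated in the commit): a float detour silently corrupts integer constants above 2^24. A standalone check of that failure mode, using hypothetical names unrelated to CINN:

```cpp
// fill_constant_precision.cc -- standalone illustration of why forwarding the
// constant as T can matter, compared with routing it through float first.
#include <cstdint>
#include <iostream>

int main() {
  int64_t value = 16777217;  // 2^24 + 1, not exactly representable as float
  float as_float = static_cast<float>(value);
  int64_t round_tripped = static_cast<int64_t>(as_float);

  std::cout << "original:      " << value << "\n";          // 16777217
  std::cout << "via float:     " << round_tripped << "\n";  // 16777216, value lost
  std::cout << "kept as int64: " << value << "\n";          // exact when kept as T
  return 0;
}
```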
70 changes: 0 additions & 70 deletions cinn/frontend/net_builder_test.cc
@@ -984,76 +984,6 @@ TEST(net_build, program_execute_arange_int) {
}
}

TEST(net_build, program_execute_flip) {
const int C = 2;
const int H = 2;
const int W = 2;
const std::vector<int> axes{0};

NetBuilder builder("net_builder");
Placeholder input = builder.CreateInput(Float(32), {C, H, W}, "Img");
Variable output = builder.Flip(input, axes);
auto program = builder.Build();

#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
std::unordered_set<std::string> fetch_ids;
auto graph = Optimize(&program, fetch_ids, target);

auto scope = BuildScope(target, graph);
hlir::framework::GraphCompiler gc(target, scope, graph);
auto runtime_program = gc.Build();

scope->Var<hlir::framework::Tensor>(std::string(input.id()));
scope->Var<hlir::framework::Tensor>(std::string(output->id));

auto input_tensor = scope->GetTensor(std::string(input.id()));
SetRandData<float>(input_tensor, target);
std::vector<float> input_data = GetTensorData<float>(input_tensor, target);

runtime_program->Execute();
auto output_tensor = scope->GetTensor(std::string(output->id));
const std::vector<int>& output_shape = output_tensor->shape().data();
EXPECT_EQ(output_tensor->type(), Float(32));
EXPECT_EQ(output_shape.size(), 3UL);
EXPECT_EQ(output_shape[0], C);
EXPECT_EQ(output_shape[1], H);
EXPECT_EQ(output_shape[2], W);

std::vector<float> output_data = GetTensorData<float>(output_tensor, target);
VLOG(6) << "Visualize flip input_data";
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; h++) {
std::string line;
for (int w = 0; w < W; w++) {
int index = c * (H * W) + h * W + w;
line += (std::to_string(index) + ": " + std::to_string(input_data[index]) + ", ");
}
VLOG(6) << line;
}
}

VLOG(6) << "Visualize flip output_data";
for (int c = 0; c < C; c++) {
int flip_c = std::find(axes.begin(), axes.end(), 0) == axes.end() ? c : C - c - 1;
for (int h = 0; h < H; h++) {
std::string line;
int flip_h = std::find(axes.begin(), axes.end(), 1) == axes.end() ? h : H - h - 1;
for (int w = 0; w < W; w++) {
int flip_w = std::find(axes.begin(), axes.end(), 2) == axes.end() ? w : W - w - 1;
int flip_index = flip_c * H * W + flip_h * W + flip_w;
int index = c * (H * W) + h * W + w;
line += (std::to_string(index) + ": " + std::to_string(output_data[index]) + ", ");
EXPECT_EQ(input_data[index], output_data[flip_index]);
}
VLOG(6) << line;
}
}
}

TEST(net_build, program_argmax_case1) {
const int N = 4;
const int IN_C = 3;
63 changes: 59 additions & 4 deletions cinn/hlir/framework/op_lowering_util.cc
@@ -935,10 +935,10 @@ void LoopAssignReduce(ir::IRSchedule& ir_sch,
};

auto node_shape = shape_dict.at(node_data->id());
// node output is same shape with reduce output.
// The output shape of node is different from that of reduce node
if (std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) !=
std::accumulate(node_shape.begin(), node_shape.end(), 1, std::multiplies<int>())) {
// split loop to assign master loop
// get loop factors of reduce node
int extend = 1;
std::vector<int> factors;
loops = ir_sch.GetLoops(node_data->id());
@@ -953,8 +953,63 @@
factors.push_back(loop.As<ir::For>()->extent.as_int32());
}

ir_sch.Split(loops.back(), factors);
loops = ir_sch.GetLoops(node_data->id());
// If there are IfThenElse stmt in loop, we need to find out the indices in condition,
// and special treatment should be applied to loops with these indices.
// We apply two step split on loop of src node to align the loop of reduce node.
std::unordered_set<int> loop_index_in_if;
auto first_reduce_loop = rloops.front();
// collect if
auto if_checker = [](const Expr* x) { return x->As<ir::IfThenElse>(); };
auto if_set = ir::CollectIRNodesWithoutTensor(first_reduce_loop.As<ir::For>()->body, if_checker);
std::string reduce_block_name = reducer_data->id();
for (auto if_expr : if_set) {
auto checker = [reduce_block_name](const Expr* x) {
return x->As<ir::ScheduleBlockRealize>() &&
x->As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->name == reduce_block_name;
};
auto blocks_in_if = ir::CollectIRNodesWithoutTensor(if_expr, checker);
if (!blocks_in_if.empty()) {
ir::Expr condition = if_expr.As<ir::IfThenElse>()->condition;
auto indices_in_if =
ir::CollectIRNodesWithoutTensor(condition, [](const Expr* x) { return x->As<ir::_Var_>(); });
for (int i = 0; i < rloops.size(); ++i) {
std::string var_name = rloops[i].As<ir::For>()->loop_var->name;
auto find_var_iter = std::find_if(indices_in_if.begin(), indices_in_if.end(), [&var_name](const ir::Expr& x) {
return x.As<ir::_Var_>()->name == var_name;
});
if (find_var_iter != indices_in_if.end()) {
loop_index_in_if.insert(i);
}
}
break;
}
}

// prepare factors of two step split
std::vector<int> first_step_factors;
std::vector<int> second_step_factors;
int second_start_loop_index;
for (int i = 0; i < factors.size(); ++i) {
if (loop_index_in_if.count(i) == 0) {
first_step_factors.push_back(factors[i]);
} else if (loop_index_in_if.count(i) != 0 && second_step_factors.empty()) {
first_step_factors.push_back(-1);
second_step_factors.push_back(factors[i]);
second_start_loop_index = i;
} else if (loop_index_in_if.count(i) != 0 && !second_step_factors.empty()) {
second_step_factors.push_back(factors[i]);
}
}
// do two step split
if (!first_step_factors.empty()) {
ir_sch.Split(loops.back(), first_step_factors);
loops = ir_sch.GetLoops(node_data->id());
}
if (!second_step_factors.empty()) {
ir_sch.Split(loops.at(second_start_loop_index), second_step_factors);
loops = ir_sch.GetLoops(node_data->id());
}

// copy loop info form rloops.
copy_loop_info(loops, rloops);
return;
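The op_lowering_util.cc hunk above splits the source loop in two steps: factors whose loop indices do not appear in an `IfThenElse` condition go into the first split, a single -1 placeholder stands in for the run of condition-bound factors, and those condition-bound factors are then split out of the placeholder loop in a second pass. A standalone sketch of just that factor bookkeeping, assuming the same `factors` vector and set of condition-bound loop indices the diff works with (not CINN code):

```cpp
// two_step_split_factors.cc -- standalone illustration of the factor
// preparation for the two-step split in LoopAssignReduce.
#include <cassert>
#include <unordered_set>
#include <vector>

struct SplitPlan {
  std::vector<int> first_step;   // factors for the first Split call
  std::vector<int> second_step;  // factors for the second Split call
  int second_start_index = -1;   // loop index where the second split begins
};

SplitPlan PlanTwoStepSplit(const std::vector<int>& factors,
                           const std::unordered_set<int>& loop_index_in_if) {
  SplitPlan plan;
  for (int i = 0; i < static_cast<int>(factors.size()); ++i) {
    if (loop_index_in_if.count(i) == 0) {
      plan.first_step.push_back(factors[i]);
    } else if (plan.second_step.empty()) {
      // First condition-bound factor: leave a -1 placeholder so the first
      // split keeps the condition-bound extents fused in one loop.
      plan.first_step.push_back(-1);
      plan.second_step.push_back(factors[i]);
      plan.second_start_index = i;
    } else {
      plan.second_step.push_back(factors[i]);
    }
  }
  return plan;
}

int main() {
  // Reduce loops with extents {8, 16, 32, 4}; loops 1 and 2 appear in the if.
  auto plan = PlanTwoStepSplit({8, 16, 32, 4}, {1, 2});
  assert((plan.first_step == std::vector<int>{8, -1, 4}));
  assert((plan.second_step == std::vector<int>{16, 32}));
  assert(plan.second_start_index == 1);
  return 0;
}
```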
2 changes: 0 additions & 2 deletions cinn/hlir/op/contrib/CMakeLists.txt
@@ -2,7 +2,6 @@ core_gather_headers()

gather_srcs(cinnapi_src SRCS
gather_nd.cc
flip.cc
sort.cc
argmin.cc
argmax.cc
@@ -24,7 +23,6 @@ cc_test(test_gather_nd SRCS gather_nd_test.cc DEPS cinncore)
cc_test(test_sort SRCS sort_test.cc DEPS cinncore)
cc_test(test_argmin SRCS argmin_test.cc DEPS cinncore)
cc_test(test_argmax SRCS argmax_test.cc DEPS cinncore)
cc_test(test_flip SRCS flip_test.cc DEPS cinncore)
cc_test(test_repeat SRCS repeat_test.cc DEPS cinncore)
cc_test(test_one_hot SRCS one_hot_test.cc DEPS cinncore)
cc_test(test_lookup_table SRCS lookup_table_test.cc DEPS cinncore)
63 changes: 26 additions & 37 deletions cinn/hlir/op/contrib/argmin.cc
@@ -113,18 +113,15 @@ std::shared_ptr<framework::OpStrategy> StrategyForArgmin(const framework::NodeAt
framework::CINNCompute argmin_compute([=](lang::Args args, lang::RetValue *ret) {
CHECK(!args.empty()) << "The input argument of argmin compute is empty! Please check.";
common::CINNValuePack pack_args = args[0];
std::string tensor_name = UniqName("Argmin_out");
CHECK_GE(pack_args.size(), 1U) << "There should be 1 input args for argmax compute";
Expr in_expr = pack_args[0];
CHECK(in_expr.as_tensor());
Tensor in_tensor = in_expr.as_tensor_ref();
auto stages = CreateStages({in_tensor});
if (FLAGS_cinn_ir_schedule) {
CHECK_EQ(pack_args.size(), 2U);
CHECK(pack_args[1].is_string());
tensor_name = pack_args[1].operator std::string();
}
auto out_tensor = Argmin(in_tensor, target, stages, axis, keep_dims, tensor_name);
CHECK_EQ(pack_args.size(), 2U);
CHECK(pack_args[1].is_string());
std::string tensor_name = pack_args[1].operator std::string();
auto out_tensor = Argmin(in_tensor, target, stages, axis, keep_dims, tensor_name);

stages->InsertLazily(out_tensor[0]);
std::vector<CINNValue> cinn_values{
@@ -133,38 +130,30 @@
});

framework::CINNSchedule argmin_schedule([=](lang::Args args, lang::RetValue *ret) {
if (FLAGS_cinn_ir_schedule) {
CHECK(!args.empty()) << "The input argument of arange_schedule is empty! Please check.\n";
common::CINNValuePack arg_pack = args[0];
std::vector<Expr> vec_ast;
for (int i = 0; i < arg_pack.size(); i++) {
if (arg_pack[i].is_expr()) {
Expr temp = arg_pack[i];
vec_ast.emplace_back(temp);
}
}
CHECK(!vec_ast.empty());
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
auto blocks = ir_sch.GetAllBlocks();
// TODO: It needs to be rewritten according to the reduction_min operator to improve performance.
// Do not use local variables, because the size will exceed the limit.
ir_sch.SetBuffer(blocks[0], "local");
ir_sch.SetBuffer(blocks[1], "local");
long prod_size = std::accumulate(output_shapes[0].begin(), output_shapes[0].end(), 1, std::multiplies<int>());
if (prod_size > 1 && target.arch == Target::Arch::X86) {
pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true);
CHECK(!args.empty()) << "The input argument of arange_schedule is empty! Please check.\n";
common::CINNValuePack arg_pack = args[0];
std::vector<Expr> vec_ast;
for (int i = 0; i < arg_pack.size(); i++) {
if (arg_pack[i].is_expr()) {
Expr temp = arg_pack[i];
vec_ast.emplace_back(temp);
}
std::vector<common::CINNValue> res{common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = common::CINNValuePack{res};
} else {
CHECK(!args.empty()) << "The input argument of arange_schedule is empty! Please check.\n";
common::CINNValuePack arg_pack = args[0];
Expr out = arg_pack[0];
CHECK(out.as_tensor());
*ret = arg_pack;
}
CHECK(!vec_ast.empty());
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
ir_sch.MergeExprs();
auto blocks = ir_sch.GetAllBlocks();
// TODO: It needs to be rewritten according to the reduction_min operator to improve performance.
// Do not use local variables, because the size will exceed the limit.
ir_sch.SetBuffer(blocks[0], "local");
ir_sch.SetBuffer(blocks[1], "local");
long prod_size = std::accumulate(output_shapes[0].begin(), output_shapes[0].end(), 1, std::multiplies<int>());
if (prod_size > 1 && target.arch == Target::Arch::X86) {
pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true);
}
std::vector<common::CINNValue> res{common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = common::CINNValuePack{res};
});

auto strategy = std::make_shared<framework::OpStrategy>();