From 3b18d96b576862d41c023f97461acc4f15614bb0 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 18 Nov 2022 14:22:12 +0800 Subject: [PATCH] fix device id issue for xpu eager mode (#48076) * fix device id issue for xpu eager xpu device id is not correctly set in eager mode, thus vars are on dev0 unless XPUDeviceGuard is called, leading to this error message for all node rank != 0: "NotImplementedError: (Unimplemented) Place Place(xpu:0) is not supported." * fix typo * fix pybind error --- paddle/fluid/distributed/collective/ProcessGroupBKCL.cc | 1 + .../eager/auto_code_generator/generator/python_c_gen.py | 9 +++++++++ paddle/fluid/pybind/distributed_py.cc | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index a5c80cb04108d..8dfb65d981374 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -105,6 +105,7 @@ void ProcessGroupBKCL::BroadcastUniqueBKCLID(BKCLUniqueId* bkcl_id) { void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place, const std::string& place_key) { + platform::XPUDeviceGuard guard(place.GetDeviceId()); BKCLUniqueId bkcl_id; if (rank_ == 0) { PADDLE_ENFORCE_XPU_SUCCESS(bkcl_get_unique_id(&bkcl_id)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 8e3944b79c30f..aacde58fa7bc2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -128,6 +128,15 @@ def FindParsingFunctionFromAttributeType(atype): #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); +#endif + }} + if (paddle::platform::is_xpu_place(place)) {{ +#if defined(PADDLE_WITH_XPU) + 
phi::backends::xpu::SetXPUDeviceId(place.device); + VLOG(4) <<"CurrentDeviceId: " << phi::backends::xpu::GetXPUCurrentDeviceId() << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif }} """ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index dbc4c57c656ba..52160ea99a083 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -1284,7 +1284,7 @@ void BindDistributed(py::module *m) { auto processGroupBKCL = py::class_>( - *m, "ProcessGroupBKCL", ProcessGroup) + *m, "ProcessGroupBKCL", ProcessGroupStream) .def(py::init &, int, int,