From 15e4a596c32bfde633fe7f69b89d74acca40b07c Mon Sep 17 00:00:00 2001 From: kongAKun Date: Sat, 2 Sep 2023 16:02:56 +0800 Subject: [PATCH 1/9] Fix styles of code --- .../incubate/nn/layer/fused_dropout_add.py | 14 ++++---- .../incubate/nn/layer/fused_dropout_nd.py | 34 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/python/paddle/incubate/nn/layer/fused_dropout_add.py b/python/paddle/incubate/nn/layer/fused_dropout_add.py index 77874d2944764..1a644169cc47a 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_add.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_add.py @@ -44,16 +44,16 @@ class FusedDropoutAdd(Layer): Examples: .. code-block:: python - # required: gpu - import paddle - from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd + >>> # required: gpu + >>> import paddle + >>> from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd - x = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") - y = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") + >>> x = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") + >>> y = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") - m = FusedDropoutAdd(p=0.5) + >>> m = FusedDropoutAdd(p=0.5) - out = m(x, y) + >>> out = m(x, y) """ def __init__(self, p=0.5, mode="upscale_in_train", name=None): diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py index 156880f73281b..93aee7081020c 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py @@ -53,23 +53,23 @@ class FusedDropout(paddle.nn.Layer): Examples: .. 
code-block:: python - import paddle - - x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32") - m = paddle.incubate.nn.FusedDropout(p=0.5) - - y_train = m(x) - print(y_train) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[2., 0., 6.], - # [0., 0., 0.]]) - - m.eval() # switch the model to test phase - y_test = m(x) - print(y_test) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) + >>> import paddle + + >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32") + >>> m = paddle.incubate.nn.FusedDropout(p=0.5) + + >>> y_train = m(x) + >>> print(y_train) + >>> # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + >>> # [[2., 0., 6.], + >>> # [0., 0., 0.]]) + + >>> m.eval() # switch the model to test phase + >>> y_test = m(x) + >>> print(y_test) + >>> # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + >>> # [[1., 2., 3.], + >>> # [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): From e9c5e96141a0baae8775e38a58a9beb266b51768 Mon Sep 17 00:00:00 2001 From: kongAKun Date: Sat, 2 Sep 2023 18:39:04 +0800 Subject: [PATCH 2/9] update the GPU option --- python/paddle/incubate/nn/layer/fused_dropout_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/incubate/nn/layer/fused_dropout_add.py b/python/paddle/incubate/nn/layer/fused_dropout_add.py index 1a644169cc47a..7c0d618eafec9 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_add.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_add.py @@ -44,7 +44,7 @@ class FusedDropoutAdd(Layer): Examples: .. 
code-block:: python - >>> # required: gpu + >>> # doctest: +REQUIRES(env:GPU) >>> import paddle >>> from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd From d59911f160054e76790a5ae51ba7583f7cd1abf0 Mon Sep 17 00:00:00 2001 From: kongAKun Date: Sat, 2 Sep 2023 19:35:16 +0800 Subject: [PATCH 3/9] add the GPU setup --- python/paddle/incubate/nn/layer/fused_dropout_add.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/incubate/nn/layer/fused_dropout_add.py b/python/paddle/incubate/nn/layer/fused_dropout_add.py index 7c0d618eafec9..51cfc18d443b4 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_add.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_add.py @@ -46,6 +46,7 @@ class FusedDropoutAdd(Layer): >>> # doctest: +REQUIRES(env:GPU) >>> import paddle + >>> paddle.device.set_device('gpu') >>> from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd >>> x = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") From 831c44e2f317a1e07bce7bfc121b29c5e51c42d8 Mon Sep 17 00:00:00 2001 From: kongAKun Date: Sat, 2 Sep 2023 21:20:08 +0800 Subject: [PATCH 4/9] remove the note --- python/paddle/incubate/nn/layer/fused_dropout_nd.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py index 93aee7081020c..4943a16fe9e0a 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py @@ -60,16 +60,16 @@ class FusedDropout(paddle.nn.Layer): >>> y_train = m(x) >>> print(y_train) - >>> # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - >>> # [[2., 0., 6.], - >>> # [0., 0., 0.]]) + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[2., 0., 6.], + # [0., 0., 0.]]) >>> m.eval() # switch the model to test phase >>> y_test = m(x) >>> print(y_test) - >>> # Tensor(shape=[2, 3], 
dtype=float32, place=Place(gpu:0), stop_gradient=True, - >>> # [[1., 2., 3.], - >>> # [4., 5., 6.]]) + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1., 2., 3.], + # [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): From 3ce9b2a1add3cea201b8bcc36fa175a9be0803e6 Mon Sep 17 00:00:00 2001 From: kongAKun Date: Sun, 3 Sep 2023 15:37:03 +0800 Subject: [PATCH 5/9] update the code --- python/paddle/incubate/nn/layer/fused_dropout_nd.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py index 4943a16fe9e0a..a820654fa9efc 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py @@ -60,16 +60,16 @@ class FusedDropout(paddle.nn.Layer): >>> y_train = m(x) >>> print(y_train) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[2., 0., 6.], - # [0., 0., 0.]]) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[2., 0., 6.], + [0., 0., 0.]]) >>> m.eval() # switch the model to test phase >>> y_test = m(x) >>> print(y_test) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[1., 2., 3.], + [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): From 12048711a79526c115825a7707a517ef2f55a6d5 Mon Sep 17 00:00:00 2001 From: kongAKun Date: Mon, 25 Sep 2023 16:11:58 +0800 Subject: [PATCH 6/9] Fix styles of code --- python/paddle/distributed/fleet/fleet.py | 334 +++++++++--------- .../incubate/nn/functional/fused_rms_norm.py | 15 +- .../nn/functional/fused_transformer.py | 328 ++++++++--------- 3 files changed, 341 insertions(+), 336 deletions(-) diff --git 
a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 2dab355264b4d..3761cbfbf82c6 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -128,31 +128,31 @@ class Fleet: :name: code-example2 # Example2: for parameter server training - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(strategy=strategy) + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(strategy=strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = fleet.distributed_optimizer(optimizer) - if fleet.is_first_worker(): - print("this is first worker") + >>> if fleet.is_first_worker(): + ... print("this is first worker") - print("current node index: {}".format(fleet.worker_index())) - print("total number of worker num: {}".format(fleet.worker_num())) + >>> print("current node index: {}".format(fleet.worker_index())) + >>> print("total number of worker num: {}".format(fleet.worker_num())) - if fleet.is_worker(): - print("this is worker") - print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) + >>> if fleet.is_worker(): + ... print("this is worker") + >>> print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) - print("server num: {}".format(fleet.server_num())) - print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) + >>> print("server num: {}".format(fleet.server_num())) + >>> print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) - if fleet.is_server(): - print("this is server") - fleet.stop_worker() + >>> if fleet.is_server(): + ... 
print("this is server") + >>> fleet.stop_worker() """ @@ -202,37 +202,37 @@ def init( .. code-block:: python :name: code-example1 - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() .. code-block:: python :name: code-example2 - import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) + >>> import paddle.distributed.fleet as fleet + >>> fleet.init(is_collective=True) .. code-block:: python :name: code-example3 - import paddle.distributed.fleet as fleet - role = fleet.PaddleCloudRoleMaker() - fleet.init(role) + >>> import paddle.distributed.fleet as fleet + >>> role = fleet.PaddleCloudRoleMaker() + >>> fleet.init(role) .. code-block:: python :name: code-example4 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(strategy=strategy) + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(strategy=strategy) .. code-block:: python :name: code-example5 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - fleet.init(log_level = "DEBUG") + >>> import paddle.distributed.fleet as fleet + >>> strategy = fleet.DistributedStrategy() + >>> fleet.init(log_level = "DEBUG") """ from paddle.distributed import parallel_helper @@ -448,9 +448,9 @@ def is_first_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_first_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_first_worker() """ return self._role_maker._is_first_worker() @@ -466,9 +466,9 @@ def worker_index(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_index() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_index() """ return self._role_maker._worker_index() @@ -484,9 +484,9 @@ def worker_num(self): .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_num() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_num() """ return self._role_maker._worker_num() @@ -515,9 +515,9 @@ def is_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_worker() """ return self._role_maker._is_worker() @@ -536,9 +536,9 @@ def worker_endpoints(self, to_string=False): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.worker_endpoints() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.worker_endpoints() """ if to_string: @@ -557,9 +557,9 @@ def server_num(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_num() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_num() """ return len(self._role_maker._get_pserver_endpoints()) @@ -574,9 +574,9 @@ def server_index(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_index() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_index() """ return self._role_maker._server_index() @@ -592,9 +592,9 @@ def server_endpoints(self, to_string=False): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_endpoints() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.server_endpoints() """ @@ -615,9 +615,9 @@ def is_server(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.is_server() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.is_server() """ return self._role_maker._is_server() @@ -633,9 +633,9 @@ def barrier_worker(self): .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.barrier_worker() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> fleet.barrier_worker() """ self._role_maker._barrier("worker") @@ -653,13 +653,13 @@ def init_worker(self, scopes=None): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.init_worker() + >>> fleet.init_worker() """ self._runtime_handle._init_worker(scopes) @@ -698,13 +698,13 @@ def init_server(self, *args, **kwargs): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ self._runtime_handle._init_server(*args, **kwargs) @@ -723,13 +723,13 @@ def load_model(self, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.load_model("path", mode=0) + >>> fleet.load_model("path", mode=0) """ self._runtime_handle._load_persistables(path, mode) @@ -748,13 +748,13 @@ def load_one_table(self, table_id, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.load_one_table(0, "path", mode=0) + >>> fleet.load_one_table(0, "path", mode=0) """ self._runtime_handle._load_one_table(table_id, path, mode) @@ -773,13 +773,13 @@ def load_inference_model(self, path, mode): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) 
- fleet.load_inference_model("path", mode=1) + >>> fleet.load_inference_model("path", mode=1) """ self._runtime_handle._load_inference_model(path, mode) @@ -797,14 +797,14 @@ def run_server(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - if fleet.is_server(): - fleet.init_server() + >>> if fleet.is_server(): + ... fleet.init_server() """ self._runtime_handle._run_server() @@ -822,13 +822,13 @@ def stop_worker(self): .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ self._runtime_handle._stop_worker() @@ -902,13 +902,13 @@ def save_inference_model( .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.init_server() + >>> fleet.init_server() """ @@ -952,17 +952,17 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): .. code-block:: text - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet - fleet.init() + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - exe = paddle.static.Executor(paddle.CPUPlace()) - fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) + >>> exe = paddle.static.Executor(paddle.CPUPlace()) + >>> fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) """ self._runtime_handle._save_persistables( @@ -1002,13 +1002,13 @@ def save_one_table(self, table_id, path, mode): .. 
code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() # build net # fleet.distributed_optimizer(...) - fleet.save_one_table(0, "path", mode=0) + >>> fleet.save_one_table(0, "path", mode=0) """ self._runtime_handle._save_one_table(table_id, path, mode) @@ -1029,16 +1029,16 @@ def save_dense_params( .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - import paddle - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) + >>> import paddle.distributed.fleet as fleet + >>> fleet.init() + >>> import paddle + >>> place = paddle.CPUPlace() + >>> exe = paddle.static.Executor(place) # build net # fleet.distributed_optimizer(...) - fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) + >>> fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) """ self._runtime_handle._save_dense_params( @@ -1072,12 +1072,12 @@ def distributed_optimizer(self, optimizer, strategy=None): .. code-block:: python - import paddle - import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> import paddle + >>> import paddle.distributed.fleet as fleet + >>> fleet.init(is_collective=True) + >>> strategy = fleet.DistributedStrategy() + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) """ self.user_defined_optimizer = optimizer @@ -1135,45 +1135,45 @@ def amp_init( Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - paddle.enable_static() - - def run_example_code(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') - conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) - # 1) Use fp16_guard to control the range of fp16 kernels used. - with paddle.static.amp.fp16_guard(): - bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") - pool = F.max_pool2d(bn, kernel_size=2, stride=2) - hidden = paddle.static.nn.fc(pool, size=10) - loss = paddle.mean(hidden) - # 2) Create the optimizer and set `multi_precision` to True. - # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) - # 3) These ops in `custom_black_list` will keep in the float32 computation type. - amp_list = paddle.static.amp.CustomOpLists( - custom_black_list=['pool2d']) - # 4) The entry of Paddle AMP. - # Enable pure fp16 training by setting `use_pure_fp16` to True. - optimizer = paddle.static.amp.decorate( - optimizer, - amp_list, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True) - # If you don't use the default_startup_program(), you sholud pass - # your defined `startup_program` into `minimize`. - optimizer.minimize(loss) - exe.run(paddle.static.default_startup_program()) - # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). - # If you want to perform the testing process, you should pass `test_program` into `amp_init`. - optimizer.amp_init(place, scope=paddle.static.global_scope()) - - if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: + >>> import paddle + >>> import paddle.nn.functional as F + >>> paddle.enable_static() + + >>> def run_example_code(): + ... place = paddle.CUDAPlace(0) + ... 
exe = paddle.static.Executor(place) + ... data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') + ... conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) + ... # 1) Use fp16_guard to control the range of fp16 kernels used. + ... with paddle.static.amp.fp16_guard(): + ... bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") + ... pool = F.max_pool2d(bn, kernel_size=2, stride=2) + ... hidden = paddle.static.nn.fc(pool, size=10) + ... loss = paddle.mean(hidden) + ... # 2) Create the optimizer and set `multi_precision` to True. + ... # Setting `multi_precision` to True can avoid the poor accuracy + ... # or the slow convergence in a way. + ... optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) + ... # 3) These ops in `custom_black_list` will keep in the float32 computation type. + ... amp_list = paddle.static.amp.CustomOpLists( + ... custom_black_list=['pool2d']) + ... # 4) The entry of Paddle AMP. + ... # Enable pure fp16 training by setting `use_pure_fp16` to True. + ... optimizer = paddle.static.amp.decorate( + ... optimizer, + ... amp_list, + ... init_loss_scaling=128.0, + ... use_dynamic_loss_scaling=True, + ... use_pure_fp16=True) + ... # If you don't use the default_startup_program(), you sholud pass + ... # your defined `startup_program` into `minimize`. + ... optimizer.minimize(loss) + ... exe.run(paddle.static.default_startup_program()) + ... # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). + ... # If you want to perform the testing process, you should pass `test_program` into `amp_init`. + ... optimizer.amp_init(place, scope=paddle.static.global_scope()) + + >>> if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: run_example_code() """ amp_optimizer = self._get_amp_optimizer() @@ -1267,26 +1267,26 @@ def minimize( .. 
code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - import paddle.nn.functional as F - - hid_dim = 10 - label_dim = 2 - input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') - input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') - fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') - prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') - cost = F.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet + >>> import paddle.nn.functional as F + + >>> hid_dim = 10 + >>> label_dim = 2 + >>> input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') + >>> input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') + >>> fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') + >>> fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') + >>> prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') + >>> cost = F.cross_entropy(input=prediction, label=input_y) + >>> avg_cost = paddle.mean(x=cost) + + >>> fleet.init(is_collective=True) + >>> strategy = fleet.DistributedStrategy() + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> optimizer.minimize(avg_cost) # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX diff --git a/python/paddle/incubate/nn/functional/fused_rms_norm.py 
b/python/paddle/incubate/nn/functional/fused_rms_norm.py index 54c6e1dfba021..e1066c53052ca 100644 --- a/python/paddle/incubate/nn/functional/fused_rms_norm.py +++ b/python/paddle/incubate/nn/functional/fused_rms_norm.py @@ -54,14 +54,15 @@ def fused_rms_norm( Examples: .. code-block:: python - # required: gpu - import paddle + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') - paddle_x = paddle.cast(paddle.randn(shape=[32, 256]), dtype=paddle.float16) - paddle_weight = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) - paddle_bias = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) - epsilon = 1e-6 - paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1) + >>> paddle_x = paddle.cast(paddle.randn(shape=[32, 256]), dtype=paddle.float16) + >>> paddle_weight = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) + >>> paddle_bias = paddle.cast(paddle.randn(shape=[256]), dtype=paddle.float16) + >>> epsilon = 1e-6 + >>> paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1) """ if in_dynamic_mode(): return _C_ops.rms_norm( diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 469aea26cc600..52aee587ec22a 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -58,18 +58,18 @@ def fused_feedforward( .. code-block:: python - residual = x - if pre_layer_norm: - out = layer_norm1(x) - else: - out = x - out = linear2(dropout1(activation(linear1(src)))) - if add_residual: - out = residual + dropout2(out) - else: - out = dropout2(out) - if not pre_layer_norm: - out = layer_norm2(out) + >>> residual = x + >>> if pre_layer_norm: + ... out = layer_norm1(x) + >>> else: + ... 
out = x + >>> out = linear2(dropout1(activation(linear1(src)))) + >>> if add_residual: + ... out = residual + dropout2(out) + >>> else: + ... out = dropout2(out) + >>> if not pre_layer_norm: + ... out = layer_norm2(out) Args: @@ -110,16 +110,17 @@ def fused_feedforward( Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - x = paddle.randn(shape=(1, 8, 8), dtype="float32") - linear1_weight = paddle.randn(shape=(8, 8), dtype="float32") - linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") - out = F.fused_feedforward(x, linear1_weight, linear2_weight) - print(out.shape) - # (1, 8, 8) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> x = paddle.randn(shape=(1, 8, 8), dtype="float32") + >>> linear1_weight = paddle.randn(shape=(8, 8), dtype="float32") + >>> linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") + >>> out = F.fused_feedforward(x, linear1_weight, linear2_weight) + >>> print(out.shape) + (1, 8, 8) """ _verify_dropout_rate(dropout1_rate) _verify_dropout_rate(dropout2_rate) @@ -290,7 +291,7 @@ def fused_bias_dropout_residual_layer_norm( .. code-block:: python - y = layer_norm(residual + dropout(bias + x)) + >>> y = layer_norm(residual + dropout(bias + x)) Parameters: x (Tensor): The input tensor. The shape is `[*, embed\_dim]`. @@ -323,21 +324,22 @@ def fused_bias_dropout_residual_layer_norm( Examples: .. 
code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # residual: [batch_size, seq_len, embed_dim] - residual = paddle.rand(shape=(2, 4, 128), dtype="float32") - # linear bias: [embed_dim] - bias = paddle.rand(shape=[128], dtype="float32") - # output: [batch_size, seq_len, embed_dim] - output = F.fused_bias_dropout_residual_layer_norm( - x, residual, bias) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # residual: [batch_size, seq_len, embed_dim] + >>> residual = paddle.rand(shape=(2, 4, 128), dtype="float32") + >>> # linear bias: [embed_dim] + >>> bias = paddle.rand(shape=[128], dtype="float32") + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_bias_dropout_residual_layer_norm( + ... x, residual, bias) + >>> # [2, 4, 128] + >>> print(output.shape) """ seed = None @@ -495,33 +497,33 @@ def fused_multi_head_attention( .. code-block:: python - residual = x - if pre_layer_norm: - out = layer_norm(x) - else: - out = x - # compute q, k, v - out = matmul(out, qkv_weight) + qkv_bias - out = transpose(out, perm=[2, 0, 3, 1, 4]) - # extract q, k and v from out - q = out[0:1,::] * (head_dim ** -0.5) - k = out[1:2,::] - v = out[2:3,::] - out = matmul(q, k, transpose_y=True) - out = out + attn_mask - out = softmax(out) - out = dropout(out) - out = matmul(out, v) - # combine heads - out = transpose(out, perm=[0, 2, 1, 3]) - # project to output - out = linear(out) - if add_residual: - out = residual + dropout(out) - else: - out = dropout(out) - if not pre_layer_norm: - out = layer_norm(out) + >>> residual = x + >>> if pre_layer_norm: + ... 
out = layer_norm(x) + >>> else: + ... out = x + >>> # compute q, k, v + >>> out = matmul(out, qkv_weight) + qkv_bias + >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) + >>> # extract q, k and v from out + >>> q = out[0:1,::] * (head_dim ** -0.5) + >>> k = out[1:2,::] + >>> v = out[2:3,::] + >>> out = matmul(q, k, transpose_y=True) + >>> out = out + attn_mask + >>> out = softmax(out) + >>> out = dropout(out) + >>> out = matmul(out, v) + >>> # combine heads + >>> out = transpose(out, perm=[0, 2, 1, 3]) + >>> # project to output + >>> out = linear(out) + >>> if add_residual: + ... out = residual + dropout(out) + >>> else: + ... out = dropout(out) + >>> if not pre_layer_norm: + ... out = layer_norm(out) Parameters: @@ -581,30 +583,31 @@ def fused_multi_head_attention( .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - # qkv_weight: [3, num_head, head_dim, embed_dim] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - # qkv_bias: [3, num_head, head_dim] - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - # linear_weight: [embed_dim, embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - # linear_bias: [embed_dim] - linear_bias = paddle.rand(shape=[128], dtype="float32") - # self attention mask: [batch_size, num_heads, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_head_attention( - x, qkv_weight, linear_weight, False, - None, None, None, None, 1e-5, qkv_bias, - linear_bias, None, attn_mask) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), 
dtype="float32") + >>> # qkv_weight: [3, num_head, head_dim, embed_dim] + >>> qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + >>> # qkv_bias: [3, num_head, head_dim] + >>> qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + >>> # linear_weight: [embed_dim, embed_dim] + >>> linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + >>> # linear_bias: [embed_dim] + >>> linear_bias = paddle.rand(shape=[128], dtype="float32") + >>> # self attention mask: [batch_size, num_heads, seq_len, seq_len] + >>> attn_mask = paddle.rand(shape=(2, 4, 4, 4), dtype="float32") + + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_multi_head_attention( + ... x, qkv_weight, linear_weight, False, + ... None, None, None, None, 1e-5, qkv_bias, + ... linear_bias, None, attn_mask) + >>> # [2, 4, 128] + >>> print(output.shape) """ seed = None @@ -908,37 +911,37 @@ def fused_multi_transformer( .. code-block:: python - if pre_layer_norm: - out = layer_norm(x) - out = qkv_linear(out) + qkv_bias - else: - out = qkv_linear(x) + qkv_bias - out = transpose(out, perm=[2, 0, 3, 1, 4]) - # extract q, k and v from out. - q = out[0:1, ::] - k = out[1:2, ::] - v = out[2:3, ::] - out = q * k^t - out = attn_mask + out - out = softmax(out) - out = dropout(out) - out = out * v - out = transpose(out, perm=[0, 2, 1, 3]) - out = linear(out) - if pre_layer_norm: - out = x + dropout(out + bias) - else: - out = layer_norm(x + dropout(out + bias)) - - residual = out; - if pre_layer_norm: - out = ffn_layer_norm(out) - out = ffn1_linear(out) - out = dropout(activation(out + ffn1_bias)) - out = ffn2_linear(out) - out = residual + dropout(out + ffn2_bias) - if not pre_layer_norm: - out = ffn_layer_norm(out) + >>> if pre_layer_norm: + ... out = layer_norm(x) + ... out = qkv_linear(out) + qkv_bias + >>> else: + ... out = qkv_linear(x) + qkv_bias + >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) + >>> # extract q, k and v from out. 
+ >>> q = out[0:1, ::] + >>> k = out[1:2, ::] + >>> v = out[2:3, ::] + >>> out = q * k^t + >>> out = attn_mask + out + >>> out = softmax(out) + >>> out = dropout(out) + >>> out = out * v + >>> out = transpose(out, perm=[0, 2, 1, 3]) + >>> out = linear(out) + >>> if pre_layer_norm: + ... out = x + dropout(out + bias) + >>> else: + ... out = layer_norm(x + dropout(out + bias)) + + >>> residual = out; + >>> if pre_layer_norm: + ... out = ffn_layer_norm(out) + >>> out = ffn1_linear(out) + >>> out = dropout(activation(out + ffn1_bias)) + >>> out = ffn2_linear(out) + >>> out = residual + dropout(out + ffn2_bias) + >>> if not pre_layer_norm: + ... out = ffn_layer_norm(out) Args: x (Tensor): the input tensor could be 3-D tensor, the input data type could be float16 or float32, the shape is `[batch\_size, sequence\_length, d\_model]`. @@ -996,48 +999,49 @@ def fused_multi_transformer( Examples: .. code-block:: python - # required: gpu - import paddle - import paddle.incubate.nn.functional as F - - # input: [batch_size, seq_len, embed_dim] - x = paddle.rand(shape=(2, 4, 128), dtype="float32") - - # ln_scale: [embed_dim], ln_bias: [embed_dim] - ln_scale = paddle.rand(shape=(128,), dtype="float32") - ln_bias = paddle.rand(shape=(128,), dtype="float32") - - # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] - qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") - qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") - - # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] - linear_weight = paddle.rand(shape=(128, 128), dtype="float32") - linear_bias = paddle.rand(shape=(128,), dtype="float32") - - # ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] - ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") - ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") - - # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] - ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") - ffn1_bias = 
paddle.rand(shape=(4*128,), dtype="float32") - - # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: [embed_dim] - ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") - ffn2_bias = paddle.rand(shape=(128,), dtype="float32") - - # self attention mask: [batch_size, 1, seq_len, seq_len] - attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") - - # output: [batch_size, seq_len, embed_dim] - output = F.fused_multi_transformer( - x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], - [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], - [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], - attn_mask=attn_mask) - # [2, 4, 128] - print(output.shape) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> import paddle.incubate.nn.functional as F + + >>> # input: [batch_size, seq_len, embed_dim] + >>> x = paddle.rand(shape=(2, 4, 128), dtype="float32") + + >>> # ln_scale: [embed_dim], ln_bias: [embed_dim] + >>> ln_scale = paddle.rand(shape=(128,), dtype="float32") + >>> ln_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] + >>> qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + >>> qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + + >>> # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] + >>> linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + >>> linear_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] + >>> ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") + >>> ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] + >>> ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") + >>> ffn1_bias = paddle.rand(shape=(4*128,), dtype="float32") + + >>> # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: 
[embed_dim] + >>> ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") + >>> ffn2_bias = paddle.rand(shape=(128,), dtype="float32") + + >>> # self attention mask: [batch_size, 1, seq_len, seq_len] + >>> attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") + + >>> # output: [batch_size, seq_len, embed_dim] + >>> output = F.fused_multi_transformer( + ... x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], + ... [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], + ... [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], + ... attn_mask=attn_mask) + >>> # [2, 4, 128] + >>> print(output.shape) """ if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( From 361fa9d059f05627265453aa858b9fae2aca0ea1 Mon Sep 17 00:00:00 2001 From: kongAKun Date: Tue, 26 Sep 2023 22:04:40 +0800 Subject: [PATCH 7/9] add the code --- python/paddle/distributed/fleet/fleet.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 3761cbfbf82c6..8676f79c91f94 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -110,15 +110,15 @@ class Fleet: :name: code-example1 # Example1: for collective training - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + >>> import paddle + >>> paddle.enable_static() + >>> import paddle.distributed.fleet as fleet - fleet.init(is_collective=True) + >>> fleet.init(is_collective=True) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + >>> strategy = fleet.DistributedStrategy() + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) # do distributed training @@ -1174,7 +1174,7 @@ def amp_init( ... 
optimizer.amp_init(place, scope=paddle.static.global_scope()) >>> if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: - run_example_code() + ... run_example_code() """ amp_optimizer = self._get_amp_optimizer() return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test) From 3bb75b6c51b760f0f43c23863da8aeeec82edad7 Mon Sep 17 00:00:00 2001 From: ooo oo <3164076421@qq.com> Date: Tue, 17 Oct 2023 20:27:23 +0800 Subject: [PATCH 8/9] [Doctest] fix No.261-263, test=docs_preview --- python/paddle/distributed/fleet/fleet.py | 63 ++++++++++--------- .../nn/functional/fused_transformer.py | 26 ++++---- .../incubate/nn/layer/fused_dropout_nd.py | 9 +-- 3 files changed, 51 insertions(+), 47 deletions(-) diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 9808319e38cf3..f18f7aeb06876 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -105,11 +105,11 @@ class Fleet: Returns: Fleet: A Fleet instance - + Examples: .. code-block:: python :name: code-example1 - # Example1: for collective training + >>> # Example1: for collective training >>> import paddle >>> paddle.enable_static() >>> import paddle.distributed.fleet as fleet @@ -117,17 +117,18 @@ class Fleet: >>> fleet.init(is_collective=True) >>> strategy = fleet.DistributedStrategy() - >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> linear = paddle.nn.Linear(10, 10) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) + >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - # do distributed training + >>> # do distributed training .. 
code-block:: python :name: code-example2 - # Example2: for parameter server training + >>> # Example2: for parameter server training >>> import paddle >>> paddle.enable_static() >>> import paddle.distributed.fleet as fleet @@ -662,8 +663,8 @@ def init_worker(self, scopes=None): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.init_worker() @@ -707,8 +708,8 @@ def init_server(self, *args, **kwargs): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.init_server() @@ -732,8 +733,8 @@ def load_model(self, path, mode): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.load_model("path", mode=0) @@ -757,8 +758,8 @@ def load_one_table(self, table_id, path, mode): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.load_one_table(0, "path", mode=0) @@ -782,8 +783,8 @@ def load_inference_model(self, path, mode): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.load_inference_model("path", mode=1) @@ -806,8 +807,8 @@ def run_server(self): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> if fleet.is_server(): ... fleet.init_server() @@ -831,8 +832,8 @@ def stop_worker(self): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) 
+ >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.init_server() @@ -911,8 +912,8 @@ def save_inference_model( >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.init_server() @@ -964,8 +965,8 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> exe = paddle.static.Executor(paddle.CPUPlace()) >>> fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) @@ -1011,8 +1012,8 @@ def save_one_table(self, table_id, path, mode): >>> import paddle.distributed.fleet as fleet >>> fleet.init() - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) >>> fleet.save_one_table(0, "path", mode=0) @@ -1041,8 +1042,8 @@ def save_dense_params( >>> place = paddle.CPUPlace() >>> exe = paddle.static.Executor(place) - # build net - # fleet.distributed_optimizer(...) + >>> # build net + >>> # fleet.distributed_optimizer(...) 
>>> fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) @@ -1081,8 +1082,9 @@ def distributed_optimizer(self, optimizer, strategy=None): >>> import paddle >>> import paddle.distributed.fleet as fleet >>> fleet.init(is_collective=True) + >>> linear = paddle.nn.Linear(10, 10) >>> strategy = fleet.DistributedStrategy() - >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) """ @@ -1290,11 +1292,12 @@ def minimize( >>> fleet.init(is_collective=True) >>> strategy = fleet.DistributedStrategy() - >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001) + >>> linear = paddle.nn.Linear(10, 10) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=linear.parameters()) >>> optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) >>> optimizer.minimize(avg_cost) - # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX + >>> # for more examples, please refer to https://github.com/PaddlePaddle/PaddleFleetX """ if not isinstance(loss, list): diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index d487f0c789335..faee0b0db50c8 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -56,17 +56,17 @@ def fused_feedforward( This operator only supports running on GPU. The function of the operator is consistent with the following pseudo code: - .. code-block:: python + .. code-block:: text >>> residual = x >>> if pre_layer_norm: ... out = layer_norm1(x) - >>> else: + ... else: ... out = x >>> out = linear2(dropout1(activation(linear1(src)))) >>> if add_residual: ... out = residual + dropout2(out) - >>> else: + ... 
out = dropout2(out) >>> if not pre_layer_norm: ... out = layer_norm2(out) @@ -289,7 +289,7 @@ def fused_bias_dropout_residual_layer_norm( The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: - .. code-block:: python + .. code-block:: text >>> y = layer_norm(residual + dropout(bias + x)) @@ -338,8 +338,8 @@ def fused_bias_dropout_residual_layer_norm( >>> # output: [batch_size, seq_len, embed_dim] >>> output = F.fused_bias_dropout_residual_layer_norm( ... x, residual, bias) - >>> # [2, 4, 128] >>> print(output.shape) + (2, 4, 128) """ seed = None @@ -495,12 +495,12 @@ def fused_multi_head_attention( to information from different representation subspaces. This API only support self_attention. The pseudo code is as follows: - .. code-block:: python + .. code-block:: text >>> residual = x >>> if pre_layer_norm: ... out = layer_norm(x) - >>> else: + ... else: ... out = x >>> # compute q, k, v >>> out = matmul(out, qkv_weight) + qkv_bias @@ -520,7 +520,7 @@ def fused_multi_head_attention( >>> out = linear(out) >>> if add_residual: ... out = residual + dropout(out) - >>> else: + ... else: ... out = dropout(out) >>> if not pre_layer_norm: ... out = layer_norm(out) @@ -606,8 +606,8 @@ def fused_multi_head_attention( ... x, qkv_weight, linear_weight, False, ... None, None, None, None, 1e-5, qkv_bias, ... linear_bias, None, attn_mask) - >>> # [2, 4, 128] >>> print(output.shape) + (2, 4, 128) """ seed = None @@ -909,12 +909,12 @@ def fused_multi_transformer( This operator only supports running on GPU. The function of the transformer layer is consistent with the following pseudo code: - .. code-block:: python + .. code-block:: text >>> if pre_layer_norm: ... out = layer_norm(x) ... out = qkv_linear(out) + qkv_bias - >>> else: + ... else: ... out = qkv_linear(x) + qkv_bias >>> out = transpose(out, perm=[2, 0, 3, 1, 4]) >>> # extract q, k and v from out. @@ -930,7 +930,7 @@ def fused_multi_transformer( >>> out = linear(out) >>> if pre_layer_norm: ... 
out = x + dropout(out + bias) - >>> else: + ... else: ... out = layer_norm(x + dropout(out + bias)) >>> residual = out; @@ -1040,8 +1040,8 @@ def fused_multi_transformer( ... [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], ... [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], ... attn_mask=attn_mask) - >>> # [2, 4, 128] >>> print(output.shape) + (2, 4, 128) """ if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py index ded171158fe3d..09f083da88c74 100644 --- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py +++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py @@ -54,6 +54,7 @@ class FusedDropout(paddle.nn.Layer): .. code-block:: python >>> import paddle + >>> paddle.seed(2023) >>> x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32") >>> m = paddle.incubate.nn.FusedDropout(p=0.5) @@ -61,15 +62,15 @@ class FusedDropout(paddle.nn.Layer): >>> y_train = m(x) >>> print(y_train) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[2., 0., 6.], - [0., 0., 0.]]) + [[0., 0., 6.], + [0., 0., 0.]]) >>> m.eval() # switch the model to test phase >>> y_test = m(x) >>> print(y_test) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - [[1., 2., 3.], - [4., 5., 6.]]) + [[1., 2., 3.], + [4., 5., 6.]]) """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): From 24bd3217ec42396b546353f3d1ccd3a77901c6b4 Mon Sep 17 00:00:00 2001 From: ooo oo <3164076421@qq.com> Date: Tue, 17 Oct 2023 22:43:11 +0800 Subject: [PATCH 9/9] fix --- python/paddle/incubate/nn/functional/fused_transformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index faee0b0db50c8..c4cf8abfdb354 100644 --- 
a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -120,7 +120,7 @@ def fused_feedforward( >>> linear2_weight = paddle.randn(shape=(8, 8), dtype="float32") >>> out = F.fused_feedforward(x, linear1_weight, linear2_weight) >>> print(out.shape) - (1, 8, 8) + [1, 8, 8] """ _verify_dropout_rate(dropout1_rate) _verify_dropout_rate(dropout2_rate) @@ -339,7 +339,7 @@ def fused_bias_dropout_residual_layer_norm( >>> output = F.fused_bias_dropout_residual_layer_norm( ... x, residual, bias) >>> print(output.shape) - (2, 4, 128) + [2, 4, 128] """ seed = None @@ -607,7 +607,7 @@ def fused_multi_head_attention( ... None, None, None, None, 1e-5, qkv_bias, ... linear_bias, None, attn_mask) >>> print(output.shape) - (2, 4, 128) + [2, 4, 128] """ seed = None @@ -1041,7 +1041,7 @@ def fused_multi_transformer( ... [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], ... attn_mask=attn_mask) >>> print(output.shape) - (2, 4, 128) + [2, 4, 128] """ if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError(