diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py
index 6c0e80b1f5710..b7ae315576d79 100644
--- a/python/paddle/incubate/optimizer/pipeline.py
+++ b/python/paddle/incubate/optimizer/pipeline.py
@@ -48,47 +48,47 @@ class PipelineOptimizer:
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
-            import paddle.base.layers as layers
-            import numpy as np
-
-            paddle.enable_static()
-            with base.device_guard("gpu:0"):
-                x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
-                y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
-                data_loader = base.io.DataLoader.from_generator(
-                    feed_list=[x, y],
-                    capacity=64,
-                    use_double_buffer=True,
-                    iterable=False)
-
-                emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
-                emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-
-            with base.device_guard("gpu:1"):
-                concat = layers.concat([emb_x, emb_y], axis=1)
-                fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
-                loss = paddle.mean(fc)
-            optimizer = paddle.optimizer.SGD(learning_rate=0.5)
-            optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
-            optimizer.minimize(loss)
-
-            def train_reader():
-                for _ in range(4):
-                    x = np.random.random(size=[1]).astype('int64')
-                    y = np.random.random(size=[1]).astype('int64')
-                    yield x, y
-            data_loader.set_sample_generator(train_reader, batch_size=1)
-
-            place = base.CUDAPlace(0)
-            exe = base.Executor(place)
-            exe.run(base.default_startup_program())
-            batch_size = 1
-            data_loader.start()
-            exe.train_from_dataset(
-                base.default_main_program())
-            data_loader.reset()
+            >>> import paddle
+            >>> import paddle.base as base
+            >>> import paddle.base.layers as layers
+            >>> import numpy as np
+
+            >>> paddle.enable_static()
+            >>> with base.device_guard("gpu:0"):
+            ...     x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
+            ...     y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
+            ...     data_loader = base.io.DataLoader.from_generator(
+            ...         feed_list=[x, y],
+            ...         capacity=64,
+            ...         use_double_buffer=True,
+            ...         iterable=False)
+            ...     emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
+            ...     emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
+
+            >>> with base.device_guard("gpu:1"):
+            ...     concat = layers.concat([emb_x, emb_y], axis=1)
+            ...     fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
+            ...     loss = paddle.mean(fc)
+            >>> optimizer = paddle.optimizer.SGD(learning_rate=0.5)
+            >>> optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
+            >>> optimizer.minimize(loss)
+
+            >>> def train_reader():
+            ...     for _ in range(4):
+            ...         x = np.random.random(size=[1]).astype('int64')
+            ...         y = np.random.random(size=[1]).astype('int64')
+            ...         yield x, y
+            >>> data_loader.set_sample_generator(train_reader, batch_size=1)
+
+            >>> place = paddle.CUDAPlace(0)
+            >>> exe = paddle.static.Executor(place)
+            >>> exe.run(paddle.static.default_startup_program())
+            >>> batch_size = 1
+            >>> data_loader.start()
+            >>> exe.train_from_dataset(
+            ...     paddle.static.default_main_program())
+            >>> data_loader.reset()
     """
 
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
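A note on the doctest convention these samples now follow: statements carry a `>>> ` prompt, continuations carry `... `, and expected output appears unprefixed on the following lines, which lets the sample-code checker execute and verify each snippet. A minimal illustration of the convention (not part of the patch):

    .. code-block:: python

        >>> import paddle
        >>> t = paddle.to_tensor([1, 2, 3])
        >>> print(t.shape)
        [3]

Examples that cannot run on CI hardware are marked with a `# doctest: +SKIP(...)` directive instead, as the quantization samples further down do.
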
diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py
index 9cbd8894f1889..2545115fa0d01 100644
--- a/python/paddle/incubate/optimizer/recompute.py
+++ b/python/paddle/incubate/optimizer/recompute.py
@@ -49,45 +49,57 @@ class RecomputeOptimizer(Optimizer):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
-            import numpy as np
-
-            paddle.enable_static()
-
-            def gen_data():
-                return {"x": np.random.random(size=(32, 32)).astype('float32'),
-                        "y": np.random.randint(2, size=(32, 1)).astype('int64')}
-            def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                print(input_x)
-                fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                cost = paddle.nn.functional.cross_entropy(
-                    input=prediction, label=input_y,
-                    reduction='none', use_softmax=False
-                )
-                sum_cost = paddle.mean(cost)
-                return sum_cost, fc_1, prediction
-            input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-            input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-            cost, fc_1, pred = mlp(input_x, input_y)
-
-            sgd = paddle.optimizer.Adam(learning_rate=0.01)
-            sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-            sgd._set_checkpoints([fc_1, pred])
-            sgd.minimize(cost)
-
-            print("Finished optimize")
-            place = base.CPUPlace()
-            exe = base.Executor(place)
-            exe.run(base.default_startup_program())
-            step = 10
-
-            for i in range(step):
-                cost_val = exe.run(feed=gen_data(),
-                                   program=base.default_main_program(),
-                                   fetch_list=[cost.name])
-                print("step=%d cost=%f" % (i, cost_val[0]))
+            >>> import paddle
+            >>> import numpy as np
+
+            >>> paddle.enable_static()
+
+            >>> def gen_data():
+            ...     return {"x": np.random.random(size=(32, 32)).astype('float32'),
+            ...             "y": np.random.randint(2, size=(32, 1)).astype('int64')}
+            >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+            ...     print(input_x)
+            ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+            ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+            ...     cost = paddle.nn.functional.cross_entropy(
+            ...         input=prediction, label=input_y,
+            ...         reduction='none', use_softmax=False
+            ...     )
+            ...     sum_cost = paddle.mean(cost)
+            ...     return sum_cost, fc_1, prediction
+            >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+            >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+            >>> cost, fc_1, pred = mlp(input_x, input_y)
+            var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True)
+
+            >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+            >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+            >>> sgd._set_checkpoints([fc_1, pred])
+            >>> sgd.minimize(cost)
+
+            >>> print("Finished optimize")
+            Finished optimize
+            >>> place = paddle.CPUPlace()
+            >>> exe = paddle.static.Executor(place)
+            >>> exe.run(paddle.static.default_startup_program())
+            >>> step = 10
+
+            >>> for i in range(step):
+            ...     cost_val = exe.run(feed=gen_data(),
+            ...                        program=paddle.static.default_main_program(),
+            ...                        fetch_list=[cost.name])
+            ...     print("step=%d cost=%f" % (i, cost_val[0]))
+            step=0 cost=0.737203
+            step=1 cost=1.308077
+            step=2 cost=0.768422
+            step=3 cost=1.239475
+            step=4 cost=0.882643
+            step=5 cost=0.738027
+            step=6 cost=0.819374
+            step=7 cost=0.818534
+            step=8 cost=0.753692
+            step=9 cost=0.787448
     """
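The per-step costs printed above depend on random parameter initialization and random input batches, so the listed values are indicative rather than guaranteed. If stable output is wanted, one option (a sketch, not part of the patch) is to pin both generators before building the program; exact values may still vary across devices and Paddle versions:

    .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> paddle.enable_static()
        >>> paddle.seed(2023)     # makes the startup program's parameter initializers deterministic
        >>> np.random.seed(2023)  # makes the gen_data() batches deterministic
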
print("step=%d cost=%f" % (i, cost_val[0])) + var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) + Finished optimize + step=0 cost=0.737203 + step=1 cost=1.308077 + step=2 cost=0.768422 + step=3 cost=1.239475 + step=4 cost=0.882643 + step=5 cost=0.738027 + step=6 cost=0.819374 + step=7 cost=0.818534 + step=8 cost=0.753692 + step=9 cost=0.787448 """ @@ -132,33 +144,34 @@ def load(self, state_dict): Examples: .. code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - try: - state_dict = {} - sgd.load(state_dict) - except NotImplementedError as e: - print(e) + >>> import paddle + + >>> paddle.enable_static() + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> try: + ... state_dict = {} + ... sgd.load(state_dict) + >>> except NotImplementedError as e: + ... print(e) + load function is not supported by Recompute Optimizer for now """ raise NotImplementedError( "load function is not supported by Recompute Optimizer for now" @@ -177,42 +190,42 @@ def apply_gradients(self, params_grads): Examples: .. 
@@ -177,42 +190,42 @@ def apply_gradients(self, params_grads):
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.base as base
-                import paddle.base.framework as framework
-
-                paddle.enable_static()
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                    prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                    cost = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=input_y,
-                        reduction='none', use_softmax=False
-                    )
-                    sum_cost = paddle.mean(cost)
-                    return sum_cost, fc_1, prediction
-
-
-                input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-                input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = paddle.optimizer.Adam(learning_rate=0.01)
-                sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None)
-
-                program = cost.block.program
-                with framework.program_guard(program, None):
-                    optimize_ops = sgd.apply_gradients(params_grads)
-
-                print("Finished apply gradients")
+                >>> import paddle
+                >>> import paddle.base.framework as framework
+
+                >>> paddle.enable_static()
+
+                >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+                ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+                ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+                ...     cost = paddle.nn.functional.cross_entropy(
+                ...         input=prediction, label=input_y,
+                ...         reduction='none', use_softmax=False
+                ...     )
+                ...     sum_cost = paddle.mean(cost)
+                ...     return sum_cost, fc_1, prediction
+
+                >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+                >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+                >>> cost, fc_1, pred = mlp(input_x, input_y)
+                >>> print("Finished FF")
+                Finished FF
+
+                >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+                >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+                >>> sgd._set_checkpoints([fc_1, pred])
+                >>> params_grads = sgd.backward(
+                ...     cost,
+                ...     startup_program=None,
+                ...     parameter_list=None,
+                ...     no_grad_set=None)
+
+                >>> program = cost.block.program
+                >>> with framework.program_guard(program, None):
+                ...     optimize_ops = sgd.apply_gradients(params_grads)
+
+                >>> print("Finished apply gradients")
+                Finished apply gradients
         """
         return self._optimizer.apply_gradients(params_grads=params_grads)
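For orientation, `backward` followed by `apply_gradients` is the two-step spelling of what `minimize` does in a single call. A self-contained sketch of the one-call form on a hypothetical toy network (`minimize` returns both the optimize ops and the `(param, grad)` list):

    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32')
        >>> fc = paddle.static.nn.fc(x=x, size=1)
        >>> loss = paddle.mean(fc)
        >>> opt = paddle.incubate.optimizer.RecomputeOptimizer(
        ...     paddle.optimizer.Adam(learning_rate=0.01))
        >>> opt._set_checkpoints([fc])
        >>> optimize_ops, params_grads = opt.minimize(loss)
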
@@ -651,36 +664,36 @@ def backward(
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.base as base
-
-                paddle.enable_static()
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                    prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                    cost = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=input_y,
-                        reduction='none', use_softmax=False
-                    )
-                    sum_cost = paddle.mean(cost)
-                    return sum_cost, fc_1, prediction
-
-
-                input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-                input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = paddle.optimizer.Adam(learning_rate=0.01)
-                sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None)
-                print("Finished backward")
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+                ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+                ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+                ...     cost = paddle.nn.functional.cross_entropy(
+                ...         input=prediction, label=input_y,
+                ...         reduction='none', use_softmax=False
+                ...     )
+                ...     sum_cost = paddle.mean(cost)
+                ...     return sum_cost, fc_1, prediction
+
+                >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+                >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+                >>> cost, fc_1, pred = mlp(input_x, input_y)
+                >>> print("Finished FF")
+                Finished FF
+
+                >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+                >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+                >>> sgd._set_checkpoints([fc_1, pred])
+                >>> params_grads = sgd.backward(
+                ...     cost,
+                ...     startup_program=None,
+                ...     parameter_list=None,
+                ...     no_grad_set=None)
+                >>> print("Finished backward")
+                Finished backward
         """
         assert (
             self._checkpoints is not None
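Since `backward` returns plain `(param, grad)` pairs, the result can be inspected before any update is applied; a short sketch assuming the `params_grads` produced in the example above:

    .. code-block:: python

        >>> # Hypothetical follow-on to the backward() example above.
        >>> for param, grad in params_grads:
        ...     print(param.name, grad.name)  # each parameter paired with its gradient variable
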
@@ -733,39 +746,41 @@ def apply_optimize(self, loss, startup_program, params_grads):
             params_grads (list): list of (param, grad) pair to do optimization.
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.base as base
-                paddle.enable_static()
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                    prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                    cost = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=input_y,
-                        reduction='none', use_softmax=False
-                    )
-                    sum_cost = paddle.mean(cost)
-                    return sum_cost, fc_1, prediction
-
-                input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-                input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = paddle.optimizer.Adam(learning_rate=0.01)
-                sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None)
-
-                optimize_ops = sgd.apply_optimize(
-                    cost, startup_program=None, params_grads=params_grads)
-
-                print("Finished apply_optimize")
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+                ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+                ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+                ...     cost = paddle.nn.functional.cross_entropy(
+                ...         input=prediction, label=input_y,
+                ...         reduction='none', use_softmax=False
+                ...     )
+                ...     sum_cost = paddle.mean(cost)
+                ...     return sum_cost, fc_1, prediction
+
+                >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+                >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+                >>> cost, fc_1, pred = mlp(input_x, input_y)
+                >>> print("Finished FF")
+                Finished FF
+
+                >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+                >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+                >>> sgd._set_checkpoints([fc_1, pred])
+                >>> params_grads = sgd.backward(
+                ...     cost,
+                ...     startup_program=None,
+                ...     parameter_list=None,
+                ...     no_grad_set=None)
+
+                >>> optimize_ops = sgd.apply_optimize(
+                ...     cost, startup_program=None, params_grads=params_grads)
+
+                >>> print("Finished apply_optimize")
+                Finished apply_optimize
         """
 
         func = (
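Shape background for the `weight_quantize` example below: symmetric per-output-channel weight-only quantization turns an `[in, out]` floating-point matrix into an `[out, in]` int8 matrix plus one float scale per output channel, which is why a `[64, 32]` input yields a `[32, 64]` tensor and a `[32]` scale. A rough NumPy sketch of that arithmetic (illustrative only; the actual kernel's rounding and memory layout may differ):

    .. code-block:: python

        >>> import numpy as np
        >>> w = np.random.randn(64, 32).astype('float32')  # [in, out]
        >>> scale = np.abs(w).max(axis=0) / 127.0          # one scale per output channel
        >>> q = np.clip(np.round(w / scale), -127, 127).astype('int8').T
        >>> print(q.shape, scale.shape)
        (32, 64) (32,)
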
diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py
index 803135ff9f5a6..96cfcce58d95f 100644
--- a/python/paddle/nn/quant/quantized_linear.py
+++ b/python/paddle/nn/quant/quantized_linear.py
@@ -31,16 +31,17 @@ def weight_quantize(x, algo="weight_only_int8"):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import numpy as np
-            from paddle.nn.quant import weight_quantize
-
-            paddle.device.set_device("cpu")
-            x = np.random.randn(64, 32).astype('float16')
-            x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace())
-            out, scale = weight_quantize(x, algo='weight_only_int8')
-            print(out.shape) # [32, 64]
-            print(scale.shape) # [32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_quantize
+
+            >>> paddle.seed(2023)
+            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
+            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
+            >>> print(out.shape)
+            [32, 64]
+            >>> print(scale.shape)
+            [32]
     """
     if in_dynamic_mode():
@@ -84,17 +85,18 @@ def weight_only_linear(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.nn.quant import weight_only_linear
-
-            x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
-            weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
-            scale = paddle.randn([32], dtype='float32')
-            bias = paddle.cast(paddle.randn([32]), dtype='float16')
-            if paddle.device.cuda.get_device_capability()[0] >= 8:
-                out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
-                print(out.shape) # [1, 2, 32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_only_linear
+
+            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
+            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
+            >>> scale = paddle.randn([32], dtype='float32')
+            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
+            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
+            ...     out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
+            ...     print(out.shape)
+            [1, 2, 32]
     """
     if in_dynamic_mode():
         out = _C_ops.weight_only_linear(
@@ -151,17 +153,18 @@ def llm_int8_linear(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.nn.quant import llm_int8_linear
-
-            x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
-            weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
-            scale = paddle.randn([32], dtype='float32')
-            bias = paddle.cast(paddle.randn([32]), dtype='float16')
-            if paddle.device.cuda.get_device_capability()[0] >= 8:
-                out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
-                print(out.shape) # [1, 2, 32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import llm_int8_linear
+
+            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
+            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
+            >>> scale = paddle.randn([32], dtype='float32')
+            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
+            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
+            ...     out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
+            ...     print(out.shape)
+            [1, 2, 32]
     """
     if in_dynamic_mode():
         out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 2e1314a3a1536..59176c7b07d2e 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -312,11 +312,11 @@ def state_dict(self):
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                emb = paddle.nn.Embedding(10, 10)
+                >>> import paddle
+                >>> emb = paddle.nn.Embedding(10, 10)
 
-                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
-                state_dict = adam.state_dict()
+                >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+                >>> state_dict = adam.state_dict()
         '''
         state_dict = {}
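The `state_dict` example stops at retrieval; the usual round trip persists the dict and restores it into the optimizer later. A sketch using Paddle's generic save/load APIs (the file name is illustrative):

    .. code-block:: python

        >>> import paddle
        >>> emb = paddle.nn.Embedding(10, 10)
        >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
        >>> paddle.save(adam.state_dict(), 'adam.pdopt')    # persist optimizer state
        >>> adam.set_state_dict(paddle.load('adam.pdopt'))  # restore it later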