diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py
index 6c0e80b1f5710..b7ae315576d79 100644
--- a/python/paddle/incubate/optimizer/pipeline.py
+++ b/python/paddle/incubate/optimizer/pipeline.py
@@ -48,47 +48,47 @@ class PipelineOptimizer:
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
-            import paddle.base.layers as layers
-            import numpy as np
-
-            paddle.enable_static()
-            with base.device_guard("gpu:0"):
-                x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
-                y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
-                data_loader = base.io.DataLoader.from_generator(
-                    feed_list=[x, y],
-                    capacity=64,
-                    use_double_buffer=True,
-                    iterable=False)
-
-                emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
-                emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-
-            with base.device_guard("gpu:1"):
-                concat = layers.concat([emb_x, emb_y], axis=1)
-                fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
-                loss = paddle.mean(fc)
-            optimizer = paddle.optimizer.SGD(learning_rate=0.5)
-            optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
-            optimizer.minimize(loss)
-
-            def train_reader():
-                for _ in range(4):
-                    x = np.random.random(size=[1]).astype('int64')
-                    y = np.random.random(size=[1]).astype('int64')
-                    yield x, y
-            data_loader.set_sample_generator(train_reader, batch_size=1)
-
-            place = base.CUDAPlace(0)
-            exe = base.Executor(place)
-            exe.run(base.default_startup_program())
-            batch_size = 1
-            data_loader.start()
-            exe.train_from_dataset(
-                base.default_main_program())
-            data_loader.reset()
+            >>> import paddle
+            >>> import paddle.base as base
+            >>> import paddle.base.layers as layers
+            >>> import numpy as np
+
+            >>> paddle.enable_static()
+            >>> with base.device_guard("gpu:0"):
+            ...     x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
+            ...     y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
+            ...     data_loader = base.io.DataLoader.from_generator(
+            ...         feed_list=[x, y],
+            ...         capacity=64,
+            ...         use_double_buffer=True,
+            ...         iterable=False)
+            ...     emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
+            ...     emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
+
+            >>> with base.device_guard("gpu:1"):
+            ...     concat = layers.concat([emb_x, emb_y], axis=1)
+            ...     fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
+            ...     loss = paddle.mean(fc)
+            >>> optimizer = paddle.optimizer.SGD(learning_rate=0.5)
+            >>> optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
+            >>> optimizer.minimize(loss)
+
+            >>> def train_reader():
+            ...     for _ in range(4):
+            ...         x = np.random.random(size=[1]).astype('int64')
+            ...         y = np.random.random(size=[1]).astype('int64')
+            ...         yield x, y
+            >>> data_loader.set_sample_generator(train_reader, batch_size=1)
+
+            >>> place = paddle.CUDAPlace(0)
+            >>> exe = paddle.static.Executor(place)
+            >>> exe.run(paddle.static.default_startup_program())
+            >>> batch_size = 1
+            >>> data_loader.start()
+            >>> exe.train_from_dataset(
+            ...     paddle.static.default_main_program())
+            >>> data_loader.reset()
     """
 
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
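A note on the doctest convention these samples now follow: statements carry a `>>> ` prompt, continuations carry `... `, and expected output appears unprefixed on the following lines, which lets the sample-code checker execute and verify each snippet. A minimal illustration of the convention (not part of the patch):

    .. code-block:: python

        >>> import paddle
        >>> t = paddle.to_tensor([1, 2, 3])
        >>> print(t.shape)
        [3]

Examples that cannot run on CI hardware are marked with a `# doctest: +SKIP(...)` directive instead, as the quantization samples further down do.
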
diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py
index 9cbd8894f1889..2545115fa0d01 100644
--- a/python/paddle/incubate/optimizer/recompute.py
+++ b/python/paddle/incubate/optimizer/recompute.py
@@ -49,45 +49,57 @@ class RecomputeOptimizer(Optimizer):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
-            import numpy as np
-
-            paddle.enable_static()
-
-            def gen_data():
-                return {"x": np.random.random(size=(32, 32)).astype('float32'),
-                        "y": np.random.randint(2, size=(32, 1)).astype('int64')}
-            def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                print(input_x)
-                fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                cost = paddle.nn.functional.cross_entropy(
-                    input=prediction, label=input_y,
-                    reduction='none', use_softmax=False
-                )
-                sum_cost = paddle.mean(cost)
-                return sum_cost, fc_1, prediction
-            input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-            input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-            cost, fc_1, pred = mlp(input_x, input_y)
-
-            sgd = paddle.optimizer.Adam(learning_rate=0.01)
-            sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-            sgd._set_checkpoints([fc_1, pred])
-            sgd.minimize(cost)
-
-            print("Finished optimize")
-            place = base.CPUPlace()
-            exe = base.Executor(place)
-            exe.run(base.default_startup_program())
-            step = 10
-
-            for i in range(step):
-                cost_val = exe.run(feed=gen_data(),
-                                   program=base.default_main_program(),
-                                   fetch_list=[cost.name])
-                print("step=%d cost=%f" % (i, cost_val[0]))
+            >>> import paddle
+            >>> import numpy as np
+
+            >>> paddle.enable_static()
+
+            >>> def gen_data():
+            ...     return {"x": np.random.random(size=(32, 32)).astype('float32'),
+            ...             "y": np.random.randint(2, size=(32, 1)).astype('int64')}
+            >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+            ...     print(input_x)
+            ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+            ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+            ...     cost = paddle.nn.functional.cross_entropy(
+            ...         input=prediction, label=input_y,
+            ...         reduction='none', use_softmax=False
+            ...     )
+            ...     sum_cost = paddle.mean(cost)
+            ...     return sum_cost, fc_1, prediction
+            >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+            >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+            >>> cost, fc_1, pred = mlp(input_x, input_y)
+            var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True)
+
+            >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+            >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+            >>> sgd._set_checkpoints([fc_1, pred])
+            >>> sgd.minimize(cost)
+
+            >>> print("Finished optimize")
+            Finished optimize
+            >>> place = paddle.CPUPlace()
+            >>> exe = paddle.static.Executor(place)
+            >>> exe.run(paddle.static.default_startup_program())
+            >>> step = 10
+
+            >>> for i in range(step):
+            ...     cost_val = exe.run(feed=gen_data(),
+            ...                        program=paddle.static.default_main_program(),
+            ...                        fetch_list=[cost.name])
+            ...     print("step=%d cost=%f" % (i, cost_val[0]))
+            step=0 cost=0.737203
+            step=1 cost=1.308077
+            step=2 cost=0.768422
+            step=3 cost=1.239475
+            step=4 cost=0.882643
+            step=5 cost=0.738027
+            step=6 cost=0.819374
+            step=7 cost=0.818534
+            step=8 cost=0.753692
+            step=9 cost=0.787448
     """
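The per-step costs printed above depend on random parameter initialization and random input batches, so the listed values are indicative rather than guaranteed. If stable output is wanted, one option (a sketch, not part of the patch) is to pin both generators before building the program; exact values may still vary across devices and Paddle versions:

    .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> paddle.enable_static()
        >>> paddle.seed(2023)     # makes the startup program's parameter initializers deterministic
        >>> np.random.seed(2023)  # makes the gen_data() batches deterministic
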
print("step=%d cost=%f" % (i, cost_val[0])) + var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) + Finished optimize + step=0 cost=0.737203 + step=1 cost=1.308077 + step=2 cost=0.768422 + step=3 cost=1.239475 + step=4 cost=0.882643 + step=5 cost=0.738027 + step=6 cost=0.819374 + step=7 cost=0.818534 + step=8 cost=0.753692 + step=9 cost=0.787448 """ @@ -132,33 +144,34 @@ def load(self, state_dict): Examples: .. code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - try: - state_dict = {} - sgd.load(state_dict) - except NotImplementedError as e: - print(e) + >>> import paddle + + >>> paddle.enable_static() + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> try: + ... state_dict = {} + ... sgd.load(state_dict) + >>> except NotImplementedError as e: + ... print(e) + load function is not supported by Recompute Optimizer for now """ raise NotImplementedError( "load function is not supported by Recompute Optimizer for now" @@ -177,42 +190,42 @@ def apply_gradients(self, params_grads): Examples: .. 
@@ -177,42 +190,42 @@ def apply_gradients(self, params_grads):
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.base as base
-                import paddle.base.framework as framework
-
-                paddle.enable_static()
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                    prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                    cost = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=input_y,
-                        reduction='none', use_softmax=False
-                    )
-                    sum_cost = paddle.mean(cost)
-                    return sum_cost, fc_1, prediction
-
-
-                input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-                input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = paddle.optimizer.Adam(learning_rate=0.01)
-                sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None)
-
-                program = cost.block.program
-                with framework.program_guard(program, None):
-                    optimize_ops = sgd.apply_gradients(params_grads)
-
-                print("Finished apply gradients")
+                >>> import paddle
+                >>> import paddle.base.framework as framework
+
+                >>> paddle.enable_static()
+
+                >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+                ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+                ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+                ...     cost = paddle.nn.functional.cross_entropy(
+                ...         input=prediction, label=input_y,
+                ...         reduction='none', use_softmax=False
+                ...     )
+                ...     sum_cost = paddle.mean(cost)
+                ...     return sum_cost, fc_1, prediction
+
+                >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+                >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+                >>> cost, fc_1, pred = mlp(input_x, input_y)
+                >>> print("Finished FF")
+                Finished FF
+
+                >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+                >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+                >>> sgd._set_checkpoints([fc_1, pred])
+                >>> params_grads = sgd.backward(
+                ...     cost,
+                ...     startup_program=None,
+                ...     parameter_list=None,
+                ...     no_grad_set=None)
+
+                >>> program = cost.block.program
+                >>> with framework.program_guard(program, None):
+                ...     optimize_ops = sgd.apply_gradients(params_grads)
+
+                >>> print("Finished apply gradients")
+                Finished apply gradients
         """
         return self._optimizer.apply_gradients(params_grads=params_grads)
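For orientation, `backward` followed by `apply_gradients` is the two-step spelling of what `minimize` does in a single call. A self-contained sketch of the one-call form on a hypothetical toy network (`minimize` returns both the optimize ops and the `(param, grad)` list):

    .. code-block:: python

        >>> import paddle
        >>> paddle.enable_static()
        >>> x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32')
        >>> fc = paddle.static.nn.fc(x=x, size=1)
        >>> loss = paddle.mean(fc)
        >>> opt = paddle.incubate.optimizer.RecomputeOptimizer(
        ...     paddle.optimizer.Adam(learning_rate=0.01))
        >>> opt._set_checkpoints([fc])
        >>> optimize_ops, params_grads = opt.minimize(loss)
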
@@ -651,36 +664,36 @@ def backward(
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.base as base
-
-                paddle.enable_static()
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                    prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                    cost = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=input_y,
-                        reduction='none', use_softmax=False
-                    )
-                    sum_cost = paddle.mean(cost)
-                    return sum_cost, fc_1, prediction
-
-
-                input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-                input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = paddle.optimizer.Adam(learning_rate=0.01)
-                sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None)
-                print("Finished backward")
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+                ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+                ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+                ...     cost = paddle.nn.functional.cross_entropy(
+                ...         input=prediction, label=input_y,
+                ...         reduction='none', use_softmax=False
+                ...     )
+                ...     sum_cost = paddle.mean(cost)
+                ...     return sum_cost, fc_1, prediction
+
+                >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+                >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+                >>> cost, fc_1, pred = mlp(input_x, input_y)
+                >>> print("Finished FF")
+                Finished FF
+
+                >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+                >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+                >>> sgd._set_checkpoints([fc_1, pred])
+                >>> params_grads = sgd.backward(
+                ...     cost,
+                ...     startup_program=None,
+                ...     parameter_list=None,
+                ...     no_grad_set=None)
+                >>> print("Finished backward")
+                Finished backward
         """
         assert (
             self._checkpoints is not None
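Since `backward` returns plain `(param, grad)` pairs, the result can be inspected before any update is applied; a short sketch assuming the `params_grads` produced in the example above:

    .. code-block:: python

        >>> # Hypothetical follow-on to the backward() example above.
        >>> for param, grad in params_grads:
        ...     print(param.name, grad.name)  # each parameter paired with its gradient variable
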
@@ -733,39 +746,41 @@ def apply_optimize(self, loss, startup_program, params_grads):
             params_grads (list): list of (param, grad) pair to do optimization.
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.base as base
-                paddle.enable_static()
-
-                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
-                    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
-                    prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
-                    cost = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=input_y,
-                        reduction='none', use_softmax=False
-                    )
-                    sum_cost = paddle.mean(cost)
-                    return sum_cost, fc_1, prediction
-
-                input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
-                input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
-                cost, fc_1, pred = mlp(input_x, input_y)
-                print("Finished FF")
-
-                sgd = paddle.optimizer.Adam(learning_rate=0.01)
-                sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
-                sgd._set_checkpoints([fc_1, pred])
-                params_grads = sgd.backward(
-                    cost,
-                    startup_program=None,
-                    parameter_list=None,
-                    no_grad_set=None)
-
-                optimize_ops = sgd.apply_optimize(
-                    cost, startup_program=None, params_grads=params_grads)
-
-                print("Finished apply_optimize")
+                >>> import paddle
+
+                >>> paddle.enable_static()
+
+                >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+                ...     fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+                ...     prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+                ...     cost = paddle.nn.functional.cross_entropy(
+                ...         input=prediction, label=input_y,
+                ...         reduction='none', use_softmax=False
+                ...     )
+                ...     sum_cost = paddle.mean(cost)
+                ...     return sum_cost, fc_1, prediction
+
+                >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+                >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+                >>> cost, fc_1, pred = mlp(input_x, input_y)
+                >>> print("Finished FF")
+                Finished FF
+
+                >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+                >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+                >>> sgd._set_checkpoints([fc_1, pred])
+                >>> params_grads = sgd.backward(
+                ...     cost,
+                ...     startup_program=None,
+                ...     parameter_list=None,
+                ...     no_grad_set=None)
+
+                >>> optimize_ops = sgd.apply_optimize(
+                ...     cost, startup_program=None, params_grads=params_grads)
+
+                >>> print("Finished apply_optimize")
+                Finished apply_optimize
         """
 
         func = (
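Shape background for the `weight_quantize` example below: symmetric per-output-channel weight-only quantization turns an `[in, out]` floating-point matrix into an `[out, in]` int8 matrix plus one float scale per output channel, which is why a `[64, 32]` input yields a `[32, 64]` tensor and a `[32]` scale. A rough NumPy sketch of that arithmetic (illustrative only; the actual kernel's rounding and memory layout may differ):

    .. code-block:: python

        >>> import numpy as np
        >>> w = np.random.randn(64, 32).astype('float32')  # [in, out]
        >>> scale = np.abs(w).max(axis=0) / 127.0          # one scale per output channel
        >>> q = np.clip(np.round(w / scale), -127, 127).astype('int8').T
        >>> print(q.shape, scale.shape)
        (32, 64) (32,)
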
diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py
index 803135ff9f5a6..96cfcce58d95f 100644
--- a/python/paddle/nn/quant/quantized_linear.py
+++ b/python/paddle/nn/quant/quantized_linear.py
@@ -31,16 +31,17 @@ def weight_quantize(x, algo="weight_only_int8"):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import numpy as np
-            from paddle.nn.quant import weight_quantize
-
-            paddle.device.set_device("cpu")
-            x = np.random.randn(64, 32).astype('float16')
-            x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace())
-            out, scale = weight_quantize(x, algo='weight_only_int8')
-            print(out.shape) # [32, 64]
-            print(scale.shape) # [32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_quantize
+
+            >>> paddle.seed(2023)
+            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
+            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
+            >>> print(out.shape)
+            [32, 64]
+            >>> print(scale.shape)
+            [32]
     """
     if in_dynamic_mode():
@@ -84,17 +85,18 @@ def weight_only_linear(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.nn.quant import weight_only_linear
-
-            x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
-            weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
-            scale = paddle.randn([32], dtype='float32')
-            bias = paddle.cast(paddle.randn([32]), dtype='float16')
-            if paddle.device.cuda.get_device_capability()[0] >= 8:
-                out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
-                print(out.shape) # [1, 2, 32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import weight_only_linear
+
+            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
+            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
+            >>> scale = paddle.randn([32], dtype='float32')
+            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
+            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
+            ...     out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
+            ...     print(out.shape)
+            [1, 2, 32]
     """
     if in_dynamic_mode():
         out = _C_ops.weight_only_linear(
@@ -151,17 +153,18 @@ def llm_int8_linear(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.nn.quant import llm_int8_linear
-
-            x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
-            weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
-            scale = paddle.randn([32], dtype='float32')
-            bias = paddle.cast(paddle.randn([32]), dtype='float16')
-            if paddle.device.cuda.get_device_capability()[0] >= 8:
-                out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
-                print(out.shape) # [1, 2, 32]
+            >>> # doctest: +SKIP('No testing required')
+            >>> import paddle
+            >>> from paddle.nn.quant import llm_int8_linear
+
+            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
+            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
+            >>> scale = paddle.randn([32], dtype='float32')
+            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
+            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
+            ...     out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
+            ...     print(out.shape)
+            [1, 2, 32]
     """
     if in_dynamic_mode():
         out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 2e1314a3a1536..59176c7b07d2e 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -312,11 +312,11 @@ def state_dict(self):
 
         Examples:
             .. code-block:: python
 
-                import paddle
-                emb = paddle.nn.Embedding(10, 10)
+                >>> import paddle
+                >>> emb = paddle.nn.Embedding(10, 10)
 
-                adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
-                state_dict = adam.state_dict()
+                >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+                >>> state_dict = adam.state_dict()
         '''
         state_dict = {}
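The `state_dict` example stops at retrieval; the usual round trip persists the dict and restores it into the optimizer later. A sketch using Paddle's generic save/load APIs (the file name is illustrative):

    .. code-block:: python

        >>> import paddle
        >>> emb = paddle.nn.Embedding(10, 10)
        >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
        >>> paddle.save(adam.state_dict(), 'adam.pdopt')    # persist optimizer state
        >>> adam.set_state_dict(paddle.load('adam.pdopt'))  # restore it later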