From 3ccf152fc9c4576bf86b90826ce32ce216a362ba Mon Sep 17 00:00:00 2001
From: yuchen202 <103028470+yuchen202@users.noreply.github.com>
Date: Thu, 28 Sep 2023 00:07:44 +0800
Subject: [PATCH 1/8] 0928

---
 adam.pdopt | Bin 0 -> 101 bytes
 emb.pdparams | Bin 0 -> 618 bytes
 python/paddle/incubate/optimizer/pipeline.py | 82 ++--
 python/paddle/incubate/optimizer/recompute.py | 349 +++++++++---------
 python/paddle/nn/quant/quantized_linear.py | 68 ++--
 python/paddle/optimizer/optimizer.py | 8 +-
 6 files changed, 265 insertions(+), 242 deletions(-)
 create mode 100644 adam.pdopt
 create mode 100644 emb.pdparams

diff --git a/adam.pdopt b/adam.pdopt
new file mode 100644
index 0000000000000000000000000000000000000000..89b2cb911e622388ee7957b17141fbf53dc7c2ac
GIT binary patch
[binary patch data omitted: adam.pdopt, 101 bytes]

diff --git a/emb.pdparams b/emb.pdparams
new file mode 100644
index 0000000000000000000000000000000000000000..1b61b63835c52734a0c04d1a93a1a0386d7b2bce
GIT binary patch
[binary patch data omitted: emb.pdparams, 618 bytes]

diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py
index 6c0e80b1f5710..ad0083ff5f000 100644
--- a/python/paddle/incubate/optimizer/pipeline.py
+++ b/python/paddle/incubate/optimizer/pipeline.py
@@ -48,47 +48,47 @@ class PipelineOptimizer:
 Examples:
 .. code-block:: python

- import paddle
- import paddle.base as base
- import paddle.base.layers as layers
- import numpy as np
-
- paddle.enable_static()
- with base.device_guard("gpu:0"):
- x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
- y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
- data_loader = base.io.DataLoader.from_generator(
- feed_list=[x, y],
- capacity=64,
- use_double_buffer=True,
- iterable=False)
-
- emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
- emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-
- with base.device_guard("gpu:1"):
- concat = layers.concat([emb_x, emb_y], axis=1)
- fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
- loss = paddle.mean(fc)
- optimizer = paddle.optimizer.SGD(learning_rate=0.5)
- optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
- optimizer.minimize(loss)
-
- def train_reader():
- for _ in range(4):
- x = np.random.random(size=[1]).astype('int64')
- y = np.random.random(size=[1]).astype('int64')
- yield x, y
- data_loader.set_sample_generator(train_reader, batch_size=1)
-
- place = base.CUDAPlace(0)
- exe = base.Executor(place)
- exe.run(base.default_startup_program())
- batch_size = 1
- data_loader.start()
- exe.train_from_dataset(
- base.default_main_program())
- data_loader.reset()
+ >>> import paddle
+ >>> import paddle.base as base
+ >>> import paddle.base.layers as layers
+ >>> import numpy as np
+
+ >>> paddle.enable_static()
+ >>> with base.device_guard("gpu:0"):
+ ... x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
+ ... y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
+ ... data_loader = base.io.DataLoader.from_generator(
+ ... feed_list=[x, y],
+ ... capacity=64,
+ ... use_double_buffer=True,
+ ... iterable=False)
+
+ ... emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
+ ... emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
+
+ >>> with base.device_guard("gpu:1"):
+ ... concat = layers.concat([emb_x, emb_y], axis=1)
+ ... fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
+ ... loss = paddle.mean(fc)
+ >>> optimizer = paddle.optimizer.SGD(learning_rate=0.5)
+ >>> optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
+ >>> optimizer.minimize(loss)
+
+ >>> def train_reader():
+ ... for _ in range(4):
+ ... x = np.random.random(size=[1]).astype('int64')
+ ... y = np.random.random(size=[1]).astype('int64')
+ ... yield x, y
+ >>> data_loader.set_sample_generator(train_reader, batch_size=1)
+
+ >>> place = base.CUDAPlace(0)
+ >>> exe = base.Executor(place)
+ >>> exe.run(base.default_startup_program())
+ >>> batch_size = 1
+ >>> data_loader.start()
+ >>> exe.train_from_dataset(
+ ... 
base.default_main_program()) + >>> data_loader.reset() """ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py index 9cbd8894f1889..92f397e3984c9 100644 --- a/python/paddle/incubate/optimizer/recompute.py +++ b/python/paddle/incubate/optimizer/recompute.py @@ -49,45 +49,58 @@ class RecomputeOptimizer(Optimizer): Examples: .. code-block:: python - import paddle - import paddle.base as base - import numpy as np - - paddle.enable_static() - - def gen_data(): - return {"x": np.random.random(size=(32, 32)).astype('float32'), - "y": np.random.randint(2, size=(32, 1)).astype('int64')} - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - print(input_x) - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - sgd.minimize(cost) - - print("Finished optimize") - place = base.CPUPlace() - exe = base.Executor(place) - exe.run(base.default_startup_program()) - step = 10 - - for i in range(step): - cost_val = exe.run(feed=gen_data(), - program=base.default_main_program(), - fetch_list=[cost.name]) - print("step=%d cost=%f" % (i, cost_val[0])) + >>> import paddle + >>> import paddle.base as base + >>> import numpy as np + + >>> paddle.enable_static() + + >>> def gen_data(): + ... return {"x": np.random.random(size=(32, 32)).astype('float32'), + ... "y": np.random.randint(2, size=(32, 1)).astype('int64')} + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... print(input_x) + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> sgd.minimize(cost) + + >>> print("Finished optimize") + Finished optimize + >>> place = base.CPUPlace() + >>> exe = base.Executor(place) + >>> exe.run(base.default_startup_program()) + >>> step = 10 + + >>> for i in range(step): + ... cost_val = exe.run(feed=gen_data(), + ... program=base.default_main_program(), + ... fetch_list=[cost.name]) + ... 
print("step=%d cost=%f" % (i, cost_val[0])) + var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) + Finished optimize + step=0 cost=0.737203 + step=1 cost=1.308077 + step=2 cost=0.768422 + step=3 cost=1.239475 + step=4 cost=0.882643 + step=5 cost=0.738027 + step=6 cost=0.819374 + step=7 cost=0.818534 + step=8 cost=0.753692 + step=9 cost=0.787448 """ @@ -132,33 +145,35 @@ def load(self, state_dict): Examples: .. code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - try: - state_dict = {} - sgd.load(state_dict) - except NotImplementedError as e: - print(e) + >>> import paddle + >>> import paddle.base as base + + >>> paddle.enable_static() + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> try: + ... state_dict = {} + ... sgd.load(state_dict) + >>> except NotImplementedError as e: + ... print(e) + load function is not supported by Recompute Optimizer for now """ raise NotImplementedError( "load function is not supported by Recompute Optimizer for now" @@ -177,42 +192,43 @@ def apply_gradients(self, params_grads): Examples: .. 
code-block:: python - import paddle - import paddle.base as base - import paddle.base.framework as framework - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - program = cost.block.program - with framework.program_guard(program, None): - optimize_ops = sgd.apply_gradients(params_grads) - - print("Finished apply gradients") + >>> import paddle + >>> import paddle.base as base + >>> import paddle.base.framework as framework + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + + >>> program = cost.block.program + >>> with framework.program_guard(program, None): + ... optimize_ops = sgd.apply_gradients(params_grads) + + >>> print("Finished apply gradients") + Finished apply gradients """ return self._optimizer.apply_gradients(params_grads=params_grads) @@ -651,36 +667,37 @@ def backward( Examples: .. 
code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - print("Finished backward") + >>> import paddle + >>> import paddle.base as base + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + >>> print("Finished backward") + Finished backward """ assert ( self._checkpoints is not None @@ -733,39 +750,41 @@ def apply_optimize(self, loss, startup_program, params_grads): params_grads (list): list of (param, grad) pair to do optimization. Examples: .. code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - optimize_ops = sgd.apply_optimize( - cost, startup_program=None, params_grads=params_grads) - - print("Finished apply_optimize") + >>> import paddle + >>> import paddle.base as base + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... 
prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + + >>> optimize_ops = sgd.apply_optimize( + ... cost, startup_program=None, params_grads=params_grads) + + >>> print("Finished apply_optimize") + Finished apply_optimize """ func = ( diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 803135ff9f5a6..3d8102b9027ed 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -31,16 +31,18 @@ def weight_quantize(x, algo="weight_only_int8"): Examples: .. code-block:: python - import paddle - import numpy as np - from paddle.nn.quant import weight_quantize - - paddle.device.set_device("cpu") - x = np.random.randn(64, 32).astype('float16') - x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) - out, scale = weight_quantize(x, algo='weight_only_int8') - print(out.shape) # [32, 64] - print(scale.shape) # [32] + >>> import paddle + >>> import numpy as np + >>> from paddle.nn.quant import weight_quantize + + >>> paddle.device.set_device("cpu") + >>> x = np.random.randn(64, 32).astype('float16') + >>> x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) + >>> out, scale = weight_quantize(x, algo='weight_only_int8') + >>> print(out.shape) + [32, 64] + >>> print(scale.shape) + [32] """ if in_dynamic_mode(): @@ -84,17 +86,18 @@ def weight_only_linear( Examples: .. code-block:: python - # required: gpu - import paddle - from paddle.nn.quant import weight_only_linear - - x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') - weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') - scale = paddle.randn([32], dtype='float32') - bias = paddle.cast(paddle.randn([32]), dtype='float16') - if paddle.device.cuda.get_device_capability()[0] >= 8: - out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') - print(out.shape) # [1, 2, 32] + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> from paddle.nn.quant import weight_only_linear + + >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') + >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') + >>> scale = paddle.randn([32], dtype='float32') + >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') + >>> if paddle.device.cuda.get_device_capability()[0] >= 8: + ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') + ... print(out.shape) + [1, 2, 32] """ if in_dynamic_mode(): out = _C_ops.weight_only_linear( @@ -151,17 +154,18 @@ def llm_int8_linear( Examples: .. 
code-block:: python

- # required: gpu
- import paddle
- from paddle.nn.quant import llm_int8_linear
-
- x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
- weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
- scale = paddle.randn([32], dtype='float32')
- bias = paddle.cast(paddle.randn([32]), dtype='float16')
- if paddle.device.cuda.get_device_capability()[0] >= 8:
- out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
- print(out.shape) # [1, 2, 32]
+ >>> # doctest: +REQUIRES(env:GPU)
+ >>> import paddle
+ >>> from paddle.nn.quant import llm_int8_linear
+
+ >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
+ >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
+ >>> scale = paddle.randn([32], dtype='float32')
+ >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
+ >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
+ ... out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
+ ... print(out.shape)
+ [1, 2, 32]
 """
 if in_dynamic_mode():
 out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 2e1314a3a1536..59176c7b07d2e 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -312,11 +312,11 @@ def state_dict(self):
 Examples:
 .. code-block:: python

- import paddle
- emb = paddle.nn.Embedding(10, 10)
+ >>> import paddle
+ >>> emb = paddle.nn.Embedding(10, 10)

- adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
- state_dict = adam.state_dict()
+ >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
+ >>> state_dict = adam.state_dict()
 '''
 state_dict = {}

From fea8d4bf08830f2f39a40f568b6d6a4f104e049f Mon Sep 17 00:00:00 2001
From: yuchen202 <103028470+yuchen202@users.noreply.github.com>
Date: Thu, 28 Sep 2023 00:13:12 +0800
Subject: [PATCH 2/8] Revert "0928"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Not sure why 6 files showed up earlier; it is down to 4 now.

---
 python/paddle/incubate/optimizer/pipeline.py | 82 ++--
 python/paddle/incubate/optimizer/recompute.py | 349 +++++++++---------
 python/paddle/nn/quant/quantized_linear.py | 68 ++--
 python/paddle/optimizer/optimizer.py | 8 +-
 4 files changed, 242 insertions(+), 265 deletions(-)

diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py
index ad0083ff5f000..6c0e80b1f5710 100644
--- a/python/paddle/incubate/optimizer/pipeline.py
+++ b/python/paddle/incubate/optimizer/pipeline.py
@@ -48,47 +48,47 @@ class PipelineOptimizer:
 Examples:
 .. code-block:: python

- >>> import paddle
- >>> import paddle.base as base
- >>> import paddle.base.layers as layers
- >>> import numpy as np
-
- >>> paddle.enable_static()
- >>> with base.device_guard("gpu:0"):
- ... x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0)
- ... y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0)
- ... data_loader = base.io.DataLoader.from_generator(
- ... feed_list=[x, y],
- ... capacity=64,
- ... use_double_buffer=True,
- ... iterable=False)
-
- ... emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
- ... emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-
- >>> with base.device_guard("gpu:1"):
- ... concat = layers.concat([emb_x, emb_y], axis=1)
- ... 
fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) - ... loss = paddle.mean(fc) - >>> optimizer = paddle.optimizer.SGD(learning_rate=0.5) - >>> optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) - >>> optimizer.minimize(loss) - - >>> def train_reader(): - ... for _ in range(4): - ... x = np.random.random(size=[1]).astype('int64') - ... y = np.random.random(size=[1]).astype('int64') - ... yield x, y - >>> data_loader.set_sample_generator(train_reader, batch_size=1) - - >>> place = base.CUDAPlace(0) - >>> exe = base.Executor(place) - >>> exe.run(base.default_startup_program()) - >>> batch_size = 1 - >>> data_loader.start() - >>> exe.train_from_dataset( - ... base.default_main_program()) - >>> data_loader.reset() + import paddle + import paddle.base as base + import paddle.base.layers as layers + import numpy as np + + paddle.enable_static() + with base.device_guard("gpu:0"): + x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) + y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) + data_loader = base.io.DataLoader.from_generator( + feed_list=[x, y], + capacity=64, + use_double_buffer=True, + iterable=False) + + emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False) + emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) + + with base.device_guard("gpu:1"): + concat = layers.concat([emb_x, emb_y], axis=1) + fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) + loss = paddle.mean(fc) + optimizer = paddle.optimizer.SGD(learning_rate=0.5) + optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) + optimizer.minimize(loss) + + def train_reader(): + for _ in range(4): + x = np.random.random(size=[1]).astype('int64') + y = np.random.random(size=[1]).astype('int64') + yield x, y + data_loader.set_sample_generator(train_reader, batch_size=1) + + place = base.CUDAPlace(0) + exe = base.Executor(place) + exe.run(base.default_startup_program()) + batch_size = 1 + data_loader.start() + exe.train_from_dataset( + base.default_main_program()) + data_loader.reset() """ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py index 92f397e3984c9..9cbd8894f1889 100644 --- a/python/paddle/incubate/optimizer/recompute.py +++ b/python/paddle/incubate/optimizer/recompute.py @@ -49,58 +49,45 @@ class RecomputeOptimizer(Optimizer): Examples: .. code-block:: python - >>> import paddle - >>> import paddle.base as base - >>> import numpy as np - - >>> paddle.enable_static() - - >>> def gen_data(): - ... return {"x": np.random.random(size=(32, 32)).astype('float32'), - ... "y": np.random.randint(2, size=(32, 1)).astype('int64')} - >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): - ... print(input_x) - ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - ... cost = paddle.nn.functional.cross_entropy( - ... input=prediction, label=input_y, - ... reduction='none', use_softmax=False - ... ) - ... sum_cost = paddle.mean(cost) - ... 
return sum_cost, fc_1, prediction - >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - >>> cost, fc_1, pred = mlp(input_x, input_y) - - >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) - >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - >>> sgd._set_checkpoints([fc_1, pred]) - >>> sgd.minimize(cost) - - >>> print("Finished optimize") - Finished optimize - >>> place = base.CPUPlace() - >>> exe = base.Executor(place) - >>> exe.run(base.default_startup_program()) - >>> step = 10 - - >>> for i in range(step): - ... cost_val = exe.run(feed=gen_data(), - ... program=base.default_main_program(), - ... fetch_list=[cost.name]) - ... print("step=%d cost=%f" % (i, cost_val[0])) - var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) - Finished optimize - step=0 cost=0.737203 - step=1 cost=1.308077 - step=2 cost=0.768422 - step=3 cost=1.239475 - step=4 cost=0.882643 - step=5 cost=0.738027 - step=6 cost=0.819374 - step=7 cost=0.818534 - step=8 cost=0.753692 - step=9 cost=0.787448 + import paddle + import paddle.base as base + import numpy as np + + paddle.enable_static() + + def gen_data(): + return {"x": np.random.random(size=(32, 32)).astype('float32'), + "y": np.random.randint(2, size=(32, 1)).astype('int64')} + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + print(input_x) + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + sgd.minimize(cost) + + print("Finished optimize") + place = base.CPUPlace() + exe = base.Executor(place) + exe.run(base.default_startup_program()) + step = 10 + + for i in range(step): + cost_val = exe.run(feed=gen_data(), + program=base.default_main_program(), + fetch_list=[cost.name]) + print("step=%d cost=%f" % (i, cost_val[0])) """ @@ -145,35 +132,33 @@ def load(self, state_dict): Examples: .. code-block:: python - >>> import paddle - >>> import paddle.base as base - - >>> paddle.enable_static() - >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): - ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - ... cost = paddle.nn.functional.cross_entropy( - ... input=prediction, label=input_y, - ... reduction='none', use_softmax=False - ... ) - ... sum_cost = paddle.mean(cost) - ... return sum_cost, fc_1, prediction - - >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - >>> cost, fc_1, pred = mlp(input_x, input_y) - >>> print("Finished FF") - Finished FF - - >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) - >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - >>> sgd._set_checkpoints([fc_1, pred]) - >>> try: - ... state_dict = {} - ... sgd.load(state_dict) - >>> except NotImplementedError as e: - ... 
print(e) - load function is not supported by Recompute Optimizer for now + import paddle + import paddle.base as base + + paddle.enable_static() + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + try: + state_dict = {} + sgd.load(state_dict) + except NotImplementedError as e: + print(e) """ raise NotImplementedError( "load function is not supported by Recompute Optimizer for now" @@ -192,43 +177,42 @@ def apply_gradients(self, params_grads): Examples: .. code-block:: python - >>> import paddle - >>> import paddle.base as base - >>> import paddle.base.framework as framework - - >>> paddle.enable_static() - - >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): - ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - ... cost = paddle.nn.functional.cross_entropy( - ... input=prediction, label=input_y, - ... reduction='none', use_softmax=False - ... ) - ... sum_cost = paddle.mean(cost) - ... return sum_cost, fc_1, prediction - - >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - >>> cost, fc_1, pred = mlp(input_x, input_y) - >>> print("Finished FF") - Finished FF - - >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) - >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - >>> sgd._set_checkpoints([fc_1, pred]) - >>> params_grads = sgd.backward( - ... cost, - ... startup_program=None, - ... parameter_list=None, - ... no_grad_set=None) - - >>> program = cost.block.program - >>> with framework.program_guard(program, None): - ... 
optimize_ops = sgd.apply_gradients(params_grads) - - >>> print("Finished apply gradients") - Finished apply gradients + import paddle + import paddle.base as base + import paddle.base.framework as framework + + paddle.enable_static() + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + + program = cost.block.program + with framework.program_guard(program, None): + optimize_ops = sgd.apply_gradients(params_grads) + + print("Finished apply gradients") """ return self._optimizer.apply_gradients(params_grads=params_grads) @@ -667,37 +651,36 @@ def backward( Examples: .. code-block:: python - >>> import paddle - >>> import paddle.base as base - - >>> paddle.enable_static() - - >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): - ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - ... cost = paddle.nn.functional.cross_entropy( - ... input=prediction, label=input_y, - ... reduction='none', use_softmax=False - ... ) - ... sum_cost = paddle.mean(cost) - ... return sum_cost, fc_1, prediction - - >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - >>> cost, fc_1, pred = mlp(input_x, input_y) - >>> print("Finished FF") - Finished FF - - >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) - >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - >>> sgd._set_checkpoints([fc_1, pred]) - >>> params_grads = sgd.backward( - ... cost, - ... startup_program=None, - ... parameter_list=None, - ... 
no_grad_set=None) - >>> print("Finished backward") - Finished backward + import paddle + import paddle.base as base + + paddle.enable_static() + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + print("Finished backward") """ assert ( self._checkpoints is not None @@ -750,41 +733,39 @@ def apply_optimize(self, loss, startup_program, params_grads): params_grads (list): list of (param, grad) pair to do optimization. Examples: .. code-block:: python - >>> import paddle - >>> import paddle.base as base - - >>> paddle.enable_static() - - >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): - ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - ... cost = paddle.nn.functional.cross_entropy( - ... input=prediction, label=input_y, - ... reduction='none', use_softmax=False - ... ) - ... sum_cost = paddle.mean(cost) - ... return sum_cost, fc_1, prediction - - >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - >>> cost, fc_1, pred = mlp(input_x, input_y) - >>> print("Finished FF") - Finished FF - - >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) - >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - >>> sgd._set_checkpoints([fc_1, pred]) - >>> params_grads = sgd.backward( - ... cost, - ... startup_program=None, - ... parameter_list=None, - ... no_grad_set=None) - - >>> optimize_ops = sgd.apply_optimize( - ... 
cost, startup_program=None, params_grads=params_grads) - - >>> print("Finished apply_optimize") - Finished apply_optimize + import paddle + import paddle.base as base + + paddle.enable_static() + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + + optimize_ops = sgd.apply_optimize( + cost, startup_program=None, params_grads=params_grads) + + print("Finished apply_optimize") """ func = ( diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 3d8102b9027ed..803135ff9f5a6 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -31,18 +31,16 @@ def weight_quantize(x, algo="weight_only_int8"): Examples: .. code-block:: python - >>> import paddle - >>> import numpy as np - >>> from paddle.nn.quant import weight_quantize - - >>> paddle.device.set_device("cpu") - >>> x = np.random.randn(64, 32).astype('float16') - >>> x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) - >>> out, scale = weight_quantize(x, algo='weight_only_int8') - >>> print(out.shape) - [32, 64] - >>> print(scale.shape) - [32] + import paddle + import numpy as np + from paddle.nn.quant import weight_quantize + + paddle.device.set_device("cpu") + x = np.random.randn(64, 32).astype('float16') + x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) + out, scale = weight_quantize(x, algo='weight_only_int8') + print(out.shape) # [32, 64] + print(scale.shape) # [32] """ if in_dynamic_mode(): @@ -86,18 +84,17 @@ def weight_only_linear( Examples: .. code-block:: python - >>> # doctest: +REQUIRES(env:GPU) - >>> import paddle - >>> from paddle.nn.quant import weight_only_linear - - >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') - >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') - >>> scale = paddle.randn([32], dtype='float32') - >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') - >>> if paddle.device.cuda.get_device_capability()[0] >= 8: - ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') - ... print(out.shape) - [1, 2, 32] + # required: gpu + import paddle + from paddle.nn.quant import weight_only_linear + + x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') + weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') + scale = paddle.randn([32], dtype='float32') + bias = paddle.cast(paddle.randn([32]), dtype='float16') + if paddle.device.cuda.get_device_capability()[0] >= 8: + out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') + print(out.shape) # [1, 2, 32] """ if in_dynamic_mode(): out = _C_ops.weight_only_linear( @@ -154,18 +151,17 @@ def llm_int8_linear( Examples: .. 
code-block:: python - >>> # doctest: +REQUIRES(env:GPU) - >>> import paddle - >>> from paddle.nn.quant import llm_int8_linear - - >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') - >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') - >>> scale = paddle.randn([32], dtype='float32') - >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') - >>> if paddle.device.cuda.get_device_capability()[0] >= 8: - ... out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) - ... print(out.shape) - [1, 2, 32] + # required: gpu + import paddle + from paddle.nn.quant import llm_int8_linear + + x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') + weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') + scale = paddle.randn([32], dtype='float32') + bias = paddle.cast(paddle.randn([32]), dtype='float16') + if paddle.device.cuda.get_device_capability()[0] >= 8: + out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) + print(out.shape) # [1, 2, 32] """ if in_dynamic_mode(): out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 59176c7b07d2e..2e1314a3a1536 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -312,11 +312,11 @@ def state_dict(self): Examples: .. code-block:: python - >>> import paddle - >>> emb = paddle.nn.Embedding(10, 10) + import paddle + emb = paddle.nn.Embedding(10, 10) - >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) - >>> state_dict = adam.state_dict() + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + state_dict = adam.state_dict() ''' state_dict = {} From d666564bc8138dd938b36c1f49116f56ffa01ee8 Mon Sep 17 00:00:00 2001 From: yuchen202 <103028470+yuchen202@users.noreply.github.com> Date: Wed, 11 Oct 2023 00:07:49 +0800 Subject: [PATCH 3/8] Revert "Revert "0928"" This reverts commit fea8d4bf08830f2f39a40f568b6d6a4f104e049f. --- python/paddle/incubate/optimizer/pipeline.py | 82 ++-- python/paddle/incubate/optimizer/recompute.py | 349 +++++++++--------- python/paddle/nn/quant/quantized_linear.py | 68 ++-- python/paddle/optimizer/optimizer.py | 8 +- 4 files changed, 265 insertions(+), 242 deletions(-) diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py index 6c0e80b1f5710..ad0083ff5f000 100644 --- a/python/paddle/incubate/optimizer/pipeline.py +++ b/python/paddle/incubate/optimizer/pipeline.py @@ -48,47 +48,47 @@ class PipelineOptimizer: Examples: .. 
code-block:: python - import paddle - import paddle.base as base - import paddle.base.layers as layers - import numpy as np - - paddle.enable_static() - with base.device_guard("gpu:0"): - x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) - data_loader = base.io.DataLoader.from_generator( - feed_list=[x, y], - capacity=64, - use_double_buffer=True, - iterable=False) - - emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False) - emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) - - with base.device_guard("gpu:1"): - concat = layers.concat([emb_x, emb_y], axis=1) - fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) - loss = paddle.mean(fc) - optimizer = paddle.optimizer.SGD(learning_rate=0.5) - optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) - optimizer.minimize(loss) - - def train_reader(): - for _ in range(4): - x = np.random.random(size=[1]).astype('int64') - y = np.random.random(size=[1]).astype('int64') - yield x, y - data_loader.set_sample_generator(train_reader, batch_size=1) - - place = base.CUDAPlace(0) - exe = base.Executor(place) - exe.run(base.default_startup_program()) - batch_size = 1 - data_loader.start() - exe.train_from_dataset( - base.default_main_program()) - data_loader.reset() + >>> import paddle + >>> import paddle.base as base + >>> import paddle.base.layers as layers + >>> import numpy as np + + >>> paddle.enable_static() + >>> with base.device_guard("gpu:0"): + ... x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) + ... y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) + ... data_loader = base.io.DataLoader.from_generator( + ... feed_list=[x, y], + ... capacity=64, + ... use_double_buffer=True, + ... iterable=False) + + ... emb_x = layers.embedding(input=x, param_attr=base.ParamAttr(name="embx"), size=[10,2], is_sparse=False) + ... emb_y = layers.embedding(input=y, param_attr=base.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) + + >>> with base.device_guard("gpu:1"): + ... concat = layers.concat([emb_x, emb_y], axis=1) + ... fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) + ... loss = paddle.mean(fc) + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.5) + >>> optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) + >>> optimizer.minimize(loss) + + >>> def train_reader(): + ... for _ in range(4): + ... x = np.random.random(size=[1]).astype('int64') + ... y = np.random.random(size=[1]).astype('int64') + ... yield x, y + >>> data_loader.set_sample_generator(train_reader, batch_size=1) + + >>> place = base.CUDAPlace(0) + >>> exe = base.Executor(place) + >>> exe.run(base.default_startup_program()) + >>> batch_size = 1 + >>> data_loader.start() + >>> exe.train_from_dataset( + ... base.default_main_program()) + >>> data_loader.reset() """ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py index 9cbd8894f1889..92f397e3984c9 100644 --- a/python/paddle/incubate/optimizer/recompute.py +++ b/python/paddle/incubate/optimizer/recompute.py @@ -49,45 +49,58 @@ class RecomputeOptimizer(Optimizer): Examples: .. 
code-block:: python - import paddle - import paddle.base as base - import numpy as np - - paddle.enable_static() - - def gen_data(): - return {"x": np.random.random(size=(32, 32)).astype('float32'), - "y": np.random.randint(2, size=(32, 1)).astype('int64')} - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - print(input_x) - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - sgd.minimize(cost) - - print("Finished optimize") - place = base.CPUPlace() - exe = base.Executor(place) - exe.run(base.default_startup_program()) - step = 10 - - for i in range(step): - cost_val = exe.run(feed=gen_data(), - program=base.default_main_program(), - fetch_list=[cost.name]) - print("step=%d cost=%f" % (i, cost_val[0])) + >>> import paddle + >>> import paddle.base as base + >>> import numpy as np + + >>> paddle.enable_static() + + >>> def gen_data(): + ... return {"x": np.random.random(size=(32, 32)).astype('float32'), + ... "y": np.random.randint(2, size=(32, 1)).astype('int64')} + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... print(input_x) + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> sgd.minimize(cost) + + >>> print("Finished optimize") + Finished optimize + >>> place = base.CPUPlace() + >>> exe = base.Executor(place) + >>> exe.run(base.default_startup_program()) + >>> step = 10 + + >>> for i in range(step): + ... cost_val = exe.run(feed=gen_data(), + ... program=base.default_main_program(), + ... fetch_list=[cost.name]) + ... print("step=%d cost=%f" % (i, cost_val[0])) + var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) + Finished optimize + step=0 cost=0.737203 + step=1 cost=1.308077 + step=2 cost=0.768422 + step=3 cost=1.239475 + step=4 cost=0.882643 + step=5 cost=0.738027 + step=6 cost=0.819374 + step=7 cost=0.818534 + step=8 cost=0.753692 + step=9 cost=0.787448 """ @@ -132,33 +145,35 @@ def load(self, state_dict): Examples: .. 
code-block:: python

- import paddle
- import paddle.base as base
-
- paddle.enable_static()
- def mlp(input_x, input_y, hid_dim=128, label_dim=2):
- fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
- prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
- cost = paddle.nn.functional.cross_entropy(
- input=prediction, label=input_y,
- reduction='none', use_softmax=False
- )
- sum_cost = paddle.mean(cost)
- return sum_cost, fc_1, prediction
-
- input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
- input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
- cost, fc_1, pred = mlp(input_x, input_y)
- print("Finished FF")
-
- sgd = paddle.optimizer.Adam(learning_rate=0.01)
- sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
- sgd._set_checkpoints([fc_1, pred])
- try:
- state_dict = {}
- sgd.load(state_dict)
- except NotImplementedError as e:
- print(e)
+ >>> import paddle
+ >>> import paddle.base as base
+
+ >>> paddle.enable_static()
+ >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+ ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+ ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+ ... cost = paddle.nn.functional.cross_entropy(
+ ... input=prediction, label=input_y,
+ ... reduction='none', use_softmax=False
+ ... )
+ ... sum_cost = paddle.mean(cost)
+ ... return sum_cost, fc_1, prediction
+
+ >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+ >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+ >>> cost, fc_1, pred = mlp(input_x, input_y)
+ >>> print("Finished FF")
+ Finished FF
+
+ >>> sgd = paddle.optimizer.Adam(learning_rate=0.01)
+ >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+ >>> sgd._set_checkpoints([fc_1, pred])
+ >>> try:
+ ... state_dict = {}
+ ... sgd.load(state_dict)
+ ... except NotImplementedError as e:
+ ... print(e)
+ load function is not supported by Recompute Optimizer for now
 """
 raise NotImplementedError(
 "load function is not supported by Recompute Optimizer for now"
@@ -177,42 +192,43 @@ def apply_gradients(self, params_grads):
 Examples:
 .. code-block:: python

- import paddle
- import paddle.base as base
- import paddle.base.framework as framework
-
- paddle.enable_static()
-
- def mlp(input_x, input_y, hid_dim=128, label_dim=2):
- fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
- prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
- cost = paddle.nn.functional.cross_entropy(
- input=prediction, label=input_y,
- reduction='none', use_softmax=False
- )
- sum_cost = paddle.mean(cost)
- return sum_cost, fc_1, prediction
-
-
- input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
- input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
- cost, fc_1, pred = mlp(input_x, input_y)
- print("Finished FF")
-
- sgd = paddle.optimizer.Adam(learning_rate=0.01)
- sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
- sgd._set_checkpoints([fc_1, pred])
- params_grads = sgd.backward(
- cost,
- startup_program=None,
- parameter_list=None,
- no_grad_set=None)
-
- program = cost.block.program
- with framework.program_guard(program, None):
- optimize_ops = sgd.apply_gradients(params_grads)
-
- print("Finished apply gradients")
+ >>> import paddle
+ >>> import paddle.base as base
+ >>> import paddle.base.framework as framework
+
+ >>> paddle.enable_static()
+
+ >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+ ... 
fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + + >>> program = cost.block.program + >>> with framework.program_guard(program, None): + ... optimize_ops = sgd.apply_gradients(params_grads) + + >>> print("Finished apply gradients") + Finished apply gradients """ return self._optimizer.apply_gradients(params_grads=params_grads) @@ -651,36 +667,37 @@ def backward( Examples: .. code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - print("Finished backward") + >>> import paddle + >>> import paddle.base as base + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + >>> print("Finished backward") + Finished backward """ assert ( self._checkpoints is not None @@ -733,39 +750,41 @@ def apply_optimize(self, loss, startup_program, params_grads): params_grads (list): list of (param, grad) pair to do optimization. Examples: .. 
code-block:: python - import paddle - import paddle.base as base - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = paddle.optimizer.Adam(learning_rate=0.01) - sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - optimize_ops = sgd.apply_optimize( - cost, startup_program=None, params_grads=params_grads) - - print("Finished apply_optimize") + >>> import paddle + >>> import paddle.base as base + + >>> paddle.enable_static() + + >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): + ... fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + ... prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + ... cost = paddle.nn.functional.cross_entropy( + ... input=prediction, label=input_y, + ... reduction='none', use_softmax=False + ... ) + ... sum_cost = paddle.mean(cost) + ... return sum_cost, fc_1, prediction + + >>> input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + >>> input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + >>> cost, fc_1, pred = mlp(input_x, input_y) + >>> print("Finished FF") + Finished FF + + >>> sgd = paddle.optimizer.Adam(learning_rate=0.01) + >>> sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + >>> sgd._set_checkpoints([fc_1, pred]) + >>> params_grads = sgd.backward( + ... cost, + ... startup_program=None, + ... parameter_list=None, + ... no_grad_set=None) + + >>> optimize_ops = sgd.apply_optimize( + ... cost, startup_program=None, params_grads=params_grads) + + >>> print("Finished apply_optimize") + Finished apply_optimize """ func = ( diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 803135ff9f5a6..3d8102b9027ed 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -31,16 +31,18 @@ def weight_quantize(x, algo="weight_only_int8"): Examples: .. code-block:: python - import paddle - import numpy as np - from paddle.nn.quant import weight_quantize - - paddle.device.set_device("cpu") - x = np.random.randn(64, 32).astype('float16') - x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) - out, scale = weight_quantize(x, algo='weight_only_int8') - print(out.shape) # [32, 64] - print(scale.shape) # [32] + >>> import paddle + >>> import numpy as np + >>> from paddle.nn.quant import weight_quantize + + >>> paddle.device.set_device("cpu") + >>> x = np.random.randn(64, 32).astype('float16') + >>> x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) + >>> out, scale = weight_quantize(x, algo='weight_only_int8') + >>> print(out.shape) + [32, 64] + >>> print(scale.shape) + [32] """ if in_dynamic_mode(): @@ -84,17 +86,18 @@ def weight_only_linear( Examples: .. 
code-block:: python - # required: gpu - import paddle - from paddle.nn.quant import weight_only_linear - - x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') - weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') - scale = paddle.randn([32], dtype='float32') - bias = paddle.cast(paddle.randn([32]), dtype='float16') - if paddle.device.cuda.get_device_capability()[0] >= 8: - out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') - print(out.shape) # [1, 2, 32] + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> from paddle.nn.quant import weight_only_linear + + >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') + >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') + >>> scale = paddle.randn([32], dtype='float32') + >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') + >>> if paddle.device.cuda.get_device_capability()[0] >= 8: + ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') + ... print(out.shape) + [1, 2, 32] """ if in_dynamic_mode(): out = _C_ops.weight_only_linear( @@ -151,17 +154,18 @@ def llm_int8_linear( Examples: .. code-block:: python - # required: gpu - import paddle - from paddle.nn.quant import llm_int8_linear - - x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') - weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') - scale = paddle.randn([32], dtype='float32') - bias = paddle.cast(paddle.randn([32]), dtype='float16') - if paddle.device.cuda.get_device_capability()[0] >= 8: - out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) - print(out.shape) # [1, 2, 32] + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> from paddle.nn.quant import llm_int8_linear + + >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16') + >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8') + >>> scale = paddle.randn([32], dtype='float32') + >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') + >>> if paddle.device.cuda.get_device_capability()[0] >= 8: + ... out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) + ... print(out.shape) + [1, 2, 32] """ if in_dynamic_mode(): out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2e1314a3a1536..59176c7b07d2e 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -312,11 +312,11 @@ def state_dict(self): Examples: .. code-block:: python - import paddle - emb = paddle.nn.Embedding(10, 10) + >>> import paddle + >>> emb = paddle.nn.Embedding(10, 10) - adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) - state_dict = adam.state_dict() + >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + >>> state_dict = adam.state_dict() ''' state_dict = {} From 377dc1394d11da9e25b3e0d5cd6f43438050213b Mon Sep 17 00:00:00 2001 From: yuchen202 <103028470+yuchen202@users.noreply.github.com> Date: Wed, 11 Oct 2023 00:09:03 +0800 Subject: [PATCH 4/8] 1011 modification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 3ccf152fc9c4576bf86b90826ce32ce216a362ba.
--- adam.pdopt | Bin 101 -> 0 bytes emb.pdparams | Bin 618 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 adam.pdopt delete mode 100644 emb.pdparams diff --git a/adam.pdopt b/adam.pdopt deleted file mode 100644 index 89b2cb911e622388ee7957b17141fbf53dc7c2ac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 101 zcmZo*nHt3a0ku;!dU$+-;)9bjQd3HEQj4a5M7VMii%a5D3-XgQrg$^-u!FfdMN{0N nx)@4(q=QR}N|Q@Ui&9fU@&gi!5_3~aQj7e6l!F7%j8Z)SO`##8 diff --git a/emb.pdparams b/emb.pdparams deleted file mode 100644 index 1b61b63835c52734a0c04d1a93a1a0386d7b2bce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 618 zcmX9*T}V@L9KF|lXv1pR2a*{<2>Q@6=wYV&P5c0HZhdKpS+~vIoayGc-eD$^wekZl zvD#lHFdB)rLLt#gW%oBrC{3eWg;`Qc)I$$*ln-i#t$7~Kc{u01oYN?6W1^@C8c4Dr zuVpLBEj$r++8kDUovN&wV^vm16<=ZGIAa|VVi}GttG3nfoTH2}r(L^jVL1yOb?Ck$i)cM1twPnMX2y_N97!3LAVMItr@x;`c z<)SGnX4)wRr}A(Ia}PbYMR0xmq{*AgfvWzeKvW07H6+o_Wy?T4Uw}$N z>F{pi3oM=(0DHb!6MD8+EPfq?(5MNPm)#m?RR?ERXpn&ZSPKcoGj%2WuQaU&JFEA^(SU-;Tk`bWN z_oSB!`*AS+59Dr<`Dc!HqpOw{_q`hS73za%{PG(g{_gzB7L3@y+p!2YC?|ME^=SFK2Jw{%(A$1ZB;jX-gp-l? s9UN;etUhMsj8>LsIYU@7Gf4xn(85;MYGTdi3R^jINL9 Date: Wed, 11 Oct 2023 13:36:58 +0800 Subject: [PATCH 5/8] Update quantized_linear.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolve conflicts --- python/paddle/nn/quant/quantized_linear.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 3d8102b9027ed..2100715cc98eb 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -31,13 +31,12 @@ def weight_quantize(x, algo="weight_only_int8"): Examples: .. code-block:: python + >>> # doctest: +SKIP('No testing required') >>> import paddle - >>> import numpy as np >>> from paddle.nn.quant import weight_quantize - >>> paddle.device.set_device("cpu") - >>> x = np.random.randn(64, 32).astype('float16') - >>> x = paddle.to_tensor(x, dtype=paddle.float16, place=paddle.CPUPlace()) + >>> paddle.seed(2023) + >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16) >>> out, scale = weight_quantize(x, algo='weight_only_int8') >>> print(out.shape) [32, 64] @@ -154,7 +153,7 @@ def llm_int8_linear( Examples: .. code-block:: python - >>> # doctest: +REQUIRES(env:GPU) + >>> # doctest: +SKIP('No testing required') >>> import paddle >>> from paddle.nn.quant import llm_int8_linear @@ -163,8 +162,8 @@ def llm_int8_linear( >>> scale = paddle.randn([32], dtype='float32') >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') >>> if paddle.device.cuda.get_device_capability()[0] >= 8: - ... out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) - ... print(out.shape) + ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') + ...
print(out.shape) [1, 2, 32] """ if in_dynamic_mode(): From f72564b81825689722a8e53121ce1328651f1e11 Mon Sep 17 00:00:00 2001 From: yuchen202 <103028470+yuchen202@users.noreply.github.com> Date: Wed, 11 Oct 2023 13:38:52 +0800 Subject: [PATCH 6/8] Update quantized_linear.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolve conflicts --- python/paddle/nn/quant/quantized_linear.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 2100715cc98eb..a25227581a5ca 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -85,7 +85,7 @@ def weight_only_linear( Examples: .. code-block:: python - >>> # doctest: +REQUIRES(env:GPU) + >>> # doctest: +SKIP('No testing required') >>> import paddle >>> from paddle.nn.quant import weight_only_linear @@ -94,8 +94,8 @@ def weight_only_linear( >>> scale = paddle.randn([32], dtype='float32') >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') >>> if paddle.device.cuda.get_device_capability()[0] >= 8: - ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') - ... print(out.shape) + ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') + ... print(out.shape) [1, 2, 32] """ if in_dynamic_mode(): @@ -162,7 +162,7 @@ def llm_int8_linear( >>> scale = paddle.randn([32], dtype='float32') >>> bias = paddle.cast(paddle.randn([32]), dtype='float16') >>> if paddle.device.cuda.get_device_capability()[0] >= 8: - ... out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8') + ... out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0) ... print(out.shape) [1, 2, 32] """ From 6c6140b9087bff70c9162cdbed6f0ffb957a3d2f Mon Sep 17 00:00:00 2001 From: yuchen202 <103028470+yuchen202@users.noreply.github.com> Date: Wed, 11 Oct 2023 13:40:22 +0800 Subject: [PATCH 7/8] Update quantized_linear.py --- python/paddle/nn/quant/quantized_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index a25227581a5ca..96cfcce58d95f 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -85,7 +85,7 @@ def weight_only_linear( Examples: .. code-block:: python - >>> # doctest: +SKIP('No testing required') + >>> # doctest: +SKIP('No testing required') >>> import paddle >>> from paddle.nn.quant import weight_only_linear From e7ff5ef22ba11f1e6dac27a263665dab67704630 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 12 Oct 2023 11:33:00 +0800 Subject: [PATCH 8/8] Apply suggestions from code review --- python/paddle/incubate/optimizer/pipeline.py | 8 ++++---- python/paddle/incubate/optimizer/recompute.py | 14 +++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py index ad0083ff5f000..b7ae315576d79 100644 --- a/python/paddle/incubate/optimizer/pipeline.py +++ b/python/paddle/incubate/optimizer/pipeline.py @@ -81,13 +81,13 @@ class PipelineOptimizer: ...
yield x, y >>> data_loader.set_sample_generator(train_reader, batch_size=1) - >>> place = base.CUDAPlace(0) - >>> exe = base.Executor(place) - >>> exe.run(base.default_startup_program()) + >>> place = paddle.CUDAPlace(0) + >>> exe = paddle.static.Executor(place) + >>> exe.run(paddle.static.default_startup_program()) >>> batch_size = 1 >>> data_loader.start() >>> exe.train_from_dataset( - ... base.default_main_program()) + ... paddle.static.default_main_program()) >>> data_loader.reset() """ diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py index 92f397e3984c9..2545115fa0d01 100644 --- a/python/paddle/incubate/optimizer/recompute.py +++ b/python/paddle/incubate/optimizer/recompute.py @@ -50,7 +50,6 @@ class RecomputeOptimizer(Optimizer): .. code-block:: python >>> import paddle - >>> import paddle.base as base >>> import numpy as np >>> paddle.enable_static() @@ -79,14 +78,14 @@ class RecomputeOptimizer(Optimizer): >>> print("Finished optimize") Finished optimize - >>> place = base.CPUPlace() - >>> exe = base.Executor(place) - >>> exe.run(base.default_startup_program()) + >>> place = paddle.CPUPlace() + >>> exe = paddle.static.Executor(place) + >>> exe.run(paddle.static.default_startup_program()) >>> step = 10 >>> for i in range(step): ... cost_val = exe.run(feed=gen_data(), - ... program=base.default_main_program(), + ... program=paddle.static.default_main_program(), ... fetch_list=[cost.name]) ... print("step=%d cost=%f" % (i, cost_val[0])) var x : LOD_TENSOR.shape(-1, 32).dtype(float32).stop_gradient(True) @@ -146,7 +145,6 @@ def load(self, state_dict): .. code-block:: python >>> import paddle - >>> import paddle.base as base >>> paddle.enable_static() >>> def mlp(input_x, input_y, hid_dim=128, label_dim=2): @@ -193,7 +191,6 @@ def apply_gradients(self, params_grads): .. code-block:: python >>> import paddle - >>> import paddle.base as base >>> import paddle.base.framework as framework >>> paddle.enable_static() @@ -668,7 +665,6 @@ def backward( .. code-block:: python >>> import paddle - >>> import paddle.base as base >>> paddle.enable_static() @@ -750,8 +746,8 @@ def apply_optimize(self, loss, startup_program, params_grads): params_grads (list): list of (param, grad) pair to do optimization. Examples: .. code-block:: python + >>> import paddle - >>> import paddle.base as base >>> paddle.enable_static()