diff --git a/docs/api/paddle/io/DataLoader_cn.rst b/docs/api/paddle/io/DataLoader_cn.rst index 3ea5203f253..6deeee8fb0b 100644 --- a/docs/api/paddle/io/DataLoader_cn.rst +++ b/docs/api/paddle/io/DataLoader_cn.rst @@ -52,388 +52,4 @@ DataLoader,迭代 ``dataset`` 数据的迭代器,迭代器返回的数据中 代码示例 :::::::::::: -.. code-block:: python - - import numpy as np - - import paddle - import paddle.nn as nn - import paddle.nn.functional as F - from paddle.io import Dataset, BatchSampler, DataLoader - - BATCH_NUM = 20 - BATCH_SIZE = 16 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 10 - - USE_GPU = False # whether use GPU to run model - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - - class SimpleNet(nn.Layer): - def __init__(self): - super().__init__() - self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) - - def forward(self, image, label=None): - return self.fc(image) - - simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = F.cross_entropy(out, label) - avg_loss = paddle.mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - -方法 -:::::::::::: -from_generator(feed_list=None, capacity=None, use_double_buffer=True, iterable=True, return_list=False, use_multiprocess=False, drop_last=True) -''''''''' - -.. warning:: - 这个 API 将在未来版本废弃,推荐使用支持多进程并发加速的 ``paddle.io.DataLoader`` - -.. note:: - 框架保证 DataLoader 的数据加载顺序与用户提供的数据源读取顺序一致。 - -创建一个 DataLoader 对象用于加载 Python 生成器产生的数据。数据会由 Python 线程预先读取,并异步送入一个队列中。 - -本方法创建的 DataLoader 对象提供了 3 个方法设置数据源,分别是 :code:`set_sample_generator` , :code:`set_sample_list_generator` 和 -:code:`set_batch_generator`。请查阅下述示例代码了解它们的使用方法。 - -如果 iterable = True,本方法创建的 DataLoader 对象是一个 Python 生成器,可以 for-range 的方法循环迭代。 - -如果 iterable = False,本方法创建的 DataLoader 对象提供 :code:`start()` 和 :code:`reset()` 方法控制数据读取过程。 - -**参数** - - - **feed_list** (list(Tensor)|tuple(Tensor)) - feed 变量列表,由 ``paddle.static.data()`` 创建。 - - **capacity** (int) - DataLoader 对象内部维护队列的容量大小。单位是 batch 数量。若 reader 读取速度较快,建议设置较大的 capacity 值。 - - **use_double_buffer** (bool,可选) - 是否使用 ``double_buffer_reader``。若 use_double_buffer=True,DataLoader 会异步地预读取下一个 batch 的数据,可加速数据读取过程,但同时会占用少量的 CPU/GPU 存储,即一个 batch 输入数据的存储空间。 - - **iterable** (bool,可选) - 所创建的 DataLoader 对象是否可迭代。 - - **return_list** (bool,可选) - 每个设备上的数据是否以 list 形式返回。仅在 iterable = True 模式下有效。若 return_list = False,每个设备上的返回数据均是 str -> Tensor 的映射表,其中映射表的 key 是每个输入变量的名称。若 return_list = True,则每个设备上的返回数据均是 list(Tensor)。推荐在静态图模式下使用 return_list = False,在动态图模式下使用 return_list = True。 - - **use_multiprocess** (bool,可选) - 设置是否是用多进程加速动态图的数据载入过程。注意:该参数的设置仅在动态图模式下有效,在静态图模式下,该参数设置与否均无任何影响。默认值为 False。 - - **drop_last** (bool,可选):是否丢弃最后的不足 CPU/GPU 设备数的批次。默认值为 True。在网络训练时,用户不能设置 drop_last=False,此时所有 CPU/GPU 设备均应从 DataLoader 中读取到数据。在网络预测时,用户可以设置 drop_last=False,此时最后不足 CPU/GPU 设备数的批次可以进行预测。 - -**返回** - - 被创建的 DataLoader 对象。 - - -**代码示例 1** - -.. code-block:: python - - ''' - Example in static graph mode - ''' - import numpy as np - - import paddle - import paddle.static as static - import paddle.nn.functional as F - - - BATCH_NUM = 10 - BATCH_SIZE = 16 - EPOCH_NUM = 4 - - CLASS_NUM = 10 - - ITERABLE = True # whether the created DataLoader object is iterable - USE_GPU = False # whether to use GPU - - DATA_FORMAT = 'batch_generator' # data format of data source user provides - - paddle.enable_static() - - def simple_net(image, label): - fc_tmp = static.nn.fc(image, size=CLASS_NUM) - cross_entropy = F.softmax_with_cross_entropy(image, label) - loss = paddle.mean(cross_entropy) - sgd = paddle.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - return loss - - def get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label - - # If the data generator yields one sample each time, - # use DataLoader.set_sample_generator to set the data source. - def sample_generator_creator(): - def __reader__(): - for _ in range(BATCH_NUM * BATCH_SIZE): - image, label = get_random_images_and_labels([784], [1]) - yield image, label - - return __reader__ - - # If the data generator yield list of samples each time, - # use DataLoader.set_sample_list_generator to set the data source. - def sample_list_generator_creator(): - def __reader__(): - for _ in range(BATCH_NUM): - sample_list = [] - for _ in range(BATCH_SIZE): - image, label = get_random_images_and_labels([784], [1]) - sample_list.append([image, label]) - - yield sample_list - - return __reader__ - - # If the data generator yields a batch each time, - # use DataLoader.set_batch_generator to set the data source. - def batch_generator_creator(): - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = get_random_images_and_labels([BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label - - return __reader__ - - # If DataLoader is iterable, use for loop to train the network - def train_iterable(exe, prog, loss, loader): - for _ in range(EPOCH_NUM): - for data in loader(): - exe.run(prog, feed=data, fetch_list=[loss]) - - # If DataLoader is not iterable, use start() and reset() method to control the process - def train_non_iterable(exe, prog, loss, loader): - for _ in range(EPOCH_NUM): - loader.start() # call DataLoader.start() before each epoch starts - try: - while True: - exe.run(prog, fetch_list=[loss]) - except paddle.core.EOFException: - loader.reset() # call DataLoader.reset() after catching EOFException - - def set_data_source(loader, places): - if DATA_FORMAT == 'sample_generator': - loader.set_sample_generator(sample_generator_creator(), batch_size=BATCH_SIZE, drop_last=True, places=places) - elif DATA_FORMAT == 'sample_list_generator': - loader.set_sample_list_generator(sample_list_generator_creator(), places=places) - elif DATA_FORMAT == 'batch_generator': - loader.set_batch_generator(batch_generator_creator(), places=places) - else: - raise ValueError('Unsupported data format') - - image = static.data(name='image', shape=[None, 784], dtype='float32') - label = static.data(name='label', shape=[None, 1], dtype='int64') - - # Define DataLoader - loader = paddle.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) - - # Define network - loss = simple_net(image, label) - - # Set data source of DataLoader - # - # If DataLoader is iterable, places must be given and the number of places must be the same with device number. - # - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places. - # - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places. - # - # If DataLoader is not iterable, places can be None. - places = static.cuda_places() if USE_GPU else static.cpu_places() - set_data_source(loader, places) - - exe = static.Executor(places[0]) - exe.run(static.default_startup_program()) - - prog = static.CompiledProgram(static.default_main_program()) - - if loader.iterable: - train_iterable(exe, prog, loss, loader) - else: - train_non_iterable(exe, prog, loss, loader) - - -**代码示例 2** - -.. code-block:: python - - ''' - Example in dynamic graph mode. - ''' - import numpy as np - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - BATCH_SIZE = 16 - BATCH_NUM = 4 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 1 - - USE_GPU = False # whether to use GPU - - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label - - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, IMAGE_SIZE], [BATCH_SIZE, CLASS_NUM]) - yield batch_image, batch_label - - def random_batch_reader(): - return __reader__ - - class LinearNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - - @paddle.jit.to_static - def forward(self, x): - return self._linear(x) - - # set device - paddle.set_device('gpu' if USE_GPU else 'cpu') - - # create network - layer = LinearNet() - dp_layer = paddle.DataParallel(layer) - loss_fn = nn.CrossEntropyLoss() - adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) - - # create data loader - loader = paddle.io.DataLoader.from_generator(capacity=5) - loader.set_batch_generator(random_batch_reader()) - - for epoch_id in range(EPOCH_NUM): - for batch_id, (image, label) in enumerate(loader()): - out = layer(image) - loss = loss_fn(out, label) - - loss.backward() - - adam.step() - adam.clear_grad() - print("Epoch {} batch {}: loss = {}".format( - epoch_id, batch_id, np.mean(loss.numpy()))) - -**代码示例 3** - -.. code-block:: python - - ''' - Example of `drop_last` using in static graph multi-cards mode - ''' - import paddle - import paddle.static as static - import numpy as np - import os - - # We use 2 CPU cores to run inference network - os.environ['CPU_NUM'] = '2' - - paddle.enable_static() - - # The data source has only 3 batches, which can not be - # divided evenly to each CPU core - def batch_generator(): - for i in range(3): - yield np.array([i+1]).astype('float32'), - - x = static.data(name='x', shape=[None], dtype='float32') - y = x * x - - def run_inference(drop_last): - loader = paddle.io.DataLoader.from_generator(feed_list=[x], - capacity=8, drop_last=drop_last) - loader.set_batch_generator(batch_generator, static.cpu_places()) - - exe = static.Executor(paddle.CPUPlace()) - prog = static.CompiledProgram(static.default_main_program()) - - result = [] - for data in loader(): - each_ret, = exe.run(prog, feed=data, fetch_list=[y]) - result.extend(each_ret) - return result - - # Set drop_last to True, so that the last batch whose - # number is less than CPU core number would be discarded. - print(run_inference(drop_last=True)) # [1.0, 4.0] - - # Set drop_last to False, so that the last batch whose - # number is less than CPU core number can be tested. - print(run_inference(drop_last=False)) # [1.0, 4.0, 9.0] - - -from_dataset(dataset, places, drop_last=True) -''''''''' - -.. warning:: - 这个 API 将在未来版本废弃,推荐使用支持多进程并发加速的 ``paddle.io.DataLoader`` - -创建一个 DataLoader 对象用于加载 Dataset 产生的数据。目前,Dataset 仅支持 Linux 系统下使用。 - -**参数** - - - **dataset** (InMemoryDataset|QueueDataset) - Dataset 对象。 - - **places** (list(CUDAPlace)|list(CPUPlace)) - DataLoader 对象返回数据所在的 place。 - - **drop_last** (bool,可选) - 是否丢弃最后样本数量不足 batch size 的 batch。若 drop_last = True 则丢弃,若 drop_last = False 则不丢弃。 - -**返回** - - 被创建的 DataLoader 对象,可以 for-range 的方式循环迭代。 - - -**代码示例** - -.. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - image = static.data(name='image', shape=[None, 784], dtype='float32') - label = static.data(name='label', shape=[None, 1], dtype='int64') - - dataset = paddle.distributed.QueueDataset() - dataset.init( - batch_size=32, - pipe_command='cat', - use_var=[image, label]) - dataset.set_filelist(['a.txt', 'b.txt', 'c.txt']) - - loader = paddle.io.DataLoader.from_dataset(dataset, static.cpu_places()) +COPY-FROM: paddle.io.DataLoader diff --git a/docs/api/paddle/io/DistributedBatchSampler_cn.rst b/docs/api/paddle/io/DistributedBatchSampler_cn.rst index 978065216e1..545256fb7e7 100644 --- a/docs/api/paddle/io/DistributedBatchSampler_cn.rst +++ b/docs/api/paddle/io/DistributedBatchSampler_cn.rst @@ -45,27 +45,4 @@ set_epoch(epoch) **代码示例** -.. code-block:: python - - import numpy as np - - from paddle.io import Dataset, DistributedBatchSampler - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(100) - sampler = DistributedBatchSampler(dataset, batch_size=64) - - for epoch in range(10): - sampler.set_epoch(epoch) +COPY-FROM: paddle.io.DistributedBatchSampler.set_epoch