Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

修改 COPY-FROM No.12 io #6001

Merged
merged 2 commits into from
Jul 11, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 4 additions & 273 deletions docs/api/paddle/io/DataLoader_cn.rst
RedContritio marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -52,66 +52,7 @@ DataLoader,迭代 ``dataset`` 数据的迭代器,迭代器返回的数据中
代码示例
::::::::::::

.. code-block:: python

import numpy as np

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, BatchSampler, DataLoader

# Model geometry: each sample has IMAGE_SIZE features and the network
# produces CLASS_NUM outputs.
IMAGE_SIZE = 784
CLASS_NUM = 10

# Loop sizes: the dataset holds BATCH_NUM * BATCH_SIZE samples and training
# walks over it EPOCH_NUM times.
BATCH_SIZE = 16
BATCH_NUM = 20
EPOCH_NUM = 4

USE_GPU = False  # whether to use GPU to run the model

# define a random dataset
class RandomDataset(Dataset):
    """Dataset that fabricates a random ``(image, label)`` pair on every access.

    Args:
        num_samples: value reported by ``len()``.
    """

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        # ``idx`` is not consulted: each access draws a fresh random sample.
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        # ``np.random.randint`` excludes its upper bound, so the bound must be
        # CLASS_NUM (not CLASS_NUM - 1) for every class id in
        # 0..CLASS_NUM-1 to be reachable.
        label = np.random.randint(0, CLASS_NUM, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples


dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)

class SimpleNet(nn.Layer):
    """One fully connected layer mapping IMAGE_SIZE inputs to CLASS_NUM outputs."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    def forward(self, image, label=None):
        # ``label`` is accepted but never read; only ``image`` reaches the layer.
        logits = self.fc(image)
        return logits

simple_net = SimpleNet()
opt = paddle.optimizer.SGD(
    learning_rate=1e-3,
    parameters=simple_net.parameters(),
)

# Shuffled batches of BATCH_SIZE samples; the trailing partial batch is dropped.
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=2,
)

for epoch_id in range(EPOCH_NUM):
    for batch_id, batch in enumerate(loader()):
        image, label = batch

        # Forward pass and loss.
        out = simple_net(image)
        loss = F.cross_entropy(out, label)
        avg_loss = paddle.mean(loss)

        # Backward pass, parameter update, then reset gradients for the next batch.
        avg_loss.backward()
        opt.minimize(avg_loss)
        simple_net.clear_gradients()

        print("Epoch {} batch {}: loss = {}".format(
            epoch_id, batch_id, np.mean(loss.numpy())))
COPY-FROM: paddle.io.DataLoader:data-loader-example
RedContritio marked this conversation as resolved.
Show resolved Hide resolved

方法
::::::::::::
Expand Down Expand Up @@ -150,204 +91,11 @@ from_generator(feed_list=None, capacity=None, use_double_buffer=True, iterable=T

**代码示例 1**

.. code-block:: python

'''
Example in static graph mode
'''
import numpy as np

import paddle
import paddle.static as static
import paddle.nn.functional as F


ITERABLE = True  # whether the created DataLoader object is iterable
USE_GPU = False  # whether to use GPU

# Data format of the data source the user provides. set_data_source() accepts
# 'sample_generator', 'sample_list_generator' or 'batch_generator'.
DATA_FORMAT = 'batch_generator'

CLASS_NUM = 10

BATCH_SIZE = 16
BATCH_NUM = 10
EPOCH_NUM = 4

# Switch to static graph mode before any variable or layer is created below.
paddle.enable_static()

def simple_net(image, label):
    """Build a one-layer classifier with its SGD update and return the loss.

    Args:
        image: input variable passed through the fully connected layer.
        label: class-index variable consumed by the cross-entropy loss.

    Returns:
        The mean softmax cross-entropy loss variable.
    """
    logits = static.nn.fc(image, size=CLASS_NUM)
    # The loss has to be taken on the fc output. Feeding the raw ``image``
    # here would leave the fc layer out of the loss, so it would never train.
    cross_entropy = F.softmax_with_cross_entropy(logits, label)
    loss = paddle.mean(cross_entropy)
    sgd = paddle.optimizer.SGD(learning_rate=1e-3)
    sgd.minimize(loss)
    return loss

def get_random_images_and_labels(image_shape, label_shape):
    """Return a random ``(image, label)`` pair of numpy arrays.

    ``image`` is float32 with shape ``image_shape``; ``label`` is int64 with
    shape ``label_shape``.
    """
    raw_image = np.random.random(size=image_shape)
    raw_label = np.random.random(size=label_shape)
    # The samples lie in [0, 1), so the int64 cast truncates every label to 0.
    return raw_image.astype('float32'), raw_label.astype('int64')

# If the data generator yields one sample each time,
# use DataLoader.set_sample_generator to set the data source.
def sample_generator_creator():
    """Return a generator function that yields BATCH_NUM * BATCH_SIZE single samples."""

    def __reader__():
        for _ in range(BATCH_NUM * BATCH_SIZE):
            # One (image, label) tuple per step.
            yield get_random_images_and_labels([784], [1])

    return __reader__

# If the data generator yields a list of samples each time,
# use DataLoader.set_sample_list_generator to set the data source.
def sample_list_generator_creator():
    """Return a generator function that yields BATCH_NUM lists of BATCH_SIZE samples."""

    def __reader__():
        for _ in range(BATCH_NUM):
            # Each sample is stored as a two-element list [image, label].
            yield [
                list(get_random_images_and_labels([784], [1]))
                for _ in range(BATCH_SIZE)
            ]

    return __reader__

# If the data generator yields a batch each time,
# use DataLoader.set_batch_generator to set the data source.
def batch_generator_creator():
    """Return a generator function that yields BATCH_NUM whole batches."""

    def __reader__():
        for _ in range(BATCH_NUM):
            # A (batch_image, batch_label) tuple with BATCH_SIZE rows each.
            batch = get_random_images_and_labels(
                [BATCH_SIZE, 784], [BATCH_SIZE, 1])
            yield batch

    return __reader__

# If DataLoader is iterable, use for loop to train the network
def train_iterable(exe, prog, loss, loader):
    """Run ``prog`` once per batch produced by ``loader()``, for EPOCH_NUM epochs."""
    for _epoch in range(EPOCH_NUM):
        for feed_data in loader():
            exe.run(prog, feed=feed_data, fetch_list=[loss])

# If DataLoader is not iterable, use start() and reset() method to control the process
def train_non_iterable(exe, prog, loss, loader):
    """Train for EPOCH_NUM epochs with a loader driven by start() and reset().

    No ``feed`` is passed to ``exe.run``; each epoch runs until
    ``paddle.core.EOFException`` is raised.
    """
    for _epoch in range(EPOCH_NUM):
        # call DataLoader.start() before each epoch starts
        loader.start()
        try:
            while True:
                exe.run(prog, fetch_list=[loss])
        except paddle.core.EOFException:
            # call DataLoader.reset() after catching EOFException
            loader.reset()

def set_data_source(loader, places):
    """Attach the generator selected by DATA_FORMAT to ``loader``.

    Raises:
        ValueError: if DATA_FORMAT is not one of the three supported names.
    """
    if DATA_FORMAT == 'sample_generator':
        loader.set_sample_generator(
            sample_generator_creator(),
            batch_size=BATCH_SIZE,
            drop_last=True,
            places=places,
        )
        return

    if DATA_FORMAT == 'sample_list_generator':
        loader.set_sample_list_generator(
            sample_list_generator_creator(), places=places)
        return

    if DATA_FORMAT == 'batch_generator':
        loader.set_batch_generator(batch_generator_creator(), places=places)
        return

    raise ValueError('Unsupported data format')

image = static.data(name='image', shape=[None, 784], dtype='float32')
label = static.data(name='label', shape=[None, 1], dtype='int64')

# Define DataLoader
loader = paddle.io.DataLoader.from_generator(
    feed_list=[image, label],
    capacity=16,
    iterable=ITERABLE,
)

# Define network
loss = simple_net(image, label)

# Set data source of DataLoader
#
# If DataLoader is iterable, places must be given and the number of places
# must be the same as the number of devices.
#  - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places.
#  - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places.
#
# If DataLoader is not iterable, places can be None.
if USE_GPU:
    places = static.cuda_places()
else:
    places = static.cpu_places()
set_data_source(loader, places)

# Run the startup program once on the first place before training.
exe = static.Executor(places[0])
exe.run(static.default_startup_program())

prog = static.CompiledProgram(static.default_main_program())

# Pick the training loop that matches how the loader was created.
if not loader.iterable:
    train_non_iterable(exe, prog, loss, loader)
else:
    train_iterable(exe, prog, loss, loader)

COPY-FROM: paddle.fluid.DataLoader.from_generator:static-data-loader-example-1

**代码示例 2**

.. code-block:: python

'''
Example in dynamic graph mode.
'''
import numpy as np

import paddle
import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist

# Model geometry: IMAGE_SIZE input features, CLASS_NUM outputs.
IMAGE_SIZE = 784
CLASS_NUM = 1

# The reader yields BATCH_NUM batches of BATCH_SIZE rows, and training
# repeats that for EPOCH_NUM epochs.
BATCH_NUM = 4
BATCH_SIZE = 16
EPOCH_NUM = 4

USE_GPU = False  # whether to use GPU

def _get_random_images_and_labels(image_shape, label_shape):
    """Return a random ``(image, label)`` pair of numpy arrays.

    ``image`` is float32 with shape ``image_shape``; ``label`` is int64 with
    shape ``label_shape``.
    """
    raw_image = np.random.random(size=image_shape)
    raw_label = np.random.random(size=label_shape)
    # The samples lie in [0, 1), so the int64 cast truncates every label to 0.
    return raw_image.astype('float32'), raw_label.astype('int64')

def __reader__():
    """Yield BATCH_NUM random ``(batch_image, batch_label)`` tuples."""
    for _ in range(BATCH_NUM):
        batch = _get_random_images_and_labels(
            [BATCH_SIZE, IMAGE_SIZE], [BATCH_SIZE, CLASS_NUM])
        yield batch

def random_batch_reader():
    """Return the ``__reader__`` generator function itself, without calling it."""
    reader = __reader__
    return reader

class LinearNet(nn.Layer):
    """One linear layer mapping IMAGE_SIZE inputs to CLASS_NUM outputs."""

    def __init__(self):
        super().__init__()
        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    @paddle.jit.to_static
    def forward(self, x):
        projected = self._linear(x)
        return projected

# set device
if USE_GPU:
    paddle.set_device('gpu')
else:
    paddle.set_device('cpu')

# create network
layer = LinearNet()
dp_layer = paddle.DataParallel(layer)
loss_fn = nn.CrossEntropyLoss()
adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

# create data loader
loader = paddle.io.DataLoader.from_generator(capacity=5)
loader.set_batch_generator(random_batch_reader())

for epoch_id in range(EPOCH_NUM):
    for batch_id, batch in enumerate(loader()):
        image, label = batch

        # NOTE(review): the forward pass goes through ``layer`` while the
        # optimizer was built from ``dp_layer.parameters()`` - confirm that
        # bypassing the DataParallel wrapper here is intended.
        out = layer(image)
        loss = loss_fn(out, label)

        loss.backward()

        adam.step()
        adam.clear_grad()
        print("Epoch {} batch {}: loss = {}".format(
            epoch_id, batch_id, np.mean(loss.numpy())))
COPY-FROM: paddle.fluid.DataLoader.from_generator:static-data-loader-example-2

**代码示例 3**

Expand Down Expand Up @@ -419,21 +167,4 @@ from_dataset(dataset, places, drop_last=True)

**代码示例**

.. code-block:: python

import paddle
import paddle.static as static

# Switch to static graph mode before the variables are declared.
paddle.enable_static()

image = static.data(name='image', shape=[None, 784], dtype='float32')
label = static.data(name='label', shape=[None, 1], dtype='int64')

# Configure the dataset with its batch size, pipe command and the variables it feeds.
dataset = paddle.distributed.QueueDataset()
dataset.init(batch_size=32, pipe_command='cat', use_var=[image, label])
dataset.set_filelist(['a.txt', 'b.txt', 'c.txt'])

# Wrap the dataset in a DataLoader on the CPU places.
loader = paddle.io.DataLoader.from_dataset(dataset, static.cpu_places())
COPY-FROM: paddle.fluid.DataLoader.from_dataset
25 changes: 1 addition & 24 deletions docs/api/paddle/io/DistributedBatchSampler_cn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,27 +45,4 @@ set_epoch(epoch)

**代码示例**

.. code-block:: python

import numpy as np

from paddle.io import Dataset, DistributedBatchSampler

# init with dataset
class RandomDataset(Dataset):
    """Dataset that fabricates a random ``(image, label)`` pair on every access.

    Args:
        num_samples: value reported by ``len()``.
    """

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # ``idx`` is not consulted: each access draws a fresh random sample.
        image = np.random.random([784]).astype('float32')
        # randint's upper bound is exclusive, so labels fall in 0..8.
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label

# 100 samples, batched 64 at a time by the distributed sampler.
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)

# Pass the sampler the index of each epoch in turn.
for epoch_id in range(10):
    sampler.set_epoch(epoch_id)
COPY-FROM: paddle.io.DistributedBatchSampler.set_epoch