diff --git a/README.md b/README.md
index 06d3104eb..c4a8dc4a7 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,8 @@ We also provide a some command line based examples using state of the art models
   model using residual vector quantization.
 - [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
   text-to-speech.
+- [Parler-TTS](./candle-examples/examples/parler-tts/): large text-to-speech
+  model.
 - [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
   [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
@@ -236,6 +238,7 @@ If you have an addition to this list, please submit a pull request.
   - Whisper, multi-lingual speech-to-text.
   - EnCodec, audio compression model.
   - MetaVoice-1B, text-to-speech model.
+  - Parler-TTS, text-to-speech model.
 - Computer Vision Models.
   - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
     ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4, Hiera.
diff --git a/candle-examples/examples/parler-tts/README.md b/candle-examples/examples/parler-tts/README.md
new file mode 100644
index 000000000..837399443
--- /dev/null
+++ b/candle-examples/examples/parler-tts/README.md
@@ -0,0 +1,21 @@
+# candle-parler-tts
+
+[Parler-TTS](https://huggingface.co/parler-tts/parler-tts-large-v1) is a large
+text-to-speech model with 2.2B parameters trained on ~45K hours of audio data.
+The voice can be controlled by a text prompt.
+
+## Run an example
+
+```bash
+cargo run --example parler-tts -r -- \
+  --prompt "Hey, how are you doing today?"
+```
+
+To specify a prompt for the voice, use the `--description` argument.
+```bash
+cargo run --example parler-tts -r -- \
+  --prompt "Hey, how are you doing today?" \
+  --description "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+```
+
+https://github.com/huggingface/candle/raw/main/candle-examples/examples/parler-tts/hello.mp4
diff --git a/candle-examples/examples/parler-tts/decode.py b/candle-examples/examples/parler-tts/decode.py
deleted file mode 100644
index 8942d32ee..000000000
--- a/candle-examples/examples/parler-tts/decode.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import torch
-import torchaudio
-from safetensors.torch import load_file
-from parler_tts import DACModel
-
-tensors = load_file("out.safetensors")
-dac_model = DACModel.from_pretrained("parler-tts/dac_44khZ_8kbps")
-print(dac_model.model)
-output_ids = tensors["codes"][None, None]
-print(output_ids, "\n", output_ids.shape)
-batch_size = 1
-with torch.no_grad():
-    output_values = []
-    for sample_id in range(batch_size):
-        sample = output_ids[:, sample_id]
-        sample_mask = (sample >= dac_model.config.codebook_size).sum(dim=(0, 1)) == 0
-        if sample_mask.sum() > 0:
-            sample = sample[:, :, sample_mask]
-            sample = dac_model.decode(sample[None, ...], [None]).audio_values
-            output_values.append(sample.transpose(0, 2))
-        else:
-            output_values.append(torch.zeros((1, 1, 1)).to(dac_model.device))
-    output_lengths = [audio.shape[0] for audio in output_values]
-    pcm = (
-        torch.nn.utils.rnn.pad_sequence(output_values, batch_first=True, padding_value=0)
-        .squeeze(-1)
-        .squeeze(-1)
-    )
-print(pcm.shape, pcm.dtype)
-torchaudio.save("out.wav", pcm.cpu(), sample_rate=44100)
diff --git a/candle-examples/examples/parler-tts/hello.mp4 b/candle-examples/examples/parler-tts/hello.mp4
new file mode 100644
index 000000000..994316db2
Binary files /dev/null and b/candle-examples/examples/parler-tts/hello.mp4 differ
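The new README links a sample clip (`hello.mp4`) produced by the example. As a convenience, here is a minimal sketch for fetching and listening to it locally; it is not part of the change above and assumes `curl` and ffmpeg's `ffplay` are installed.

```bash
# Hypothetical helper, not part of this diff: download the sample output
# linked in the README and play it (requires curl and ffmpeg's ffplay).
curl -L -o hello.mp4 \
  https://github.com/huggingface/candle/raw/main/candle-examples/examples/parler-tts/hello.mp4
ffplay -autoexit hello.mp4
```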