Update to sentence splitting #6

Merged (3 commits, Nov 16, 2023)

This change pins TTS to the coqui-ai/TTS sentence_spliting branch, enables the model's built-in text splitting in the streaming endpoint, drops the decoder option from the server API and test client, and moves the default Docker image from CUDA 11.7 to CUDA 11.8 (the separate Dockerfile.cuda118 and its duplicate CI job are removed).
47 changes: 1 addition & 46 deletions .github/workflows/build-and-push-to-ghcr.yml
@@ -4,7 +4,7 @@ on:
branches: [main]
pull_request:
jobs:
build-and-push-to-ghcr-cuda117:
build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
@@ -49,51 +49,6 @@ jobs:
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
name: Checkout
uses: actions/checkout@v3

-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: 'Login to GitHub Container Registry'
run: |
set -xe
docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io

- name: 'Remove cache'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

- name: Build only for PR cuda 11.8
if: github.ref != 'refs/heads/main'
uses: docker/build-push-action@v5
with:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda118
push: false # Do not push image for PR
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}

- name: Build and Push image cuda 11.8
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v5
with:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda118
push: true # Push if merged
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda118-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda121:
runs-on: ubuntu-22.04
steps:
6 changes: 3 additions & 3 deletions README.md
@@ -32,16 +32,16 @@ $ python test_streaming.py

## Building the container

1. To build the Docker container (Pytorch 2.01 Cuda 11.7) :
1. To build the Docker container Pytorch 2.1 and CUDA 11.8 :

```bash
$ cd server
$ docker build -t xtts-stream .
```
For Pytorch 2.1 and CUDA 11.8 version (when running set NVIDIA_DISABLE_REQUIRE=1 if you have Cuda < 11.8 drivers)
For Pytorch 2.1 and CUDA 12.1 :
```bash
$ cd server
# docker build -t xtts-stream . -f Dockerfile.cuda118
docker build -t xtts-stream . -f Dockerfile.cuda121
```
2. Run the server container:

3 changes: 2 additions & 1 deletion server/Dockerfile
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
@@ -13,6 +13,7 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
RUN python -m unidic download

COPY main.py .
ENV NVIDIA_DISABLE_REQUIRE=1

ENV NUM_THREADS=2
EXPOSE 80
22 changes: 0 additions & 22 deletions server/Dockerfile.cuda118

This file was deleted.

21 changes: 13 additions & 8 deletions server/main.py
@@ -54,7 +54,7 @@ def predict_speaker(wav_file: UploadFile):
temp_audio_name = next(tempfile._get_candidate_names())
with open(temp_audio_name, "wb") as temp, torch.inference_mode():
temp.write(io.BytesIO(wav_file.file.read()).getbuffer())
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
temp_audio_name
)
return {
@@ -110,12 +110,13 @@ class StreamingInputs(BaseModel):
"nl",
"cs",
"ar",
"zh-cn",
"zh",
"ja",
"hu",
"ko",
]
add_wav_header: bool = True
stream_chunk_size: str = "20"
decoder: str = "ne_hifigan"


def predict_streaming_generator(parsed_input: dict = Body(...)):
@@ -127,16 +128,20 @@ def predict_streaming_generator(parsed_input: dict = Body(...)):
)
text = parsed_input.text
language = parsed_input.language
decoder = parsed_input.decoder

if decoder not in ["ne_hifigan","hifigan"]:
decoder = "ne_hifigan"

stream_chunk_size = int(parsed_input.stream_chunk_size)
add_wav_header = parsed_input.add_wav_header


chunks = model.inference_stream(text, language, gpt_cond_latent, speaker_embedding, decoder=decoder,stream_chunk_size=stream_chunk_size)
chunks = model.inference_stream(
text,
language,
gpt_cond_latent,
speaker_embedding,
stream_chunk_size=stream_chunk_size,
enable_text_splitting=True
)

for i, chunk in enumerate(chunks):
chunk = postprocess(chunk)
if i == 0 and add_wav_header:
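For context (not part of the diff), here is a minimal client sketch against the updated streaming endpoint. The text, language, stream_chunk_size, and add_wav_header fields come from the StreamingInputs model above; the server URL, the output filename, and the assumption that default_speaker.json carries the conditioning fields produced by predict_speaker (gpt_cond_latent, speaker_embedding) are illustrative, mirroring test/test_streaming.py further down.

```python
# Hypothetical client for the updated /tts_stream endpoint; the names of the
# conditioning-latent fields are assumed to match what predict_speaker returns.
import json
import requests

with open("./default_speaker.json", "r") as f:
    payload = json.load(f)  # expected to include gpt_cond_latent and speaker_embedding

payload.update({
    "text": "This is a longer request. With text splitting enabled, the model splits it into sentences itself.",
    "language": "en",
    "stream_chunk_size": "20",  # smaller values lower latency but may degrade quality
    "add_wav_header": True,
    # the "decoder" field from the previous API is intentionally omitted
})

with requests.post("http://localhost:8000/tts_stream", json=payload, stream=True) as res:
    res.raise_for_status()
    with open("out.wav", "wb") as out:
        for chunk in res.iter_content(chunk_size=4096):
            out.write(chunk)
```

Since the server now passes enable_text_splitting=True to inference_stream, the caller should no longer need to pre-split long input into sentences.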
2 changes: 1 addition & 1 deletion server/requirements.txt
@@ -1,4 +1,4 @@
TTS==0.20.2
TTS @ git+https://github.com/coqui-ai/TTS@sentence_spliting
uvicorn[standard]==0.23.2
fastapi==0.95.2
deepspeed==0.10.3
27 changes: 14 additions & 13 deletions test/test_streaming.py
@@ -38,11 +38,10 @@ def stream_ffplay(audio_stream, output_file, save=True):
ffplay_proc.wait()


def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]:
def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
start = time.perf_counter()
speaker["text"] = text
speaker["language"] = language
speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0
speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
res = requests.post(
f"{server_url}/tts_stream",
@@ -86,7 +85,6 @@ def get_speaker(ref_audio,server_url):
default="en",
help="Language to use default is 'en' (English)"
)

parser.add_argument(
"--output_file",
default=None,
@@ -102,25 +100,28 @@ def get_speaker(ref_audio,server_url):
default="http://localhost:8000",
help="Server url http://localhost:8000 default, change to your server location "
)
parser.add_argument(
"--decoder",
default="ne_hifigan",
help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan"
)

parser.add_argument(
"--stream_chunk_size",
default="20",
help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality"
)

args = parser.parse_args()

with open("./default_speaker.json", "r") as file:
speaker = json.load(file)

if args.ref_file is not None:
print("Computing the latents for a new reference...")
speaker = get_speaker(args.ref_file,args.server_url)

audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file))
speaker = get_speaker(args.ref_file, args.server_url)

audio = stream_ffplay(
tts(
args.text,
speaker,
args.language,
args.server_url,
args.stream_chunk_size
),
args.output_file,
save=bool(args.output_file)
)