Update to sentence splitting #6

Merged (3 commits, Nov 16, 2023)

This change pins TTS to the coqui-ai/TTS sentence_spliting branch, enables the model's built-in text splitting in the streaming endpoint, drops the decoder option from the server API and test client, and moves the default Docker image from CUDA 11.7 to CUDA 11.8 (the separate Dockerfile.cuda118 and its duplicate CI job are removed).
47 changes: 1 addition & 46 deletions .github/workflows/build-and-push-to-ghcr.yml
@@ -4,7 +4,7 @@ on:
branches: [main]
pull_request:
jobs:
build-and-push-to-ghcr-cuda117:
build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
@@ -49,51 +49,6 @@ jobs:
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
name: Checkout
uses: actions/checkout@v3

-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: 'Login to GitHub Container Registry'
run: |
set -xe
docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io

- name: 'Remove cache'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

- name: Build only for PR cuda 11.8
if: github.ref != 'refs/heads/main'
uses: docker/build-push-action@v5
with:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda118
push: false # Do not push image for PR
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}

- name: Build and Push image cuda 11.8
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v5
with:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda118
push: true # Push if merged
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda118-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda121:
runs-on: ubuntu-22.04
steps:
6 changes: 3 additions & 3 deletions README.md
@@ -32,16 +32,16 @@ $ python test_streaming.py

## Building the container

1. To build the Docker container (Pytorch 2.01 Cuda 11.7) :
1. To build the Docker container Pytorch 2.1 and CUDA 11.8 :

```bash
$ cd server
$ docker build -t xtts-stream .
```
For Pytorch 2.1 and CUDA 11.8 version (when running set NVIDIA_DISABLE_REQUIRE=1 if you have Cuda < 11.8 drivers)
For Pytorch 2.1 and CUDA 12.1 :
```bash
$ cd server
# docker build -t xtts-stream . -f Dockerfile.cuda118
docker build -t xtts-stream . -f Dockerfile.cuda121
```
2. Run the server container:

3 changes: 2 additions & 1 deletion server/Dockerfile
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
@@ -13,6 +13,7 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
RUN python -m unidic download

COPY main.py .
ENV NVIDIA_DISABLE_REQUIRE=1

ENV NUM_THREADS=2
EXPOSE 80
22 changes: 0 additions & 22 deletions server/Dockerfile.cuda118

This file was deleted.

21 changes: 13 additions & 8 deletions server/main.py
@@ -54,7 +54,7 @@ def predict_speaker(wav_file: UploadFile):
temp_audio_name = next(tempfile._get_candidate_names())
with open(temp_audio_name, "wb") as temp, torch.inference_mode():
temp.write(io.BytesIO(wav_file.file.read()).getbuffer())
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
temp_audio_name
)
return {
@@ -110,12 +110,13 @@ class StreamingInputs(BaseModel):
"nl",
"cs",
"ar",
"zh-cn",
"zh",
"ja",
"hu",
"ko",
]
add_wav_header: bool = True
stream_chunk_size: str = "20"
decoder: str = "ne_hifigan"


def predict_streaming_generator(parsed_input: dict = Body(...)):
@@ -127,16 +128,20 @@ def predict_streaming_generator(parsed_input: dict = Body(...)):
)
text = parsed_input.text
language = parsed_input.language
decoder = parsed_input.decoder

if decoder not in ["ne_hifigan","hifigan"]:
decoder = "ne_hifigan"

stream_chunk_size = int(parsed_input.stream_chunk_size)
add_wav_header = parsed_input.add_wav_header


chunks = model.inference_stream(text, language, gpt_cond_latent, speaker_embedding, decoder=decoder,stream_chunk_size=stream_chunk_size)
chunks = model.inference_stream(
text,
language,
gpt_cond_latent,
speaker_embedding,
stream_chunk_size=stream_chunk_size,
enable_text_splitting=True
)

for i, chunk in enumerate(chunks):
chunk = postprocess(chunk)
if i == 0 and add_wav_header:
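For context (not part of the diff), here is a minimal client sketch against the updated streaming endpoint. The text, language, stream_chunk_size, and add_wav_header fields come from the StreamingInputs model above; the server URL, the output filename, and the assumption that default_speaker.json carries the conditioning fields produced by predict_speaker (gpt_cond_latent, speaker_embedding) are illustrative, mirroring test/test_streaming.py further down.

```python
# Hypothetical client for the updated /tts_stream endpoint; the names of the
# conditioning-latent fields are assumed to match what predict_speaker returns.
import json
import requests

with open("./default_speaker.json", "r") as f:
    payload = json.load(f)  # expected to include gpt_cond_latent and speaker_embedding

payload.update({
    "text": "This is a longer request. With text splitting enabled, the model splits it into sentences itself.",
    "language": "en",
    "stream_chunk_size": "20",  # smaller values lower latency but may degrade quality
    "add_wav_header": True,
    # the "decoder" field from the previous API is intentionally omitted
})

with requests.post("http://localhost:8000/tts_stream", json=payload, stream=True) as res:
    res.raise_for_status()
    with open("out.wav", "wb") as out:
        for chunk in res.iter_content(chunk_size=4096):
            out.write(chunk)
```

Since the server now passes enable_text_splitting=True to inference_stream, the caller should no longer need to pre-split long input into sentences.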
2 changes: 1 addition & 1 deletion server/requirements.txt
@@ -1,4 +1,4 @@
TTS==0.20.2
TTS @ git+https://github.com/coqui-ai/TTS@sentence_spliting
uvicorn[standard]==0.23.2
fastapi==0.95.2
deepspeed==0.10.3
27 changes: 14 additions & 13 deletions test/test_streaming.py
@@ -38,11 +38,10 @@ def stream_ffplay(audio_stream, output_file, save=True):
ffplay_proc.wait()


def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]:
def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
start = time.perf_counter()
speaker["text"] = text
speaker["language"] = language
speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0
speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
res = requests.post(
f"{server_url}/tts_stream",
@@ -86,7 +85,6 @@ def get_speaker(ref_audio,server_url):
default="en",
help="Language to use default is 'en' (English)"
)

parser.add_argument(
"--output_file",
default=None,
@@ -102,25 +100,28 @@ def get_speaker(ref_audio,server_url):
default="http://localhost:8000",
help="Server url http://localhost:8000 default, change to your server location "
)
parser.add_argument(
"--decoder",
default="ne_hifigan",
help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan"
)

parser.add_argument(
"--stream_chunk_size",
default="20",
help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality"
)

args = parser.parse_args()

with open("./default_speaker.json", "r") as file:
speaker = json.load(file)

if args.ref_file is not None:
print("Computing the latents for a new reference...")
speaker = get_speaker(args.ref_file,args.server_url)

audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file))
speaker = get_speaker(args.ref_file, args.server_url)

audio = stream_ffplay(
tts(
args.text,
speaker,
args.language,
args.server_url,
args.stream_chunk_size
),
args.output_file,
save=bool(args.output_file)
)