From 6ecf94489fec708dcb7db28656f2dc4c31ad3d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=99=BD=E5=8F=B6=20=E8=97=A4=E5=8E=9F?= <1751842477@qq.com> Date: Sat, 11 Nov 2023 18:26:10 +0800 Subject: [PATCH] =?UTF-8?q?=E5=86=8D=E6=AC=A1=E6=94=AF=E6=8C=81VITS=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=B9BertVits=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Lib/MJson/MJson.h | 17 +- .../LibDLVoiceCodec/base.h | 52 +- .../LibDLVoiceCodec/operator.cpp | 7 + .../LibDLVoiceCodec/operator.h | 27 + .../LibDLVoiceCodec/value.cpp | 216 +++++- .../LibDLVoiceCodec/value.h | 160 +++- .../BaseF0Extractor/BaseF0Extractor.hpp | 8 +- .../DioF0Extractor/DioF0Extractor.cpp | 4 +- .../DioF0Extractor/DioF0Extractor.hpp | 4 +- .../F0Extractor/F0ExtractorManager.cpp | 4 +- .../F0Extractor/F0ExtractorManager.hpp | 4 +- .../HarvestF0Extractor/HarvestF0Extractor.cpp | 4 +- .../HarvestF0Extractor/HarvestF0Extractor.hpp | 4 +- .../NetF0Predictors/NetF0Predictors.cpp | 4 +- .../NetF0Predictors/NetF0Predictors.hpp | 4 +- .../Modules/InferTools/G2P/MoeVSG2P.cpp | 644 ++++++++++++++++ .../Modules/InferTools/G2P/MoeVSG2P.hpp | 254 +++++++ .../Modules/Models/header/DiffSvc.hpp | 14 + .../Modules/Models/header/ModelBase.hpp | 6 +- .../Modules/Models/header/MoeVSProject.hpp | 64 +- .../Modules/Models/header/TTS.hpp | 151 ++++ .../Modules/Models/header/Tacotron.hpp | 39 + .../Modules/Models/header/Vits.hpp | 86 +++ .../Modules/Models/header/VitsSvc.hpp | 4 + .../Modules/Models/src/DiffSvc.cpp | 127 ++++ .../Modules/Models/src/ModelBase.cpp | 4 +- .../Modules/Models/src/TTS.cpp | 321 ++++++++ .../Modules/Models/src/Vits.cpp | 699 ++++++++++++++++++ .../Modules/Models/src/VitsSvc.cpp | 142 ++++ .../Modules/Modules.cpp | 7 + .../Modules/Modules.hpp | 1 + .../MoeVoiceStudioSvc - Core - Cmd.cpp | 388 +--------- .../MoeVoiceStudioSvc - Core - Cmd.vcxproj | 13 + ...oiceStudioSvc - Core - Cmd.vcxproj.filters | 93 ++- 34 files changed, 3128 insertions(+), 448 deletions(-) create mode 100644 MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.cpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.h create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.hpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Tacotron.hpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/TTS.cpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/Vits.cpp diff --git a/Lib/MJson/MJson.h b/Lib/MJson/MJson.h index 4e673b4..fd2497f 100644 --- a/Lib/MJson/MJson.h +++ b/Lib/MJson/MJson.h @@ -130,7 +130,8 @@ class MJsonValue { if (!IsArray() && !IsString()) return true; - const auto _max = yyjson_arr_size(_Ptr); + auto _max = yyjson_arr_size(_Ptr); + if (IsString()) _max = yyjson_get_len(_Ptr); return !_max; } [[nodiscard]] size_t GetMemberCount() const @@ -148,6 +149,10 @@ class MJsonValue } return ret; } + [[nodiscard]] bool HasMember(const std::string& _key) const + { + return yyjson_obj_get(_Ptr, _key.c_str()); + } private: yyjson_val* _Ptr = nullptr; }; @@ -163,6 +168,16 @@ class MJson throw std::exception("Json Parse Error !"); root = yyjson_doc_get_root(_document); } + MJson(const std::string& _data, bool _read_from_string) + { + if 
(_read_from_string) + _document = yyjson_read(_data.c_str(), _data.length(), YYJSON_READ_NOFLAG); + else + _document = yyjson_read_file(_data.c_str(), YYJSON_READ_NOFLAG, nullptr, nullptr); + if (!_document) + throw std::exception("Json Parse Error !"); + root = yyjson_doc_get_root(_document); + } ~MJson() { if(_document) diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h index 0fd95b4..9abea1f 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h @@ -3,6 +3,7 @@ #include #include #include +#include #define LibDLVoiceCodecBegin namespace libdlvcodec { #define LibDLVoiceCodecEnd } #define LIBDVCND [[nodiscard]] @@ -24,6 +25,12 @@ using uint16 = uint16_t; using uint32 = uint32_t; using uint64 = uint64_t; +class TensorView; +class Tensor; + +const std::unordered_map __Dtype {{"int8", 1}, { "int16", 2 }, { "int32", 4 }, { "int64", 8 }, + { "float8", 1 }, { "float16", 2 }, { "bfloat16", 2 }, { "float32", 4 }, { "float64", 8 }, { "bool", 1 } }; + template class BaseAllocator { @@ -54,12 +61,16 @@ class MResource { data_ = allocator_.allocate(_Count * 2); size_ = _Count; + this_ = data_ + _Count; + end_ = data_ + _Count * 2; } MResource(size_t _Count, Type _Value) { data_ = allocator_.allocate(_Count * 2); size_ = _Count; + this_ = data_ + _Count; + end_ = data_ + _Count * 2; auto _ptr = data_; const auto _end = data_ + size_; while (_ptr != _end) @@ -73,12 +84,16 @@ class MResource { data_ = _Ptr; size_ = _Size; + this_ = data_ + _Size; + end_ = data_ + _Size; } MResource(const MResource& _Left) { size_ = _Left.size_; - data_ = allocator_.allocate(_Left.size_); + data_ = allocator_.allocate(_Left.capacity()); + this_ = data_ + size_; + end_ = data_ + _Left.capacity(); auto _ptr = data_, _ptrl = _Left.data_; const auto _end = data_ + size_; while (_ptr != _end) @@ -93,6 +108,8 @@ class MResource { size_ = _Right.size_; data_ = _Right.data_; + this_ = _Right.this_; + end_ = _Right.end_; _Right.size_ = 0ull; _Right.data_ = nullptr; } @@ -118,7 +135,7 @@ class MResource LIBDVCND ptr_t end() const { - return data_ + size_; + return this_; } ptr_t release() @@ -134,12 +151,37 @@ class MResource return *(data_ + _Index); } + template + reference at(size_t _Index) const + { + assert(_Index * sizeof(__Ty) < size_); + return *((__Ty*)data_ + _Index); + } + + reference at(size_t _Index) const + { + assert(_Index < size_); + return *(data_ + _Index); + } + + LIBDVCND size_t size() const + { + return size_; + } + + LIBDVCND size_t capacity() const + { + return end_ - data_; + } + MResource& operator=(const MResource& _Left) { if (&_Left == this) return *this; size_ = _Left.size_; - data_ = allocator_.allocate(_Left.size_); + data_ = allocator_.allocate(_Left.capacity()); + this_ = data_ + size_; + end_ = data_ + _Left.capacity(); auto _ptr = data_, _ptrl = _Left.data_; const auto _end = data_ + size_; while (_ptr != _end) @@ -155,12 +197,16 @@ class MResource { size_ = _Right.size_; data_ = _Right.data_; + this_ = _Right.this_; + end_ = _Right.end_; _Right.size_ = 0ull; _Right.data_ = nullptr; return *this; } protected: ptr_t data_ = nullptr; + ptr_t this_ = nullptr; + ptr_t end_ = nullptr; size_t size_ = 0ull; Allocator allocator_; }; diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.cpp b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.cpp new file mode 100644 index 0000000..4e4201d --- /dev/null +++ b/MoeVoiceStudioSvc - Core - 
Cmd/LibDLVoiceCodec/operator.cpp @@ -0,0 +1,7 @@ +#include "operator.h" +#include "value.h" +#include + +LibDLVoiceCodecBegin + +LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.h b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.h new file mode 100644 index 0000000..00c7441 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/operator.h @@ -0,0 +1,27 @@ +#pragma once +#include "base.h" + +LibDLVoiceCodecBegin +Tensor equal(const Tensor& _A, const Tensor& _B); +Tensor add(const Tensor& _A, const Tensor& _B); +Tensor sub(const Tensor& _A, const Tensor& _B); +Tensor mul(const Tensor& _A, const Tensor& _B); +Tensor div(const Tensor& _A, const Tensor& _B); +void selfAdd(Tensor& _Self, const Tensor& _O); +void selfSub(Tensor& _Self, const Tensor& _O); +void selfMul(Tensor& _Self, const Tensor& _O); +void selfDiv(Tensor& _Self, const Tensor& _O); +Tensor matmul(const Tensor& _A, const Tensor& _B); +Tensor conv1d(const Tensor& _Input, const Tensor& _Weight, const Tensor& _Bias, + int64 _Stride = 1, int64 _Padding = 0, int64 _Dilation = 1, int64 _Groups = 1); +Tensor conv2d(const Tensor& _Input, const Tensor& _Weight, const Tensor& _Bias, + int64 _Stride = 1, int64 _Padding = 0, int64 _Dilation = 1, int64 _Groups = 1); +Tensor conv3d(const Tensor& _Input, const Tensor& _Weight, const Tensor& _Bias, + int64 _Stride = 1, int64 _Padding = 0, int64 _Dilation = 1, int64 _Groups = 1); +Tensor conv_transpose1d(const Tensor& _Input, const Tensor& _Weight, const Tensor& _Bias, + int64 _Stride = 1, int64 _Padding = 0, int64 _OutputPadding = 0, int64 _Dilation = 1, int64 _Groups = 1); +Tensor conv_transpose2d(const Tensor& _Input, const Tensor& _Weight, const Tensor& _Bias, + int64 _Stride = 1, int64 _Padding = 0, int64 _OutputPadding = 0, int64 _Dilation = 1, int64 _Groups = 1); +Tensor conv_transpose3d(const Tensor& _Input, const Tensor& _Weight, const Tensor& _Bias, + int64 _Stride = 1, int64 _Padding = 0, int64 _OutputPadding = 0, int64 _Dilation = 1, int64 _Groups = 1); +LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp index 5449b70..05c6d5f 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp @@ -1,7 +1,6 @@ #include "value.h" LibDLVoiceCodecBegin - Value& Value::load(const std::wstring& _Path) { FileWrapper file; @@ -86,7 +85,7 @@ void Value::saveData(FileWrapper& _File) LibDLVoiceCodecThrow("Not implemented error!"); } -void Tensor::loadData(WeightDict& _Dict) +void TensorData::loadData(WeightDict& _Dict) { const auto res = _Dict.find(RegName_); if (res != _Dict.end()) @@ -95,17 +94,103 @@ void Tensor::loadData(WeightDict& _Dict) size_t TotalSize = 1; for (const auto i : Shape_) TotalSize *= i; - if (TotalSize * sizeof(DType) != res->second.Size) + if (TotalSize * __Dtype.at(Type_) != res->second.Size) LibDLVoiceCodecThrow("Expected size does not match actual size!"); Data_ = std::move(res->second.Data); + DataPtr_ = Data_.data(); } } -void Tensor::saveData(FileWrapper& _File) +void TensorData::saveData(FileWrapper& _File) { } +TensorView TensorData::operator[](int64_t index) const +{ + if (index < 0) + { + if (index < -Shape_[0]) + LibDLVoiceCodecThrow("Index Out Of Range"); + index += Shape_[0]; + std::vector NewShape{Shape_.begin() + 1, Shape_.end()}; + if (NewShape.empty()) + NewShape.emplace_back(1); + 
return { std::move(NewShape) ,DataPtr_ + index * (step() * __Dtype.at(Type_)) }; + } + if (index > Shape_[0]) + LibDLVoiceCodecThrow("Index Out Of Range"); + std::vector NewShape{Shape_.begin() + 1, Shape_.end()}; + if (NewShape.empty()) + NewShape.emplace_back(1); + return { std::move(NewShape) ,DataPtr_ + index * (step() * __Dtype.at(Type_)) }; +} + +template +TensorData& TensorData::operator=(const _TypeName& _Val) +{ + assert(sizeof(_TypeName) == __Dtype.at(Type_)); + if(Type_ == "int8") + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (int8)_Val; + } + else if(Type_ == "int16") + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (int16)_Val; + } + else if (Type_ == "int32") + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (int32)_Val; + } + else if (Type_ == "int64") + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (int64)_Val; + } + else if (Type_ == "float8") + { + } + else if (Type_ == "float16") + { + } + else if (Type_ == "bfloat16") + { + } + else if (Type_ == "float32") + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (float32)_Val; + } + else if (Type_ == "float64") + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (float64)_Val; + } + else + { + auto it = begin(); + const auto en = end(); + while (it != en) + *(it++) = (bool)_Val; + } + return *this; +} + void Module::loadData(WeightDict& _Dict) { for(const auto& it : Layers_) @@ -122,4 +207,127 @@ void Module::saveData(FileWrapper& _File) } } +Tensor::Tensor(const std::initializer_list& _Shape, const std::string& _Dtype, const std::string& _Name, bool _TensorLayer) +{ + if (__Dtype.find(_Dtype) == __Dtype.end()) + LibDLVoiceCodecThrow("DType Not Recognized"); + Shape_ = _Shape; + size_t TotalSize = 1; + for (const auto i : Shape_) + TotalSize *= i; + Data_ = MResource(__Dtype.at(_Dtype) * TotalSize); + RegName_ = _Name; + TensorLayer_ = _TensorLayer; + DataPtr_ = Data_.data(); + Type_ = _Dtype; +} + +Tensor::Tensor(const std::vector& _Shape, const std::string& _Dtype, const std::string& _Name, bool _TensorLayer) +{ + if (__Dtype.find(_Dtype) == __Dtype.end()) + LibDLVoiceCodecThrow("DType Not Recognized"); + Shape_ = _Shape; + size_t TotalSize = 1; + for (const auto i : Shape_) + TotalSize *= i; + Data_ = MResource(__Dtype.at(_Dtype) * TotalSize); + RegName_ = _Name; + TensorLayer_ = _TensorLayer; + DataPtr_ = Data_.data(); + Type_ = _Dtype; +} + +Tensor::Tensor(const Tensor& _Left) +{ + Shape_ = _Left.Shape_; + size_t TotalSize = 1; + for (const auto i : Shape_) + TotalSize *= i; + Data_ = MResource(__Dtype.at(_Left.Type_) * TotalSize); + RegName_ = _Left.RegName_; + TensorLayer_ = _Left.TensorLayer_; + DataPtr_ = Data_.data(); + Type_ = _Left.Type_; +} + +Tensor::Tensor(Tensor&& _Right) noexcept +{ + Shape_ = _Right.Shape_; + Data_ = std::move(_Right.Data_); + DataPtr_ = Data_.data(); + TensorLayer_ = _Right.TensorLayer_; + Type_ = _Right.Type_; + RegName_ = _Right.RegName_; +} + +Tensor& Tensor::operator=(const Tensor& _Left) +{ + if (&_Left == this) + return *this; + Shape_ = _Left.Shape_; + size_t TotalSize = 1; + for (const auto i : Shape_) + TotalSize *= i; + Data_ = MResource(__Dtype.at(_Left.Type_) * TotalSize); + RegName_ = _Left.RegName_; + TensorLayer_ = _Left.TensorLayer_; + DataPtr_ = Data_.data(); + Type_ = _Left.Type_; + return *this; +} + +Tensor& Tensor::operator=(Tensor&& _Right) noexcept +{ + Shape_ = _Right.Shape_; + 
Data_ = std::move(_Right.Data_); + DataPtr_ = Data_.data(); + TensorLayer_ = _Right.TensorLayer_; + Type_ = _Right.Type_; + RegName_ = _Right.RegName_; + return *this; +} + +Tensor Tensor::zeros(const std::initializer_list& _Shape, const std::string& _Dtype, const std::string& _Name, bool _TensorLayer) +{ + Tensor Output{ _Shape ,_Dtype, _Name, _TensorLayer }; + memset(Output.Data_.data(), 0, Output.Data_.size()); + return Output; +} + +Tensor Tensor::zeros_like(const Tensor& _O, bool _TensorLayer) +{ + Tensor Output{ _O.shape() ,_O.dtype(), _O.RegName_, _TensorLayer }; + memset(Output.Data_.data(), 0, Output.Data_.size()); + return Output; +} + +Tensor Tensor::ones(const std::initializer_list& _Shape, const std::string& _Dtype, const std::string& _Name, bool _TensorLayer) +{ + return {}; +} + +Tensor Tensor::ones_like(const Tensor& _O, bool _TensorLayer) +{ + return {}; +} + +Tensor Tensor::rand(const std::initializer_list& _Shape, int _Seed, const std::string& _Dtype, const std::string& _Name, bool _TensorLayer) +{ + return {}; +} + +Tensor Tensor::rand_like(const Tensor& _O, int _Seed, bool _TensorLayer) +{ + return {}; +} + +Tensor Tensor::randn(const std::initializer_list& _Shape, int _Seed, const std::string& _Dtype, const std::string& _Name, bool _TensorLayer) +{ + return {}; +} + +Tensor Tensor::randn_like(const Tensor& _O, int _Seed, bool _TensorLayer) +{ + return {}; +} LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h index a2a9775..72b27f2 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h @@ -1,5 +1,4 @@ #pragma once -#include #include #include "base.h" @@ -19,8 +18,9 @@ class Value { public: Value() = default; + Value(const Value& _Left) = delete; virtual ~Value() = default; - using WeightDict = std::map; + using WeightDict = std::unordered_map; protected: std::string RegName_; @@ -46,32 +46,168 @@ class Module : public Value else RegName_ = _Name; } - + ~Module() override = default; private: - std::map Layers_; + std::unordered_map Layers_; public: void loadData(WeightDict& _Dict) override; void saveData(FileWrapper& _File) override; }; -class Tensor : Value +class TensorData : public Value { public: - using DType = float; - Tensor(const std::string& _Name = "Tensor") - { - RegName_ = _Name; - TensorLayer_ = false; - } + TensorData() = default; + TensorData(const TensorData& _Left) = delete; + TensorData(TensorData&& _Right) = delete; + ~TensorData() override = default; protected: std::vector Shape_; - MResource Data_; bool TensorLayer_ = false; + std::string Type_ = "float32"; + public: void loadData(WeightDict& _Dict) override; void saveData(FileWrapper& _File) override; + +protected: + MResource Data_; + +public: + LIBDVCND const std::string& dtype() const { return Type_; } + LIBDVCND const std::vector& shape() const { return Shape_; } + LIBDVCND size_t size() const { + if (Shape_.empty()) return 0; + return Shape_[0]; + } + LIBDVCND size_t total_size() const { + if (Shape_.empty()) return 0; + size_t ttsize = 1; + for (const auto i : Shape_) + ttsize *= i; + return ttsize; + } + LIBDVCND size_t buf_size() const { + return total_size() * __Dtype.at(Type_); + } + LIBDVCND size_t step() const { + if (Shape_.empty()) return 0; + return total_size() / Shape_[0]; + } + LIBDVCND byte* data() const { return DataPtr_; } + template + LIBDVCND _ValueType& item() + { + 
assert(sizeof(_ValueType) == __Dtype.at(Type_)); + return *(_ValueType*)(DataPtr_); + } + template + LIBDVCND _ValueType* begin() + { + assert(sizeof(_ValueType) == __Dtype.at(Type_)); + return (_ValueType*)(DataPtr_); + } + template + LIBDVCND _ValueType* end() + { + assert(sizeof(_ValueType) == __Dtype.at(Type_)); + return (_ValueType*)(DataPtr_)+total_size(); + } + +protected: + byte* DataPtr_ = nullptr; + +public: + LIBDVCND TensorView operator[](int64_t index) const; + template + LIBDVCND TensorData& operator=(const _TypeName& _Val); +}; + +class Tensor : public TensorData +{ +public: + using DType = float; + Tensor() = default; + Tensor(const std::initializer_list& _Shape, const std::string& _Dtype = "float32", const std::string& _Name = "Tensor", bool _TensorLayer = false); + Tensor(const std::vector& _Shape, const std::string& _Dtype = "float32", const std::string& _Name = "Tensor", bool _TensorLayer = false); + Tensor(const Tensor& _Left); + Tensor(Tensor&& _Right) noexcept; + ~Tensor() override = default; + Tensor& operator=(const Tensor& _Left); + Tensor& operator=(Tensor&& _Right) noexcept; + + static Tensor zeros(const std::initializer_list& _Shape, const std::string& _Dtype = "float32", const std::string& _Name = "Tensor", bool _TensorLayer = false); + static Tensor zeros_like(const Tensor& _O, bool _TensorLayer = false); + static Tensor ones(const std::initializer_list& _Shape, const std::string& _Dtype = "float32", const std::string& _Name = "Tensor", bool _TensorLayer = false); + static Tensor ones_like(const Tensor& _O, bool _TensorLayer = false); + static Tensor rand(const std::initializer_list& _Shape, int _Seed = 114514, const std::string& _Dtype = "float32", const std::string& _Name = "Tensor", bool _TensorLayer = false); + static Tensor rand_like(const Tensor& _O, int _Seed = 114514, bool _TensorLayer = false); + static Tensor randn(const std::initializer_list& _Shape, int _Seed = 114514, const std::string& _Dtype = "float32", const std::string& _Name = "Tensor", bool _TensorLayer = false); + static Tensor randn_like(const Tensor& _O, int _Seed = 114514, bool _TensorLayer = false); +}; + +class TensorView : public TensorData +{ +public: + TensorView() = default; + ~TensorView() override = default; + TensorView(const Tensor& _T) + { + Shape_ = _T.shape(); + DataPtr_ = _T.data(); + } + TensorView(Tensor&& _T) = delete; + TensorView(const TensorView& _T) + { + Shape_ = _T.shape(); + DataPtr_ = _T.data(); + } + TensorView(TensorView&& _T) noexcept + { + Shape_ = _T.shape(); + DataPtr_ = _T.data(); + } + TensorView(const std::initializer_list& _Shape, byte* _DataPtr) + { + Shape_ = _Shape; + DataPtr_ = _DataPtr; + } + TensorView(const std::vector& _Shape, byte* _DataPtr) + { + Shape_ = _Shape; + DataPtr_ = _DataPtr; + } + TensorView(std::vector&& _Shape, byte* _DataPtr) + { + Shape_ = _Shape; + DataPtr_ = _DataPtr; + } + TensorView& operator=(const TensorView& _Left) + { + DataPtr_ = _Left.DataPtr_; + Shape_ = _Left.Shape_; + return *this; + } + TensorView& operator=(TensorView&& _Right) noexcept + { + DataPtr_ = _Right.DataPtr_; + Shape_ = _Right.Shape_; + return *this; + } + TensorView& operator=(const Tensor& _T) + { + Shape_ = _T.shape(); + DataPtr_ = _T.data(); + return *this; + } + TensorView& operator=(Tensor&& _T) + { + Shape_ = _T.shape(); + DataPtr_ = _T.data(); + return *this; + } }; LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/BaseF0Extractor/BaseF0Extractor.hpp b/MoeVoiceStudioSvc - 
Core - Cmd/Modules/InferTools/F0Extractor/BaseF0Extractor/BaseF0Extractor.hpp index 555b366..d4cf1d7 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/BaseF0Extractor/BaseF0Extractor.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/BaseF0Extractor/BaseF0Extractor.hpp @@ -23,10 +23,10 @@ #include #include #include -#define MOEVSFOEXTRACTORHEADER namespace MoeVSF0Extractor{ -#define MOEVSFOEXTRACTOREND } +#define MoeVoiceStudioF0ExtractorHeader namespace MoeVSF0Extractor{ +#define MoeVoiceStudioF0ExtractorEnd } -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader #define __NAME__MOEVS(x) std::wstring ClassName = (x) class BaseF0Extractor { @@ -82,4 +82,4 @@ class BaseF0Extractor double f0_mel_max; }; #undef __NAME__MOEVS -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.cpp index 8f1f455..b4054c6 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.cpp @@ -3,7 +3,7 @@ #include "stonemask.h" #include "matlabfunctions.h" -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader DioF0Extractor::DioF0Extractor(int sampling_rate, int hop_size, int n_f0_bins, double max_f0, double min_f0): BaseF0Extractor(sampling_rate, hop_size, n_f0_bins, max_f0, min_f0) { @@ -58,4 +58,4 @@ void DioF0Extractor::compute_f0(const double* PCMData, size_t PCMLen) Dio(PCMData, (int)PCMLen, int(fs), &Doption, temporal_positions.data(), raw_f0.data()); StoneMask(PCMData, (int)PCMLen, int(fs), temporal_positions.data(), raw_f0.data(), (int)f0Length, refined_f0.data()); } -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.hpp index 06d3ade..0f908ea 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/DioF0Extractor/DioF0Extractor.hpp @@ -22,7 +22,7 @@ #pragma once #include "../BaseF0Extractor/BaseF0Extractor.hpp" -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader class DioF0Extractor : public BaseF0Extractor { public: @@ -38,4 +38,4 @@ class DioF0Extractor : public BaseF0Extractor private: std::vector refined_f0; }; -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.cpp index 6c1510c..b0d977f 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.cpp @@ -3,7 +3,7 @@ #include #include "../../Logger/MoeSSLogger.hpp" -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader std::map RegisteredF0Extractors; F0Extractor GetF0Extractor(const std::wstring& _name, @@ -38,4 +38,4 @@ std::vector GetF0ExtractorList() return 
F0ExtractorsVec; } -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.hpp index e409f0e..8f818c1 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/F0ExtractorManager.hpp @@ -23,7 +23,7 @@ #include "BaseF0Extractor/BaseF0Extractor.hpp" #include -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader class F0Extractor { @@ -81,4 +81,4 @@ F0Extractor GetF0Extractor(const std::wstring& _name, std::vector GetF0ExtractorList(); -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.cpp index 7b4a215..17d2452 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.cpp @@ -3,7 +3,7 @@ #include "harvest.h" #include "stonemask.h" -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader HarvestF0Extractor::HarvestF0Extractor(int sampling_rate, int hop_size, int n_f0_bins, double max_f0, double min_f0): BaseF0Extractor(sampling_rate, hop_size, n_f0_bins, max_f0, min_f0) { @@ -58,4 +58,4 @@ void HarvestF0Extractor::compute_f0(const double* PCMData, size_t PCMLen) Harvest(PCMData, (int)PCMLen, int(fs), &Doption, temporal_positions.data(), raw_f0.data()); StoneMask(PCMData, (int)PCMLen, int(fs), temporal_positions.data(), raw_f0.data(), (int)f0Length, refined_f0.data()); } -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.hpp index f14b930..e1c678d 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/HarvestF0Extractor/HarvestF0Extractor.hpp @@ -22,7 +22,7 @@ #pragma once #include "../BaseF0Extractor/BaseF0Extractor.hpp" -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader class HarvestF0Extractor : public BaseF0Extractor { public: @@ -39,4 +39,4 @@ class HarvestF0Extractor : public BaseF0Extractor private: std::vector refined_f0; }; -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.cpp index cfde173..b8c0799 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.cpp @@ -10,7 +10,7 @@ #error #endif -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader NetF0Class::NetF0Class() #ifdef INITF0NETPREDICTOR @@ 
-324,4 +324,4 @@ void EmptyCache() MELPECORE.Destory(); } -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.hpp index d78444d..3e6c312 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/F0Extractor/NetF0Predictors/NetF0Predictors.hpp @@ -23,7 +23,7 @@ #include "../BaseF0Extractor/BaseF0Extractor.hpp" #include -MOEVSFOEXTRACTORHEADER +MoeVoiceStudioF0ExtractorHeader class NetF0Class { @@ -80,4 +80,4 @@ class MELPEF0Extractor : public BaseF0Extractor void EmptyCache(); -MOEVSFOEXTRACTOREND \ No newline at end of file +MoeVoiceStudioF0ExtractorEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp new file mode 100644 index 0000000..9e55309 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp @@ -0,0 +1,644 @@ +#include "MoeVSG2P.hpp" +#include "MJson.h" +#include "../../StringPreprocess.hpp" +#include + +MoeVoiceStudioG2PHeader + +std::wregex SignRegex(L"[!@#$%^&*()_+\\-=`~,./;'\\[\\]<>?:\"{}|\\\\。?!,、;:“”‘’『』「」()〔〕【】─…·—~《》〈〉]+"); +std::wregex WordRegex(L"[^!@#$%^&*()_+\\-=`~,./;'\\[\\]<>?:\"{}|\\\\。?!,、;:“”‘’『』「」()〔〕【】─…·—~《》〈〉]+"); +std::wregex BlankRegex(L"[ ]+"); +std::wregex ChineseRegex(L"^[\\u4e00-\\u9fa5]{0,}$"); +std::wregex NumberRegex(L"\\d+(?:\\.?\\d+)?"); +std::wstring ChineseNumber[] = { L"零",L"一",L"二",L"三",L"四",L"五",L"六",L"七",L"八",L"九",L"十" }; +std::wstring ChineseNumberDigit[] = { L"",L"十",L"百",L"千",L"万",L"十万",L"百万",L"千万",L"亿" }; +std::wstring JapaneseNumber[] = { L"零",L"一",L"ニ",L"三",L"四",L"五",L"六",L"七",L"八",L"九",L"十" }; +std::wstring JapaneseNumberDigit[] = { L"",L"十",L"百",L"千",L"万",L"十万",L"百万",L"千万",L"億" }; +std::unordered_map _PUNCTUATION_MAP{ + { L":", L"," }, { L";", L"," }, { L",", L"," }, { L"。", L"." }, { L"!", L"!" }, { L"?", L"?" }, + { L"·", L"," }, { L"、", L"," }, { L"...", L"…" }, { L"$", L"." 
}, { L"“", L"'" }, + { L"”", L"'" }, { L"‘", L"'" }, { L"’", L"'" }, { L"(", L"'" }, { L")", L"'" }, { L"(", L"'" }, + { L")", L"'" }, { L"《", L"'" }, { L"》", L"'" }, { L"【", L"'" }, { L"】", L"'" }, { L"[", L"'" }, + { L"]", L"'" }, { L"—", L"-" }, { L"~", L"-" }, { L"~", L"-" }, { L"「", L"'" }, { L"」", L"'" } +}; +std::unordered_map _ALPHASYMBOL_MAP{ + {L"#", L"シャープ"}, { L"%", L"パーセント" }, { L"&", L"アンド" }, { L"+", L"プラス" }, { L"-", L"マイナス" }, + { L":", L"コロン" }, { L";", L"セミコロン" }, { L"<", L"小なり" }, { L"=", L"イコール" }, { L">", L"大なり" }, + { L"@", L"アット" }, { L"a", L"エー" }, { L"b", L"ビー" }, { L"c", L"シー" }, { L"d", L"ディー" }, { L"e", L"イー" }, + { L"f", L"エフ" }, { L"g", L"ジー" }, { L"h", L"エイチ" }, { L"i", L"アイ" }, { L"j", L"ジェー" }, { L"k", L"ケー" }, + { L"l", L"エル" }, { L"m", L"エム" }, { L"n", L"エヌ" }, { L"o", L"オー" }, { L"p", L"ピー" }, { L"q", L"キュー" }, + { L"r", L"アール" }, { L"s", L"エス" }, { L"t", L"ティー" }, { L"u", L"ユー" }, { L"v", L"ブイ" }, { L"w", L"ダブリュー" }, + { L"x", L"エックス" }, { L"y", L"ワイ" }, { L"z", L"ゼット" }, { L"α", L"アルファ" }, { L"β", L"ベータ" }, { L"γ", L"ガンマ" }, + { L"δ", L"デルタ" }, { L"ε", L"イプシロン" }, { L"ζ", L"ゼータ" }, { L"η", L"イータ" }, { L"θ", L"シータ" }, { L"ι", L"イオタ" }, + { L"κ", L"カッパ" }, { L"λ", L"ラムダ" }, { L"μ", L"ミュー" }, { L"ν", L"ニュー" }, { L"ξ", L"クサイ" }, { L"ο", L"オミクロン" }, + { L"π", L"パイ" }, { L"ρ", L"ロー" }, { L"σ", L"シグマ" }, { L"τ", L"タウ" }, { L"υ", L"ウプシロン" }, { L"φ", L"ファイ" }, + { L"χ", L"カイ" }, { L"ψ", L"プサイ" }, { L"ω", L"オメガ", }}; +std::vector> _CURRENCY_MAP{{L"\\$", L"ドル"}, { L"¥", L"円" }, { L"£", L"ポンド" }, { L"€", L"ユーロ" }}; + +MVSCleaner DefaultCleaner; + +MVSCleaner* GetDefCleaner() +{ + return &DefaultCleaner; +} + +#ifdef WIN32 +MoeVoiceStudioG2PApi::~MoeVoiceStudioG2PApi() +{ + unLoad(); +} + +MoeVoiceStudioG2PApi& MoeVoiceStudioG2PApi::operator=(MoeVoiceStudioG2PApi&& move) noexcept +{ + func = move.func; + m_hDynLib = move.m_hDynLib; + move.func = nullptr; + move.m_hDynLib = nullptr; + return *this; +} + +bool MoeVoiceStudioG2PApi::enabled() const +{ + return m_hDynLib != nullptr; +} + +MoeVoiceStudioG2PApi::SplitData MoeVoiceStudioG2PApi::GetSplitWords(const std::wstring& inputLen) const +{ + SplitData TempData; + if (getvocab) + TempData = (*(SplitData*)getvocab(inputLen.c_str())); + return TempData; +} + +void MoeVoiceStudioG2PApi::LoadDict(const std::wstring& Path) const +{ + if (loaddic) + loaddic(Path.c_str()); +} + +char MoeVoiceStudioG2PApi::Load(const std::wstring& PluginName) +{ + func = nullptr; + frel = nullptr; + if (m_hDynLib) + { + FreeLibrary(m_hDynLib); + m_hDynLib = nullptr; + } + m_hDynLib = LoadLibrary((PluginName).c_str()); + if (m_hDynLib == nullptr) + return -1; + func = reinterpret_cast( + reinterpret_cast( + GetProcAddress(m_hDynLib, "PluginMain") + ) + ); + frel = reinterpret_cast( + reinterpret_cast( + GetProcAddress(m_hDynLib, "Release") + ) + ); + getvocab = reinterpret_cast( + reinterpret_cast( + GetProcAddress(m_hDynLib, "GetSplitData") + ) + ); + vocabrel = reinterpret_cast( + reinterpret_cast( + GetProcAddress(m_hDynLib, "RefreshTokenizer") + ) + ); + loaddic = reinterpret_cast( + reinterpret_cast( + GetProcAddress(m_hDynLib, "LoadDict") + ) + ); + if (func == nullptr) + return 1; + return 0; +} + +std::wstring MoeVoiceStudioG2PApi::functionAPI(const std::wstring& inputLen, const std::wstring& placeholderSymbol, + const std::wstring& extraInfo, int64_t languageID) const +{ + if (func) + { + const auto tmp = func(inputLen.c_str(), placeholderSymbol.c_str(), extraInfo.c_str(), languageID); + std::wstring ret = tmp; + return ret; + } + 
return inputLen; +} + +void MoeVoiceStudioG2PApi::unLoad() +{ + if (frel) + frel(); + if (vocabrel) + vocabrel(); + vocabrel = nullptr; + getvocab = nullptr; + loaddic = nullptr; + func = nullptr; + frel = nullptr; + if (m_hDynLib) + FreeLibrary(m_hDynLib); + m_hDynLib = nullptr; +} +#endif + +void MVSDict::GetDict(const std::wstring& path) +{ + PlaceholderSymbol = L"|"; + std::string phoneInfo, phoneInfoAll; + std::ifstream phonefile(path.c_str()); + if (!phonefile.is_open()) + throw std::exception("phone file not found"); + while (std::getline(phonefile, phoneInfo)) + phoneInfoAll += phoneInfo; + phonefile.close(); + MJson PhoneJson; + PhoneJson.Parse(phoneInfoAll); + if (PhoneJson.HasParseError()) + throw std::exception("json file error"); + for (const auto& itr : PhoneJson.GetMemberArray()) + { + std::wstring Key = to_wide_string(itr.first); + if (Key == L"PlaceholderSymbol") + { + if (itr.second.IsString() && itr.second.GetStringLength()) + PlaceholderSymbol = to_wide_string(itr.second.GetString()); + if (PlaceholderSymbol.length() > 1) + PlaceholderSymbol = L"|"; + continue; + } + const auto Value = itr.second.GetArray(); + _Dict[Key] = std::vector(); + for (const auto& it : Value) + _Dict[Key].push_back(to_wide_string(it.GetString())); + } +} + +std::vector MVSDict::DictReplace(const std::vector& input) const +{ + std::vector _out; + for (const auto& i : input) + if (_Dict.find(i) != _Dict.end()) + { + const auto& Value = _Dict.at(i); + _out.insert(_out.end(), Value.begin(), Value.end()); + } + else + _out.emplace_back(i); + return _out; +} + +std::vector MVSDict::DictReplace(const std::wstring& input, const std::wstring& tPlaceholderSymbol) const +{ + std::vector _output; + auto tmp = input; + tmp += tPlaceholderSymbol; + while (!tmp.empty()) + { + const size_t pos = tmp.find(tPlaceholderSymbol); + const auto Key = tmp.substr(0, pos); + tmp = tmp.substr(pos + 1); + if (_Dict.find(Key) != _Dict.end()) + { + const auto& Value = _Dict.at(Key); + _output.insert(_output.end(), Value.begin(), Value.end()); + } + else + _output.emplace_back(Key); + } + return _output; +} + +std::wstring MVSDict::DictReplaceGetStr(const std::wstring& input, const std::wstring& tPlaceholderSymbol, bool usePlaceholderSymbol) const +{ + const auto tmp = DictReplace(input, tPlaceholderSymbol); + std::wstring output; + for (const auto& i : tmp) + if (usePlaceholderSymbol) + output += i + tPlaceholderSymbol; + else + output += i; + return output; +} + +void Tokenizer::load(const std::wstring& _Path) +{ + const MJson _VocabJson(to_byte_string(_Path).c_str()); + if (!_VocabJson.HasMember("ContinuingSubwordPrefix") || + !_VocabJson.HasMember("Type") || + !_VocabJson.HasMember("Vocab") || + _VocabJson["ContinuingSubwordPrefix"].Empty() || + _VocabJson["Type"].Empty() || + !_VocabJson["ContinuingSubwordPrefix"].IsString() || + !_VocabJson["Type"].IsString()) + throw std::exception("Vocab.json Error"); + const std::string Type = _VocabJson["Type"].GetString(); + if (Type == "Unigram") Model = TokenizerModel::Unigram; + Symbol = to_wide_string(_VocabJson["ContinuingSubwordPrefix"].GetString()); + + if(Model == TokenizerModel::WordPiece) + { + if(_VocabJson["Vocab"].IsArray()) + { + const auto _VocabArray = _VocabJson["Vocab"].GetArray(); + int64_t Index = 0; + for (const auto& Object : _VocabArray) + Vocab[to_wide_string(Object.GetString())] = Index++; + } + else + { + const auto _VocabDict = _VocabJson["Vocab"].GetMemberArray(); + for (const auto& Pair : _VocabDict) + { + if (Pair.second.IsInt()) + 
Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetInt()); + else if (Pair.second.IsFloat()) + Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetFloat()); + } + } + } + else + { + const auto _VocabArray = _VocabJson["Vocab"].GetArray(); + int64_t Index = 0; + for (const auto& Object : _VocabArray) + Vocab[to_wide_string(Object.GetArray()[0].GetString())] = Index++; + } + if (_VocabJson.HasMember("UseSplit") && _VocabJson["UseSplit"].IsBool()) + UseSplit = _VocabJson["UseSplit"].GetBool(); +} + +void Tokenizer::loadCleaner(const std::wstring& _Path) const +{ + if (Cleaner) + Cleaner->loadG2p(_Path); +} + +void Tokenizer::loadDict(const std::wstring& _Path) const +{ + if (Cleaner) + Cleaner->loadDict(_Path); +} + +std::vector Tokenizer::UnigramMethod(const std::wstring& Seq, size_t MaxWordLength, TokenizerMethod Method) const +{ + if (Seq.empty()) + return {}; + //auto SeqVector = SplitString(Seq, SignRegex); + std::vector Tokens; + Tokens.emplace_back(Vocab.at(L"[CLS]")); + const auto UNKId = Vocab.at(L"[UNK]"); + std::wstring SeqWord = Seq; + if (Method == TokenizerMethod::Left) + { + bool FirstTime = true; + while (!SeqWord.empty()) + { + for (size_t SearchLength = min(MaxWordLength, SeqWord.length()); SearchLength > 0; --SearchLength) + { + if (FirstTime) + { + size_t SubVal = 0; + if (SearchLength > Symbol.length()) + SubVal = Symbol.length(); + const auto SearchResult = Vocab.find(Symbol + SeqWord.substr(0, SearchLength - SubVal)); + if (SearchResult != Vocab.end()) + { + Tokens.emplace_back(SearchResult->second); + SeqWord = SeqWord.substr(SearchLength - SubVal); + FirstTime = false; + break; + } + } + const auto SearchResult = Vocab.find(SeqWord.substr(0, SearchLength)); + if (SearchResult != Vocab.end()) + { + Tokens.emplace_back(SearchResult->second); + SeqWord = SeqWord.substr(SearchLength); + if (FirstTime) FirstTime = false; + break; + } + if (SearchLength == 1) + { + const auto SubStr = SeqWord.substr(0, SearchLength); + const auto SearchRes = _PUNCTUATION_MAP.find(SubStr); + if (SearchRes != _PUNCTUATION_MAP.end()) + { + const auto SearchR = Vocab.find(SearchRes->second); + if (SearchR != Vocab.end()) + Tokens.emplace_back(SearchR->second); + SeqWord = SeqWord.substr(1); + break; + } + if (Tokens.empty() || Tokens.back() != UNKId) + Tokens.emplace_back(UNKId); + SeqWord = SeqWord.substr(1); + } + } + } + } + else + throw std::exception("NotImplementedError"); + Tokens.emplace_back(Vocab.at(L"[SEP]")); + return Tokens; +} + +std::vector Tokenizer::WordPieceMethod(const std::wstring& Seq, size_t MaxWordLength, TokenizerMethod Method) const +{ + if (Seq.empty()) + return {}; + auto SeqVector = SplitString(Seq, SignRegex); + std::vector Tokens; + Tokens.emplace_back(Vocab.at(L"[CLS]")); + const auto UNKId = Vocab.at(L"[UNK]"); + if (Method == TokenizerMethod::Left) + { + for (auto& SeqWord : SeqVector) + { + bool FirstTime = true; + while (!SeqWord.empty()) + { + if (regex_match(SeqWord.substr(0, 1), ChineseRegex)) + { + const auto SearchResult = Vocab.find(SeqWord.substr(0, 1)); + if (SearchResult != Vocab.end()) + Tokens.emplace_back(SearchResult->second); + else + Tokens.emplace_back(UNKId); + SeqWord = SeqWord.substr(1); + continue; + } + for (size_t SearchLength = min(MaxWordLength, SeqWord.length()); SearchLength > 0; --SearchLength) + { + if (!FirstTime) + { + size_t SubVal = 0; + if (SearchLength > Symbol.length()) + SubVal = Symbol.length(); + const auto SearchResult = Vocab.find(Symbol + SeqWord.substr(0, SearchLength - SubVal)); + if 
(SearchResult != Vocab.end()) + { + Tokens.emplace_back(SearchResult->second); + SeqWord = SeqWord.substr(SearchLength - SubVal); + break; + } + } + const auto SearchResult = Vocab.find(SeqWord.substr(0, SearchLength)); + if (SearchResult != Vocab.end()) + { + Tokens.emplace_back(SearchResult->second); + SeqWord = SeqWord.substr(SearchLength); + if (FirstTime) FirstTime = false; + break; + } + if (SearchLength == 1) + { + const auto SubStr = SeqWord.substr(0, SearchLength); + const auto SearchRes = _PUNCTUATION_MAP.find(SubStr); + if (SearchRes != _PUNCTUATION_MAP.end()) + { + const auto SearchR = Vocab.find(SearchRes->second); + if (SearchR != Vocab.end()) + Tokens.emplace_back(SearchR->second); + SeqWord = SeqWord.substr(1); + break; + } + if (Tokens.empty() || Tokens.back() != UNKId) + Tokens.emplace_back(UNKId); + SeqWord = SeqWord.substr(1); + } + } + } + } + } + else + throw std::exception("NotImplementedError"); + Tokens.emplace_back(Vocab.at(L"[SEP]")); + return Tokens; +} + +std::vector Tokenizer::operator()(const std::wstring& Seq, size_t MaxWordLength, TokenizerMethod Method) const +{ + if (Model == TokenizerModel::WordPiece) + return WordPieceMethod(Seq, MaxWordLength, Method); + return UnigramMethod(Seq, MaxWordLength, Method); +} + +std::vector Tokenizer::SplitString(const std::wstring& _InputRef, const std::wregex & _SignRegex) +{ + if (_InputRef.empty()) + return {}; + std::wstring InputStr = _InputRef; + std::vector TmpStrVec, StrVec; + std::wsmatch MatchedSign; + while (std::regex_search(InputStr, MatchedSign, _SignRegex)) + { + if (MatchedSign.prefix().matched) + TmpStrVec.push_back(MatchedSign.prefix()); + TmpStrVec.push_back(MatchedSign.str()); + InputStr = MatchedSign.suffix(); + } + if (!InputStr.empty()) + TmpStrVec.emplace_back(InputStr); + for(const auto& i : TmpStrVec) + { + std::wsregex_token_iterator TokenIter(i.begin(), i.end(), BlankRegex, -1); + decltype(TokenIter) TokenIterEnd; + for (; TokenIter != TokenIterEnd; ++TokenIter) + if (!TokenIter->str().empty()) + StrVec.push_back(TokenIter->str()); + } + return StrVec; +} + +std::vector Tokenizer::SplitWithPlugin(const std::vector& _Inputs) const +{ + std::vector SeqVec; + for(const auto& Seq : _Inputs) + { + const auto SplitedWords = GetCleaner().GetCleaner().GetSplitWords(Seq); + for (size_t i = 0; i < SplitedWords.Size; ++i) + { + auto TmpString = to_wide_string(SplitedWords.Data[i]); + TmpString = TmpString.substr(0, TmpString.find(L',')); + SeqVec.emplace_back(std::move(TmpString)); + } + } + return SeqVec; +} + +std::wstring NumberToChinese(double Number) +{ + std::wstring StrRtn; + std::wstring InputStr = std::to_wstring(Number); + const size_t PIndex = InputStr.find(L'.'); + std::wstring IntegerStr, FractionStr; + if (PIndex != std::wstring::npos) + { + IntegerStr = InputStr.substr(0, PIndex); + FractionStr = InputStr.substr(PIndex + 1); + while (!FractionStr.empty() && FractionStr.back() == L'0') + FractionStr.pop_back(); + } + else + IntegerStr = std::move(InputStr); + + if (IntegerStr != L"0") + { + size_t MaxIntegerStrLength = IntegerStr.length(); + for (; MaxIntegerStrLength > 0; --MaxIntegerStrLength) + if (IntegerStr[MaxIntegerStrLength - 1] != L'0') + break; + if (MaxIntegerStrLength < 1) + MaxIntegerStrLength = 1; + + const auto DigitNum = IntegerStr.length(); + for (size_t i = 0; i < MaxIntegerStrLength; i++) + { + const auto NumberIndex = IntegerStr[i] - L'0'; + const auto DigitIndex = DigitNum - i - 1; + if (0 == NumberIndex) + { + if ((i > 0 && L'0' == IntegerStr[i - 1]) || i == 
IntegerStr.length() - 1) + continue; + if (DigitIndex >= 4 && 0 == DigitIndex % 4) + StrRtn += ChineseNumberDigit[DigitIndex]; + else + StrRtn += ChineseNumber[NumberIndex]; + } + else + { + StrRtn += ChineseNumber[NumberIndex]; + if (IntegerStr.length() == 2 && IntegerStr[0] == '1' && i == 0) + StrRtn.erase(0); + if (0 == DigitIndex % 4) + StrRtn += ChineseNumberDigit[DigitIndex]; + else + StrRtn += ChineseNumberDigit[DigitIndex % 4]; + } + } + } + else + StrRtn += L"零"; + + if (!FractionStr.empty()) + StrRtn += L"点"; + for(const auto FractionI : FractionStr) + { + const auto NumberIndex = FractionI - L'0'; + StrRtn += ChineseNumber[NumberIndex]; + } + return StrRtn; +} + +std::wstring NumberToJapanese(double Number) +{ + std::wstring StrRtn; + std::wstring InputStr = std::to_wstring(Number); + const size_t PIndex = InputStr.find(L'.'); + std::wstring IntegerStr, FractionStr; + if (PIndex != std::wstring::npos) + { + IntegerStr = InputStr.substr(0, PIndex); + FractionStr = InputStr.substr(PIndex + 1); + while (!FractionStr.empty() && FractionStr.back() == L'0') + FractionStr.pop_back(); + } + else + IntegerStr = std::move(InputStr); + + if (IntegerStr != L"0") + { + size_t MaxIntegerStrLength = IntegerStr.length(); + for (; MaxIntegerStrLength > 0; --MaxIntegerStrLength) + if (IntegerStr[MaxIntegerStrLength - 1] != L'0') + break; + if (MaxIntegerStrLength < 1) + MaxIntegerStrLength = 1; + + const auto DigitNum = IntegerStr.length(); + for (size_t i = 0; i < MaxIntegerStrLength; i++) + { + const auto NumberIndex = IntegerStr[i] - L'0'; + const auto DigitIndex = DigitNum - i - 1; + if (0 == NumberIndex) + { + if ((i > 0 && L'0' == IntegerStr[i - 1]) || i == IntegerStr.length() - 1) + continue; + if (DigitIndex >= 4 && 0 == DigitIndex % 4) + StrRtn += JapaneseNumberDigit[DigitIndex]; + else + StrRtn += JapaneseNumber[NumberIndex]; + } + else + { + StrRtn += JapaneseNumber[NumberIndex]; + if (IntegerStr.length() == 2 && IntegerStr[0] == '1' && i == 0) + StrRtn.erase(0); + if (0 == DigitIndex % 4) + StrRtn += JapaneseNumberDigit[DigitIndex]; + else + StrRtn += JapaneseNumberDigit[DigitIndex % 4]; + } + } + } + else + StrRtn += L"零"; + + if (!FractionStr.empty()) + StrRtn += L"点"; + for (const auto FractionI : FractionStr) + { + const auto NumberIndex = FractionI - L'0'; + StrRtn += JapaneseNumber[NumberIndex]; + } + return StrRtn; +} + +std::wstring ChineseNormalize(const std::wstring& _Input) +{ + std::wstring RtnStr; + const auto StrVec = Tokenizer::SplitString(_Input, NumberRegex); + for(const auto& Str : StrVec) + { + if (std::regex_match(Str, NumberRegex)) + RtnStr += NumberToChinese(_wtof(Str.c_str())); + else + RtnStr += Str; + } + RtnStr = std::regex_replace(RtnStr, std::wregex(L"嗯"), L"恩"); + RtnStr = std::regex_replace(RtnStr, std::wregex(L"呣"), L"母"); + return RtnStr; +} + +std::wstring JapaneseNormalize(const std::wstring& _Input) +{ + std::wstring RtnStr; + const auto StrVec = Tokenizer::SplitString(_Input, NumberRegex); + for (const auto& Str : StrVec) + { + if (std::regex_match(Str, NumberRegex)) + RtnStr += NumberToJapanese(_wtof(Str.c_str())); + else + RtnStr += Str; + } + for (const auto& PunPair : _CURRENCY_MAP) + RtnStr = std::regex_replace(RtnStr, std::wregex(PunPair.first), PunPair.second); + return RtnStr; +} + +std::wstring NormalizeText(const std::wstring& _Input, const std::string& _Language) +{ + if (_Language == "ZH") + return ChineseNormalize(_Input); + if (_Language == "JP") + return JapaneseNormalize(_Input); + return _Input; +} + +MoeVoiceStudioG2PEnd \ No 
newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.hpp new file mode 100644 index 0000000..76551b7 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.hpp @@ -0,0 +1,254 @@ +/** + * FileName: MoeVSG2P.hpp + * Note: MoeVoiceStudioCore G2Pֵ䣨TTSã + * + * Copyright (C) 2022-2023 NaruseMioShirakana (shirakanamio@foxmail.com) + * + * This file is part of MoeVoiceStudioCore library. + * MoeVoiceStudioCore library is free software: you can redistribute it and/or modify it under the terms of the + * GNU Affero General Public License as published by the Free Software Foundation, either version 3 + * of the License, or any later version. + * + * MoeVoiceStudioCore library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License along with Foobar. + * If not, see . + * + * date: 2023-11-9 Create +*/ + +#pragma once +#include +#ifdef _WIN32 +#ifndef UNICODE +#define UNICODE +#endif +#include +#endif +#include +#include +#include +#include + +#define MoeVoiceStudioG2PHeader namespace MoeVSG2P { +#define MoeVoiceStudioG2PEnd } + +MoeVoiceStudioG2PHeader + +class MoeVoiceStudioG2PApi +{ +public: + struct SplitData + { + char** Data = nullptr; + size_t Size = 0; + }; + using funTy = const wchar_t* (*)(const wchar_t*, const wchar_t*, const wchar_t*, int64_t); + using freTy = void (*)(); + using vocabFn = void* (*)(const wchar_t*); + using loadFn = void (*)(const wchar_t*); + MoeVoiceStudioG2PApi() = default; + ~MoeVoiceStudioG2PApi(); + char Load(const std::wstring& PluginName); + void unLoad(); + void ReleaseVoc() const + { + if (vocabrel) + vocabrel(); + } + [[nodiscard]] SplitData GetSplitWords(const std::wstring& inputLen) const; + [[nodiscard]] std::wstring functionAPI(const std::wstring& inputLen, const std::wstring& placeholderSymbol, + const std::wstring& extraInfo, int64_t languageID) const; + MoeVoiceStudioG2PApi(const MoeVoiceStudioG2PApi&) = delete; + MoeVoiceStudioG2PApi(MoeVoiceStudioG2PApi&&) = delete; + MoeVoiceStudioG2PApi& operator=(MoeVoiceStudioG2PApi&& move) noexcept; + [[nodiscard]] bool enabled() const; + MoeVoiceStudioG2PApi& operator=(const MoeVoiceStudioG2PApi&) = delete; + void LoadDict(const std::wstring& Path) const; +private: +#ifdef WIN32 + const wchar_t*(*func)(const wchar_t*, const wchar_t*, const wchar_t*, int64_t) = nullptr; + void (*frel)() = nullptr; + void* (*getvocab)(const wchar_t*) = nullptr; + void (*vocabrel)() = nullptr; + void (*loaddic)(const wchar_t*) = nullptr; + HINSTANCE m_hDynLib = nullptr; +#endif +}; + +class MVSDict +{ +public: + MVSDict() = default; + ~MVSDict() = default; + + [[nodiscard]] bool enabled() const + { + return !_Dict.empty(); + } + + void unload() + { + _Dict.clear(); + } + + [[nodiscard]] std::vector DictReplace(const std::vector& input) const; + + [[nodiscard]] std::vector DictReplace(const std::wstring& input, const std::wstring& tPlaceholderSymbol) const; + + [[nodiscard]] std::wstring DictReplaceGetStr(const std::wstring& input, const std::wstring& tPlaceholderSymbol, bool usePlaceholderSymbol = true) const; + + void GetDict(const std::wstring& path); + + [[nodiscard]] std::wstring getPlaceholderSymbol() const + { + return 
PlaceholderSymbol; + } +private: + std::map> _Dict; + std::wstring PlaceholderSymbol = L"|"; +}; + +class MVSCleaner +{ +public: + MVSCleaner() = default; + + ~MVSCleaner() + { + unloadDict(); + unloadG2p(); + } + + void unloadDict() + { + _Dict.unload(); + } + + void unloadG2p() + { + _G2p.unLoad(); + } + + void loadDict(const std::wstring& _path) + { + if (_Dict.enabled()) + unloadDict(); + _Dict.GetDict(_path); + } + + void loadG2p(const std::wstring& _path) + { + if (_G2p.enabled()) + unloadG2p(); + _G2p.Load(_path); + } + + [[nodiscard]] bool G2pEnabled() const + { + return _G2p.enabled(); + } + + [[nodiscard]] bool DictEnabled() const + { + return _Dict.enabled(); + } + + [[nodiscard]] std::wstring G2p(const std::wstring& _text, const std::wstring& placeholderSymbol, + const std::wstring& extraInfo, int64_t languageID) const + { + return _G2p.functionAPI(_text, placeholderSymbol, extraInfo, languageID); + } + + [[nodiscard]] const MoeVoiceStudioG2PApi& GetCleaner() const + { + return _G2p; + } + + [[nodiscard]] auto DictReplace(const std::vector& input) const + { + return _Dict.DictReplace(input); + } + + [[nodiscard]] auto DictReplace(const std::wstring& input, const std::wstring& tPlaceholderSymbol) const + { + return _Dict.DictReplace(input, tPlaceholderSymbol); + } + + [[nodiscard]] auto DictReplaceGetStr(const std::wstring& input, const std::wstring& tPlaceholderSymbol, bool usePlaceholderSymbol = true) const + { + return _Dict.DictReplaceGetStr(input, tPlaceholderSymbol, usePlaceholderSymbol); + } + + [[nodiscard]] std::wstring getPlaceholderSymbol() const + { + return _Dict.getPlaceholderSymbol(); + } + +protected: + MoeVoiceStudioG2PApi _G2p; + MVSDict _Dict; +}; + +class Tokenizer +{ +public: + using TokenizerType = int64_t; + enum class TokenizerMethod + { + Left, + Right + }; + enum class TokenizerModel + { + Unigram, + WordPiece + }; + Tokenizer() = default; + Tokenizer(const std::wstring& _Path) + { + load(_Path); + } + void BondCleaner(MVSCleaner* MCleaner) + { + Cleaner = MCleaner; + } + void load(const std::wstring& _Path); + void loadCleaner(const std::wstring& _Path) const; + void loadDict(const std::wstring& _Path) const; + [[nodiscard]] const MVSCleaner& GetCleaner() const + { + return *Cleaner; + } + const MVSCleaner* operator->() const + { + return Cleaner; + } + [[nodiscard]] std::vector WordPieceMethod(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const; + [[nodiscard]] std::vector UnigramMethod(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const; + std::vector operator()(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const; + [[nodiscard]] std::vector SplitWithPlugin(const std::vector& _Inputs) const; + static std::vector SplitString(const std::wstring& _InputRef, const std::wregex& _SignRegex); +private: + std::unordered_map Vocab; + std::wstring Symbol = L"##"; + TokenizerModel Model = TokenizerModel::WordPiece; + MVSCleaner* Cleaner = nullptr; + bool UseSplit = false; +}; + +MVSCleaner* GetDefCleaner(); + +std::wstring JapaneseNormalize(const std::wstring& _Input); + +std::wstring ChineseNormalize(const std::wstring& _Input); + +std::wstring NormalizeText(const std::wstring& _Input, const std::string& _Language); + +std::wstring NumberToChinese(double Number); + +std::wstring NumberToJapanese(double Number); + +MoeVoiceStudioG2PEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - 
Cmd/Modules/Models/header/DiffSvc.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/DiffSvc.hpp
index d370683..ac7056a 100644
--- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/DiffSvc.hpp
+++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/DiffSvc.hpp
@@ -20,6 +20,7 @@
 */
 #pragma once
+#include
 #include "SVC.hpp"
 MoeVoiceStudioCoreHeader
@@ -31,6 +32,19 @@ class DiffusionSvc : public SingingVoiceConversion
         ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU,
         unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0);
+
+    /**
+     * \brief Load a DiffSvc model
+     * \param _PathDict Model paths; the keys are ["Hubert", "Hifigan", "Encoder", "DenoiseFn", "NoisePredictor", "AfterProcess", "DiffSvc", "Naive", "Alphas"], of which "DiffSvc", "Naive" and "Alphas" are optional
+     * \param _Config Configuration Json
+     * \param _ProgressCallback Progress bar callback
+     * \param ExecutionProvider_ Provider
+     * \param DeviceID_ GPU device ID
+     * \param ThreadCount_ Thread count
+     */
+    DiffusionSvc(const std::map<std::string, std::wstring>& _PathDict, const MJson& _Config, const ProgressCallback& _ProgressCallback,
+        ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU,
+        unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0);
+
     ~DiffusionSvc() override;

     void Destory();
diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/ModelBase.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/ModelBase.hpp
index 2ff0464..87ceab1 100644
--- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/ModelBase.hpp
+++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/ModelBase.hpp
@@ -83,13 +83,13 @@ class MoeVoiceStudioModule
     /**
      * \brief Inference from input paths
-     * \param _Paths Paths; multiple paths are separated by line breaks
+     * \param _Datas [Paths (multiple paths separated by line breaks), or the inference text]
      * \param _InferParams Inference parameters
      * \param _SlicerSettings Slicer settings
      * \return Output paths
      */
-    [[nodiscard]] virtual std::vector<std::wstring> Inference(std::wstring& _Paths,
-        const MoeVSProjectSpace::MoeVSSvcParams& _InferParams,
+    [[nodiscard]] virtual std::vector<std::wstring> Inference(std::wstring& _Datas,
+        const MoeVSProjectSpace::MoeVSParams& _InferParams,
         const InferTools::SlicerSettings& _SlicerSettings) const;

     /**
diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp
index 2d169b5..5c89834 100644
--- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp
+++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp
@@ -118,28 +118,72 @@ namespace MoeVSProjectSpace
         {}
     };

-    struct MoeVSSvcParams
+    struct MoeVSParams
     {
+        //Common
+        float NoiseScale = 0.3f;                  //Noise scale factor 0-10
+        int64_t Seed = 52468;                     //Random seed
+        int64_t SpeakerId = 0;                    //Speaker ID
+        uint64_t SrcSamplingRate = 48000;         //Source sampling rate
+        int64_t SpkCount = 2;                     //Number of speakers in the model
+
+        //SVC
         float IndexRate = 0.f;                    //Index rate 0-1
         float ClusterRate = 0.f;                  //Cluster rate 0-1
-        float NoiseScale = 0.3f;                  //Noise scale 0-10
-        float DDSPNoiseScale = 0.8f;              //DDSP noise scale 0-10
-        int64_t Seed = 52468;                     //Seed
+        float DDSPNoiseScale = 0.8f;              //DDSP noise scale factor 0-10
         float Keys = 0.f;                         //Key shift -64-64
         size_t MeanWindowLength = 2;              //Mean filter window size 1-20
         size_t Pndm = 100;                        //Diffusion speed-up ratio 2-200
         size_t Step = 1000;                       //Total diffusion steps 200-1000
-        std::wstring Sampler = L"Pndm";           //Sampler
+        std::wstring Sampler = L"Pndm";           //Diffusion sampler
         std::wstring F0Method = L"Dio";           //F0 extraction algorithm
-        int64_t SpeakerId = 0;
-        uint64_t SrcSamplingRate = 48000;
-        bool UseShallowDiffusion = false;
-        int64_t SpkCount = 2;
-        //RTInfer
+        bool UseShallowDiffusion = false;         //Use shallow diffusion
+
+        //SVCRTInfer
         int64_t RTSampleSize = 44100;
         int64_t CrossFadeLength = 320;
+
+        //TTS
+        std::vector<float> SpeakerMix;            //Speaker mix ratios
+        float LengthScale = 1.0f;                 //Length (duration) scale factor
+        float DurationPredictorNoiseScale = 0.3f; //Noise scale of the stochastic duration predictor
+        float FactorDpSdp = 0.3f;                 //Mix ratio between the stochastic and the plain duration predictor
+        float GateThreshold = 0.66666f;           //Tacotron2 decoder EOS (gate) threshold
+        int64_t MaxDecodeStep = 2000;             //Tacotron2 maximum decode steps
+        std::vector<std::wstring> EmotionPrompt;  //Emotion prompt tags
+        std::wstring PlaceHolderSymbol = L"|";    //Phoneme separator symbol
+        float RestTime = 0.5f;                    //Rest time; a negative value cuts the audio here and starts a new clip
+        int64_t Language = 0;                     //Language ID
+        std::wstring AdditionalInfo;              //Extra info passed to G2P
+    };
+
+    struct MoeVSTTSSeq
+    {
+        std::wstring SeqStr;
+        std::vector<std::wstring> Seq;            //Phoneme sequence
+        std::vector<int64_t> Tones;               //Tone sequence
+        std::vector<int64_t> Durations;           //Duration sequence
+        std::vector<int64_t> Language;            //Language sequence
+        std::vector<float> SpeakerMix;            //Speaker mix ratios
+
+        std::vector<std::wstring> EmotionPrompt;  //Emotion prompt tags
+        std::wstring PlaceHolderSymbol = L"|";    //Phoneme separator symbol
+        float NoiseScale = 0.3f;                  //Noise scale factor 0-10
+        float LengthScale = 1.0f;                 //Length (duration) scale factor
+        float DurationPredictorNoiseScale = 0.3f; //Noise scale of the stochastic duration predictor
+        float FactorDpSdp = 0.3f;                 //Mix ratio between the stochastic and the plain duration predictor
+        float GateThreshold = 0.66666f;           //Tacotron2 decoder EOS (gate) threshold
+        int64_t MaxDecodeStep = 2000;             //Tacotron2 maximum decode steps
+        int64_t Seed = 52468;                     //Random seed
+        int64_t SpeakerId = 0;                    //Speaker ID
+        float RestTime = 0.5f;                    //Rest time; a negative value cuts the audio here and starts a new clip
+        int64_t TotLang = 0;
+        std::wstring AdditionalInfo;              //Extra info passed to G2P
     };

+    using MoeVSSvcParams = MoeVSParams;
+    using MoeVSTTSParams = MoeVSParams;
+
     struct ParamsOffset
     {
         std::vector OrgAudio;
diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp
new file mode 100644
index 0000000..e36b79b
--- /dev/null
+++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp
@@ -0,0 +1,151 @@
+#pragma once
+#include
+#include "ModelBase.hpp"
+#include "../../Logger/MoeSSLogger.hpp"
+#include "../../InferTools/G2P/MoeVSG2P.hpp"
+#include "MJson.h"
+
+MoeVoiceStudioCoreHeader
+
+class EmoLoader
+{
+public:
+    static constexpr long startPos = 128;
+    EmoLoader() = default;
+    EmoLoader(const std::wstring& path)
+    {
+        if (emofile)
+            fclose(emofile);
+        emofile = nullptr;
+        _wfopen_s(&emofile, path.c_str(), L"r");
+        if (!emofile)
+            throw std::exception("emoFile not exists");
+    }
+    ~EmoLoader()
+    {
+        if (emofile)
+            fclose(emofile);
+        emofile = nullptr;
+    }
+    void close()
+    {
+        if (emofile)
+            fclose(emofile);
+        emofile = nullptr;
+    }
+    void open(const std::wstring& path)
+    {
+        if (emofile)
+            fclose(emofile);
+        emofile = nullptr;
+        _wfopen_s(&emofile, path.c_str(), L"rb");
+        if (!emofile)
+            throw std::exception("emoFile not exists");
+    }
+    std::vector<float> operator[](long index) const
+    {
+        if (emofile)
+        {
+            fseek(emofile, index * 4096 + startPos, SEEK_SET);
+            char buffer[4096];
+            const auto buf = reinterpret_cast<float*>(buffer);
+            const auto bufread = fread_s(buffer, 4096, 1, 4096, emofile);
+            if (bufread == 4096)
+                return { buf ,buf + 1024 };
+            throw std::exception("emo index out of range");
+        }
+        throw std::exception("emo file not opened");
+    }
+private:
+    FILE* emofile = nullptr;
+};
+
+class TextToSpeech : public MoeVoiceStudioModule
+{
+public:
+    using DurationCallback = std::function<void(std::vector<float>&)>;
+
+    TextToSpeech(const ExecutionProviders& ExecutionProvider_, unsigned DeviceID_, unsigned ThreadCount_ = 0);
+
+    [[nodiscard]] std::vector<MoeVSProjectSpace::MoeVSTTSSeq> GetInputSeqs(const MJson& _Input, const MoeVSProjectSpace::MoeVSParams& _InitParams) const;
+
+    static std::vector<std::vector<bool>> generatePath(float* duration, size_t durationSize, size_t maskSize);
+
+    [[nodiscard]] std::vector<float> GetEmotionVector(const std::vector<std::wstring>& src) const;
+
+    [[nodiscard]] std::vector<std::vector<int16_t>> Inference(const std::wstring& _Seq,
+        const MoeVSProjectSpace::MoeVSParams& _InferParams = MoeVSProjectSpace::MoeVSParams()) const;
+
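+    // Illustrative sketch of the JSON accepted by the MJson overload below (values are hypothetical;
+    // the recognized keys follow GetInputSeqs): an array of objects such as
+    //   [{"Tokens": "Some text", "LanguageID": "ZH", "SpeakerId": 0, "NoiseScale": 0.3,
+    //     "Tones": [...], "Durations": [...], "SpeakerMix": [...], "EmotionPrompt": [...]}]
+    // Any field that is omitted falls back to the corresponding _InitParams value.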
[[nodiscard]] std::vector> Inference(const MJson& _Inputs, + const MoeVSProjectSpace::MoeVSParams& _InferParams = MoeVSProjectSpace::MoeVSParams()) const; + + [[nodiscard]] virtual std::vector> Inference(const std::vector& _Input) const; + + [[nodiscard]] std::vector Inference(std::wstring& _Datas, const MoeVSProjectSpace::MoeVSParams& _InferParams, const InferTools::SlicerSettings& _SlicerSettings) const override; + + [[nodiscard]] static std::vector GetAligments(size_t DstLen, size_t SrcLen); + + [[nodiscard]] std::wstring TextNormalize(const std::wstring& _Input, int64_t LanguageId) const; + + [[nodiscard]] int64_t GetLanguageToneIdx(int64_t _Index) const + { + std::string LanguageSymb; + for(const auto& i : LanguageMap) + if (_Index == i.second) + LanguageSymb = i.first; + if (LanguageSymb.empty()) + return 0; + const auto Iter = LanguageTones.find(LanguageSymb); + if (Iter != LanguageTones.end()) + return Iter->second; + return 0; + } + + static int64_t find_max_idx(const std::vector& inp) + { + int64_t idx = 0; + for (size_t i = 1; i < inp.size(); ++i) + if (inp[i] > inp[idx]) + idx = int64_t(i); + return idx; + } + + ~TextToSpeech() override = default; + + template + void LinearCombination(std::vector& _data, T Value = T(1.0)) const + { + _data.resize(SpeakerCount, 0.f); + if (_data.empty()) + { + _data = std::vector(1, Value); + return; + } + T Sum = T(0.0); + for (const auto& i : _data) + Sum += i; + if (Sum < T(0.0001)) + { + _data = std::vector(_data.size(), T(0.0)); + _data[0] = Value; + return; + } + Sum *= T(Value); + for (auto& i : _data) + i /= Sum; + } +protected: + DurationCallback CustomDurationCallback; + int64_t SpeakerCount = 1; + std::map LanguageMap = { {"ZH", 0}, {"JP", 1}, {"EN", 2} }; + std::map LanguageTones = { {"ZH", 0}, {"JP", 0}, {"EN", 0} }; + std::vector Tokenizers; + MoeVSG2P::MVSCleaner* Cleaner = nullptr; + bool AddBlank = true; + bool Emotion = false; + std::map Symbols; + EmoLoader EmoLoader; + MJson EmoJson; +}; + +MoeVoiceStudioCoreEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Tacotron.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Tacotron.hpp new file mode 100644 index 0000000..ba7b776 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Tacotron.hpp @@ -0,0 +1,39 @@ +#pragma once +#include "ModelBase.hpp" + +INFERCLASSHEADER + +class Tacotron2 : public TTS +{ +public: + Tacotron2(const MJson&, const callback&, const callback_params&, const DurationCallback&, Device _dev = Device::CPU); + + ~Tacotron2() override; + + std::vector Inference(std::wstring& _inputLens) const override; + + [[nodiscard]] std::vector Inference(const MoeVSProject::TTSParams& _input) const override; + + static void cat(std::vector& tensorA, std::vector& Shape, const MTensor& tensorB) { + const int64 n = Shape[1]; + for (int64 i = n; i > 0; --i) + tensorA.insert(tensorA.begin() + (i * Shape[2]), tensorB.GetTensorData()[i - 1]); + ++Shape[2]; + } +private: + Ort::Session* sessionEncoder = nullptr; + Ort::Session* sessionDecoderIter = nullptr; + Ort::Session* sessionPostNet = nullptr; + Ort::Session* sessionGan = nullptr; + + const std::vector ganIn = { "x" }; + const std::vector ganOut = { "audio" }; + const std::vector inputNodeNamesSessionEncoder = { "sequences","sequence_lengths" }; + const std::vector outputNodeNamesSessionEncoder = { "memory","processed_memory","lens" }; + const std::vector inputNodeNamesSessionDecoderIter = { 
"decoder_input","attention_hidden","attention_cell","decoder_hidden","decoder_cell","attention_weights","attention_weights_cum","attention_context","memory","processed_memory","mask" }; + const std::vector outputNodeNamesSessionDecoderIter = { "decoder_output","gate_prediction","out_attention_hidden","out_attention_cell","out_decoder_hidden","out_decoder_cell","out_attention_weights","out_attention_weights_cum","out_attention_context" }; + const std::vector inputNodeNamesSessionPostNet = { "mel_outputs" }; + const std::vector outputNodeNamesSessionPostNet = { "mel_outputs_postnet" }; +}; + +INFERCLASSEND \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp new file mode 100644 index 0000000..68b84b3 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp @@ -0,0 +1,86 @@ +#pragma once +#include "TTS.hpp" + +MoeVoiceStudioCoreHeader + +class Vits : public TextToSpeech +{ +public: + Vits(const MJson& _Config, const ProgressCallback& _ProgressCallback, + const DurationCallback& _DurationCallback, + ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU, + unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0); + + Vits(const std::map& _PathDict, const MJson& _Config, const ProgressCallback& _ProgressCallback, + const DurationCallback& _DurationCallback, const std::vector& _BertPaths, + ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU, + unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0); + + void load(const std::map& _PathDict, + const MJson& _Config, const ProgressCallback& _ProgressCallback, + const DurationCallback& _DurationCallback, const std::vector& _BertPaths = {}); + + ~Vits() override; + + void destory() + { + delete sessionDec; + delete sessionSdp; + delete sessionDp; + delete sessionEnc_p; + delete sessionFlow; + delete sessionEmb; + sessionDec = nullptr; + sessionSdp = nullptr; + sessionEnc_p = nullptr; + sessionFlow = nullptr; + sessionEmb = nullptr; + sessionDp = nullptr; + for (auto& OrtPtr : sessionBert) + { + delete OrtPtr; + OrtPtr = nullptr; + } + sessionBert.clear(); + } + + [[nodiscard]] std::vector> Inference(const std::vector& _Input) const override; +private: + Ort::Session* sessionDec = nullptr; + Ort::Session* sessionSdp = nullptr; + Ort::Session* sessionDp = nullptr; + Ort::Session* sessionEnc_p = nullptr; + Ort::Session* sessionFlow = nullptr; + Ort::Session* sessionEmb = nullptr; + std::vector sessionBert; + std::vector BertNames; + std::string VitsType; + bool UseTone = false; + bool UseBert = false; + bool UseLength = true; + bool UseLanguage = false; + bool EncoderG = false; + + std::vector EncoderInputNames = { "x" }; + const std::vector EncoderOutputNames = { "xout", "m_p", "logs_p", "x_mask" }; + + std::vector SdpInputNames = { "x", "x_mask", "zin" }; + const std::vector SdpOutputNames = { "logw" }; + + std::vector DpInputNames = { "x", "x_mask" }; + const std::vector DpOutputNames = { "logw" }; + + std::vector FlowInputNames = { "z_p", "y_mask" }; + const std::vector FlowOutputNames = { "z" }; + + std::vector DecInputNames = { "z_in" }; + const std::vector DecOutputNames = { "o" }; + + const std::vector EmbiddingInputNames = { "sid" }; + const std::vector EmbiddingOutputNames = { "g" }; + + const std::vector BertInputNames = { "input_ids", "attention_mask", "token_type_ids" }; + const std::vector BertOutputNames = { "last_hidden_state" }; +}; + +MoeVoiceStudioCoreEnd \ No newline at end of file diff 
--git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/VitsSvc.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/VitsSvc.hpp index bc87622..95a3408 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/VitsSvc.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/VitsSvc.hpp @@ -39,6 +39,10 @@ class VitsSvc : public SingingVoiceConversion ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU, unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0); + VitsSvc(const std::map& _PathDict, const MJson& _Config, const ProgressCallback& _ProgressCallback, + ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU, + unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0); + ~VitsSvc() override; void Destory(); diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp index 58cd4ec..951c027 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp @@ -194,6 +194,133 @@ DiffusionSvc::DiffusionSvc(const MJson& _Config, const ProgressCallback& _Progre } } +DiffusionSvc::DiffusionSvc(const std::map& _PathDict, + const MJson& _Config, const ProgressCallback& _ProgressCallback, + ExecutionProviders ExecutionProvider_, unsigned DeviceID_, unsigned ThreadCount_) : + SingingVoiceConversion(ExecutionProvider_, DeviceID_, ThreadCount_) +{ + MoeVSClassName(L"MoeVoiceStudioDiffSingingVoiceConversion"); + + //Check SamplingRate + if (_Config["Rate"].IsNull()) + throw std::exception("[Error] Missing field \"Rate\" (SamplingRate)"); + if (_Config["Rate"].IsInt() || _Config["Rate"].IsInt64()) + _samplingRate = _Config["Rate"].GetInt(); + else + throw std::exception("[Error] Field \"Rate\" (SamplingRate) Must Be Int/Int64"); + + logger.log(L"[Info] Current Sampling Rate is" + std::to_wstring(_samplingRate)); + + if (_Config["MelBins"].IsNull()) + throw std::exception("[Error] Missing field \"MelBins\" (MelBins)"); + if (_Config["MelBins"].IsInt() || _Config["MelBins"].IsInt64()) + melBins = _Config["MelBins"].GetInt(); + else + throw std::exception("[Error] Field \"MelBins\" (MelBins) Must Be Int/Int64"); + + if (!(_Config["Hop"].IsInt() || _Config["Hop"].IsInt64())) + throw std::exception("[Error] Hop Must Be Int"); + HopSize = _Config["Hop"].GetInt(); + + if (HopSize < 1) + throw std::exception("[Error] Hop Must > 0"); + + if (!(_Config["HiddenSize"].IsInt() || _Config["HiddenSize"].IsInt64())) + logger.log(L"[Warn] Missing Field \"HiddenSize\", Use Default Value (256)"); + else + HiddenUnitKDims = _Config["HiddenSize"].GetInt(); + + if (_Config["Characters"].IsArray()) + SpeakerCount = (int64_t)_Config["Characters"].Size(); + + if (_Config["Volume"].IsBool()) + EnableVolume = _Config["Volume"].GetBool(); + else + logger.log(L"[Warn] Missing Field \"Volume\", Use Default Value (False)"); + + if (!_Config["CharaMix"].IsBool()) + logger.log(L"[Warn] Missing Field \"CharaMix\", Use Default Value (False)"); + else + EnableCharaMix = _Config["CharaMix"].GetBool(); + + if (!_Config["Diffusion"].IsBool()) + logger.log(L"[Warn] Missing Field \"Diffusion\", Use Default Value (False)"); + else if (_Config["Diffusion"].GetBool()) + DiffSvcVersion = L"DiffusionSvc"; + + if (_Config["Pndm"].IsInt()) + Pndms = _Config["Pndm"].GetInt(); + + _callback = _ProgressCallback; + + if (_Config["Cluster"].IsString()) + { + const auto clus = to_wide_string(_Config["Cluster"].GetString()); + if (!(_Config["KMeansLength"].IsInt() || 
_Config["KMeansLength"].IsInt64())) + logger.log(L"[Warn] Missing Field \"KMeansLength\", Use Default Value (10000)"); + else + ClusterCenterSize = _Config["KMeansLength"].GetInt(); + try + { + Cluster = MoeVoiceStudioCluster::GetMoeVSCluster(clus, _PathDict.at("Cluster"), HiddenUnitKDims, ClusterCenterSize); + EnableCluster = true; + } + catch (std::exception& e) + { + logger.error(e.what()); + EnableCluster = false; + } + } + + //LoadModels + try + { + logger.log(L"[Info] loading DiffSvc Models"); + hubert = new Ort::Session(*env, _PathDict.at("Hubert").c_str(), *session_options); + nsfHifigan = new Ort::Session(*env, _PathDict.at("Hifigan").c_str(), *session_options); + if (_waccess(_PathDict.at("Encoder").c_str(), 0) != -1) + { + encoder = new Ort::Session(*env, _PathDict.at("Encoder").c_str(), *session_options); + denoise = new Ort::Session(*env, _PathDict.at("DenoiseFn").c_str(), *session_options); + pred = new Ort::Session(*env, _PathDict.at("NoisePredictor").c_str(), *session_options); + after = new Ort::Session(*env, _PathDict.at("AfterProcess").c_str(), *session_options); + if (_waccess(_PathDict.at("Alphas").c_str(), 0) != -1) + alpha = new Ort::Session(*env, _PathDict.at("Alphas").c_str(), *session_options); + } + else + diffSvc = new Ort::Session(*env, _PathDict.at("DiffSvc").c_str(), *session_options); + + if (_waccess(_PathDict.at("Naive").c_str(), 0) != -1) + naive = new Ort::Session(*env, _PathDict.at("Naive").c_str(), *session_options); + + logger.log(L"[Info] DiffSvc Models loaded"); + } + catch (Ort::Exception& _exception) + { + Destory(); + throw std::exception(_exception.what()); + } + + if (_Config["TensorExtractor"].IsString()) + DiffSvcVersion = to_wide_string(_Config["TensorExtractor"].GetString()); + + if (_Config["MaxStep"].IsInt()) + MaxStep = _Config["MaxStep"].GetInt(); + + MoeVSTensorPreprocess::MoeVoiceStudioTensorExtractor::Others _others_param; + _others_param.Memory = *memory_info; + + try + { + _TensorExtractor = GetTensorExtractor(DiffSvcVersion, 48000, _samplingRate, HopSize, EnableCharaMix, EnableVolume, HiddenUnitKDims, SpeakerCount, _others_param); + } + catch (std::exception& e) + { + Destory(); + throw std::exception(e.what()); + } +} + std::vector DiffusionSvc::SliceInference(const MoeVSProjectSpace::MoeVSAudioSlice& _Slice, const MoeVSProjectSpace::MoeVSSvcParams& _InferParams) const { logger.log(L"[Inferring] Inferring \"" + _Slice.Path + L"\", Start!"); diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/ModelBase.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/ModelBase.cpp index ba2c70c..9ba842b 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/ModelBase.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/ModelBase.cpp @@ -84,8 +84,8 @@ std::vector MoeVoiceStudioModule::GetOpenFileNameMoeVS() #endif } -std::vector MoeVoiceStudioModule::Inference(std::wstring& _Paths, - const MoeVSProjectSpace::MoeVSSvcParams& _InferParams, +std::vector MoeVoiceStudioModule::Inference(std::wstring& _Datas, + const MoeVSProjectSpace::MoeVSParams& _InferParams, const InferTools::SlicerSettings& _SlicerSettings) const { MoeVSNotImplementedError; diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/TTS.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/TTS.cpp new file mode 100644 index 0000000..864f8b7 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/TTS.cpp @@ -0,0 +1,321 @@ +#include "../header/TTS.hpp" + +MoeVoiceStudioCoreHeader + TextToSpeech::TextToSpeech(const ExecutionProviders& 
ExecutionProvider_, unsigned DeviceID_, unsigned ThreadCount_) : MoeVoiceStudioModule(ExecutionProvider_, DeviceID_, ThreadCount_) +{ + MoeVSClassName(L"MoeVoiceStudioTextToSpeech"); +} + +std::vector TextToSpeech::GetInputSeqs(const MJson& _Input, const MoeVSProjectSpace::MoeVSParams& _InitParams) const +{ + if (!_Input.IsArray()) + throw std::exception("JSON Type Must Be Array"); + const auto _InpArr = _Input.GetArray(); + std::vector _TTSInputSeqs; + _TTSInputSeqs.reserve(_InpArr.size()); + for(const auto& iter : _InpArr) + { + MoeVSProjectSpace::MoeVSTTSSeq _Temp; + const bool TokenFieldIsStr = iter.HasMember("Tokens") && iter["Tokens"].IsString() && !iter["Tokens"].Empty(); + const bool SeqFieldIsStr = iter.HasMember("Seq") && iter["Seq"].IsString() && !iter["Seq"].Empty(); + + if (iter.HasMember("LanguageID") && iter["LanguageID"].IsString() && + LanguageMap.find(iter["LanguageID"].GetString()) != LanguageMap.end()) + _Temp.TotLang = LanguageMap.at(iter["LanguageID"].GetString()); + else + _Temp.TotLang = _InitParams.Language; + + const int64_t FirstToneIdx = GetLanguageToneIdx(_Temp.TotLang); + + if (iter.HasMember("G2PAdditionalInfo") && iter["G2PAdditionalInfo"].IsString() && !iter["G2PAdditionalInfo"].Empty()) + _Temp.AdditionalInfo = to_wide_string(iter["G2PAdditionalInfo"].GetString()); + else + _Temp.AdditionalInfo = _InitParams.AdditionalInfo; + + if (iter.HasMember("PlaceHolderSymbol") && iter["PlaceHolderSymbol"].IsString()) + _Temp.PlaceHolderSymbol = to_wide_string(iter["PlaceHolderSymbol"].GetString()); + else + _Temp.PlaceHolderSymbol = _InitParams.PlaceHolderSymbol; + + if(TokenFieldIsStr && SeqFieldIsStr) + { + _Temp.SeqStr = to_wide_string(iter["Tokens"].GetString()); + auto TempString = to_wide_string(iter["Seq"].GetString()); + if (TempString.find(L"[ph]") == 0) + _Temp.Seq = Cleaner->DictReplace(TempString.substr(4), _Temp.PlaceHolderSymbol); + else + _Temp.Seq = Cleaner->DictReplace(Cleaner->G2p(TempString, _Temp.PlaceHolderSymbol, _Temp.AdditionalInfo, _Temp.TotLang), _Temp.PlaceHolderSymbol); + } + else if (TokenFieldIsStr) + _Temp.SeqStr = to_wide_string(iter["Tokens"].GetString()); + else if(SeqFieldIsStr) + _Temp.SeqStr = to_wide_string(iter["Seq"].GetString()); + else + throw std::exception("You Should Input Tokens To Inference"); + if (iter.HasMember("Seq") && iter["Seq"].IsArray()) + { + const auto SeqObject = iter["Seq"]; + if (!SeqObject.Empty()) + for (const auto& j : SeqObject.GetArray()) + _Temp.Seq.emplace_back(j.IsString() ? to_wide_string(j.GetString()) : std::wstring()); + else if(_Temp.SeqStr.empty()) + throw std::exception("You Should Input Tokens To Inference"); + } + + if(_Temp.SeqStr.empty()) + throw std::exception("You Should Input Tokens To Inference"); + + if (iter.HasMember("Tones") && iter["Tones"].IsArray()) + for (const auto& j : iter["Tones"].GetArray()) + _Temp.Tones.emplace_back(j.IsInt() ? j.GetInt() + FirstToneIdx : 0); + if (iter.HasMember("Durations") && iter["Durations"].IsArray()) + for (const auto& j : iter["Durations"].GetArray()) + _Temp.Durations.emplace_back(j.IsInt() ? j.GetInt() : 0); + if (iter.HasMember("Language") && iter["Language"].IsArray()) + for (const auto& j : iter["Language"].GetArray()) + _Temp.Language.emplace_back(j.IsInt() ? j.GetInt() : (j.IsString() ? LanguageMap.at(j.GetString()) : 0)); + if (iter.HasMember("SpeakerMix") && iter["SpeakerMix"].IsArray()) + for (const auto& j : iter["SpeakerMix"].GetArray()) + _Temp.SpeakerMix.emplace_back(j.IsFloat() ? 
j.GetFloat() : 0.f); + else + _Temp.SpeakerMix = _InitParams.SpeakerMix; + if (iter.HasMember("EmotionPrompt") && iter["EmotionPrompt"].IsArray()) + for (const auto& j : iter["EmotionPrompt"].GetArray()) + _Temp.EmotionPrompt.emplace_back(j.IsString() ? to_wide_string(j.GetString()) : std::wstring()); + else + _Temp.EmotionPrompt = _InitParams.EmotionPrompt; + if (iter.HasMember("NoiseScale") && iter["NoiseScale"].IsFloat()) + _Temp.NoiseScale = iter["NoiseScale"].GetFloat(); + else + _Temp.NoiseScale = _InitParams.NoiseScale; + if (iter.HasMember("LengthScale") && iter["LengthScale"].IsFloat()) + _Temp.LengthScale = iter["LengthScale"].GetFloat(); + else + _Temp.LengthScale = _InitParams.LengthScale; + if (iter.HasMember("RestTime") && iter["RestTime"].IsFloat()) + _Temp.RestTime = iter["RestTime"].GetFloat(); + else + _Temp.RestTime = _InitParams.RestTime; + if (iter.HasMember("DurationPredictorNoiseScale") && iter["DurationPredictorNoiseScale"].IsFloat()) + _Temp.DurationPredictorNoiseScale = iter["DurationPredictorNoiseScale"].GetFloat(); + else + _Temp.DurationPredictorNoiseScale = _InitParams.DurationPredictorNoiseScale; + if (iter.HasMember("FactorDpSdp") && iter["FactorDpSdp"].IsFloat()) + _Temp.FactorDpSdp = iter["FactorDpSdp"].GetFloat(); + else + _Temp.FactorDpSdp = _InitParams.FactorDpSdp; + if (iter.HasMember("GateThreshold") && iter["GateThreshold"].IsFloat()) + _Temp.GateThreshold = iter["GateThreshold"].GetFloat(); + else + _Temp.GateThreshold = _InitParams.GateThreshold; + if (iter.HasMember("MaxDecodeStep") && iter["MaxDecodeStep"].IsFloat()) + _Temp.MaxDecodeStep = iter["MaxDecodeStep"].GetInt(); + else + _Temp.MaxDecodeStep = _InitParams.MaxDecodeStep; + if (iter.HasMember("Seed") && iter["Seed"].IsInt()) + _Temp.Seed = iter["Seed"].GetInt(); + else + _Temp.Seed = _InitParams.Seed; + if (iter.HasMember("SpeakerId") && iter["SpeakerId"].IsInt()) + _Temp.SpeakerId = iter["SpeakerId"].GetInt(); + else + _Temp.SpeakerId = _InitParams.SpeakerId; + + if (_Temp.MaxDecodeStep < 500) _Temp.MaxDecodeStep = 500; + if (_Temp.GateThreshold > 0.98f) _Temp.GateThreshold = 0.98f; + if (_Temp.GateThreshold < 0.2f) _Temp.GateThreshold = 0.2f; + if (_Temp.FactorDpSdp > 1.f) _Temp.FactorDpSdp = 1.f; + if (_Temp.FactorDpSdp < 0.f) _Temp.FactorDpSdp = 0.f; + if (_Temp.DurationPredictorNoiseScale > 10.f) _Temp.DurationPredictorNoiseScale = 10.f; + if (_Temp.DurationPredictorNoiseScale < 0.f) _Temp.DurationPredictorNoiseScale = 0.f; + if (_Temp.RestTime > 30.f) _Temp.RestTime = 30.f; + if (_Temp.LengthScale > 10.f) _Temp.LengthScale = 10.f; + if (_Temp.LengthScale < 0.1f) _Temp.LengthScale = 0.1f; + + if (!_Temp.SeqStr.empty() && _Temp.Seq.empty()) + { + if (_Temp.SeqStr.find(L"[ph]") == 0) + _Temp.Seq = Cleaner->DictReplace(_Temp.SeqStr.substr(4), _Temp.PlaceHolderSymbol); + else + _Temp.Seq = Cleaner->DictReplace(Cleaner->G2p(_Temp.SeqStr, _Temp.PlaceHolderSymbol, _Temp.AdditionalInfo, _Temp.TotLang), _Temp.PlaceHolderSymbol); + } + _TTSInputSeqs.emplace_back(std::move(_Temp)); + } + return _TTSInputSeqs; +} + +std::vector TextToSpeech::GetEmotionVector(const std::vector& src) const +{ + if (src.empty()) + return EmoLoader[0]; + std::vector dst(1024, 0.0); + uint64_t mul = 0; + for(const auto& iter : src) + { + long emoId; + const auto emoStr = to_byte_string(iter); + if (!EmoJson[emoStr].Empty()) + emoId = EmoJson[emoStr].GetInt(); + else + emoId = atoi(emoStr.c_str()); + auto emoVec = EmoLoader[emoId]; + for (size_t i = 0; i < 1024; ++i) + dst[i] = dst[i] + (emoVec[i] - dst[i]) / 
(float)(mul + 1ull); + ++mul; + } + return dst; +} + +std::vector> TextToSpeech::generatePath(float* duration, size_t durationSize, size_t maskSize) +{ + for (size_t i = 1; i < maskSize; ++i) + duration[i] = duration[i - 1] + duration[i]; + std::vector> path(durationSize, std::vector(maskSize, false)); + //const auto path = new float[maskSize * durationSize]; + /* + for (size_t i = 0; i < maskSize; ++i) + for (size_t j = 0; j < durationSize; ++j) + path[i][j] = (j < (size_t)duration[i] ? 1.0f : 0.0f); + for (size_t i = maskSize - 1; i > 0ull; --i) + for (size_t j = 0; j < durationSize; ++j) + path[i][j] -= path[i-1][j]; + */ + auto dur = (size_t)duration[0]; + for (size_t j = 0; j < dur; ++j) + path[j][0] = true; + /* + for (size_t i = maskSize - 1; i > 0ull; --i) + for (size_t j = 0; j < durationSize; ++j) + path[i][j] = (j < (size_t)duration[i] && j >= (size_t)duration[i - 1]); + std::vector> tpath(durationSize, std::vector(maskSize)); + for (size_t i = 0; i < maskSize; ++i) + for (size_t j = 0; j < durationSize; ++j) + tpath[j][i] = path[i][j]; + */ + for (size_t j = maskSize - 1; j > 0ull; --j) + { + dur = (size_t)duration[j]; + for (auto i = (size_t)duration[j - 1]; i < dur && i < durationSize; ++i) + path[i][j] = true; + } + return path; +} + +std::vector> TextToSpeech::Inference(const std::wstring& _Seq, const MoeVSProjectSpace::MoeVSParams& _InferParams) const +{ + if (_Seq.empty()) + return {}; + if (_Seq.find(L"[ph]") != 0 && _Seq[0] == L'[') + return Inference(GetInputSeqs({ to_byte_string(_Seq), true }, _InferParams)); + + std::vector SeqLens; + std::wstring TmpSeq; + for (const auto chari : _Seq) + { + if ((chari == L'\n') || (chari == L'\r')) + { + if (!TmpSeq.empty()) + { + SeqLens.push_back(TmpSeq); + TmpSeq.clear(); + } + continue; + } + TmpSeq += chari; + } + if (!TmpSeq.empty()) + SeqLens.push_back(TmpSeq); + + std::vector InputSeqs; + InputSeqs.reserve(SeqLens.size()); + for(const auto& SeqL : SeqLens) + { + MoeVSProjectSpace::MoeVSTTSSeq TmpSeqData; + if (SeqL.find(L"[ph]") == 0) + TmpSeqData.Seq = Cleaner->DictReplace(SeqL.substr(4), _InferParams.PlaceHolderSymbol); + else + TmpSeqData.Seq = Cleaner->DictReplace(Cleaner->G2p(SeqL, _InferParams.PlaceHolderSymbol, _InferParams.AdditionalInfo, _InferParams.Language), _InferParams.PlaceHolderSymbol); + TmpSeqData.SpeakerMix = _InferParams.SpeakerMix; + TmpSeqData.EmotionPrompt = _InferParams.EmotionPrompt; + TmpSeqData.PlaceHolderSymbol = _InferParams.PlaceHolderSymbol; + TmpSeqData.NoiseScale = _InferParams.NoiseScale; + TmpSeqData.LengthScale = _InferParams.LengthScale; + TmpSeqData.DurationPredictorNoiseScale = _InferParams.DurationPredictorNoiseScale; + TmpSeqData.FactorDpSdp = _InferParams.FactorDpSdp; + TmpSeqData.GateThreshold = _InferParams.GateThreshold; + TmpSeqData.MaxDecodeStep = _InferParams.MaxDecodeStep; + TmpSeqData.Seed = _InferParams.Seed; + TmpSeqData.SpeakerId = _InferParams.SpeakerId; + TmpSeqData.RestTime = _InferParams.RestTime; + InputSeqs.emplace_back(std::move(TmpSeqData)); + } + return Inference(InputSeqs); +} + +std::vector> TextToSpeech::Inference(const MJson& _Inputs, const MoeVSProjectSpace::MoeVSParams& _InferParams) const +{ + return Inference(GetInputSeqs(_Inputs, _InferParams)); +} + +std::vector TextToSpeech::Inference(std::wstring& _Datas, const MoeVSProjectSpace::MoeVSParams& _InferParams, const InferTools::SlicerSettings& _SlicerSettings) const +{ + std::vector AudioFolders; + const auto PCM = Inference(_Datas, _InferParams); + AudioFolders.reserve(PCM.size()); + for (const auto& i : 
PCM) + { + std::wstring OutFolder = GetCurrentFolder() + L"/Outputs/BatchInference"; + if (_waccess((OutFolder + L".wav").c_str(), 0) != -1) + { + for (size_t idx = 0; idx < 99999999; ++idx) + if (_waccess((OutFolder + L" (" + std::to_wstring(idx) + L").wav").c_str(), 0) == -1) + { + OutFolder += L" (" + std::to_wstring(idx) + L").wav"; + break; + } + } + else + OutFolder += L".wav"; + AudioFolders.emplace_back(OutFolder); + InferTools::Wav::WritePCMData(_samplingRate, 1, i, OutFolder); + } + return AudioFolders; +} + +std::vector> TextToSpeech::Inference(const std::vector& _Input) const +{ + MoeVSNotImplementedError; +} + +std::vector TextToSpeech::GetAligments(size_t DstLen, size_t SrcLen) +{ + std::vector mel2ph(DstLen + 1, 0); + + size_t startFrame = 0; + const double ph_durs = static_cast(DstLen) / static_cast(SrcLen); + for (size_t iph = 0; iph < SrcLen; ++iph) + { + const auto endFrame = static_cast(round(static_cast(iph) * ph_durs + ph_durs)); + for (auto j = startFrame; j < endFrame + 1; ++j) + mel2ph[j] = static_cast(iph) + 1; + startFrame = endFrame + 1; + } + return mel2ph; +} + +std::wstring TextToSpeech::TextNormalize(const std::wstring& _Input, int64_t LanguageId) const +{ + auto Iterator = LanguageMap.begin(); + while(Iterator != LanguageMap.end()) + { + if (Iterator->second == LanguageId) + break; + ++Iterator; + } + + if (Iterator != LanguageMap.end()) + return MoeVSG2P::NormalizeText(_Input, Iterator->first); + return _Input; +} + +MoeVoiceStudioCoreEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/Vits.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/Vits.cpp new file mode 100644 index 0000000..730455f --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/Vits.cpp @@ -0,0 +1,699 @@ +#include "../header/Vits.hpp" +#include + +MoeVoiceStudioCoreHeader + +Vits::~Vits() +{ + logger.log(L"[Info] unloading Vits Models"); + destory(); + logger.log(L"[Info] Vits Models unloaded"); +} + +Vits::Vits(const MJson& _Config, const ProgressCallback& _ProgressCallback, + const DurationCallback& _DurationCallback, + ExecutionProviders ExecutionProvider_, + unsigned DeviceID_, unsigned ThreadCount_) : + TextToSpeech(ExecutionProvider_, DeviceID_, ThreadCount_) +{ + //Check Folder + if (_Config["Folder"].IsNull()) + throw std::exception("[Error] Missing field \"folder\" (Model Folder)"); + if (!_Config["Folder"].IsString()) + throw std::exception("[Error] Field \"folder\" (Model Folder) Must Be String"); + const auto _folder = to_wide_string(_Config["Folder"].GetString()); + if (_folder.empty()) + throw std::exception("[Error] Field \"folder\" (Model Folder) Can Not Be Empty"); + const std::wstring _path = GetCurrentFolder() + L"\\Models\\" + _folder + L"\\" + _folder; + + std::map _PathDict; + + if(_Config.HasMember("EmotionalPath") && _Config["EmotionalPath"].IsString()) + { + const auto emoStringload = to_wide_string(_Config["EmotionalPath"].GetString()); + if(!emoStringload.empty()) + { + _PathDict["EmotionalPath"] = GetCurrentFolder() + L"\\emotion\\" + emoStringload + L".npy"; + _PathDict["EmotionalDictPath"] = GetCurrentFolder() + L"\\emotion\\" + emoStringload + L".json"; + } + } + + _PathDict["Decoder"] = _path + L"_dec.onnx"; + _PathDict["StochasticDurationPredictor"] = _path + L"_sdp.onnx"; + _PathDict["DurationPredictor"] = _path + L"_dp.onnx"; + _PathDict["Encoder"] = _path + L"_enc_p.onnx"; + _PathDict["FlowNet"] = _path + L"_flow.onnx"; + _PathDict["Embidding"] = _path + L"_emb.onnx"; + + if 
(_Config.HasMember("Dict") && _Config["Dict"].IsString() && !_Config["Dict"].Empty()) + _PathDict["Dict"] = GetCurrentFolder() + L"/Dict/" + to_wide_string(_Config["Dict"].GetString()) + L".json"; + + std::vector _BertPaths; + if (_Config.HasMember("BertPath") && _Config["BertPath"].IsArray() && !_Config["BertPath"].Empty()) + { + for(const auto& BPH : _Config["BertPath"].GetArray()) + { + const auto BertPath = to_wide_string(BPH.GetString()); + if(!BertPath.empty()) + _BertPaths.emplace_back(GetCurrentFolder() + L"/Bert/" + BertPath); + } + } + + load(_PathDict, _Config, _ProgressCallback, _DurationCallback, _BertPaths); +} + +void Vits::load(const std::map& _PathDict, + const MJson& _Config, const ProgressCallback& _ProgressCallback, + const DurationCallback& _DurationCallback, const std::vector& _BertPaths) +{ + if (_Config["Type"].IsNull()) + throw std::exception("[Error] Missing field \"Type\" (ModelType)"); + if (!_Config["Type"].IsString()) + throw std::exception("[Error] Field \"Type\" (ModelType) Must Be String"); + VitsType = _Config["Type"].GetString(); + if (VitsType == "Pits") + { + UseTone = true; + UseLength = false; + } + if (VitsType == "BertVits") + { + UseLength = false; + UseTone = true; + UseBert = true; + UseLanguage = true; + EncoderG = true; + } + + Cleaner = MoeVSG2P::GetDefCleaner(); + if (_PathDict.find("Dict") != _PathDict.end()) + if (_waccess(_PathDict.at("Dict").c_str(), 0) != -1) + Cleaner->loadDict(_PathDict.at("Dict")); + + if (_Config.HasMember("LanguageMap") && !_Config["LanguageMap"].IsNull()) + for (const auto& CMember : _Config["LanguageMap"].GetMemberArray()) + LanguageMap[CMember.first] = CMember.second.GetInt(); + else + logger.log("[Warn] Field \"LanguageMap\" Is Missing, Use Default Value"); + + if (UseLength) + EncoderInputNames.emplace_back("x_lengths"); + if (UseTone) + EncoderInputNames.emplace_back("t"); + if(Emotion) + EncoderInputNames.emplace_back("emotion"); + if (UseLanguage) + EncoderInputNames.emplace_back("language"); + + //Check SamplingRate + if (_Config["Rate"].IsNull()) + throw std::exception("[Error] Missing field \"Rate\" (SamplingRate)"); + if (_Config["Rate"].IsInt() || _Config["Rate"].IsInt64()) + _samplingRate = _Config["Rate"].GetInt(); + else + throw std::exception("[Error] Field \"Rate\" (SamplingRate) Must Be Int/Int64"); + + logger.log(L"[Info] Current Sampling Rate is" + std::to_wstring(_samplingRate)); + + //Check Symbol + if (!_Config.HasMember("Symbol") || _Config["Symbol"].IsNull()) + throw std::exception("[Error] Missing field \"Symbol\" (PhSymbol)"); + if (_Config.HasMember("AddBlank") && !_Config["AddBlank"].IsNull()) + AddBlank = _Config["AddBlank"].GetBool(); + else + logger.log(L"[Warn] Field \"AddBlank\" Is Missing, Use Default Value"); + + //Load Symbol + int64_t iter = 0; + if (_Config["Symbol"].IsArray()) + { + logger.log(L"[Info] Use Phs"); + if (_Config["Symbol"].Empty()) + throw std::exception("[Error] Field \"Symbol\" (PhSymbol) Can Not Be Empty"); + const auto SymbolArr = _Config["Symbol"].GetArray(); + if (!SymbolArr[0].IsString()) + throw std::exception("[Error] Field \"Symbol\" (PhSymbol) Must Be Array or String"); + for (const auto& it : SymbolArr) + Symbols.insert({ to_wide_string(it.GetString()), iter++ }); + } + else + { + if (!_Config["Symbol"].IsString()) + throw std::exception("[Error] Field \"Symbol\" (PhSymbol) Must Be Array or String"); + logger.log(L"[Info] Use Symbols"); + const std::wstring SymbolsStr = to_wide_string(_Config["Symbol"].GetString()); + if (SymbolsStr.empty()) + throw 
std::exception("[Error] Field \"Symbol\" (PhSymbol) Can Not Be Empty"); + for (size_t i = 0; i < SymbolsStr.length(); ++i) + Symbols.insert({ SymbolsStr.substr(i,1) , iter++ }); + } + + try + { + if (_PathDict.find("EmotionalPath") != _PathDict.end()) + { + const auto EmotionPath = _PathDict.at("EmotionalPath"); + if (!EmotionPath.empty()) + { + logger.log(L"[Info] Loading EmotionVector"); + EmoLoader.open(EmotionPath); + logger.log(L"[Info] EmotionVector Loaded"); + Emotion = true; + } + } + if (_PathDict.find("EmotionalDictPath") != _PathDict.end()) + { + const auto EmotionPath = _PathDict.at("EmotionalDictPath"); + if (!EmotionPath.empty()) + EmoJson = { to_byte_string(EmotionPath).c_str() }; + } + } + catch (std::exception& e) + { + logger.log((std::string("[Warn] EmotionPath Error ") + e.what()).c_str()); + } + + if (_Config.HasMember("Characters") && _Config["Characters"].IsArray()) + SpeakerCount = (int64_t)_Config["Characters"].Size(); + + if(UseLanguage) + { + if (_Config["LanguageMap"].IsNull() || !_Config.HasMember("LanguageMap")) + throw std::exception("[Error] Missing field \"LanguageMap\" (LanguageMap)"); + for(const auto& Item : _Config["LanguageMap"].GetMemberArray()) + { + if (!Item.second.IsArray()) + continue; + const auto LangArr = Item.second.GetArray(); + if (LangArr.size() != 2) + continue; + LanguageMap[Item.first] = LangArr[0].GetInt(); + LanguageTones[Item.first] = LangArr[1].GetInt(); + } + } + + if (UseBert) + { + if (LanguageMap.size() != _BertPaths.size()) + EncoderInputNames.emplace_back("bert"); + else + { + BertNames.reserve(_BertPaths.size() * 2); + for (size_t i = 0; i < _BertPaths.size(); ++i) + BertNames.emplace_back("bert_" + std::to_string(i)); + for(const auto& NameInp : BertNames) + EncoderInputNames.emplace_back(NameInp.data()); + } + for(const auto& Path : _BertPaths) + { + if (_waccess(Path.c_str(), 0) != -1) + { + Ort::Session* SessionBert = nullptr; + try + { + SessionBert = new Ort::Session(*env, (Path + L"/model.onnx").c_str(), *session_options); + } + catch(Ort::Exception& e) + { + logger.log(L"[Warn] " + to_wide_string(e.what())); + delete SessionBert; + SessionBert = nullptr; + } + sessionBert.emplace_back(SessionBert); + if (_waccess((Path + L"/Tokenizer.json").c_str(), 0) != -1) + { + Tokenizers.emplace_back(Path + L"/Tokenizer.json"); + Tokenizers.back().BondCleaner(Cleaner); + } + else if (SessionBert) + throw std::exception("Bert Must Have a Tokenizer"); + } + } + } + + _callback = _ProgressCallback; + CustomDurationCallback = _DurationCallback; + + //LoadModels + try + { + logger.log(L"[Info] loading Vits Models"); + sessionDec = new Ort::Session(*env, _PathDict.at("Decoder").c_str(), *session_options); + sessionEnc_p = new Ort::Session(*env, _PathDict.at("Encoder").c_str(), *session_options); + sessionFlow = new Ort::Session(*env, _PathDict.at("FlowNet").c_str(), *session_options); + + if (_waccess(_PathDict.at("Embidding").c_str(), 0) != -1) + sessionEmb = new Ort::Session(*env, _PathDict.at("Embidding").c_str(), *session_options); + else + sessionEmb = nullptr; + + if (_waccess(_PathDict.at("DurationPredictor").c_str(), 0) != -1) + sessionDp = new Ort::Session(*env, _PathDict.at("DurationPredictor").c_str(), *session_options); + else + sessionDp = nullptr; + + if (_waccess(_PathDict.at("StochasticDurationPredictor").c_str(), 0) != -1) + sessionSdp = new Ort::Session(*env, _PathDict.at("StochasticDurationPredictor").c_str(), *session_options); + else + sessionSdp = nullptr; + + if (!sessionDp && !sessionSdp) + { + destory(); + 
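+ // Neither the "DurationPredictor" nor the "StochasticDurationPredictor" session could be created; Vits requires at least one duration predictor, so model loading is aborted here.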
throw std::exception("You must have a duration predictor"); + } + + logger.log(L"[Info] Vits Models loaded"); + } + catch (Ort::Exception& _exception) + { + destory(); + throw std::exception(_exception.what()); + } + + if (sessionEmb) + { + if(EncoderG) EncoderInputNames.emplace_back("g"); + SdpInputNames.emplace_back("g"); + DpInputNames.emplace_back("g"); + FlowInputNames.emplace_back("g"); + DecInputNames.emplace_back("g"); + } +} + +Vits::Vits(const std::map& _PathDict, + const MJson& _Config, const ProgressCallback& _ProgressCallback, + const DurationCallback& _DurationCallback, const std::vector& _BertPaths, + ExecutionProviders ExecutionProvider_, + unsigned DeviceID_, unsigned ThreadCount_) : + TextToSpeech(ExecutionProvider_, DeviceID_, ThreadCount_) +{ + load(_PathDict, _Config, _ProgressCallback, _DurationCallback, _BertPaths); +} + +std::vector> Vits::Inference(const std::vector& _Input) const +{ + std::vector> PCM; + PCM.reserve(_Input.size()); + std::vector> _Audio(1); + logger.log("[Inference] Vits Inference Begin"); + size_t proc = 0; + _callback(proc, _Input.size()); + for(const auto& Seq : _Input) + { + _callback(proc++, _Input.size()); + if(Seq.Seq.empty()) + continue; + + if (!_Audio[0].empty()) + { + if (Seq.RestTime < 0.f) + { + _Audio[0].insert(_Audio[0].end(), size_t(_samplingRate), 0); + PCM.emplace_back(std::move(_Audio[0])); + _Audio[0] = std::vector(); + } + else + _Audio[0].insert(_Audio[0].end(), size_t(Seq.RestTime * float(_samplingRate)), 0); + } + + std::mt19937 gen(static_cast(Seq.Seed)); + std::normal_distribution FloatRandFn(0.f, 1.f); + std::uniform_int_distribution IntRandFn(0, RAND_MAX); + + std::vector TextSeq; + TextSeq.reserve(Seq.Seq.size() * 4 + 4); + for (const auto& it : Seq.Seq) + { + if (AddBlank) + TextSeq.push_back(0); + if (Symbols.find(it) != Symbols.end()) + TextSeq.push_back(Symbols.at(it)); + else + TextSeq.push_back(int64_t(size_t(IntRandFn(gen)) % Symbols.size())); + } + if (AddBlank) + TextSeq.push_back(0); + int64_t TextSeqLength[] = { (int64_t)TextSeq.size() }; + std::vector EncoderOutputs; + std::vector EncoderInputs; + const int64_t TextSeqShape[2] = { 1,TextSeqLength[0] }; + constexpr int64_t LengthShape[1] = { 1 }; + EncoderInputs.push_back(Ort::Value::CreateTensor( + *memory_info, TextSeq.data(), TextSeqLength[0], TextSeqShape, 2)); + if (UseLength) + EncoderInputs.push_back(Ort::Value::CreateTensor( + *memory_info, TextSeqLength, 1, LengthShape, 1)); + std::vector emoVec; + constexpr int64_t EmotionShape[1] = { 1024 }; + if(Emotion) + { + emoVec = GetEmotionVector(Seq.EmotionPrompt); + EncoderInputs.push_back(Ort::Value::CreateTensor( + *memory_info, emoVec.data(), 1024, EmotionShape, 1)); + } + std::vector ToneIn(TextSeq.size(), 0i64); + if(UseTone) + { + if (ToneIn.size() == Seq.Tones.size()) + ToneIn = Seq.Tones; + else if (AddBlank && ToneIn.size() == Seq.Tones.size() * 2 + 1) + for (size_t i = 1; i < ToneIn.size(); i += 2) + ToneIn[i] = Seq.Tones[i / 2]; + else if (ToneIn.size() * 2 + 1 == Seq.Tones.size()) + for (size_t i = 1; i < Seq.Tones.size(); i += 2) + ToneIn[i / 2] = Seq.Tones[i]; + EncoderInputs.push_back(Ort::Value::CreateTensor( + *memory_info, ToneIn.data(), TextSeqLength[0], TextSeqShape, 2)); + } + std::vector LanguageIn(TextSeq.size(), Seq.TotLang); + if(UseLanguage) + { + if (LanguageIn.size() == Seq.Tones.size()) + LanguageIn = Seq.Tones; + else if (AddBlank && LanguageIn.size() == Seq.Tones.size() * 2 + 1) + for (size_t i = 1; i < LanguageIn.size(); i += 2) + LanguageIn[i] = Seq.Tones[i / 2]; + else if 
(LanguageIn.size() * 2 + 1 == Seq.Tones.size()) + for (size_t i = 1; i < Seq.Tones.size(); i += 2) + LanguageIn[i / 2] = Seq.Tones[i]; + EncoderInputs.push_back(Ort::Value::CreateTensor( + *memory_info, LanguageIn.data(), TextSeqLength[0], TextSeqShape, 2)); + } + std::vector BertVecs(sessionBert.size(), std::vector(1024 * TextSeqLength[0], 0.f)); + int64_t BertShape[2] = { TextSeqLength[0],1024 }; + if(UseBert) + { + for (size_t IndexOfBert = 0; IndexOfBert < sessionBert.size(); ++IndexOfBert) + { + auto& BertData = BertVecs[IndexOfBert]; + if (sessionBert[IndexOfBert] && (IndexOfBert == size_t(Seq.TotLang) || + (IndexOfBert != size_t(Seq.TotLang) && sessionBert.size() == 1))) + { + auto input_ids = Tokenizers[IndexOfBert](TextNormalize(Seq.SeqStr, Seq.TotLang)); + std::vector attention_mask(input_ids.size(), 1), token_type_ids(input_ids.size(), 0); + int64_t AttentionShape[2] = { 1, (int64_t)input_ids.size() }; + std::vector AttentionInput, AttentionOutput; + AttentionInput.emplace_back(Ort::Value::CreateTensor( + *memory_info, input_ids.data(), input_ids.size(), AttentionShape, 2)); + AttentionInput.emplace_back(Ort::Value::CreateTensor( + *memory_info, attention_mask.data(), attention_mask.size(), AttentionShape, 2)); + AttentionInput.emplace_back(Ort::Value::CreateTensor( + *memory_info, token_type_ids.data(), token_type_ids.size(), AttentionShape, 2)); + try + { + AttentionOutput = sessionBert[IndexOfBert]->Run(Ort::RunOptions{ nullptr }, + BertInputNames.data(), + AttentionInput.data(), + 3, + BertOutputNames.data(), + 1); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: Bert\n") + e.what()).c_str()); + } + const auto AligmentMartix = GetAligments(BertShape[0], AttentionOutput[0].GetTensorTypeAndShapeInfo().GetShape()[0]); + const auto AttnData = AttentionOutput[0].GetTensorData(); + for (int64_t IndexOfSrcVector = 0; IndexOfSrcVector < TextSeqLength[0]; ++IndexOfSrcVector) + memcpy(BertData.data() + IndexOfSrcVector * 1024, AttnData + AligmentMartix[IndexOfSrcVector] * 1024, 1024 * sizeof(float)); + } + EncoderInputs.emplace_back(Ort::Value::CreateTensor( + *memory_info, BertData.data(), BertData.size(), BertShape, 2)); + } + } + + std::vector GEmbidding; + std::vector GOutShape; + if (sessionEmb) + { + auto SpeakerMixData = Seq.SpeakerMix; + if (!SpeakerMixData.empty() && SpeakerCount > 1) + { + LinearCombination(SpeakerMixData); + int64_t csid = 0; + for (const auto& CharaP : SpeakerMixData) + { + std::vector EmbiddingInput; + std::vector EmbiddingOutput; + if (csid >= SpeakerCount) + break; + if (CharaP < 0.0001f) + { + ++csid; + continue; + } + int64_t Character[1] = { csid }; + EmbiddingInput.push_back(Ort::Value::CreateTensor( + *memory_info, Character, 1, LengthShape, 1)); + try + { + EmbiddingOutput = sessionEmb->Run(Ort::RunOptions{ nullptr }, + EmbiddingInputNames.data(), + EmbiddingInput.data(), + EmbiddingInput.size(), + EmbiddingOutputNames.data(), + EmbiddingOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: emb\n") + e.what()).c_str()); + } + const auto GOutCount = EmbiddingOutput[0].GetTensorTypeAndShapeInfo().GetElementCount(); + if (GOutShape.empty()) + { + GEmbidding = std::vector(EmbiddingOutput[0].GetTensorData(), EmbiddingOutput[0].GetTensorData() + GOutCount); + GOutShape = EmbiddingOutput[0].GetTensorTypeAndShapeInfo().GetShape(); + GOutShape.emplace_back(1); + for (auto idx : GEmbidding) + idx *= float(CharaP); + } + else + for (size_t i = 0; i < GOutCount; ++i) + 
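+ // Blend multi-speaker embeddings: accumulate this speaker's embedding into GEmbidding, weighted by its mix ratio CharaP.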
GEmbidding[i] += EmbiddingOutput[0].GetTensorData()[i] * float(CharaP); + ++csid; + } + } + else + { + std::vector EmbiddingInput; + std::vector EmbiddingOutput; + int64_t Character[1] = { Seq.SpeakerId }; + EmbiddingInput.push_back(Ort::Value::CreateTensor( + *memory_info, Character, 1, LengthShape, 1)); + try + { + EmbiddingOutput = sessionEmb->Run(Ort::RunOptions{ nullptr }, + EmbiddingInputNames.data(), + EmbiddingInput.data(), + EmbiddingInput.size(), + EmbiddingOutputNames.data(), + EmbiddingOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: emb\n") + e.what()).c_str()); + } + const auto GOutCount = EmbiddingOutput[0].GetTensorTypeAndShapeInfo().GetElementCount(); + GEmbidding = std::vector(EmbiddingOutput[0].GetTensorData(), EmbiddingOutput[0].GetTensorData() + GOutCount); + GOutShape = EmbiddingOutput[0].GetTensorTypeAndShapeInfo().GetShape(); + GOutShape.emplace_back(1); + } + if (EncoderG) + EncoderInputs.push_back(Ort::Value::CreateTensor(*memory_info, GEmbidding.data(), GEmbidding.size(), GOutShape.data(), 3)); + } + + try + { + EncoderOutputs = sessionEnc_p->Run(Ort::RunOptions{ nullptr }, + EncoderInputNames.data(), + EncoderInputs.data(), + EncoderInputs.size(), + EncoderOutputNames.data(), + EncoderOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: enc_p\n") + e.what()).c_str()); + } + + std::vector + m_p(EncoderOutputs[1].GetTensorData(), EncoderOutputs[1].GetTensorData() + EncoderOutputs[1].GetTensorTypeAndShapeInfo().GetElementCount()), + logs_p(EncoderOutputs[2].GetTensorData(), EncoderOutputs[2].GetTensorData() + EncoderOutputs[2].GetTensorTypeAndShapeInfo().GetElementCount()), + x_mask(EncoderOutputs[3].GetTensorData(), EncoderOutputs[3].GetTensorData() + EncoderOutputs[3].GetTensorTypeAndShapeInfo().GetElementCount()); + + const auto xshape = EncoderOutputs[0].GetTensorTypeAndShapeInfo().GetShape(); + + std::vector w_ceil(TextSeqLength[0], 1.f); + bool enable_dp = false; + if (Seq.Durations.size() == w_ceil.size() || Seq.Durations.size() == w_ceil.size() / 2) + enable_dp = true; + + const int64_t zinputShape[3] = { xshape[0],2,xshape[2] }; + const int64_t zinputCount = xshape[0] * xshape[2] * 2; + std::vector zinput(zinputCount, 0.0); + for (auto& it : zinput) + it = FloatRandFn(gen) * Seq.DurationPredictorNoiseScale; + std::vector DurationPredictorInput; + DurationPredictorInput.push_back(std::move(EncoderOutputs[0])); + DurationPredictorInput.push_back(std::move(EncoderOutputs[3])); + DurationPredictorInput.push_back(Ort::Value::CreateTensor( + *memory_info, zinput.data(), zinputCount, zinputShape, 3)); + if (sessionEmb) + DurationPredictorInput.push_back(Ort::Value::CreateTensor(*memory_info, GEmbidding.data(), GEmbidding.size(), GOutShape.data(), 3)); + if(sessionSdp) + { + std::vector StochasticDurationPredictorOutput; + try + { + StochasticDurationPredictorOutput = sessionSdp->Run(Ort::RunOptions{ nullptr }, + SdpInputNames.data(), + DurationPredictorInput.data(), + DurationPredictorInput.size(), + SdpOutputNames.data(), + SdpOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: dp\n") + e.what()).c_str()); + } + const auto w_data = StochasticDurationPredictorOutput[0].GetTensorMutableData(); + const auto w_data_length = StochasticDurationPredictorOutput[0].GetTensorTypeAndShapeInfo().GetElementCount(); + if (w_data_length != w_ceil.size()) + w_ceil.resize(w_data_length, 0.f); + float SdpFactor = 1.f - 
Seq.FactorDpSdp; + if (sessionDp) + for (size_t i = 0; i < w_ceil.size(); ++i) + w_ceil[i] = ceil(exp(w_data[i] * SdpFactor) * x_mask[i] * Seq.LengthScale); + else + for (size_t i = 0; i < w_ceil.size(); ++i) + w_ceil[i] = ceil(exp(w_data[i]) * x_mask[i] * Seq.LengthScale); + } + if (sessionDp) + { + std::vector DurationPredictorOutput; + DurationPredictorInput.erase(DurationPredictorInput.begin() + 2); + try + { + DurationPredictorOutput = sessionDp->Run(Ort::RunOptions{ nullptr }, + DpInputNames.data(), + DurationPredictorInput.data(), + DurationPredictorInput.size(), + DpOutputNames.data(), + DpOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: dp\n") + e.what()).c_str()); + } + const auto w_data = DurationPredictorOutput[0].GetTensorMutableData(); + const auto w_data_length = DurationPredictorOutput[0].GetTensorTypeAndShapeInfo().GetElementCount(); + if (w_data_length != w_ceil.size()) + w_ceil.resize(w_data_length, 0.f); + if (sessionSdp) + for (size_t i = 0; i < w_ceil.size(); ++i) + w_ceil[i] += ceil(exp(w_data[i] * Seq.FactorDpSdp) * x_mask[i] * Seq.LengthScale); + else + for (size_t i = 0; i < w_ceil.size(); ++i) + w_ceil[i] = ceil(exp(w_data[i]) * x_mask[i] * Seq.LengthScale); + } + if(enable_dp) + { + if (Seq.Durations.size() == TextSeq.size()) + for (size_t i = 0; i < w_ceil.size(); ++i) + w_ceil[i] = float(Seq.Durations[i]); + else if (AddBlank && Seq.Durations.size() == TextSeq.size() / 2ull) + for (size_t i = 0; i < Seq.Durations.size(); ++i) + w_ceil[1 + i * 2] = float(Seq.Durations[i]); + } + CustomDurationCallback(w_ceil); + const auto maskSize = x_mask.size(); + float y_length_f = 0.0; + int64_t y_length; + for (size_t i = 0; i < w_ceil.size(); ++i) + y_length_f += w_ceil[i]; + if (y_length_f < 1.0f) + y_length = 1; + else + y_length = (int64_t)y_length_f; + + auto attn = generatePath(w_ceil.data(), y_length, maskSize); + std::vector logVec(192, std::vector(y_length, 0.0f)); + std::vector mpVec(192, std::vector(y_length, 0.0f)); + std::vector nlogs_pData(192 * y_length); + for (size_t i = 0; i < static_cast(y_length); ++i) + { + for (size_t j = 0; j < 192; ++j) + { + for (size_t k = 0; k < maskSize; k++) + { + if (attn[i][k]) + { + mpVec[j][i] += m_p[j * maskSize + k]; + logVec[j][i] += logs_p[j * maskSize + k]; + } + } + nlogs_pData[j * y_length + i] = mpVec[j][i] + FloatRandFn(gen) * exp(logVec[j][i]) * Seq.NoiseScale; + } + } + std::vector y_mask(y_length, 1.0f); + const int64_t zshape[3] = { 1,192,y_length }; + const int64_t yshape[3] = { 1,1,y_length }; + + std::vector FlowDecInputs, FlowDecOutputs; + + FlowDecInputs.push_back(Ort::Value::CreateTensor( + *memory_info, nlogs_pData.data(), 192 * y_length, zshape, 3)); + FlowDecInputs.push_back(Ort::Value::CreateTensor( + *memory_info, y_mask.data(), y_length, yshape, 3)); + if (sessionEmb) + FlowDecInputs.push_back(Ort::Value::CreateTensor( + *memory_info, GEmbidding.data(), GEmbidding.size(), GOutShape.data(), 3)); + + try + { + FlowDecOutputs = sessionFlow->Run(Ort::RunOptions{ nullptr }, + FlowInputNames.data(), + FlowDecInputs.data(), + FlowDecInputs.size(), + FlowOutputNames.data(), + FlowOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: dec & flow\n") + e.what()).c_str()); + } + FlowDecInputs[0] = std::move(FlowDecOutputs[0]); + if (sessionEmb) + FlowDecInputs[1] = std::move(FlowDecInputs[2]); + FlowDecInputs.pop_back(); + try + { + + FlowDecOutputs = sessionDec->Run(Ort::RunOptions{ nullptr }, + 
DecInputNames.data(), + FlowDecInputs.data(), + FlowDecInputs.size(), + DecOutputNames.data(), + DecOutputNames.size()); + } + catch (Ort::Exception& e) + { + throw std::exception((std::string("Locate: dec & flow\n") + e.what()).c_str()); + } + const auto shapeOut = FlowDecOutputs[0].GetTensorTypeAndShapeInfo().GetShape(); + const auto outData = FlowDecOutputs[0].GetTensorData(); + for (int bbb = 0; bbb < shapeOut[2]; bbb++) + _Audio[0].emplace_back(static_cast(outData[bbb] * 32768.0f)); + } + if (!_Audio[0].empty()) + { + _Audio[0].insert(_Audio[0].end(), size_t(_samplingRate), 0); + PCM.emplace_back(std::move(_Audio[0])); + } + _callback(proc++, _Input.size()); + logger.log("[Inference] Vits Inference Fin"); + return PCM; +} + +MoeVoiceStudioCoreEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp index 18f22f6..658b0d9 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp @@ -189,6 +189,148 @@ VitsSvc::VitsSvc(const MJson& _Config, const ProgressCallback& _ProgressCallback } } +VitsSvc::VitsSvc(const std::map& _PathDict, const MJson& _Config, const ProgressCallback& _ProgressCallback, + ExecutionProviders ExecutionProvider_, + unsigned DeviceID_, unsigned ThreadCount_) : + SingingVoiceConversion(ExecutionProvider_, DeviceID_, ThreadCount_) +{ + MoeVSClassName(L"MoeVoiceStudioVitsSingingVoiceConversion"); + + //Check SamplingRate + if (_Config["Rate"].IsNull()) + throw std::exception("[Error] Missing field \"Rate\" (SamplingRate)"); + if (_Config["Rate"].IsInt() || _Config["Rate"].IsInt64()) + _samplingRate = _Config["Rate"].GetInt(); + else + throw std::exception("[Error] Field \"Rate\" (SamplingRate) Must Be Int/Int64"); + + logger.log(L"[Info] Current Sampling Rate is" + std::to_wstring(_samplingRate)); + + if (!_Config["SoVits3"].IsNull() && _Config["SoVits3"].GetBool()) + VitsSvcVersion = L"SoVits3.0"; + else if (!_Config["SoVits2"].IsNull() && _Config["SoVits2"].GetBool()) + VitsSvcVersion = L"SoVits2.0"; + else if (!_Config["SoVits2.0"].IsNull() && _Config["SoVits2.0"].GetBool()) + VitsSvcVersion = L"SoVits2.0"; + else if (!_Config["SoVits3.0"].IsNull() && _Config["SoVits3.0"].GetBool()) + VitsSvcVersion = L"SoVits3.0"; + else if (_Config["Type"].GetString() == std::string("RVC")) + VitsSvcVersion = L"RVC"; + if (!_Config["SoVits4.0V2"].IsNull() && _Config["SoVits4.0V2"].GetBool()) + VitsSvcVersion = L"SoVits4.0-DDSP"; + +#ifdef MOEVSDMLPROVIDER + if (ExecutionProvider_ == ExecutionProviders::DML && VitsSvcVersion == L"SoVits4.0-DDSP") + throw std::exception("[Error] DirectXMl Not Support SoVits4.0V2, Please Use Cuda Or Cpu"); +#endif + + if (!(_Config["Hop"].IsInt() || _Config["Hop"].IsInt64())) + throw std::exception("[Error] Hop Must Exist And Must Be Int"); + HopSize = _Config["Hop"].GetInt(); + + if (!(_Config["HiddenSize"].IsInt() || _Config["HiddenSize"].IsInt64())) + logger.log(L"[Warn] Missing Field \"HiddenSize\", Use Default Value (256)"); + else + HiddenUnitKDims = _Config["HiddenSize"].GetInt(); + + if (!_Config["CharaMix"].IsBool()) + logger.log(L"[Warn] Missing Field \"CharaMix\", Use Default Value (False)"); + else + EnableCharaMix = _Config["CharaMix"].GetBool(); + + if (_Config["Cluster"].IsString()) + { + const auto clus = to_wide_string(_Config["Cluster"].GetString()); + if (!(_Config["KMeansLength"].IsInt() || _Config["KMeansLength"].IsInt64())) + 
logger.log(L"[Warn] Missing Field \"KMeansLength\", Use Default Value (10000)"); + else + ClusterCenterSize = _Config["KMeansLength"].GetInt(); + try + { + Cluster = MoeVoiceStudioCluster::GetMoeVSCluster(clus, _PathDict.at("Cluster"), HiddenUnitKDims, ClusterCenterSize); + EnableCluster = true; + } + catch (std::exception& e) + { + logger.error(e.what()); + EnableCluster = false; + } + } + + if (HopSize < 1) + throw std::exception("[Error] Hop Must > 0"); + + if (_Config["Volume"].IsBool()) + EnableVolume = _Config["Volume"].GetBool(); + else + logger.log(L"[Warn] Missing Field \"Volume\", Use Default Value (False)"); + + if (_Config["Characters"].IsArray()) + SpeakerCount = int64_t(_Config["Characters"].Size()); + + _callback = _ProgressCallback; + + //LoadModels + try + { + logger.log(L"[Info] loading VitsSvcModel Models"); + hubert = new Ort::Session(*env, _PathDict.at("Hubert").c_str(), *session_options); + if (VitsSvcVersion == L"RVC") + VitsSvcModel = new Ort::Session(*env, _PathDict.at("RVC").c_str(), *session_options); + else + VitsSvcModel = new Ort::Session(*env, _PathDict.at("SoVits").c_str(), *session_options); + logger.log(L"[Info] VitsSvcModel Models loaded"); + } + catch (Ort::Exception& _exception) + { + Destory(); + throw std::exception(_exception.what()); + } + + if (VitsSvcModel->GetInputCount() == 4 && VitsSvcVersion != L"SoVits3.0") + VitsSvcVersion = L"SoVits2.0"; + + if (_Config["TensorExtractor"].IsString()) + VitsSvcVersion = to_wide_string(_Config["TensorExtractor"].GetString()); + + if (_Config["ShallowDiffusion"].IsString()) + { + const std::string ShallowDiffusionConf = to_byte_string(GetCurrentFolder()) + "/Models/" + _Config["ShallowDiffusion"].GetString() + ".json"; + try + { + shallow_diffusion = new DiffusionSvc( + _PathDict, + to_byte_string(_PathDict.at("ShallowDiffusionConfig")).c_str(), + [](size_t, size_t) {}, + ExecutionProvider_, + DeviceID_, + ThreadCount_ + ); + stft_operator = new Ort::Session(*env, _PathDict.at("MelOperators").c_str(), *session_options); + } + catch (std::exception& e) + { + delete shallow_diffusion; + shallow_diffusion = nullptr; + delete stft_operator; + stft_operator = nullptr; + logger.error(e.what()); + } + } + + MoeVSTensorPreprocess::MoeVoiceStudioTensorExtractor::Others _others_param; + _others_param.Memory = *memory_info; + try + { + _TensorExtractor = GetTensorExtractor(VitsSvcVersion, 48000, _samplingRate, HopSize, EnableCharaMix, EnableVolume, HiddenUnitKDims, SpeakerCount, _others_param); + } + catch (std::exception& e) + { + Destory(); + throw std::exception(e.what()); + } +} + //已弃用(旧MoeSS的推理函数) #ifdef MOESSDFN std::vector VitsSvc::InferBatch() const diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.cpp index 0302b93..085c499 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.cpp @@ -72,6 +72,13 @@ namespace MoeVSModuleManager #endif MoeVSRegisterSampler(L"Pndm", PndmSampler); MoeVSRegisterSampler(L"DDim", DDimSampler); + const auto BasicCleanerDir = GetCurrentFolder() + L"/G2P/BasicCleaner.dll"; + if (_waccess(BasicCleanerDir.c_str(), 0) != -1) + { + const auto Cleaner = MoeVSG2P::GetDefCleaner(); + Cleaner->loadG2p(BasicCleanerDir); + Cleaner->GetCleaner().LoadDict(GetCurrentFolder() + L"/G2P"); + } } MoeVoiceStudioCore::SingingVoiceConversion* GetCurSvcModel() diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.hpp index 
38ae42d..851c8a6 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Modules.hpp @@ -22,6 +22,7 @@ #pragma once #include "Models/header/VitsSvc.hpp" #include "Models/header/DiffSvc.hpp" +#include "Models/header/Vits.hpp" namespace MoeVSModuleManager { diff --git a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp index c8ca1fc..4fffee9 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp @@ -1,4 +1,5 @@ -#ifdef MOEVSONNX +#include +#ifndef MOEVSONNX #include #include #include @@ -39,391 +40,34 @@ std::vector& operator-=(std::vector& left, const std::vector& right) } #endif -namespace RtInferenceSpace +int main() { - class MRecorder - { - public: - MRecorder() = default; - ~MRecorder() - { - if (!hWaveIn) - return; - Stop(); - waveInClose(hWaveIn); - } - void initRecorder(DWORD SamplingRate = 44100) - { - waveform.nSamplesPerSec = SamplingRate; - waveform.wBitsPerSample = 16; - waveform.nChannels = 1; - waveform.cbSize = 0; - waveform.wFormatTag = WAVE_FORMAT_PCM; - waveform.nBlockAlign = (waveform.wBitsPerSample * waveform.nChannels) / 8; - waveform.nAvgBytesPerSec = waveform.nBlockAlign * waveform.nSamplesPerSec; - SamplingRateSrc = SamplingRate; - WaitEvent = CreateEvent(nullptr, 0, 0, nullptr); - waveInOpen(&hWaveIn, WAVE_MAPPER, &waveform, (DWORD_PTR)WaitEvent, 0L, CALLBACK_EVENT); - } - - void setStreamBufferSize(double time) - { - Stop(); - StreamSize = size_t(time * SamplingRateSrc); - timems = DWORD(time * 1000); - timems -= 50; - if (timems < 50) timems = 50; - pcmVector = std::vector(StreamSize * 2); - whdri.lpData = (LPSTR)pcmVector.data(); - whdri.dwBufferLength = DWORD(StreamSize * 2); - whdri.dwBytesRecorded = 0; - whdri.dwUser = 0; - whdri.dwFlags = 0; - whdri.dwLoops = 1; - } - - [[nodiscard]] size_t GetFrameSize() const - { - return StreamSize; - } - - void Start() - { - if (isBegin) - return; - isBegin = true; - std::thread RecoderThread([&]() - { - while(isBegin) - { - whdri.lpData = (LPSTR)pcmVector.data(); - whdri.dwBufferLength = DWORD(StreamSize * 2); - whdri.dwBytesRecorded = 0; - whdri.dwUser = 0; - whdri.dwFlags = 0; - whdri.dwLoops = 1; - waveInPrepareHeader(hWaveIn, &whdri, sizeof(WAVEHDR)); - waveInAddBuffer(hWaveIn, &whdri, sizeof(WAVEHDR)); - waveInStart(hWaveIn); - Sleep(timems); - const size_t nSamples = (size_t)whdri.dwBytesRecorded / 2; - waveInReset(hWaveIn); - std::lock_guard lock(mx); - if(pcmQueue.empty() || pcmQueue.back().size() == StreamSize) - pcmQueue.emplace_back(pcmVector.data(), pcmVector.data() + nSamples); - else - { - auto& BackData = pcmQueue.back(); - if(BackData.size() + nSamples > StreamSize) - { - const auto RealSize = StreamSize - BackData.size(); - BackData.insert(BackData.end(), pcmVector.data(), pcmVector.data() + RealSize); - pcmQueue.emplace_back(pcmVector.data() + RealSize, pcmVector.data() + nSamples); - } - else - BackData.insert(BackData.end(), pcmVector.data(), pcmVector.data() + nSamples); - } - } - }); - RecoderThread.detach(); - } - void Stop() const - { - if(isBegin) - { - waveInStop(hWaveIn); - waveInReset(hWaveIn); - } - } + MoeVSModuleManager::MoeVoiceStudioCoreInitSetup(); - std::vector GetStreamData() - { - std::lock_guard lock(mx); - if (pcmQueue.empty() || pcmQueue[0].size() != StreamSize) - return {}; - auto Stream = std::move(pcmQueue[0]); - pcmQueue.pop_front(); - return 
Stream; - } - private: - DWORD SamplingRateSrc = 44100; - std::vector pcmVector; - std::deque> pcmQueue; - size_t StreamSize = 0; - DWORD timems = 0; - HWAVEIN hWaveIn = nullptr; - WAVEFORMATEX waveform{ WAVE_FORMAT_PCM,1,44100,88200,2,16,0 }; - WAVEHDR whdri{ nullptr,0,0,0,0,0,nullptr,0 }; - HANDLE WaitEvent = nullptr; - bool isBegin = false; - std::mutex mx; - }; - class MPCMPlayer + const MJson Config(to_byte_string(GetCurrentFolder() + L"/Models/HimenoSena.json").c_str()); //Change this to the model config path (relative to the exe) + const MoeVoiceStudioCore::MoeVoiceStudioModule::ProgressCallback ProCallback = [](size_t cur, size_t total) { - public: - MPCMPlayer() = default; - ~MPCMPlayer() - { - if (!hWaveOut) - return; - waveOutClose(hWaveOut); - } - void initPlayer(DWORD SamplingRate = 44100) - { - waveform.nSamplesPerSec = SamplingRate; - waveform.wBitsPerSample = 16; - waveform.nChannels = 1; - waveform.cbSize = 0; - waveform.wFormatTag = WAVE_FORMAT_PCM; - waveform.nBlockAlign = (waveform.wBitsPerSample * waveform.nChannels) / 8; - waveform.nAvgBytesPerSec = waveform.nBlockAlign * waveform.nSamplesPerSec; - WaitEvent = CreateEvent(nullptr, 0, 0, nullptr); - waveOutOpen(&hWaveOut, WAVE_MAPPER, &waveform, (DWORD_PTR)WaitEvent, 0L, CALLBACK_EVENT); - SAMP = SamplingRate; - } - void Play(std::vector& data) - { - whdri.lpData = (LPSTR)data.data(); - whdri.dwBufferLength = DWORD(data.size() * 2); - whdri.dwFlags = 0L; - whdri.dwLoops = 1L; - waveOutPrepareHeader(hWaveOut, &whdri, sizeof(WAVEHDR)); - waveOutWrite(hWaveOut, &whdri, sizeof(WAVEHDR)); - Sleep(DWORD(data.size() * 1000 / size_t(SAMP))); - } - private: - HWAVEOUT hWaveOut = nullptr; - WAVEFORMATEX waveform{ WAVE_FORMAT_PCM,1,44100,88200,2,16,0 }; - WAVEHDR whdri{ nullptr,0,0,0,0,0,nullptr,0 }; - HANDLE WaitEvent = nullptr; - DWORD SAMP = 44100; + std::cout << (double(cur) / double(total) * 100.)
<< "%\n"; }; - - MoeVSProjectSpace::MoeVSSvcParams Params; - short Threshold = 400; - MRecorder RTRecorder; - MPCMPlayer RTPlayer; - std::deque> InputBuffer, OutputBuffer, rawInputBuffer, rawOutputBuffer; - bool RTIsEnabled = false; - size_t crossfade_length = 0; - size_t extra_length = 0; - - void EndRtInference() - { - RTRecorder.Stop(); - RTIsEnabled = false; - InputBuffer.clear(); - OutputBuffer.clear(); - rawInputBuffer.clear(); - rawOutputBuffer.clear(); - } - - void RTInference() + const MoeVoiceStudioCore::TextToSpeech::DurationCallback DurCallback = [](std::vector&) { - if (RTIsEnabled) - { - EndRtInference(); - return; - } - std::wstring error; - RTIsEnabled = true; - crossfade_length = Params.CrossFadeLength; - extra_length = crossfade_length / 4; - std::thread RT_RECORD_THREAD = std::thread([&]() - { - logger.log(L"[RTInference] Recording Thread Start!"); - while (RTIsEnabled) - { - auto PCM = RTRecorder.GetStreamData(); - if(PCM.empty()) - continue; - rawInputBuffer.emplace_back(std::move(PCM)); - - if (rawInputBuffer.size() > 2) - { - std::vector pBuffer; - pBuffer.reserve(rawInputBuffer[1].size() + 4 * crossfade_length); - pBuffer.insert(pBuffer.end(), - rawInputBuffer[0].end() - int64_t(crossfade_length + extra_length), - rawInputBuffer[0].end()); - pBuffer.insert(pBuffer.end(), rawInputBuffer[1].begin(), rawInputBuffer[1].end()); - pBuffer.insert(pBuffer.end(), - rawInputBuffer[2].begin(), - rawInputBuffer[2].begin() + int64_t(crossfade_length + extra_length) + 1000); - InputBuffer.emplace_back(std::move(pBuffer)); - rawInputBuffer.pop_front(); - } - if (rawInputBuffer.size() > 100) - rawInputBuffer.pop_front(); - } - logger.log(L"[RTInference] Recording Thread End!"); - }); - - std::thread RT_INFERENCE_THREAD = std::thread([&]() - { - logger.log(L"[RTInference] Inferencing Thread Start!"); - while (RTIsEnabled) - { - if (!InputBuffer.empty()) - { - try - { - if (MoeVSModuleManager::GetCurSvcModel()) - { - bool zeroVector = true; - for (const auto& i16data : InputBuffer[0]) - { - if (i16data > Threshold * 10) - { - zeroVector = false; - break; - } - } - if (zeroVector) - rawOutputBuffer.emplace_back(std::vector(InputBuffer[0].size(), 0)); - else - rawOutputBuffer.emplace_back(MoeVSModuleManager::GetCurSvcModel()->InferPCMData(InputBuffer[0], (long)MoeVSModuleManager::SamplingRate, Params)); - } - else - rawOutputBuffer.emplace_back(std::move(InputBuffer[0])); - InputBuffer.pop_front(); - } - catch (std::exception& e) - { - logger.error(e.what()); - EndRtInference(); - } - } - if (InputBuffer.size() > 100) - InputBuffer.pop_front(); - } - logger.log(L"[RTInference] Inferencing Thread End!"); - }); - - std::thread RT_OUTPUT_THREAD = std::thread([&]() - { - logger.log(L"[RTInference] OutPut Thread Start!"); - while (RTIsEnabled) - { - if (rawOutputBuffer.size() > 2) - { - std::vector pBuffer( - rawOutputBuffer[1].begin() + (int64_t)(crossfade_length + extra_length), - rawOutputBuffer[1].end() - ); - pBuffer.resize(RTRecorder.GetFrameSize()); - - const auto dataBufr = pBuffer.size() - crossfade_length; - const auto crossBufl = crossfade_length + extra_length + RTRecorder.GetFrameSize(); - const auto crossBufr = extra_length; - - for (size_t i = 0; i < crossfade_length; ++i) - { - const auto crosf1 = (double(i) / double(crossfade_length)); - const auto crosf2 = (1. 
- (double(i) / double(crossfade_length))); - - pBuffer[i] = (int16_t)( - double(pBuffer[i]) * crosf1 + - (double)rawOutputBuffer[0][i + crossBufl] * crosf2 - ); - - pBuffer[i + dataBufr] = (int16_t)( - double(pBuffer[i + dataBufr]) * crosf2 + - (double)rawOutputBuffer[2][i + crossBufr] * crosf1 - ); - } - OutputBuffer.emplace_back(std::move(pBuffer)); - rawOutputBuffer.pop_front(); - } - if (!OutputBuffer.empty()) - { - RTPlayer.Play(OutputBuffer.front()); - OutputBuffer.pop_front(); - } - } - logger.log(L"[RTInference] OutPut Thread End!"); - }); - RTRecorder.Start(); - logger.log(L"[RTInference] Start RTInference!"); - RT_RECORD_THREAD.detach(); - RT_INFERENCE_THREAD.detach(); - RT_OUTPUT_THREAD.detach(); - } -} - -int main() -{ - MoeVSModuleManager::MoeVoiceStudioCoreInitSetup(); - + return; + }; try { - MoeVSModuleManager::LoadSvcModel( - MJson(to_byte_string(GetCurrentFolder() + L"/Models/ShirohaRVC.json").c_str()), - [](size_t cur, size_t total) - { - //std::cout << (double(cur) / double(total) * 100.) << "%\n"; - }, - 0, - 8, - 0 - ); + const MoeVoiceStudioCore::TextToSpeech* VitsTest = dynamic_cast(new MoeVoiceStudioCore::Vits(Config, ProCallback, DurCallback, MoeVoiceStudioCore::MoeVoiceStudioModule::ExecutionProviders::CPU, 8, 0)); + //Replace this with a JSON string or a JSON file path + const auto Voice = VitsTest->Inference(MJson("S:\\VSGIT\\MoeVoiceStudioSvc - Core - Cmd\\x64\\Debug\\test.json")); + //Output + InferTools::Wav::WritePCMData(VitsTest->GetSamplingRate(), 1, Voice[0], L"Test1.wav"); } catch (std::exception& e) { std::cout << e.what(); - return 0; } - RtInferenceSpace::Params.Sampler = L"DDim"; - RtInferenceSpace::Params.Step = 100; - RtInferenceSpace::Params.Pndm = 10; - RtInferenceSpace::Params.F0Method = L"RMVPE"; - RtInferenceSpace::Params.CrossFadeLength = 8000; - RtInferenceSpace::Params.Keys = 8; - - RtInferenceSpace::RTRecorder.initRecorder((DWORD)MoeVSModuleManager::SamplingRate); - RtInferenceSpace::RTRecorder.setStreamBufferSize(0.5); - RtInferenceSpace::RTRecorder.Start(); - RtInferenceSpace::RTPlayer.initPlayer((DWORD)MoeVSModuleManager::SamplingRate); - - RtInferenceSpace::RTInference(); - - while (true); - while (true) - { - auto PCM = RtInferenceSpace::RTRecorder.GetStreamData(); - if (!PCM.empty()) - RtInferenceSpace::RTPlayer.Play(PCM); - } + return 0; } #endif -#include "LibDLVoiceCodec/value.h" -class Class0 : libdlvcodec::Module -{ -public: - Class0(Module* _Parent, const std::string& _Name) : Module(_Parent, _Name) {} -}; - -class ClassA : libdlvcodec::Module -{ -public: - ClassA(Module* _Parent, const std::string& _Name) : Module(_Parent, _Name) {} -private: - RegLayer(Class0, attrC0); -}; - -class ClassB : libdlvcodec::Module -{ -public: - ClassB() : Module(nullptr, "ClassB") {} -private: - RegLayer(ClassA, attrCA); -}; - -int main() -{ - ClassB a; - printf("%d", &a); -} \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj index 14fe639..b3efd8c 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj +++ b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj @@ -112,6 +112,7 @@ true stdcpp17 4996 + MultiThreadedDebug Console @@ -130,6 +131,7 @@ true stdcpp17 4996 + MultiThreaded Console @@ -156,16 +158,19 @@ + + + @@ -179,6 +184,8 @@ + + @@ -202,17 +209,20 @@ + + + @@ -226,6 +236,9 @@ + + + diff --git a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters b/MoeVoiceStudioSvc -
Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters index 9a65b7f..46d0a2c 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters +++ b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters @@ -97,6 +97,24 @@ {9cdc3e22-5d4a-4a08-a1cd-57dd5c657e2c} + + {c016ee4a-744e-431d-8bad-ac18310fb098} + + + {b5a33287-5ba2-45ae-9135-5a8e3da3d93f} + + + {3c32c0a2-b610-47d9-ab2b-ab37e9741903} + + + {2971f642-2c62-4b10-ad90-8cb65bcc99a0} + + + {02644a09-1bb9-48df-9364-1ea5177de68b} + + + {e1a1babf-b512-4926-9870-160cf546636a} + @@ -147,18 +165,6 @@ 源文件\Lib\Json - - 源文件\Modules\Models - - - 源文件\Modules\Models - - - 源文件\Modules\Models - - - 源文件\Modules\Models - 源文件\Modules\Models @@ -222,6 +228,33 @@ 源文件\LibDLVoiceCodec + + 源文件\LibDLVoiceCodec + + + 源文件\Modules\InferTools\Cluster + + + 源文件\Modules\InferTools\G2P + + + 源文件\Modules\Models\SVC + + + 源文件\Modules\Models\SVC + + + 源文件\Modules\Models\SVC + + + 源文件\Modules\Models + + + 源文件\Modules\Models\TTS + + + 源文件\Modules\Models\TTS + @@ -281,15 +314,6 @@ 头文件\Modules\Models - - 头文件\Modules\Models - - - 头文件\Modules\Models - - - 头文件\Modules\Models - 头文件\Modules\Models @@ -356,5 +380,32 @@ 头文件\LibDLVoiceCodec + + 头文件\LibDLVoiceCodec + + + 头文件\Modules\InferTools\Cluster + + + 头文件\Modules\InferTools\G2P + + + 头文件\Modules\Models\TTS + + + 头文件\Modules\Models\TTS + + + 头文件\Modules\Models\TTS + + + 头文件\Modules\Models\SVC + + + 头文件\Modules\Models\SVC + + + 头文件\Modules\Models\SVC + \ No newline at end of file
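
Note on the new VitsSvc path-dictionary constructor added in VitsSvc.cpp: the sample main() in this patch only exercises the TTS (Vits) path, so the sketch below shows how that new overload might be driven from caller code. This is a minimal sketch under assumptions, not part of the patch: it assumes the path map's value type is std::wstring (matching the _PathDict.at("Hubert") / "SoVits" / "RVC" / "Cluster" lookups in the constructor), that VitsSvc lives in the MoeVoiceStudioCore namespace like the other model classes, and that the model file names used here are placeholders.

    #include <map>
    #include <string>
    #include <iostream>
    // Project headers are assumed to provide MJson, GetCurrentFolder(),
    // MoeVoiceStudioCore::VitsSvc and the ExecutionProviders enum.

    static void LoadSoVitsFromPaths(const MJson& Config)
    {
        // Keys mirror the lookups in the new constructor; the file names are hypothetical.
        std::map<std::string, std::wstring> PathDict{
            { "Hubert",  GetCurrentFolder() + L"/Models/hubert.onnx" },
            { "SoVits",  GetCurrentFolder() + L"/Models/sovits.onnx" },  // use key "RVC" instead when Type == "RVC"
            { "Cluster", GetCurrentFolder() + L"/Models/kmeans.npy" }    // only read when "Cluster" is set in the config
        };
        try
        {
            // Parameter order follows the declaration in the patch:
            // provider, DeviceID_, ThreadCount_.
            MoeVoiceStudioCore::VitsSvc Model(
                PathDict, Config,
                [](size_t cur, size_t total) { std::cout << double(cur) / double(total) * 100. << "%\n"; },
                MoeVoiceStudioCore::MoeVoiceStudioModule::ExecutionProviders::CPU,
                0,   // DeviceID_
                8);  // ThreadCount_
            // A real caller would keep Model alive and run inference on it;
            // it is destroyed here at end of scope.
        }
        catch (std::exception& e)
        {
            std::cout << e.what();
        }
    }

The constructor itself still validates "Rate", "Hop" and the optional fields, so the MJson passed in is the same per-model config that the sample main() loads from the /Models folder; only the ONNX file locations move from the config into the path dictionary.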