From 4aadbbdd916eab13dfe5ddd208b37ba3f2d7167c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=99=BD=E5=8F=B6=20=E8=97=A4=E5=8E=9F?= <1751842477@qq.com> Date: Mon, 16 Oct 2023 12:19:14 +0800 Subject: [PATCH] Create LibDlVoiceCodec --- .../LibDLVoiceCodec/base.cpp | 1 + .../LibDLVoiceCodec/base.h | 221 +++++++++ .../LibDLVoiceCodec/value.cpp | 125 +++++ .../LibDLVoiceCodec/value.h | 77 +++ .../MoeVoiceStudioTensorExtractor.cpp | 36 ++ .../MoeVoiceStudioTensorExtractor.hpp | 3 + .../Modules/Models/header/MoeVSProject.hpp | 32 +- .../Modules/Models/src/DiffSvc.cpp | 12 +- .../Modules/Models/src/MoeVSProject.cpp | 13 +- .../Modules/Models/src/VitsSvc.cpp | 11 +- .../MoeVoiceStudioSvc - Core - Cmd.cpp | 455 +++++++++++++++--- .../MoeVoiceStudioSvc - Core - Cmd.vcxproj | 14 +- ...oiceStudioSvc - Core - Cmd.vcxproj.filters | 18 + 13 files changed, 911 insertions(+), 107 deletions(-) create mode 100644 MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.cpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h create mode 100644 MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp create mode 100644 MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.cpp b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.cpp new file mode 100644 index 0000000..d271b05 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.cpp @@ -0,0 +1 @@ +#include "base.h" \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h new file mode 100644 index 0000000..0fd95b4 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/base.h @@ -0,0 +1,221 @@ +#pragma once +#include +#include +#include +#include +#define LibDLVoiceCodecBegin namespace libdlvcodec { +#define LibDLVoiceCodecEnd } +#define LIBDVCND [[nodiscard]] + +#define LibDLVoiceCodecThrow(message) throw std::exception((std::string("[In \"") + __FILE__ + "\" Line " + std::to_string(__LINE__) + "] " + (message)).c_str()) + +LibDLVoiceCodecBegin + +using int8 = int8_t; +using int16 = int16_t; +using int32 = int32_t; +using int64 = int64_t; +using float32 = float; +using float64 = double; +using byte = unsigned char; +using lpvoid = void*; +using uint8 = uint8_t; +using uint16 = uint16_t; +using uint32 = uint32_t; +using uint64 = uint64_t; + +template +class BaseAllocator +{ +public: + BaseAllocator() = default; + virtual ~BaseAllocator() = default; + virtual T* allocate(size_t size) + { + return new T[size]; + } + virtual void destroy(const T* ptr) + { + delete ptr; + } +}; + +template> +class MResource +{ +public: + using reference = Type&; + using rvalue = Type&&; + using ptr_t = Type*; + + MResource() = default; + + MResource(size_t _Count) + { + data_ = allocator_.allocate(_Count * 2); + size_ = _Count; + } + + MResource(size_t _Count, Type _Value) + { + data_ = allocator_.allocate(_Count * 2); + size_ = _Count; + auto _ptr = data_; + const auto _end = data_ + size_; + while (_ptr != _end) + { + *_ptr = _Value; + ++_ptr; + } + } + + MResource(ptr_t _Ptr, size_t _Size) + { + data_ = _Ptr; + size_ = _Size; + } + + MResource(const MResource& _Left) + { + size_ = _Left.size_; + data_ = allocator_.allocate(_Left.size_); + auto _ptr = data_, _ptrl = _Left.data_; + const auto _end = data_ + size_; + while (_ptr != _end) + { + *_ptr = *_ptrl; + ++_ptr; + ++_ptrl; + } + } + + MResource(MResource&& _Right) noexcept + { + size_ = _Right.size_; + data_ = _Right.data_; + _Right.size_ = 0ull; + _Right.data_ = nullptr; + } + + ~MResource() + { + if (data_) + { + allocator_.destroy(data_); + data_ = nullptr; + } + } + + LIBDVCND ptr_t data() const + { + return data_; + } + + LIBDVCND ptr_t begin() const + { + return data_; + } + + LIBDVCND ptr_t end() const + { + return data_ + size_; + } + + ptr_t release() + { + const ptr_t _pdata = data_; + data_ = nullptr; + return _pdata; + } + + reference operator[](size_t _Index) const + { + assert(_Index < size_); + return *(data_ + _Index); + } + + MResource& operator=(const MResource& _Left) + { + if (&_Left == this) + return *this; + size_ = _Left.size_; + data_ = allocator_.allocate(_Left.size_); + auto _ptr = data_, _ptrl = _Left.data_; + const auto _end = data_ + size_; + while (_ptr != _end) + { + *_ptr = *_ptrl; + ++_ptr; + ++_ptrl; + } + return *this; + } + + MResource& operator=(MResource&& _Right) noexcept + { + size_ = _Right.size_; + data_ = _Right.data_; + _Right.size_ = 0ull; + _Right.data_ = nullptr; + return *this; + } +protected: + ptr_t data_ = nullptr; + size_t size_ = 0ull; + Allocator allocator_; +}; + +template +std::ostream& operator<<(std::ostream& _Stream, const MResource& _Data) +{ + _Stream << '['; + for (const auto& i : _Data) + _Stream << i << ", "; + _Stream << "]\n"; + return _Stream; +} + +class FileWrapper +{ +public: + FileWrapper() = default; + ~FileWrapper() + { + if (file_) + fclose(file_); + file_ = nullptr; + } + FileWrapper(const FileWrapper& _Left) = delete; + FileWrapper& operator=(const FileWrapper& _Left) = delete; + FileWrapper(FileWrapper&& _Right) noexcept + { + file_ = _Right.file_; + _Right.file_ = nullptr; + } + FileWrapper& operator=(FileWrapper&& _Right) noexcept + { + file_ = _Right.file_; + _Right.file_ = nullptr; + return *this; + } + void open(const std::wstring& _Path, const std::wstring& _Mode) + { +#ifdef _WIN32 + _wfopen_s(&file_, _Path.c_str(), _Mode.c_str()); +#else + file_ = _wfopen(_Path.c_str(), _Mode.c_str()); +#endif + } + operator FILE* () const + { + return file_; + } + LIBDVCND bool enabled() const + { + return file_; + } +private: + FILE* file_ = nullptr; +}; + +LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp new file mode 100644 index 0000000..5449b70 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.cpp @@ -0,0 +1,125 @@ +#include "value.h" + +LibDLVoiceCodecBegin + +Value& Value::load(const std::wstring& _Path) +{ + FileWrapper file; + file.open(_Path, L"rb"); + if (!file.enabled()) + LibDLVoiceCodecThrow("Failed to open file!"); + char Header[4]; + fread(Header, 1, 4, file); + if (Header[0] != 'L' || Header[1] != 'S' || Header[2] != 'B' || Header[3] != 'V') + LibDLVoiceCodecThrow("File does not recognize!"); + size_t MemberNameSize = 0; + WeightDict weight; + char MemberName[1025]; + /* + MemberNameSize (size_t) + MemberName (char[MemberNameSize]) + _WeightData.Size (size_t) + _WeightData.ShapeSize (size_t) + _WeightData.Shape (int64_t[_WeightData.ShapeSize]) + _WeightData.Data (byte[_WeightData.Size]) + */ + while (fread(&MemberNameSize, 1, sizeof(size_t), file) == sizeof(size_t)) + { + if (MemberNameSize == 0) + LibDLVoiceCodecThrow("Size of attrib name must higer than 0!"); + if (MemberNameSize > 1024) + LibDLVoiceCodecThrow("Size of attrib name must lower than 1024!"); + if (fread(MemberName, 1, MemberNameSize, file) != MemberNameSize) + LibDLVoiceCodecThrow("Unexpected EOF!"); + MemberName[MemberNameSize] = 0; + std::string AttribName = MemberName; + if (weight.find(AttribName) != weight.end()) + continue; + + WeightData _WeightData; + + if (fread(&_WeightData.Size, 1, sizeof(size_t), file) != sizeof(size_t)) + LibDLVoiceCodecThrow("Unexpected EOF!"); + if (_WeightData.Size == 0) + continue; + + if (fread(&_WeightData.ShapeSize, 1, sizeof(size_t), file) != sizeof(size_t)) + LibDLVoiceCodecThrow("Unexpected EOF!"); + if (_WeightData.ShapeSize == 0) + continue; + + size_t __SIZE = sizeof(int64_t) * _WeightData.ShapeSize; + _WeightData.Shape = std::vector(_WeightData.ShapeSize); + if (fread(_WeightData.Shape.data(), 1, __SIZE, file) != __SIZE) + LibDLVoiceCodecThrow("Unexpected EOF!"); + + __SIZE = _WeightData.Size; + _WeightData.Data = MResource(_WeightData.Size); + if (fread(_WeightData.Data.data(), 1, __SIZE, file) != __SIZE) + LibDLVoiceCodecThrow("Unexpected EOF!"); + + weight[AttribName] = std::move(_WeightData); + } + loadData(weight); + return *this; +} + +Value& Value::save(const std::wstring& _Path) +{ + FileWrapper file; + file.open(_Path, L"rb"); + if (!file.enabled()) + LibDLVoiceCodecThrow("Failed to open file!"); + constexpr char Header[5] = "LSBV"; + fwrite(Header, 1, 4, file); + saveData(file); + return *this; +} + +void Value::loadData(WeightDict& _Dict) +{ + LibDLVoiceCodecThrow("Not implemented error!"); +} + +void Value::saveData(FileWrapper& _File) +{ + LibDLVoiceCodecThrow("Not implemented error!"); +} + +void Tensor::loadData(WeightDict& _Dict) +{ + const auto res = _Dict.find(RegName_); + if (res != _Dict.end()) + { + Shape_ = res->second.Shape; + size_t TotalSize = 1; + for (const auto i : Shape_) + TotalSize *= i; + if (TotalSize * sizeof(DType) != res->second.Size) + LibDLVoiceCodecThrow("Expected size does not match actual size!"); + Data_ = std::move(res->second.Data); + } +} + +void Tensor::saveData(FileWrapper& _File) +{ + +} + +void Module::loadData(WeightDict& _Dict) +{ + for(const auto& it : Layers_) + { + it.second->loadData(_Dict); + } +} + +void Module::saveData(FileWrapper& _File) +{ + for (const auto& it : Layers_) + { + it.second->saveData(_File); + } +} + +LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h new file mode 100644 index 0000000..a2a9775 --- /dev/null +++ b/MoeVoiceStudioSvc - Core - Cmd/LibDLVoiceCodec/value.h @@ -0,0 +1,77 @@ +#pragma once +#include +#include +#include "base.h" + +#define RegLayer(ModuleName, MemberName, ...) ModuleName MemberName{this, #MemberName, __VA_ARGS__} + +LibDLVoiceCodecBegin + +struct WeightData +{ + size_t Size = 0; + size_t ShapeSize = 0; + std::vector Shape; + MResource Data; +}; + +class Value +{ +public: + Value() = default; + virtual ~Value() = default; + using WeightDict = std::map; + +protected: + std::string RegName_; + +public: + Value& load(const std::wstring& _Path); + Value& save(const std::wstring& _Path); + virtual void loadData(WeightDict& _Dict); + virtual void saveData(FileWrapper& _File); + +}; + +class Module : public Value +{ +public: + Module(Module* _Parent, const std::string& _Name) + { + if (_Parent != nullptr) + { + RegName_ = _Parent->RegName_ + "." + _Name; + _Parent->Layers_[RegName_] = this; + } + else + RegName_ = _Name; + } + +private: + std::map Layers_; + +public: + void loadData(WeightDict& _Dict) override; + void saveData(FileWrapper& _File) override; +}; + +class Tensor : Value +{ +public: + using DType = float; + Tensor(const std::string& _Name = "Tensor") + { + RegName_ = _Name; + TensorLayer_ = false; + } + +protected: + std::vector Shape_; + MResource Data_; + bool TensorLayer_ = false; +public: + void loadData(WeightDict& _Dict) override; + void saveData(FileWrapper& _File) override; +}; + +LibDLVoiceCodecEnd \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.cpp index 067e4ce..5dd63bb 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.cpp @@ -66,6 +66,42 @@ std::vector MoeVoiceStudioTensorExtractor::GetCurrectSpkMixData(const std return mixData; } +std::vector MoeVoiceStudioTensorExtractor::GetSpkMixData(const std::vector>& _input, size_t dst_len, size_t spk_count) +{ + std::vector mixData; + mixData.reserve(spk_count * dst_len); + if (_input.empty()) + { + std::vector LenData(spk_count, 0.0); + LenData[0] = 1.0; + for (size_t i = 0; i < dst_len; ++i) + mixData.insert(mixData.end(), LenData.begin(), LenData.end()); + } + else + { + std::vector> _spkMap; + for (size_t i = 0; i < _input.size() && i < spk_count; ++i) + _spkMap.emplace_back(InferTools::InterpFunc(_input[i], long(_input[i].size()), long(dst_len))); + LinearCombination(_spkMap, 0); + const auto curnspk = _input.size(); + if (curnspk < spk_count) + { + std::vector LenData(spk_count - curnspk, 0.0); + for (size_t i = 0; i < dst_len; ++i) + { + for (size_t j = 0; j < curnspk; ++j) + mixData.emplace_back(_spkMap[j][i]); + mixData.insert(mixData.end(), LenData.begin(), LenData.end()); + } + } + else + for (size_t i = 0; i < dst_len; ++i) + for (size_t j = 0; j < spk_count; ++j) + mixData.emplace_back(_spkMap[j][i]); + } + return mixData; +} + std::vector MoeVoiceStudioTensorExtractor::GetNSFF0(const std::vector& F0) const { const auto f0Len = F0.size(); diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.hpp index 1a90b0f..5a00180 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/TensorExtractor/MoeVoiceStudioTensorExtractor.hpp @@ -165,6 +165,9 @@ class MoeVoiceStudioTensorExtractor //获取正确的角色混合数据 [[nodiscard]] std::vector GetCurrectSpkMixData(const std::vector>& _input, size_t dst_len, int64_t curspk) const; + + //获取正确的角色混合数据 + [[nodiscard]] static std::vector GetSpkMixData(const std::vector>& _input, size_t dst_len, size_t spk_count); protected: uint64_t _NSpeaker = 1; uint64_t _SrcSamplingRate = 32000; diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp index b85d271..2d169b5 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/MoeVSProject.hpp @@ -25,6 +25,32 @@ #include "../../StringPreprocess.hpp" namespace MoeVSProjectSpace { + class FileWrapper + { + public: + FileWrapper() = delete; + FileWrapper(const wchar_t* _path, const wchar_t* _mode) + { + _wfopen_s(&file_, _path, _mode); + } + ~FileWrapper() + { + if (file_) + fclose(file_); + file_ = nullptr; + } + operator FILE*() const + { + return file_; + } + [[nodiscard]] bool IsOpen() const + { + return file_; + } + private: + FILE* file_ = nullptr; + }; + using size_type = size_t; template @@ -106,10 +132,12 @@ namespace MoeVSProjectSpace std::wstring Sampler = L"Pndm"; //采样器 std::wstring F0Method = L"Dio"; //F0提取算法 int64_t SpeakerId = 0; - int64_t HopSize = 320; - int64_t SpkCount = 2; uint64_t SrcSamplingRate = 48000; bool UseShallowDiffusion = false; + int64_t SpkCount = 2; + //RTInfer + int64_t RTSampleSize = 44100; + int64_t CrossFadeLength = 320; }; struct ParamsOffset diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp index 88212c3..58cd4ec 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/DiffSvc.cpp @@ -728,16 +728,10 @@ std::vector DiffusionSvc::InferPCMData(const std::vector& PCMD throw std::exception((std::string("Locate: Nsf\n") + e3.what()).c_str()); } - const auto shapeOut = finaOut[0].GetTensorTypeAndShapeInfo().GetShape(); - const auto dstWavLen = (int64_t)PCMData.size(); + const auto dstWavLen = finaOut[0].GetTensorTypeAndShapeInfo().GetShape()[2]; std::vector TempVecWav(dstWavLen, 0); - if (shapeOut[2] < dstWavLen) - for (int64_t bbb = 0; bbb < shapeOut[2]; bbb++) - TempVecWav[bbb] = static_cast(Clamp(finaOut[0].GetTensorData()[bbb]) * 32766.f); - else - for (int64_t bbb = 0; bbb < dstWavLen; bbb++) - TempVecWav[bbb] = static_cast(Clamp(finaOut[0].GetTensorData()[bbb]) * 32766.f); - TempVecWav.resize(dstWavLen); + for (int64_t bbb = 0; bbb < dstWavLen; bbb++) + TempVecWav[bbb] = static_cast(Clamp(finaOut[0].GetTensorData()[bbb]) * 32766.0f); return TempVecWav; } diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/MoeVSProject.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/MoeVSProject.cpp index d5a44a2..e370f38 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/MoeVSProject.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/MoeVSProject.cpp @@ -4,13 +4,11 @@ namespace MoeVSProjectSpace { MoeVSProject::MoeVSProject(const std::wstring& _path) { - FILE* project_file = nullptr; - _wfopen_s(&project_file, _path.c_str(), L"rb"); - if (!project_file) + FileWrapper project_file(_path.c_str(), L"rb"); + if (!project_file.IsOpen()) throw std::exception("File Doesn't Exists"); fseek(project_file, 0, SEEK_SET); - if (fread(&moevs_proj_header_, 1, sizeof(Header), project_file) != sizeof(Header)) throw std::exception("Unexpected EOF"); if (!(moevs_proj_header_.ChunkSymbol[0] == 'M' && moevs_proj_header_.ChunkSymbol[1] == 'O' && moevs_proj_header_.ChunkSymbol[2] == 'E' && moevs_proj_header_.ChunkSymbol[3] == 'V' && moevs_proj_header_.ChunkSymbol[4] == 'S' && moevs_proj_header_.ChunkSymbol[5] == 'P' && moevs_proj_header_.ChunkSymbol[6] == 'R' && moevs_proj_header_.ChunkSymbol[7] == 'J')) @@ -39,7 +37,7 @@ namespace MoeVSProjectSpace throw std::exception("Unrecognized File"); - //HiddenUnit + //Audio if (_datas.Header.OrgAudioOffsetPosSize != 0) { _datas.Offset.OrgAudio = std::vector(_datas.Header.OrgAudioOffsetPosSize); _n_bytes = sizeof(size_type) * _datas.Header.OrgAudioOffsetPosSize; @@ -150,7 +148,6 @@ namespace MoeVSProjectSpace data_.emplace_back(std::move(_datas)); } - fclose(project_file); } MoeVSProject::MoeVSProject(const std::vector& _params) @@ -184,12 +181,14 @@ namespace MoeVSProjectSpace if (!i.Speaker.empty()) { for (const auto& Speaker : i.Speaker) + if (Speaker.size() > _data.Header.NSpeaker) + _data.Header.NSpeaker = Speaker.size(); + for (auto& Speaker : i.Speaker) { size_t size__ = 0; for (const auto& spkk : Speaker) size__ += spkk.size(); _data.Offset.Speaker.push_back(size__); - _data.Header.NSpeaker = Speaker.size(); } _data.Header.CharacterOffsetPosSize = _data.Offset.Speaker.size(); } diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp index 04fe844..18f22f6 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/VitsSvc.cpp @@ -1032,15 +1032,10 @@ std::vector VitsSvc::InferPCMData(const std::vector& PCMData, soVitsOutput.data(), soVitsOutput.size()); - const auto shapeOut = finaOut[0].GetTensorTypeAndShapeInfo().GetShape(); - const auto dstWavLen = int64_t(PCMData.size()); + const auto dstWavLen = finaOut[0].GetTensorTypeAndShapeInfo().GetShape()[2]; std::vector TempVecWav(dstWavLen, 0); - if (shapeOut[2] < dstWavLen) - for (int64_t bbb = 0; bbb < shapeOut[2]; bbb++) - TempVecWav[bbb] = static_cast(Clamp(finaOut[0].GetTensorData()[bbb]) * 32766.0f); - else - for (int64_t bbb = 0; bbb < dstWavLen; bbb++) - TempVecWav[bbb] = static_cast(Clamp(finaOut[0].GetTensorData()[bbb]) * 32766.0f); + for (int64_t bbb = 0; bbb < dstWavLen; bbb++) + TempVecWav[bbb] = static_cast(Clamp(finaOut[0].GetTensorData()[bbb]) * 32766.0f); return TempVecWav; } diff --git a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp index e130471..c8ca1fc 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp +++ b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.cpp @@ -1,7 +1,14 @@ - +#ifdef MOEVSONNX +#include +#include #include #include "Modules/Modules.hpp" #include "Modules/AvCodec/AvCodeResample.h" + +#include +#include +#pragma comment(lib, "winmm.lib") + #ifdef _IOSTREAM_ std::ostream& operator<<(std::ostream& stream, const std::wstring& str) { @@ -22,6 +29,328 @@ std::ostream& operator<<(std::ostream& stream, std::vector& vec) return stream; } #endif +#ifdef _VECTOR_ +template +std::vector& operator-=(std::vector& left, const std::vector& right) +{ + for (size_t i = 0; i < left.size() && i < right.size(); ++i) + left[i] -= right[i]; + return left; +} +#endif + +namespace RtInferenceSpace +{ + class MRecorder + { + public: + MRecorder() = default; + ~MRecorder() + { + if (!hWaveIn) + return; + Stop(); + waveInClose(hWaveIn); + } + void initRecorder(DWORD SamplingRate = 44100) + { + waveform.nSamplesPerSec = SamplingRate; + waveform.wBitsPerSample = 16; + waveform.nChannels = 1; + waveform.cbSize = 0; + waveform.wFormatTag = WAVE_FORMAT_PCM; + waveform.nBlockAlign = (waveform.wBitsPerSample * waveform.nChannels) / 8; + waveform.nAvgBytesPerSec = waveform.nBlockAlign * waveform.nSamplesPerSec; + SamplingRateSrc = SamplingRate; + WaitEvent = CreateEvent(nullptr, 0, 0, nullptr); + waveInOpen(&hWaveIn, WAVE_MAPPER, &waveform, (DWORD_PTR)WaitEvent, 0L, CALLBACK_EVENT); + } + + void setStreamBufferSize(double time) + { + Stop(); + StreamSize = size_t(time * SamplingRateSrc); + timems = DWORD(time * 1000); + timems -= 50; + if (timems < 50) timems = 50; + pcmVector = std::vector(StreamSize * 2); + whdri.lpData = (LPSTR)pcmVector.data(); + whdri.dwBufferLength = DWORD(StreamSize * 2); + whdri.dwBytesRecorded = 0; + whdri.dwUser = 0; + whdri.dwFlags = 0; + whdri.dwLoops = 1; + } + + [[nodiscard]] size_t GetFrameSize() const + { + return StreamSize; + } + + void Start() + { + if (isBegin) + return; + isBegin = true; + std::thread RecoderThread([&]() + { + while(isBegin) + { + whdri.lpData = (LPSTR)pcmVector.data(); + whdri.dwBufferLength = DWORD(StreamSize * 2); + whdri.dwBytesRecorded = 0; + whdri.dwUser = 0; + whdri.dwFlags = 0; + whdri.dwLoops = 1; + waveInPrepareHeader(hWaveIn, &whdri, sizeof(WAVEHDR)); + waveInAddBuffer(hWaveIn, &whdri, sizeof(WAVEHDR)); + waveInStart(hWaveIn); + Sleep(timems); + const size_t nSamples = (size_t)whdri.dwBytesRecorded / 2; + waveInReset(hWaveIn); + std::lock_guard lock(mx); + if(pcmQueue.empty() || pcmQueue.back().size() == StreamSize) + pcmQueue.emplace_back(pcmVector.data(), pcmVector.data() + nSamples); + else + { + auto& BackData = pcmQueue.back(); + if(BackData.size() + nSamples > StreamSize) + { + const auto RealSize = StreamSize - BackData.size(); + BackData.insert(BackData.end(), pcmVector.data(), pcmVector.data() + RealSize); + pcmQueue.emplace_back(pcmVector.data() + RealSize, pcmVector.data() + nSamples); + } + else + BackData.insert(BackData.end(), pcmVector.data(), pcmVector.data() + nSamples); + } + } + }); + RecoderThread.detach(); + } + + void Stop() const + { + if(isBegin) + { + waveInStop(hWaveIn); + waveInReset(hWaveIn); + } + } + + std::vector GetStreamData() + { + std::lock_guard lock(mx); + if (pcmQueue.empty() || pcmQueue[0].size() != StreamSize) + return {}; + auto Stream = std::move(pcmQueue[0]); + pcmQueue.pop_front(); + return Stream; + } + private: + DWORD SamplingRateSrc = 44100; + std::vector pcmVector; + std::deque> pcmQueue; + size_t StreamSize = 0; + DWORD timems = 0; + HWAVEIN hWaveIn = nullptr; + WAVEFORMATEX waveform{ WAVE_FORMAT_PCM,1,44100,88200,2,16,0 }; + WAVEHDR whdri{ nullptr,0,0,0,0,0,nullptr,0 }; + HANDLE WaitEvent = nullptr; + bool isBegin = false; + std::mutex mx; + }; + class MPCMPlayer + { + public: + MPCMPlayer() = default; + ~MPCMPlayer() + { + if (!hWaveOut) + return; + waveOutClose(hWaveOut); + } + void initPlayer(DWORD SamplingRate = 44100) + { + waveform.nSamplesPerSec = SamplingRate; + waveform.wBitsPerSample = 16; + waveform.nChannels = 1; + waveform.cbSize = 0; + waveform.wFormatTag = WAVE_FORMAT_PCM; + waveform.nBlockAlign = (waveform.wBitsPerSample * waveform.nChannels) / 8; + waveform.nAvgBytesPerSec = waveform.nBlockAlign * waveform.nSamplesPerSec; + WaitEvent = CreateEvent(nullptr, 0, 0, nullptr); + waveOutOpen(&hWaveOut, WAVE_MAPPER, &waveform, (DWORD_PTR)WaitEvent, 0L, CALLBACK_EVENT); + SAMP = SamplingRate; + } + void Play(std::vector& data) + { + whdri.lpData = (LPSTR)data.data(); + whdri.dwBufferLength = DWORD(data.size() * 2); + whdri.dwFlags = 0L; + whdri.dwLoops = 1L; + waveOutPrepareHeader(hWaveOut, &whdri, sizeof(WAVEHDR)); + waveOutWrite(hWaveOut, &whdri, sizeof(WAVEHDR)); + Sleep(DWORD(data.size() * 1000 / size_t(SAMP))); + } + private: + HWAVEOUT hWaveOut = nullptr; + WAVEFORMATEX waveform{ WAVE_FORMAT_PCM,1,44100,88200,2,16,0 }; + WAVEHDR whdri{ nullptr,0,0,0,0,0,nullptr,0 }; + HANDLE WaitEvent = nullptr; + DWORD SAMP = 44100; + }; + + MoeVSProjectSpace::MoeVSSvcParams Params; + short Threshold = 400; + MRecorder RTRecorder; + MPCMPlayer RTPlayer; + std::deque> InputBuffer, OutputBuffer, rawInputBuffer, rawOutputBuffer; + bool RTIsEnabled = false; + size_t crossfade_length = 0; + size_t extra_length = 0; + + void EndRtInference() + { + RTRecorder.Stop(); + RTIsEnabled = false; + InputBuffer.clear(); + OutputBuffer.clear(); + rawInputBuffer.clear(); + rawOutputBuffer.clear(); + } + + void RTInference() + { + if (RTIsEnabled) + { + EndRtInference(); + return; + } + std::wstring error; + RTIsEnabled = true; + crossfade_length = Params.CrossFadeLength; + extra_length = crossfade_length / 4; + std::thread RT_RECORD_THREAD = std::thread([&]() + { + logger.log(L"[RTInference] Recording Thread Start!"); + while (RTIsEnabled) + { + auto PCM = RTRecorder.GetStreamData(); + if(PCM.empty()) + continue; + rawInputBuffer.emplace_back(std::move(PCM)); + + if (rawInputBuffer.size() > 2) + { + std::vector pBuffer; + pBuffer.reserve(rawInputBuffer[1].size() + 4 * crossfade_length); + pBuffer.insert(pBuffer.end(), + rawInputBuffer[0].end() - int64_t(crossfade_length + extra_length), + rawInputBuffer[0].end()); + pBuffer.insert(pBuffer.end(), rawInputBuffer[1].begin(), rawInputBuffer[1].end()); + pBuffer.insert(pBuffer.end(), + rawInputBuffer[2].begin(), + rawInputBuffer[2].begin() + int64_t(crossfade_length + extra_length) + 1000); + InputBuffer.emplace_back(std::move(pBuffer)); + rawInputBuffer.pop_front(); + } + if (rawInputBuffer.size() > 100) + rawInputBuffer.pop_front(); + } + logger.log(L"[RTInference] Recording Thread End!"); + }); + + std::thread RT_INFERENCE_THREAD = std::thread([&]() + { + logger.log(L"[RTInference] Inferencing Thread Start!"); + while (RTIsEnabled) + { + if (!InputBuffer.empty()) + { + try + { + if (MoeVSModuleManager::GetCurSvcModel()) + { + bool zeroVector = true; + for (const auto& i16data : InputBuffer[0]) + { + if (i16data > Threshold * 10) + { + zeroVector = false; + break; + } + } + if (zeroVector) + rawOutputBuffer.emplace_back(std::vector(InputBuffer[0].size(), 0)); + else + rawOutputBuffer.emplace_back(MoeVSModuleManager::GetCurSvcModel()->InferPCMData(InputBuffer[0], (long)MoeVSModuleManager::SamplingRate, Params)); + } + else + rawOutputBuffer.emplace_back(std::move(InputBuffer[0])); + InputBuffer.pop_front(); + } + catch (std::exception& e) + { + logger.error(e.what()); + EndRtInference(); + } + } + if (InputBuffer.size() > 100) + InputBuffer.pop_front(); + } + logger.log(L"[RTInference] Inferencing Thread End!"); + }); + + std::thread RT_OUTPUT_THREAD = std::thread([&]() + { + logger.log(L"[RTInference] OutPut Thread Start!"); + while (RTIsEnabled) + { + if (rawOutputBuffer.size() > 2) + { + std::vector pBuffer( + rawOutputBuffer[1].begin() + (int64_t)(crossfade_length + extra_length), + rawOutputBuffer[1].end() + ); + pBuffer.resize(RTRecorder.GetFrameSize()); + + const auto dataBufr = pBuffer.size() - crossfade_length; + const auto crossBufl = crossfade_length + extra_length + RTRecorder.GetFrameSize(); + const auto crossBufr = extra_length; + + for (size_t i = 0; i < crossfade_length; ++i) + { + const auto crosf1 = (double(i) / double(crossfade_length)); + const auto crosf2 = (1. - (double(i) / double(crossfade_length))); + + pBuffer[i] = (int16_t)( + double(pBuffer[i]) * crosf1 + + (double)rawOutputBuffer[0][i + crossBufl] * crosf2 + ); + + pBuffer[i + dataBufr] = (int16_t)( + double(pBuffer[i + dataBufr]) * crosf2 + + (double)rawOutputBuffer[2][i + crossBufr] * crosf1 + ); + } + OutputBuffer.emplace_back(std::move(pBuffer)); + rawOutputBuffer.pop_front(); + } + if (!OutputBuffer.empty()) + { + RTPlayer.Play(OutputBuffer.front()); + OutputBuffer.pop_front(); + } + } + logger.log(L"[RTInference] OutPut Thread End!"); + }); + RTRecorder.Start(); + logger.log(L"[RTInference] Start RTInference!"); + RT_RECORD_THREAD.detach(); + RT_INFERENCE_THREAD.detach(); + RT_OUTPUT_THREAD.detach(); + } +} int main() { @@ -35,7 +364,7 @@ int main() { //std::cout << (double(cur) / double(total) * 100.) << "%\n"; }, - 1, + 0, 8, 0 ); @@ -45,82 +374,56 @@ int main() std::cout << e.what(); return 0; } - - MoeVSProjectSpace::MoeVSSvcParams Params; - Params.Sampler = L"DDim"; - Params.Step = 100; - Params.Pndm = 10; - InferTools::SlicerSettings Settings; - Params.F0Method = L"RMVPE"; - Settings.SamplingRate = 40000; -#ifdef DEBUGUSETRYCATCH - try - { -#endif - std::wstring Paths; - auto TPCMData = AudioPreprocess().codec(LR"(S:\VSGIT\MoeSS - Release\Testdata\123.wav)", Settings.SamplingRate); - //MoeVSModuleManager::GetCurSvcModel()->InferPCMData(TPCMData, Settings.SamplingRate, Params); - std::vector PCMData = { TPCMData.begin(),TPCMData.begin() + Settings.SamplingRate }; - std::vector _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - PCMData = { TPCMData.begin() + Settings.SamplingRate * 1,TPCMData.begin() + Settings.SamplingRate * 2 }; - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - PCMData = { TPCMData.begin() + Settings.SamplingRate * 2,TPCMData.begin() + Settings.SamplingRate * 3 }; - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - PCMData = { TPCMData.begin() + Settings.SamplingRate * 3,TPCMData.begin() + Settings.SamplingRate * 4 }; - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - PCMData = { TPCMData.begin() + Settings.SamplingRate * 4,TPCMData.begin() + Settings.SamplingRate * 5 }; - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - PCMData = { TPCMData.begin() + Settings.SamplingRate * 5,TPCMData.begin() + Settings.SamplingRate * 6 }; - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - PCMData = { TPCMData.begin() + Settings.SamplingRate * 6,TPCMData.begin() + Settings.SamplingRate * 7 }; - auto now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - auto inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - PCMData = TPCMData; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; - now = clock(); - _data = MoeVSModuleManager::GetCurSvcModel()->InferPCMData(PCMData, Settings.SamplingRate, Params); - inferTime = double(clock() - now) / 1000.; - std::cout << "Infer Use Time : " << inferTime << "sec.\n"; -#ifdef DEBUGUSETRYCATCH - } - catch (std::exception& e) - { - std::cout << e.what() << std::endl; - } - catch (Ort::Exception& e) + RtInferenceSpace::Params.Sampler = L"DDim"; + RtInferenceSpace::Params.Step = 100; + RtInferenceSpace::Params.Pndm = 10; + RtInferenceSpace::Params.F0Method = L"RMVPE"; + RtInferenceSpace::Params.CrossFadeLength = 8000; + RtInferenceSpace::Params.Keys = 8; + + RtInferenceSpace::RTRecorder.initRecorder((DWORD)MoeVSModuleManager::SamplingRate); + RtInferenceSpace::RTRecorder.setStreamBufferSize(0.5); + RtInferenceSpace::RTRecorder.Start(); + RtInferenceSpace::RTPlayer.initPlayer((DWORD)MoeVSModuleManager::SamplingRate); + + RtInferenceSpace::RTInference(); + + while (true); + while (true) { - std::cout << e.what() << std::endl; + auto PCM = RtInferenceSpace::RTRecorder.GetStreamData(); + if (!PCM.empty()) + RtInferenceSpace::RTPlayer.Play(PCM); } +} #endif - system("pause"); + +#include "LibDLVoiceCodec/value.h" +class Class0 : libdlvcodec::Module +{ +public: + Class0(Module* _Parent, const std::string& _Name) : Module(_Parent, _Name) {} +}; + +class ClassA : libdlvcodec::Module +{ +public: + ClassA(Module* _Parent, const std::string& _Name) : Module(_Parent, _Name) {} +private: + RegLayer(Class0, attrC0); +}; + +class ClassB : libdlvcodec::Module +{ +public: + ClassB() : Module(nullptr, "ClassB") {} +private: + RegLayer(ClassA, attrCA); +}; + +int main() +{ + ClassB a; + printf("%d", &a); } \ No newline at end of file diff --git a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj index 6ee40d1..14fe639 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj +++ b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj @@ -71,10 +71,10 @@ - S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\World\src\world;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\OnnxRuntimeDmlProvider\build\native\include;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\MJson;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\ffmpeg-4.2.1\include;$(IncludePath) + $(SolutionDir)Lib\World\src\world;$(SolutionDir)Lib\OnnxRuntimeDmlProvider\build\native\include;$(SolutionDir)Lib\MJson;$(SolutionDir)Lib\ffmpeg-4.2.1\include;$(SolutionDir)Lib\openblas\include;$(SolutionDir)Lib\faiss;$(IncludePath) - S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\ffmpeg-4.2.1\include;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\MJson;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\OnnxRuntimeDmlProvider\build\native\include;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\World\src\world;$(IncludePath) + $(SolutionDir)Lib\ffmpeg-4.2.1\include;$(SolutionDir)Lib\MJson;$(SolutionDir)Lib\OnnxRuntimeDmlProvider\build\native\include;$(SolutionDir)Lib\World\src\world;$(SolutionDir)Lib\openblas\include;$(SolutionDir)Lib\faiss;$(IncludePath) @@ -108,7 +108,7 @@ Level3 true - _DEBUG;_CONSOLE;MOEVSDMLPROVIDER;MoeVoiceStudioCommandLineProg;%(PreprocessorDefinitions) + _DEBUG;_CONSOLE;MOEVSDMLPROVIDER;MoeVoiceStudioCommandLineProg;MoeVoiceStudioIndexCluster;%(PreprocessorDefinitions) true stdcpp17 4996 @@ -116,7 +116,7 @@ Console true - S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\ffmpeg-4.2.1\Lib;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\OnnxRuntimeDmlProvider\runtimes\win-x64\native;%(AdditionalLibraryDirectories) + $(SolutionDir)Lib\ffmpeg-4.2.1\Lib;$(SolutionDir)Lib\OnnxRuntimeDmlProvider\runtimes\win-x64\native;$(SolutionDir)Lib\openblas\lib;%(AdditionalLibraryDirectories) avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;onnxruntime.lib;%(AdditionalDependencies) @@ -136,7 +136,7 @@ true true true - S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\ffmpeg-4.2.1\Lib;S:\VSGIT\MoeVoiceStudioSvc - Core - Cmd\Lib\OnnxRuntimeDmlProvider\runtimes\win-x64\native;%(AdditionalLibraryDirectories) + $(SolutionDir)Lib\ffmpeg-4.2.1\Lib;$(SolutionDir)Lib\OnnxRuntimeDmlProvider\runtimes\win-x64\native;$(SolutionDir)Lib\openblas\lib;%(AdditionalLibraryDirectories) avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;onnxruntime.lib;%(AdditionalDependencies) @@ -155,6 +155,8 @@ + + @@ -199,6 +201,8 @@ + + diff --git a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters index 9bc6e6b..9a65b7f 100644 --- a/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters +++ b/MoeVoiceStudioSvc - Core - Cmd/MoeVoiceStudioSvc - Core - Cmd.vcxproj.filters @@ -91,6 +91,12 @@ {7cc7c826-511c-434d-a057-362c82a3da04} + + {0d30c1a5-5cc4-4016-976f-f0c388871d59} + + + {9cdc3e22-5d4a-4a08-a1cd-57dd5c657e2c} + @@ -210,6 +216,12 @@ 源文件\Modules\InferTools\F0Extractor + + 源文件\LibDLVoiceCodec + + + 源文件\LibDLVoiceCodec + @@ -338,5 +350,11 @@ 头文件\Modules\InferTools\F0Extractor + + 头文件\LibDLVoiceCodec + + + 头文件\LibDLVoiceCodec + \ No newline at end of file