BugFix

NaruseMioShirakana · Jun 2, 2024 · 7597c82 · 7597c82
1 parent 604fc92
commit 7597c82
Show file tree

Hide file tree

Showing 11 changed files with 392 additions and 250 deletions.
diff --git a/DotNetApi/LibSvcApi.cs b/DotNetApi/LibSvcApi.cs
@@ -433,6 +433,11 @@ public UnionModel(void* _Obj)
             throw new Exception(GetError(0));
         }
 
+        public void* GetModel() // Accessor: hands back the Model_ pointer held by this wrapper, unchanged.
+        {
+            return Model_; // NOTE(review): no ownership transfer is visible here; presumably the wrapper keeps owning the native model - confirm.
+        }
+
         public Int16Vector Inference(
             Slice _Slice,
             ref Params _InferParams,
@@ -592,11 +597,21 @@ public Params() { }
 
             public int UseShallowDiffusion = 0;                       //使用浅扩散
             public void* _VocoderModel = null;
+            public void* _ShallowDiffusionModel = null;
+            public int ShallowDiffusionUseSrcAudio = 1;
+            public int VocoderHopSize = 512;
+            public int VocoderMelBins = 128;
+            public int VocoderSamplingRate = 44100;
+            public long ShallowDiffuisonSpeaker = 0;
 
             public void SetVocoder(ref VocoderModel Vocoder) // Copies the pointer returned by Vocoder.GetModel() into _VocoderModel.
             {
                 _VocoderModel = Vocoder.GetModel(); // NOTE(review): only the raw pointer is stored; presumably Vocoder must outlive this Params - confirm.
             }
+            public void SetShallowDiffusion(ref UnionModel UnionMod) // Copies the pointer returned by UnionMod.GetModel() into _ShallowDiffusionModel.
+            {
+                _ShallowDiffusionModel = UnionMod.GetModel(); // NOTE(review): only the raw pointer is stored; presumably UnionMod must outlive this Params - confirm.
+            }
         };
 
         [StructLayout(LayoutKind.Sequential, Pack = 4, CharSet = CharSet.Unicode)]

diff --git a/libsvc/Api/header/NativeApi.h b/libsvc/Api/header/NativeApi.h
@@ -1,5 +1,12 @@
 #pragma once
 #include "../../framework.h"
+#ifdef __GNUC__ // LibSvcDeprecated: compiler-specific spelling of a deprecation marker for API declarations.
+#define LibSvcDeprecated __attribute__((deprecated))
+#else
+#ifdef _MSC_VER // Checked only when __GNUC__ is not defined.
+#define LibSvcDeprecated __declspec(deprecated)
+#endif // NOTE(review): when neither __GNUC__ nor _MSC_VER is defined, LibSvcDeprecated is left undefined, so declarations that use it would not compile; an empty fallback definition looks intended - confirm.
+#endif
 #ifdef _WIN32
 #include "windows.h"
 #endif
@@ -72,8 +79,14 @@ extern "C" {
 		LPWSTR Sampler;									//Diffusion采样器			["Pndm" "DDim"]
 		LPWSTR ReflowSampler;							//Reflow采样器				["Eular" "Rk4" "Heun" "Pecece"]
 		LPWSTR F0Method;								//F0提取算法					["Dio" "Harvest" "RMVPE" "FCPE"]
-		INT32 UseShallowDiffusion;						//是否使用浅扩散				[0(false)/1(true)]
+		INT32 UseShallowDiffusionOrEnhancer;			//是否使用浅扩散/声码器增强		[0(false)/1(true)]
 		void* _VocoderModel;							//声码器模型					Diffusion模型必须设定该项目
+		void* _ShallowDiffusionModel;                   //扩散模型					浅扩散必需设置为扩散模型地址
+		INT32 ShallowDiffusionUseSrcAudio;              //浅扩散模型是否使用原始音频		[0(false)/1(true)]
+		INT32 VocoderHopSize;							//声码器HopSize				[    Hop     ]
+		INT32 VocoderMelBins;							//声码器MelBins				[    Bins    ]
+		INT32 VocoderSamplingRate;						//声码器采样率				[     SR     ]
+		INT64 ShallowDiffuisonSpeaker;					//浅扩散中Vits模型输入的角色ID	[   0 ~ NS   ]
 	};
 
 	struct DiffusionSvcPaths
@@ -276,7 +289,15 @@ extern "C" {
 		Int16Vector _Output							//std::vector<int16_t> By "LibSvcAllocateAudio()"
 	);
 
-	LibSvcApi INT32 LibSvcShallowDiffusionInference(
+	LibSvcApi INT32 LibSvcInferPCMData(				//Runs inference on a whole PCM buffer; the definition returns 0 on success and 1 on failure
+		SvcModel _Model,							//SingingVoiceConversion Model
+		UINT32 _T,									//Model type: the definition dispatches 0 to VitsSvc and 1 to UnionSvc, and rejects other values
+		CInt16Vector _PCMData,						//Input PCM samples (read-only); presumably sampled at LibSvcParams::SrcSamplingRate - TODO confirm
+		const void* _InferParams,					//Ptr Of LibSvcParams
+		Int16Vector _Output							//std::vector<int16_t> By "LibSvcAllocateAudio()"
+	);
+
+	LibSvcApi LibSvcDeprecated INT32 LibSvcShallowDiffusionInference(
 		SvcModel _Model,							//SingingVoiceConversion Model
 		CInt16Vector _16KAudioHubert,				//SamplingRate Must Be 16000
 		MelType _Mel,								//Mel By "LibSvcAllocateMel()"
@@ -289,7 +310,7 @@ extern "C" {
 		Int16Vector _Output							//std::vector<int16_t> By "LibSvcAllocateAudio()"
 	);
 
-	LibSvcApi INT32 LibSvcVocoderEnhance(
+	LibSvcApi LibSvcDeprecated INT32 LibSvcVocoderEnhance(
 		VocoderModel _Model,						//Vocoder Model
 		MelType _Mel,								//Mel By "LibSvcAllocateMel()"
 		CFloatVector _F0,

diff --git a/libsvc/Api/header/readme.md → libsvc/Api/readme.md b/libsvc/Api/header/readme.md → libsvc/Api/readme.md
@@ -199,7 +199,10 @@ void func(){
 
     //声码器增强
     {
-        //推理出一个基础结果
+        _Params.VocoderHopSize = 512;
+        _Params.VocoderMelBins = 128;
+        _Params.VocoderSamplingRate = 44100;
+        _Params._VocoderModel = _Vocoder;
         LibSvcInferSlice(
             _Model, //模型
             0, //模型类型
@@ -208,32 +211,15 @@ void func(){
             &_Process, //当前进度
             _OutPutAudio //输出
         );
-
-        /*
-			此处自行将_OutPutAudio重采样至声码器的采样率，或是保证满足以下函数的要求
-        */
-
-        //短时傅里叶变换，并将其变换到Mel空间。注意：至少要保证该函数参数中(_SamplingRate / _HopSize)与声码器参数的(_SamplingRate / _HopSize)相等
-        LibSvcStft(
-            _OutPutAudio, //输入音频
-            44100, //声码器采样率
-            512, //STFT HopSize（声码器的HopSize）
-            128, //Mel Bins（必须为声码器的MelBins）
-            _Mel //输出的Mel
-        );
-
-        LibSvcVocoderEnhance(
-            _Vocoder, //声码器模型
-            _Mel, //上一步输出的Mel
-            _F0, //该切片的F0数据（必须为同一切片的数据）
-            128, //Mel Bins（必须为声码器的MelBins）
-            _OutPutAudio //输出
-        );
     }
 
     //浅扩散推理
     {
-        //用Vits推理出一个基础结果
+        _Params.VocoderHopSize = 512;
+        _Params.VocoderMelBins = 128;
+        _Params.VocoderSamplingRate = 44100;
+        _Params._VocoderModel = _Vocoder;
+        _Params._ShallowDiffusionModel = nullptr; //改为你的Diffusion模型
         LibSvcInferSlice(
             _Model, //模型
             0, //模型类型
@@ -242,36 +228,6 @@ void func(){
             &_Process, //当前进度
             _OutPutAudio //输出
         );
-
-        /*
-            此处自行将_OutPutAudio重采样至Diffusion模型的采样率，或是保证满足以下函数的要求
-        */
-
-        //短时傅里叶变换，并将其变换到Mel空间。注意：至少要保证该函数参数中(_SamplingRate / _HopSize)与Diffusion模型的(_SamplingRate / _HopSize)相等
-        LibSvcStft(
-            _OutPutAudio, //输入音频
-            44100, //Diffusion模型采样率
-            512, //STFT HopSize（Diffusion模型的HopSize）
-            128, //Mel Bins（必须为Diffusion模型的MelBins）
-            _Mel //输出的Mel
-        );
-
-        /*
-            此处自行将_OutPutAudio重采样至16000采样率
-        */
-
-        LibSvcShallowDiffusionInference(
-            _Model, //此处的模型必须为Diffusion模型，写教程的时候为了方便我写成了同一个
-            _OutPutAudio, //16K采样率的输入音频
-            _Mel, //上一步得到的Mel
-            _F0, //该切片的F0
-            _Volume, //该切片的音量
-            _Speaker, //该切片的的角色
-            LibSvcGetSrcLength(_SingleSlice), //该切片的原始数据大小
-            &_Params, //推理参数
-            &_Process, //当前进度
-            &_OutPutAudio //输出
-        );
     }
 
     //释放模型，第一个参数为类型

diff --git a/libsvc/Api/src/NativeApi.cpp b/libsvc/Api/src/NativeApi.cpp
@@ -115,8 +115,14 @@ void InitLibSvcParams(LibSvcParams* _Input)
 	_Input->Sampler = nullptr;							//Diffusion采样器
 	_Input->ReflowSampler = nullptr;						//Reflow采样器
 	_Input->F0Method = nullptr;							//F0提取算法
-	_Input->UseShallowDiffusion = false;                  //使用浅扩散
+	_Input->UseShallowDiffusionOrEnhancer = false;                  //使用浅扩散
 	_Input->_VocoderModel = nullptr;
+	_Input->_ShallowDiffusionModel = nullptr;
+	_Input->ShallowDiffusionUseSrcAudio = 1;
+	_Input->VocoderHopSize = 512;
+	_Input->VocoderMelBins = 128;
+	_Input->VocoderSamplingRate = 44100;
+	_Input->ShallowDiffuisonSpeaker = 0;
 }
 
 void InitLibSvcSlicerSettings(LibSvcSlicerSettings* _Input)
@@ -564,8 +570,14 @@ INT32 LibSvcInferSlice(
 		LibSvcNullStrCheck(InpParam.Sampler),
 		LibSvcNullStrCheck(InpParam.ReflowSampler),
 		LibSvcNullStrCheck(InpParam.F0Method),
-		(bool)InpParam.UseShallowDiffusion,
-		InpParam._VocoderModel
+		(bool)InpParam.UseShallowDiffusionOrEnhancer,
+		InpParam._VocoderModel,
+		InpParam._ShallowDiffusionModel,
+		(bool)InpParam.ShallowDiffusionUseSrcAudio,
+		InpParam.VocoderHopSize,
+		InpParam.VocoderMelBins,
+		InpParam.VocoderSamplingRate,
+		InpParam.ShallowDiffuisonSpeaker
 	};
 
 	try
@@ -589,6 +601,98 @@ INT32 LibSvcInferSlice(
 	return 0;
 }
 
+INT32 LibSvcInferPCMData(						//Runs inference on a whole PCM buffer; returns 0 on success, or 1 after RaiseError() recorded the reason
+	SvcModel _Model,							//SingingVoiceConversion Model
+	UINT32 _T,									//Model type: 0 = VitsSvc, 1 = UnionSvc, anything else is rejected below
+	CInt16Vector _PCMData,						//Input samples, read through a const AudioContainer
+	const void* _InferParams,					//Ptr Of LibSvcParams
+	Int16Vector _Output							//std::vector<int16_t> By "LibSvcAllocateAudio()"
+)
+{
+	if (!_Model)								//All four pointer arguments are null-checked before anything is dereferenced
+	{
+		RaiseError(L"_Model Could Not Be Null!");
+		return 1;
+	}
+
+	if (!_PCMData)
+	{
+		RaiseError(L"_PCMData Could Not Be Null!");
+		return 1;
+	}
+
+	if (!_InferParams)
+	{
+		RaiseError(L"_InferParams Could Not Be Null!");
+		return 1;
+	}
+
+	if (!_Output)
+	{
+		RaiseError(L"_Output Could Not Be Null!");
+		return 1;
+	}
+
+	const auto& InpParam = *(const LibSvcParams*)(_InferParams);	//Unchecked cast: the caller must really pass a LibSvcParams
+
+	if (!InpParam._VocoderModel && _T == 1)		//A vocoder is mandatory only for model type 1
+	{
+		RaiseError(L"_VocoderModel Could Not Be Null!");
+		return 1;
+	}
+
+	const Params Param							//Values are passed positionally, so their order matters
+	{
+		InpParam.NoiseScale,
+		InpParam.Seed,
+		InpParam.SpeakerId,
+		InpParam.SrcSamplingRate,
+		InpParam.SpkCount,
+		InpParam.IndexRate,
+		InpParam.ClusterRate,
+		InpParam.DDSPNoiseScale,
+		InpParam.Keys,
+		InpParam.MeanWindowLength,
+		InpParam.Pndm,
+		InpParam.Step,
+		InpParam.TBegin,
+		InpParam.TEnd,
+		LibSvcNullStrCheck(InpParam.Sampler),
+		LibSvcNullStrCheck(InpParam.ReflowSampler),
+		LibSvcNullStrCheck(InpParam.F0Method),
+		(bool)InpParam.UseShallowDiffusionOrEnhancer,	//INT32 flag converted to bool
+		InpParam._VocoderModel,
+		InpParam._ShallowDiffusionModel,
+		(bool)InpParam.ShallowDiffusionUseSrcAudio,		//INT32 flag converted to bool
+		InpParam.VocoderHopSize,
+		InpParam.VocoderMelBins,
+		InpParam.VocoderSamplingRate,
+		InpParam.ShallowDiffuisonSpeaker
+	};											//NOTE(review): LibSvcInferSlice ends its Params list with the same fields; presumably the lists must be kept in sync - confirm
+
+	auto& InputData = *(const AudioContainer*)(_PCMData);	//Read-only reference to the caller's samples (auto deduces const here)
+
+	try
+	{
+		if (_T == 0)							//NOTE(review): the (long) cast of size() below can truncate where long is 32-bit
+			*(AudioContainer*)(_Output) = ((VitsSvc*)(_Model))->InferPCMData(InputData, (long)InputData.size(), Param);	//Result replaces the contents of *_Output
+		else if (_T == 1)
+			*(AudioContainer*)(_Output) = ((UnionSvc*)(_Model))->InferPCMData(InputData, (long)InputData.size(), Param);	//Result replaces the contents of *_Output
+		else									//Any other model type is an error
+		{
+			RaiseError(L"UnSupported Model Type!");
+			return 1;
+		}
+	}
+	catch (std::exception& e)					//Only std::exception-derived errors become an error code; NOTE(review): const reference would be conventional
+	{
+		RaiseError(to_wide_string(e.what()));
+		return 1;
+	}
+
+	return 0;									//Success
+}
+
 INT32 LibSvcShallowDiffusionInference(
 	void* _Model,
 	const void* _16KAudioHubert,
@@ -683,8 +787,14 @@ INT32 LibSvcShallowDiffusionInference(
 		LibSvcNullStrCheck(InpParam.Sampler),
 		LibSvcNullStrCheck(InpParam.ReflowSampler),
 		LibSvcNullStrCheck(InpParam.F0Method),
-		(bool)InpParam.UseShallowDiffusion,
-		InpParam._VocoderModel
+		(bool)InpParam.UseShallowDiffusionOrEnhancer,
+		InpParam._VocoderModel,
+		InpParam._ShallowDiffusionModel,
+		(bool)InpParam.ShallowDiffusionUseSrcAudio,
+		InpParam.VocoderHopSize,
+		InpParam.VocoderMelBins,
+		InpParam.VocoderSamplingRate,
+		InpParam.ShallowDiffuisonSpeaker
 	};
 
 	auto _NormalizedAudio = InferTools::InterpResample(

diff --git a/libsvc/Modules/header/Models/MoeVSProject.hpp b/libsvc/Modules/header/Models/MoeVSProject.hpp
@@ -97,40 +97,6 @@ namespace MoeVSProjectSpace
         }
     };
 
-    struct MoeVSAudioSliceRef
-    {
-        const std::vector<int16_t>& Audio;
-        const std::vector<float>& F0;
-        const std::vector<float>& Volume;
-        const std::vector<std::vector<float>>& Speaker;
-        bool IsNotMute;
-        long OrgLen;
-        const std::wstring& Path;
-        size_t Slice = 0;
-        void* Mel = nullptr;
-        MoeVSAudioSliceRef(
-            const std::vector<int16_t>& audio,
-            const std::vector<float>& f0,
-            const std::vector<float>& volume,
-            const std::vector<std::vector<float>>& speaker,
-            bool isnotmute,
-            long orglen,
-            const std::wstring& path,
-            size_t sli,
-            void* mel_tensor_ptr = nullptr
-        ) :
-            Audio(audio),
-            F0(f0),
-            Volume(volume),
-            Speaker(speaker),
-            IsNotMute(isnotmute),
-            OrgLen(orglen),
-            Path(path),
-            Slice(sli),
-            Mel(mel_tensor_ptr)
-        {}
-    };
-
 	struct MoeVSParams
 	{
         //通用
@@ -146,15 +112,21 @@ namespace MoeVSProjectSpace
 		float DDSPNoiseScale = 0.8f;                       //DDSP噪声修正因子      0-10
 		float Keys = 0.f;                                  //升降调               -64-64
 		size_t MeanWindowLength = 2;                       //均值滤波器窗口大小     1-20
-		size_t Pndm = 100;                                 //Diffusion加速倍数    1-200
-		size_t Step = 1000;                                //Diffusion总步数      1-1000
+		size_t Pndm = 1;                                 //Diffusion加速倍数    1-200
+		size_t Step = 100;                                //Diffusion总步数      1-1000
         float TBegin = 0.f;
         float TEnd = 1.f;
 		std::wstring Sampler = L"Pndm";                    //Diffusion采样器
         std::wstring ReflowSampler = L"Eular";             //Reflow采样器
 		std::wstring F0Method = L"Dio";                    //F0提取算法
         bool UseShallowDiffusion = false;                  //使用浅扩散
         void* _VocoderModel = nullptr;
+        void* _ShallowDiffusionModel = nullptr;
+        bool ShallowDiffusionUseSrcAudio = true;
+        int VocoderHopSize = 512;
+        int VocoderMelBins = 128;
+        int VocoderSamplingRate = 44100;
+        int64_t ShallowDiffuisonSpeaker = 0;
 
         //SVCRTInfer
         int64_t RTSampleSize = 44100;