diff --git a/client/demo/build-voice-changer-js.sh b/client/demo/build-voice-changer-js.sh index 2af183a2c..413a0dd12 100644 --- a/client/demo/build-voice-changer-js.sh +++ b/client/demo/build-voice-changer-js.sh @@ -4,7 +4,6 @@ # cp -r ~/git-work/voice-changer-js/lib/package.json node_modules/@dannadori/voice-changer-js/ # cp -r ~/git-work/voice-changer-js/lib/dist node_modules/@dannadori/voice-changer-js/ - cd ~/git-work/voice-changer-js/lib/ ; npm run build:prod; cd - rm -rf node_modules/@dannadori/voice-changer-js mkdir -p node_modules/@dannadori/voice-changer-js/dist diff --git a/client/demo/src/001_provider/001_AppStateProvider.tsx b/client/demo/src/001_provider/001_AppStateProvider.tsx index 16e15917e..ce2585968 100644 --- a/client/demo/src/001_provider/001_AppStateProvider.tsx +++ b/client/demo/src/001_provider/001_AppStateProvider.tsx @@ -1,12 +1,11 @@ import { ClientState } from "@dannadori/voice-changer-client-js"; +import { VoiceChangerJSClient } from "@dannadori/voice-changer-js"; import React, { useContext, useEffect, useRef } from "react"; import { ReactNode } from "react"; import { useVCClient } from "../001_globalHooks/001_useVCClient"; import { useAppRoot } from "./001_AppRootProvider"; import { useMessageBuilder } from "../hooks/useMessageBuilder"; -import { VoiceChangerJSClient } from "./VoiceChangerJSClient"; - type Props = { children: ReactNode; }; @@ -58,22 +57,34 @@ export const AppStateProvider = ({ children }: Props) => { // useEffect(() => { // if (clientState.clientState.initialized) { + // const baseUrl = "https://192.168.0.247:18888"; + // // const modelUrl = `${baseUrl}/models/rvc2v_40k_f0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvc2v_40k_nof0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvc2v_16k_f0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_40k_f0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_40k_nof0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_32k_f0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_32k_nof0_24000.bin`; + + // // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_32k_f0_24000.bin`; + // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_32k_nof0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_40k_f0_24000.bin`; + // // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_40k_nof0_24000.bin`; + // voiceChangerJSClient.current = new VoiceChangerJSClient(); - // voiceChangerJSClient.current.initialize(); + // voiceChangerJSClient.current.initialize( + // { + // baseUrl: baseUrl, + // inputSamplingRate: 48000, + // outputSamplingRate: 48000, + // }, + // modelUrl, + // ); // clientState.clientState.setInternalAudioProcessCallback({ // processAudio: async (data: Uint8Array) => { - // console.log("[CLIENTJS] start --------------------------------------"); // const audioF32 = new Float32Array(data.buffer); // const converted = await voiceChangerJSClient.current!.convert(audioF32); - - // let audio_int16_out = new Int16Array(converted.length); - // for (let i = 0; i < converted.length; i++) { - // audio_int16_out[i] = converted[i] * 32768.0; - // } - // const res = new Uint8Array(audio_int16_out.buffer); - // console.log("AUDIO::::audio_int16_out", audio_int16_out); - - // console.log("[CLIENTJS] end --------------------------------------"); + // const res = new Uint8Array(converted.buffer); // return res; // }, // }); diff --git a/client/demo/src/001_provider/VoiceChangerJSClient.ts 
b/client/demo/src/001_provider/VoiceChangerJSClient.ts deleted file mode 100644 index a19b9fb32..000000000 --- a/client/demo/src/001_provider/VoiceChangerJSClient.ts +++ /dev/null @@ -1,149 +0,0 @@ -import { create, ConverterType } from "@alexanderolsen/libsamplerate-js"; -import { BlockingQueue } from "./_BlockingQueue"; -import { WorkerManager, generateConfig, VoiceChangerProcessorInitializeParams, VoiceChangerProcessorConvertParams, FunctionType, VoiceChangerProcessorResult } from "@dannadori/voice-changer-js"; - -export class VoiceChangerJSClient { - private wm = new WorkerManager(); - private audioBuffer: Float32Array = new Float32Array(0); - private audioInputLength = 24000; - - private inputSamplingRate = 48000; - private outputSamplingRate = 48000; - private modelInputSamplingRate = 16000; - private modelOutputSamplingRate = 40000; - private sem = new BlockingQueue(); - private crossfadeChunks = 1; - private solaChunks = 0.5; - constructor() { - this.sem.enqueue(0); - } - private lock = async () => { - const num = await this.sem.dequeue(); - return num; - }; - private unlock = (num: number) => { - this.sem.enqueue(num + 1); - }; - - initialize = async () => { - console.log("Voice Changer Initializing,,,"); - const baseUrl = "http://127.0.0.1:18888"; - - this.wm = new WorkerManager(); - const config = generateConfig(); - config.processorURL = `${baseUrl}/process.js`; - config.onnxWasmPaths = `${baseUrl}/`; - await this.wm.init(config); - - const initializeParams: VoiceChangerProcessorInitializeParams = { - type: FunctionType.initialize, - inputLength: 24000, - f0_min: 50, - f0_max: 1100, - embPitchUrl: "http://127.0.0.1:18888/models/emb_pit_24000.bin", - rvcv2InputLength: 148, - // rvcv2Url: "http://127.0.0.1:18888/models/rvc2v_24000.bin", - rvcv2Url: "http://127.0.0.1:18888/models/rvc2vnof0_24000.bin", - transfer: [], - }; - - const res = (await this.wm.execute(initializeParams)) as VoiceChangerProcessorResult; - console.log("Voice Changer Initialized..", res); - }; - - convert = async (audio: Float32Array): Promise => { - console.log("convert start....", audio); - const lockNum = await this.lock(); - //resample - const audio_16k = await this.resample(audio, this.inputSamplingRate, this.modelInputSamplingRate); - //store data and get target data - //// store - const newAudioBuffer = new Float32Array(this.audioBuffer.length + audio_16k.length); - newAudioBuffer.set(this.audioBuffer); - newAudioBuffer.set(audio_16k, this.audioBuffer.length); - this.audioBuffer = newAudioBuffer; - - //// Buffering..... - if (this.audioBuffer.length < this.audioInputLength * 1) { - console.log(`skip covert length:${this.audioBuffer.length}, audio_16k:${audio_16k.length}`); - await this.unlock(lockNum); - return new Float32Array(1); - } else { - console.log(`--------------- convert start... 
length:${this.audioBuffer.length}, audio_16k:${audio_16k.length}`); - } - - //// get chunks - let chunkIndex = 0; - const audioChunks: Float32Array[] = []; - while (true) { - const chunkOffset = chunkIndex * this.audioInputLength - (this.crossfadeChunks + this.solaChunks) * 320 * chunkIndex; - const chunkEnd = chunkOffset + this.audioInputLength; - if (chunkEnd > this.audioBuffer.length) { - this.audioBuffer = this.audioBuffer.slice(chunkOffset); - break; - } else { - const chunk = this.audioBuffer.slice(chunkOffset, chunkEnd); - audioChunks.push(chunk); - } - chunkIndex++; - } - - if (audioChunks.length == 0) { - await this.unlock(lockNum); - console.log(`skip covert length:${this.audioBuffer.length}, audio_16k:${audio_16k.length}`); - return new Float32Array(1); - } - - //convert (each) - const convetedAudioChunks: Float32Array[] = []; - for (let i = 0; i < audioChunks.length; i++) { - const convertParams: VoiceChangerProcessorConvertParams = { - type: FunctionType.convert, - transfer: [audioChunks[i].buffer], - }; - const res = (await this.wm.execute(convertParams)) as VoiceChangerProcessorResult; - const converted = new Float32Array(res.transfer[0] as ArrayBuffer); - console.log(`converted.length:::${i}:${converted.length}`); - - convetedAudioChunks.push(converted); - } - - //concat - let totalLength = convetedAudioChunks.reduce((prev, cur) => prev + cur.length, 0); - let convetedAudio = new Float32Array(totalLength); - let offset = 0; - for (let chunk of convetedAudioChunks) { - convetedAudio.set(chunk, offset); - offset += chunk.length; - } - console.log(`converted.length:::convetedAudio:${convetedAudio.length}`); - - //resample - // const response = await this.resample(convetedAudio, this.params.modelOutputSamplingRate, this.params.outputSamplingRate); - - const outputDuration = (this.audioInputLength * audioChunks.length - this.crossfadeChunks * 320) / 16000; - const outputSamples = outputDuration * this.outputSamplingRate; - const convertedOutputRatio = outputSamples / convetedAudio.length; - const realOutputSamplingRate = this.modelOutputSamplingRate * convertedOutputRatio; - console.log(`realOutputSamplingRate:${realOutputSamplingRate}, `, this.modelOutputSamplingRate, convertedOutputRatio); - - // const response2 = await this.resample(convetedAudio, this.params.modelOutputSamplingRate, realOutputSamplingRate); - const response2 = await this.resample(convetedAudio, this.modelOutputSamplingRate, this.outputSamplingRate); - - console.log(`converted from :${audioChunks.length * this.audioInputLength} to:${convetedAudio.length} to:${response2.length}`); - console.log(`outputDuration :${outputDuration} outputSamples:${outputSamples}, convertedOutputRatio:${convertedOutputRatio}, realOutputSamplingRate:${realOutputSamplingRate}`); - await this.unlock(lockNum); - return response2; - }; - - // Utility - resample = async (data: Float32Array, srcSampleRate: number, dstSampleRate: number) => { - const converterType = ConverterType.SRC_SINC_BEST_QUALITY; - const nChannels = 1; - const converter = await create(nChannels, srcSampleRate, dstSampleRate, { - converterType: converterType, // default SRC_SINC_FASTEST. 
see API for more - }); - const res = converter.simple(data); - return res; - }; -} diff --git a/client/demo/webpack.common.js b/client/demo/webpack.common.js index 6d92bcc0d..8a0acabdc 100644 --- a/client/demo/webpack.common.js +++ b/client/demo/webpack.common.js @@ -55,20 +55,44 @@ module.exports = { patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }], }), + new CopyPlugin({ + patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/ort-wasm-simd.wasm", to: "ort-wasm-simd.wasm" }], + }), // new CopyPlugin({ - // patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/ort-wasm-simd.wasm", to: "ort-wasm-simd.wasm" }], + // patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/tfjs-backend-wasm-simd.wasm", to: "tfjs-backend-wasm-simd.wasm" }], // }), // new CopyPlugin({ // patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/process.js", to: "process.js" }], // }), // new CopyPlugin({ - // patterns: [{ from: "public/models/emb_pit_24000.bin", to: "models/emb_pit_24000.bin" }], + // patterns: [{ from: "public/models/rvcv2_emb_pit_24000.bin", to: "models/rvcv2_emb_pit_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv2_amitaro_v2_32k_f0_24000.bin", to: "models/rvcv2_amitaro_v2_32k_f0_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv2_amitaro_v2_32k_nof0_24000.bin", to: "models/rvcv2_amitaro_v2_32k_nof0_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv2_amitaro_v2_40k_f0_24000.bin", to: "models/rvcv2_amitaro_v2_40k_f0_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv2_amitaro_v2_40k_nof0_24000.bin", to: "models/rvcv2_amitaro_v2_40k_nof0_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv1_emb_pit_24000.bin", to: "models/rvcv1_emb_pit_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv1_amitaro_v1_32k_f0_24000.bin", to: "models/rvcv1_amitaro_v1_32k_f0_24000.bin" }], + // }), + // new CopyPlugin({ + // patterns: [{ from: "public/models/rvcv1_amitaro_v1_32k_nof0_24000.bin", to: "models/rvcv1_amitaro_v1_32k_nof0_24000.bin" }], // }), // new CopyPlugin({ - // patterns: [{ from: "public/models/rvc2v_24000.bin", to: "models/rvc2v_24000.bin" }], + // patterns: [{ from: "public/models/rvcv1_amitaro_v1_40k_f0_24000.bin", to: "models/rvcv1_amitaro_v1_40k_f0_24000.bin" }], // }), // new CopyPlugin({ - // patterns: [{ from: "public/models/rvc2vnof0_24000.bin", to: "models/rvc2vnof0_24000.bin" }], + // patterns: [{ from: "public/models/rvcv1_amitaro_v1_40k_nof0_24000.bin", to: "models/rvcv1_amitaro_v1_40k_nof0_24000.bin" }], // }), ], }; diff --git a/server/voice_changer/DDSP_SVC/DDSP_SVC.py b/server/voice_changer/DDSP_SVC/DDSP_SVC.py index 8cc7b8aa2..20fa6a52f 100644 --- a/server/voice_changer/DDSP_SVC/DDSP_SVC.py +++ b/server/voice_changer/DDSP_SVC/DDSP_SVC.py @@ -20,7 +20,7 @@ from .models.diffusion.infer_gt_mel import DiffGtMel -from voice_changer.utils.VoiceChangerModel import AudioInOut +from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from voice_changer.DDSP_SVC.DDSP_SVCSetting import DDSP_SVCSettings from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager @@ -44,15 +44,20 @@ def phase_vocoder(a, b, fade_out, fade_in): deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 
0.5) w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase t = torch.arange(n).unsqueeze(-1).to(a) / n - result = a * (fade_out**2) + b * (fade_in**2) + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n + result = ( + a * (fade_out**2) + + b * (fade_in**2) + + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n + ) return result -class DDSP_SVC: +class DDSP_SVC(VoiceChangerModel): initialLoad: bool = True def __init__(self, params: VoiceChangerParams, slotInfo: DDSPSVCModelSlot): print("[Voice Changer] [DDSP-SVC] Creating instance ") + self.voiceChangerType = "DDSP-SVC" self.deviceManager = DeviceManager.get_instance() self.gpu_num = torch.cuda.device_count() self.params = params @@ -71,8 +76,18 @@ def __init__(self, params: VoiceChangerParams, slotInfo: DDSPSVCModelSlot): def initialize(self): self.device = self.deviceManager.getDevice(self.settings.gpu) vcparams = VoiceChangerParamsManager.get_instance().params - modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), "model", self.slotInfo.modelFile) - diffPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), "diff", self.slotInfo.diffModelFile) + modelPath = os.path.join( + vcparams.model_dir, + str(self.slotInfo.slotIndex), + "model", + self.slotInfo.modelFile, + ) + diffPath = os.path.join( + vcparams.model_dir, + str(self.slotInfo.slotIndex), + "diff", + self.slotInfo.diffModelFile, + ) self.svc_model = SvcDDSP() self.svc_model.setVCParams(self.params) @@ -112,11 +127,15 @@ def generate_input( # newData = newData.astype(np.float32) if self.audio_buffer is not None: - self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # 過去のデータに連結 + self.audio_buffer = np.concatenate( + [self.audio_buffer, newData], 0 + ) # 過去のデータに連結 else: self.audio_buffer = newData - convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + convertSize = ( + inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + ) # if convertSize % self.hop_size != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 # convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size)) @@ -147,7 +166,8 @@ def _pyTorch_inference(self, data): f0_min=50, f0_max=1100, # safe_prefix_pad_length=0, # TBD なにこれ? - safe_prefix_pad_length=self.settings.extraConvertSize / self.svc_model.args.data.sampling_rate, + safe_prefix_pad_length=self.settings.extraConvertSize + / self.svc_model.args.data.sampling_rate, diff_model=self.diff_model, diff_acc=self.settings.diffAcc, # TBD なにこれ? diff_spk_id=self.settings.diffSpkId, @@ -155,7 +175,9 @@ def _pyTorch_inference(self, data): # diff_use_dpm=True if self.settings.useDiffDpm == 1 else False, # TBD なにこれ? method=self.settings.diffMethod, k_step=self.settings.kStep, # TBD なにこれ? - diff_silence=True if self.settings.useDiffSilence == 1 else False, # TBD なにこれ? + diff_silence=True + if self.settings.useDiffSilence == 1 + else False, # TBD なにこれ? 
) return _audio.cpu().numpy() * 32768.0 @@ -182,5 +204,4 @@ def __del__(self): pass def get_model_current(self): - return [ - ] + return [] diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py index 9bd6d7045..531c874c2 100644 --- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py +++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py @@ -6,16 +6,28 @@ from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline -from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager +from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import ( + PitchExtractorManager, +) from voice_changer.ModelSlotManager import ModelSlotManager -from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel +from voice_changer.utils.VoiceChangerModel import ( + AudioInOut, + PitchfInOut, + FeatureInOut, + VoiceChangerModel, +) from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager + # from voice_changer.RVC.onnxExporter.export2onnx import export2onnx from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager -from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException, PipelineNotInitializedException +from Exceptions import ( + DeviceCannotSupportHalfPrecisionException, + PipelineCreateException, + PipelineNotInitializedException, +) logger = VoiceChangaerLogger.get_instance().getLogger() @@ -23,6 +35,7 @@ class DiffusionSVC(VoiceChangerModel): def __init__(self, params: VoiceChangerParams, slotInfo: DiffusionSVCModelSlot): logger.info("[Voice Changer] [DiffusionSVC] Creating instance ") + self.voiceChangerType = "Diffusion-SVC" self.deviceManager = DeviceManager.get_instance() EmbedderManager.initialize(params) PitchExtractorManager.initialize(params) @@ -46,9 +59,17 @@ def initialize(self): # pipelineの生成 try: - self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector, self.inputSampleRate, self.outputSampleRate) + self.pipeline = createPipeline( + self.slotInfo, + self.settings.gpu, + self.settings.f0Detector, + self.inputSampleRate, + self.outputSampleRate, + ) except PipelineCreateException as e: # NOQA - logger.error("[Voice Changer] pipeline create failed. check your model is valid.") + logger.error( + "[Voice Changer] pipeline create failed. check your model is valid." 
+ ) return # その他の設定 @@ -76,7 +97,9 @@ def update_settings(self, key: str, val: int | float | str): elif key in self.settings.strData: setattr(self.settings, key, str(val)) if key == "f0Detector" and self.pipeline is not None: - pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu) + pitchExtractor = PitchExtractorManager.getPitchExtractor( + self.settings.f0Detector, self.settings.gpu + ) self.pipeline.setPitchExtractor(pitchExtractor) else: return False @@ -100,30 +123,65 @@ def generate_input( crossfadeSize: int, solaSearchFrame: int = 0, ): - newData = newData.astype(np.float32) / 32768.0 # DiffusionSVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1) - new_feature_length = int(((newData.shape[0] / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) # 100 は hubertのhosizeから (16000 / 160). + newData = ( + newData.astype(np.float32) / 32768.0 + ) # DiffusionSVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1) + new_feature_length = int( + ((newData.shape[0] / self.inputSampleRate) * self.slotInfo.samplingRate) + / 512 + ) # 100 は hubertのhosizeから (16000 / 160). # ↑newData.shape[0]//sampleRate でデータ秒数。これに16000かけてhubertの世界でのデータ長。これにhop数(160)でわるとfeatsのデータサイズになる。 if self.audio_buffer is not None: # 過去のデータに連結 self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) - self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0) - self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0) + self.pitchf_buffer = np.concatenate( + [self.pitchf_buffer, np.zeros(new_feature_length)], 0 + ) + self.feature_buffer = np.concatenate( + [ + self.feature_buffer, + np.zeros([new_feature_length, self.slotInfo.embChannels]), + ], + 0, + ) else: self.audio_buffer = newData self.pitchf_buffer = np.zeros(new_feature_length) - self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels]) + self.feature_buffer = np.zeros( + [new_feature_length, self.slotInfo.embChannels] + ) - convertSize = newData.shape[0] + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + convertSize = ( + newData.shape[0] + + crossfadeSize + + solaSearchFrame + + self.settings.extraConvertSize + ) if convertSize % 128 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 convertSize = convertSize + (128 - (convertSize % 128)) # バッファがたまっていない場合はzeroで補う - generateFeatureLength = int(((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) + 1 + generateFeatureLength = ( + int( + ((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) + / 512 + ) + + 1 + ) if self.audio_buffer.shape[0] < convertSize: - self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer]) - self.pitchf_buffer = np.concatenate([np.zeros(generateFeatureLength), self.pitchf_buffer]) - self.feature_buffer = np.concatenate([np.zeros([generateFeatureLength, self.slotInfo.embChannels]), self.feature_buffer]) + self.audio_buffer = np.concatenate( + [np.zeros([convertSize]), self.audio_buffer] + ) + self.pitchf_buffer = np.concatenate( + [np.zeros(generateFeatureLength), self.pitchf_buffer] + ) + self.feature_buffer = np.concatenate( + [ + np.zeros([generateFeatureLength, self.slotInfo.embChannels]), + self.feature_buffer, + ] + ) convertOffset = -1 * convertSize featureOffset = -1 * generateFeatureLength @@ -139,9 +197,17 @@ def generate_input( vol = float(max(vol, self.prevVol * 0.0)) self.prevVol = vol - return (self.audio_buffer, 
self.pitchf_buffer, self.feature_buffer, convertSize, vol) + return ( + self.audio_buffer, + self.pitchf_buffer, + self.feature_buffer, + convertSize, + vol, + ) - def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int): + def inference( + self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int + ): if self.pipeline is None: logger.info("[Voice Changer] Pipeline is not initialized.") raise PipelineNotInitializedException() @@ -169,7 +235,11 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ speedUp = self.settings.speedUp embOutputLayer = 12 useFinalProj = False - silenceFrontSec = self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0. # extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。 + silenceFrontSec = ( + self.settings.extraConvertSize / self.inputSampleRate + if self.settings.silenceFront + else 0.0 + ) # extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。 try: audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec( @@ -190,7 +260,9 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ result = audio_out.detach().cpu().numpy() return result except DeviceCannotSupportHalfPrecisionException as e: # NOQA - logger.warn("[Device Manager] Device cannot support half precision. Fallback to float....") + logger.warn( + "[Device Manager] Device cannot support half precision. Fallback to float...." + ) self.deviceManager.setForceTensor(True) self.initialize() # raise e diff --git a/server/voice_changer/MMVCv13/MMVCv13.py b/server/voice_changer/MMVCv13/MMVCv13.py index 8ea77365a..8fe2f36b6 100644 --- a/server/voice_changer/MMVCv13/MMVCv13.py +++ b/server/voice_changer/MMVCv13/MMVCv13.py @@ -3,7 +3,7 @@ from data.ModelSlot import MMVCv13ModelSlot from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager -from voice_changer.utils.VoiceChangerModel import AudioInOut +from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel if sys.platform.startswith("darwin"): baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")] @@ -48,9 +48,10 @@ class MMVCv13Settings: strData: list[str] = field(default_factory=lambda: []) -class MMVCv13: +class MMVCv13(VoiceChangerModel): def __init__(self, slotInfo: MMVCv13ModelSlot): print("[Voice Changer] [MMVCv13] Creating instance ") + self.voiceChangerType = "MMVCv13" self.settings = MMVCv13Settings() self.net_g = None self.onnx_session = None @@ -65,8 +66,12 @@ def __init__(self, slotInfo: MMVCv13ModelSlot): def initialize(self): print("[Voice Changer] [MMVCv13] Initializing... 
") vcparams = VoiceChangerParamsManager.get_instance().params - configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile) - modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile) + configPath = os.path.join( + vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile + ) + modelPath = os.path.join( + vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile + ) self.hps = get_hparams_from_file(configPath) if self.slotInfo.isONNX: @@ -77,7 +82,13 @@ def initialize(self): provider_options=options, ) else: - self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model) + self.net_g = SynthesizerTrn( + len(symbols), + self.hps.data.filter_length // 2 + 1, + self.hps.train.segment_size // self.hps.data.hop_length, + n_speakers=self.hps.data.n_speakers, + **self.hps.model + ) self.net_g.eval() load_checkpoint(modelPath, self.net_g, None) @@ -89,7 +100,11 @@ def initialize(self): def getOnnxExecutionProvider(self): availableProviders = onnxruntime.get_available_providers() devNum = torch.cuda.device_count() - if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0: + if ( + self.settings.gpu >= 0 + and "CUDAExecutionProvider" in availableProviders + and devNum > 0 + ): return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}] elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders: return ["DmlExecutionProvider"], [{}] @@ -110,7 +125,11 @@ def update_settings(self, key: str, val: int | float | str): if key == "gpu" and self.slotInfo.isONNX: providers, options = self.getOnnxExecutionProvider() vcparams = VoiceChangerParamsManager.get_instance().params - modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile) + modelPath = os.path.join( + vcparams.model_dir, + str(self.slotInfo.slotIndex), + self.slotInfo.modelFile, + ) self.onnx_session = onnxruntime.InferenceSession( modelPath, providers=providers, @@ -136,7 +155,9 @@ def update_settings(self, key: str, val: int | float | str): def get_info(self): data = asdict(self.settings) - data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else [] + data["onnxExecutionProviders"] = ( + self.onnx_session.get_providers() if self.onnx_session is not None else [] + ) return data def get_processing_sampling_rate(self): @@ -166,7 +187,9 @@ def generate_input( newData = newData.astype(np.float32) / self.hps.data.max_wav_value if self.audio_buffer is not None: - self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # 過去のデータに連結 + self.audio_buffer = np.concatenate( + [self.audio_buffer, newData], 0 + ) # 過去のデータに連結 else: self.audio_buffer = newData @@ -175,7 +198,9 @@ def generate_input( # if convertSize < 8192: # convertSize = 8192 if convertSize % self.hps.data.hop_length != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 - convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)) + convertSize = convertSize + ( + self.hps.data.hop_length - (convertSize % self.hps.data.hop_length) + ) convertOffset = -1 * convertSize self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出 @@ -207,9 +232,7 @@ def _onnx_inference(self, data): "sid_src": sid_src.numpy(), "sid_tgt": sid_tgt1.numpy(), }, - )[ - 0 - ][0, 0] + )[0][0, 
0] * self.hps.data.max_wav_value ) return audio1 @@ -225,10 +248,19 @@ def _pyTorch_inference(self, data): dev = torch.device("cuda", index=self.settings.gpu) with torch.no_grad(): - x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(dev) for x in data] + x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [ + x.to(dev) for x in data + ] sid_target = torch.LongTensor([self.settings.dstId]).to(dev) - audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target)[0, 0].data * self.hps.data.max_wav_value + audio1 = ( + self.net_g.to(dev) + .voice_conversion( + spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target + )[0, 0] + .data + * self.hps.data.max_wav_value + ) result = audio1.float().cpu().numpy() return result @@ -265,5 +297,5 @@ def get_model_current(self): { "key": "dstId", "val": self.settings.dstId, - } + }, ] diff --git a/server/voice_changer/MMVCv15/MMVCv15.py b/server/voice_changer/MMVCv15/MMVCv15.py index 51849c1d4..6b1663351 100644 --- a/server/voice_changer/MMVCv15/MMVCv15.py +++ b/server/voice_changer/MMVCv15/MMVCv15.py @@ -2,7 +2,7 @@ import os from data.ModelSlot import MMVCv15ModelSlot from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager -from voice_changer.utils.VoiceChangerModel import AudioInOut +from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel if sys.platform.startswith("darwin"): baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")] @@ -56,9 +56,10 @@ class MMVCv15Settings: strData = ["f0Detector"] -class MMVCv15: +class MMVCv15(VoiceChangerModel): def __init__(self, slotInfo: MMVCv15ModelSlot): print("[Voice Changer] [MMVCv15] Creating instance ") + self.voiceChangerType = "MMVCv15" self.settings = MMVCv15Settings() self.net_g = None self.onnx_session: onnxruntime.InferenceSession | None = None @@ -72,8 +73,12 @@ def __init__(self, slotInfo: MMVCv15ModelSlot): def initialize(self): print("[Voice Changer] [MMVCv15] Initializing... 
") vcparams = VoiceChangerParamsManager.get_instance().params - configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile) - modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile) + configPath = os.path.join( + vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile + ) + modelPath = os.path.join( + vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile + ) self.hps = get_hparams_from_file(configPath) @@ -110,7 +115,11 @@ def initialize(self): # print("ONNX INPUT SHAPE", i.name, i.shape) if i.name == "sin": self.onxx_input_length = i.shape[2] - self.settings.maxInputLength = self.onxx_input_length - (0.012 * self.hps.data.sampling_rate) - 1024 # onnxの場合は入力長固(crossfadeの1024は仮) # NOQA + self.settings.maxInputLength = ( + self.onxx_input_length + - (0.012 * self.hps.data.sampling_rate) + - 1024 + ) # onnxの場合は入力長固(crossfadeの1024は仮) # NOQA else: self.net_g.eval() load_checkpoint(modelPath, self.net_g, None) @@ -125,7 +134,11 @@ def initialize(self): def getOnnxExecutionProvider(self): availableProviders = onnxruntime.get_available_providers() devNum = torch.cuda.device_count() - if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0: + if ( + self.settings.gpu >= 0 + and "CUDAExecutionProvider" in availableProviders + and devNum > 0 + ): return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}] elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders: return ["DmlExecutionProvider"], [{}] @@ -145,7 +158,11 @@ def update_settings(self, key: str, val: int | float | str): if key == "gpu" and self.slotInfo.isONNX: providers, options = self.getOnnxExecutionProvider() vcparams = VoiceChangerParamsManager.get_instance().params - modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile) + modelPath = os.path.join( + vcparams.model_dir, + str(self.slotInfo.slotIndex), + self.slotInfo.modelFile, + ) self.onnx_session = onnxruntime.InferenceSession( modelPath, providers=providers, @@ -155,7 +172,11 @@ def update_settings(self, key: str, val: int | float | str): for i in inputs_info: if i.name == "sin": self.onxx_input_length = i.shape[2] - self.settings.maxInputLength = self.onxx_input_length - (0.012 * self.hps.data.sampling_rate) - 1024 # onnxの場合は入力長固(crossfadeの1024は仮) # NOQA + self.settings.maxInputLength = ( + self.onxx_input_length + - (0.012 * self.hps.data.sampling_rate) + - 1024 + ) # onnxの場合は入力長固(crossfadeの1024は仮) # NOQA elif key in self.settings.floatData: setattr(self.settings, key, float(val)) elif key in self.settings.strData: @@ -168,7 +189,9 @@ def update_settings(self, key: str, val: int | float | str): def get_info(self): data = asdict(self.settings) - data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else [] + data["onnxExecutionProviders"] = ( + self.onnx_session.get_providers() if self.onnx_session is not None else [] + ) return data def get_processing_sampling_rate(self): @@ -179,7 +202,9 @@ def get_processing_sampling_rate(self): def _get_f0(self, detector: str, newData: AudioInOut): audio_norm_np = newData.astype(np.float64) if detector == "dio": - _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5) + _f0, _time = pw.dio( + audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5 + ) f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate) else: f0, t = 
pw.harvest( @@ -189,7 +214,9 @@ def _get_f0(self, detector: str, newData: AudioInOut): f0_floor=71.0, f0_ceil=1000.0, ) - f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length)) + f0 = convert_continuos_f0( + f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length) + ) f0 = torch.from_numpy(f0.astype(np.float32)) return f0 @@ -216,12 +243,16 @@ def generate_input( ): # maxInputLength を更新(ここでやると非効率だが、とりあえず。) if self.slotInfo.isONNX: - self.settings.maxInputLength = self.onxx_input_length - crossfadeSize - solaSearchFrame # onnxの場合は入力長固(crossfadeの1024は仮) # NOQA get_infoで返る値。この関数内の処理では使わない。 + self.settings.maxInputLength = ( + self.onxx_input_length - crossfadeSize - solaSearchFrame + ) # onnxの場合は入力長固(crossfadeの1024は仮) # NOQA get_infoで返る値。この関数内の処理では使わない。 newData = newData.astype(np.float32) / self.hps.data.max_wav_value if self.audio_buffer is not None: - self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # 過去のデータに連結 + self.audio_buffer = np.concatenate( + [self.audio_buffer, newData], 0 + ) # 過去のデータに連結 else: self.audio_buffer = newData @@ -230,7 +261,9 @@ def generate_input( # if convertSize < 8192: # convertSize = 8192 if convertSize % self.hps.data.hop_length != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 - convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)) + convertSize = convertSize + ( + self.hps.data.hop_length - (convertSize % self.hps.data.hop_length) + ) # ONNX は固定長 if self.slotInfo.isONNX: @@ -266,9 +299,7 @@ def _onnx_inference(self, data): "sid_src": sid_src.numpy(), "sid_tgt": sid_tgt1.numpy(), }, - )[ - 0 - ][0, 0] + )[0][0, 0] * self.hps.data.max_wav_value ) return audio1 @@ -287,7 +318,12 @@ def _pyTorch_inference(self, data): sid_src = sid_src.to(dev) sid_target = torch.LongTensor([self.settings.dstId]).to(dev) - audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value + audio1 = ( + self.net_g.to(dev) + .voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0, 0] + .data + * self.hps.data.max_wav_value + ) result = audio1.float().cpu().numpy() return result @@ -316,7 +352,7 @@ def __del__(self): if file_path.find(remove_path + os.path.sep) >= 0: # print("remove", key, file_path) sys.modules.pop(key) - except: # NOQA + except: # NOQA pass def get_model_current(self): @@ -332,5 +368,5 @@ def get_model_current(self): { "key": "f0Factor", "val": self.settings.f0Factor, - } + }, ] diff --git a/server/voice_changer/RVC/RVCr2.py b/server/voice_changer/RVC/RVCr2.py index 01ad1cbd3..0422f882b 100644 --- a/server/voice_changer/RVC/RVCr2.py +++ b/server/voice_changer/RVC/RVCr2.py @@ -1,6 +1,6 @@ -''' +""" VoiceChangerV2向け -''' +""" from dataclasses import asdict import numpy as np import torch @@ -9,7 +9,12 @@ from voice_changer.RVC.RVCSettings import RVCSettings from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager -from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel +from voice_changer.utils.VoiceChangerModel import ( + AudioInOut, + PitchfInOut, + FeatureInOut, + VoiceChangerModel, +) from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from voice_changer.RVC.onnxExporter.export2onnx import export2onnx from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager @@ -17,7 +22,11 @@ from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.pipeline.Pipeline import 
Pipeline -from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException, PipelineNotInitializedException +from Exceptions import ( + DeviceCannotSupportHalfPrecisionException, + PipelineCreateException, + PipelineNotInitializedException, +) import resampy from typing import cast @@ -27,6 +36,8 @@ class RVCr2(VoiceChangerModel): def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot): logger.info("[Voice Changer] [RVCr2] Creating instance ") + self.voiceChangerType = "RVC" + self.deviceManager = DeviceManager.get_instance() EmbedderManager.initialize(params) PitchExtractorManager.initialize(params) @@ -48,9 +59,13 @@ def initialize(self): # pipelineの生成 try: - self.pipeline = createPipeline(self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector) + self.pipeline = createPipeline( + self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector + ) except PipelineCreateException as e: # NOQA - logger.error("[Voice Changer] pipeline create failed. check your model is valid.") + logger.error( + "[Voice Changer] pipeline create failed. check your model is valid." + ) return # その他の設定 @@ -76,7 +91,9 @@ def update_settings(self, key: str, val: int | float | str): elif key in self.settings.strData: setattr(self.settings, key, str(val)) if key == "f0Detector" and self.pipeline is not None: - pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu) + pitchExtractor = PitchExtractorManager.getPitchExtractor( + self.settings.f0Detector, self.settings.gpu + ) self.pipeline.setPitchExtractor(pitchExtractor) else: return False @@ -99,7 +116,7 @@ def generate_input( newData: AudioInOut, crossfadeSize: int, solaSearchFrame: int, - extra_frame: int + extra_frame: int, ): # 16k で入ってくる。 inputSize = newData.shape[0] @@ -110,26 +127,47 @@ def generate_input( # 過去のデータに連結 self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) if self.slotInfo.f0: - self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(newFeatureLength)], 0) - self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([newFeatureLength, self.slotInfo.embChannels])], 0) + self.pitchf_buffer = np.concatenate( + [self.pitchf_buffer, np.zeros(newFeatureLength)], 0 + ) + self.feature_buffer = np.concatenate( + [ + self.feature_buffer, + np.zeros([newFeatureLength, self.slotInfo.embChannels]), + ], + 0, + ) else: self.audio_buffer = newData if self.slotInfo.f0: self.pitchf_buffer = np.zeros(newFeatureLength) - self.feature_buffer = np.zeros([newFeatureLength, self.slotInfo.embChannels]) + self.feature_buffer = np.zeros( + [newFeatureLength, self.slotInfo.embChannels] + ) convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame if convertSize % 160 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 convertSize = convertSize + (160 - (convertSize % 160)) - outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate) + outSize = int( + ((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate + ) # バッファがたまっていない場合はzeroで補う if self.audio_buffer.shape[0] < convertSize: - self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer]) + self.audio_buffer = np.concatenate( + [np.zeros([convertSize]), self.audio_buffer] + ) if self.slotInfo.f0: - self.pitchf_buffer = np.concatenate([np.zeros([convertSize // 160]), self.pitchf_buffer]) - self.feature_buffer = np.concatenate([np.zeros([convertSize // 160, self.slotInfo.embChannels]), self.feature_buffer]) + self.pitchf_buffer 
= np.concatenate( + [np.zeros([convertSize // 160]), self.pitchf_buffer] + ) + self.feature_buffer = np.concatenate( + [ + np.zeros([convertSize // 160, self.slotInfo.embChannels]), + self.feature_buffer, + ] + ) # 不要部分をトリミング convertOffset = -1 * convertSize @@ -147,9 +185,18 @@ def generate_input( vol = max(vol, self.prevVol * 0.0) self.prevVol = vol - return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize) + return ( + self.audio_buffer, + self.pitchf_buffer, + self.feature_buffer, + convertSize, + vol, + outSize, + ) - def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int): + def inference( + self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int + ): if self.pipeline is None: logger.info("[Voice Changer] Pipeline is not initialized.") raise PipelineNotInitializedException() @@ -165,10 +212,14 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ ) crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000) sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000) - extra_frame = int((self.settings.extraConvertSize / self.inputSampleRate) * 16000) + extra_frame = int( + (self.settings.extraConvertSize / self.inputSampleRate) * 16000 + ) # 入力データ生成 - data = self.generate_input(receivedData, crossfade_frame, sola_search_frame, extra_frame) + data = self.generate_input( + receivedData, crossfade_frame, sola_search_frame, extra_frame + ) audio = data[0] pitchf = data[1] @@ -181,7 +232,7 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) device = self.pipeline.device - + audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32) repeat = 1 if self.settings.rvcQuality else 0 sid = self.settings.dstId @@ -192,7 +243,7 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ if_f0 = 1 if self.slotInfo.f0 else 0 embOutputLayer = self.slotInfo.embOutputLayer useFinalProj = self.slotInfo.useFinalProj - + try: audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec( sid, @@ -203,12 +254,14 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ index_rate, if_f0, # 0, - self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0., # extaraDataSizeの秒数。入力のサンプリングレートで算出 + self.settings.extraConvertSize / self.inputSampleRate + if self.settings.silenceFront + else 0.0, # extaraDataSizeの秒数。入力のサンプリングレートで算出 embOutputLayer, useFinalProj, repeat, protect, - outSize + outSize, ) # result = audio_out.detach().cpu().numpy() * np.sqrt(vol) result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol) @@ -224,7 +277,9 @@ def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_ return result except DeviceCannotSupportHalfPrecisionException as e: # NOQA - logger.warn("[Device Manager] Device cannot support half precision. Fallback to float....") + logger.warn( + "[Device Manager] Device cannot support half precision. Fallback to float...." 
+ ) self.deviceManager.setForceTensor(True) self.initialize() # raise e diff --git a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py index 80cfe1b76..19b0fc1ae 100644 --- a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py +++ b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py @@ -3,7 +3,7 @@ from data.ModelSlot import SoVitsSvc40ModelSlot from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager -from voice_changer.utils.VoiceChangerModel import AudioInOut +from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel from voice_changer.utils.VoiceChangerParams import VoiceChangerParams if sys.platform.startswith("darwin"): @@ -27,7 +27,13 @@ # from models import SynthesizerTrn # type:ignore from .models.models import SynthesizerTrn -from .models.utils import interpolate_f0, get_hparams_from_file, load_checkpoint, repeat_expand_2d, get_hubert_content +from .models.utils import ( + interpolate_f0, + get_hparams_from_file, + load_checkpoint, + repeat_expand_2d, + get_hubert_content, +) from .models.cluster import get_cluster_model, get_cluster_center_result from fairseq import checkpoint_utils import librosa @@ -64,9 +70,10 @@ class SoVitsSvc40Settings: strData = ["f0Detector"] -class SoVitsSvc40: +class SoVitsSvc40(VoiceChangerModel): def __init__(self, params: VoiceChangerParams, slotInfo: SoVitsSvc40ModelSlot): print("[Voice Changer] [so-vits-svc40] Creating instance ") + self.voiceChangerType = "so-vits-svc-40" self.settings = SoVitsSvc40Settings() self.net_g = None self.onnx_session = None @@ -94,20 +101,31 @@ def __init__(self, params: VoiceChangerParams, slotInfo: SoVitsSvc40ModelSlot): def initialize(self): print("[Voice Changer] [so-vits-svc40] Initializing... 
") vcparams = VoiceChangerParamsManager.get_instance().params - configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile) - modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile) + configPath = os.path.join( + vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile + ) + modelPath = os.path.join( + vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile + ) self.hps = get_hparams_from_file(configPath) self.settings.speakers = self.hps.spk # cluster try: if self.slotInfo.clusterFile is not None: - clusterPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.clusterFile) + clusterPath = os.path.join( + vcparams.model_dir, + str(self.slotInfo.slotIndex), + self.slotInfo.clusterFile, + ) self.cluster_model = get_cluster_model(clusterPath) else: self.cluster_model = None except Exception as e: - print("[Voice Changer] [so-vits-svc40] EXCEPTION during loading cluster model ", e) + print( + "[Voice Changer] [so-vits-svc40] EXCEPTION during loading cluster model ", + e, + ) print("[Voice Changer] [so-vits-svc40] fallback to without cluster") self.cluster_model = None @@ -132,7 +150,11 @@ def initialize(self): def getOnnxExecutionProvider(self): availableProviders = onnxruntime.get_available_providers() devNum = torch.cuda.device_count() - if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0: + if ( + self.settings.gpu >= 0 + and "CUDAExecutionProvider" in availableProviders + and devNum > 0 + ): return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}] elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders: return ["DmlExecutionProvider"], [{}] @@ -170,7 +192,9 @@ def update_settings(self, key: str, val: int | float | str): def get_info(self): data = asdict(self.settings) - data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else [] + data["onnxExecutionProviders"] = ( + self.onnx_session.get_providers() if self.onnx_session is not None else [] + ) return data @@ -196,7 +220,9 @@ def get_unit_f0(self, audio_buffer, tran): ) if wav_44k.shape[0] % self.hps.data.hop_length != 0: - print(f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}") + print( + f" !!! !!! !!! 
wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}" + ) f0, uv = interpolate_f0(f0) f0 = torch.FloatTensor(f0) @@ -205,7 +231,9 @@ def get_unit_f0(self, audio_buffer, tran): f0 = f0.unsqueeze(0) uv = uv.unsqueeze(0) - wav16k_numpy = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000) + wav16k_numpy = librosa.resample( + audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000 + ) wav16k_tensor = torch.from_numpy(wav16k_numpy) if (self.settings.gpu < 0 or self.gpu_num == 0) or self.slotInfo.isONNX: @@ -226,7 +254,9 @@ def get_unit_f0(self, audio_buffer, tran): if self.hps.model.ssl_dim == 768: self.hubert_model = self.hubert_model.to(dev) wav16k_tensor = wav16k_tensor.to(dev) - c = get_hubert_content_layer9(self.hubert_model, wav_16k_tensor=wav16k_tensor) + c = get_hubert_content_layer9( + self.hubert_model, wav_16k_tensor=wav16k_tensor + ) else: self.hubert_model = self.hubert_model.to(dev) wav16k_tensor = wav16k_tensor.to(dev) @@ -237,16 +267,29 @@ def get_unit_f0(self, audio_buffer, tran): c = repeat_expand_2d(c.squeeze(0), f0.shape[1]) - if self.settings.clusterInferRatio != 0 and hasattr(self, "cluster_model") and self.cluster_model is not None: - speaker = [key for key, value in self.settings.speakers.items() if value == self.settings.dstId] + if ( + self.settings.clusterInferRatio != 0 + and hasattr(self, "cluster_model") + and self.cluster_model is not None + ): + speaker = [ + key + for key, value in self.settings.speakers.items() + if value == self.settings.dstId + ] if len(speaker) != 1: pass # print("not only one speaker found.", speaker) else: - cluster_c = get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker[0]).T + cluster_c = get_cluster_center_result( + self.cluster_model, c.cpu().numpy().T, speaker[0] + ).T cluster_c = torch.FloatTensor(cluster_c).to(dev) c = c.to(dev) - c = self.settings.clusterInferRatio * cluster_c + (1 - self.settings.clusterInferRatio) * c + c = ( + self.settings.clusterInferRatio * cluster_c + + (1 - self.settings.clusterInferRatio) * c + ) c = c.unsqueeze(0) return c, f0, uv @@ -261,14 +304,20 @@ def generate_input( newData = newData.astype(np.float32) / self.hps.data.max_wav_value if self.audio_buffer is not None: - self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # 過去のデータに連結 + self.audio_buffer = np.concatenate( + [self.audio_buffer, newData], 0 + ) # 過去のデータに連結 else: self.audio_buffer = newData - convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + convertSize = ( + inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + ) if convertSize % self.hps.data.hop_length != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。 - convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)) + convertSize = convertSize + ( + self.hps.data.hop_length - (convertSize % self.hps.data.hop_length) + ) convertOffset = -1 * convertSize self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出 @@ -306,7 +355,9 @@ def _onnx_inference(self, data): "f0": f0.astype(np.float32), "uv": uv.astype(np.float32), "g": sid_target.astype(np.int64), - "noise_scale": np.array([self.settings.noiseScale]).astype(np.float32), + "noise_scale": np.array([self.settings.noiseScale]).astype( + np.float32 + ), # "predict_f0": np.array([self.settings.dstId]).astype(np.int64), }, )[0][0, 0] @@ -385,8 +436,7 @@ def __del__(self): pass def get_model_current(self): - return [ - ] + return [] def 
resize_f0(x, target_len): diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py index c6466b8b5..81e473b72 100644 --- a/server/voice_changer/VoiceChangerV2.py +++ b/server/voice_changer/VoiceChangerV2.py @@ -1,4 +1,4 @@ -''' +""" ■ VoiceChangerV2 - VoiceChangerとの差分 ・リサンプル処理の無駄を省くため、VoiceChangerModelにリサンプル処理を移譲 @@ -7,7 +7,7 @@ - 適用VoiceChangerModel ・DiffusionSVC ・RVC -''' +""" from typing import Any, Union @@ -18,7 +18,8 @@ from dataclasses import dataclass, asdict, field import onnxruntime from mods.log_control import VoiceChangaerLogger -from voice_changer.Beatrice.Beatrice import Beatrice + +# from voice_changer.Beatrice.Beatrice import Beatrice from voice_changer.IORecorder import IORecorder @@ -89,27 +90,38 @@ def __init__(self, params: VoiceChangerParams): self.params = params self.gpu_num = torch.cuda.device_count() self.prev_audio = np.zeros(4096) - self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available() + self.mps_enabled: bool = ( + getattr(torch.backends, "mps", None) is not None + and torch.backends.mps.is_available() + ) self.onnx_device = onnxruntime.get_device() self.noCrossFade = False - logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})") + logger.info( + f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})" + ) def setModel(self, model: VoiceChangerModel): self.voiceChanger = model - self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) - if isinstance(model, Beatrice): + self.voiceChanger.setSamplingRate( + self.settings.inputSampleRate, self.settings.outputSampleRate + ) + if model.voiceChangerType == "Beatrice": self.noCrossFade = True else: self.noCrossFade = False def setInputSampleRate(self, sr: int): self.settings.inputSampleRate = sr - self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + self.voiceChanger.setSamplingRate( + self.settings.inputSampleRate, self.settings.outputSampleRate + ) def setOutputSampleRate(self, sr: int): self.settings.outputSampleRate = sr - self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + self.voiceChanger.setSamplingRate( + self.settings.inputSampleRate, self.settings.outputSampleRate + ) def get_info(self): data = asdict(self.settings) @@ -128,7 +140,9 @@ def update_settings(self, key: str, val: Any): if key == "serverAudioStated" and val == 0: self.settings.inputSampleRate = 48000 self.settings.outputSampleRate = 48000 - self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + self.voiceChanger.setSamplingRate( + self.settings.inputSampleRate, self.settings.outputSampleRate + ) if key in self.settings.intData: setattr(self.settings, key, int(val)) @@ -137,7 +151,12 @@ def update_settings(self, key: str, val: Any): if key == "recordIO" and val == 1: if hasattr(self, "ioRecorder"): self.ioRecorder.close() - self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate, self.settings.outputSampleRate) + self.ioRecorder = IORecorder( + STREAM_INPUT_FILE, + STREAM_OUTPUT_FILE, + self.settings.inputSampleRate, + self.settings.outputSampleRate, + ) if key == "recordIO" and val == 0: if hasattr(self, "ioRecorder"): self.ioRecorder.close() @@ -146,7 +165,9 @@ def update_settings(self, 
key: str, val: Any): if hasattr(self, "ioRecorder"): self.ioRecorder.close() if key == "inputSampleRate" or key == "outputSampleRate": - self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + self.voiceChanger.setSamplingRate( + self.settings.inputSampleRate, self.settings.outputSampleRate + ) elif key in self.settings.floatData: setattr(self.settings, key, float(val)) elif key in self.settings.strData: @@ -159,7 +180,12 @@ def update_settings(self, key: str, val: Any): return self.get_info() def _generate_strength(self, crossfadeSize: int): - if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize: + if ( + self.crossfadeSize != crossfadeSize + or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate + or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate + or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize + ): self.crossfadeSize = crossfadeSize self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate self.currentCrossFadeEndRate = self.settings.crossFadeEndRate @@ -188,7 +214,9 @@ def _generate_strength(self, crossfadeSize: int): ] ) - logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}") + logger.info( + f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}" + ) # ひとつ前の結果とサイズが変わるため、記録は消去する。 if hasattr(self, "np_prev_audio1") is True: @@ -203,13 +231,19 @@ def get_processing_sampling_rate(self): return self.voiceChanger.get_processing_sampling_rate() # receivedData: tuple of short - def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: + def on_request( + self, receivedData: AudioInOut + ) -> tuple[AudioInOut, list[Union[int, float]]]: try: if self.voiceChanger is None: - raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.") + raise VoiceChangerIsNotSelectedException( + "Voice Changer is not selected." 
+ ) with Timer("main-process") as t: - processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate() + processing_sampling_rate = ( + self.voiceChanger.get_processing_sampling_rate() + ) if self.noCrossFade: # Beatrice audio = self.voiceChanger.inference( @@ -223,18 +257,22 @@ def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[i else: sola_search_frame = int(0.012 * processing_sampling_rate) block_frame = receivedData.shape[0] - crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame) + crossfade_frame = min( + self.settings.crossFadeOverlapSize, block_frame + ) self._generate_strength(crossfade_frame) - + audio = self.voiceChanger.inference( receivedData, crossfade_frame=crossfade_frame, - sola_search_frame=sola_search_frame + sola_search_frame=sola_search_frame, ) if hasattr(self, "sola_buffer") is True: np.set_printoptions(threshold=10000) - audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame) + audio_offset = -1 * ( + sola_search_frame + crossfade_frame + block_frame + ) audio = audio[audio_offset:] # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI @@ -259,16 +297,25 @@ def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[i result = output_wav else: - logger.info("[Voice Changer] warming up... generating sola buffer.") + logger.info( + "[Voice Changer] warming up... generating sola buffer." + ) result = np.zeros(4096).astype(np.int16) - if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame: - offset = -1 * (sola_search_frame + crossfade_frame - sola_offset) + if ( + hasattr(self, "sola_buffer") is True + and sola_offset < sola_search_frame + ): + offset = -1 * ( + sola_search_frame + crossfade_frame - sola_offset + ) end = -1 * (sola_search_frame - sola_offset) sola_buf_org = audio[offset:end] self.sola_buffer = sola_buf_org * self.np_prev_strength else: - self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength + self.sola_buffer = ( + audio[-crossfade_frame:] * self.np_prev_strength + ) # self.sola_buffer = audio[- crossfade_frame:] mainprocess_time = t.secs @@ -277,7 +324,9 @@ def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[i with Timer("post-process") as t: result = result.astype(np.int16) - print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz") + print_convert_processing( + f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz" + ) if receivedData.shape[0] != result.shape[0]: outputData = pad_array(result, receivedData.shape[0]) @@ -291,7 +340,9 @@ def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[i postprocess_time = t.secs - print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}") + print_convert_processing( + f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}" + ) perf = [0, mainprocess_time, postprocess_time] return outputData, perf @@ -300,7 +351,9 @@ def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[i logger.warn(f"[Voice Changer] [Exception], {e}") return np.zeros(1).astype(np.int16), [0, 0, 0] except ONNXInputArgumentException as e: - logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}") + logger.warn( + f"[Voice Changer] 
[Exception] onnx are waiting valid input., {e}" + ) return np.zeros(1).astype(np.int16), [0, 0, 0] except HalfPrecisionChangingException: logger.warn("[Voice Changer] Switching model configuration....") @@ -312,7 +365,9 @@ def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[i logger.warn(f"[Voice Changer] embedder: {e}") return np.zeros(1).astype(np.int16), [0, 0, 0] except VoiceChangerIsNotSelectedException: - logger.warn("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.") + logger.warn( + "[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc." + ) return np.zeros(1).astype(np.int16), [0, 0, 0] except DeviceCannotSupportHalfPrecisionException: # RVC.pyでfallback処理をするので、ここはダミーデータ返すだけ。 diff --git a/server/voice_changer/utils/VoiceChangerModel.py b/server/voice_changer/utils/VoiceChangerModel.py index fbe40d6c4..1a3ad4b28 100644 --- a/server/voice_changer/utils/VoiceChangerModel.py +++ b/server/voice_changer/utils/VoiceChangerModel.py @@ -1,5 +1,6 @@ from typing import Any, Protocol, TypeAlias import numpy as np +from const import VoiceChangerType from voice_changer.utils.LoadModelParams import LoadModelParams @@ -10,6 +11,8 @@ class VoiceChangerModel(Protocol): + voiceChangerType: VoiceChangerType = "RVC" + # loadModel: Callable[..., dict[str, Any]] def loadModel(self, params: LoadModelParams): ... @@ -23,7 +26,13 @@ def get_info(self) -> dict[str, Any]: def inference(self, data: tuple[Any, ...]) -> Any: ... - def generate_input(self, newData: AudioInOut, inputSize: int, crossfadeSize: int, solaSearchFrame: int) -> tuple[Any, ...]: + def generate_input( + self, + newData: AudioInOut, + inputSize: int, + crossfadeSize: int, + solaSearchFrame: int, + ) -> tuple[Any, ...]: ... def update_settings(self, key: str, val: int | float | str) -> bool: