From e3e1b687daabc93259df363a3bfde8992f996381 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Tue, 24 Sep 2024 08:02:38 -0400 Subject: [PATCH 1/7] feat: added unittest for wrong model used --- everyvoice/tests/test_model.py | 144 +++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/everyvoice/tests/test_model.py b/everyvoice/tests/test_model.py index adba9f2c..88e915bf 100644 --- a/everyvoice/tests/test_model.py +++ b/everyvoice/tests/test_model.py @@ -168,6 +168,8 @@ def test_find_non_basic_substructures(self): class TestLoadingModel(BasicTestCase): + """Test loading models""" + def setUp(self) -> None: super().setUp() self.config_dir = self.data_dir / "relative" / "config" @@ -251,3 +253,145 @@ def test_model_is_not_a_vocoder(self): ), ): HiFiGAN.load_from_checkpoint(ckpt_fn) + + def test_wrong_model_type(self): + """ + Detecting wrong model type in checkpoint. + """ + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import ModelCheckpoint + + with tempfile.TemporaryDirectory() as tmpdir_str: + model = FastSpeech2( + FastSpeech2Config.load_config_from_path( + self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" + ), + stats=Stats( + pitch=StatsInfo( + min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + energy=StatsInfo( + min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + ), + ), + lang2id={"foo": 0, "bar": 1}, + speaker2id={"baz": 0, "qux": 1}, + ) + trainer = Trainer( + default_root_dir=tmpdir_str, + enable_progress_bar=False, + logger=False, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=1, + callbacks=[ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1)], + ) + trainer.strategy.connect(model) + ckpt_fn = tmpdir_str + "/checkpoint.ckpt" + trainer.save_checkpoint(ckpt_fn) + m = torch.load(ckpt_fn) + self.assertIn("model_info", m.keys()) + m["model_info"]["name"] = "BAD_TYPE" + torch.save(m, ckpt_fn) + m = torch.load(ckpt_fn) + self.assertIn("model_info", 
m.keys()) + self.assertEqual(m["model_info"]["name"], "BAD_TYPE") + # self.assertEqual(m["model_info"]["version"], "1.0") + with self.assertRaisesRegex( + TypeError, + r"Wrong model type \(BAD_TYPE\), we are expecting a 'FastSpeech2' model", + ): + FastSpeech2.load_from_checkpoint(ckpt_fn) + + def test_missing_model_version(self): + """ + Loading an old model that doesn't have a version. + """ + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import ModelCheckpoint + + with tempfile.TemporaryDirectory() as tmpdir_str: + model = FastSpeech2( + FastSpeech2Config.load_config_from_path( + self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" + ), + stats=Stats( + pitch=StatsInfo( + min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + energy=StatsInfo( + min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + ), + ), + lang2id={"foo": 0, "bar": 1}, + speaker2id={"baz": 0, "qux": 1}, + ) + CANARY_VERSION = "BAD_VERSION" + model._VERSION = CANARY_VERSION + trainer = Trainer( + default_root_dir=tmpdir_str, + enable_progress_bar=False, + logger=False, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=1, + callbacks=[ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1)], + ) + trainer.strategy.connect(model) + ckpt_fn = tmpdir_str + "/checkpoint.ckpt" + trainer.save_checkpoint(ckpt_fn) + m = torch.load(ckpt_fn) + self.assertIn("model_info", m.keys()) + self.assertEqual(m["model_info"]["name"], FastSpeech2.__name__) + self.assertEqual(m["model_info"]["version"], CANARY_VERSION) + del m["model_info"]["version"] + torch.save(m, ckpt_fn) + model = FastSpeech2.load_from_checkpoint(ckpt_fn) + self.assertEqual(model._VERSION, "1.0") + + def test_newer_model_version(self): + """ + Detecting an incompatible version number in the checkpoint. 
+ """ + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import ModelCheckpoint + + with tempfile.TemporaryDirectory() as tmpdir_str: + model = FastSpeech2( + FastSpeech2Config.load_config_from_path( + self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" + ), + stats=Stats( + pitch=StatsInfo( + min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + energy=StatsInfo( + min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + ), + ), + lang2id={"foo": 0, "bar": 1}, + speaker2id={"baz": 0, "qux": 1}, + ) + NEWER_VERSION = "100.0" + model._VERSION = NEWER_VERSION + trainer = Trainer( + default_root_dir=tmpdir_str, + enable_progress_bar=False, + logger=False, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=1, + callbacks=[ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1)], + ) + trainer.strategy.connect(model) + ckpt_fn = tmpdir_str + "/checkpoint.ckpt" + trainer.save_checkpoint(ckpt_fn) + m = torch.load(ckpt_fn) + self.assertIn("model_info", m.keys()) + self.assertEqual(m["model_info"]["name"], FastSpeech2.__name__) + self.assertEqual(m["model_info"]["version"], NEWER_VERSION) + with self.assertRaisesRegex( + ValueError, + r"Your model was created with a newer version of EveryVoice, please update your software.", + ): + FastSpeech2.load_from_checkpoint(ckpt_fn) From 7f7f1cadca1aa3b69fd7d0860a1663a82b7652e3 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Wed, 23 Oct 2024 11:50:15 -0400 Subject: [PATCH 2/7] feat: added unittest for FastSpeech2Config's version --- everyvoice/tests/test_model.py | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/everyvoice/tests/test_model.py b/everyvoice/tests/test_model.py index 88e915bf..1a408fe7 100644 --- a/everyvoice/tests/test_model.py +++ b/everyvoice/tests/test_model.py @@ -395,3 +395,51 @@ def test_newer_model_version(self): r"Your model was created with a newer version of EveryVoice, please update your software.", ): 
FastSpeech2.load_from_checkpoint(ckpt_fn) + + +class TestLoadingConfig(BasicTestCase): + """Test loading configurations""" + + def setUp(self) -> None: + super().setUp() + self.config_dir = self.data_dir / "relative" / "config" + self.configs = ( + (FastSpeech2Config, TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX), + (DFAlignerConfig, ALIGNER_CONFIG_FILENAME_PREFIX), + (HiFiGANConfig, SPEC_TO_WAV_CONFIG_FILENAME_PREFIX), + ) + + def test_config_versionless(self): + """ + Validate that we can load a config that doesn't have a `VERSION` as a version 1.0 config. + """ + + for ConfigType, filename in self.configs: + with self.subTest(ConfigType=ConfigType): + arguments = ConfigType.load_config_from_path( + self.config_dir / f"{filename}.yaml" + ).model_dump() + del arguments["VERSION"] + + self.assertNotIn("VERSION", arguments) + c = ConfigType(**arguments) + self.assertEqual(c.VERSION, "1.0") + + def test_config_newer_version(self): + """ + Validate that we are detecting that a config is newer. + """ + + for ConfigType, filename in self.configs: + with self.subTest(ConfigType=ConfigType): + reference = ConfigType.load_config_from_path( + self.config_dir / f"{filename}.yaml" + ) + NEWER_VERSION = "100.0" + reference.VERSION = NEWER_VERSION + + with self.assertRaisesRegex( + ValueError, + r"Your config was created with a newer version of EveryVoice, please update your software.", + ): + ConfigType(**reference.model_dump()) From a752107bd2b2d83797f8676202c36265ae229508 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Mon, 28 Oct 2024 16:40:58 -0400 Subject: [PATCH 3/7] feat: testing multiple model types --- everyvoice/tests/test_model.py | 190 +++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 69 deletions(-) diff --git a/everyvoice/tests/test_model.py b/everyvoice/tests/test_model.py index 1a408fe7..7040de28 100644 --- a/everyvoice/tests/test_model.py +++ b/everyvoice/tests/test_model.py @@ -310,44 +310,70 @@ def test_missing_model_version(self): from 
pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint - with tempfile.TemporaryDirectory() as tmpdir_str: - model = FastSpeech2( - FastSpeech2Config.load_config_from_path( - self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" + tests = ( + ( + Aligner, + Aligner( + DFAlignerConfig.load_config_from_path( + self.config_dir / f"{ALIGNER_CONFIG_FILENAME_PREFIX}.yaml" + ) ), - stats=Stats( - pitch=StatsInfo( - min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + ( + FastSpeech2, + FastSpeech2( + FastSpeech2Config.load_config_from_path( + self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" ), - energy=StatsInfo( - min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + stats=Stats( + pitch=StatsInfo( + min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + energy=StatsInfo( + min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + ), ), + lang2id={"foo": 0, "bar": 1}, + speaker2id={"baz": 0, "qux": 1}, ), - lang2id={"foo": 0, "bar": 1}, - speaker2id={"baz": 0, "qux": 1}, - ) - CANARY_VERSION = "BAD_VERSION" - model._VERSION = CANARY_VERSION - trainer = Trainer( - default_root_dir=tmpdir_str, - enable_progress_bar=False, - logger=False, - max_epochs=1, - limit_train_batches=1, - limit_val_batches=1, - callbacks=[ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1)], - ) - trainer.strategy.connect(model) - ckpt_fn = tmpdir_str + "/checkpoint.ckpt" - trainer.save_checkpoint(ckpt_fn) - m = torch.load(ckpt_fn) - self.assertIn("model_info", m.keys()) - self.assertEqual(m["model_info"]["name"], FastSpeech2.__name__) - self.assertEqual(m["model_info"]["version"], CANARY_VERSION) - del m["model_info"]["version"] - torch.save(m, ckpt_fn) - model = FastSpeech2.load_from_checkpoint(ckpt_fn) - self.assertEqual(model._VERSION, "1.0") + ), # we should probably also test that the error about the variance adaptor is raised + ( + HiFiGAN, + HiFiGAN( + HiFiGANConfig.load_config_from_path( + self.config_dir / 
f"{SPEC_TO_WAV_CONFIG_FILENAME_PREFIX}.yaml" + ) + ), + ), + ) + + CANARY_VERSION = "CANARY_VERSION" + with tempfile.TemporaryDirectory() as tmpdir_str: + for ModelType, model in tests: + with self.subTest(ModelType=ModelType): + model._VERSION = CANARY_VERSION + trainer = Trainer( + default_root_dir=tmpdir_str, + enable_progress_bar=False, + logger=False, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=1, + callbacks=[ + ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1) + ], + ) + trainer.strategy.connect(model) + ckpt_fn = tmpdir_str + "/checkpoint.ckpt" + trainer.save_checkpoint(ckpt_fn) + m = torch.load(ckpt_fn) + self.assertIn("model_info", m.keys()) + self.assertEqual(m["model_info"]["name"], ModelType.__name__) + self.assertEqual(m["model_info"]["version"], CANARY_VERSION) + del m["model_info"]["version"] + torch.save(m, ckpt_fn) + model = ModelType.load_from_checkpoint(ckpt_fn) + self.assertEqual(model._VERSION, "1.0") def test_newer_model_version(self): """ @@ -356,45 +382,71 @@ def test_newer_model_version(self): from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint - with tempfile.TemporaryDirectory() as tmpdir_str: - model = FastSpeech2( - FastSpeech2Config.load_config_from_path( - self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" + tests = ( + ( + Aligner, + Aligner( + DFAlignerConfig.load_config_from_path( + self.config_dir / f"{ALIGNER_CONFIG_FILENAME_PREFIX}.yaml" + ) ), - stats=Stats( - pitch=StatsInfo( - min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + ( + FastSpeech2, + FastSpeech2( + FastSpeech2Config.load_config_from_path( + self.config_dir / f"{TEXT_TO_SPEC_CONFIG_FILENAME_PREFIX}.yaml" ), - energy=StatsInfo( - min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + stats=Stats( + pitch=StatsInfo( + min=0, max=1, std=2, mean=3, norm_min=4, norm_max=5 + ), + energy=StatsInfo( + min=7, max=8, std=9, mean=10, norm_min=11, norm_max=12 + ), ), + lang2id={"foo": 
0, "bar": 1}, + speaker2id={"baz": 0, "qux": 1}, ), - lang2id={"foo": 0, "bar": 1}, - speaker2id={"baz": 0, "qux": 1}, - ) - NEWER_VERSION = "100.0" - model._VERSION = NEWER_VERSION - trainer = Trainer( - default_root_dir=tmpdir_str, - enable_progress_bar=False, - logger=False, - max_epochs=1, - limit_train_batches=1, - limit_val_batches=1, - callbacks=[ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1)], - ) - trainer.strategy.connect(model) - ckpt_fn = tmpdir_str + "/checkpoint.ckpt" - trainer.save_checkpoint(ckpt_fn) - m = torch.load(ckpt_fn) - self.assertIn("model_info", m.keys()) - self.assertEqual(m["model_info"]["name"], FastSpeech2.__name__) - self.assertEqual(m["model_info"]["version"], NEWER_VERSION) - with self.assertRaisesRegex( - ValueError, - r"Your model was created with a newer version of EveryVoice, please update your software.", - ): - FastSpeech2.load_from_checkpoint(ckpt_fn) + ), # we should probably also test that the error about the variance adaptor is raised + ( + HiFiGAN, + HiFiGAN( + HiFiGANConfig.load_config_from_path( + self.config_dir / f"{SPEC_TO_WAV_CONFIG_FILENAME_PREFIX}.yaml" + ) + ), + ), + ) + + NEWER_VERSION = "100.0" + with tempfile.TemporaryDirectory() as tmpdir_str: + for ModelType, model in tests: + with self.subTest(ModelType=ModelType): + model._VERSION = NEWER_VERSION + trainer = Trainer( + default_root_dir=tmpdir_str, + enable_progress_bar=False, + logger=False, + max_epochs=1, + limit_train_batches=1, + limit_val_batches=1, + callbacks=[ + ModelCheckpoint(dirpath=tmpdir_str, every_n_train_steps=1) + ], + ) + trainer.strategy.connect(model) + ckpt_fn = tmpdir_str + "/checkpoint.ckpt" + trainer.save_checkpoint(ckpt_fn) + m = torch.load(ckpt_fn) + self.assertIn("model_info", m.keys()) + self.assertEqual(m["model_info"]["name"], ModelType.__name__) + self.assertEqual(m["model_info"]["version"], NEWER_VERSION) + with self.assertRaisesRegex( + ValueError, + r"Your model was created with a newer version of EveryVoice, 
please update your software.", + ): + ModelType.load_from_checkpoint(ckpt_fn) class TestLoadingConfig(BasicTestCase): From 78c08d4c4f551fc7694d17cd393171178430bfc3 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Wed, 30 Oct 2024 08:45:34 -0400 Subject: [PATCH 4/7] feat: bumped up the schema version --- .../.schema/everyvoice-aligner-0.2.json | 849 ++++++ .../.schema/everyvoice-shared-data-0.2.json | 233 ++ .../.schema/everyvoice-shared-text-0.2.json | 137 + .../.schema/everyvoice-spec-to-wav-0.2.json | 845 ++++++ .../.schema/everyvoice-text-to-spec-0.2.json | 1115 +++++++ .../.schema/everyvoice-text-to-wav-0.2.json | 2567 +++++++++++++++++ everyvoice/_version.py | 2 +- 7 files changed, 5747 insertions(+), 1 deletion(-) create mode 100644 everyvoice/.schema/everyvoice-aligner-0.2.json create mode 100644 everyvoice/.schema/everyvoice-shared-data-0.2.json create mode 100644 everyvoice/.schema/everyvoice-shared-text-0.2.json create mode 100644 everyvoice/.schema/everyvoice-spec-to-wav-0.2.json create mode 100644 everyvoice/.schema/everyvoice-text-to-spec-0.2.json create mode 100644 everyvoice/.schema/everyvoice-text-to-wav-0.2.json diff --git a/everyvoice/.schema/everyvoice-aligner-0.2.json b/everyvoice/.schema/everyvoice-aligner-0.2.json new file mode 100644 index 00000000..aac7a589 --- /dev/null +++ b/everyvoice/.schema/everyvoice-aligner-0.2.json @@ -0,0 +1,849 @@ +{ + "$defs": { + "AdamOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. 
The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the Adam Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "adam", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "AdamOptimizer", + "type": "object" + }, + "AdamWOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the AdamW Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "adamw", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "AdamWOptimizer", + "type": "object" + }, + "AudioConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "min_audio_length": { + "default": 0.4, + "description": "The minimum length of an audio sample in seconds. 
Audio shorter than this will be ignored during preprocessing.", + "title": "Min Audio Length", + "type": "number" + }, + "max_audio_length": { + "default": 11.0, + "description": "The maximum length of an audio sample in seconds. Audio longer than this will be ignored during preprocessing. Increasing the max_audio_length will result in larger memory usage. If you are running out of memory, consider lowering the max_audio_length.", + "title": "Max Audio Length", + "type": "number" + }, + "max_wav_value": { + "default": 32767.0, + "description": "Advanced. The maximum value allowed to be in your wav files. For 16-bit audio, this should be (2**16)/2 - 1.", + "title": "Max Wav Value", + "type": "number" + }, + "input_sampling_rate": { + "default": 22050, + "description": "The sampling rate describes the number of samples per second of audio. The 'input_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the spectrograms predicted by your text-to-spec model will also be calculated from audio at this sampling rate. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Input Sampling Rate", + "type": "integer" + }, + "output_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'output_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the wav files generated by your vocoder or spec-to-wav model will be at this sampling rate. If you change this value, you will also need to change the upsample rates in your vocoder. Your audio will automatically be re-sampled during preprocessing.", + "title": "Output Sampling Rate", + "type": "integer" + }, + "alignment_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. 
The 'alignment_sampling_rate' describes the sampling rate used when training an alignment model. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Alignment Sampling Rate", + "type": "integer" + }, + "target_bit_depth": { + "default": 16, + "description": "Advanced. This is the bit depth of each sample in your audio files.", + "title": "Target Bit Depth", + "type": "integer" + }, + "n_fft": { + "default": 1024, + "description": "Advanced. This is the number of bins used by the Fast Fourier Transform (FFT).", + "title": "FFT Size", + "type": "integer" + }, + "fft_window_size": { + "default": 1024, + "description": "Advanced. This is the window size used by the Fast Fourier Transform (FFT).", + "title": "FFT Window Size", + "type": "integer" + }, + "fft_hop_size": { + "default": 256, + "description": "Advanced. This is the hop size for calculating the Short-Time Fourier Transform (STFT) which calculates a sequence of spectrograms from a single audio file. Another way of putting it is that the hop size is equal to the amount of non-intersecting samples from the audio in each spectrogram.", + "title": "FFT Hop Size", + "type": "integer" + }, + "f_min": { + "default": 0, + "description": "Advanced. This is the minimum frequency for the lowest frequency bin when calculating the spectrogram.", + "title": "Minimum Frequency", + "type": "integer" + }, + "f_max": { + "default": 8000, + "description": "Advanced. This is the maximum frequency for the highest frequency bin when calculating the spectrogram.", + "title": "Maximum Frequency", + "type": "integer" + }, + "n_mels": { + "default": 80, + "description": "Advanced. This is the number of filters in the Mel-scale spaced filterbank.", + "title": "Number of Mel bins", + "type": "integer" + }, + "spec_type": { + "anyOf": [ + { + "$ref": "#/$defs/AudioSpecTypeEnum" + }, + { + "type": "string" + } + ], + "default": "mel-librosa", + "description": "Advanced. 
Defines how to calculate the spectrogram. 'mel' uses the TorchAudio implementation for a Mel spectrogram. 'mel-librosa' uses Librosa's implementation. 'linear' calculates a non-Mel linear spectrogram and 'raw' calculates a complex-valued spectrogram. 'linear' and 'raw' are not currently supported by EveryVoice. We recommend using 'mel-librosa'.", + "title": "Spec Type" + }, + "vocoder_segment_size": { + "default": 8192, + "description": "Advanced. The vocoder, or spec-to-wav model is trained by sampling random fixed-size sections of the audio. This value specifies the number of samples in those sections.", + "title": "Vocoder Segment Size", + "type": "integer" + } + }, + "title": "AudioConfig", + "type": "object" + }, + "AudioSpecTypeEnum": { + "enum": [ + "mel", + "mel-librosa", + "linear", + "raw" + ], + "title": "AudioSpecTypeEnum", + "type": "string" + }, + "ContactInformation": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact_name": { + "description": "The name of the contact person or organization responsible for answering questions related to this model.", + "title": "Contact Name", + "type": "string" + }, + "contact_email": { + "description": "The email address of the contact person or organization responsible for answering questions related to this model.", + "format": "email", + "title": "Contact Email", + "type": "string" + } + }, + "required": [ + "contact_name", + "contact_email" + ], + "title": "ContactInformation", + "type": "object" + }, + "DFAlignerExtractionMethod": { + "enum": [ + "beam", + "dijkstra" + ], + "title": "DFAlignerExtractionMethod", + "type": "string" + }, + "DFAlignerModelConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "target_text_representation_level": { + "allOf": [ + { + "$ref": "#/$defs/TargetTrainingTextRepresentationLevel" + } + ], + "default": "characters" + }, + "lstm_dim": { + 
"default": 512, + "description": "The number of dimensions in the LSTM layers.", + "title": "Lstm Dim", + "type": "integer" + }, + "conv_dim": { + "default": 512, + "description": "The number of dimensions in the convolutional layers.", + "title": "Conv Dim", + "type": "integer" + } + }, + "title": "DFAlignerModelConfig", + "type": "object" + }, + "DFAlignerTrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. 
Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": "path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." 
+ }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "optimizer": { + "anyOf": [ + { + "$ref": "#/$defs/AdamOptimizer" + }, + { + "$ref": "#/$defs/AdamWOptimizer" + } + ], + "description": "Optimizer configuration settings.", + "title": "Optimizer" + }, + "binned_sampler": { + "default": true, + "description": "Use a binned length sampler", + "title": "Binned Sampler", + "type": "boolean" + }, + "plot_steps": { + "default": 1000, + "description": "The maximum number of steps to plot", + "title": "Plot Steps", + "type": "integer" + }, + "extraction_method": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerExtractionMethod" + } + ], + "default": "dijkstra", + "description": "The alignment extraction algorithm to use. 'beam' will be quicker but possibly less accurate than 'dijkstra'" + } + }, + "title": "DFAlignerTrainingConfig", + "type": "object" + }, + "Dataset": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "label": { + "default": "YourDataSet", + "description": "A label for the source of data", + "title": "Label", + "type": "string" + }, + "permissions_obtained": { + "default": false, + "description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. 
The speaker should be aware and consent to their data being used in this way.", + "title": "Permissions Obtained", + "type": "boolean" + }, + "data_dir": { + "default": "/please/create/a/path/to/your/dataset/data", + "description": "The path to the directory with your audio files.", + "format": "path", + "title": "Data Dir", + "type": "string" + }, + "filelist": { + "default": "/please/create/a/path/to/your/dataset/filelist", + "description": "The path to your dataset's filelist.", + "format": "path", + "title": "Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The file-loader function to use to load your dataset's filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "sox_effects": { + "default": [ + [ + "channels", + "1" + ] + ], + "description": "Advanced. A list of SoX effects to apply to your audio prior to preprocessing. Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.", + "items": {}, + "title": "Sox Effects", + "type": "array" + } + }, + "title": "Dataset", + "type": "object" + }, + "LoggerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "The logger configures all the information needed for where to store your experiment's logs and checkpoints.\nThe structure of your logs will then be:\n <name> / <version> / <sub_dir>\n <sub_dir> will be generated by calling <sub_dir_callable> each time the LoggerConfig is constructed.", + "properties": { + "name": { + "default": "BaseExperiment", + "description": "The name of the experiment. 
The structure of your logs will be <name> / <version> / <sub_dir>.", + "title": "Experiment Name", + "type": "string" + }, + "save_dir": { + "default": "logs_and_checkpoints", + "description": "The directory to save your checkpoints and logs to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "sub_dir_callable": { + "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be <name> / <version> / <sub_dir> where <sub_dir> is a timestamp.", + "title": "Sub Dir Callable", + "type": "string" + }, + "version": { + "default": "base", + "description": "The version of your experiment. The structure of your logs will be <name> / <version> / <sub_dir>.", + "title": "Version", + "type": "string" + } + }, + "title": "LoggerConfig", + "type": "object" + }, + "PreprocessingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "dataset": { + "default": "YourDataSet", + "description": "The name of the dataset.", + "title": "Dataset", + "type": "string" + }, + "train_split": { + "default": 0.9, + "description": "The amount of the dataset to use for training. The rest will be used as validation. Hold some of the validation set out for a test set if you are performing experiments.", + "maximum": 1.0, + "minimum": 0.0, + "title": "Train Split", + "type": "number" + }, + "dataset_split_seed": { + "default": 1234, + "description": "The seed to use when splitting the dataset into train and validation sets.", + "title": "Dataset Split Seed", + "type": "integer" + }, + "save_dir": { + "default": "preprocessed/YourDataSet", + "description": "The directory to save preprocessed files to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "audio": { + "allOf": [ + { + "$ref": "#/$defs/AudioConfig" + } + ], + "description": "Configuration settings for audio." 
+ }, + "path_to_audio_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path to an audio configuration file.", + "title": "Path To Audio Config File" + }, + "source_data": { + "description": "A list of datasets.", + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Source Data", + "type": "array" + } + }, + "title": "PreprocessingConfig", + "type": "object" + }, + "Punctuation": { + "properties": { + "exclamations": { + "default": [ + "!", + "\u00a1" + ], + "description": "Exclamation punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Exclamations", + "type": "array" + }, + "question_symbols": { + "default": [ + "?", + "\u00bf" + ], + "description": "Question/interrogative punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Question Symbols", + "type": "array" + }, + "quotemarks": { + "default": [ + "\"", + "'", + "\u201c", + "\u201d", + "\u00ab", + "\u00bb" + ], + "description": "Quotemark punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Quotemarks", + "type": "array" + }, + "big_breaks": { + "default": [ + ".", + ":", + ";" + ], + "description": "Punctuation symbols indicating a 'big break' used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Big Breaks", + "type": "array" + }, + "small_breaks": { + "default": [ + ",", + "-", + "\u2014" + ], + "description": "Punctuation symbols indicating a 'small break' used in your datasets. 
Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Small Breaks", + "type": "array" + }, + "ellipsis": { + "default": [ + "\u2026" + ], + "description": "Punctuation symbols indicating an ellipsis used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Ellipsis", + "type": "array" + } + }, + "title": "Punctuation", + "type": "object" + }, + "Symbols": { + "additionalProperties": true, + "properties": { + "silence": { + "default": [ + "" + ], + "description": "The symbol(s) used to indicate silence.", + "items": { + "type": "string" + }, + "title": "Silence", + "type": "array" + }, + "punctuation": { + "allOf": [ + { + "$ref": "#/$defs/Punctuation" + } + ], + "description": "EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training." + } + }, + "title": "Symbols", + "type": "object" + }, + "TargetTrainingTextRepresentationLevel": { + "enum": [ + "characters", + "phones", + "phonological_features" + ], + "title": "TargetTrainingTextRepresentationLevel", + "type": "string" + }, + "TextConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "symbols": { + "$ref": "#/$defs/Symbols" + }, + "to_replace": { + "additionalProperties": { + "type": "string" + }, + "default": {}, + "title": "To Replace", + "type": "object" + }, + "cleaners": { + "items": { + "type": "string" + }, + "title": "Cleaners", + "type": "array" + } + }, + "title": "TextConfig", + "type": "object" + } + }, + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." 
+ }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerModelConfig" + } + ], + "description": "The model configuration settings." + }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerTrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." 
+ }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + }, + "text": { + "$ref": "#/$defs/TextConfig" + }, + "path_to_text_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Text Config File" + } + }, + "required": [ + "contact" + ], + "title": "DFAlignerConfig", + "type": "object" +} diff --git a/everyvoice/.schema/everyvoice-shared-data-0.2.json b/everyvoice/.schema/everyvoice-shared-data-0.2.json new file mode 100644 index 00000000..52d3240b --- /dev/null +++ b/everyvoice/.schema/everyvoice-shared-data-0.2.json @@ -0,0 +1,233 @@ +{ + "$defs": { + "AudioConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "min_audio_length": { + "default": 0.4, + "description": "The minimum length of an audio sample in seconds. Audio shorter than this will be ignored during preprocessing.", + "title": "Min Audio Length", + "type": "number" + }, + "max_audio_length": { + "default": 11.0, + "description": "The maximum length of an audio sample in seconds. Audio longer than this will be ignored during preprocessing. Increasing the max_audio_length will result in larger memory usage. If you are running out of memory, consider lowering the max_audio_length.", + "title": "Max Audio Length", + "type": "number" + }, + "max_wav_value": { + "default": 32767.0, + "description": "Advanced. The maximum value allowed to be in your wav files. For 16-bit audio, this should be (2**16)/2 - 1.", + "title": "Max Wav Value", + "type": "number" + }, + "input_sampling_rate": { + "default": 22050, + "description": "The sampling rate describes the number of samples per second of audio. 
The 'input_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the spectrograms predicted by your text-to-spec model will also be calculated from audio at this sampling rate. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Input Sampling Rate", + "type": "integer" + }, + "output_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'output_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the wav files generated by your vocoder or spec-to-wav model will be at this sampling rate. If you change this value, you will also need to change the upsample rates in your vocoder. Your audio will automatically be re-sampled during preprocessing.", + "title": "Output Sampling Rate", + "type": "integer" + }, + "alignment_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'alignment_sampling_rate' describes the sampling rate used when training an alignment model. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Alignment Sampling Rate", + "type": "integer" + }, + "target_bit_depth": { + "default": 16, + "description": "Advanced. This is the bit depth of each sample in your audio files.", + "title": "Target Bit Depth", + "type": "integer" + }, + "n_fft": { + "default": 1024, + "description": "Advanced. This is the number of bins used by the Fast Fourier Transform (FFT).", + "title": "FFT Size", + "type": "integer" + }, + "fft_window_size": { + "default": 1024, + "description": "Advanced. This is the window size used by the Fast Fourier Transform (FFT).", + "title": "FFT Window Size", + "type": "integer" + }, + "fft_hop_size": { + "default": 256, + "description": "Advanced. 
This is the hop size for calculating the Short-Time Fourier Transform (STFT) which calculates a sequence of spectrograms from a single audio file. Another way of putting it is that the hop size is equal to the amount of non-intersecting samples from the audio in each spectrogram.", + "title": "FFT Hop Size", + "type": "integer" + }, + "f_min": { + "default": 0, + "description": "Advanced. This is the minimum frequency for the lowest frequency bin when calculating the spectrogram.", + "title": "Minimum Frequency", + "type": "integer" + }, + "f_max": { + "default": 8000, + "description": "Advanced. This is the maximum frequency for the highest frequency bin when calculating the spectrogram.", + "title": "Maximum Frequency", + "type": "integer" + }, + "n_mels": { + "default": 80, + "description": "Advanced. This is the number of filters in the Mel-scale spaced filterbank.", + "title": "Number of Mel bins", + "type": "integer" + }, + "spec_type": { + "anyOf": [ + { + "$ref": "#/$defs/AudioSpecTypeEnum" + }, + { + "type": "string" + } + ], + "default": "mel-librosa", + "description": "Advanced. Defines how to calculate the spectrogram. 'mel' uses the TorchAudio implementation for a Mel spectrogram. 'mel-librosa' uses Librosa's implementation. 'linear' calculates a non-Mel linear spectrogram and 'raw' calculates a complex-valued spectrogram. 'linear' and 'raw' are not currently supported by EveryVoice. We recommend using 'mel-librosa'.", + "title": "Spec Type" + }, + "vocoder_segment_size": { + "default": 8192, + "description": "Advanced. The vocoder, or spec-to-wav model is trained by sampling random fixed-size sections of the audio. 
This value specifies the number of samples in those sections.", + "title": "Vocoder Segment Size", + "type": "integer" + } + }, + "title": "AudioConfig", + "type": "object" + }, + "AudioSpecTypeEnum": { + "enum": [ + "mel", + "mel-librosa", + "linear", + "raw" + ], + "title": "AudioSpecTypeEnum", + "type": "string" + }, + "Dataset": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "label": { + "default": "YourDataSet", + "description": "A label for the source of data", + "title": "Label", + "type": "string" + }, + "permissions_obtained": { + "default": false, + "description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.", + "title": "Permissions Obtained", + "type": "boolean" + }, + "data_dir": { + "default": "/please/create/a/path/to/your/dataset/data", + "description": "The path to the directory with your audio files.", + "format": "path", + "title": "Data Dir", + "type": "string" + }, + "filelist": { + "default": "/please/create/a/path/to/your/dataset/filelist", + "description": "The path to your dataset's filelist.", + "format": "path", + "title": "Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The file-loader function to use to load your dataset's filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "sox_effects": { + "default": [ + [ + "channels", + "1" + ] + ], + "description": "Advanced. A list of SoX effects to apply to your audio prior to preprocessing. 
Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.", + "items": {}, + "title": "Sox Effects", + "type": "array" + } + }, + "title": "Dataset", + "type": "object" + } + }, + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "dataset": { + "default": "YourDataSet", + "description": "The name of the dataset.", + "title": "Dataset", + "type": "string" + }, + "train_split": { + "default": 0.9, + "description": "The amount of the dataset to use for training. The rest will be used as validation. Hold some of the validation set out for a test set if you are performing experiments.", + "maximum": 1.0, + "minimum": 0.0, + "title": "Train Split", + "type": "number" + }, + "dataset_split_seed": { + "default": 1234, + "description": "The seed to use when splitting the dataset into train and validation sets.", + "title": "Dataset Split Seed", + "type": "integer" + }, + "save_dir": { + "default": "preprocessed/YourDataSet", + "description": "The directory to save preprocessed files to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "audio": { + "allOf": [ + { + "$ref": "#/$defs/AudioConfig" + } + ], + "description": "Configuration settings for audio." 
+ }, + "path_to_audio_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path to an audio configuration file.", + "title": "Path To Audio Config File" + }, + "source_data": { + "description": "A list of datasets.", + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Source Data", + "type": "array" + } + }, + "title": "PreprocessingConfig", + "type": "object" +} diff --git a/everyvoice/.schema/everyvoice-shared-text-0.2.json b/everyvoice/.schema/everyvoice-shared-text-0.2.json new file mode 100644 index 00000000..0bf1a6f2 --- /dev/null +++ b/everyvoice/.schema/everyvoice-shared-text-0.2.json @@ -0,0 +1,137 @@ +{ + "$defs": { + "Punctuation": { + "properties": { + "exclamations": { + "default": [ + "!", + "\u00a1" + ], + "description": "Exclamation punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Exclamations", + "type": "array" + }, + "question_symbols": { + "default": [ + "?", + "\u00bf" + ], + "description": "Question/interrogative punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Question Symbols", + "type": "array" + }, + "quotemarks": { + "default": [ + "\"", + "'", + "\u201c", + "\u201d", + "\u00ab", + "\u00bb" + ], + "description": "Quotemark punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Quotemarks", + "type": "array" + }, + "big_breaks": { + "default": [ + ".", + ":", + ";" + ], + "description": "Punctuation symbols indicating a 'big break' used in your datasets. 
Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Big Breaks", + "type": "array" + }, + "small_breaks": { + "default": [ + ",", + "-", + "\u2014" + ], + "description": "Punctuation symbols indicating a 'small break' used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Small Breaks", + "type": "array" + }, + "ellipsis": { + "default": [ + "\u2026" + ], + "description": "Punctuation symbols indicating an ellipsis used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Ellipsis", + "type": "array" + } + }, + "title": "Punctuation", + "type": "object" + }, + "Symbols": { + "additionalProperties": true, + "properties": { + "silence": { + "default": [ + "" + ], + "description": "The symbol(s) used to indicate silence.", + "items": { + "type": "string" + }, + "title": "Silence", + "type": "array" + }, + "punctuation": { + "allOf": [ + { + "$ref": "#/$defs/Punctuation" + } + ], + "description": "EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training." 
+ } + }, + "title": "Symbols", + "type": "object" + } + }, + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "symbols": { + "$ref": "#/$defs/Symbols" + }, + "to_replace": { + "additionalProperties": { + "type": "string" + }, + "default": {}, + "title": "To Replace", + "type": "object" + }, + "cleaners": { + "items": { + "type": "string" + }, + "title": "Cleaners", + "type": "array" + } + }, + "title": "TextConfig", + "type": "object" +} diff --git a/everyvoice/.schema/everyvoice-spec-to-wav-0.2.json b/everyvoice/.schema/everyvoice-spec-to-wav-0.2.json new file mode 100644 index 00000000..d1a48ec8 --- /dev/null +++ b/everyvoice/.schema/everyvoice-spec-to-wav-0.2.json @@ -0,0 +1,845 @@ +{ + "$defs": { + "AdamOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. 
The values of the Adam Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "adam", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "AdamOptimizer", + "type": "object" + }, + "AdamWOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the AdamW Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "adamw", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "AdamWOptimizer", + "type": "object" + }, + "AudioConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "min_audio_length": { + "default": 0.4, + "description": "The minimum length of an audio sample in seconds. Audio shorter than this will be ignored during preprocessing.", + "title": "Min Audio Length", + "type": "number" + }, + "max_audio_length": { + "default": 11.0, + "description": "The maximum length of an audio sample in seconds. Audio longer than this will be ignored during preprocessing. 
Increasing the max_audio_length will result in larger memory usage. If you are running out of memory, consider lowering the max_audio_length.", + "title": "Max Audio Length", + "type": "number" + }, + "max_wav_value": { + "default": 32767.0, + "description": "Advanced. The maximum value allowed to be in your wav files. For 16-bit audio, this should be (2**16)/2 - 1.", + "title": "Max Wav Value", + "type": "number" + }, + "input_sampling_rate": { + "default": 22050, + "description": "The sampling rate describes the number of samples per second of audio. The 'input_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the spectrograms predicted by your text-to-spec model will also be calculated from audio at this sampling rate. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Input Sampling Rate", + "type": "integer" + }, + "output_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'output_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the wav files generated by your vocoder or spec-to-wav model will be at this sampling rate. If you change this value, you will also need to change the upsample rates in your vocoder. Your audio will automatically be re-sampled during preprocessing.", + "title": "Output Sampling Rate", + "type": "integer" + }, + "alignment_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'alignment_sampling_rate' describes the sampling rate used when training an alignment model. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Alignment Sampling Rate", + "type": "integer" + }, + "target_bit_depth": { + "default": 16, + "description": "Advanced. 
This is the bit depth of each sample in your audio files.", + "title": "Target Bit Depth", + "type": "integer" + }, + "n_fft": { + "default": 1024, + "description": "Advanced. This is the number of bins used by the Fast Fourier Transform (FFT).", + "title": "FFT Size", + "type": "integer" + }, + "fft_window_size": { + "default": 1024, + "description": "Advanced. This is the window size used by the Fast Fourier Transform (FFT).", + "title": "FFT Window Size", + "type": "integer" + }, + "fft_hop_size": { + "default": 256, + "description": "Advanced. This is the hop size for calculating the Short-Time Fourier Transform (STFT) which calculates a sequence of spectrograms from a single audio file. Another way of putting it is that the hop size is equal to the amount of non-intersecting samples from the audio in each spectrogram.", + "title": "FFT Hop Size", + "type": "integer" + }, + "f_min": { + "default": 0, + "description": "Advanced. This is the minimum frequency for the lowest frequency bin when calculating the spectrogram.", + "title": "Minimum Frequency", + "type": "integer" + }, + "f_max": { + "default": 8000, + "description": "Advanced. This is the maximum frequency for the highest frequency bin when calculating the spectrogram.", + "title": "Maximum Frequency", + "type": "integer" + }, + "n_mels": { + "default": 80, + "description": "Advanced. This is the number of filters in the Mel-scale spaced filterbank.", + "title": "Number of Mel bins", + "type": "integer" + }, + "spec_type": { + "anyOf": [ + { + "$ref": "#/$defs/AudioSpecTypeEnum" + }, + { + "type": "string" + } + ], + "default": "mel-librosa", + "description": "Advanced. Defines how to calculate the spectrogram. 'mel' uses the TorchAudio implementation for a Mel spectrogram. 'mel-librosa' uses Librosa's implementation. 'linear' calculates a non-Mel linear spectrogram and 'raw' calculates a complex-valued spectrogram. 'linear' and 'raw' are not currently supported by EveryVoice. 
We recommend using 'mel-librosa'.", + "title": "Spec Type" + }, + "vocoder_segment_size": { + "default": 8192, + "description": "Advanced. The vocoder, or spec-to-wav model is trained by sampling random fixed-size sections of the audio. This value specifies the number of samples in those sections.", + "title": "Vocoder Segment Size", + "type": "integer" + } + }, + "title": "AudioConfig", + "type": "object" + }, + "AudioSpecTypeEnum": { + "enum": [ + "mel", + "mel-librosa", + "linear", + "raw" + ], + "title": "AudioSpecTypeEnum", + "type": "string" + }, + "ContactInformation": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact_name": { + "description": "The name of the contact person or organization responsible for answering questions related to this model.", + "title": "Contact Name", + "type": "string" + }, + "contact_email": { + "description": "The email address of the contact person or organization responsible for answering questions related to this model.", + "format": "email", + "title": "Contact Email", + "type": "string" + } + }, + "required": [ + "contact_name", + "contact_email" + ], + "title": "ContactInformation", + "type": "object" + }, + "Dataset": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "label": { + "default": "YourDataSet", + "description": "A label for the source of data", + "title": "Label", + "type": "string" + }, + "permissions_obtained": { + "default": false, + "description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. 
The speaker should be aware and consent to their data being used in this way.", + "title": "Permissions Obtained", + "type": "boolean" + }, + "data_dir": { + "default": "/please/create/a/path/to/your/dataset/data", + "description": "The path to the directory with your audio files.", + "format": "path", + "title": "Data Dir", + "type": "string" + }, + "filelist": { + "default": "/please/create/a/path/to/your/dataset/filelist", + "description": "The path to your dataset's filelist.", + "format": "path", + "title": "Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The file-loader function to use to load your dataset's filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "sox_effects": { + "default": [ + [ + "channels", + "1" + ] + ], + "description": "Advanced. A list of SoX effects to apply to your audio prior to preprocessing. Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.", + "items": {}, + "title": "Sox Effects", + "type": "array" + } + }, + "title": "Dataset", + "type": "object" + }, + "HiFiGANModelConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "resblock": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANResblock" + } + ], + "default": "1", + "description": "Which resblock to use. See Kong et. al. 
2020: https://arxiv.org/abs/2010.05646" + }, + "upsample_rates": { + "default": [ + 8, + 8, + 2, + 2 + ], + "description": "The stride of each convolutional layer in the upsampling module.", + "items": { + "type": "integer" + }, + "title": "Upsample Rates", + "type": "array" + }, + "upsample_kernel_sizes": { + "default": [ + 16, + 16, + 4, + 4 + ], + "description": "The kernel size of each convolutional layer in the upsampling module.", + "items": { + "type": "integer" + }, + "title": "Upsample Kernel Sizes", + "type": "array" + }, + "upsample_initial_channel": { + "default": 512, + "description": "The number of dimensions to project the Mel inputs to before being passed to the resblock.", + "title": "Upsample Initial Channel", + "type": "integer" + }, + "resblock_kernel_sizes": { + "default": [ + 3, + 7, + 11 + ], + "description": "The kernel size of each convolutional layer in the resblock.", + "items": { + "type": "integer" + }, + "title": "Resblock Kernel Sizes", + "type": "array" + }, + "resblock_dilation_sizes": { + "default": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "description": "The dilations of each convolution in each layer of the resblock.", + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "title": "Resblock Dilation Sizes", + "type": "array" + }, + "activation_function": { + "description": "The activation function to use.", + "title": "Activation Function", + "type": "string" + }, + "istft_layer": { + "default": false, + "description": "Whether to predict phase and magnitude values and use an inverse Short-Time Fourier Transform instead of predicting a waveform directly. See Kaneko et. al. 
2022: https://arxiv.org/abs/2203.02395", + "title": "Istft Layer", + "type": "boolean" + }, + "msd_layers": { + "default": 3, + "description": "The number of layers to use in the Multi-Scale Discriminator.", + "title": "Msd Layers", + "type": "integer" + }, + "mpd_layers": { + "default": [ + 2, + 3, + 5, + 7, + 11 + ], + "description": "The size of each layer in the Multi-Period Discriminator.", + "items": { + "type": "integer" + }, + "title": "Mpd Layers", + "type": "array" + } + }, + "title": "HiFiGANModelConfig", + "type": "object" + }, + "HiFiGANResblock": { + "enum": [ + "1", + "2" + ], + "title": "HiFiGANResblock", + "type": "string" + }, + "HiFiGANTrainTypes": { + "enum": [ + "original", + "wgan" + ], + "title": "HiFiGANTrainTypes", + "type": "string" + }, + "HiFiGANTrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. 
You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": "path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. 
The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." + }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "generator_warmup_steps": { + "default": 0, + "description": "The number of steps to run through before activating the discriminators.", + "title": "Generator Warmup Steps", + "type": "integer" + }, + "gan_type": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANTrainTypes" + } + ], + "default": "original", + "description": "The type of GAN to use. Can be set to either 'original' for a vanilla GAN, or 'wgan' for a Wasserstein GAN that clips gradients." + }, + "optimizer": { + "anyOf": [ + { + "$ref": "#/$defs/AdamOptimizer" + }, + { + "$ref": "#/$defs/AdamWOptimizer" + }, + { + "$ref": "#/$defs/RMSOptimizer" + } + ], + "description": "Configuration settings for the optimizer.", + "title": "Optimizer" + }, + "wgan_clip_value": { + "default": 0.01, + "description": "The gradient clip value when gan_type='wgan'.", + "title": "Wgan Clip Value", + "type": "number" + }, + "use_weighted_sampler": { + "default": false, + "description": "Whether to use a sampler which oversamples from the minority language or speaker class for balanced training.", + "title": "Use Weighted Sampler", + "type": "boolean" + }, + "finetune": { + "default": false, + "description": "Whether to read spectrograms from 'preprocessed/synthesized_spec' instead of 'preprocessed/spec'. 
This is used when finetuning a pretrained spec-to-wav (vocoder) model using the outputs of a trained text-to-spec (feature prediction network) model.", + "title": "Finetune", + "type": "boolean" + } + }, + "title": "HiFiGANTrainingConfig", + "type": "object" + }, + "LoggerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "The logger configures all the information needed for where to store your experiment's logs and checkpoints.\nThe structure of your logs will then be:\n / / \n will be generated by calling each time the LoggerConfig is constructed.", + "properties": { + "name": { + "default": "BaseExperiment", + "description": "The name of the experiment. The structure of your logs will be / / .", + "title": "Experiment Name", + "type": "string" + }, + "save_dir": { + "default": "logs_and_checkpoints", + "description": "The directory to save your checkpoints and logs to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "sub_dir_callable": { + "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be / / where is a timestamp.", + "title": "Sub Dir Callable", + "type": "string" + }, + "version": { + "default": "base", + "description": "The version of your experiment. The structure of your logs will be / / .", + "title": "Version", + "type": "string" + } + }, + "title": "LoggerConfig", + "type": "object" + }, + "PreprocessingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "dataset": { + "default": "YourDataSet", + "description": "The name of the dataset.", + "title": "Dataset", + "type": "string" + }, + "train_split": { + "default": 0.9, + "description": "The amount of the dataset to use for training. The rest will be used as validation. 
Hold some of the validation set out for a test set if you are performing experiments.", + "maximum": 1.0, + "minimum": 0.0, + "title": "Train Split", + "type": "number" + }, + "dataset_split_seed": { + "default": 1234, + "description": "The seed to use when splitting the dataset into train and validation sets.", + "title": "Dataset Split Seed", + "type": "integer" + }, + "save_dir": { + "default": "preprocessed/YourDataSet", + "description": "The directory to save preprocessed files to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "audio": { + "allOf": [ + { + "$ref": "#/$defs/AudioConfig" + } + ], + "description": "Configuration settings for audio." + }, + "path_to_audio_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path to an audio configuration file.", + "title": "Path To Audio Config File" + }, + "source_data": { + "description": "A list of datasets.", + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Source Data", + "type": "array" + } + }, + "title": "PreprocessingConfig", + "type": "object" + }, + "RMSOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "alpha": { + "default": 0.99, + "description": "Advanced. 
The value of RMSProp optimizer alpha smoothing constant.", + "title": "Alpha", + "type": "number" + }, + "name": { + "default": "rms", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "RMSOptimizer", + "type": "object" + } + }, + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANModelConfig" + } + ], + "description": "The model configuration settings." + }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANTrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." 
+ }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + } + }, + "required": [ + "contact" + ], + "title": "HiFiGANConfig", + "type": "object" +} diff --git a/everyvoice/.schema/everyvoice-text-to-spec-0.2.json b/everyvoice/.schema/everyvoice-text-to-spec-0.2.json new file mode 100644 index 00000000..bb54c2b5 --- /dev/null +++ b/everyvoice/.schema/everyvoice-text-to-spec-0.2.json @@ -0,0 +1,1115 @@ +{ + "$defs": { + "AudioConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "min_audio_length": { + "default": 0.4, + "description": "The minimum length of an audio sample in seconds. Audio shorter than this will be ignored during preprocessing.", + "title": "Min Audio Length", + "type": "number" + }, + "max_audio_length": { + "default": 11.0, + "description": "The maximum length of an audio sample in seconds. Audio longer than this will be ignored during preprocessing. Increasing the max_audio_length will result in larger memory usage. If you are running out of memory, consider lowering the max_audio_length.", + "title": "Max Audio Length", + "type": "number" + }, + "max_wav_value": { + "default": 32767.0, + "description": "Advanced. The maximum value allowed to be in your wav files. For 16-bit audio, this should be (2**16)/2 - 1.", + "title": "Max Wav Value", + "type": "number" + }, + "input_sampling_rate": { + "default": 22050, + "description": "The sampling rate describes the number of samples per second of audio. The 'input_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the spectrograms predicted by your text-to-spec model will also be calculated from audio at this sampling rate. 
If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Input Sampling Rate", + "type": "integer" + }, + "output_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'output_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the wav files generated by your vocoder or spec-to-wav model will be at this sampling rate. If you change this value, you will also need to change the upsample rates in your vocoder. Your audio will automatically be re-sampled during preprocessing.", + "title": "Output Sampling Rate", + "type": "integer" + }, + "alignment_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'alignment_sampling_rate' describes the sampling rate used when training an alignment model. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Alignment Sampling Rate", + "type": "integer" + }, + "target_bit_depth": { + "default": 16, + "description": "Advanced. This is the bit depth of each sample in your audio files.", + "title": "Target Bit Depth", + "type": "integer" + }, + "n_fft": { + "default": 1024, + "description": "Advanced. This is the number of bins used by the Fast Fourier Transform (FFT).", + "title": "FFT Size", + "type": "integer" + }, + "fft_window_size": { + "default": 1024, + "description": "Advanced. This is the window size used by the Fast Fourier Transform (FFT).", + "title": "FFT Window Size", + "type": "integer" + }, + "fft_hop_size": { + "default": 256, + "description": "Advanced. This is the hop size for calculating the Short-Time Fourier Transform (STFT) which calculates a sequence of spectrograms from a single audio file. 
Another way of putting it is that the hop size is equal to the amount of non-intersecting samples from the audio in each spectrogram.", + "title": "FFT Hop Size", + "type": "integer" + }, + "f_min": { + "default": 0, + "description": "Advanced. This is the minimum frequency for the lowest frequency bin when calculating the spectrogram.", + "title": "Minimum Frequency", + "type": "integer" + }, + "f_max": { + "default": 8000, + "description": "Advanced. This is the maximum frequency for the highest frequency bin when calculating the spectrogram.", + "title": "Maximum Frequency", + "type": "integer" + }, + "n_mels": { + "default": 80, + "description": "Advanced. This is the number of filters in the Mel-scale spaced filterbank.", + "title": "Number of Mel bins", + "type": "integer" + }, + "spec_type": { + "anyOf": [ + { + "$ref": "#/$defs/AudioSpecTypeEnum" + }, + { + "type": "string" + } + ], + "default": "mel-librosa", + "description": "Advanced. Defines how to calculate the spectrogram. 'mel' uses the TorchAudio implementation for a Mel spectrogram. 'mel-librosa' uses Librosa's implementation. 'linear' calculates a non-Mel linear spectrogram and 'raw' calculates a complex-valued spectrogram. 'linear' and 'raw' are not currently supported by EveryVoice. We recommend using 'mel-librosa'.", + "title": "Spec Type" + }, + "vocoder_segment_size": { + "default": 8192, + "description": "Advanced. The vocoder, or spec-to-wav model is trained by sampling random fixed-size sections of the audio. 
This value specifies the number of samples in those sections.", + "title": "Vocoder Segment Size", + "type": "integer" + } + }, + "title": "AudioConfig", + "type": "object" + }, + "AudioSpecTypeEnum": { + "enum": [ + "mel", + "mel-librosa", + "linear", + "raw" + ], + "title": "AudioSpecTypeEnum", + "type": "string" + }, + "ConformerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "layers": { + "default": 4, + "description": "The number of layers in the Conformer.", + "title": "Layers", + "type": "integer" + }, + "heads": { + "default": 2, + "description": "The number of heads in the multi-headed attention modules.", + "title": "Heads", + "type": "integer" + }, + "input_dim": { + "default": 256, + "description": "The number of hidden dimensions in the input. The input_dim value declared in the encoder and decoder modules must match the input_dim value declared in each variance predictor module.", + "title": "Input Dim", + "type": "integer" + }, + "feedforward_dim": { + "default": 1024, + "description": "The number of dimensions in the feedforward layers.", + "title": "Feedforward Dim", + "type": "integer" + }, + "conv_kernel_size": { + "default": 9, + "description": "The size of the kernel in each convolutional layer of the Conformer.", + "title": "Conv Kernel Size", + "type": "integer" + }, + "dropout": { + "default": 0.2, + "description": "The amount of dropout to apply.", + "title": "Dropout", + "type": "number" + } + }, + "title": "ConformerConfig", + "type": "object" + }, + "ContactInformation": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact_name": { + "description": "The name of the contact person or organization responsible for answering questions related to this model.", + "title": "Contact Name", + "type": "string" + }, + "contact_email": { + "description": "The email address of the contact person or organization 
responsible for answering questions related to this model.", + "format": "email", + "title": "Contact Email", + "type": "string" + } + }, + "required": [ + "contact_name", + "contact_email" + ], + "title": "ContactInformation", + "type": "object" + }, + "Dataset": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "label": { + "default": "YourDataSet", + "description": "A label for the source of data", + "title": "Label", + "type": "string" + }, + "permissions_obtained": { + "default": false, + "description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.", + "title": "Permissions Obtained", + "type": "boolean" + }, + "data_dir": { + "default": "/please/create/a/path/to/your/dataset/data", + "description": "The path to the directory with your audio files.", + "format": "path", + "title": "Data Dir", + "type": "string" + }, + "filelist": { + "default": "/please/create/a/path/to/your/dataset/filelist", + "description": "The path to your dataset's filelist.", + "format": "path", + "title": "Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The file-loader function to use to load your dataset's filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "sox_effects": { + "default": [ + [ + "channels", + "1" + ] + ], + "description": "Advanced. A list of SoX effects to apply to your audio prior to preprocessing. 
Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.", + "items": {}, + "title": "Sox Effects", + "type": "array" + } + }, + "title": "Dataset", + "type": "object" + }, + "FastSpeech2ModelConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "encoder": { + "allOf": [ + { + "$ref": "#/$defs/ConformerConfig" + } + ], + "description": "The configuration of the encoder module." + }, + "decoder": { + "allOf": [ + { + "$ref": "#/$defs/ConformerConfig" + } + ], + "description": "The configuration of the decoder module." + }, + "variance_predictors": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictors" + } + ], + "description": "Configuration for energy, duration, and pitch variance predictors." + }, + "target_text_representation_level": { + "allOf": [ + { + "$ref": "#/$defs/TargetTrainingTextRepresentationLevel" + } + ], + "default": "characters" + }, + "learn_alignment": { + "default": true, + "description": "Whether to jointly learn alignments using monotonic alignment search module (See Badlani et al. 2021: https://arxiv.org/abs/2108.10447). If set to False, you will have to provide text/audio alignments separately before training a text-to-spec (feature prediction) model.", + "title": "Learn Alignment", + "type": "boolean" + }, + "max_length": { + "default": 1000, + "description": "The maximum length (i.e. number of symbols) for text inputs.", + "title": "Max Length", + "type": "integer" + }, + "mel_loss": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLossEnum" + } + ], + "default": "mse", + "description": "The loss function to use when calculating Mel spectrogram loss." + }, + "use_postnet": { + "default": true, + "description": "Whether to use a postnet module.", + "title": "Use Postnet", + "type": "boolean" + }, + "multilingual": { + "default": false, + "description": "Whether to train a multilingual model. 
For this to work, your filelist must contain a column/field for 'language' with values for each utterance.", + "title": "Multilingual", + "type": "boolean" + }, + "multispeaker": { + "default": false, + "description": "Whether to train a multispeaker model. For this to work, your filelist must contain a column/field for 'speaker' with values for each utterance.", + "title": "Multispeaker", + "type": "boolean" + } + }, + "title": "FastSpeech2ModelConfig", + "type": "object" + }, + "FastSpeech2TrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. 
Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": "path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." 
+ }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "use_weighted_sampler": { + "default": false, + "description": "Whether to use a sampler which oversamples from the minority language or speaker class for balanced training.", + "title": "Use Weighted Sampler", + "type": "boolean" + }, + "optimizer": { + "allOf": [ + { + "$ref": "#/$defs/NoamOptimizer" + } + ], + "default": { + "learning_rate": 0.001, + "eps": 1e-08, + "weight_decay": 1e-06, + "betas": [ + 0.9, + 0.999 + ], + "name": "noam", + "warmup_steps": 1000 + }, + "description": "The optimizer to use during training." + }, + "vocoder_path": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Vocoder Path" + }, + "mel_loss_weight": { + "default": 1.0, + "description": "Multiply the spec loss by this weight", + "title": "Mel Loss Weight", + "type": "number" + }, + "postnet_loss_weight": { + "default": 1.0, + "description": "Multiply the postnet loss by this weight", + "title": "Postnet Loss Weight", + "type": "number" + }, + "pitch_loss_weight": { + "default": 0.1, + "description": "Multiply the pitch loss by this weight", + "title": "Pitch Loss Weight", + "type": "number" + }, + "energy_loss_weight": { + "default": 0.1, + "description": "Multiply the energy loss by this weight", + "title": "Energy Loss Weight", + "type": "number" + }, + "duration_loss_weight": { + "default": 0.1, + "description": "Multiply the duration loss by this weight", + "title": "Duration Loss Weight", + "type": "number" + }, + "attn_ctc_loss_weight": { + "default": 0.1, + "description": "Multiply the Attention CTC loss by this weight", 
+ "title": "Attn Ctc Loss Weight", + "type": "number" + }, + "attn_bin_loss_weight": { + "default": 0.1, + "description": "Multiply the Attention Binarization loss by this weight", + "title": "Attn Bin Loss Weight", + "type": "number" + }, + "attn_bin_loss_warmup_epochs": { + "default": 100, + "description": "Scale the Attention Binarization loss by (current_epoch / attn_bin_loss_warmup_epochs) until the number of epochs defined by attn_bin_loss_warmup_epochs is reached.", + "minimum": 1, + "title": "Attn Bin Loss Warmup Epochs", + "type": "integer" + } + }, + "title": "FastSpeech2TrainingConfig", + "type": "object" + }, + "LoggerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "The logger configures all the information needed for where to store your experiment's logs and checkpoints.\nThe structure of your logs will then be:\n / / \n will be generated by calling each time the LoggerConfig is constructed.", + "properties": { + "name": { + "default": "BaseExperiment", + "description": "The name of the experiment. The structure of your logs will be / / .", + "title": "Experiment Name", + "type": "string" + }, + "save_dir": { + "default": "logs_and_checkpoints", + "description": "The directory to save your checkpoints and logs to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "sub_dir_callable": { + "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be / / where is a timestamp.", + "title": "Sub Dir Callable", + "type": "string" + }, + "version": { + "default": "base", + "description": "The version of your experiment. 
The structure of your logs will be / / .", + "title": "Version", + "type": "string" + } + }, + "title": "LoggerConfig", + "type": "object" + }, + "NoamOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the Adam Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "noam", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + }, + "warmup_steps": { + "default": 1000, + "description": "The number of steps to increase the learning rate before starting to decrease it.", + "title": "Warmup Steps", + "type": "integer" + } + }, + "title": "NoamOptimizer", + "type": "object" + }, + "PreprocessingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "dataset": { + "default": "YourDataSet", + "description": "The name of the dataset.", + "title": "Dataset", + "type": "string" + }, + "train_split": { + "default": 0.9, + "description": "The amount of the dataset to use for training. The rest will be used as validation. 
Hold some of the validation set out for a test set if you are performing experiments.", + "maximum": 1.0, + "minimum": 0.0, + "title": "Train Split", + "type": "number" + }, + "dataset_split_seed": { + "default": 1234, + "description": "The seed to use when splitting the dataset into train and validation sets.", + "title": "Dataset Split Seed", + "type": "integer" + }, + "save_dir": { + "default": "preprocessed/YourDataSet", + "description": "The directory to save preprocessed files to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "audio": { + "allOf": [ + { + "$ref": "#/$defs/AudioConfig" + } + ], + "description": "Configuration settings for audio." + }, + "path_to_audio_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path to an audio configuration file.", + "title": "Path To Audio Config File" + }, + "source_data": { + "description": "A list of datasets.", + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Source Data", + "type": "array" + } + }, + "title": "PreprocessingConfig", + "type": "object" + }, + "Punctuation": { + "properties": { + "exclamations": { + "default": [ + "!", + "\u00a1" + ], + "description": "Exclamation punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Exclamations", + "type": "array" + }, + "question_symbols": { + "default": [ + "?", + "\u00bf" + ], + "description": "Question/interrogative punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Question Symbols", + "type": "array" + }, + "quotemarks": { + "default": [ + "\"", + "'", + "\u201c", + "\u201d", + "\u00ab", + "\u00bb" + ], + "description": "Quotemark punctuation symbols used in your datasets. 
Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Quotemarks", + "type": "array" + }, + "big_breaks": { + "default": [ + ".", + ":", + ";" + ], + "description": "Punctuation symbols indicating a 'big break' used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Big Breaks", + "type": "array" + }, + "small_breaks": { + "default": [ + ",", + "-", + "\u2014" + ], + "description": "Punctuation symbols indicating a 'small break' used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Small Breaks", + "type": "array" + }, + "ellipsis": { + "default": [ + "\u2026" + ], + "description": "Punctuation symbols indicating an ellipsis used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Ellipsis", + "type": "array" + } + }, + "title": "Punctuation", + "type": "object" + }, + "Symbols": { + "additionalProperties": true, + "properties": { + "silence": { + "default": [ + "" + ], + "description": "The symbol(s) used to indicate silence.", + "items": { + "type": "string" + }, + "title": "Silence", + "type": "array" + }, + "punctuation": { + "allOf": [ + { + "$ref": "#/$defs/Punctuation" + } + ], + "description": "EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training." 
+ } + }, + "title": "Symbols", + "type": "object" + }, + "TargetTrainingTextRepresentationLevel": { + "enum": [ + "characters", + "phones", + "phonological_features" + ], + "title": "TargetTrainingTextRepresentationLevel", + "type": "string" + }, + "TextConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "symbols": { + "$ref": "#/$defs/Symbols" + }, + "to_replace": { + "additionalProperties": { + "type": "string" + }, + "default": {}, + "title": "To Replace", + "type": "object" + }, + "cleaners": { + "items": { + "type": "string" + }, + "title": "Cleaners", + "type": "array" + } + }, + "title": "TextConfig", + "type": "object" + }, + "VarianceLevelEnum": { + "enum": [ + "phone", + "frame" + ], + "title": "VarianceLevelEnum", + "type": "string" + }, + "VarianceLossEnum": { + "enum": [ + "mse", + "mae" + ], + "title": "VarianceLossEnum", + "type": "string" + }, + "VariancePredictorBase": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "loss": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLossEnum" + } + ], + "default": "mse", + "description": "The loss function to use when calculating variance loss. Either 'mse' or 'mae'." + }, + "n_layers": { + "default": 5, + "description": "The number of layers in the variance predictor module.", + "title": "N Layers", + "type": "integer" + }, + "kernel_size": { + "default": 3, + "description": "The kernel size of each convolutional layer in the variance predictor module.", + "title": "Kernel Size", + "type": "integer" + }, + "dropout": { + "default": 0.5, + "description": "The amount of dropout to apply.", + "title": "Dropout", + "type": "number" + }, + "input_dim": { + "default": 256, + "description": "The number of hidden dimensions in the input. 
This must match the input_dim value declared in the encoder and decoder modules.", + "title": "Input Dim", + "type": "integer" + }, + "n_bins": { + "default": 256, + "description": "The number of bins to use in the variance predictor module.", + "title": "N Bins", + "type": "integer" + }, + "depthwise": { + "default": true, + "description": "Whether to use depthwise separable convolutions.", + "title": "Depthwise", + "type": "boolean" + } + }, + "title": "VariancePredictorBase", + "type": "object" + }, + "VariancePredictorConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "loss": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLossEnum" + } + ], + "default": "mse", + "description": "The loss function to use when calculating variance loss. Either 'mse' or 'mae'." + }, + "n_layers": { + "default": 5, + "description": "The number of layers in the variance predictor module.", + "title": "N Layers", + "type": "integer" + }, + "kernel_size": { + "default": 3, + "description": "The kernel size of each convolutional layer in the variance predictor module.", + "title": "Kernel Size", + "type": "integer" + }, + "dropout": { + "default": 0.5, + "description": "The amount of dropout to apply.", + "title": "Dropout", + "type": "number" + }, + "input_dim": { + "default": 256, + "description": "The number of hidden dimensions in the input. 
This must match the input_dim value declared in the encoder and decoder modules.", + "title": "Input Dim", + "type": "integer" + }, + "n_bins": { + "default": 256, + "description": "The number of bins to use in the variance predictor module.", + "title": "N Bins", + "type": "integer" + }, + "depthwise": { + "default": true, + "description": "Whether to use depthwise separable convolutions.", + "title": "Depthwise", + "type": "boolean" + }, + "level": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLevelEnum" + } + ], + "default": "phone", + "description": "The level for the variance predictor to use. 'frame' will make predictions at the frame level. 'phone' will average predictions across all frames in each phone." + } + }, + "title": "VariancePredictorConfig", + "type": "object" + }, + "VariancePredictors": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "energy": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictorConfig" + } + ], + "description": "The variance predictor for energy" + }, + "duration": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictorBase" + } + ], + "description": "The variance predictor for duration" + }, + "pitch": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictorConfig" + } + ], + "description": "The variance predictor for pitch" + } + }, + "title": "VariancePredictors", + "type": "object" + } + }, + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/FastSpeech2ModelConfig" + } + ], + "description": "The model configuration settings." 
+ }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/FastSpeech2TrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." + }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + }, + "text": { + "allOf": [ + { + "$ref": "#/$defs/TextConfig" + } + ], + "description": "The text configuration." 
+ }, + "path_to_text_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a text configuration file.", + "title": "Path To Text Config File" + } + }, + "required": [ + "contact" + ], + "title": "FastSpeech2Config", + "type": "object" +} diff --git a/everyvoice/.schema/everyvoice-text-to-wav-0.2.json b/everyvoice/.schema/everyvoice-text-to-wav-0.2.json new file mode 100644 index 00000000..7561ee97 --- /dev/null +++ b/everyvoice/.schema/everyvoice-text-to-wav-0.2.json @@ -0,0 +1,2567 @@ +{ + "$defs": { + "AdamOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the Adam Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "adam", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "AdamOptimizer", + "type": "object" + }, + "AdamWOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. 
The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the AdamW Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "adamw", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "AdamWOptimizer", + "type": "object" + }, + "AlignerConfigNoContact": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "anyOf": [ + { + "$ref": "#/$defs/ContactInformation" + }, + { + "type": "null" + } + ], + "default": null + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerModelConfig" + } + ], + "description": "The model configuration settings." + }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerTrainingConfig" + } + ], + "description": "The training configuration hyperparameters." 
+ }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." + }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + }, + "text": { + "$ref": "#/$defs/TextConfig" + }, + "path_to_text_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Text Config File" + } + }, + "title": "AlignerConfigNoContact", + "type": "object" + }, + "AudioConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "min_audio_length": { + "default": 0.4, + "description": "The minimum length of an audio sample in seconds. Audio shorter than this will be ignored during preprocessing.", + "title": "Min Audio Length", + "type": "number" + }, + "max_audio_length": { + "default": 11.0, + "description": "The maximum length of an audio sample in seconds. Audio longer than this will be ignored during preprocessing. Increasing the max_audio_length will result in larger memory usage. If you are running out of memory, consider lowering the max_audio_length.", + "title": "Max Audio Length", + "type": "number" + }, + "max_wav_value": { + "default": 32767.0, + "description": "Advanced. The maximum value allowed to be in your wav files. 
For 16-bit audio, this should be (2**16)/2 - 1.", + "title": "Max Wav Value", + "type": "number" + }, + "input_sampling_rate": { + "default": 22050, + "description": "The sampling rate describes the number of samples per second of audio. The 'input_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the spectrograms predicted by your text-to-spec model will also be calculated from audio at this sampling rate. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Input Sampling Rate", + "type": "integer" + }, + "output_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'output_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the wav files generated by your vocoder or spec-to-wav model will be at this sampling rate. If you change this value, you will also need to change the upsample rates in your vocoder. Your audio will automatically be re-sampled during preprocessing.", + "title": "Output Sampling Rate", + "type": "integer" + }, + "alignment_sampling_rate": { + "default": 22050, + "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'alignment_sampling_rate' describes the sampling rate used when training an alignment model. If you change this value, your audio will automatically be re-sampled during preprocessing.", + "title": "Alignment Sampling Rate", + "type": "integer" + }, + "target_bit_depth": { + "default": 16, + "description": "Advanced. This is the bit depth of each sample in your audio files.", + "title": "Target Bit Depth", + "type": "integer" + }, + "n_fft": { + "default": 1024, + "description": "Advanced. This is the number of bins used by the Fast Fourier Transform (FFT).", + "title": "FFT Size", + "type": "integer" + }, + "fft_window_size": { + "default": 1024, + "description": "Advanced. 
This is the window size used by the Fast Fourier Transform (FFT).", + "title": "FFT Window Size", + "type": "integer" + }, + "fft_hop_size": { + "default": 256, + "description": "Advanced. This is the hop size for calculating the Short-Time Fourier Transform (STFT) which calculates a sequence of spectrograms from a single audio file. Another way of putting it is that the hop size is equal to the amount of non-intersecting samples from the audio in each spectrogram.", + "title": "FFT Hop Size", + "type": "integer" + }, + "f_min": { + "default": 0, + "description": "Advanced. This is the minimum frequency for the lowest frequency bin when calculating the spectrogram.", + "title": "Minimum Frequency", + "type": "integer" + }, + "f_max": { + "default": 8000, + "description": "Advanced. This is the maximum frequency for the highest frequency bin when calculating the spectrogram.", + "title": "Maximum Frequency", + "type": "integer" + }, + "n_mels": { + "default": 80, + "description": "Advanced. This is the number of filters in the Mel-scale spaced filterbank.", + "title": "Number of Mel bins", + "type": "integer" + }, + "spec_type": { + "anyOf": [ + { + "$ref": "#/$defs/AudioSpecTypeEnum" + }, + { + "type": "string" + } + ], + "default": "mel-librosa", + "description": "Advanced. Defines how to calculate the spectrogram. 'mel' uses the TorchAudio implementation for a Mel spectrogram. 'mel-librosa' uses Librosa's implementation. 'linear' calculates a non-Mel linear spectrogram and 'raw' calculates a complex-valued spectrogram. 'linear' and 'raw' are not currently supported by EveryVoice. We recommend using 'mel-librosa'.", + "title": "Spec Type" + }, + "vocoder_segment_size": { + "default": 8192, + "description": "Advanced. The vocoder, or spec-to-wav model is trained by sampling random fixed-size sections of the audio. 
This value specifies the number of samples in those sections.", + "title": "Vocoder Segment Size", + "type": "integer" + } + }, + "title": "AudioConfig", + "type": "object" + }, + "AudioSpecTypeEnum": { + "enum": [ + "mel", + "mel-librosa", + "linear", + "raw" + ], + "title": "AudioSpecTypeEnum", + "type": "string" + }, + "ConformerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "layers": { + "default": 4, + "description": "The number of layers in the Conformer.", + "title": "Layers", + "type": "integer" + }, + "heads": { + "default": 2, + "description": "The number of heads in the multi-headed attention modules.", + "title": "Heads", + "type": "integer" + }, + "input_dim": { + "default": 256, + "description": "The number of hidden dimensions in the input. The input_dim value declared in the encoder and decoder modules must match the input_dim value declared in each variance predictor module.", + "title": "Input Dim", + "type": "integer" + }, + "feedforward_dim": { + "default": 1024, + "description": "The number of dimensions in the feedforward layers.", + "title": "Feedforward Dim", + "type": "integer" + }, + "conv_kernel_size": { + "default": 9, + "description": "The size of the kernel in each convolutional layer of the Conformer.", + "title": "Conv Kernel Size", + "type": "integer" + }, + "dropout": { + "default": 0.2, + "description": "The amount of dropout to apply.", + "title": "Dropout", + "type": "number" + } + }, + "title": "ConformerConfig", + "type": "object" + }, + "ContactInformation": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact_name": { + "description": "The name of the contact person or organization responsible for answering questions related to this model.", + "title": "Contact Name", + "type": "string" + }, + "contact_email": { + "description": "The email address of the contact person or organization 
responsible for answering questions related to this model.", + "format": "email", + "title": "Contact Email", + "type": "string" + } + }, + "required": [ + "contact_name", + "contact_email" + ], + "title": "ContactInformation", + "type": "object" + }, + "DFAlignerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerModelConfig" + } + ], + "description": "The model configuration settings." + }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerTrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." 
+ }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + }, + "text": { + "$ref": "#/$defs/TextConfig" + }, + "path_to_text_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Text Config File" + } + }, + "required": [ + "contact" + ], + "title": "DFAlignerConfig", + "type": "object" + }, + "DFAlignerExtractionMethod": { + "enum": [ + "beam", + "dijkstra" + ], + "title": "DFAlignerExtractionMethod", + "type": "string" + }, + "DFAlignerModelConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "target_text_representation_level": { + "allOf": [ + { + "$ref": "#/$defs/TargetTrainingTextRepresentationLevel" + } + ], + "default": "characters" + }, + "lstm_dim": { + "default": 512, + "description": "The number of dimensions in the LSTM layers.", + "title": "Lstm Dim", + "type": "integer" + }, + "conv_dim": { + "default": 512, + "description": "The number of dimensions in the convolutional layers.", + "title": "Conv Dim", + "type": "integer" + } + }, + "title": "DFAlignerModelConfig", + "type": "object" + }, + "DFAlignerTrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. 
If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. 
Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": "path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." 
+ }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "optimizer": { + "anyOf": [ + { + "$ref": "#/$defs/AdamOptimizer" + }, + { + "$ref": "#/$defs/AdamWOptimizer" + } + ], + "description": "Optimizer configuration settings.", + "title": "Optimizer" + }, + "binned_sampler": { + "default": true, + "description": "Use a binned length sampler", + "title": "Binned Sampler", + "type": "boolean" + }, + "plot_steps": { + "default": 1000, + "description": "The maximum number of steps to plot", + "title": "Plot Steps", + "type": "integer" + }, + "extraction_method": { + "allOf": [ + { + "$ref": "#/$defs/DFAlignerExtractionMethod" + } + ], + "default": "dijkstra", + "description": "The alignment extraction algorithm to use. 'beam' will be quicker but possibly less accurate than 'dijkstra'" + } + }, + "title": "DFAlignerTrainingConfig", + "type": "object" + }, + "Dataset": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "label": { + "default": "YourDataSet", + "description": "A label for the source of data", + "title": "Label", + "type": "string" + }, + "permissions_obtained": { + "default": false, + "description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. 
The speaker should be aware and consent to their data being used in this way.", + "title": "Permissions Obtained", + "type": "boolean" + }, + "data_dir": { + "default": "/please/create/a/path/to/your/dataset/data", + "description": "The path to the directory with your audio files.", + "format": "path", + "title": "Data Dir", + "type": "string" + }, + "filelist": { + "default": "/please/create/a/path/to/your/dataset/filelist", + "description": "The path to your dataset's filelist.", + "format": "path", + "title": "Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The file-loader function to use to load your dataset's filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "sox_effects": { + "default": [ + [ + "channels", + "1" + ] + ], + "description": "Advanced. A list of SoX effects to apply to your audio prior to preprocessing. Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.", + "items": {}, + "title": "Sox Effects", + "type": "array" + } + }, + "title": "Dataset", + "type": "object" + }, + "E2ETrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. 
By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": 
"path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." + }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "feature_prediction_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Feature Prediction Checkpoint" + }, + "vocoder_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Vocoder Checkpoint" + } + }, + "title": "E2ETrainingConfig", + "type": "object" + }, + "FastSpeech2Config": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/FastSpeech2ModelConfig" + } + ], + "description": "The model configuration settings." 
+ }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/FastSpeech2TrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." + }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + }, + "text": { + "allOf": [ + { + "$ref": "#/$defs/TextConfig" + } + ], + "description": "The text configuration." + }, + "path_to_text_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a text configuration file.", + "title": "Path To Text Config File" + } + }, + "required": [ + "contact" + ], + "title": "FastSpeech2Config", + "type": "object" + }, + "FastSpeech2ModelConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "encoder": { + "allOf": [ + { + "$ref": "#/$defs/ConformerConfig" + } + ], + "description": "The configuration of the encoder module." 
+ }, + "decoder": { + "allOf": [ + { + "$ref": "#/$defs/ConformerConfig" + } + ], + "description": "The configuration of the decoder module." + }, + "variance_predictors": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictors" + } + ], + "description": "Configuration for energy, duration, and pitch variance predictors." + }, + "target_text_representation_level": { + "allOf": [ + { + "$ref": "#/$defs/TargetTrainingTextRepresentationLevel" + } + ], + "default": "characters" + }, + "learn_alignment": { + "default": true, + "description": "Whether to jointly learn alignments using monotonic alignment search module (See Badlani et al. 2021: https://arxiv.org/abs/2108.10447). If set to False, you will have to provide text/audio alignments separately before training a text-to-spec (feature prediction) model.", + "title": "Learn Alignment", + "type": "boolean" + }, + "max_length": { + "default": 1000, + "description": "The maximum length (i.e. number of symbols) for text inputs.", + "title": "Max Length", + "type": "integer" + }, + "mel_loss": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLossEnum" + } + ], + "default": "mse", + "description": "The loss function to use when calculating Mel spectrogram loss." + }, + "use_postnet": { + "default": true, + "description": "Whether to use a postnet module.", + "title": "Use Postnet", + "type": "boolean" + }, + "multilingual": { + "default": false, + "description": "Whether to train a multilingual model. For this to work, your filelist must contain a column/field for 'language' with values for each utterance.", + "title": "Multilingual", + "type": "boolean" + }, + "multispeaker": { + "default": false, + "description": "Whether to train a multispeaker model. 
For this to work, your filelist must contain a column/field for 'speaker' with values for each utterance.", + "title": "Multispeaker", + "type": "boolean" + } + }, + "title": "FastSpeech2ModelConfig", + "type": "object" + }, + "FastSpeech2TrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. 
Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": "path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." 
+ }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "use_weighted_sampler": { + "default": false, + "description": "Whether to use a sampler which oversamples from the minority language or speaker class for balanced training.", + "title": "Use Weighted Sampler", + "type": "boolean" + }, + "optimizer": { + "allOf": [ + { + "$ref": "#/$defs/NoamOptimizer" + } + ], + "default": { + "learning_rate": 0.001, + "eps": 1e-08, + "weight_decay": 1e-06, + "betas": [ + 0.9, + 0.999 + ], + "name": "noam", + "warmup_steps": 1000 + }, + "description": "The optimizer to use during training." + }, + "vocoder_path": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Vocoder Path" + }, + "mel_loss_weight": { + "default": 1.0, + "description": "Multiply the spec loss by this weight", + "title": "Mel Loss Weight", + "type": "number" + }, + "postnet_loss_weight": { + "default": 1.0, + "description": "Multiply the postnet loss by this weight", + "title": "Postnet Loss Weight", + "type": "number" + }, + "pitch_loss_weight": { + "default": 0.1, + "description": "Multiply the pitch loss by this weight", + "title": "Pitch Loss Weight", + "type": "number" + }, + "energy_loss_weight": { + "default": 0.1, + "description": "Multiply the energy loss by this weight", + "title": "Energy Loss Weight", + "type": "number" + }, + "duration_loss_weight": { + "default": 0.1, + "description": "Multiply the duration loss by this weight", + "title": "Duration Loss Weight", + "type": "number" + }, + "attn_ctc_loss_weight": { + "default": 0.1, + "description": "Multiply the Attention CTC loss by this weight", 
+ "title": "Attn Ctc Loss Weight", + "type": "number" + }, + "attn_bin_loss_weight": { + "default": 0.1, + "description": "Multiply the Attention Binarization loss by this weight", + "title": "Attn Bin Loss Weight", + "type": "number" + }, + "attn_bin_loss_warmup_epochs": { + "default": 100, + "description": "Scale the Attention Binarization loss by (current_epoch / attn_bin_loss_warmup_epochs) until the number of epochs defined by attn_bin_loss_warmup_epochs is reached.", + "minimum": 1, + "title": "Attn Bin Loss Warmup Epochs", + "type": "integer" + } + }, + "title": "FastSpeech2TrainingConfig", + "type": "object" + }, + "FeaturePredictionConfigNoContact": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "anyOf": [ + { + "$ref": "#/$defs/ContactInformation" + }, + { + "type": "null" + } + ], + "default": null + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/FastSpeech2ModelConfig" + } + ], + "description": "The model configuration settings." + }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/FastSpeech2TrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." 
+ }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + }, + "text": { + "allOf": [ + { + "$ref": "#/$defs/TextConfig" + } + ], + "description": "The text configuration." + }, + "path_to_text_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a text configuration file.", + "title": "Path To Text Config File" + } + }, + "title": "FeaturePredictionConfigNoContact", + "type": "object" + }, + "HiFiGANConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANModelConfig" + } + ], + "description": "The model configuration settings." + }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANTrainingConfig" + } + ], + "description": "The training configuration hyperparameters." 
+ }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." + }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + } + }, + "required": [ + "contact" + ], + "title": "HiFiGANConfig", + "type": "object" + }, + "HiFiGANModelConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "resblock": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANResblock" + } + ], + "default": "1", + "description": "Which resblock to use. See Kong et. al. 
2020: https://arxiv.org/abs/2010.05646" + }, + "upsample_rates": { + "default": [ + 8, + 8, + 2, + 2 + ], + "description": "The stride of each convolutional layer in the upsampling module.", + "items": { + "type": "integer" + }, + "title": "Upsample Rates", + "type": "array" + }, + "upsample_kernel_sizes": { + "default": [ + 16, + 16, + 4, + 4 + ], + "description": "The kernel size of each convolutional layer in the upsampling module.", + "items": { + "type": "integer" + }, + "title": "Upsample Kernel Sizes", + "type": "array" + }, + "upsample_initial_channel": { + "default": 512, + "description": "The number of dimensions to project the Mel inputs to before being passed to the resblock.", + "title": "Upsample Initial Channel", + "type": "integer" + }, + "resblock_kernel_sizes": { + "default": [ + 3, + 7, + 11 + ], + "description": "The kernel size of each convolutional layer in the resblock.", + "items": { + "type": "integer" + }, + "title": "Resblock Kernel Sizes", + "type": "array" + }, + "resblock_dilation_sizes": { + "default": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "description": "The dilations of each convolution in each layer of the resblock.", + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "title": "Resblock Dilation Sizes", + "type": "array" + }, + "activation_function": { + "description": "The activation function to use.", + "title": "Activation Function", + "type": "string" + }, + "istft_layer": { + "default": false, + "description": "Whether to predict phase and magnitude values and use an inverse Short-Time Fourier Transform instead of predicting a waveform directly. See Kaneko et. al. 
2022: https://arxiv.org/abs/2203.02395", + "title": "Istft Layer", + "type": "boolean" + }, + "msd_layers": { + "default": 3, + "description": "The number of layers to use in the Multi-Scale Discriminator.", + "title": "Msd Layers", + "type": "integer" + }, + "mpd_layers": { + "default": [ + 2, + 3, + 5, + 7, + 11 + ], + "description": "The size of each layer in the Multi-Period Discriminator.", + "items": { + "type": "integer" + }, + "title": "Mpd Layers", + "type": "array" + } + }, + "title": "HiFiGANModelConfig", + "type": "object" + }, + "HiFiGANResblock": { + "enum": [ + "1", + "2" + ], + "title": "HiFiGANResblock", + "type": "string" + }, + "HiFiGANTrainTypes": { + "enum": [ + "original", + "wgan" + ], + "title": "HiFiGANTrainTypes", + "type": "string" + }, + "HiFiGANTrainingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "batch_size": { + "default": 16, + "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.", + "title": "Batch Size", + "type": "integer" + }, + "save_top_k_ckpts": { + "default": 5, + "description": "The number of checkpoints to save.", + "title": "Save Top K Ckpts", + "type": "integer" + }, + "ckpt_steps": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter", + "title": "Ckpt Steps" + }, + "ckpt_epochs": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "The interval (in epochs) for saving a checkpoint. 
You can also save checkpoints after n steps by using 'ckpt_steps'", + "title": "Ckpt Epochs" + }, + "val_check_interval": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 500, + "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an int to check after a fixed number of training batches.", + "title": "Val Check Interval" + }, + "check_val_every_n_epoch": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Run validation after every n epochs. Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training", + "title": "Check Val Every N Epoch" + }, + "max_epochs": { + "default": 1000, + "description": "Stop training after this many epochs", + "title": "Max Epochs", + "type": "integer" + }, + "max_steps": { + "default": 100000, + "description": "Stop training after this many steps", + "title": "Max Steps", + "type": "integer" + }, + "finetune_checkpoint": { + "anyOf": [ + { + "format": "path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Automatically resume training from a checkpoint loaded from this path.", + "title": "Finetune Checkpoint" + }, + "training_filelist": { + "default": "path/to/your/preprocessed/training_filelist.psv", + "description": "The path to a filelist containing samples belonging to your training set.", + "format": "path", + "title": "Training Filelist", + "type": "string" + }, + "validation_filelist": { + "default": "path/to/your/preprocessed/validation_filelist.psv", + "description": "The path to a filelist containing samples belonging to your validation set.", + "format": "path", + "title": "Validation Filelist", + "type": "string" + }, + "filelist_loader": { + "description": "Advanced. 
The function to use to load the filelist.", + "title": "Filelist Loader", + "type": "string" + }, + "logger": { + "allOf": [ + { + "$ref": "#/$defs/LoggerConfig" + } + ], + "description": "The configuration for the logger." + }, + "val_data_workers": { + "default": 0, + "description": "The number of CPU workers to use when loading data during validation.", + "title": "Val Data Workers", + "type": "integer" + }, + "train_data_workers": { + "default": 4, + "description": "The number of CPU workers to use when loading data during training.", + "title": "Train Data Workers", + "type": "integer" + }, + "generator_warmup_steps": { + "default": 0, + "description": "The number of steps to run through before activating the discriminators.", + "title": "Generator Warmup Steps", + "type": "integer" + }, + "gan_type": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANTrainTypes" + } + ], + "default": "original", + "description": "The type of GAN to use. Can be set to either 'original' for a vanilla GAN, or 'wgan' for a Wasserstein GAN that clips gradients." + }, + "optimizer": { + "anyOf": [ + { + "$ref": "#/$defs/AdamOptimizer" + }, + { + "$ref": "#/$defs/AdamWOptimizer" + }, + { + "$ref": "#/$defs/RMSOptimizer" + } + ], + "description": "Configuration settings for the optimizer.", + "title": "Optimizer" + }, + "wgan_clip_value": { + "default": 0.01, + "description": "The gradient clip value when gan_type='wgan'.", + "title": "Wgan Clip Value", + "type": "number" + }, + "use_weighted_sampler": { + "default": false, + "description": "Whether to use a sampler which oversamples from the minority language or speaker class for balanced training.", + "title": "Use Weighted Sampler", + "type": "boolean" + }, + "finetune": { + "default": false, + "description": "Whether to read spectrograms from 'preprocessed/synthesized_spec' instead of 'preprocessed/spec'. 
This is used when finetuning a pretrained spec-to-wav (vocoder) model using the outputs of a trained text-to-spec (feature prediction network) model.", + "title": "Finetune", + "type": "boolean" + } + }, + "title": "HiFiGANTrainingConfig", + "type": "object" + }, + "LoggerConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "The logger configures all the information needed for where to store your experiment's logs and checkpoints.\nThe structure of your logs will then be:\n / / \n will be generated by calling each time the LoggerConfig is constructed.", + "properties": { + "name": { + "default": "BaseExperiment", + "description": "The name of the experiment. The structure of your logs will be / / .", + "title": "Experiment Name", + "type": "string" + }, + "save_dir": { + "default": "logs_and_checkpoints", + "description": "The directory to save your checkpoints and logs to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "sub_dir_callable": { + "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be / / where is a timestamp.", + "title": "Sub Dir Callable", + "type": "string" + }, + "version": { + "default": "base", + "description": "The version of your experiment. The structure of your logs will be / / .", + "title": "Version", + "type": "string" + } + }, + "title": "LoggerConfig", + "type": "object" + }, + "NoamOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. 
The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "betas": { + "default": [ + 0.9, + 0.98 + ], + "description": "Advanced. The values of the Adam Optimizer beta coefficients.", + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "title": "Betas", + "type": "array" + }, + "name": { + "default": "noam", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + }, + "warmup_steps": { + "default": 1000, + "description": "The number of steps to increase the learning rate before starting to decrease it.", + "title": "Warmup Steps", + "type": "integer" + } + }, + "title": "NoamOptimizer", + "type": "object" + }, + "PreprocessingConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "dataset": { + "default": "YourDataSet", + "description": "The name of the dataset.", + "title": "Dataset", + "type": "string" + }, + "train_split": { + "default": 0.9, + "description": "The amount of the dataset to use for training. The rest will be used as validation. Hold some of the validation set out for a test set if you are performing experiments.", + "maximum": 1.0, + "minimum": 0.0, + "title": "Train Split", + "type": "number" + }, + "dataset_split_seed": { + "default": 1234, + "description": "The seed to use when splitting the dataset into train and validation sets.", + "title": "Dataset Split Seed", + "type": "integer" + }, + "save_dir": { + "default": "preprocessed/YourDataSet", + "description": "The directory to save preprocessed files to.", + "format": "path", + "title": "Save Dir", + "type": "string" + }, + "audio": { + "allOf": [ + { + "$ref": "#/$defs/AudioConfig" + } + ], + "description": "Configuration settings for audio." 
+ }, + "path_to_audio_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path to an audio configuration file.", + "title": "Path To Audio Config File" + }, + "source_data": { + "description": "A list of datasets.", + "items": { + "$ref": "#/$defs/Dataset" + }, + "title": "Source Data", + "type": "array" + } + }, + "title": "PreprocessingConfig", + "type": "object" + }, + "Punctuation": { + "properties": { + "exclamations": { + "default": [ + "!", + "\u00a1" + ], + "description": "Exclamation punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Exclamations", + "type": "array" + }, + "question_symbols": { + "default": [ + "?", + "\u00bf" + ], + "description": "Question/interrogative punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Question Symbols", + "type": "array" + }, + "quotemarks": { + "default": [ + "\"", + "'", + "\u201c", + "\u201d", + "\u00ab", + "\u00bb" + ], + "description": "Quotemark punctuation symbols used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Quotemarks", + "type": "array" + }, + "big_breaks": { + "default": [ + ".", + ":", + ";" + ], + "description": "Punctuation symbols indicating a 'big break' used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Big Breaks", + "type": "array" + }, + "small_breaks": { + "default": [ + ",", + "-", + "\u2014" + ], + "description": "Punctuation symbols indicating a 'small break' used in your datasets. 
Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Small Breaks", + "type": "array" + }, + "ellipsis": { + "default": [ + "\u2026" + ], + "description": "Punctuation symbols indicating an ellipsis used in your datasets. Replaces these symbols with internally.", + "items": { + "type": "string" + }, + "title": "Ellipsis", + "type": "array" + } + }, + "title": "Punctuation", + "type": "object" + }, + "RMSOptimizer": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "learning_rate": { + "default": 0.0001, + "description": "The initial learning rate to use", + "title": "Learning Rate", + "type": "number" + }, + "eps": { + "default": 1e-08, + "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.", + "title": "Eps", + "type": "number" + }, + "weight_decay": { + "default": 0.01, + "title": "Weight Decay", + "type": "number" + }, + "alpha": { + "default": 0.99, + "description": "Advanced. The value of RMSProp optimizer alpha smoothing constant.", + "title": "Alpha", + "type": "number" + }, + "name": { + "default": "rms", + "description": "The name of the optimizer to use.", + "title": "Name", + "type": "string" + } + }, + "title": "RMSOptimizer", + "type": "object" + }, + "Symbols": { + "additionalProperties": true, + "properties": { + "silence": { + "default": [ + "" + ], + "description": "The symbol(s) used to indicate silence.", + "items": { + "type": "string" + }, + "title": "Silence", + "type": "array" + }, + "punctuation": { + "allOf": [ + { + "$ref": "#/$defs/Punctuation" + } + ], + "description": "EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training." 
+ } + }, + "title": "Symbols", + "type": "object" + }, + "TargetTrainingTextRepresentationLevel": { + "enum": [ + "characters", + "phones", + "phonological_features" + ], + "title": "TargetTrainingTextRepresentationLevel", + "type": "string" + }, + "TextConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "symbols": { + "$ref": "#/$defs/Symbols" + }, + "to_replace": { + "additionalProperties": { + "type": "string" + }, + "default": {}, + "title": "To Replace", + "type": "object" + }, + "cleaners": { + "items": { + "type": "string" + }, + "title": "Cleaners", + "type": "array" + } + }, + "title": "TextConfig", + "type": "object" + }, + "VarianceLevelEnum": { + "enum": [ + "phone", + "frame" + ], + "title": "VarianceLevelEnum", + "type": "string" + }, + "VarianceLossEnum": { + "enum": [ + "mse", + "mae" + ], + "title": "VarianceLossEnum", + "type": "string" + }, + "VariancePredictorBase": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "loss": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLossEnum" + } + ], + "default": "mse", + "description": "The loss function to use when calculate variance loss. Either 'mse' or 'mae'." + }, + "n_layers": { + "default": 5, + "description": "The number of layers in the variance predictor module.", + "title": "N Layers", + "type": "integer" + }, + "kernel_size": { + "default": 3, + "description": "The kernel size of each convolutional layer in the variance predictor module.", + "title": "Kernel Size", + "type": "integer" + }, + "dropout": { + "default": 0.5, + "description": "The amount of dropout to apply.", + "title": "Dropout", + "type": "number" + }, + "input_dim": { + "default": 256, + "description": "The number of hidden dimensions in the input. 
This must match the input_dim value declared in the encoder and decoder modules.", + "title": "Input Dim", + "type": "integer" + }, + "n_bins": { + "default": 256, + "description": "The number of bins to use in the variance predictor module.", + "title": "N Bins", + "type": "integer" + }, + "depthwise": { + "default": true, + "description": "Whether to use depthwise separable convolutions.", + "title": "Depthwise", + "type": "boolean" + } + }, + "title": "VariancePredictorBase", + "type": "object" + }, + "VariancePredictorConfig": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "loss": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLossEnum" + } + ], + "default": "mse", + "description": "The loss function to use when calculate variance loss. Either 'mse' or 'mae'." + }, + "n_layers": { + "default": 5, + "description": "The number of layers in the variance predictor module.", + "title": "N Layers", + "type": "integer" + }, + "kernel_size": { + "default": 3, + "description": "The kernel size of each convolutional layer in the variance predictor module.", + "title": "Kernel Size", + "type": "integer" + }, + "dropout": { + "default": 0.5, + "description": "The amount of dropout to apply.", + "title": "Dropout", + "type": "number" + }, + "input_dim": { + "default": 256, + "description": "The number of hidden dimensions in the input. 
This must match the input_dim value declared in the encoder and decoder modules.", + "title": "Input Dim", + "type": "integer" + }, + "n_bins": { + "default": 256, + "description": "The number of bins to use in the variance predictor module.", + "title": "N Bins", + "type": "integer" + }, + "depthwise": { + "default": true, + "description": "Whether to use depthwise separable convolutions.", + "title": "Depthwise", + "type": "boolean" + }, + "level": { + "allOf": [ + { + "$ref": "#/$defs/VarianceLevelEnum" + } + ], + "default": "phone", + "description": "The level for the variance predictor to use. 'frame' will make predictions at the frame level. 'phone' will average predictions across all frames in each phone." + } + }, + "title": "VariancePredictorConfig", + "type": "object" + }, + "VariancePredictors": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "energy": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictorConfig" + } + ], + "description": "The variance predictor for energy" + }, + "duration": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictorBase" + } + ], + "description": "The variance predictor for duration" + }, + "pitch": { + "allOf": [ + { + "$ref": "#/$defs/VariancePredictorConfig" + } + ], + "description": "The variance predictor for pitch" + } + }, + "title": "VariancePredictors", + "type": "object" + }, + "VocoderConfigNoContact": { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "anyOf": [ + { + "$ref": "#/$defs/ContactInformation" + }, + { + "type": "null" + } + ], + "default": null + }, + "VERSION": { + "title": "Version", + "type": "string" + }, + "model": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANModelConfig" + } + ], + "description": "The model configuration settings." 
+ }, + "path_to_model_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a model configuration file.", + "title": "Path To Model Config File" + }, + "training": { + "allOf": [ + { + "$ref": "#/$defs/HiFiGANTrainingConfig" + } + ], + "description": "The training configuration hyperparameters." + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a training configuration file.", + "title": "Path To Training Config File" + }, + "preprocessing": { + "allOf": [ + { + "$ref": "#/$defs/PreprocessingConfig" + } + ], + "description": "The preprocessing configuration, including information about audio settings." + }, + "path_to_preprocessing_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The path of a preprocessing configuration file.", + "title": "Path To Preprocessing Config File" + } + }, + "title": "VocoderConfigNoContact", + "type": "object" + } + }, + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "properties": { + "contact": { + "allOf": [ + { + "$ref": "#/$defs/ContactInformation" + } + ], + "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide to understand more about the importance of misuse prevention with TTS." 
+ }, + "aligner": { + "anyOf": [ + { + "$ref": "#/$defs/DFAlignerConfig" + }, + { + "$ref": "#/$defs/AlignerConfigNoContact" + } + ], + "title": "Aligner" + }, + "path_to_aligner_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Aligner Config File" + }, + "feature_prediction": { + "anyOf": [ + { + "$ref": "#/$defs/FastSpeech2Config" + }, + { + "$ref": "#/$defs/FeaturePredictionConfigNoContact" + } + ], + "title": "Feature Prediction" + }, + "path_to_feature_prediction_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Feature Prediction Config File" + }, + "vocoder": { + "anyOf": [ + { + "$ref": "#/$defs/HiFiGANConfig" + }, + { + "$ref": "#/$defs/VocoderConfigNoContact" + } + ], + "title": "Vocoder" + }, + "path_to_vocoder_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Vocoder Config File" + }, + "training": { + "$ref": "#/$defs/E2ETrainingConfig" + }, + "path_to_training_config_file": { + "anyOf": [ + { + "format": "file-path", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path To Training Config File" + } + }, + "required": [ + "contact" + ], + "title": "EveryVoiceConfig", + "type": "object" +} diff --git a/everyvoice/_version.py b/everyvoice/_version.py index 61555898..f8eb8ced 100644 --- a/everyvoice/_version.py +++ b/everyvoice/_version.py @@ -2,4 +2,4 @@ # [PEP 440 – Version Identification and Dependency Specification](https://peps.python.org/pep-0440/) # [Specifying Your Project’s Version](https://setuptools.pypa.io/en/latest/userguide/distribution.html) # [N!]N(.N)*[{a|b|rc}N][.postN][.devN] -VERSION = "0.1.0a3" +VERSION = "0.2.0a0" From 8a53f69348aa3b215201313bbf8bc75d341501f9 Mon Sep 17 00:00:00 2001 From: Samuel 
Larkin Date: Wed, 30 Oct 2024 08:46:20 -0400 Subject: [PATCH 5/7] feat: using subTest() for unittest with a for loop --- everyvoice/tests/test_cli.py | 58 +++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/everyvoice/tests/test_cli.py b/everyvoice/tests/test_cli.py index 8c1356aa..0ce73167 100644 --- a/everyvoice/tests/test_cli.py +++ b/everyvoice/tests/test_cli.py @@ -189,38 +189,42 @@ def test_update_schema(self): # Validate that schema generation works correctly. _ = self.runner.invoke(app, ["update-schemas", "-o", tmpdir]) for filename, obj in SCHEMAS_TO_OUTPUT.items(): - with open(Path(tmpdir) / filename, encoding="utf8") as f: - schema = json.load(f) - # serialize the model to json and then validate against the schema - # Some objects will require a contact key - try: - obj_instance = obj() - except ValidationError: - obj_instance = obj(contact=dummy_contact) - self.assertIsNone( - jsonschema.validate( - json.loads(obj_instance.model_dump_json()), - schema=schema, + with self.subTest(filename=filename, type=obj): + with open(Path(tmpdir) / filename, encoding="utf8") as f: + schema = json.load(f) + # serialize the model to json and then validate against the schema + # Some objects will require a contact key + try: + obj_instance = obj() + except ValidationError: + obj_instance = obj(contact=dummy_contact) + self.assertIsNone( + jsonschema.validate( + json.loads(obj_instance.model_dump_json()), + schema=schema, + ) ) - ) # Make sure the generated schemas are identical to those saved in the repo, # i.e., that we didn't change the models but forget to update the schemas. 
for filename in SCHEMAS_TO_OUTPUT: - with open(Path(tmpdir) / filename, encoding="utf8") as f: - new_schema = f.read().replace("\\\\", "/") # force paths to posix - try: - with open(EV_DIR / ".schema" / filename, encoding="utf8") as f: - saved_schema = f.read() - except FileNotFoundError as e: - raise AssertionError( - f'Schema file {filename} is missing, please run "everyvoice update-schemas".' - ) from e - self.assertEqual( - saved_schema, - new_schema, - 'Schemas are out of date, please run "everyvoice update-schemas".', - ) + with self.subTest(filename=filename): + with open(Path(tmpdir) / filename, encoding="utf8") as f: + new_schema = f.read().replace( + "\\\\", "/" + ) # force paths to posix + try: + with open(EV_DIR / ".schema" / filename, encoding="utf8") as f: + saved_schema = f.read() + except FileNotFoundError as e: + raise AssertionError( + f'Schema file {filename} is missing, please run "everyvoice update-schemas".' + ) from e + self.assertEqual( + saved_schema, + new_schema, + 'Schemas are out of date, please run "everyvoice update-schemas".', + ) # Next, but only if everything above passed, we make sure we can't overwrite # existing schemas by accident. 
From ecc1dfccbb79bd59b0cf451ecf3ef869ac435a4b Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Tue, 5 Nov 2024 15:18:56 -0500 Subject: [PATCH 6/7] feat: added TestVersion --- everyvoice/tests/test_model.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/everyvoice/tests/test_model.py b/everyvoice/tests/test_model.py index 7040de28..77744d70 100644 --- a/everyvoice/tests/test_model.py +++ b/everyvoice/tests/test_model.py @@ -3,6 +3,7 @@ import tempfile from enum import Enum from pathlib import Path +from unittest import TestCase import torch from pytorch_lightning import Trainer @@ -495,3 +496,14 @@ def test_config_newer_version(self): r"Your config was created with a newer version of EveryVoice, please update your software.", ): ConfigType(**reference.model_dump()) + + +class TestVersion(TestCase): + def test_newer_version(self): + """ + Canary test. We use packaging.version.Version to compare model's versions and config's versions. + """ + from packaging.version import Version + + self.assertFalse("10.0" > "9.0") + self.assertTrue(Version("10.0") > Version("9.0")) From 1cc510dc84f5cd73d1b075f3c12554186d26a6e8 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Wed, 23 Oct 2024 15:09:13 -0400 Subject: [PATCH 7/7] chore: submodules update --- everyvoice/model/aligner/DeepForcedAligner | 2 +- everyvoice/model/feature_prediction/FastSpeech2_lightning | 2 +- everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/everyvoice/model/aligner/DeepForcedAligner b/everyvoice/model/aligner/DeepForcedAligner index f2cec61b..132c55e1 160000 --- a/everyvoice/model/aligner/DeepForcedAligner +++ b/everyvoice/model/aligner/DeepForcedAligner @@ -1 +1 @@ -Subproject commit f2cec61bd2028449f416849c2dae0fc959d2960a +Subproject commit 132c55e13bc1166da447466a8732c31e89a4d479 diff --git a/everyvoice/model/feature_prediction/FastSpeech2_lightning b/everyvoice/model/feature_prediction/FastSpeech2_lightning index 
c17e0389..6c1dfce0 160000 --- a/everyvoice/model/feature_prediction/FastSpeech2_lightning +++ b/everyvoice/model/feature_prediction/FastSpeech2_lightning @@ -1 +1 @@ -Subproject commit c17e038983c08c7b926e3f079f13bccf5749ee02 +Subproject commit 6c1dfce04605045b2ac913475c59b1a1311f3864 diff --git a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning index 7dc68ff2..fbf198b1 160000 --- a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning +++ b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning @@ -1 +1 @@ -Subproject commit 7dc68ff2881bda9aba519ca21dda77153aad72a9 +Subproject commit fbf198b1595166a96c16863868b497a6485e1863