feat: <voice /> and MSTTS viseme extension

pykeio · Oct 16, 2023 · f64b567 · f64b567
1 parent fa91fe9
commit f64b567
Show file tree

Hide file tree

Showing 6 changed files with 243 additions and 9 deletions.
diff --git a/src/audio.rs b/src/audio.rs
@@ -20,7 +20,7 @@ pub enum AudioRepeat {
 
 /// An SSML `<audio />` element. [`Audio`] supports the insertion of recorded audio files and the insertion of other
 /// audio formats in conjunction with synthesized speech output.
-#[derive(Default, Clone)]
+#[derive(Debug, Default, Clone)]
 pub struct Audio {
 	src: String,
 	desc: Option<String>,

diff --git a/src/lib.rs b/src/lib.rs
@@ -25,17 +25,20 @@ use std::{error::Error, fmt::Debug, io::Write};
 
 mod audio;
 mod error;
+pub mod mstts;
 mod speak;
 mod text;
 mod unit;
 mod util;
+mod voice;
 
 pub(crate) use self::error::{error, GenericError};
 pub use self::{
 	audio::{audio, Audio, AudioRepeat},
 	speak::{speak, Speak, SpeakableElement},
 	text::{text, Text},
-	unit::{Decibels, DecibelsError, TimeDesignation, TimeDesignationError}
+	unit::{Decibels, DecibelsError, TimeDesignation, TimeDesignationError},
+	voice::{voice, Voice, VoiceConfig, VoiceGender}
 };
 
 /// Vendor-specific flavor of SSML. Specifying this can be used to enable compatibility checks & add additional
@@ -75,3 +78,16 @@ pub trait Serialize {
 		Ok(std::str::from_utf8(&write)?.to_owned())
 	}
 }
+
+/// A [`SpeakableElement`] that outputs a simple string.
+///
+/// It differs from [`Text`] in that the contents of `Meta` are not escaped, meaning `Meta` can be used to write raw
+/// XML into the document.
+#[derive(Debug, Clone)]
+pub struct Meta(pub String);
+
+impl Serialize for Meta {
+	fn serialize<W: Write>(&self, writer: &mut W, _: Flavor) -> Result<(), Box<dyn Error>> {
+		Ok(writer.write_all(self.0.as_bytes())?)
+	}
+}
diff --git a/src/mstts/mod.rs b/src/mstts/mod.rs
@@ -0,0 +1,53 @@
+use std::fmt::Display;
+
+use crate::{voice::Voice, Meta, SpeakableElement};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MicrosoftViseme {
+	/// Receive visemes as an ID. (equivalent to `<mstts:viseme type="redlips_front" />`)
+	ById,
+	/// Receive visemes as blend shapes.
+	FacialExpression
+}
+
+impl Display for MicrosoftViseme {
+	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+		f.write_str(match self {
+			MicrosoftViseme::ById => "redlips_front",
+			MicrosoftViseme::FacialExpression => "FacialExpression"
+		})
+	}
+}
+
+pub trait MicrosoftVoiceExt {
+	/// For ACSS, configures a [`Voice`] section to send back viseme animations in the specified format.
+	///
+	/// ```
+	/// # use ssml::{Flavor, mstts::{MicrosoftVoiceExt, MicrosoftViseme}, Serialize};
+	/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+	/// let doc = ssml::Speak::new(
+	/// 	Some("en-US"),
+	/// 	[ssml::Voice::new(
+	/// 		"en-US-JennyNeural",
+	/// 		["Rainbow has seven colors: Red, orange, yellow, green, blue, indigo, and violet."]
+	/// 	)
+	/// 	.with_mstts_viseme(MicrosoftViseme::FacialExpression)]
+	/// );
+	///
+	/// assert_eq!(
+	/// 	doc.serialize_to_string(Flavor::MicrosoftAzureCognitiveSpeechServices)?,
+	/// 	r#"<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US" xmlns:mstts="http://www.w3.org/2001/mstts"><voice name="en-US-JennyNeural"><mstts:viseme type="FacialExpression" />Rainbow has seven colors: Red, orange, yellow, green, blue, indigo, and violet. </voice></speak>"#
+	/// );
+	/// # Ok(())
+	/// # }
+	/// ```
+	fn with_mstts_viseme(self, config: MicrosoftViseme) -> Self;
+}
+
+impl MicrosoftVoiceExt for Voice {
+	fn with_mstts_viseme(mut self, config: MicrosoftViseme) -> Self {
+		self.elements
+			.insert(0, SpeakableElement::Meta(Meta(format!("<mstts:viseme type=\"{config}\" />"))));
+		self
+	}
+}
diff --git a/src/speak.rs b/src/speak.rs
@@ -1,14 +1,14 @@
 use std::{error::Error, io::Write};
 
-use crate::{util, Audio, Flavor, Serialize, Text};
+use crate::{util, Audio, Flavor, Meta, Serialize, Text, Voice};
 
 macro_rules! el {
 	(
 		$(#[$outer:meta])*
 		pub enum $name:ident {
 			$(
 				$(#[$innermeta:meta])*
-				$variant:ident($inner:ident)
+				$variant:ident($inner:ty)
 			),*
 		}
 	) => {
@@ -37,10 +37,12 @@ macro_rules! el {
 }
 
 el! {
-	#[derive(Clone)]
+	#[derive(Debug, Clone)]
 	pub enum SpeakableElement {
 		Text(Text),
-		Audio(Audio)
+		Audio(Audio),
+		Voice(Voice),
+		Meta(Meta)
 		// Break(BreakElement),
 		// Emphasis(EmphasisElement),
 		// Lang(LangElement),
@@ -63,7 +65,7 @@ impl<T: ToString> From<T> for SpeakableElement {
 }
 
 /// The root element of an SSML document.
-#[derive(Default)]
+#[derive(Default, Debug, Clone)]
 pub struct Speak {
 	elements: Vec<SpeakableElement>,
 	marks: (Option<String>, Option<String>),
@@ -134,7 +136,7 @@ impl Serialize for Speak {
 
 		// Include `mstts` namespace for ACSS.
 		if flavor == Flavor::MicrosoftAzureCognitiveSpeechServices {
-			util::write_attr(writer, "xmlns:mstts", "https://www.w3.org/2001/mstts")?;
+			util::write_attr(writer, "xmlns:mstts", "http://www.w3.org/2001/mstts")?;
 		}
 
 		if let Some(start_mark) = &self.marks.0 {

diff --git a/src/text.rs b/src/text.rs
@@ -3,7 +3,7 @@ use std::{error::Error, io::Write};
 use crate::{Flavor, Serialize};
 
 /// A non-marked-up string of text for use as a spoken element.
-#[derive(Clone)]
+#[derive(Default, Debug, Clone)]
 pub struct Text(pub String);
 
 impl<T: ToString> From<T> for Text {

diff --git a/src/voice.rs b/src/voice.rs
@@ -0,0 +1,163 @@
+use std::{error::Error, fmt::Display, io::Write};
+
+use crate::{util, Flavor, Serialize, SpeakableElement};
+
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub enum VoiceGender {
+	#[default]
+	Unspecified,
+	Neutral,
+	Female,
+	Male
+}
+
+impl Display for VoiceGender {
+	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+		f.write_str(match self {
+			VoiceGender::Unspecified => "",
+			VoiceGender::Neutral => "neutral",
+			VoiceGender::Female => "female",
+			VoiceGender::Male => "male"
+		})
+	}
+}
+
+/// Configuration for the [`Voice`] element.
+#[derive(Default, Debug, Clone)]
+pub struct VoiceConfig {
+	pub gender: Option<VoiceGender>,
+	pub age: Option<u8>,
+	pub name: Option<String>,
+	pub variant: Option<String>
+}
+
+impl VoiceConfig {
+	/// Creates a new [`VoiceConfig`] with the specified voice name and no other attributes.
+	///
+	/// ```
+	/// let doc = ssml::VoiceConfig::named("en-US-JennyNeural");
+	/// ```
+	pub fn named(name: impl ToString) -> Self {
+		Self {
+			name: Some(name.to_string()),
+			..VoiceConfig::default()
+		}
+	}
+}
+
+impl<S: ToString> From<S> for VoiceConfig {
+	fn from(value: S) -> Self {
+		VoiceConfig::named(value)
+	}
+}
+
+impl Serialize for VoiceConfig {
+	fn serialize<W: Write>(&self, writer: &mut W, _: Flavor) -> Result<(), Box<dyn Error>> {
+		if let Some(gender) = &self.gender {
+			util::write_attr(writer, "gender", gender.to_string())?;
+		}
+		if let Some(age) = &self.age {
+			util::write_attr(writer, "age", age.to_string())?;
+		}
+		if let Some(name) = &self.name {
+			util::write_attr(writer, "name", name)?;
+		}
+		if let Some(variant) = &self.variant {
+			util::write_attr(writer, "variant", variant)?;
+		}
+		Ok(())
+	}
+}
+
+/// The [`Voice`] element allows you to specify a voice or use multiple different voices in one document.
+#[derive(Default, Debug, Clone)]
+pub struct Voice {
+	pub(crate) elements: Vec<SpeakableElement>,
+	config: VoiceConfig
+}
+
+impl Voice {
+	/// Creates a new `voice` element to change the voice of a section of spoken elements.
+	///
+	/// ```
+	/// # use ssml::{Flavor, Serialize};
+	/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+	/// let doc = ssml::Speak::new(None, [ssml::Voice::new("en-US-Neural2-F", ["Hello, world!"])]);
+	///
+	/// assert_eq!(
+	/// 	doc.serialize_to_string(Flavor::GoogleCloudTextToSpeech)?,
+	/// 	r#"<speak><voice name="en-US-Neural2-F">Hello, world! </voice></speak>"#
+	/// );
+	/// # Ok(())
+	/// # }
+	/// ```
+	pub fn new<S: Into<SpeakableElement>, I: IntoIterator<Item = S>>(config: impl Into<VoiceConfig>, elements: I) -> Self {
+		Self {
+			elements: elements.into_iter().map(|f| f.into()).collect(),
+			config: config.into()
+		}
+	}
+
+	/// Extend this `voice` section with additional spoken elements.
+	///
+	/// ```
+	/// # use ssml::{Flavor, Serialize};
+	/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+	/// let mut voice = ssml::voice("en-US-Neural2-F", ["Hello, world!"]);
+	/// voice = voice.with_elements(["This is an SSML document."]);
+	/// let doc = ssml::Speak::new(None, [voice]);
+	///
+	/// assert_eq!(
+	/// 	doc.serialize_to_string(Flavor::GoogleCloudTextToSpeech)?,
+	/// 	r#"<speak><voice name="en-US-Neural2-F">Hello, world! This is an SSML document. </voice></speak>"#
+	/// );
+	/// # Ok(())
+	/// # }
+	/// ```
+	pub fn with_elements<S: Into<SpeakableElement>, I: IntoIterator<Item = S>>(mut self, elements: I) -> Self {
+		self.elements.extend(elements.into_iter().map(|f| f.into()));
+		self
+	}
+
+	/// Modifies the voice configuration of this `voice` section.
+	///
+	/// ```
+	/// let mut voice = ssml::Voice::default();
+	/// voice = voice.with_voice(ssml::VoiceConfig { age: Some(42), ..Default::default() });
+	/// ```
+	pub fn with_voice(mut self, config: impl Into<VoiceConfig>) -> Self {
+		self.config = config.into();
+		self
+	}
+}
+
+impl Serialize for Voice {
+	fn serialize<W: Write>(&self, writer: &mut W, flavor: Flavor) -> Result<(), Box<dyn Error>> {
+		writer.write_all(b"<voice")?;
+		self.config.serialize(writer, flavor)?;
+		writer.write_all(b">")?;
+		for el in &self.elements {
+			el.serialize(writer, flavor)?;
+		}
+		writer.write_all(b"</voice>")?;
+		Ok(())
+	}
+}
+
+/// Creates a new `voice` element to change the voice of a section of spoken elements.
+///
+/// ```
+/// # use ssml::{Flavor, Serialize};
+/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+/// let doc = ssml::speak(None, [ssml::voice("en-US-Neural2-F", ["Hello, world!"])]);
+///
+/// assert_eq!(
+/// 	doc.serialize_to_string(Flavor::GoogleCloudTextToSpeech)?,
+/// 	r#"<speak><voice name="en-US-Neural2-F">Hello, world! </voice></speak>"#
+/// );
+/// # Ok(())
+/// # }
+/// ```
+pub fn voice<S: Into<SpeakableElement>, I: IntoIterator<Item = S>>(config: impl Into<VoiceConfig>, elements: I) -> Voice {
+	Voice::new(config, elements)
+}