Improved speech handling + clean up code

Chiroyce1 · Sep 19, 2024 · 753d3c0 · 753d3c0
1 parent 418ffdb
commit 753d3c0
Show file tree

Hide file tree

Showing 5 changed files with 168 additions and 132 deletions.
diff --git a/index.html b/index.html
@@ -23,7 +23,7 @@
 
 <body>
   <div class="stuff">
-    <h1>Gemini Vision</h1>
+    <h1><span><img src="./images/gemini.png" alt=""></span>Gemini Vision</h1>
     <div class="flex">
       <div class="right" id="settings">
         <input type="password" id="api" placeholder="Enter API key (get it from ai.google.dev)">
@@ -32,16 +32,14 @@ <h1>Gemini Vision</h1>
         <select id="voiceSelect">
           <option value="default">Default voice</option>
         </select><br>
+        <button id="speak">Speak</button>
+
       </div>
       <div class="left">
         <video id="webcam" autoplay></video>
         <select id="cameraSelect"></select>
-        <button>Take picture 📸</button>
+        <button id="click">Take picture 📸</button>
         <div class="speech-checkbox">
-          <div>
-            <input type="checkbox" id="speech">
-            <span id="speechtext">Speak output</span>
-          </div>
           <div>
             <input type="checkbox" id="hide">
             <span>Hide settings</span>

diff --git a/prompts.js b/prompts.js
@@ -1,4 +1,6 @@
-export default [
+import { HarmBlockThreshold, HarmCategory } from "@google/generative-ai";
+
+export const prompts = [
 	{
 		description: "Default prompt",
 		prompt: `What do you see in this picture? Describe in detail, along with reasoning.`,
@@ -46,3 +48,22 @@ Show an overall confidence score out of 100% ONLY at THE END of the paragraph.
 `,
 	},
 ];
+
+/**
+ * Safety settings shared with script.js: one entry per harm category listed
+ * below, each using the BLOCK_NONE threshold.
+ */
+export const safetySettings = [
+	HarmCategory.HARM_CATEGORY_HARASSMENT,
+	HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+	HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+	HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+].map((category) => ({
+	category,
+	threshold: HarmBlockThreshold.BLOCK_NONE,
+}));
diff --git a/prompts.json b/prompts.json
diff --git a/script.js b/script.js
@@ -1,63 +1,39 @@
 import { GoogleGenerativeAI } from "@google/generative-ai";
-import prompts from "./prompts.js";
-
-import { HarmBlockThreshold, HarmCategory } from "@google/generative-ai";
-
-const safetySettings = [
-	{
-		category: HarmCategory.HARM_CATEGORY_HARASSMENT,
-		threshold: HarmBlockThreshold.BLOCK_NONE,
-	},
-	{
-		category: HarmCategory.HARM_CATEGORY_HATE_SPEECH,
-		threshold: HarmBlockThreshold.BLOCK_NONE,
-	},
-	{
-		category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
-		threshold: HarmBlockThreshold.BLOCK_NONE,
-	},
-	{
-		category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
-		threshold: HarmBlockThreshold.BLOCK_NONE,
-	},
-];
+import { prompts, safetySettings } from "./prompts.js";
+
+/* ======================================================================================== */
+// DOM and state management setup
+/* ======================================================================================== */
 
 const responseElement = document.getElementById("response");
 const cameraSelect = document.getElementById("cameraSelect");
 const promptSelect = document.getElementById("promptSelect");
+const speakBtn = document.getElementById("speak");
 const voiceSelect = document.getElementById("voiceSelect");
-const voiceCheckbox = document.getElementById("speech");
 const promptInput = document.getElementById("prompt");
 const video = document.getElementById("webcam");
 const canvas = document.getElementById("canvas");
 const context = canvas.getContext("2d");
-let active = false;
-let output = "";
-let voice = null;
 
+let active = false; // is the model currently generating a response
+let output = ""; // last output
+let speaking = false; // is the speechsynthesis currently speaking
+
+speakBtn.style.display = "none";
 promptInput.value = `What do you see in this picture? Describe in detail, along with reasoning.`;
 
 const show = (text) => (responseElement.innerText = text);
+promptSelect.addEventListener("change", (e) => {
+	document.querySelector("#prompt").value = promptSelect.value;
+});
 
-async function fileToGenerativePart(file) {
-	const base64EncodedDataPromise = new Promise((resolve) => {
-		const reader = new FileReader();
-		reader.onloadend = () => resolve(reader.result.split(",")[1]);
-		reader.readAsDataURL(file);
-	});
-	return {
-		inlineData: { data: await base64EncodedDataPromise, mimeType: file.type },
-	};
-}
-
-function speak(txt) {
-	const utterance = new SpeechSynthesisUtterance(txt);
-	if (voice) {
-		utterance.voice = speechSynthesis.getVoices()[voice + 1];
-	}
-	document.querySelector("#speechtext").innerText = "Stop Speaking";
-	speechSynthesis.speak(utterance);
-}
+document.querySelector("#click").addEventListener("click", captureImage);
+prompts.forEach((prompt) => {
+	const option = document.createElement("option");
+	option.text = prompt.description;
+	option.value = prompt["prompt"];
+	promptSelect.add(option);
+});
 
 document.querySelector("#hide").checked = false;
 document.querySelector("#hide").addEventListener("click", () => {
@@ -69,54 +45,39 @@ document.querySelector("#hide").addEventListener("click", () => {
 	}
 });
 
-voiceCheckbox.addEventListener("click", () => {
-	if (!voiceCheckbox.checked) {
+speakBtn.addEventListener("click", () => {
+	if (speaking) {
 		speechSynthesis.cancel();
-		document.querySelector("#speechtext").innerText = "Speak output";
-	} else {
-		if (output.trim() !== "") {
-			speak(output);
-		}
+		speakBtn.innerText = "Speak";
+		speaking = false;
+	} else if (output.trim() !== "") {
+		speak(output);
 	}
 });
 
-navigator.mediaDevices
-	.enumerateDevices()
-	.then((devices) => {
-		devices.forEach((device) => {
-			if (device.kind === "videoinput") {
-				const option = document.createElement("option");
-				option.value = device.deviceId;
-				option.text =
-					device.label || `Camera ${cameraSelect.options.length + 1}`;
-				cameraSelect.add(option);
-			}
-		});
-	})
-	.catch((error) => {
-		show(`Error enumerating devices: ${error}`);
-		console.error(`Error enumerating devices: ${error}`);
-	});
-
-cameraSelect.addEventListener("change", setCamera);
+/* ======================================================================================== */
+// Generative AI invocation and response handling with speech synthesis
+/* ======================================================================================== */
 
-function setCamera() {
-	const selectedCameraId = cameraSelect.value;
-	// disable all other media streams
-	if (video.srcObject) {
-		video.srcObject.getTracks().forEach((track) => track.stop());
+// The utterance started by the most recent speak() call. Events from any
+// other (older, cancelled) utterance are ignored.
+let currentUtterance = null;
+
+/**
+ * Speak `txt` aloud, replacing any speech already in progress.
+ *
+ * Uses the voice selected in the voice dropdown when it can be resolved and
+ * remembers its name in localStorage; otherwise the browser default is used.
+ * Shows the speak button as "Stop Speaking" while talking and restores it to
+ * "Speak" when the utterance ends or fails.
+ *
+ * @param {string} txt - Text to read out.
+ */
+function speak(txt) {
+	speechSynthesis.cancel();
+	speaking = true;
+	// Option 0 of the dropdown is "Default voice", so voice N sits at option N + 1.
+	const voice = speechSynthesis.getVoices()[voiceSelect.selectedIndex - 1];
+	const utterance = new SpeechSynthesisUtterance(txt);
+	currentUtterance = utterance;
+	// Test the looked-up voice itself rather than the dropdown index: the lookup
+	// is undefined for "Default voice" and also when the dropdown and
+	// getVoices() disagree, in which case reading voice.name would throw.
+	if (voice) {
+		utterance.voice = voice;
+		console.log(voice);
+		localStorage.setItem("voice", voice.name);
+	} else {
+		console.log("Using default voice");
 	}
-	navigator.mediaDevices
-		.getUserMedia({
-			video: { deviceId: selectedCameraId },
-		})
-		.then((stream) => {
-			video.srcObject = stream;
-		})
-		.catch((error) => {
-			console.error(`Error accessing webcam: ${error}`);
-			show(`Error accessing webcam: ${error}`);
-		});
+	speakBtn.innerText = "Stop Speaking";
+	speakBtn.style.display = "";
+	const finish = () => {
+		// A cancelled utterance may still deliver its event after a newer one has
+		// started; only the latest utterance is allowed to reset the UI state.
+		if (currentUtterance !== utterance) {
+			return;
+		}
+		speakBtn.innerText = "Speak";
+		speaking = false;
+	};
+	// Register before speaking, and treat a failure like a normal end so the
+	// button never stays stuck on "Stop Speaking".
+	utterance.addEventListener("end", finish);
+	utterance.addEventListener("error", finish);
+	speechSynthesis.speak(utterance);
 }
 
 async function captureImage() {
@@ -147,17 +108,18 @@ async function captureImage() {
 	show("Loading... ");
 	let res;
 	active = true;
+	speakBtn.style.display = "none";
 	try {
+		let start = Date.now();
 		res = await model.generateContentStream([promptInput.value, image]);
 		let text = "";
 		for await (const chunk of res.stream) {
 			text += chunk.text();
 			show(text);
 		}
 		output = text;
-		if (document.querySelector("#speech").checked) {
-			speak(text);
-		}
+		show(`${output} [${((Date.now() - start) / 1000).toFixed(1)}s]`);
+		speak(text);
 	} catch (e) {
 		console.error(e);
 		show(`Oops something went wrong.\nError: ${e.toString()}`);
@@ -168,6 +130,76 @@ async function captureImage() {
 	active = false;
 }
 
+/* ======================================================================================== */
+// Setup speech voices and camera
+/* ======================================================================================== */
+
+/**
+ * Rebuild the voice dropdown from speechSynthesis.getVoices(), in the same
+ * order, keeping option 0 ("Default voice") in place.
+ *
+ * On the first successful fill the voice name saved in localStorage is
+ * re-selected; on later calls the current selection is kept. Safe to call
+ * any number of times.
+ */
+function populateVoices() {
+	const voices = speechSynthesis.getVoices();
+	console.log(`${voices.length} voices found`);
+	// Once the list has been filled, the dropdown reflects the user's choice;
+	// before that, fall back to the name remembered from a previous visit.
+	const alreadyFilled = voiceSelect.options.length > 1;
+	const wanted = alreadyFilled
+		? voiceSelect.value
+		: localStorage.getItem("voice");
+	// Remove every option after the default one so repeated calls do not
+	// append duplicates.
+	while (voiceSelect.options.length > 1) {
+		voiceSelect.remove(1);
+	}
+	voices.forEach((voice) => {
+		const option = document.createElement("option");
+		option.value = voice.name;
+		option.text = voice.name;
+		voiceSelect.add(option);
+	});
+	if (wanted) {
+		const voiceIndex = Array.from(voiceSelect.options).findIndex(
+			(option) => option.value === wanted
+		);
+		if (voiceIndex !== -1) {
+			voiceSelect.selectedIndex = voiceIndex;
+		}
+	}
+}
+
+// getVoices() can return an empty list right after page load, so try now and
+// again whenever the browser announces that the list has changed.
+populateVoices();
+if (typeof speechSynthesis.addEventListener === "function") {
+	speechSynthesis.addEventListener("voiceschanged", populateVoices);
+}
+// Fallback for browsers that neither have voices ready immediately nor fire
+// "voiceschanged": keep the original delayed attempt.
+setTimeout(populateVoices, 1000);
+
+// Add one dropdown entry per video input device; an enumeration failure is
+// shown in the response area and logged to the console.
+navigator.mediaDevices
+	.enumerateDevices()
+	.then((devices) => {
+		for (const { kind, deviceId, label } of devices) {
+			if (kind !== "videoinput") {
+				continue;
+			}
+			const entry = document.createElement("option");
+			entry.value = deviceId;
+			// Devices without a label get a numbered placeholder name.
+			entry.text = label || `Camera ${cameraSelect.options.length + 1}`;
+			cameraSelect.add(entry);
+		}
+	})
+	.catch((error) => {
+		const message = `Error enumerating devices: ${error}`;
+		show(message);
+		console.error(message);
+	});
+
+cameraSelect.addEventListener("change", setCamera);
+
+/**
+ * Switch the webcam preview to the camera chosen in the camera dropdown.
+ * Tracks of the stream currently attached to the video element are stopped
+ * first; a failure to open the camera is logged and shown to the user.
+ */
+function setCamera() {
+	const { value: deviceId } = cameraSelect;
+	const previousStream = video.srcObject;
+	if (previousStream) {
+		// Release the old camera before requesting another one.
+		for (const track of previousStream.getTracks()) {
+			track.stop();
+		}
+	}
+	const attachStream = (stream) => {
+		video.srcObject = stream;
+	};
+	const reportFailure = (error) => {
+		console.error(`Error accessing webcam: ${error}`);
+		show(`Error accessing webcam: ${error}`);
+	};
+	navigator.mediaDevices
+		.getUserMedia({ video: { deviceId } })
+		.then(attachStream)
+		.catch(reportFailure);
+}
+
+setCamera();
+
+/* ======================================================================================== */
+// UTILITY FUNCTIONS for the Google Generative AI JS SDK
+/* ======================================================================================== */
+
 function dataURItoBlob(dataURI) {
 	// Thanks to ChatGPT for this
 	const byteString = atob(dataURI.split(",")[1]);
@@ -182,28 +214,17 @@ function dataURItoBlob(dataURI) {
 	return new Blob([arrayBuffer], { type: mimeString });
 }
 
-prompts.forEach((prompt) => {
-	const option = document.createElement("option");
-	option.text = prompt.description;
-	option.value = prompt["prompt"];
-	promptSelect.add(option);
-});
-
-speechSynthesis.getVoices().forEach((voice) => {
-	const option = document.createElement("option");
-	option.value = voice.name;
-	option.text = voice.name;
-	voiceSelect.add(option);
-});
-
-setCamera();
-
-promptSelect.addEventListener("change", (e) => {
-	document.querySelector("#prompt").value = promptSelect.value;
-});
-
-voiceSelect.addEventListener("change", (e) => {
-	voice = voiceSelect.selectedIndex;
-});
+/**
+ * Read a file as base64 and wrap it as `{ inlineData: { data, mimeType } }`.
+ *
+ * @param {Blob} file - Blob or File accepted by FileReader.readAsDataURL.
+ * @returns {Promise<{inlineData: {data: string, mimeType: string}}>}
+ *   Resolves with the base64 payload (data URL without its prefix) and the
+ *   file's MIME type; rejects with the reader's error if the read fails.
+ */
+async function fileToGenerativePart(file) {
+	const dataUrl = await new Promise((resolve, reject) => {
+		const reader = new FileReader();
+		// "load" fires only on success. The previous "loadend" handler also ran
+		// on failure, where result is null, so the promise never settled.
+		reader.onload = () => resolve(reader.result);
+		reader.onerror = () => reject(reader.error);
+		reader.readAsDataURL(file);
+	});
+	return {
+		// Keep only the part after the comma of "data:<mime>;base64,<payload>".
+		inlineData: { data: dataUrl.split(",")[1], mimeType: file.type },
+	};
+}
 
-document.querySelector("button").addEventListener("click", captureImage);
+/* ======================================================================================== */
+// that was a lot of code eh?
+/* ======================================================================================== */
diff --git a/style.css b/style.css
@@ -106,6 +106,8 @@ video {
 
 #response {
 	margin-bottom: 3em;
+	font-size: 1.5em;
+	font-weight: 300;
 }
 
 button:hover,
@@ -178,3 +180,8 @@ input[type="password"] {
 .speech-checkbox > input {
 	cursor: pointer;
 }
+
+img {
+	height: 50px;
+	transform: translateY(8px) translateX(-6px);
+}