Skip to content

Commit

Permalink
Improved speech handling + clean up code
Browse files Browse the repository at this point in the history
  • Loading branch information
Chiroyce1 committed Sep 19, 2024
1 parent 418ffdb commit 753d3c0
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 132 deletions.
10 changes: 4 additions & 6 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

<body>
<div class="stuff">
<h1>Gemini Vision</h1>
<h1><span><img src="./images/gemini.png" alt=""></span>Gemini Vision</h1>
<div class="flex">
<div class="right" id="settings">
<input type="password" id="api" placeholder="Enter API key (get it from ai.google.dev)">
Expand All @@ -32,16 +32,14 @@ <h1>Gemini Vision</h1>
<select id="voiceSelect">
<option value="default">Default voice</option>
</select><br>
<button id="speak">Speak</button>

</div>
<div class="left">
<video id="webcam" autoplay></video>
<select id="cameraSelect"></select>
<button>Take picture 📸</button>
<button id="click">Take picture 📸</button>
<div class="speech-checkbox">
<div>
<input type="checkbox" id="speech">
<span id="speechtext">Speak output</span>
</div>
<div>
<input type="checkbox" id="hide">
<span>Hide settings</span>
Expand Down
23 changes: 22 additions & 1 deletion prompts.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
export default [
import { HarmBlockThreshold, HarmCategory } from "@google/generative-ai";

export const prompts = [
{
description: "Default prompt",
prompt: `What do you see in this picture? Describe in detail, along with reasoning.`,
Expand Down Expand Up @@ -46,3 +48,22 @@ Show an overall confidence score out of 100% ONLY at THE END of the paragraph.
`,
},
];

// Safety configuration: one entry per harm category below, each with its
// blocking threshold set to BLOCK_NONE.
export const safetySettings = [
  HarmCategory.HARM_CATEGORY_HARASSMENT,
  HarmCategory.HARM_CATEGORY_HATE_SPEECH,
  HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
  HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
].map((category) => ({
  category,
  threshold: HarmBlockThreshold.BLOCK_NONE,
}));
11 changes: 0 additions & 11 deletions prompts.json

This file was deleted.

249 changes: 135 additions & 114 deletions script.js
Original file line number Diff line number Diff line change
@@ -1,63 +1,39 @@
import { GoogleGenerativeAI } from "@google/generative-ai";
import prompts from "./prompts.js";

import { HarmBlockThreshold, HarmCategory } from "@google/generative-ai";

// Safety configuration: one entry per harm category below, each with its
// blocking threshold set to BLOCK_NONE.
const safetySettings = [
  HarmCategory.HARM_CATEGORY_HARASSMENT,
  HarmCategory.HARM_CATEGORY_HATE_SPEECH,
  HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
  HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
].map((category) => ({
  category,
  threshold: HarmBlockThreshold.BLOCK_NONE,
}));
import { prompts, safetySettings } from "./prompts.js";

/* ======================================================================================== */
// DOM and state management setup
/* ======================================================================================== */

const responseElement = document.getElementById("response");
const cameraSelect = document.getElementById("cameraSelect");
const promptSelect = document.getElementById("promptSelect");
const speakBtn = document.getElementById("speak");
const voiceSelect = document.getElementById("voiceSelect");
const voiceCheckbox = document.getElementById("speech");
const promptInput = document.getElementById("prompt");
const video = document.getElementById("webcam");
const canvas = document.getElementById("canvas");
const context = canvas.getContext("2d");
let active = false;
let output = "";
let voice = null;

let active = false; // is the model currently generating a response
let output = ""; // last output
let speaking = false; // is the speechsynthesis currently speaking

speakBtn.style.display = "none";
promptInput.value = `What do you see in this picture? Describe in detail, along with reasoning.`;

const show = (text) => (responseElement.innerText = text);
// Copy the preset chosen in the dropdown into the editable prompt field.
promptSelect.addEventListener("change", () => {
  const promptField = document.querySelector("#prompt");
  promptField.value = promptSelect.value;
});

/**
 * Read a file as a data URL and wrap its base64 payload in an `inlineData`
 * part object.
 *
 * @param {Blob} file - Object accepted by `FileReader.readAsDataURL`; its
 *   `type` is reported as the part's MIME type.
 * @returns {Promise<{inlineData: {data: string, mimeType: string}}>}
 *   Resolves with the base64 payload (the text after the first comma of the
 *   data URL) and the MIME type.
 * @throws Rejects with the reader's error when the read fails.
 */
async function fileToGenerativePart(file) {
  const base64Data = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    // The result is a data URL ("data:<mime>;base64,<payload>"); keep only
    // the payload after the comma.
    reader.onload = () => resolve(reader.result.split(",")[1]);
    // Surface read failures instead of leaving the promise pending forever
    // (on failure `reader.result` is null, so it cannot be split).
    reader.onerror = () =>
      reject(reader.error || new Error("Failed to read file"));
    reader.readAsDataURL(file);
  });
  return {
    inlineData: { data: base64Data, mimeType: file.type },
  };
}

/**
 * Speak `txt` through the browser's speech synthesis and switch the
 * "#speechtext" label to "Stop Speaking".
 *
 * The module-level `voice` holds the selected index of the voice dropdown.
 * A falsy value (null or 0) means "use the default voice".
 *
 * @param {string} txt - Text to read aloud.
 */
function speak(txt) {
  const utterance = new SpeechSynthesisUtterance(txt);
  if (voice) {
    // The dropdown's first entry is the default-voice placeholder, so
    // dropdown index k corresponds to getVoices()[k - 1].
    const selectedVoice = speechSynthesis.getVoices()[voice - 1];
    // Keep the default voice if the list no longer has that entry.
    if (selectedVoice) {
      utterance.voice = selectedVoice;
    }
  }
  document.querySelector("#speechtext").innerText = "Stop Speaking";
  speechSynthesis.speak(utterance);
}
document.querySelector("#click").addEventListener("click", captureImage);
// Add one <option> per predefined prompt: the description is the label and
// the prompt text is the value.
for (const { description, prompt } of prompts) {
  const entry = document.createElement("option");
  entry.text = description;
  entry.value = prompt;
  promptSelect.add(entry);
}

document.querySelector("#hide").checked = false;
document.querySelector("#hide").addEventListener("click", () => {
Expand All @@ -69,54 +45,39 @@ document.querySelector("#hide").addEventListener("click", () => {
}
});

voiceCheckbox.addEventListener("click", () => {
if (!voiceCheckbox.checked) {
speakBtn.addEventListener("click", () => {
if (speaking) {
speechSynthesis.cancel();
document.querySelector("#speechtext").innerText = "Speak output";
} else {
if (output.trim() !== "") {
speak(output);
}
speakBtn.innerText = "Speak";
speaking = false;
} else if (output.trim() !== "") {
speak(output);
}
});

// Fill the camera dropdown with every video input device; unlabeled devices
// get a numbered "Camera N" fallback name.
navigator.mediaDevices
  .enumerateDevices()
  .then((devices) => {
    for (const { kind, deviceId, label } of devices) {
      if (kind !== "videoinput") {
        continue;
      }
      const entry = document.createElement("option");
      entry.value = deviceId;
      entry.text = label || `Camera ${cameraSelect.options.length + 1}`;
      cameraSelect.add(entry);
    }
  })
  .catch((error) => {
    // Report the failure both on the page and in the console.
    show(`Error enumerating devices: ${error}`);
    console.error(`Error enumerating devices: ${error}`);
  });

cameraSelect.addEventListener("change", setCamera);
/* ======================================================================================== */
// Generative AI invocation and response handling with speech synthesis
/* ======================================================================================== */

function setCamera() {
const selectedCameraId = cameraSelect.value;
// disable all other media streams
if (video.srcObject) {
video.srcObject.getTracks().forEach((track) => track.stop());
function speak(txt) {
speechSynthesis.cancel();
speaking = true;
const voice = speechSynthesis.getVoices()[voiceSelect.selectedIndex - 1];
const utterance = new SpeechSynthesisUtterance(txt);
if (voiceSelect.selectedIndex !== 0) {
utterance.voice = voice;
console.log(voice);
localStorage.setItem("voice", voice.name);
} else {
console.log("Using default voice");
}
navigator.mediaDevices
.getUserMedia({
video: { deviceId: selectedCameraId },
})
.then((stream) => {
video.srcObject = stream;
})
.catch((error) => {
console.error(`Error accessing webcam: ${error}`);
show(`Error accessing webcam: ${error}`);
});
speakBtn.innerText = "Stop Speaking";
speakBtn.style.display = "";
speechSynthesis.speak(utterance);
utterance.addEventListener("end", () => {
speakBtn.innerText = "Speak";
speaking = false;
});
}

async function captureImage() {
Expand Down Expand Up @@ -147,17 +108,18 @@ async function captureImage() {
show("Loading... ");
let res;
active = true;
speakBtn.style.display = "none";
try {
let start = Date.now();
res = await model.generateContentStream([promptInput.value, image]);
let text = "";
for await (const chunk of res.stream) {
text += chunk.text();
show(text);
}
output = text;
if (document.querySelector("#speech").checked) {
speak(text);
}
show(`${output} [${((Date.now() - start) / 1000).toFixed(1)}s]`);
speak(text);
} catch (e) {
console.error(e);
show(`Oops something went wrong.\nError: ${e.toString()}`);
Expand All @@ -168,6 +130,76 @@ async function captureImage() {
active = false;
}

/* ======================================================================================== */
// Setup speech voices and camera
/* ======================================================================================== */

// Populate the voice dropdown one second after load and restore the voice
// name saved in localStorage, if it is still offered.
// The original author observed that getVoices() does not work properly right
// at page load, hence the delay.
// NOTE(review): the `voiceschanged` event may be a more reliable trigger -
// confirm before replacing the timer.
setTimeout(() => {
  const availableVoices = speechSynthesis.getVoices();
  console.log(`${availableVoices.length} voices found`);

  for (const { name } of availableVoices) {
    const entry = document.createElement("option");
    entry.value = name;
    entry.text = name;
    voiceSelect.add(entry);
  }

  const savedVoiceName = localStorage.getItem("voice");
  if (!savedVoiceName) {
    return;
  }

  // Re-select the saved voice only when a matching option exists.
  const savedIndex = Array.from(voiceSelect.options).findIndex(
    (option) => option.value === savedVoiceName
  );
  if (savedIndex !== -1) {
    voiceSelect.selectedIndex = savedIndex;
  }
}, 1000);

// Fill the camera dropdown with every video input device; unlabeled devices
// get a numbered "Camera N" fallback name.
navigator.mediaDevices
  .enumerateDevices()
  .then((devices) => {
    for (const { kind, deviceId, label } of devices) {
      if (kind !== "videoinput") {
        continue;
      }
      const entry = document.createElement("option");
      entry.value = deviceId;
      entry.text = label || `Camera ${cameraSelect.options.length + 1}`;
      cameraSelect.add(entry);
    }
  })
  .catch((error) => {
    // Report the failure both on the page and in the console.
    show(`Error enumerating devices: ${error}`);
    console.error(`Error enumerating devices: ${error}`);
  });

cameraSelect.addEventListener("change", setCamera);

/**
 * Point the webcam preview at the camera currently chosen in the camera
 * dropdown.
 *
 * Stops every track of the stream already attached to the video element (if
 * any), then requests a stream for the selected device id. On success the
 * stream becomes the video source; on failure the error is logged and shown.
 */
function setCamera() {
  const { value: deviceId } = cameraSelect;

  // Release the stream that is currently attached before requesting another.
  const currentStream = video.srcObject;
  if (currentStream) {
    for (const track of currentStream.getTracks()) {
      track.stop();
    }
  }

  const attachStream = (stream) => {
    video.srcObject = stream;
  };
  const reportFailure = (error) => {
    console.error(`Error accessing webcam: ${error}`);
    show(`Error accessing webcam: ${error}`);
  };

  navigator.mediaDevices
    .getUserMedia({ video: { deviceId } })
    .then(attachStream)
    .catch(reportFailure);
}

setCamera();

/* ======================================================================================== */
// UTILITY FUNCTIONS for the Google Generative AI JS SDK
/* ======================================================================================== */

function dataURItoBlob(dataURI) {
// Thanks to ChatGPT for this
const byteString = atob(dataURI.split(",")[1]);
Expand All @@ -182,28 +214,17 @@ function dataURItoBlob(dataURI) {
return new Blob([arrayBuffer], { type: mimeString });
}

// Add one <option> per predefined prompt: the description is the label and
// the prompt text is the value.
for (const { description, prompt } of prompts) {
  const entry = document.createElement("option");
  entry.text = description;
  entry.value = prompt;
  promptSelect.add(entry);
}

// Offer every voice currently returned by getVoices() in the voice dropdown,
// using the voice name as both label and value.
for (const { name } of speechSynthesis.getVoices()) {
  const entry = document.createElement("option");
  entry.value = name;
  entry.text = name;
  voiceSelect.add(entry);
}

setCamera();

// Copy the preset chosen in the dropdown into the editable prompt field.
promptSelect.addEventListener("change", () => {
  const promptField = document.querySelector("#prompt");
  promptField.value = promptSelect.value;
});

// Record the index of the dropdown entry the user picked in the module-level
// `voice` variable.
voiceSelect.addEventListener("change", () => {
  const { selectedIndex } = voiceSelect;
  voice = selectedIndex;
});
/**
 * Read a file as a data URL and wrap its base64 payload in an `inlineData`
 * part object.
 *
 * @param {Blob} file - Object accepted by `FileReader.readAsDataURL`; its
 *   `type` is reported as the part's MIME type.
 * @returns {Promise<{inlineData: {data: string, mimeType: string}}>}
 *   Resolves with the base64 payload (the text after the first comma of the
 *   data URL) and the MIME type.
 * @throws Rejects with the reader's error when the read fails.
 */
async function fileToGenerativePart(file) {
  const base64Data = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    // The result is a data URL ("data:<mime>;base64,<payload>"); keep only
    // the payload after the comma.
    reader.onload = () => resolve(reader.result.split(",")[1]);
    // Surface read failures instead of leaving the promise pending forever
    // (on failure `reader.result` is null, so it cannot be split).
    reader.onerror = () =>
      reject(reader.error || new Error("Failed to read file"));
    reader.readAsDataURL(file);
  });
  return {
    inlineData: { data: base64Data, mimeType: file.type },
  };
}

document.querySelector("button").addEventListener("click", captureImage);
/* ======================================================================================== */
// that was a lot of code eh?
/* ======================================================================================== */
7 changes: 7 additions & 0 deletions style.css
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ video {

#response {
margin-bottom: 3em;
font-size: 1.5em;
font-weight: 300;
}

button:hover,
Expand Down Expand Up @@ -178,3 +180,8 @@ input[type="password"] {
.speech-checkbox > input {
cursor: pointer;
}

img {
height: 50px;
transform: translateY(8px) translateX(-6px);
}

0 comments on commit 753d3c0

Please sign in to comment.