| 1 | <!DOCTYPE html> |
| 2 | <html lang="en"> |
| 3 | <head> |
| 4 | <meta charset="UTF-8"> |
| 5 | <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| 6 | <title>Voice Agent Quickstart — AssemblyAI</title> |
| 7 | <style> |
/* Design tokens: brand colours, status colours, and the s50–s900 grey scale. */
:root {
  --brand: #364DEA;
  --brand-dark: #2B3EC4;
  --brand-bg: #EEF1FE;

  --green: #12B886;
  --red: #FA5252;

  --s50: #F8FAFC;
  --s100: #F1F5F9;
  --s200: #E2E8F0;
  --s300: #CBD5E1;
  --s400: #94A3B8;
  --s500: #64748B;
  --s600: #475569;
  --s700: #334155;
  --s800: #1E293B;
  --s900: #0F172A;
}

/* Minimal reset: border-box sizing and no default margins/padding anywhere. */
*,
*::before,
*::after {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* The page fills the viewport so the flex column below can size its children. */
html,
body {
  height: 100%;
}

/* Vertical flex column (header on top, layout below) over two soft gradients. */
body {
  display: flex;
  flex-direction: column;
  font-family: system-ui, -apple-system, sans-serif;
  color: var(--s900);
  background:
    radial-gradient(1200px 600px at 80% -10%, #DCE3FE 0%, transparent 60%),
    radial-gradient(900px 500px at -10% 110%, #E6FCF5 0%, transparent 55%),
    var(--s50);
}
| 25 | |
/* Top bar: translucent, fixed-height, never shrinks inside the body flex column. */
header {
  background: rgba(255,255,255,.85);
  /* Older Safari releases only recognise the prefixed property; keep both. */
  -webkit-backdrop-filter: blur(12px);
  backdrop-filter: blur(12px);
  border-bottom: 1px solid var(--s200);
  padding: 0 1.5rem; height: 3.5rem;
  display: flex; align-items: center; gap: .75rem;
  flex-shrink: 0;
}
.logo img { height: 22px; display: block; }
.page-title { font-size: .875rem; color: var(--s500); padding-left: .75rem; border-left: 1px solid var(--s200); }
/* Pushes the status pill to the right edge of the header. */
.header-spacer { flex: 1; }

/* Connection status pill; the script swaps in the .ok / .err modifier classes. */
.status {
  display: flex; align-items: center; gap: .5rem;
  font-size: .8125rem; color: var(--s500);
  padding: .375rem .75rem; border-radius: 999px;
  background: var(--s100); border: 1px solid var(--s200);
}
/* The dot inherits the pill's text colour through currentColor. */
.dot { width: 8px; height: 8px; border-radius: 50%; background: currentColor; flex-shrink: 0; }
.status.ok { color: var(--green); background: #E6FCF5; border-color: #C3FAE8; }
.status.ok .dot { animation: pulse 2s ease-in-out infinite; }
.status.err { color: var(--red); background: #FFF5F5; border-color: #FFE3E3; }
@keyframes pulse { 0%,100% { opacity: 1; } 50% { opacity: .3; } }

/* Respect the user's reduced-motion preference: no endless pulsing. */
@media (prefers-reduced-motion: reduce) {
  .status.ok .dot { animation: none; }
}
| 47 | |
/* Two-column shell: fixed-width settings sidebar plus a flexible transcript pane. */
.layout { flex: 1; display: grid; grid-template-columns: 360px 1fr; min-height: 0; }
/* Narrow viewports: stack the sidebar above the transcript. */
@media (max-width: 800px) { .layout { grid-template-columns: 1fr; } }

/* Settings sidebar: scrolls independently when its content is taller than the pane. */
aside {
  border-right: 1px solid var(--s200);
  background: rgba(255,255,255,.6);
  /* Older Safari releases only recognise the prefixed property; keep both. */
  -webkit-backdrop-filter: blur(8px);
  backdrop-filter: blur(8px);
  padding: 1.5rem; overflow-y: auto;
  display: flex; flex-direction: column; gap: 1rem;
}
aside h2 {
  font-size: .6875rem; font-weight: 600; color: var(--s500);
  text-transform: uppercase; letter-spacing: .08em; margin-bottom: .5rem;
}

/* A label stacked above its control. */
.field { display: flex; flex-direction: column; gap: .375rem; }
label { font-size: .75rem; font-weight: 500; color: var(--s600); }
input, select, textarea {
  width: 100%; padding: .5rem .625rem; border: 1px solid var(--s200); border-radius: 8px;
  font: inherit; font-size: .875rem; color: var(--s900); background: #fff;
  transition: border-color .15s, box-shadow .15s;
}
/*
 * Focus ring is drawn with border + box-shadow. The outline is kept but made
 * transparent rather than removed: forced-colors / high-contrast modes drop
 * box-shadows and repaint outlines, so the focus indicator survives there.
 */
input:focus, select:focus, textarea:focus {
  outline: 2px solid transparent;
  border-color: var(--brand);
  box-shadow: 0 0 0 3px rgba(54,77,234,.12);
}
textarea { resize: vertical; min-height: 96px; line-height: 1.5; }
| 70 | |
/* Primary Connect / Disconnect button. */
.btn {
  width: 100%; padding: .75rem 1rem; border: none; border-radius: 10px;
  font-size: .9375rem; font-weight: 600; cursor: pointer; color: #fff; background: var(--brand);
  transition: all .15s; display: flex; align-items: center; justify-content: center; gap: .5rem;
  box-shadow: 0 1px 2px rgba(54,77,234,.3), 0 4px 12px rgba(54,77,234,.15);
}
/* Hover feedback only while the button can actually be pressed; previously a
   disabled ("Connecting…") button still darkened under the pointer. */
.btn:hover:not(:disabled) { background: var(--brand-dark); transform: translateY(-1px); }
.btn:disabled { opacity: .5; cursor: default; transform: none; }
/* .on = a session is live; the button becomes the red "Disconnect" action. */
.btn.on { background: var(--red); box-shadow: 0 1px 2px rgba(250,82,82,.3), 0 4px 12px rgba(250,82,82,.15); }
.btn.on:hover:not(:disabled) { background: #e03131; }
.btn svg { width: 18px; height: 18px; }
| 82 | |
/* Right-hand pane hosting the transcript card. */
main {
  display: flex; flex-direction: column; min-height: 0;
  padding: 1.5rem 2rem 2rem;
}

/* Transcript card: fixed header row plus a scrolling message list. */
.transcript {
  flex: 1; min-height: 0; display: flex; flex-direction: column;
  background: #fff; border: 1px solid var(--s200); border-radius: 16px;
  overflow: hidden;
  box-shadow: 0 1px 2px rgba(15,23,42,.04), 0 4px 16px rgba(15,23,42,.04);
}
.transcript-hd {
  padding: .75rem 1.25rem; background: var(--s50); border-bottom: 1px solid var(--s200);
  font-size: .6875rem; font-weight: 600; color: var(--s500);
  text-transform: uppercase; letter-spacing: .08em;
  display: flex; justify-content: space-between; align-items: center;
}

/* "You" / "Agent" activity chips; the script toggles .active on them. */
.speakers { display: flex; gap: .375rem; }
.speaker {
  display: flex; align-items: center; gap: .375rem;
  padding: .25rem .625rem; border-radius: 999px;
  background: var(--s100); color: var(--s400);
  font-size: .6875rem; font-weight: 600;
  text-transform: uppercase; letter-spacing: .05em;
  transition: background .2s, color .2s;
}
.speaker .dot { width: 6px; height: 6px; }
.speaker.user.active { background: var(--brand-bg); color: var(--brand); }
.speaker.agent.active { background: #E6FCF5; color: var(--green); }
.speaker.active .dot { animation: pulse 1s ease-in-out infinite; }

/* Scrolling message list. */
#msgs { flex: 1; overflow-y: auto; padding: 1rem 1.25rem; display: flex; flex-direction: column; gap: .5rem; }
/* Placeholder shown until the first message arrives. */
.empty {
  flex: 1; display: flex; align-items: center; justify-content: center;
  color: var(--s400); font-size: .875rem;
}

/* Chat bubble; .u = user (right-aligned), .a = agent (left-aligned). */
.msg {
  padding: .75rem 1rem; border-radius: 12px;
  font-size: .9375rem; line-height: 1.5;
  max-width: 85%; animation: slideIn .25s ease;
  /* Break very long unbroken tokens instead of overflowing the bubble. */
  overflow-wrap: anywhere;
}
@keyframes slideIn { from { opacity: 0; transform: translateY(4px); } to { opacity: 1; transform: none; } }
.msg .who {
  font-size: .6875rem; font-weight: 600; text-transform: uppercase;
  letter-spacing: .05em; color: var(--s500); margin-bottom: .25rem;
}
.msg.u { background: var(--brand-bg); align-self: flex-end; }
.msg.u .who { color: var(--brand); }
.msg.a { background: #E6FCF5; align-self: flex-start; }
.msg.a .who { color: var(--green); }

/* Respect the user's reduced-motion preference: no pulsing dots, no slide-in. */
@media (prefers-reduced-motion: reduce) {
  .speaker.active .dot,
  .msg { animation: none; }
}
| 132 | </style> |
| 133 | </head> |
| 134 | <body> |
<header>
  <a class="logo" href="https://www.assemblyai.com">
    <img src="https://cdn.prod.website-files.com/67a08d9d7d19f8fb63692894/67b5bd3d9e8ee1a6b2410b9e_AssemblyAI%20Logo.svg" alt="AssemblyAI">
  </a>
  <span class="page-title">Voice Agent Quickstart</span>
  <div class="header-spacer"></div>
  <!-- role="status" turns the pill into a polite live region, so text written
       into #status-text by the script is announced by screen readers. The
       script only rewrites the class attribute, so the role persists. -->
  <div class="status" id="status" role="status"><span class="dot"></span><span id="status-text">Ready</span></div>
</header>
| 143 | |
| 144 | <div class="layout"> |
<!-- Settings sidebar. The script reads these controls by id:
     key, mic, voice, prompt, greeting, btn, btn-label. -->
<aside>
  <div>
    <h2>Configuration</h2>
    <div class="field">
      <label for="key">API key</label>
      <input id="key" type="password" placeholder="Your AssemblyAI API key">
    </div>
  </div>

  <div class="field">
    <label for="mic">Microphone</label>
    <!-- Options are rebuilt at runtime from the available audio inputs. -->
    <select id="mic"><option value="">Default microphone</option></select>
  </div>

  <div class="field">
    <label for="voice">Voice</label>
    <select id="voice">
      <optgroup label="English">
        <option value="ivy" selected>🇺🇸 ivy</option>
        <option value="james">🇺🇸 james</option>
        <option value="tyler">🇺🇸 tyler</option>
        <option value="winter">🇺🇸 winter</option>
        <option value="sam">🇺🇸 sam</option>
        <option value="mia">🇺🇸 mia</option>
        <option value="bella">🇺🇸 bella</option>
        <option value="david">🇺🇸 david</option>
        <option value="jack">🇺🇸 jack</option>
        <option value="kyle">🇺🇸 kyle</option>
        <option value="helen">🇺🇸 helen</option>
        <option value="martha">🇺🇸 martha</option>
        <option value="river">🇺🇸 river</option>
        <option value="emma">🇺🇸 emma</option>
        <option value="victor">🇺🇸 victor</option>
        <option value="eleanor">🇺🇸 eleanor</option>
        <option value="sophie">🇬🇧 sophie</option>
        <option value="oliver">🇬🇧 oliver</option>
      </optgroup>
      <optgroup label="Multilingual">
        <option value="arjun">🇮🇳 arjun — Hindi/Hinglish</option>
        <option value="ethan">🇨🇳 ethan — Mandarin</option>
        <option value="dmitri">🇷🇺 dmitri — Russian</option>
        <option value="lukas">🇩🇪 lukas — German</option>
        <option value="lena">🇩🇪 lena — German</option>
        <option value="pierre">🇫🇷 pierre — French</option>
        <option value="mina">🇰🇷 mina — Korean</option>
        <option value="ren">🇯🇵 ren — Japanese</option>
        <option value="mei">🇨🇳 mei — Mandarin</option>
        <option value="joon">🇰🇷 joon — Korean</option>
        <option value="giulia">🇮🇹 giulia — Italian</option>
        <option value="luca">🇮🇹 luca — Italian</option>
        <option value="lucia">🇪🇸 lucia — Spanish</option>
        <option value="hana">🇯🇵 hana — Japanese</option>
        <option value="mateo">🇪🇸 mateo — Spanish</option>
        <option value="diego">🇨🇴 diego — Spanish (LatAm)</option>
      </optgroup>
    </select>
  </div>

  <div class="field">
    <label for="prompt">System prompt</label>
    <textarea id="prompt">You are a friendly voice assistant having a casual conversation. Keep replies short and natural — usually one or two sentences. Speak the way a person would in real conversation: relaxed, low-key, no exclamation marks, no over-enthusiastic phrases.</textarea>
  </div>

  <div class="field">
    <label for="greeting">Greeting</label>
    <input id="greeting" value="Hey, what's on your mind?">
  </div>

  <!-- Explicit type="button": the default type is "submit", which would submit
       any form this markup is later wrapped in. The icon is decorative (the
       visible label names the button), so it is hidden from assistive tech. -->
  <button class="btn" id="btn" type="button">
    <svg id="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" aria-hidden="true">
      <rect x="9" y="2" width="6" height="10" rx="3"/>
      <path d="M19 10v1a7 7 0 01-14 0v-1"/><path d="M12 18v4"/><path d="M8 22h8"/>
    </svg>
    <span id="btn-label">Connect</span>
  </button>
</aside>
| 221 | |
<main>
  <div class="transcript" id="log">
    <div class="transcript-hd">
      <span>Transcript</span>
      <!-- Activity chips; the script toggles the "active" class on each. -->
      <div class="speakers">
        <div class="speaker user" id="spk-user"><span class="dot"></span>You</div>
        <div class="speaker agent" id="spk-agent"><span class="dot"></span>Agent</div>
      </div>
    </div>
    <!-- role="log" marks this as a live region whose appended entries are
         announced in order; the script appends one element per transcript line. -->
    <div id="msgs" role="log" aria-label="Conversation transcript">
      <div class="empty" id="empty-msg">Add your API key on the left and click Connect to start the conversation</div>
    </div>
  </div>
</main>
| 236 | </div> |
| 237 | |
| 238 | <script> |
// Shorthand for document.getElementById.
const $ = (id) => document.getElementById(id);
// Sample rate in Hz, used both for the capture AudioContext and for the
// AudioBuffers created from incoming reply audio.
const RATE = 24_000;

// Inline AudioWorklet — captures mic as PCM16 and posts to main thread.
// The processor source lives in a template literal and is exposed through a
// Blob URL so that audioWorklet.addModule() can load it without a second file.
// For each process() call it scales the first channel of the first input by
// 32767, clamps to the signed 16-bit range, and transfers the resulting
// ArrayBuffer over the port. It is registered under the name "pcm".
const workletUrl = URL.createObjectURL(new Blob([`
class P extends AudioWorkletProcessor {
process(inputs) {
const ch = inputs[0]?.[0];
if (ch) {
const buf = new Int16Array(ch.length);
for (let i = 0; i < ch.length; i++)
buf[i] = Math.max(-32768, Math.min(32767, ch[i] * 32767));
this.port.postMessage(buf.buffer, [buf.buffer]);
}
return true;
}
}
registerProcessor("pcm", P);
`], { type: 'application/javascript' }));
| 258 | |
| 259 | // --- Microphone enumeration --- |
/**
 * Rebuilds the #mic <select> from the currently available audio inputs.
 *
 * The list always starts with a "Default microphone" entry (empty value).
 * Devices without a label get a numbered placeholder name. The previous
 * selection is restored when that device is still present. Enumeration
 * failures are logged and otherwise ignored.
 */
async function populateMics() {
  const media = navigator.mediaDevices;
  if (!media?.enumerateDevices) return;

  try {
    const audioInputs = (await media.enumerateDevices())
      .filter((device) => device.kind === 'audioinput');
    const select = $('mic');
    const previous = select.value;

    // Small factory so the default entry and the device entries are built alike.
    const makeOption = (value, text) => {
      const option = document.createElement('option');
      option.value = value;
      option.textContent = text;
      return option;
    };

    while (select.firstChild) select.removeChild(select.firstChild);
    select.appendChild(makeOption('', 'Default microphone'));
    for (const [index, device] of audioInputs.entries()) {
      select.appendChild(makeOption(device.deviceId, device.label || `Microphone ${index + 1}`));
    }

    const stillPresent = audioInputs.some((device) => device.deviceId === previous);
    if (previous && stillPresent) select.value = previous;
  } catch (e) {
    console.warn('enumerateDevices failed', e);
  }
}

// Fill the list once on load, then keep it in sync as devices come and go.
populateMics();
navigator.mediaDevices?.addEventListener?.('devicechange', populateMics);
| 283 | |
| 284 | // --- Voice Agent --- |
// Handles of the current session: WebSocket, AudioContext, microphone MediaStream.
let ws, ctx, mic;

// One button toggles the session: a socket that is CONNECTING (0) or OPEN (1)
// means "disconnect"; anything else (no socket, closing, closed) means "connect".
$('btn').onclick = () => {
  const socketIsLive = ws?.readyState <= 1;
  return socketIsLive ? stop() : start();
};
| 288 | |
/**
 * Opens a voice-agent session.
 *
 * Steps: validate the API key, create a RATE-Hz AudioContext, load the PCM
 * worklet, open the selected microphone, connect the WebSocket, send the
 * session configuration on open, then stream microphone audio up and play
 * reply audio down once the server reports `session.ready`.
 *
 * Handles of the session are published in the globals `ws`, `ctx` and `mic`
 * (so `stop()` and the button handler can see them), but every callback below
 * works on its own local copies. That keeps callbacks safe after `stop()` has
 * nulled the globals, and lets them detect that they belong to a session that
 * has been stopped or replaced (`ws !== socket`).
 *
 * Any failure (setup exception, socket error, server-initiated close)
 * releases the microphone and the AudioContext and resets the button.
 *
 * @returns {Promise<void>}
 */
async function start() {
  const key = $('key').value.trim();
  if (!key) return setStatus('Enter your API key', 'err');
  $('btn').disabled = true;
  setStatus('Connecting…');

  let audio = null, stream = null, socket = null;
  let errorShown = false;      // a session.error message is on screen; don't overwrite it on close
  const playing = new Set();   // reply buffer sources that are scheduled or currently playing

  // Releases everything this session acquired and resets the button.
  // Globals are cleared only if they still point at this session's objects.
  const teardown = () => {
    if (ws === socket) ws = null;
    if (mic === stream) mic = null;
    if (ctx === audio) ctx = null;
    if (socket && socket.readyState <= 1) socket.close();
    stream?.getTracks().forEach(t => t.stop());
    if (audio && audio.state !== 'closed') audio.close().catch(() => {});
    resetUI();
  };

  try {
    ctx = audio = new AudioContext({ sampleRate: RATE });
    await audio.resume();
    await audio.audioWorklet.addModule(workletUrl);
    const deviceId = $('mic').value;
    mic = stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        ...(deviceId ? { deviceId: { exact: deviceId } } : {}),
      },
    });
    // Refresh the device list now that microphone access has been granted.
    populateMics();
    const source = audio.createMediaStreamSource(stream);
    const worklet = new AudioWorkletNode(audio, 'pcm');

    const url = new URL('wss://agents.assemblyai.com/v1/ws');
    url.searchParams.set('token', key);
    ws = socket = new WebSocket(url);
    let ready = false, playT = 0;

    // Microphone → server: each PCM16 chunk from the worklet is base64-encoded
    // and sent as an `input.audio` message once the session is ready.
    worklet.port.onmessage = ({ data }) => {
      if (!ready || ws !== socket || socket.readyState !== 1) return;
      const bytes = new Uint8Array(data);
      socket.send(JSON.stringify({
        type: 'input.audio',
        audio: btoa(String.fromCharCode(...bytes)),
      }));
    };
    // The worklet writes no output; connecting it to the destination is what
    // places it in the rendered graph.
    source.connect(worklet).connect(audio.destination);

    socket.onopen = () => socket.send(JSON.stringify({
      type: 'session.update',
      session: {
        system_prompt: $('prompt').value,
        greeting: $('greeting').value,
        output: { voice: $('voice').value },
      },
    }));

    socket.onmessage = ({ data }) => {
      if (ws !== socket) return;   // session was stopped or replaced
      let m;
      try {
        m = JSON.parse(data);
      } catch (e) {
        console.warn('Ignoring non-JSON message', e);
        return;
      }
      switch (m.type) {
        case 'input.speech.started':
          setSpeaker('user', true); break;
        case 'input.speech.stopped':
          setSpeaker('user', false); break;
        case 'reply.started':
          setSpeaker('agent', true); break;
        case 'session.ready':
          ready = true;
          setStatus('Connected', 'ok');
          $('btn').disabled = false;
          $('btn-label').textContent = 'Disconnect';
          $('btn').classList.add('on');
          clearEmpty();
          break;

        case 'reply.audio': {
          // Payload is decoded as base64 little-endian PCM16, converted to
          // float samples, and queued directly after the previous chunk.
          const raw = atob(m.data);
          const samples = raw.length >> 1;
          if (!samples) break;   // createBuffer() rejects a zero-length buffer
          const buf = audio.createBuffer(1, samples, RATE);
          const out = buf.getChannelData(0);
          for (let i = 0; i < samples; i++) {
            const v = raw.charCodeAt(i * 2) | (raw.charCodeAt(i * 2 + 1) << 8);
            out[i] = (v >= 0x8000 ? v - 0x10000 : v) / 32768;
          }
          const src = audio.createBufferSource();
          src.buffer = buf; src.connect(audio.destination);
          playing.add(src);
          src.onended = () => playing.delete(src);
          playT = Math.max(playT, audio.currentTime);
          src.start(playT); playT += buf.duration;
          break;
        }

        case 'reply.done':
          setSpeaker('agent', false);
          if (m.status === 'interrupted') {
            // Resetting the play cursor alone leaves already-scheduled chunks
            // audible; cancel them so the interrupted reply actually stops.
            for (const s of playing) s.stop();
            playing.clear();
            playT = audio.currentTime;
          }
          break;

        case 'transcript.user':
          addMsg('You', m.text, 'u'); break;

        case 'transcript.agent':
          addMsg('Agent', m.text, 'a'); break;

        case 'session.error':
          errorShown = true;
          setStatus('Error: ' + m.message, 'err'); break;
      }
    };

    // A failed connection fires `error` and then `close`. teardown() clears
    // `ws`, so the close handler below bails out and the error stays visible.
    socket.onerror = () => {
      if (ws !== socket) return;
      teardown();
      setStatus('Connection failed', 'err');
    };
    socket.onclose = () => {
      if (ws !== socket) return;   // already handled by stop() or onerror
      teardown();
      if (!errorShown) setStatus('Disconnected');
    };
  } catch (e) {
    teardown();
    setStatus(e?.message || String(e), 'err');
  }
}
| 389 | |
/**
 * Ends the current session: closes the socket, stops the microphone tracks,
 * closes the AudioContext, clears the session globals and resets the UI.
 * Safe to call when no session exists.
 */
function stop() {
  if (ws) {
    // Detach handlers first. A closing socket can still deliver messages and
    // always fires `close` later; by then `ctx` is null and the UI may belong
    // to a newer session.
    ws.onopen = ws.onmessage = ws.onerror = ws.onclose = null;
    ws.close();
  }
  mic?.getTracks().forEach(t => t.stop());
  // close() rejects on a context that is already closed.
  if (ctx && ctx.state !== 'closed') ctx.close();
  ws = ctx = mic = null;
  resetUI();
  setStatus('Disconnected');
}
| 394 | |
/**
 * Puts the controls back into the idle state: button enabled, labelled
 * "Connect", without the red "on" styling, and both speaker chips inactive.
 */
function resetUI() {
  const button = $('btn');
  button.disabled = false;
  $('btn-label').textContent = 'Connect';
  button.classList.remove('on');
  for (const who of ['user', 'agent']) setSpeaker(who, false);
}
| 402 | |
/**
 * Updates the header status pill.
 * @param {string} msg  Text shown in the pill.
 * @param {string} [cls]  Optional modifier class ('ok' or 'err'); omitted means neutral.
 */
function setStatus(msg, cls) {
  const classes = ['status'];
  if (cls) classes.push(cls);
  $('status-text').textContent = msg;
  $('status').className = classes.join(' ');
}
| 407 | |
/**
 * Switches a speaker chip's "active" highlight on or off.
 * @param {string} who  Chip suffix; the element id is 'spk-' + who ('user' or 'agent').
 * @param {boolean} active  Whether the chip should be highlighted.
 */
function setSpeaker(who, active) {
  const chip = $(`spk-${who}`);
  chip.classList.toggle('active', active);
}
| 411 | |
/** Removes the "empty transcript" placeholder from #msgs, if it is still there. */
function clearEmpty() {
  $('msgs').querySelector('.empty')?.remove();
}
| 416 | |
/**
 * Appends one chat bubble to the transcript and scrolls it into view.
 * Text is assigned through textContent, so it is never parsed as HTML.
 * @param {string} who  Speaker label shown above the text.
 * @param {string} text  Message text.
 * @param {string} cls  Bubble modifier class ('u' for user, 'a' for agent).
 */
function addMsg(who, text, cls) {
  clearEmpty();

  // Builds a <div> with an optional class and plain-text content.
  const makeDiv = (className, content) => {
    const el = document.createElement('div');
    if (className) el.className = className;
    el.textContent = content;
    return el;
  };

  const bubble = document.createElement('div');
  bubble.className = `msg ${cls}`;
  bubble.append(makeDiv('who', who), makeDiv('', text));

  const list = $('msgs');
  list.appendChild(bubble);
  list.scrollTop = list.scrollHeight;
}
| 431 | </script> |
| 432 | </body> |
| 433 | </html> |