Speech Input

A compact ZGI speech-to-text input component with real-time transcription.

"use client"

import { useRef, useState } from "react"
import { toast } from "sonner"

import { getScribeToken } from "@/registry/zgi-ui/blocks/realtime-transcriber-01/actions/get-scribe-token"
import { Input } from "@/components/ui/input"
import {
  SpeechInput,
  SpeechInputCancelButton,
  SpeechInputPreview,
  SpeechInputRecordButton,
} from "@/components/ui/speech-input"
import { Textarea } from "@/components/ui/textarea"

/**
 * Fetches a realtime transcription token for the speech input demos.
 *
 * Calls `getScribeToken` and reads `error` and `token` from its result,
 * turning any failure into a thrown error so the caller only ever receives a
 * usable token string.
 *
 * @returns {Promise<string>} The transcription token.
 * @throws {Error} If the result carries an error or contains no token.
 */
async function getToken() {
  const result = await getScribeToken()
  if (result.error) {
    throw new Error(result.error)
  }
  // Without this guard a result with neither `error` nor `token` would be
  // passed on as `undefined` and fail later, far from the actual cause.
  if (!result.token) {
    throw new Error("No transcription token was returned")
  }
  return result.token
}

/**
 * Textarea with a small speech input pinned to its bottom-right corner.
 *
 * The textarea content at the moment recording starts is kept in a ref; every
 * transcript update is appended to that snapshot, and cancelling restores it.
 */
function TextareaWithSpeechInputRight() {
  const [value, setValue] = useState("")
  // Snapshot of the textarea content taken when recording starts.
  const valueAtStartRef = useRef("")

  return (
    <div className="relative">
      <Textarea
        value={value}
        onChange={(event) => {
          setValue(event.target.value)
        }}
        placeholder="Jot down some thoughts..."
        className="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div className="absolute right-3 bottom-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          getToken={getToken}
          onStart={() => {
            valueAtStartRef.current = value
          }}
          onChange={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onStop={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onCancel={() => {
            setValue(valueAtStartRef.current)
          }}
          onError={(error) => {
            // onError may receive a DOM Event, which stringifies to
            // "[object Event]"; only Error instances carry a useful message.
            toast.error(
              error instanceof Error
                ? error.message
                : "Speech input connection error"
            )
          }}
        >
          <SpeechInputCancelButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputRecordButton />
        </SpeechInput>
      </div>
    </div>
  )
}

/**
 * Textarea with a small speech input pinned to its bottom-left corner, with
 * the record button placed first.
 *
 * The textarea content at the moment recording starts is kept in a ref; every
 * transcript update is appended to that snapshot, and cancelling restores it.
 */
function TextareaWithSpeechInputLeft() {
  const [value, setValue] = useState("")
  // Snapshot of the textarea content taken when recording starts.
  const valueAtStartRef = useRef("")

  return (
    <div className="relative">
      <Textarea
        value={value}
        onChange={(event) => {
          setValue(event.target.value)
        }}
        placeholder="Jot down some thoughts..."
        className="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div className="absolute bottom-3 left-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          getToken={getToken}
          onStart={() => {
            valueAtStartRef.current = value
          }}
          onChange={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onStop={({ transcript }) => {
            setValue(valueAtStartRef.current + transcript)
          }}
          onCancel={() => {
            setValue(valueAtStartRef.current)
          }}
          onError={(error) => {
            // onError may receive a DOM Event, which stringifies to
            // "[object Event]"; only Error instances carry a useful message.
            toast.error(
              error instanceof Error
                ? error.message
                : "Speech input connection error"
            )
          }}
        >
          <SpeechInputRecordButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputCancelButton />
        </SpeechInput>
      </div>
    </div>
  )
}

/**
 * Single-line input with a speech input (cancel and record buttons only, no
 * preview) placed beside it.
 *
 * The input content at the moment recording starts is kept in a ref; every
 * transcript update is appended to that snapshot, and cancelling restores it.
 */
function InputWithSpeechInput() {
  const [value, setValue] = useState("")
  // Snapshot of the input content taken when recording starts.
  const valueAtStartRef = useRef("")

  return (
    <div className="flex items-center gap-2.5">
      <Input
        value={value}
        onChange={(event) => {
          setValue(event.target.value)
        }}
        placeholder="Give this idea a title..."
        className="min-w-0 flex-1 px-3.5 text-base transition-[flex-basis] duration-200 md:text-sm"
      />
      <SpeechInput
        getToken={getToken}
        className="shrink-0"
        onStart={() => {
          valueAtStartRef.current = value
        }}
        onChange={({ transcript }) => {
          setValue(valueAtStartRef.current + transcript)
        }}
        onStop={({ transcript }) => {
          setValue(valueAtStartRef.current + transcript)
        }}
        onCancel={() => {
          setValue(valueAtStartRef.current)
        }}
        onError={(error) => {
          // onError may receive a DOM Event, which stringifies to
          // "[object Event]"; only Error instances carry a useful message.
          toast.error(
            error instanceof Error
              ? error.message
              : "Speech input connection error"
          )
        }}
      >
        <SpeechInputCancelButton />
        <SpeechInputRecordButton />
      </SpeechInput>
    </div>
  )
}

/**
 * Demo surface that stacks the three speech input examples: the two textarea
 * variants (controls on the right, then on the left) followed by the
 * single-line input variant.
 */
export function SpeechInputDemo() {
  const containerClassName =
    "absolute inset-0 space-y-4 overflow-auto rounded-2xl p-10"

  return (
    <div className={containerClassName}>
      <TextareaWithSpeechInputRight />
      <TextareaWithSpeechInputLeft />
      <InputWithSpeechInput />
    </div>
  )
}

Installation


pnpm dlx shadcn@latest add https://ui.zgi.ai/r/speech-input.json

Usage

import {
  SpeechInput,
  SpeechInputCancelButton,
  SpeechInputPreview,
  SpeechInputRecordButton,
} from "@/components/ui/speech-input"

Basic Usage

/**
 * Requests a realtime transcription token from the app's
 * `/api/get-scribe-token` endpoint.
 *
 * @returns {Promise<string>} The token from the JSON response.
 * @throws {Error} If the request fails or the response carries no token.
 */
async function getToken() {
  const response = await fetch("/api/get-scribe-token", { method: "POST" })
  // fetch only rejects on network failure; HTTP errors must be checked here.
  if (!response.ok) {
    throw new Error(`Token request failed with status ${response.status}`)
  }
  const { token } = await response.json()
  if (!token) {
    throw new Error("Token response did not include a token")
  }
  return token
}
 
/**
 * Minimal example: logs the transcript each time it changes and logs the
 * final transcript when recording stops.
 */
export default function Example() {
  return (
    <SpeechInput
      getToken={getToken}
      onChange={({ transcript }) => {
        console.log(transcript)
      }}
      onStop={({ transcript }) => {
        console.log("Final:", transcript)
      }}
    >
      <SpeechInputRecordButton />
      <SpeechInputPreview placeholder="Start speaking..." />
      <SpeechInputCancelButton />
    </SpeechInput>
  )
}

With Form Input

import { useState } from "react"
 
/**
 * Example: a controlled text input whose value is extended with the final
 * transcript each time a recording stops.
 */
export default function Example() {
  const [value, setValue] = useState("")

  return (
    <div className="flex items-center gap-2">
      <input
        value={value}
        onChange={(e) => setValue(e.target.value)}
        className="flex-1 rounded border px-3 py-2"
      />
      <SpeechInput
        getToken={getToken}
        onStop={({ transcript }) =>
          // Only add the separating space when there is existing text, so
          // dictating into an empty field does not start with a space.
          setValue((prev) => (prev ? `${prev} ${transcript}` : transcript))
        }
      >
        <SpeechInputRecordButton />
        <SpeechInputPreview />
        <SpeechInputCancelButton />
      </SpeechInput>
    </div>
  )
}

Reversed Layout

The component automatically adjusts its layout based on child order:

<SpeechInput getToken={getToken}>
  <SpeechInputCancelButton />
  <SpeechInputPreview />
  <SpeechInputRecordButton />
</SpeechInput>

Minimal (Record Button Only)

<SpeechInput
  getToken={getToken}
  onStop={(data) => console.log(data.transcript)}
>
  <SpeechInputRecordButton />
</SpeechInput>

Custom Placeholder

<SpeechInput getToken={getToken}>
  <SpeechInputRecordButton />
  <SpeechInputPreview placeholder="Say something..." />
  <SpeechInputCancelButton />
</SpeechInput>

Using the Hook

Access the speech input context in child components:

import { useSpeechInput } from "@/components/ui/speech-input"
 
/**
 * Reads the speech input context and renders the connection status
 * ("Connecting", "Recording" or "Idle") together with the current transcript.
 */
function TranscriptDisplay() {
  const speech = useSpeechInput()

  // Connecting takes precedence over connected, matching the status order.
  let statusLabel = "Idle"
  if (speech.isConnecting) {
    statusLabel = "Connecting"
  } else if (speech.isConnected) {
    statusLabel = "Recording"
  }

  return (
    <div>
      <p>
        Status:{" "}
        {statusLabel}
      </p>
      <p>Transcript: {speech.transcript}</p>
    </div>
  )
}

Server Action for Token

Create a server action to securely fetch the Scribe token:

app/actions/get-scribe-token.ts

"use server"
 
/**
 * Requests a realtime transcription token using the API key stored in the
 * `ELEVENLABS_API_KEY` environment variable.
 *
 * The request asks for the `scribe_v2_realtime` model with `ttl_secs: 300`.
 *
 * @returns {Promise<string>} The `token` field of the JSON response.
 * @throws {Error} If the API key is not set, the request fails, or the
 *   response carries no token.
 */
export async function getScribeToken() {
  const apiKey = process.env.ELEVENLABS_API_KEY
  // Fail early with a clear message instead of sending a request without a key.
  if (!apiKey) {
    throw new Error("ELEVENLABS_API_KEY is not set")
  }

  const response = await fetch(
    "https://api.elevenlabs.io/v1/speech-to-text/get-realtime-token",
    {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": apiKey,
      },
      body: JSON.stringify({
        model_id: "scribe_v2_realtime",
        ttl_secs: 300,
      }),
    }
  )
  // fetch only rejects on network failure; HTTP errors must be checked here.
  if (!response.ok) {
    throw new Error(`Token request failed with status ${response.status}`)
  }

  const data = await response.json()
  if (!data.token) {
    throw new Error("Token response did not include a token")
  }
  return data.token
}

API Reference

SpeechInput

The root component that manages speech-to-text state and provides context to child components.

Props

Prop	Type	Default	Description
children	`ReactNode`	-	Child components (record button, preview, cancel)
getToken	`() => Promise<string>`	-	Function to fetch a realtime transcription token
onChange	`(data: SpeechInputData) => void`	-	Called when transcript changes
onStart	`(data: SpeechInputData) => void`	-	Called when recording starts
onStop	`(data: SpeechInputData) => void`	-	Called when recording stops
onCancel	`(data: SpeechInputData) => void`	-	Called when recording is cancelled
onError	`(error: Error | Event) => void`	-	Called on connection errors
onAuthError	`(data: { error: string }) => void`	-	Called on authentication errors
onQuotaExceededError	`(data: { error: string }) => void`	-	Called when quota is exceeded
modelId	`string`	`"scribe_v2_realtime"`	Transcription model ID
baseUri	`string`	-	Custom WebSocket base URI
commitStrategy	`CommitStrategy`	`"vad"`	How transcripts are committed (`"manual"` or `"vad"`)
vadSilenceThresholdSecs	`number`	-	VAD silence threshold (0.3-3.0)
vadThreshold	`number`	-	VAD threshold (0.1-0.9)
minSpeechDurationMs	`number`	-	Minimum speech duration (50-2000ms)
minSilenceDurationMs	`number`	-	Minimum silence duration (50-2000ms)
languageCode	`string`	-	ISO-639-1/3 language code
microphone	`MicrophoneOptions`	See below	Microphone configuration
audioFormat	`AudioFormat`	-	Audio format for manual streaming
sampleRate	`number`	-	Sample rate for manual streaming
className	`string`	-	Optional CSS classes

Default Microphone Options

{
  echoCancellation: true,
  noiseSuppression: true
}

SpeechInputRecordButton

Toggle button that switches between microphone icon (idle), connecting indicator, and stop icon (recording).

Props

Prop	Type	Description
className	`string`	Optional CSS classes
disabled	`boolean`	Disable the button
onClick	`(e: MouseEvent) => void`	Additional click handler
...props	`ComponentPropsWithoutRef<"button">`	All button props

SpeechInputPreview

Displays the current transcript with smooth text animations.

Props

Prop	Type	Default	Description
placeholder	`string`	`"Listening..."`	Text shown when empty
className	`string`	-	Optional CSS classes
...props	`ComponentPropsWithoutRef<"div">`	-	All div props

SpeechInputCancelButton

Button to cancel the current recording and clear the transcript.

Props

Prop	Type	Description
className	`string`	Optional CSS classes
onClick	`(e: MouseEvent) => void`	Additional click handler
...props	`ComponentPropsWithoutRef<"button">`	All button props

useSpeechInput

Hook to access speech input context from child components.

Returns

Property	Type	Description
isConnected	`boolean`	Whether currently connected/recording
isConnecting	`boolean`	Whether connection is in progress
transcript	`string`	Full transcript (committed + partial)
partialTranscript	`string`	Current partial transcript
committedTranscripts	`string[]`	Array of committed transcripts
error	`string | null`	Current error message
start	`() => Promise<void>`	Start recording
stop	`() => void`	Stop recording
cancel	`() => void`	Cancel and clear transcript

SpeechInputData

Data object passed to callbacks.

/**
 * Transcript snapshot passed to the SpeechInput callbacks
 * (onChange, onStart, onStop and onCancel).
 */
interface SpeechInputData {
  /** Current partial transcript. */
  partialTranscript: string
  /** Array of committed transcripts. */
  committedTranscripts: string[]
  transcript: string // Combined full transcript (committed + partial)
}

CommitStrategy

/**
 * How transcripts are committed; the value type of the `commitStrategy`
 * prop, whose documented default is `"vad"`.
 */
enum CommitStrategy {
  MANUAL = "manual",
  /** Voice Activity Detection: commits transcripts automatically during speech pauses. */
  VAD = "vad",
}

AudioFormat

/**
 * Audio format for manual streaming; the value type of the `audioFormat`
 * prop.
 *
 * NOTE(review): member names look like an encoding followed by a sample rate
 * in Hz (e.g. PCM at 16000 Hz) — confirm against the provider documentation.
 */
enum AudioFormat {
  PCM_8000 = "pcm_8000",
  PCM_16000 = "pcm_16000",
  PCM_22050 = "pcm_22050",
  PCM_24000 = "pcm_24000",
  PCM_44100 = "pcm_44100",
  PCM_48000 = "pcm_48000",
  ULAW_8000 = "ulaw_8000",
}

Features

Real-time Transcription: Live speech-to-text using a realtime transcription provider
Compound Components: Flexible composition with record button, preview, and cancel
Animated Transitions: Smooth expand/collapse animations using Framer Motion
Voice Activity Detection: Automatic transcript commits based on speech pauses
Visual Feedback: Distinct states for idle, connecting, and recording
Accessibility: Proper ARIA labels and keyboard interaction

Notes

Requires a provider API key for generating realtime transcription tokens
Token generation should happen server-side to protect your API key
The component automatically handles microphone permissions
Uses WebSocket for real-time communication with the transcription provider API
VAD (Voice Activity Detection) mode automatically commits transcripts during pauses
The preview component uses a gradient mask for text overflow
Layout automatically adjusts based on whether the record button is first or last

Shimmering Text Token Trend Chart

Deploy and Scale Agents with ZGI

ZGI delivers the infrastructure and developer experience you need to ship reliable audio & agent applications at scale.