> For clean Markdown of any page, append `.md` to the page URL.
> For a complete documentation index, see https://docs.sarvam.ai/llms.txt.
> For full documentation content in one file, see https://docs.sarvam.ai/llms-full.txt.
> For AI client integration (Claude Code, Cursor, etc.), connect to the MCP server at https://docs.sarvam.ai/_mcp/server.

# WebSocket

GET /speech-to-text-translate/ws

WebSocket channel for real-time speech to text streaming with English translation.

**Note:** This API Reference page is provided for informational purposes only. 
The Try It playground may not provide the best experience for streaming audio. 
For optimal streaming performance, please use the SDK or implement your own WebSocket client.


Reference: https://docs.sarvam.ai/api-reference-docs/speech-to-text-translate/translate/ws

## AsyncAPI Specification

```yaml
# AsyncAPI document header: specification version plus the title, version
# string and Markdown description of this streaming API.
asyncapi: 2.6.0
info:
  title: speechToTextTranslateStreaming
  version: subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming
  # Folded scalar: lines carry no trailing spaces, so no stray whitespace ends
  # up inside the parsed description.
  description: >
    WebSocket channel for real-time speech to text streaming with English
    translation.


    **Note:** This API Reference page is provided for informational purposes
    only.

    The Try It playground may not provide the best experience for streaming
    audio.

    For optimal streaming performance, please use the SDK or implement your own
    WebSocket client.
channels:
  /speech-to-text-translate/ws:
    # Markdown description of the channel. Folded scalar without trailing
    # spaces, so no stray whitespace ends up inside the parsed value.
    description: >
      WebSocket channel for real-time speech to text streaming with English
      translation.


      **Note:** This API Reference page is provided for informational purposes
      only.

      The Try It playground may not provide the best experience for streaming
      audio.

      For optimal streaming performance, please use the SDK or implement your
      own WebSocket client.
    # WebSocket handshake bindings: query-string parameters and request headers.
    bindings:
      ws:
        query:
          type: object
          properties:
            # NOTE(review): JSON Schema draft-07 resolvers ignore keywords placed
            # next to `$ref`; confirm the documentation generator honours the
            # sibling `default` values on the three entries below.
            model:
              $ref: '#/components/schemas/speechToTextTranslateStreaming_model'
              default: saaras:v3
            mode:
              $ref: '#/components/schemas/speechToTextTranslateStreaming_mode'
              default: translate
            sample_rate:
              $ref: '#/components/schemas/speechToTextTranslateStreaming_sample_rate'
              # Quoted: the referenced enum holds the strings '16000' / '8000'.
              default: '16000'
            high_vad_sensitivity:
              $ref: >-
                #/components/schemas/speechToTextTranslateStreaming_high_vad_sensitivity
            # The tunables below are all declared `type: string`, so every
            # default is quoted; an unquoted 0.7 or 2 would parse as a number
            # and contradict the declared type.
            positive_speech_threshold:
              type: string
              default: '0.7'
            negative_speech_threshold:
              type: string
              default: '0.45'
            min_speech_frames:
              type: string
              default: '2'
            first_turn_min_speech_frames:
              type: string
              default: '8'
            negative_frames_count:
              type: string
              default: '18'
            negative_frames_window:
              type: string
              default: '24'
            # No default is declared for this parameter.
            start_speech_volume_threshold:
              type: string
            interrupt_min_speech_frames:
              type: string
              default: '2'
            pre_speech_pad_frames:
              type: string
              default: '9'
            num_initial_ignored_frames:
              type: string
              default: '0'
            vad_signals:
              $ref: '#/components/schemas/speechToTextTranslateStreaming_vad_signals'
            flush_signal:
              $ref: '#/components/schemas/speechToTextTranslateStreaming_flush_signal'
            input_audio_codec:
              $ref: >-
                #/components/schemas/speechToTextTranslateStreaming_input_audio_codec
        headers:
          type: object
          properties:
            Api-Subscription-Key:
              type: string
    # Operation carrying the result message (transcription / translation
    # output) whose payload is the streaming response envelope.
    # NOTE(review): this operation holds the results the client receives while
    # `subscribe` holds the client messages; confirm that direction matches the
    # publish/subscribe convention expected by downstream AsyncAPI tooling.
    publish:
      summary: Translation
      operationId: speech-to-text-translate-streaming-publish
      description: 'Receive real-time transcription and translation results from the WebSocket'
      message:
        title: Translation
        name: Translation
        description: 'Receive real-time transcription and translation results from the WebSocket'
        payload:
          $ref: '#/components/schemas/speechToTextTranslateStreaming_speechToTextTranslateStreamingResponse'
    # Operation listing the three client message kinds (audio, config, flush).
    # Each reference is written on one line so the space inside the message
    # name is explicit rather than produced by scalar folding.
    subscribe:
      summary: Client messages
      operationId: speech-to-text-translate-streaming-subscribe
      message:
        oneOf:
          - $ref: '#/components/messages/subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming-client-0-Audio Translation Message'
          - $ref: '#/components/messages/subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming-client-1-Translation Config Message'
          - $ref: '#/components/messages/subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming-client-2-Speech Translate Flush Signal'
# Server list: a single secure-WebSocket endpoint, flagged as the default.
servers:
  Production:
    protocol: wss
    url: 'wss://api.sarvam.ai/'
    x-default: true
components:
  # Reusable message definitions referenced from the channel's `subscribe`
  # operation. The keys contain spaces, so they are quoted here.
  messages:
    # Audio chunk message.
    'subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming-client-0-Audio Translation Message':
      title: Audio Translation Message
      name: Audio Translation Message
      description: 'Send audio data for real-time speech to text streaming with translation'
      payload:
        $ref: '#/components/schemas/speechToTextTranslateStreaming_audioMessage'
    # Configuration message.
    'subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming-client-1-Translation Config Message':
      title: Translation Config Message
      name: Translation Config Message
      description: 'Send configuration for speech to text streaming with translation'
      payload:
        $ref: '#/components/schemas/speechToTextTranslateStreaming_configMessage'
    # Flush-signal message.
    'subpackage_speechToTextTranslateStreaming.speechToTextTranslateStreaming-client-2-Speech Translate Flush Signal':
      title: Speech Translate Flush Signal
      name: Speech Translate Flush Signal
      description: 'Send signal to flush audio buffer and finalize transcription and translation'
      payload:
        $ref: '#/components/schemas/speechToTextTranslateStreaming_flushSignal'
  schemas:
    # String enum selecting the translation model. `saaras:v3` is the declared
    # default; `saaras:v2.5` is described as the legacy option.
    speechToTextTranslateStreaming_model:
      type: string
      enum:
        - saaras:v3
        - saaras:v2.5
      default: saaras:v3
      # Folded scalar holding Markdown. The doubled blank lines survive folding
      # as an empty line between bullets, and the more-indented "Example" line
      # keeps its own line break; preserve both when editing.
      description: >
        Model to be used for speech to text translation.


        - **saaras:v3** (default, recommended): State-of-the-art translation
        model that translates audio from any spoken Indic language to English
        with flexible output formats via the `mode` parameter.


        - **saaras:v2.5** (legacy): Translation model that translates audio from
        any spoken Indic language to English. Kept for backward compatibility.
          - Example: Hindi audio → English text output
      title: speechToTextTranslateStreaming_model
    # String enum selecting the output mode; `translate` is the declared
    # default. Per the description, the parameter only applies to saaras:v3.
    speechToTextTranslateStreaming_mode:
      type: string
      enum:
        - translate
        - transcribe
        - verbatim
        - translit
        - codemix
      default: translate
      # Folded scalar holding Markdown; blank lines and the more-indented
      # "Example" line determine where line breaks survive folding, so keep
      # the whitespace layout intact when editing.
      description: >
        Mode of operation. **Only applicable when using saaras:v3 model.**


        - **translate** (default): Translates speech from any supported Indic
        language to English.
          - Example: Hindi audio → English text output

        - **transcribe**: Standard transcription in the original language.


        - **verbatim**: Exact word-for-word transcription without normalization.


        - **translit**: Romanization - Transliterates speech to Latin/Roman
        script only.


        - **codemix**: Code-mixed text with English words in English and Indic
        words in native script.
      title: speechToTextTranslateStreaming_mode
    # Connection-level sample rate, passed as a string; only 16000 and 8000
    # are accepted.
    speechToTextTranslateStreaming_sample_rate:
      type: string
      enum:
        - '16000'
        - '8000'
      # The description promises a 16kHz default, so it is declared on the
      # schema itself: a `default` written next to a `$ref` at the point of use
      # is ignored by JSON Schema draft-07 resolvers.
      default: '16000'
      description: >-
        Audio sample rate for the WebSocket connection. When specified as a
        connection parameter, only 16kHz and 8kHz are supported. 8kHz is only
        available via this connection parameter. If not specified, defaults to
        16kHz.
      title: speechToTextTranslateStreaming_sample_rate
    # String-typed flag: the accepted values are the strings "true" and
    # "false", not YAML booleans.
    speechToTextTranslateStreaming_high_vad_sensitivity:
      title: speechToTextTranslateStreaming_high_vad_sensitivity
      description: 'Enable high VAD (Voice Activity Detection) sensitivity'
      type: string
      enum: ["true", "false"]
    # String-typed flag ("true" / "false") that turns on VAD signals in the
    # response.
    speechToTextTranslateStreaming_vad_signals:
      title: speechToTextTranslateStreaming_vad_signals
      description: 'Enable VAD signals in response'
      type: string
      enum: ["true", "false"]
    # String-typed flag ("true" / "false") for the flush-signal query
    # parameter.
    speechToTextTranslateStreaming_flush_signal:
      title: speechToTextTranslateStreaming_flush_signal
      description: 'Signal to flush the audio buffer and finalize transcription and translation'
      type: string
      enum: ["true", "false"]
    # String enum naming the codec/format of the incoming audio stream.
    speechToTextTranslateStreaming_input_audio_codec:
      title: speechToTextTranslateStreaming_input_audio_codec
      type: string
      enum: [wav, pcm_s16le, pcm_l16, pcm_raw]
      # Folded scalar: the blank line keeps "Supported values" on its own line.
      description: >
        Audio codec/format of the input stream. Use this when sending raw PCM
        audio.

        Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.
    # String enum tagging a WebSocket response as data, error or events.
    ResponseType:
      title: ResponseType
      description: 'Type of WebSocket response'
      type: string
      enum: [data, error, events]
    # Timing figures attached to a transcription result; both are required
    # and expressed in seconds.
    TranscriptionMetrics:
      title: TranscriptionMetrics
      type: object
      required: [audio_duration, processing_latency]
      properties:
        audio_duration:
          description: 'Duration of processed audio in seconds'
          type: number
          format: double
        processing_latency:
          description: 'Processing latency in seconds'
          type: number
          format: double
    # Result payload: request id, transcript text, detected-language details
    # and timing metrics. `request_id`, `transcript` and `metrics` are required.
    SpeechToTextTranslateTranscriptionData:
      type: object
      properties:
        request_id:
          type: string
          description: Unique identifier for the request
        # NOTE(review): described as an English translation, while the `mode`
        # enum also offers transcribe / verbatim / translit / codemix — confirm
        # the wording still holds for those modes.
        transcript:
          type: string
          description: English translation of the provided speech
        # Nullable: the type list allows both a string and null.
        language_code:
          type:
            - string
            - 'null'
          description: >-
            BCP-47 code of detected source language (null when language
            detection is in progress)
        # Nullable number; the description below spells out when a value versus
        # null is returned.
        language_probability:
          type:
            - number
            - 'null'
          format: double
          description: >
            Float value (0.0 to 1.0) indicating the probability of the detected
            language being correct. Higher values indicate higher confidence.


            **When it returns a value:**

            - When `language_code` is not provided in the request

            - When `language_code` is set to `unknown`


            **When it returns null:**

            - When a specific `language_code` is provided (language detection is
            skipped)


            The parameter is always present in the response.
        metrics:
          $ref: '#/components/schemas/TranscriptionMetrics'
      # NOTE(review): `language_probability` is described as always present but
      # is not listed here — confirm whether it should be required.
      required:
        - request_id
        - transcript
        - metrics
      title: SpeechToTextTranslateTranscriptionData
    # Error payload: a message plus a code, both required strings.
    ErrorData:
      title: ErrorData
      type: object
      required: [error, code]
      properties:
        error:
          description: 'Error message'
          type: string
        code:
          description: 'Error code'
          type: string
    # String enum for the two VAD signal values.
    EventsDataSignalType:
      title: EventsDataSignalType
      description: 'VAD signal type'
      type: string
      enum: [START_SPEECH, END_SPEECH]
    # VAD event payload. No property is marked required; the description notes
    # that fields may vary by event type.
    EventsData:
      title: EventsData
      type: object
      description: "VAD events are sent when vad_signals=true. Fields may vary by event type.\n"
      properties:
        event_type:
          description: 'Type of event'
          type: string
        timestamp:
          description: 'Event timestamp'
          type: string
          format: date-time
        signal_type:
          $ref: '#/components/schemas/EventsDataSignalType'
          description: 'VAD signal type'
        # NOTE(review): the key is spelled `occured_at` (single "r"). It is a
        # property name, so correcting the spelling would change the schema —
        # confirm against the actual server payload before renaming.
        occured_at:
          description: 'Epoch timestamp when the event occurred'
          type: number
          format: double
    # Union of the three payload shapes: transcription result, error, or
    # VAD events.
    SpeechToTextTranslateResponseData:
      title: SpeechToTextTranslateResponseData
      oneOf:
        - $ref: '#/components/schemas/SpeechToTextTranslateTranscriptionData'
        - $ref: '#/components/schemas/ErrorData'
        - $ref: '#/components/schemas/EventsData'
    # Response envelope: `type` is a ResponseType value and `data` is one of
    # the SpeechToTextTranslateResponseData alternatives; both are required.
    speechToTextTranslateStreaming_speechToTextTranslateStreamingResponse:
      title: speechToTextTranslateStreaming_speechToTextTranslateStreamingResponse
      type: object
      required: [type, data]
      properties:
        type:
          $ref: '#/components/schemas/ResponseType'
        data:
          $ref: '#/components/schemas/SpeechToTextTranslateResponseData'
    # Per-message sample rate, as a string enum. The description marks it as a
    # legacy field and recommends the connection-level `sample_rate` instead.
    AudioDataSampleRate:
      type: string
      enum:
        - '16000'
        - '22050'
        - '24000'
      # Folded scalar holding Markdown; lines carry no trailing spaces, so no
      # stray whitespace ends up inside the parsed value.
      description: >
        Audio sample rate in Hz for individual audio messages.


        **Backward Compatibility**: This property is maintained for legacy
        support.

        **Recommended**: Use the connection-level sample_rate parameter instead.

        **Note**: 8kHz is only supported via connection parameter, not in
        AudioData messages.


        Supported values: 16kHz (preferred), 22.05kHz, 24kHz
      title: AudioDataSampleRate
    # Single-value string enum: `audio/wav` is both the only option and the
    # default.
    AudioDataEncoding:
      title: AudioDataEncoding
      description: 'Audio encoding format'
      type: string
      enum: [audio/wav]
      default: audio/wav
    # Audio chunk: base64 payload plus its sample rate and encoding; all three
    # properties are required.
    AudioData:
      type: object
      properties:
        data:
          type: string
          # NOTE(review): `base64` is not one of the formats defined by JSON
          # Schema draft-07; confirm consumers rely on it before changing it.
          format: base64
          description: Base64 encoded audio data
        sample_rate:
          $ref: '#/components/schemas/AudioDataSampleRate'
          # Folded scalar without trailing spaces, so no stray whitespace ends
          # up inside the parsed value.
          description: >
            Audio sample rate in Hz for individual audio messages.


            **Backward Compatibility**: This property is maintained for legacy
            support.

            **Recommended**: Use the connection-level sample_rate parameter
            instead.

            **Note**: 8kHz is only supported via connection parameter, not in
            AudioData messages.


            Supported values: 16kHz (preferred), 22.05kHz, 24kHz
        encoding:
          $ref: '#/components/schemas/AudioDataEncoding'
          description: Audio encoding format
      # NOTE(review): `sample_rate` is required here although its description
      # calls it a legacy field and recommends the connection-level parameter —
      # confirm this is intended.
      required:
        - data
        - sample_rate
        - encoding
      title: AudioData
    # Audio message body: an object whose single, required `audio` property is
    # an AudioData chunk.
    speechToTextTranslateStreaming_audioMessage:
      title: speechToTextTranslateStreaming_audioMessage
      type: object
      required: [audio]
      properties:
        audio:
          $ref: '#/components/schemas/AudioData'
    # Single-value string enum: the `type` tag of a configuration message is
    # always "config".
    ChannelsSpeechToTextTranslateStreamingMessagesConfigMessageType:
      title: ChannelsSpeechToTextTranslateStreamingMessagesConfigMessageType
      description: 'Message type identifier for configuration'
      type: string
      enum: [config]
      default: config
    # Configuration message body: a required `type` tag plus an optional
    # `prompt` string that defaults to the empty string.
    speechToTextTranslateStreaming_configMessage:
      title: speechToTextTranslateStreaming_configMessage
      type: object
      required: [type]
      properties:
        type:
          $ref: '#/components/schemas/ChannelsSpeechToTextTranslateStreamingMessagesConfigMessageType'
          description: 'Message type identifier for configuration'
        prompt:
          description: 'Prompt for ASR model to improve transcription accuracy.'
          type: string
          default: ''
    # Single-value string enum: the `type` tag of a flush signal is always
    # "flush".
    ChannelsSpeechToTextTranslateStreamingMessagesFlushSignalType:
      title: ChannelsSpeechToTextTranslateStreamingMessagesFlushSignalType
      description: 'Type identifier for flush signal'
      type: string
      enum: [flush]
      default: flush
    # Flush-signal message body: an object carrying only the required `type`
    # tag.
    speechToTextTranslateStreaming_flushSignal:
      title: speechToTextTranslateStreaming_flushSignal
      description: 'Signal to flush the audio buffer and force finalize partial transcriptions/translations'
      type: object
      required: [type]
      properties:
        type:
          $ref: '#/components/schemas/ChannelsSpeechToTextTranslateStreamingMessagesFlushSignalType'
          description: 'Type identifier for flush signal'

```