Speech-to-Text Streaming API

The Streaming API is served over the WebSocket protocol.

Before using the streaming API, obtain an API key from Prosa Console.

Example Code

Here are some code examples to help you get started quickly.

Python / Node.js

import asyncio
import json

import websockets


async def main():
    """Run one streaming session: connect, configure it, then stream audio while reading results."""
    endpoint = "wss://s-api.prosa.ai/v2/speech/stt"
    audio_path = "audio_file.mp3"
    api_key = "..."

    # The session configuration is the first message sent on this connection.
    session_config = json.dumps({
        "label": "Streaming Test",
        "model": "stt-general-online",
        "include_partial": False,
    })

    # The API key travels in the "x-api-key" HTTP header of the connection request.
    async with websockets.connect(endpoint, extra_headers={"x-api-key": api_key}) as ws:
        await ws.send(session_config)

        # Sending audio and receiving results run concurrently on the same connection.
        sender = send_audio(audio_path, ws)
        receiver = receive_message(ws)
        await asyncio.gather(sender, receiver)


async def send_audio(filename: str, ws, chunk_size: int = 16000):
    """Send the file at ``filename`` over ``ws`` in pieces of at most ``chunk_size`` bytes.

    After the last piece, an empty bytes message is sent to signify the end of
    the audio stream.
    """
    with open(filename, "rb") as audio:
        # iter() with a sentinel keeps calling read() until it returns b"" (end of file).
        for piece in iter(lambda: audio.read(chunk_size), b""):
            await ws.send(piece)
        await ws.send(b"")


async def receive_message(ws):
    """Read server messages from ``ws`` until the connection is closed.

    Each message is decoded from JSON, handled according to its "type"
    attribute, and then printed.
    """
    # Iterating over the connection ends the loop once the server closes it
    # normally. A bare ``while True`` around ``recv()`` would instead raise on
    # the closed connection and abort the surrounding ``asyncio.gather``.
    async for raw_message in ws:
        data = json.loads(raw_message)

        # Identify message type
        message_type = data["type"]

        if message_type == "result":
            transcript = data["transcript"]
            # Process final transcript
        elif message_type == "partial":
            transcript = data["transcript"]
            # Process partial transcript
        print(data)

# Run main() on a new event loop only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

const WebSocket = require('ws');
const fs = require('fs');

/**
 * Complete streaming example: connect, authenticate, configure the session,
 * send an audio file and handle the transcription messages.
 */
(async () => {
  // Setup
  const url = 'wss://s-api.prosa.ai/v2/speech/stt';
  const apiKey = '...';
  const filename = 'audio_file.mp3';

  const client = new WebSocket(url);

  // Wait for the client to connect. A failure before the connection opens
  // rejects the promise so it reaches the catch handler at the bottom.
  await new Promise((resolve, reject) => {
    client.once('error', reject);
    client.once('open', () => {
      client.off('error', reject);
      resolve();
    });
  });

  // Authenticate via websocket message
  client.send(JSON.stringify({
    token: apiKey,
  }));

  // Configure the session
  client.send(JSON.stringify({
    model: 'stt-general-online',
    label: 'This is example streaming session',
  }));

  // Send audio data via websocket
  const stream = fs.createReadStream(filename);

  stream.on('readable', () => {
    // Read file to send
    let chunk;
    while ((chunk = stream.read(16000)) !== null) {
      client.send(chunk, { binary: true });
    }
  });

  // 'end' fires only once the whole file has been read.
  stream.on('end', () => {
    // An empty binary message signifies the end of the audio stream
    client.send(Buffer.alloc(0), { binary: true });
  });

  // Receive results via websocket
  client.on('message', (msg) => {
    const data = JSON.parse(msg);

    // The "type" attribute identifies the kind of message
    const messageType = data.type;

    if (messageType === 'result') {
      const transcript = data.transcript;
      // Process final transcript
    } else if (messageType === 'partial') {
      const transcript = data.transcript;
      // Process partial transcript
    }
  });
})().catch((err) => {
  // Report setup failures instead of leaving the promise unhandled
  console.error(err);
  process.exitCode = 1;
});

Authentication

The client starts a session by opening a WebSocket connection and providing the authentication token either via the x-api-key HTTP header or as the first WebSocket message.

HTTP Header Authentication

Some WebSocket client implementations allow extra HTTP headers to be included in the WebSocket connection request.

Python

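With the websockets library, extra headers can be included via the extra_headers keyword argument. (Note: newer releases of the library appear to name this argument additional_headers; check the documentation for the version you have installed.)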

import asyncio

import websockets


async def main():
    """Open a connection that authenticates through the "x-api-key" HTTP header."""
    endpoint = "wss://s-api.prosa.ai/v2/speech/stt"
    api_key = "..."

    # The API key is passed as an extra header on the connection request.
    async with websockets.connect(endpoint, extra_headers={"x-api-key": api_key}) as ws:
        pass


# Run main() on a new event loop only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

Websocket Message Authentication

If the API key cannot be included in an HTTP header, the client must send it as the first WebSocket message.

Python / Node.js

import asyncio
import json

import websockets


async def main():
    """Open a connection and authenticate by sending the API key as a websocket message."""
    endpoint = "wss://s-api.prosa.ai/v2/speech/stt"
    api_key = "..."

    # The token message is the first message sent on the connection.
    auth_message = json.dumps({"token": api_key})

    async with websockets.connect(endpoint) as ws:
        await ws.send(auth_message)


# Run main() on a new event loop only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

const WebSocket = require('ws');

/**
 * Authentication example: connect, then send the API key as the first
 * websocket message.
 */
(async () => {
  // Setup
  const url = 'wss://s-api.prosa.ai/v2/speech/stt';
  const apiKey = '...';

  const client = new WebSocket(url);

  // Wait for the client to connect. A failure before the connection opens
  // rejects the promise so it reaches the catch handler at the bottom.
  await new Promise((resolve, reject) => {
    client.once('error', reject);
    client.once('open', () => {
      client.off('error', reject);
      resolve();
    });
  });

  // Authenticate via websocket message
  client.send(JSON.stringify({
    token: apiKey,
  }));
})().catch((err) => {
  // Report setup failures instead of leaving the promise unhandled
  console.error(err);
  process.exitCode = 1;
});

Info

See ApiKey for more information on the message.

Configuring Session

Before sending the audio data, the server expects a message containing the configuration for this session.

Python / Node.js

import asyncio
import json

import websockets


async def main():
    """Open an authenticated connection and send the session configuration."""
    endpoint = "wss://s-api.prosa.ai/v2/speech/stt"
    api_key = "..."

    # Configuration message for this session.
    session_config = json.dumps({
        "model": "stt-general-online",
        "label": "This is example streaming session",
    })

    # The API key travels in the "x-api-key" HTTP header of the connection request.
    async with websockets.connect(endpoint, extra_headers={"x-api-key": api_key}) as ws:
        await ws.send(session_config)


# Run main() on a new event loop only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

const WebSocket = require('ws');

/**
 * Session configuration example: connect, authenticate, then send the
 * configuration message for the session.
 */
(async () => {
  // Setup
  const url = 'wss://s-api.prosa.ai/v2/speech/stt';
  const apiKey = '...';

  const client = new WebSocket(url);

  // Wait for the client to connect. A failure before the connection opens
  // rejects the promise so it reaches the catch handler at the bottom.
  await new Promise((resolve, reject) => {
    client.once('error', reject);
    client.once('open', () => {
      client.off('error', reject);
      resolve();
    });
  });

  // Authenticate via websocket message
  client.send(JSON.stringify({
    token: apiKey,
  }));

  // Configure the session
  client.send(JSON.stringify({
    model: 'stt-general-online',
    label: 'This is example streaming session',
  }));
})().catch((err) => {
  // Report setup failures instead of leaving the promise unhandled
  console.error(err);
  process.exitCode = 1;
});

Info

See Configuration for more information on the message.

Sending Audio Data

The audio data is sent as bytes. The audio header is expected to be sent as the initial chunk. For reference, the snippet below demonstrates how to send audio data read from a file.

Attention

The server expects an empty bytes message to be sent as the last message. This signifies the end of the audio stream.

Python / Node.js

import asyncio
import json

import websockets


async def main():
    """Open an authenticated connection, configure the session, then send an audio file."""
    endpoint = "wss://s-api.prosa.ai/v2/speech/stt"
    audio_path = "audio_file.mp3"
    api_key = "..."

    # Configuration message, sent before any audio data.
    session_config = json.dumps({
        "model": "stt-general-online",
        "label": "This is example streaming session",
    })

    # The API key travels in the "x-api-key" HTTP header of the connection request.
    async with websockets.connect(endpoint, extra_headers={"x-api-key": api_key}) as ws:
        await ws.send(session_config)

        # Send audio data via websocket
        await send_audio(audio_path, ws)


async def send_audio(filename: str, ws, chunk_size: int = 16000):
    """Send the file at ``filename`` over ``ws`` in pieces of at most ``chunk_size`` bytes.

    After the last piece, an empty bytes message is sent to signify the end of
    the audio stream.
    """
    with open(filename, "rb") as audio:
        # iter() with a sentinel keeps calling read() until it returns b"" (end of file).
        for piece in iter(lambda: audio.read(chunk_size), b""):
            await ws.send(piece)
        await ws.send(b"")


# Run main() on a new event loop only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

const WebSocket = require('ws');
const fs = require('fs');

/**
 * Audio sending example: connect, authenticate, configure the session, then
 * send an audio file followed by the end-of-stream marker.
 */
(async () => {
  // Setup
  const url = 'wss://s-api.prosa.ai/v2/speech/stt';
  const apiKey = '...';
  const filename = 'audio_file.mp3';

  const client = new WebSocket(url);

  // Wait for the client to connect. A failure before the connection opens
  // rejects the promise so it reaches the catch handler at the bottom.
  await new Promise((resolve, reject) => {
    client.once('error', reject);
    client.once('open', () => {
      client.off('error', reject);
      resolve();
    });
  });

  // Authenticate via websocket message
  client.send(JSON.stringify({
    token: apiKey,
  }));

  // Configure the session
  client.send(JSON.stringify({
    model: 'stt-general-online',
    label: 'This is example streaming session',
  }));

  // Send audio data via websocket
  const stream = fs.createReadStream(filename);

  stream.on('readable', () => {
    // Read file to send
    let chunk;
    while ((chunk = stream.read(16000)) !== null) {
      client.send(chunk, { binary: true });
    }
  });

  // 'end' fires only once the whole file has been read.
  stream.on('end', () => {
    // An empty binary message signifies the end of the audio stream
    client.send(Buffer.alloc(0), { binary: true });
  });
})().catch((err) => {
  // Report setup failures instead of leaving the promise unhandled
  console.error(err);
  process.exitCode = 1;
});

Info

Halting the data transfer allows for pausing of the transcription process.

Receiving Messages

The server sends several kinds of messages to the client. The client is expected to send and receive messages concurrently.

Info

See Subscribe Operation for more information on the messages that are sent by the server.

Python / Node.js

import asyncio
import json

import websockets


async def main():
    """Run one streaming session: connect, configure it, then stream audio while reading results."""
    endpoint = "wss://s-api.prosa.ai/v2/speech/stt"
    audio_path = "audio_file.mp3"
    api_key = "..."

    # The session configuration is the first message sent on this connection.
    session_config = json.dumps({
        "label": "Streaming Test",
        "model": "stt-general-online",
        "include_partial": False,
    })

    # The API key travels in the "x-api-key" HTTP header of the connection request.
    async with websockets.connect(endpoint, extra_headers={"x-api-key": api_key}) as ws:
        await ws.send(session_config)

        # Sending audio and receiving results run concurrently on the same connection.
        sender = send_audio(audio_path, ws)
        receiver = receive_message(ws)
        await asyncio.gather(sender, receiver)


async def send_audio(filename: str, ws, chunk_size: int = 16000):
    """Send the file at ``filename`` over ``ws`` in pieces of at most ``chunk_size`` bytes.

    After the last piece, an empty bytes message is sent to signify the end of
    the audio stream.
    """
    with open(filename, "rb") as audio:
        # iter() with a sentinel keeps calling read() until it returns b"" (end of file).
        for piece in iter(lambda: audio.read(chunk_size), b""):
            await ws.send(piece)
        await ws.send(b"")


async def receive_message(ws):
    """Read server messages from ``ws`` until the connection is closed.

    Each message is decoded from JSON, handled according to its "type"
    attribute, and then printed.
    """
    # Iterating over the connection ends the loop once the server closes it
    # normally. A bare ``while True`` around ``recv()`` would instead raise on
    # the closed connection and abort the surrounding ``asyncio.gather``.
    async for raw_message in ws:
        data = json.loads(raw_message)

        # Identify message type
        message_type = data["type"]

        if message_type == "result":
            transcript = data["transcript"]
            # Process final transcript
        elif message_type == "partial":
            transcript = data["transcript"]
            # Process partial transcript
        print(data)

# Run main() on a new event loop only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

const WebSocket = require('ws');
const fs = require('fs');

/**
 * Send-and-receive example: connect, authenticate, configure the session,
 * send an audio file and handle the messages sent back by the server.
 */
(async () => {
  // Setup
  const url = 'wss://s-api.prosa.ai/v2/speech/stt';
  const apiKey = '...';
  const filename = 'audio_file.mp3';

  const client = new WebSocket(url);

  // Wait for the client to connect. A failure before the connection opens
  // rejects the promise so it reaches the catch handler at the bottom.
  await new Promise((resolve, reject) => {
    client.once('error', reject);
    client.once('open', () => {
      client.off('error', reject);
      resolve();
    });
  });

  // Authenticate via websocket message
  client.send(JSON.stringify({
    token: apiKey,
  }));

  // Configure the session
  client.send(JSON.stringify({
    model: 'stt-general-online',
    label: 'This is example streaming session',
  }));

  // Send audio data via websocket
  const stream = fs.createReadStream(filename);

  stream.on('readable', () => {
    // Read file to send
    let chunk;
    while ((chunk = stream.read(16000)) !== null) {
      client.send(chunk, { binary: true });
    }
  });

  // 'end' fires only once the whole file has been read.
  stream.on('end', () => {
    // An empty binary message signifies the end of the audio stream
    client.send(Buffer.alloc(0), { binary: true });
  });

  // Receive results via websocket
  client.on('message', (msg) => {
    const data = JSON.parse(msg);

    // The "type" attribute identifies the kind of message
    const messageType = data.type;

    if (messageType === 'result') {
      const transcript = data.transcript;
      // Process final transcript
    } else if (messageType === 'partial') {
      const transcript = data.transcript;
      // Process partial transcript
    }
  });
})().catch((err) => {
  // Report setup failures instead of leaving the promise unhandled
  console.error(err);
  process.exitCode = 1;
});

Receiving Results

The attribute type in every message identifies the type of the message.

There are 2 types of transcription result:

partial: the result of an ongoing speech segment
result: the result of a completed speech segment

Python / Node.js

async def receive_message(ws):
    """Read server messages from ``ws`` until the connection is closed.

    Each message is decoded from JSON and handled according to its "type"
    attribute: "result" for a completed speech segment, "partial" for an
    ongoing one.
    """
    # Iterating over the connection ends the loop once the server closes it
    # normally. A bare ``while True`` around ``recv()`` would instead raise on
    # the closed connection.
    async for raw_message in ws:
        data = json.loads(raw_message)

        # Identify message type
        message_type = data["type"]

        if message_type == "result":
            transcript = data["transcript"]
            # Process final transcript
        elif message_type == "partial":
            transcript = data["transcript"]
            # Process partial transcript

// Handle every message the server sends over the websocket
client.on('message', (msg) => {
  const data = JSON.parse(msg);

  // The "type" attribute identifies the kind of message
  switch (data.type) {
    case 'result': {
      const transcript = data.transcript;
      // Process final transcript
      break;
    }
    case 'partial': {
      const transcript = data.transcript;
      // Process partial transcript
      break;
    }
    default:
      break;
  }
});

Info

See PartialTranscript and FinalTranscript for more information on the messages.