Speech-to-Text Rest API

Before using the Rest API, obtain an API key from Prosa Console.

Synchronous Request

A synchronous Speech-to-Text API request consists of a speech recognition configuration as well as audio data. The audio in each synchronous request is limited to 60 seconds. The wait parameter is set to true to indicate that it is a synchronous request.

Example request

`POST https://api.prosa.ai/v2/speech/stt`

{
    "config": {
      "model": "stt-general",
      "wait": true
    },
    "request": {
        "data": "<base64-encoded audio data>"
    }
}

Example result

{
    "data": [
        {
            "transcript": "hasil akhir dari pekerjaan ini cukup memuaskan",
            "final": true,
            "time_start": 0.0,
            "time_end": 3.6
        }
    ]
}

Info

See Submit an ASR Request for more information on the request.

Here are some code examples to help you get started quickly.

PythonNode.js

import base64
import time
from typing import Optional

import requests

url = "https://api.prosa.ai/v2/speech/stt"
api_key = "..."


def main():
    """Transcribe the sample audio file synchronously and print what comes back."""
    print(stt("audio_file.mp3"))


def stt(filename: str) -> dict:
    """Submit a synchronous request and return the transcription data.

    Args:
        filename: Path of the audio file to transcribe.

    Returns:
        The value stored under ``result -> data`` of the response when the
        job status is ``"complete"``; ``None`` when the job did not finish
        within the synchronous timeframe.
    """
    job = submit_stt_request(filename)

    if job["status"] == "complete":
        # Return the transcription itself (result -> data), matching the
        # Node.js example and the "Receiving response" walkthrough.
        return job["result"]["data"]

    # Job was not completed within the timeframe: it continues as an
    # asynchronous job and has to be retrieved later with job["job_id"].
    return None


def submit_stt_request(filename: str) -> dict:
    """Send the audio file to the STT endpoint as a blocking (wait=True) request.

    The file is read as bytes, base64-encoded and posted as JSON with the API
    key in the ``x-api-key`` header. The decoded JSON response is returned.
    """
    with open(filename, "rb") as audio_file:
        raw_audio = audio_file.read()
    encoded_audio = base64.b64encode(raw_audio).decode("utf-8")

    # wait=True blocks the request until the execution is finished
    config = {"model": "stt-general", "wait": True}
    body = {"config": config, "request": {"data": encoded_audio}}
    auth_headers = {"x-api-key": api_key}

    return requests.post(url, json=body, headers=auth_headers).json()


if __name__ == '__main__':
    main()

const https = require('https');
const fs = require('fs');

// Setup
const url = 'https://api.prosa.ai/v2/speech/stt';
const apiKey = '...';

// Entry point: transcribe the sample file and print the result.
(async () => {
  const filename = 'audio_file.wav';
  let res = await stt(filename);

  console.log(res)

})().catch((err) => {
  // request() rejects on non-200 responses and network errors; report the
  // failure instead of leaving an unhandled promise rejection.
  console.error('Speech-to-text request failed:', err);
  process.exitCode = 1;
});

// Submits the file as a synchronous request and resolves to the data under
// result -> data when the job is complete, or to undefined otherwise.
async function stt(filename) {
  const job = await submitSttRequest(filename);

  if (job.status !== "complete") {
    // Job was not completed within the timeframe
    return undefined;
  }

  return job.result.data;
}

// Reads the audio file, base64-encodes it and POSTs it as a blocking request.
async function submitSttRequest(filename) {
  const b64audioData = fs.readFileSync(filename).toString('base64');

  const config = {
    "model": "stt-general",
    "wait": true  // Blocks the request until the execution is finished
  };
  const headers = {"x-api-key": apiKey};

  return request(url, "POST", {
    json: {"config": config, "request": {"data": b64audioData}},
    headers: headers
  });
}

// Simple promise wrapper for built-in https module.
//
// Resolves with the parsed JSON body on HTTP 200. Rejects with the status
// code on any other status, and with the underlying error on network
// failures or an unparseable body.
function request(url, method, {headers = null, json = null} = {}) {
  return new Promise((resolve, reject) => {
    const req = https.request(url, {
      method: method,
      headers: {
        "Accept": "application/json",
        "Content-Type": "application/json; charset=UTF-8",
        ...headers
      }
    }, (res) => {
      if (res.statusCode !== 200) {
        // Drain the unread body so the socket is released.
        res.resume();
        reject(res.statusCode);
        return;
      }

      // Decode as UTF-8 at the stream level so multi-byte characters that
      // straddle a chunk boundary are not corrupted by string concatenation.
      res.setEncoding('utf8');

      let data = "";
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        // A throw inside this callback would escape the promise; turn a
        // malformed body into a rejection instead.
        try {
          resolve(JSON.parse(data));
        } catch (err) {
          reject(err);
        }
      });
    });

    req.on('error', reject);

    if (json != null) {
      req.write(JSON.stringify(json));
    }
    req.end();
  });
}

Note

The Node.js example contains a simple promise wrapper for built-in https module.

Warning

If the job could not be completed within a specified timeframe, it is treated as an Asynchronous Request instead. See Retrieving Result on how to retrieve the result of asynchronous requests.

Important

Requests are limited to 10 MB for each request. If you need to transcribe larger audio, consider using external storage. See Alternative Audio Source.

Configure Request

Configure the model to use. In this example, the model being used is stt-general.

PythonNode.js

def submit_stt_request(filename: str) -> dict:
    """Excerpt: build only the ``config`` section of the request payload.

    Nothing is sent or returned at this step; the audio data and the HTTP
    call are not part of this excerpt.
    """
    payload = {
        "config": {
            "model": "stt-general",
            "wait": True  # Blocks the request until the execution is finished
        }
    }

// Excerpt: builds only the "config" section of the request payload.
// Nothing is sent or returned at this step.
async function submitSttRequest(filename) {
  const payload = {
    "config": {
      "model": "stt-general",
      "wait": true  // Blocks the request until the execution is finished
    }
  }

}

Sending audio data

Read audio data from any source. In this example, the audio is read from the filesystem. The audio is then encoded as base64 string as part of the request payload.

PythonNode.js

def submit_stt_request(filename: str) -> dict:
    """Excerpt: read the audio file and place it, base64-encoded, in the payload.

    The payload is only constructed here; this excerpt does not send it.
    """
    # Read the file as raw bytes, then base64-encode and decode to a str so
    # it can be embedded in a JSON body.
    with open(filename, "rb") as f:
        b64audio_data = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "config": {
            "model": "stt-general",
            "wait": True  # Blocks the request until the execution is finished
        },
        "request": {
            "data": b64audio_data
        }
    }

const url = 'https://api.prosa.ai/v2/speech/stt';

// Excerpt: reads the audio file and places it, base64-encoded, in the payload.
// The payload is only constructed here; this excerpt does not send it.
async function submitSttRequest(filename) {
  // Read the whole file into a Buffer, then encode it as a base64 string.
  const audioData = fs.readFileSync(filename)
  const b64audioData = audioData.toString('base64');

  const payload = {
    "config": {
      "model": "stt-general",
      "wait": true  // Blocks the request until the execution is finished
    },
    "request": {
      "data": b64audioData
    }
  }
}

Sending the request

Authenticate the request by including API Key in the HTTP request header.

PythonNode.js

url = "https://api.prosa.ai/v2/speech/stt"

def submit_stt_request(filename: str) -> dict:
    """Post the base64-encoded audio file as a blocking request.

    The API key is sent in the ``x-api-key`` header and the decoded JSON
    response body is returned.
    """
    with open(filename, "rb") as f:
        b64audio_data = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "config": {
            "model": "stt-general",
            "wait": True  # Blocks the request until the execution is finished
        },
        "request": {
            "data": b64audio_data
        }
    }

    # Authenticate by passing the API key in the request header.
    response = requests.post(url, json=payload, headers={
        "x-api-key": api_key
    })

    return response.json()

const url = 'https://api.prosa.ai/v2/speech/stt';

// Posts the base64-encoded audio file as a blocking request and resolves to
// whatever request() resolves to.
async function submitSttRequest(filename) {
  const audioData = fs.readFileSync(filename)
  const b64audioData = audioData.toString('base64');

  const payload = {
    "config": {
      "model": "stt-general",
      "wait": true  // Blocks the request until the execution is finished
    },
    "request": {
      "data": b64audioData
    }
  }

  // Authenticate by passing the API key in the request header.
  return await request(url, "POST", {
    json: payload,
    headers: {
      "x-api-key": apiKey
    }
  });
}

The result can be retrieved under the key result.

Info

The payload of the response is an AsrResponse object. See Submit an ASR Request for more information on the request.

Receiving response

For synchronous requests, the transcription is returned directly under the object result->data as a list of transcript segments (plain text, not base64-encoded).

If the job could not be completed within a specified timeframe, it is treated as an Asynchronous Request instead. In that case, you need to poll and retrieve the result using job_id. See Retrieving Result on how to retrieve the result of asynchronous requests.

PythonNode.js

def stt(filename: str) -> dict:
    """Submit a synchronous request and return ``result -> data`` on completion.

    When the status is not ``"complete"`` the job id is read from the
    response; this excerpt stops there and returns nothing in that case.
    """
    job = submit_stt_request(filename)

    if job["status"] == "complete":
        return job["result"]["data"]

    # Job was not completed within the timeframe
    job_id = job["job_id"]  # Retrieve with job_id instead

// Submits a synchronous request and resolves to result -> data on completion.
// When the status is not "complete" the job id is read from the response;
// this excerpt stops there and resolves to undefined in that case.
async function stt(filename) {
  let job = await submitSttRequest(filename);

  if (job["status"] === "complete") {
    return job["result"]["data"];
  }
  // Job was not completed within the timeframe
  let jobId = job["job_id"]  // Retrieve with job_id instead
}

Info

See AsrResponse for more information regarding the response.

Asynchronous Request

An asynchronous Speech-to-Text API request is fairly similar to a synchronous Speech-to-Text API request. However, instead of immediately returning the result, the request will initiate a Long Running Operation and return a response without a result. Each asynchronous request can process up to 4 hours of audio data.

Here are some code examples to help you get started quickly.

PythonNode.js

import base64
import time
from typing import Optional

import requests

url = "https://api.prosa.ai/v2/speech/stt"
api_key = "..."


def main():
    """Submit the sample file asynchronously, then poll until a result is available."""
    job_id = submit_stt_request("audio_file.mp3")["job_id"]

    poll_interval = 5.0

    # Query first, sleep only between unsuccessful attempts.
    result = query_stt_result(job_id)
    while result is None:
        time.sleep(poll_interval)
        result = query_stt_result(job_id)

    print(result)


def submit_stt_request(filename: str) -> dict:
    """Send the audio file to the STT endpoint as a non-blocking (wait=False) request.

    The file is read as bytes, base64-encoded and posted as JSON with the API
    key in the ``x-api-key`` header. The decoded JSON response is returned;
    main() reads its ``job_id`` to poll for the result.
    """
    with open(filename, "rb") as audio_file:
        raw_audio = audio_file.read()
    encoded_audio = base64.b64encode(raw_audio).decode("utf-8")

    # wait=False: do not wait for the request to complete
    config = {"model": "stt-general", "wait": False}
    body = {"config": config, "request": {"data": encoded_audio}}
    auth_headers = {"x-api-key": api_key}

    return requests.post(url, json=body, headers=auth_headers).json()


def query_stt_result(job_id: str) -> Optional[dict]:
    """Fetch the job once and return its transcription data if it is ready.

    Args:
        job_id: Identifier returned when the request was submitted.

    Returns:
        The value under ``result -> data`` when the job status is
        ``"complete"``; ``None`` when the job is not complete yet or the
        HTTP status is not 200.

    Raises:
        RuntimeError: If the job reports a ``"failed"`` status.
    """
    response = requests.get(url + "/" + job_id, headers={
        "x-api-key": api_key
    })

    if response.status_code != 200:
        return None

    job = response.json()
    status = job["status"]

    if status == "failed":
        # A failed job never becomes "complete"; returning None here would
        # make a polling caller loop forever.
        # NOTE(review): "failed" is assumed to be the API's terminal error
        # status - confirm against the job status reference.
        raise RuntimeError(f"Speech-to-text job {job_id} failed")

    if status == "complete":
        return job["result"]["data"]

    return None


if __name__ == '__main__':
    main()

const https = require('https');
const fs = require('fs');

// Setup
const url = 'https://api.prosa.ai/v2/speech/stt';
const apiKey = '...';

// Entry point: submit the sample file, then poll every 5 seconds until the
// result is available.
(async () => {
  const filename = 'audio_file.wav';

  let job = await submitSttRequest(filename);

  const jobId = job["job_id"];

  const pollInterval = 5.0 * 1000;  // milliseconds

  while (true) {
    let result = await querySttResult(jobId);

    if (result != null) {
      console.log(result);
      break;
    }

    await new Promise((resolve) => {
      setTimeout(resolve, pollInterval);
    });
  }

})().catch((err) => {
  // request() rejects on non-200 responses and network errors; report the
  // failure instead of leaving an unhandled promise rejection.
  console.error('Speech-to-text request failed:', err);
  process.exitCode = 1;
});

// Reads the audio file, base64-encodes it and POSTs it as a non-blocking request.
async function submitSttRequest(filename) {
  const b64audioData = fs.readFileSync(filename).toString('base64');

  const config = {
    "model": "stt-general",
    "wait": false  // Do not wait for the request to complete
  };
  const headers = {"x-api-key": apiKey};

  return request(url, "POST", {
    json: {"config": config, "request": {"data": b64audioData}},
    headers: headers
  });
}

// Fetches the job once. Resolves to result -> data when the job status is
// "complete" and to null when it is not complete yet. Throws when the job
// reports a "failed" status.
async function querySttResult(jobId) {
  const res = await request(url + "/" + jobId, "GET", {
    headers: {
      "x-api-key": apiKey
    }
  });

  if (res["status"] === "failed") {
    // A failed job never becomes "complete"; returning null here would make
    // a polling caller loop forever.
    // NOTE(review): "failed" is assumed to be the API's terminal error
    // status - confirm against the job status reference.
    throw new Error(`Speech-to-text job ${jobId} failed`);
  }

  if (res["status"] === "complete") {
    return res["result"]["data"];
  }

  return null;
}

// Simple promise wrapper for built-in https module.
//
// Resolves with the parsed JSON body on HTTP 200. Rejects with the status
// code on any other status, and with the underlying error on network
// failures or an unparseable body.
function request(url, method, {headers = null, json = null} = {}) {
  return new Promise((resolve, reject) => {
    const req = https.request(url, {
      method: method,
      headers: {
        "Accept": "application/json",
        "Content-Type": "application/json; charset=UTF-8",
        ...headers
      }
    }, (res) => {
      if (res.statusCode !== 200) {
        // Drain the unread body so the socket is released.
        res.resume();
        reject(res.statusCode);
        return;
      }

      // Decode as UTF-8 at the stream level so multi-byte characters that
      // straddle a chunk boundary are not corrupted by string concatenation.
      res.setEncoding('utf8');

      let data = "";
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        // A throw inside this callback would escape the promise; turn a
        // malformed body into a rejection instead.
        try {
          resolve(JSON.parse(data));
        } catch (err) {
          reject(err);
        }
      });
    });

    req.on('error', reject);

    if (json != null) {
      req.write(JSON.stringify(json));
    }
    req.end();
  });
}

Note

The Node.js example contains a simple promise wrapper for built-in https module.

Info

See Submit an ASR Request for more information on the request.

Important

Requests are limited to 10 MB for each request. If you need to transcribe larger audio, consider using external storage. See Alternative Audio Source.

Submitting request

The request is fairly similar to synchronous request except the wait parameter is set to false to indicate that this is an asynchronous request.

PythonNode.js

url = "https://api.prosa.ai/v2/speech/stt"

def submit_stt_request(filename: str) -> dict:
    """Post the base64-encoded audio file as a non-blocking request.

    The API key is sent in the ``x-api-key`` header and the decoded JSON
    response body is returned.
    """
    with open(filename, "rb") as f:
        b64audio_data = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "config": {
            "model": "stt-general",
            "wait": False  # Do not wait for the request to complete
        },
        "request": {
            "data": b64audio_data
        }
    }

    response = requests.post(url, json=payload, headers={
        "x-api-key": api_key
    })


    return response.json()

Note

Note that the value returned is an AsrResponse object whose job_id property will be used to retrieve the result.

const url = 'https://api.prosa.ai/v2/speech/stt';

// Posts the base64-encoded audio file as a non-blocking request and resolves
// to whatever request() resolves to.
async function submitSttRequest(filename) {
  const audioData = fs.readFileSync(filename)
  const b64audioData = audioData.toString('base64');

  const payload = {
    "config": {
      "model": "stt-general",
      "wait": false  // Do not wait for the request to complete
    },
    "request": {
      "data": b64audioData
    }
  }

  return await request(url, "POST", {
    json: payload,
    headers: {
      "x-api-key": apiKey
    }
  });
}

Retrieving result

Using the job_id from the AsrResponse object we previously received when submitting the request, we can retrieve the status and the result of the request. The status describes the progress of the STT request. We check to see if the status is complete before returning the result.

PythonNode.js

url = "https://api.prosa.ai/v2/speech/stt"

def main():
    """Excerpt: submit the audio file, then query the job once by its job_id.

    The query result is only stored in a local variable here; this excerpt
    does not poll or print it.
    """
    filename = "audio_file.mp3"

    job = submit_stt_request(filename)

    job_id = job["job_id"]

    result = query_stt_result(job_id)

def query_stt_result(job_id: str) -> Optional[dict]:
    """Fetch the job once and return ``result -> data`` if it is complete.

    Returns ``None`` when the HTTP status is not 200 or the job status is
    anything other than ``"complete"``.
    """
    response = requests.get(url + "/" + job_id, headers={
        "x-api-key": api_key
    })

    if response.status_code == 200:
        job = response.json()

        status = job["status"]

        # NOTE(review): every non-"complete" status falls through to None,
        # so a caller that polls on None should bound its retries.
        if status == "complete":
            result = job["result"]["data"]

            return result

    return None

const url = 'https://api.prosa.ai/v2/speech/stt';

// Excerpt: submits the audio file, then queries the job once by its job_id.
// The query result is only stored in a local variable here; this excerpt
// does not poll or print it.
(async () => {
  const filename = 'audio_file.wav';

  let job = await submitSttRequest(filename);

  const jobId = job["job_id"];

  let result = await querySttResult(jobId);
})();

// Fetches the job once and resolves to result -> data if its status is
// "complete", or to null for any other status.
async function querySttResult(jobId) {
  let res = await request(url + "/" + jobId, "GET", {
    headers: {
      "x-api-key": apiKey
    }
  });
  // NOTE(review): every non-"complete" status resolves to null, so a caller
  // that polls on null should bound its retries.
  if (res["status"] === "complete") {
    return res["result"]["data"]
  }

  return null;
}

Info

See Retrieve an ASR Job for more information on the request.