
Encoding | Ogg (Vorbis) | WebM (Opus) | MPEG-4 (AAC) | PCM-16khz | PCM-8khz |
---|---|---|---|---|---|
Sample Rate | 8000* or 16000 Hz | 16000 Hz | 16000 Hz | 16000 Hz | 8000 Hz |
Sample Size | n/a | n/a | n/a | 16 bit | 16 bit |
Quality Setting | 0.5 | default | default | N/A | N/A |
Channels | 1 (mono) | 1 (mono) | 1 (mono) | 1 (mono) | 1 (mono) |
Byte Order | N/A | N/A | N/A | little-endian | little-endian |
Signed / Unsigned | signed | signed | signed | signed | signed |
HTTP Format Alias | ogg | webm | mp4 | pcm-16khz | pcm-8khz |
HTTP Content-Type | audio/ogg | audio/webm | audio/mp4 | audio/x-wav | audio/x-wav |
Maximum Upload File Size | 21,000,000 bytes | 21,000,000 bytes | 21,000,000 bytes | 21,000,000 bytes | 21,000,000 bytes |
Once you have an account, you must change your password before the account can be used for API calls.
//-----------------------> REAL-TIME STREAMING FROM MICROPHONE <---------------------------+
// +
// This C# program will receive input from a microphone, send to the dictation server, +
// receive the dictation, then display the dictation in the console window. +
// +
//-----------------------------------------------------------------------------------------+
using NAudio.Wave;
using Newtonsoft.Json;
using Newtonsoft.Json.Serialization;
using System;
using System.ComponentModel;
using System.Security.Authentication;
using System.Threading;
using WebSocketSharp;
namespace CSharpWebSocketDictationSample
{
//------------------------------------------------------------------------------------------
// Implements the nVoq WebSocket dictation API and also reads audio
// from the OS default microphone (via NAudio) to feed it.
//------------------------------------------------------------------------------------------
class CSharpWebSocketDictationSample
{
/// Program configuration
const string Username = "yourUsername";
const string Password = "yourPassword";
// To authenticate with an API key instead of a password, declare it here:
//const string apikey = "eyJ0eXA ... iOiJKV1QiLCJh";
const string ServiceUrl = "wss://test.nvoq.com/wsapi/v2/dictation/topics/general_medicine";
const int TimeoutMillis = 10000; // Use a timeout of 10 seconds for most events
/// Program state variables
// WebSocketSharp client socket connected to ServiceUrl.
readonly WebSocket _webSocket;
// NAudio capture source reading from the OS default microphone.
readonly WaveInEvent _waveSource;
// In a GUI we wouldn't need waitable event objects, rather we'd update the UI/state directly from async callbacks.
// But since this is a command-line tester, we'll set up a unique waitable event for every major step of the program.
readonly AutoResetEvent _signalConnected = new AutoResetEvent(false);
readonly AutoResetEvent _signalStartDictationResponseReceived = new AutoResetEvent(false);
readonly AutoResetEvent _signalRecordingStopped = new AutoResetEvent(false);
readonly AutoResetEvent _signalTextDoneReceived = new AutoResetEvent(false);
// Additional status recorded by the callbacks, inspected after an event fires.
volatile bool _timedOut = false; // Reused for all waits
volatile string _connectionResult;
volatile string _startDictationResult;
volatile int _countAudioBytesSent;
volatile int _lastLogAudioBytesSent;
//.....More code to follow
}
}
//create a new instance of our sample program object and
//call the class Main method...
// Application entry point: build the sample object and run its session loop.
static void Main(string[] args)
{
    new CSharpWebSocketDictationSample().Main();
}
// Constructor sets up a wav source (microphone audio)
// and then opens connection to web socket
// Wires up the microphone capture source (NAudio) and the WebSocket
// client, then restricts the allowed TLS protocol versions.
public CSharpWebSocketDictationSample()
{
    // Microphone capture: 16 kHz mono, delivered in 250 ms chunks.
    // A smaller buffer gives slightly more responsive text updates; a
    // larger one uses slightly less CPU and bandwidth (framing overhead).
    _waveSource = new WaveInEvent
    {
        BufferMilliseconds = 250,
        WaveFormat = new WaveFormat(16000, 1)
    };
    _waveSource.DataAvailable += _waveSource_DataAvailable;
    _waveSource.RecordingStopped += _waveSource_RecordingStopped;
    // Client WebSocket with every lifecycle callback registered.
    _webSocket = new WebSocket(ServiceUrl);
    _webSocket.OnClose += _webSocket_OnClose;
    _webSocket.OnError += _webSocket_OnError;
    _webSocket.OnMessage += _webSocket_OnMessage;
    _webSocket.OnOpen += _webSocket_OnOpen;
    //_webSocket.Log.Level = LogLevel.Trace; // Enable for verbose WebSocket library logging
    // TLS 1.0 ("Tls") is not supported; allow only TLS 1.1 and TLS 1.2.
    _webSocket.SslConfiguration.EnabledSslProtocols = (SslProtocols)(768 | 3072); // Tls11 | Tls12
}
//Class main method. This controls the flow of execution for the sample program.
private void Main()
{
LogMessage("URL: " + ServiceUrl);
LogMessage("Username: " + Username);
// Step 1: open the WebSocket; OnOpen (or an unclean OnClose) sets
// _connectionResult and releases _signalConnected.
Prompt("Press enter to connect WebSocket.");
LogMessage("Attempting to connect...");
_webSocket.ConnectAsync();
AwaitSignal(_signalConnected);
if (_connectionResult != "connected")
ProgramExit("WebSocket failed to connect.");
LogMessage("WebSocket connected.");
// Step 2: start a dictation job and wait for the server to accept it.
Prompt("Press enter to send STARTDICTATION message.");
SendStartDictationMessage();
LogMessage("Waiting for server to respond to STARTDICTATION.");
AwaitSignal(_signalStartDictationResponseReceived);
if (_timedOut)
ProgramExit("Timed out waiting for STARTDICTATION response from server.");
if (_startDictationResult != "accepted")
ProgramExit("STARTDICTATION not accepted: " + _startDictationResult);
LogMessage("Server accepted STARTDICTATION.");
// Step 3: stream microphone audio until the user presses enter again.
Prompt("Press enter to start recording and transmitting audio.");
StartAudioRecording();
LogMessage("Recording started. You should see transcription results start to arrive from the server via TEXT messages.");
Prompt("Press enter to stop recording audio and send AUDIODONE to server.");
_waveSource.StopRecording();
AwaitSignal(_signalRecordingStopped);
if (_timedOut)
ProgramExit("Timed out waiting for audio recording to stop.");
// Step 4: tell the server the audio is finished, then wait for the
// final TEXT message (TextDone) before exiting.
LogMessage("Recording finished. Sending AUDIODONE.");
SendAudioDoneToServer();
LogMessage("Waiting for DONE message from server.");
AwaitSignal(_signalTextDoneReceived);
if (_timedOut)
ProgramExit("Timed out waiting for DONE message from server.");
ProgramExit("Final transcription received.");
}
//method creates a message to start the dictation and sends
//it to the server over the open WebSocket
// Builds the STARTDICTATION request — credentials, audio format, and
// the desired text subscriptions — and sends it to the server as JSON.
private void SendStartDictationMessage()
{
    var format = new WebSocketDictationMessage.AudioFormat
    {
        Encoding = "pcm-16khz",
        SampleRate = 16000
    };
    var jobParams = new WebSocketDictationMessage.JobParams
    {
        Id = Username,
        Authorization = Password,
        // For API-key auth, set Apikey here instead of Authorization.
        AudioFormat = format,
        // {"STABLETEXT"} = stable text only, {"HYPOTHESISTEXT"} =
        // hypothesis text only, or list both to receive both kinds.
        ReturnSubscriptions = new[] { "STABLETEXT" }
    };
    var msg = new WebSocketDictationMessage { Method = "STARTDICTATION", Params = jobParams };
    SendJSONMessageToServer(msg);
}
//Utility method for sending JSON encoded messages over the open WebSocket
// Serializes a dictation message to camelCase JSON — omitting
// default-valued properties — and transmits it as a text frame.
private void SendJSONMessageToServer(WebSocketDictationMessage msg)
{
    var resolver = new DefaultContractResolver { NamingStrategy = new CamelCaseNamingStrategy() };
    var settings = new JsonSerializerSettings
    {
        DefaultValueHandling = DefaultValueHandling.Ignore,
        ContractResolver = resolver
    };
    SendTextMessageToServer(JsonConvert.SerializeObject(msg, Formatting.None, settings));
}
//Start the flow of audio from the sound card/microphone
// Begin capturing from the default microphone; NAudio will start raising
// DataAvailable events on its own callback thread.
private void StartAudioRecording()
{
_waveSource.StartRecording();
}
//When audio becomes available, send it over the socket to the server
// NAudio callback: forwards each captured audio buffer to the server as
// a binary frame. Runs on NAudio's capture thread.
private void _waveSource_DataAvailable(object sender, WaveInEventArgs e)
{
    // FIX: the original allocated and copied the buffer even when the
    // socket was closed; bail out first so dead connections cost nothing.
    if (!_webSocket.IsAlive || e.BytesRecorded == 0)
        return;
    // e.Buffer is reused by NAudio and may be larger than the recorded
    // byte count, so copy exactly the valid portion before sending.
    byte[] bytes = new byte[e.BytesRecorded];
    Array.Copy(e.Buffer, 0, bytes, 0, e.BytesRecorded);
    _webSocket.Send(bytes);
    _countAudioBytesSent += e.BytesRecorded;
    // Only log every N seconds of audio so as not to overwhelm the screen.
    const int loggingIntervalInSeconds = 2;
    int loggingIntervalInBytes = _waveSource.WaveFormat.AverageBytesPerSecond * loggingIntervalInSeconds;
    if (_countAudioBytesSent - _lastLogAudioBytesSent >= loggingIntervalInBytes)
    {
        _lastLogAudioBytesSent = _countAudioBytesSent;
        LogMessage("Number of audio bytes transmitted so far: " + _countAudioBytesSent);
    }
}
//when the audio is done, call this (in the Main() method above...)
// Tell the server that no more audio will follow for this dictation job.
private void SendAudioDoneToServer()
{
    SendJSONMessageToServer(new WebSocketDictationMessage { Method = "AUDIODONE" });
}
//websocket api calls this method when new data is available
// WebSocket callback: dispatches server-to-client messages. All expected
// server messages are JSON text frames; binary frames are a protocol error.
private void _webSocket_OnMessage(object sender, MessageEventArgs args)
{
    // FIX: check the frame type before touching args.Data — a binary
    // frame has no text payload to log or parse.
    if (args.IsBinary)
    {
        ProgramExit("Server unexpectedly sent us binary data.");
        return;
    }
    LogMessage("\n<- Server-to-Client Message: " + args.Data + "\n");
    // FIX: the original called the non-generic DeserializeObject overload,
    // which returns object and cannot be assigned to
    // WebSocketDictationMessage (compile error). Use the generic overload.
    WebSocketDictationMessage msg = JsonConvert.DeserializeObject<WebSocketDictationMessage>(args.Data);
    string method = msg.Method;
    if ("TEXT" == method)
    {
        // TEXT carries transcription updates; TextDone marks the final one.
        if (msg.Data.TextDone)
            _signalTextDoneReceived.Set();
        LogMessage("*** Press enter to stop recording");
    }
    else if ("STARTDICTATION" == method)
    {
        // Record the outcome, then release the thread waiting in Main().
        if (msg.Error != null)
        {
            _startDictationResult = "error." + msg.Error.Reason;
        }
        else
        {
            _startDictationResult = "accepted";
        }
        _signalStartDictationResponseReceived.Set();
    }
    else if (msg.Error != null)
    {
        ProgramExit("Received error from server. Reason: " + msg.Error.Reason + ", Message: " + msg.Error.Message);
    }
}
/*
* ----------------------> REAL-TIME STREAMING FROM MICROPHONE <----------------------
*
* This C# program will receive input from a microphone, send to the dictation server,
* receive the dictation, then display the dictation in the console window.
*
* Copyright (c) 2000-2021 nVoq Incorporated
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*-----------------------------------------------------------------------------------
*/
using NAudio.Wave;
using Newtonsoft.Json;
using Newtonsoft.Json.Serialization;
using System;
using System.ComponentModel;
using System.Security.Authentication;
using System.Threading;
using WebSocketSharp;
namespace CSharpWebSocketDictationSample
{
class CSharpWebSocketDictationSample
{
/// <summary> Program configuration </summary>
const string Username = "yourUsername";
const string Password = "yourPassword";
// If using an API key, declare it here instead of Password and update
// SendStartDictationMessage below as well:
// const string apikey = "eyJ0eXA ... iOiJKV1QiLCJh";
const string ServiceUrl = "wss://test.nvoq.com/wsapi/v2/dictation/topics/general_medicine";
const int TimeoutMillis = 10000; // Use a timeout of 10 seconds for most events
/// <summary> Program state variables </summary>
// WebSocketSharp client socket connected to ServiceUrl.
readonly WebSocket _webSocket;
// NAudio capture source reading from the OS default microphone.
readonly WaveInEvent _waveSource;
// In a GUI we wouldn't need waitable event objects, rather we'd update the UI/state directly from async callbacks.
// But since this is a command-line tester, we'll set up a unique waitable event for every major step of the program.
readonly AutoResetEvent _signalConnected = new AutoResetEvent(false);
readonly AutoResetEvent _signalStartDictationResponseReceived = new AutoResetEvent(false);
readonly AutoResetEvent _signalRecordingStopped = new AutoResetEvent(false);
readonly AutoResetEvent _signalTextDoneReceived = new AutoResetEvent(false);
// Additional status recorded by the callbacks, inspected after an event fires.
volatile bool _timedOut = false; // Reused for all waits
volatile string _connectionResult;
volatile string _startDictationResult;
volatile int _countAudioBytesSent;
volatile int _lastLogAudioBytesSent;
// Application entry point: construct the sample and drive its session.
static void Main(string[] args)
{
    new CSharpWebSocketDictationSample().Main();
}
// Sets up the microphone capture source (NAudio) and the WebSocket
// client, then restricts the allowed TLS protocol versions.
public CSharpWebSocketDictationSample()
{
    // Microphone capture: 16 kHz mono, delivered in 250 ms chunks.
    // A smaller buffer gives slightly more responsive text updates; a
    // larger one uses slightly less CPU and bandwidth (framing overhead).
    _waveSource = new WaveInEvent
    {
        BufferMilliseconds = 250,
        WaveFormat = new WaveFormat(16000, 1)
    };
    _waveSource.DataAvailable += _waveSource_DataAvailable;
    _waveSource.RecordingStopped += _waveSource_RecordingStopped;
    // Client WebSocket with every lifecycle callback registered.
    _webSocket = new WebSocket(ServiceUrl);
    _webSocket.OnClose += _webSocket_OnClose;
    _webSocket.OnError += _webSocket_OnError;
    _webSocket.OnMessage += _webSocket_OnMessage;
    _webSocket.OnOpen += _webSocket_OnOpen;
    //_webSocket.Log.Level = LogLevel.Trace; // Enable for verbose WebSocket library logging
    // TLS 1.0 ("Tls") may be dropped in the future; allow only TLS 1.1 and TLS 1.2.
    _webSocket.SslConfiguration.EnabledSslProtocols = (SslProtocols)(768 | 3072); // Tls11 | Tls12
}
// Drives the whole session step by step, pausing for the user between
// steps and exiting via ProgramExit on any failure or timeout.
private void Main()
{
LogMessage("URL: " + ServiceUrl);
LogMessage("Username: " + Username);
// Step 1: open the WebSocket; OnOpen (or an unclean OnClose) sets
// _connectionResult and releases _signalConnected.
Prompt("Press enter to connect WebSocket.");
LogMessage("Attempting to connect...");
_webSocket.ConnectAsync();
AwaitSignal(_signalConnected);
if (_connectionResult != "connected")
ProgramExit("WebSocket failed to connect.");
LogMessage("WebSocket connected.");
// Step 2: start a dictation job and wait for the server to accept it.
Prompt("Press enter to send STARTDICTATION message.");
SendStartDictationMessage();
LogMessage("Waiting for server to respond to STARTDICTATION.");
AwaitSignal(_signalStartDictationResponseReceived);
if (_timedOut)
ProgramExit("Timed out waiting for STARTDICTATION response from server.");
if (_startDictationResult != "accepted")
ProgramExit("STARTDICTATION not accepted: " + _startDictationResult);
LogMessage("Server accepted STARTDICTATION.");
// Step 3: stream microphone audio until the user presses enter again.
Prompt("Press enter to start recording and transmitting audio.");
StartAudioRecording();
LogMessage("Recording started. You should see transcription results start to arrive from the server via TEXT messages.");
Prompt("Press enter to stop recording audio and send AUDIODONE to server.");
_waveSource.StopRecording();
AwaitSignal(_signalRecordingStopped);
if (_timedOut)
ProgramExit("Timed out waiting for audio recording to stop.");
// Step 4: tell the server the audio is finished, then wait for the
// final TEXT message (TextDone) before exiting.
LogMessage("Recording finished. Sending AUDIODONE.");
SendAudioDoneToServer();
LogMessage("Waiting for DONE message from server.");
AwaitSignal(_signalTextDoneReceived);
if (_timedOut)
ProgramExit("Timed out waiting for DONE message from server.");
ProgramExit("Final transcription received.");
}
// Tell the server that no more audio will follow for this dictation job.
private void SendAudioDoneToServer()
{
    SendJSONMessageToServer(new WebSocketDictationMessage { Method = "AUDIODONE" });
}
// Begin capturing from the default microphone; NAudio will start raising
// DataAvailable events on its own callback thread.
private void StartAudioRecording()
{
_waveSource.StartRecording();
}
// Builds the STARTDICTATION request that opens a dictation job —
// credentials, audio format, and the desired text subscriptions —
// and sends it to the server as JSON.
private void SendStartDictationMessage()
{
    var format = new WebSocketDictationMessage.AudioFormat
    {
        // Supported encodings: "pcm-16khz" (WAVE), "ogg" (Ogg/Vorbis),
        // "webm" (WebM/Opus), "mp4" (MPEG-4/AAC).
        Encoding = "pcm-16khz",
        SampleRate = 16000
    };
    var jobParams = new WebSocketDictationMessage.JobParams
    {
        Id = Username,
        Authorization = Password,
        // For API-key auth, set Apikey here instead of Authorization.
        AudioFormat = format,
        // {"STABLETEXT"} = stable text only, {"HYPOTHESISTEXT"} =
        // hypothesis text only, or list both to receive both kinds.
        ReturnSubscriptions = new[] { "STABLETEXT" }
    };
    var msg = new WebSocketDictationMessage { Method = "STARTDICTATION", Params = jobParams };
    SendJSONMessageToServer(msg);
}
// Block the calling thread until the signal fires or TimeoutMillis
// elapses, recording the outcome in _timedOut for the caller to inspect.
private void AwaitSignal(AutoResetEvent signal)
{
    bool signaled = signal.WaitOne(TimeoutMillis);
    _timedOut = !signaled;
}
// Single logging hook for the sample: write one line to the console.
private void LogMessage(string message)
{
    Console.Out.WriteLine(message);
}
// Dump the reason and final program state, wait for the user to
// acknowledge, then terminate the process.
private void ProgramExit(string message)
{
    LogMessage("Exiting program because: " + message);
    LogMessage("Program State:");
    LogMessage($" _timeout={_timedOut}");
    LogMessage($" _connectionResult={_connectionResult}");
    LogMessage($" _startDictationResult={_startDictationResult}");
    Prompt("Press enter to exit program.");
    Environment.Exit(0);
}
// Serializes a dictation message to camelCase JSON — omitting
// default-valued properties — and transmits it as a text frame.
private void SendJSONMessageToServer(WebSocketDictationMessage msg)
{
    var resolver = new DefaultContractResolver { NamingStrategy = new CamelCaseNamingStrategy() };
    var settings = new JsonSerializerSettings
    {
        DefaultValueHandling = DefaultValueHandling.Ignore,
        ContractResolver = resolver
    };
    SendTextMessageToServer(JsonConvert.SerializeObject(msg, Formatting.None, settings));
}
// Show a prompt and block until the user presses enter.
private void Prompt(string promptMessage)
{
    Console.Out.WriteLine("\n*** " + promptMessage);
    Console.In.ReadLine();
}
// NAudio callback: forwards each captured audio buffer to the server as
// a binary frame. Runs on NAudio's capture thread.
private void _waveSource_DataAvailable(object sender, WaveInEventArgs e)
{
    // FIX: the original allocated and copied the buffer even when the
    // socket was closed; bail out first so dead connections cost nothing.
    if (!_webSocket.IsAlive || e.BytesRecorded == 0)
        return;
    // e.Buffer is reused by NAudio and may be larger than the recorded
    // byte count, so copy exactly the valid portion before sending.
    byte[] bytes = new byte[e.BytesRecorded];
    Array.Copy(e.Buffer, 0, bytes, 0, e.BytesRecorded);
    _webSocket.Send(bytes);
    _countAudioBytesSent += e.BytesRecorded;
    // Only log every N seconds of audio so as not to overwhelm the screen.
    const int loggingIntervalInSeconds = 2;
    int loggingIntervalInBytes = _waveSource.WaveFormat.AverageBytesPerSecond * loggingIntervalInSeconds;
    if (_countAudioBytesSent - _lastLogAudioBytesSent >= loggingIntervalInBytes)
    {
        _lastLogAudioBytesSent = _countAudioBytesSent;
        LogMessage("Number of audio bytes transmitted so far: " + _countAudioBytesSent);
    }
}
// NAudio callback: capture has fully stopped. Releases Main(), which
// waits on this signal before sending AUDIODONE to the server.
private void _waveSource_RecordingStopped(object sender, StoppedEventArgs e)
{
LogMessage("_waveSource_RecordingStopped");
_signalRecordingStopped.Set();
}
// WebSocket callback: the connection completed successfully.
private void _webSocket_OnOpen(object sender, EventArgs e)
{
    LogMessage("_webSocket_OnOpen");
    // FIX: record the result BEFORE signaling. The original set the event
    // first, so the main thread could wake up and read a stale
    // _connectionResult before this callback assigned "connected".
    // (OnClose already uses the correct assign-then-signal order.)
    _connectionResult = "connected";
    _signalConnected.Set();
}
// WebSocket callback: dispatches server-to-client messages. All expected
// server messages are JSON text frames; binary frames are a protocol error.
private void _webSocket_OnMessage(object sender, MessageEventArgs args)
{
    // FIX: check the frame type before touching args.Data — a binary
    // frame has no text payload to log or parse.
    if (args.IsBinary)
    {
        ProgramExit("Server unexpectedly sent us binary data.");
        return;
    }
    LogMessage("\n<- Server-to-Client Message: " + args.Data + "\n");
    WebSocketDictationMessage msg = JsonConvert.DeserializeObject<WebSocketDictationMessage>(args.Data);
    string method = msg.Method;
    if ("TEXT" == method)
    {
        // TEXT carries transcription updates; TextDone marks the final one.
        if (msg.Data.TextDone)
            _signalTextDoneReceived.Set();
        LogMessage("*** Press enter to stop recording");
    }
    else if ("STARTDICTATION" == method)
    {
        // Record the outcome, then release the thread waiting in Main().
        if (msg.Error != null)
        {
            _startDictationResult = "error." + msg.Error.Reason;
        }
        else
        {
            _startDictationResult = "accepted";
        }
        _signalStartDictationResponseReceived.Set();
    }
    else if (msg.Error != null)
    {
        ProgramExit("Received error from server. Reason: " + msg.Error.Reason + ", Message: " + msg.Error.Message);
    }
}
// WebSocket callback: any socket-level error is fatal for this sample.
private void _webSocket_OnError(object sender, WebSocketSharp.ErrorEventArgs e)
{
    string details = "_webSocket_OnError: \n" + e.Message + "\n" + e.Exception;
    ProgramExit(details);
}
// WebSocket callback: fired whenever the socket closes. A clean close is
// the normal shutdown path; an unclean close during the connect phase
// means the connect failed, so record the failure and release the main
// thread waiting on _signalConnected.
private void _webSocket_OnClose(object sender, CloseEventArgs e)
{
LogMessage("_webSocket_OnClose");
if (!e.WasClean)
{
_connectionResult = "closed/" + e.Code + "/" + e.Reason;
_signalConnected.Set();
}
}
// Transmit a text frame over the WebSocket, or log a failure when the
// socket is not connected.
private void SendTextMessageToServer(string textMessage)
{
    LogMessage("\n-> Client-to-Server Message: " + textMessage + "\n");
    if (!_webSocket.IsAlive)
    {
        LogMessage("FAILED to send text message because WebSocket is not connected.");
        return;
    }
    _webSocket.Send(textMessage);
}
/// <summary>
/// JSON payload exchanged with the dictation service in both directions.
/// Serialized with camelCase property names; default-valued properties
/// are omitted (see SendJSONMessageToServer).
/// </summary>
public class WebSocketDictationMessage
{
public string ApiVersion { get; set; } = "1.0";
// Message verb, e.g. "STARTDICTATION", "AUDIODONE", "TEXT".
public string Method { get; set; }
public string Id { get; set; }
// Client-to-server job parameters (sent with STARTDICTATION).
public JobParams Params { get; set; }
// Server-to-client payload (populated on TEXT messages).
public JobData Data { get; set; }
// Populated by the server when a request fails.
public MessageError Error { get; set; }
/// <summary>Audio encoding description sent with STARTDICTATION.</summary>
public class AudioFormat
{
public string Encoding { get; set; }
public int SampleRate { get; set; }
}
/// <summary>Error details attached to a failed server response.</summary>
public class MessageError
{
public string Reason { get; set; }
public string Message { get; set; }
}
// NOTE(review): presumably "select and say" dictation context —
// confirm field semantics against the nVoq API documentation.
public class SNSContext
{
public string DictationContextText { get; set; }
public int SelectionOffset { get; set; }
public int SelectionEndIndex { get; set; }
}
/// <summary>Parameters for a STARTDICTATION request.</summary>
public class JobParams
{
// Account username used to authenticate the job.
public string Id { get; set; }
// Account password. To use API-key auth instead, replace the
// Authorization property with the apikey property commented out below.
public string Authorization { get; set; }
// public string apikey { get; set; }
public string ExternalId { get; set; }
public string Metadata { get; set; }
public int MaxNBest { get; set; }
public AudioFormat AudioFormat { get; set; }
public string TimeStamp { get; set; }
public SNSContext SnsContext { get; set; }
[DefaultValue(true)]
public bool BuiltinSubstitutions { get; set; } = true;
public bool HandsFree { get; set; }
public string[] ReturnSubscriptions { get; set; }
}
/// <summary>Server data carried by TEXT responses (transcription updates).</summary>
public class JobData
{
public string Kind { get; set; }
public string Id { get; set; }
public string Updated { get; set; }
// True on the final TEXT message for the job.
public bool TextDone { get; set; }
public string Text { get; set; }
public string SubstitutedText { get; set; }
}
}
}
}