Transcribe streaming audio from a microphone in C#
Learn how to transcribe streaming audio in C#.
Overview
By the end of this tutorial, you’ll be able to transcribe audio from your microphone in C#.
Supported languages
Streaming Speech-to-Text is only available for English.
Before you begin
To complete this tutorial, you need:
- .NET 9 (earlier versions also work with minor adjustments)
- An AssemblyAI account with a credit card set up.
Here’s the full sample code for what you’ll build in this tutorial:
Step 1: Setting up your project
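The tutorial doesn't prescribe exact commands for this step, but a minimal setup might look like the following. The project name `TranscribeStreaming` is just an example, and you'll also need SoX installed for the microphone capture in Step 3:

```shell
# Create a new .NET console project (any recent SDK works with minor adjustments)
dotnet new console -o TranscribeStreaming
cd TranscribeStreaming

# Install SoX for microphone capture (pick the command for your OS)
# macOS:  brew install sox
# Debian/Ubuntu:  sudo apt-get install sox
```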
Step 2: Define the program structure and assign your API key
Browse to Account, and then click the text under Your API key to copy it.
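A minimal program skeleton might look like the sketch below. The exact structure (a `Program` class holding the API key as a constant, with the streaming methods filled in during later steps) is an assumption based on how this tutorial is organized; replace `<YOUR_API_KEY>` with the key you copied:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;

class Program
{
    // Paste the API key you copied from your account page here.
    private const string ApiKey = "<YOUR_API_KEY>";

    static async Task Main()
    {
        Console.WriteLine("Starting streaming transcription. Press Ctrl+C to stop.");

        // Cancel the whole pipeline with Ctrl+C.
        using var cts = new CancellationTokenSource();
        Console.CancelKeyPress += (_, e) => { e.Cancel = true; cts.Cancel(); };

        // Implemented in Step 4.
        await ConnectAndTranscribe(cts.Token);
    }

    // Placeholder; replaced by the real implementation in Step 4.
    static Task ConnectAndTranscribe(CancellationToken ct) => Task.CompletedTask;
}
```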
Step 3: Record audio from the microphone
In this step, you’ll set up microphone recording using SoX, a cross-platform command-line audio utility.
Create a CaptureAndSendAudioAsync method to handle the microphone recording and audio streaming:
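A sketch of this method: it starts SoX as a child process, reads raw PCM audio from its standard output, and forwards each chunk over the already-open WebSocket. The SoX arguments match the audio format described below; whether the service expects raw binary frames or a JSON wrapper depends on the API version, so treat the send call as an assumption:

```csharp
using System;
using System.Diagnostics;
using System.Net.WebSockets;
using System.Threading;
using System.Threading.Tasks;

static async Task CaptureAndSendAudioAsync(ClientWebSocket ws, CancellationToken ct)
{
    // -d reads from the default input device; the remaining arguments
    // output raw 16-bit signed-integer PCM, 16 kHz, mono to stdout ("-").
    var sox = Process.Start(new ProcessStartInfo
    {
        FileName = "sox",
        Arguments = "-d -t raw -b 16 -e signed-integer -r 16000 -c 1 -",
        RedirectStandardOutput = true,
        UseShellExecute = false,
    })!;

    var buffer = new byte[4096];
    int read;
    while (!ct.IsCancellationRequested &&
           (read = await sox.StandardOutput.BaseStream
               .ReadAsync(buffer, 0, buffer.Length, ct)) > 0)
    {
        // Forward each audio chunk to the streaming service.
        await ws.SendAsync(new ArraySegment<byte>(buffer, 0, read),
            WebSocketMessageType.Binary, endOfMessage: true, ct);
    }
    sox.Kill();
}
```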
Audio data format
The SoX arguments configure the format of the audio output. The arguments configure the format to a single channel with 16-bit signed integer PCM encoding and 16 kHz sample rate.
If you want to stream data from elsewhere, make sure that your audio data is in the following format:
- Single channel
- 16-bit signed integer PCM or mu-law encoding
By default, the Streaming STT service expects PCM16-encoded audio. If you want to use mu-law encoding, see Specifying the encoding.
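To verify locally that SoX produces audio in this format, you can run it directly from a terminal. This invocation is only an illustration, not part of the program:

```shell
# Record 3 seconds of raw audio: mono, 16-bit signed-integer PCM, 16 kHz
sox -d -t raw -b 16 -e signed-integer -r 16000 -c 1 test.raw trim 0 3
```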
Step 4: Set up the WebSocket connection to the Streaming service
Streaming Speech-to-Text uses WebSockets to stream audio to AssemblyAI. This requires first establishing a connection to the API.
Create a ConnectAndTranscribe method to handle the connection and transcription:
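A sketch of the connection logic. The endpoint URL, query parameter, and authorization header shown here are assumptions based on common WebSocket-API conventions; check the Streaming API reference for the current values. `CaptureAndSendAudioAsync` (Step 3) and `ProcessMessage` (defined next) are this tutorial's other methods:

```csharp
using System;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

static async Task ConnectAndTranscribe(CancellationToken ct)
{
    // From Step 2.
    const string ApiKey = "<YOUR_API_KEY>";
    const int SampleRate = 16_000;

    using var ws = new ClientWebSocket();
    // Authenticate by sending the API key in a request header.
    ws.Options.SetRequestHeader("Authorization", ApiKey);

    // Endpoint URL and query parameters are assumptions; consult the
    // Streaming API reference for the current values.
    await ws.ConnectAsync(
        new Uri($"wss://streaming.assemblyai.com/v3/ws?sample_rate={SampleRate}"), ct);

    // Stream microphone audio (Step 3) while receiving transcripts.
    var sendTask = CaptureAndSendAudioAsync(ws, ct);

    var buffer = new byte[8192];
    while (ws.State == WebSocketState.Open && !ct.IsCancellationRequested)
    {
        var result = await ws.ReceiveAsync(new ArraySegment<byte>(buffer), ct);
        if (result.MessageType == WebSocketMessageType.Close) break;
        // For simplicity this sketch assumes each message fits in one frame.
        ProcessMessage(Encoding.UTF8.GetString(buffer, 0, result.Count));
    }
    await sendTask;
}
```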
Sample rate
The sampleRate is the number of audio samples per second, measured in hertz (Hz). Higher sample rates produce higher-quality audio, which may lead to better transcripts, but also mean more data being sent over the network.
We recommend the following sample rates:
- Minimum quality: 8_000 (8 kHz)
- Medium quality: 16_000 (16 kHz)
- Maximum quality: 48_000 (48 kHz)
If you don’t set a sample rate on the real-time transcriber, it defaults to 16 kHz.
Create a ProcessMessage method to handle the message processing:
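A sketch of the message handling using System.Text.Json. The field names (`type`, `transcript`) are assumptions about the message schema, which varies between API versions; check the Streaming API reference for the exact shape:

```csharp
using System;
using System.Text.Json;

static void ProcessMessage(string json)
{
    using var doc = JsonDocument.Parse(json);
    var root = doc.RootElement;

    // Field names are assumptions; check the Streaming API reference.
    if (!root.TryGetProperty("type", out var typeProp)) return;
    var type = typeProp.GetString();

    var text = root.TryGetProperty("transcript", out var t)
        ? t.GetString()
        : "";

    if (!string.IsNullOrEmpty(text))
    {
        Console.WriteLine($"{type}: {text}");
    }
}
```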
The real-time transcriber returns two types of transcripts: partial and final.
- Partial transcripts are returned as the audio is being streamed to AssemblyAI.
- Final transcripts are returned when the service detects a pause in speech.
End of utterance controls
You can configure the silence threshold for automatic utterance detection and programmatically force the end of an utterance to immediately get a Final transcript.
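Forcing the end of an utterance might look like the following hypothetical helper. The message name ("ForceEndpoint") and shape are assumptions; check the Streaming API reference for the exact format and for the silence-threshold parameters:

```csharp
using System;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

// Hypothetical sketch: ask the service to end the current utterance
// immediately so it returns a final transcript without waiting for
// a pause in speech.
static async Task ForceEndOfUtteranceAsync(ClientWebSocket ws, CancellationToken ct)
{
    var msg = Encoding.UTF8.GetBytes("{\"type\":\"ForceEndpoint\"}");
    await ws.SendAsync(new ArraySegment<byte>(msg),
        WebSocketMessageType.Text, endOfMessage: true, ct);
}
```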
Step 5: Disconnect the streaming service
In this step, you’ll set up the disconnect logic.
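A sketch of the disconnect logic: notify the service that the session is over, then close the WebSocket cleanly. The termination message shape is an assumption; check the Streaming API reference:

```csharp
using System;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

static async Task DisconnectAsync(ClientWebSocket ws)
{
    // Tell the service the session is finished (message shape is an
    // assumption), then perform a normal WebSocket close handshake.
    var terminate = Encoding.UTF8.GetBytes("{\"type\":\"Terminate\"}");
    await ws.SendAsync(new ArraySegment<byte>(terminate),
        WebSocketMessageType.Text, endOfMessage: true, CancellationToken.None);

    await ws.CloseAsync(WebSocketCloseStatus.NormalClosure,
        "Session finished", CancellationToken.None);
}
```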
To run the program, first run dotnet build, and then dotnet run.
Next steps
To learn more about Streaming Speech-to-Text, see the following resources:
Need some help?
If you get stuck, or have any other questions, we’d love to help you out. Contact our support team at support@assemblyai.com or create a support ticket.