Transcribe streaming audio from a microphone in C#

Learn how to transcribe streaming audio in C#.

Overview

By the end of this tutorial, you’ll be able to transcribe audio from your microphone in C#.

Supported languages

Streaming Speech-to-Text is only available for English.

Before you begin

To complete this tutorial, you need the .NET SDK, SoX (installed in Step 1), and an AssemblyAI API key.

Here’s the full sample code for what you’ll build in this tutorial:

1using System;
2using System.Diagnostics;
3using System.Net.WebSockets;
4using System.Runtime.InteropServices;
5using System.Text;
6using System.Text.Json;
7using System.Threading;
8using System.Threading.Tasks;
9
/// <summary>
/// Console sample that records microphone audio with SoX and streams it to the
/// AssemblyAI real-time WebSocket endpoint, printing transcripts as they arrive.
/// </summary>
class Program
{
    private const string API_KEY = "<YOUR_API_KEY>";
    private const int SAMPLE_RATE = 16000;
    private static Process soxProcess;
    private static ClientWebSocket ws;
    private static CancellationTokenSource cts;

    /// <summary>
    /// Entry point: wires up Ctrl+C handling, runs the transcription session,
    /// reports any failure, and always runs <see cref="Cleanup"/>.
    /// </summary>
    static async Task Main(string[] args)
    {
        cts = new CancellationTokenSource();
        Console.CancelKeyPress += OnCancelKeyPress;

        try
        {
            await ConnectAndTranscribe();
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error: {ex.Message}");
        }
        finally
        {
            Cleanup();
        }
    }

    /// <summary>
    /// Ctrl+C handler. The first press requests a graceful stop; a second press
    /// while the stop is still in progress is left unhandled so the runtime
    /// terminates the process.
    /// </summary>
    static void OnCancelKeyPress(object sender, ConsoleCancelEventArgs e)
    {
        if (cts.IsCancellationRequested)
        {
            // Already stopping: leave e.Cancel at its default so the user can force-quit.
            return;
        }

        e.Cancel = true;
        Console.WriteLine("\nStopping transcription...");
        cts.Cancel();
    }

    /// <summary>
    /// Opens the WebSocket, starts the receive loop, streams microphone audio,
    /// and then waits for the receive loop to finish.
    /// </summary>
    static async Task ConnectAndTranscribe()
    {
        ws = new ClientWebSocket();
        ws.Options.SetRequestHeader("Authorization", API_KEY);

        string url = $"wss://api.assemblyai.com/v2/realtime/ws?sample_rate={SAMPLE_RATE}";
        await ws.ConnectAsync(new Uri(url), cts.Token);

        var receiveTask = ReceiveMessagesAsync();

        await CaptureAndSendAudioAsync();

        // If capture ended without Ctrl+C (SoX missing or exited), the socket is
        // still open and the receive loop would keep waiting. Ask the service to
        // end the session so it closes the connection.
        if (ws.State == WebSocketState.Open && !cts.IsCancellationRequested)
        {
            try
            {
                byte[] terminate = Encoding.UTF8.GetBytes("{\"terminate_session\": true}");
                await ws.SendAsync(
                    new ArraySegment<byte>(terminate),
                    WebSocketMessageType.Text,
                    true,
                    cts.Token
                );
            }
            catch (OperationCanceledException)
            {
                // Ctrl+C arrived while sending; the receive loop stops on the same token.
            }
        }

        await receiveTask;
    }

    /// <summary>
    /// Starts SoX, reads its standard output in 4096-byte chunks, and forwards
    /// each chunk to the WebSocket as a binary message until SoX stops producing
    /// data, cancellation is requested, or the socket is no longer open.
    /// </summary>
    static async Task CaptureAndSendAudioAsync()
    {
        // NOTE(review): "--type wav" makes SoX write a WAV header before the
        // samples; confirm the service tolerates it or switch to "--type raw".
        var soxArguments = string.Join(' ', new[] {
            "-d",
            "--no-show-progress",
            $"--rate {SAMPLE_RATE}",
            "--channels 1",
            "--encoding signed-integer",
            "--bits 16",
            "--type wav",
            "-"
        });

        soxProcess = new Process
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = "sox",
                Arguments = soxArguments,
                RedirectStandardOutput = true,
                UseShellExecute = false,
                CreateNoWindow = true
            }
        };

        try
        {
            soxProcess.Start();
            var soxOutputStream = soxProcess.StandardOutput.BaseStream;
            var buffer = new byte[4096];
            int bytesRead;

            while ((bytesRead = await soxOutputStream.ReadAsync(buffer, 0, buffer.Length, cts.Token)) > 0)
            {
                // A read already in flight can still return data after Ctrl+C; don't forward it.
                if (cts.Token.IsCancellationRequested) break;

                // Once the socket is closed or aborted there is nowhere to send
                // audio, so stop recording instead of discarding it forever.
                if (ws.State != WebSocketState.Open)
                {
                    Console.WriteLine("WebSocket is no longer open; stopping audio capture.");
                    break;
                }

                await ws.SendAsync(
                    new ArraySegment<byte>(buffer, 0, bytesRead),
                    WebSocketMessageType.Binary,
                    true,
                    cts.Token
                );
            }
        }
        catch (OperationCanceledException)
        {
            // Expected when the user stops the program.
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error capturing audio: {ex.Message}");
        }
    }

    /// <summary>
    /// Receives WebSocket messages until the socket closes or cancellation is
    /// requested, reassembling fragmented messages before handing the decoded
    /// text to <see cref="ProcessMessage"/>.
    /// </summary>
    static async Task ReceiveMessagesAsync()
    {
        var buffer = new byte[4096];

        try
        {
            while (ws.State == WebSocketState.Open && !cts.Token.IsCancellationRequested)
            {
                int length = 0;
                WebSocketReceiveResult result;

                // One logical message may arrive in several pieces; keep reading
                // (growing the buffer when it fills) until EndOfMessage is set.
                do
                {
                    if (length == buffer.Length)
                    {
                        Array.Resize(ref buffer, buffer.Length * 2);
                    }

                    result = await ws.ReceiveAsync(
                        new ArraySegment<byte>(buffer, length, buffer.Length - length),
                        cts.Token
                    );
                    length += result.Count;
                }
                while (!result.EndOfMessage && result.MessageType != WebSocketMessageType.Close);

                if (result.MessageType == WebSocketMessageType.Close)
                {
                    await ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", cts.Token);
                    break;
                }

                string message = Encoding.UTF8.GetString(buffer, 0, length);
                ProcessMessage(message);
            }
        }
        catch (OperationCanceledException)
        {
            // Expected when the user stops the program.
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error receiving messages: {ex.Message}");
        }
    }

    /// <summary>
    /// Parses one JSON message and prints the session id, a partial transcript,
    /// a final transcript, or an error. Messages with empty text are ignored.
    /// </summary>
    /// <param name="message">The complete JSON text of one WebSocket message.</param>
    static void ProcessMessage(string message)
    {
        try
        {
            using JsonDocument doc = JsonDocument.Parse(message);
            JsonElement root = doc.RootElement;

            // TryGetProperty throws on anything that is not a JSON object.
            if (root.ValueKind != JsonValueKind.Object)
                return;

            string messageType =
                root.TryGetProperty("message_type", out JsonElement messageTypeElement)
                && messageTypeElement.ValueKind == JsonValueKind.String
                    ? messageTypeElement.GetString()
                    : null;

            // Errors must be handled before the "text" check below, which would
            // otherwise drop them. A bare {"error": ...} payload without
            // message_type is reported as well (presumably how the service
            // signals some failures; verify against the API reference).
            if (messageType == "Error" || (messageType == null && root.TryGetProperty("error", out _)))
            {
                string error =
                    root.TryGetProperty("error", out JsonElement errorElement)
                    && errorElement.ValueKind == JsonValueKind.String
                        ? errorElement.GetString()
                        : "Unknown error";
                Console.WriteLine($"Error from AssemblyAI: {error}");
                return;
            }

            if (messageType == null)
                return;

            if (messageType == "SessionBegins" && root.TryGetProperty("session_id", out JsonElement sessionIdElement))
            {
                string sessionId = sessionIdElement.GetString();
                Console.WriteLine($"Session ID: {sessionId}");
                return;
            }

            if (!root.TryGetProperty("text", out JsonElement textElement)
                || textElement.ValueKind != JsonValueKind.String)
                return;

            string text = textElement.GetString();
            if (string.IsNullOrWhiteSpace(text))
                return;

            if (messageType == "PartialTranscript")
            {
                // "\r" rewrites the current console line with the latest partial.
                Console.Write($"\rPartial: {text}");
            }
            else if (messageType == "FinalTranscript")
            {
                Console.WriteLine($"\nFinal: {text}");
            }
        }
        catch (JsonException ex)
        {
            Console.WriteLine($"Error parsing message: {ex.Message}");
        }
    }

    /// <summary>
    /// Best-effort shutdown: stops SoX if it is still running, closes the
    /// WebSocket if it is still open (waiting at most two seconds), and
    /// disposes both. Failures are reported but never thrown.
    /// </summary>
    static void Cleanup()
    {
        if (soxProcess != null)
        {
            try
            {
                if (!soxProcess.HasExited)
                {
                    soxProcess.Kill();
                    Console.WriteLine("Recording stopped.");
                }
            }
            catch (InvalidOperationException)
            {
                // SoX never started or exited in the meantime: nothing to stop.
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error stopping recording: {ex.Message}");
            }
            finally
            {
                soxProcess.Dispose();
                soxProcess = null;
            }
        }

        if (ws != null)
        {
            try
            {
                if (ws.State == WebSocketState.Open)
                {
                    // Cleanup is synchronous, so block here, but only for a bounded time.
                    var closeTask = ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
                    if (closeTask.Wait(TimeSpan.FromSeconds(2)))
                    {
                        Console.WriteLine("WebSocket connection closed.");
                    }
                    else
                    {
                        Console.WriteLine("Timed out waiting for the WebSocket to close.");
                    }
                }
            }
            catch (Exception ex)
            {
                // Wait wraps failures in AggregateException; show the underlying cause.
                Console.WriteLine($"Error closing WebSocket: {ex.GetBaseException().Message}");
            }
            finally
            {
                ws.Dispose();
                ws = null;
            }
        }
    }
}

Step 1: Setting up your project

1

Create a new C# console application.

$ dotnet new console
2

Install SoX, a cross-platform audio library, to record audio from your microphone.

$ brew install sox

Step 2: Define the program structure and assign your API key

1

Add the following using directives:

1using System;
2using System.Diagnostics;
3using System.Net.WebSockets;
4using System.Runtime.InteropServices;
5using System.Text;
6using System.Text.Json;
7using System.Threading;
8using System.Threading.Tasks;
2

Browse to Account, and then click the text under Your API key to copy it.

3

Define the program structure and assign your API key:

/// <summary>
/// Holds the configuration and the shared handles used by the methods added
/// in the following steps.
/// </summary>
class Program
{
    // Configuration: your API key and the audio sample rate in Hz.
    private const int SAMPLE_RATE = 16000;
    private const string API_KEY = "<YOUR_API_KEY>";

    // Shared state: cancellation source, WebSocket, and the SoX recorder process.
    private static CancellationTokenSource cts;
    private static ClientWebSocket ws;
    private static Process soxProcess;
}
4

Set up the main method to handle the program logic:

/// <summary>
/// Program entry point. Creates the cancellation source, subscribes the
/// Ctrl+C handler, runs the transcription session, prints any exception
/// message, and always calls Cleanup afterwards.
/// </summary>
static async Task Main(string[] args)
{
    cts = new CancellationTokenSource();
    Console.CancelKeyPress += OnCancelKeyPress;

    try
    {
        Task session = ConnectAndTranscribe();
        await session;
    }
    catch (Exception failure)
    {
        Console.WriteLine("Error: " + failure.Message);
    }
    finally
    {
        // Runs whether the session completed, failed, or was cancelled.
        Cleanup();
    }
}

Step 3: Record audio from the microphone

In this step, you’ll set up microphone recording using SoX.

1

Create a CaptureAndSendAudioAsync method to handle the microphone recording and audio streaming:

/// <summary>
/// Starts SoX, reads its standard output in 4096-byte chunks, and forwards
/// each chunk to the WebSocket as a binary message until SoX stops producing
/// data, cancellation is requested, or the socket is no longer open.
/// </summary>
static async Task CaptureAndSendAudioAsync()
{
    // NOTE(review): "--type wav" makes SoX write a WAV header before the
    // samples; confirm the service tolerates it or switch to "--type raw".
    var soxArguments = string.Join(' ', new[] {
        "-d",
        "--no-show-progress",
        $"--rate {SAMPLE_RATE}",
        "--channels 1",
        "--encoding signed-integer",
        "--bits 16",
        "--type wav",
        "-"
    });

    soxProcess = new Process
    {
        StartInfo = new ProcessStartInfo
        {
            FileName = "sox",
            Arguments = soxArguments,
            RedirectStandardOutput = true,
            UseShellExecute = false,
            CreateNoWindow = true
        }
    };

    try
    {
        soxProcess.Start();
        var soxOutputStream = soxProcess.StandardOutput.BaseStream;
        var buffer = new byte[4096];
        int bytesRead;

        while ((bytesRead = await soxOutputStream.ReadAsync(buffer, 0, buffer.Length, cts.Token)) > 0)
        {
            // A read already in flight can still return data after Ctrl+C; don't forward it.
            if (cts.Token.IsCancellationRequested) break;

            // Once the socket is closed or aborted there is nowhere to send
            // audio, so stop recording instead of discarding it forever.
            if (ws.State != WebSocketState.Open)
            {
                Console.WriteLine("WebSocket is no longer open; stopping audio capture.");
                break;
            }

            await ws.SendAsync(
                new ArraySegment<byte>(buffer, 0, bytesRead),
                WebSocketMessageType.Binary,
                true,
                cts.Token
            );
        }
    }
    catch (OperationCanceledException)
    {
        // Expected when the user stops the program.
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error capturing audio: {ex.Message}");
    }
}
Audio data format

The SoX arguments configure the format of the audio output. The arguments configure the format to a single channel with 16-bit signed integer PCM encoding and 16 kHz sample rate.

If you want to stream data from elsewhere, make sure that your audio data is in the following format:

  • Single channel
  • 16-bit signed integer PCM or mu-law encoding

By default, the Streaming STT service expects PCM16-encoded audio. If you want to use mu-law encoding, see Specifying the encoding.

Step 4: Set up the WebSocket connection to the Streaming service

Streaming Speech-to-Text uses WebSockets to stream audio to AssemblyAI. This requires first establishing a connection to the API.

1

Create a ConnectAndTranscribe method to handle the connection and transcription:

/// <summary>
/// Opens the WebSocket with the API key in the Authorization header, starts
/// the receive loop, streams microphone audio, and then waits for the receive
/// loop to finish.
/// </summary>
static async Task ConnectAndTranscribe()
{
    ws = new ClientWebSocket();
    ws.Options.SetRequestHeader("Authorization", API_KEY);

    string url = $"wss://api.assemblyai.com/v2/realtime/ws?sample_rate={SAMPLE_RATE}";
    await ws.ConnectAsync(new Uri(url), cts.Token);

    // Start receiving before sending so no transcript is missed.
    var receiveTask = ReceiveMessagesAsync();

    await CaptureAndSendAudioAsync();

    // If capture ended without Ctrl+C (SoX missing or exited), the socket is
    // still open and the receive loop would keep waiting. Ask the service to
    // end the session so it closes the connection.
    if (ws.State == WebSocketState.Open && !cts.IsCancellationRequested)
    {
        try
        {
            byte[] terminate = Encoding.UTF8.GetBytes("{\"terminate_session\": true}");
            await ws.SendAsync(
                new ArraySegment<byte>(terminate),
                WebSocketMessageType.Text,
                true,
                cts.Token
            );
        }
        catch (OperationCanceledException)
        {
            // Ctrl+C arrived while sending; the receive loop stops on the same token.
        }
    }

    await receiveTask;
}
Sample rate

The sample_rate query parameter is the number of audio samples per second, measured in hertz (Hz). Higher sample rates result in higher quality audio, which may lead to better transcripts, but also more data being sent over the network.

We recommend the following sample rates:

  • Minimum quality: 8_000 (8 kHz)
  • Medium quality: 16_000 (16 kHz)
  • Maximum quality: 48_000 (48 kHz)

If you don’t set a sample rate on the real-time transcriber, it defaults to 16 kHz.

2

Create a ReceiveMessagesAsync method to handle the message receiving:

/// <summary>
/// Receives WebSocket messages until the socket closes or cancellation is
/// requested, reassembling fragmented messages before handing the decoded
/// text to ProcessMessage.
/// </summary>
static async Task ReceiveMessagesAsync()
{
    var buffer = new byte[4096];

    try
    {
        while (ws.State == WebSocketState.Open && !cts.Token.IsCancellationRequested)
        {
            int length = 0;
            WebSocketReceiveResult result;

            // One logical message may arrive in several pieces; keep reading
            // (growing the buffer when it fills) until EndOfMessage is set.
            do
            {
                if (length == buffer.Length)
                {
                    Array.Resize(ref buffer, buffer.Length * 2);
                }

                result = await ws.ReceiveAsync(
                    new ArraySegment<byte>(buffer, length, buffer.Length - length),
                    cts.Token
                );
                length += result.Count;
            }
            while (!result.EndOfMessage && result.MessageType != WebSocketMessageType.Close);

            if (result.MessageType == WebSocketMessageType.Close)
            {
                // Complete the close handshake the server started.
                await ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", cts.Token);
                break;
            }

            string message = Encoding.UTF8.GetString(buffer, 0, length);
            ProcessMessage(message);
        }
    }
    catch (OperationCanceledException)
    {
        // Expected when the user stops the program.
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error receiving messages: {ex.Message}");
    }
}
3

Create a ProcessMessage method to handle the message processing:

The real-time transcriber returns two types of transcripts: partial and final.

  • Partial transcripts are returned as the audio is being streamed to AssemblyAI.
  • Final transcripts are returned when the service detects a pause in speech.
/// <summary>
/// Parses one JSON message and prints the session id, a partial transcript,
/// a final transcript, or an error. Messages with empty text are ignored.
/// </summary>
/// <param name="message">The complete JSON text of one WebSocket message.</param>
static void ProcessMessage(string message)
{
    try
    {
        using JsonDocument doc = JsonDocument.Parse(message);
        JsonElement root = doc.RootElement;

        // TryGetProperty throws on anything that is not a JSON object.
        if (root.ValueKind != JsonValueKind.Object)
            return;

        string messageType =
            root.TryGetProperty("message_type", out JsonElement messageTypeElement)
            && messageTypeElement.ValueKind == JsonValueKind.String
                ? messageTypeElement.GetString()
                : null;

        // Errors must be handled before the "text" check below, which would
        // otherwise drop them. A bare {"error": ...} payload without
        // message_type is reported as well (presumably how the service
        // signals some failures; verify against the API reference).
        if (messageType == "Error" || (messageType == null && root.TryGetProperty("error", out _)))
        {
            string error =
                root.TryGetProperty("error", out JsonElement errorElement)
                && errorElement.ValueKind == JsonValueKind.String
                    ? errorElement.GetString()
                    : "Unknown error";
            Console.WriteLine($"Error from AssemblyAI: {error}");
            return;
        }

        if (messageType == null)
            return;

        if (messageType == "SessionBegins" && root.TryGetProperty("session_id", out JsonElement sessionIdElement))
        {
            string sessionId = sessionIdElement.GetString();
            Console.WriteLine($"Session ID: {sessionId}");
            return;
        }

        if (!root.TryGetProperty("text", out JsonElement textElement)
            || textElement.ValueKind != JsonValueKind.String)
            return;

        string text = textElement.GetString();
        if (string.IsNullOrWhiteSpace(text))
            return;

        if (messageType == "PartialTranscript")
        {
            // "\r" rewrites the current console line with the latest partial.
            Console.Write($"\rPartial: {text}");
        }
        else if (messageType == "FinalTranscript")
        {
            Console.WriteLine($"\nFinal: {text}");
        }
    }
    catch (JsonException ex)
    {
        Console.WriteLine($"Error parsing message: {ex.Message}");
    }
}
End of utterance controls

You can configure the silence threshold for automatic utterance detection and programmatically force the end of an utterance to immediately get a Final transcript.

Step 5: Disconnect the streaming service

In this step, you’ll set up the disconnect logic.

1

Handle key press events to disconnect the streaming service:

/// <summary>
/// Ctrl+C handler. The first press requests a graceful stop; a second press
/// while the stop is still in progress is left unhandled so the runtime
/// terminates the process.
/// </summary>
static void OnCancelKeyPress(object sender, ConsoleCancelEventArgs e)
{
    if (cts.IsCancellationRequested)
    {
        // Already stopping: leave e.Cancel at its default so the user can force-quit.
        return;
    }

    // Keep the process alive so the cleanup logic gets a chance to run.
    e.Cancel = true;
    Console.WriteLine("\nStopping transcription...");
    cts.Cancel();
}
2

Set up the cleanup logic:

/// <summary>
/// Best-effort shutdown: stops SoX if it is still running, closes the
/// WebSocket if it is still open (waiting at most two seconds), and
/// disposes both. Failures are reported but never thrown.
/// </summary>
static void Cleanup()
{
    if (soxProcess != null)
    {
        try
        {
            if (!soxProcess.HasExited)
            {
                soxProcess.Kill();
                Console.WriteLine("Recording stopped.");
            }
        }
        catch (InvalidOperationException)
        {
            // SoX never started or exited in the meantime: nothing to stop.
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error stopping recording: {ex.Message}");
        }
        finally
        {
            soxProcess.Dispose();
            soxProcess = null;
        }
    }

    if (ws != null)
    {
        try
        {
            if (ws.State == WebSocketState.Open)
            {
                // Cleanup is synchronous, so block here, but only for a bounded time.
                var closeTask = ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "Closing", CancellationToken.None);
                if (closeTask.Wait(TimeSpan.FromSeconds(2)))
                {
                    Console.WriteLine("WebSocket connection closed.");
                }
                else
                {
                    Console.WriteLine("Timed out waiting for the WebSocket to close.");
                }
            }
        }
        catch (Exception ex)
        {
            // Wait wraps failures in AggregateException; show the underlying cause.
            Console.WriteLine($"Error closing WebSocket: {ex.GetBaseException().Message}");
        }
        finally
        {
            ws.Dispose();
            ws = null;
        }
    }
}

To run the program, first run dotnet build and then dotnet run.

Next steps

To learn more about Streaming Speech-to-Text, see the following resources:

Need some help?

If you get stuck, or have any other questions, we’d love to help you out. Contact our support team at support@assemblyai.com or create a support ticket.