Apply Noise Reduction to Audio for Streaming Speech-to-Text

This guide demonstrates how to implement a noise reduction system for real-time audio transcription using AssemblyAI’s Streaming STT and the noisereduce library. You’ll learn how to create a custom audio stream that preprocesses incoming audio to remove background noise before it reaches the transcription service.

This solution is particularly valuable for:

  • Voice assistants operating in noisy environments
  • Customer service applications processing calls
  • Meeting transcription tools
  • Voice-enabled applications requiring high accuracy

The implementation uses Python and combines proven audio processing techniques with AssemblyAI’s powerful transcription capabilities. While our example focuses on microphone input, the principles can be applied to any real-time audio stream.

Quickstart

1import logging
2import numpy as np
3import noisereduce as nr
4import assemblyai as aai
5from typing import Type
6from assemblyai.streaming.v3 import (
7 BeginEvent,
8 StreamingClient,
9 StreamingClientOptions,
10 StreamingError,
11 StreamingEvents,
12 StreamingParameters,
13 StreamingSessionParameters,
14 TerminationEvent,
15 TurnEvent,
16)
17
18logging.basicConfig(level=logging.INFO)
19
20api_key = "<YOUR_API_KEY>"
21
# --- Noise-reduced microphone stream ---
class NoiseReducedMicrophoneStream:
    """Iterator wrapping the microphone stream that denoises audio chunks.

    Raw 16-bit PCM from the microphone is accumulated into a ~0.5 s
    buffer; once full, the buffer is denoised with noisereduce and
    emitted as 16-bit PCM bytes. A small overlap is kept between chunks
    as context for the noise estimate, but overlap samples are emitted
    only once so no audio is duplicated in the stream.
    """

    OVERLAP = 1024  # samples of context carried over between chunks

    def __init__(self, sample_rate):
        self.sample_rate = sample_rate
        # int16 keeps the buffer dtype consistent with the raw PCM data.
        self.buffer = np.array([], dtype=np.int16)
        self.buffer_size = int(sample_rate * 0.5)  # 0.5 second buffer
        self.mic = aai.extras.MicrophoneStream(sample_rate=sample_rate)
        # Leading samples of the buffer already sent with the last chunk.
        self._sent_overlap = 0

    def __iter__(self):
        return self

    def __next__(self):
        raw_audio = next(self.mic)
        audio_data = np.frombuffer(raw_audio, dtype=np.int16)
        self.buffer = np.append(self.buffer, audio_data)

        if len(self.buffer) >= self.buffer_size:
            # noisereduce expects float audio in [-1.0, 1.0].
            float_audio = self.buffer.astype(np.float32) / 32768.0
            denoised = nr.reduce_noise(
                y=float_audio,
                sr=self.sample_rate,
                prop_decrease=0.75,
                n_fft=1024,
            )
            # Clip before casting back: samples at or above 1.0 would
            # wrap around int16 and produce loud clicks.
            int_audio = np.clip(
                denoised * 32768.0, -32768, 32767
            ).astype(np.int16)
            # Skip the samples that were already emitted as part of the
            # previous chunk's overlap; they were kept only as context.
            out = int_audio[self._sent_overlap:]
            keep = min(self.OVERLAP, len(self.buffer))
            self.buffer = self.buffer[-keep:]  # keep some overlap
            self._sent_overlap = keep
            return out.tobytes()

        # Buffer not full yet: yield an empty payload and keep collecting.
        return b''
51
52# --- Event Handlers ---
def on_begin(self: StreamingClient, event: BeginEvent):
    """Log the session id when the streaming session opens.

    `self` is the StreamingClient *instance* invoking the handler, so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f" Session started: {event.id}")
55
def on_turn(self: StreamingClient, event: TurnEvent):
    """Print each turn's transcript; request formatted turns once a turn ends.

    `self` is the StreamingClient instance (it exposes `set_params`), so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f"{event.transcript} ({event.end_of_turn})")

    # Ask the service to send a formatted version of this finished turn.
    if event.end_of_turn and not event.turn_is_formatted:
        self.set_params(StreamingSessionParameters(format_turns=True))
61
def on_terminated(self: StreamingClient, event: TerminationEvent):
    """Report how much audio was processed when the session closes."""
    print(f" Session terminated after {event.audio_duration_seconds} seconds")
64
def on_error(self: StreamingClient, error: StreamingError):
    """Surface streaming errors on stdout."""
    print(f" Error occurred: {error}")
67
# --- Main Function ---
def main():
    """Stream denoised microphone audio to AssemblyAI and print transcripts."""
    client = StreamingClient(
        StreamingClientOptions(
            api_key=api_key,
            api_host="streaming.assemblyai.com",
        )
    )

    # Wire up the event handlers defined above.
    event_handlers = {
        StreamingEvents.Begin: on_begin,
        StreamingEvents.Turn: on_turn,
        StreamingEvents.Termination: on_terminated,
        StreamingEvents.Error: on_error,
    }
    for streaming_event, handler in event_handlers.items():
        client.on(streaming_event, handler)

    client.connect(
        StreamingParameters(
            sample_rate=16000,
            format_turns=True,
        )
    )

    try:
        # Feed the noise-reduced microphone stream to the service.
        client.stream(NoiseReducedMicrophoneStream(sample_rate=16000))
    finally:
        # Always tear the session down, even on Ctrl+C.
        client.disconnect(terminate=True)

if __name__ == "__main__":
    main()

Step-by-step guide

First, install the following packages: assemblyai[extras] (the extras provide the microphone support used below), noisereduce, and numpy

$ pip install "assemblyai[extras]" noisereduce numpy
1import logging
2import numpy as np
3import noisereduce as nr
4import assemblyai as aai
5from typing import Type
6from assemblyai.streaming.v3 import (
7 BeginEvent,
8 StreamingClient,
9 StreamingClientOptions,
10 StreamingError,
11 StreamingEvents,
12 StreamingParameters,
13 StreamingSessionParameters,
14 TerminationEvent,
15 TurnEvent,
16)

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for a free account and get your API key from your dashboard. Please note that Streaming Speech-to-text is available for upgraded accounts only. If you’re on the free plan, you’ll need to upgrade your account by adding a credit card.

1api_key = "<YOUR_API_KEY>"

Make sure not to share this key with anyone — it is a private key uniquely associated with your account.

Create functions to handle different events during transcription.

def on_begin(self: StreamingClient, event: BeginEvent):
    """Log the session id when the streaming session opens.

    `self` is the StreamingClient *instance* invoking the handler, so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f" Session started: {event.id}")
3
def on_turn(self: StreamingClient, event: TurnEvent):
    """Print each turn's transcript; request formatted turns once a turn ends.

    `self` is the StreamingClient instance (it exposes `set_params`), so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f"{event.transcript} ({event.end_of_turn})")

    # Ask the service to send a formatted version of this finished turn.
    if event.end_of_turn and not event.turn_is_formatted:
        self.set_params(StreamingSessionParameters(format_turns=True))
9
def on_terminated(self: StreamingClient, event: TerminationEvent):
    """Report how much audio was processed when the session closes."""
    print(f" Session terminated after {event.audio_duration_seconds} seconds")
12
def on_error(self: StreamingClient, error: StreamingError):
    """Surface streaming errors on stdout."""
    print(f" Error occurred: {error}")

Create a custom stream class that includes noise reduction.

class NoiseReducedMicrophoneStream:
    """Microphone stream that denoises audio before it is transcribed.

    Accumulates raw 16-bit PCM into a ~0.5 s buffer, denoises it with
    noisereduce, and yields 16-bit PCM bytes. A small overlap is kept
    between chunks as context for the noise estimate, and is emitted
    only once so no audio is duplicated in the stream.
    """

    def __init__(self, sample_rate):
        self.microphone_stream = aai.extras.MicrophoneStream(sample_rate=sample_rate)
        self.sample_rate = sample_rate
        # Keep the buffer in int16 so it matches the raw PCM samples
        self.buffer = np.array([], dtype=np.int16)
        self.buffer_size = int(sample_rate * 0.5)  # 0.5 seconds buffer
        # Leading samples of the buffer already sent with the last chunk
        self.sent_overlap = 0

    def __iter__(self):
        return self

    def __next__(self):
        # Get audio chunk from microphone
        audio_chunk = next(self.microphone_stream)

        # Convert bytes to numpy array
        audio_data = np.frombuffer(audio_chunk, dtype=np.int16)

        # Add to buffer
        self.buffer = np.append(self.buffer, audio_data)

        # Process when buffer is full
        if len(self.buffer) >= self.buffer_size:
            # Convert to float32 in [-1.0, 1.0] for noise reduction
            float_buffer = self.buffer.astype(np.float32) / 32768.0

            # Apply noise reduction
            # You can tweak these parameters to change the aggressiveness of the noise reduction
            reduced_noise = nr.reduce_noise(
                y=float_buffer,
                sr=self.sample_rate,
                prop_decrease=0.75,
                n_fft=1024
            )

            # Convert back to int16, clipping first so samples at the
            # extremes cannot wrap around and produce loud clicks
            processed_chunk = np.clip(
                reduced_noise * 32768.0, -32768, 32767
            ).astype(np.int16)

            # Drop the overlap that was already sent with the previous
            # chunk; it is kept only as context for the noise estimate
            output_chunk = processed_chunk[self.sent_overlap:]

            # Clear buffer but keep a small overlap
            overlap = 1024
            if len(self.buffer) > overlap:
                self.buffer = self.buffer[-overlap:]
            else:
                self.buffer = np.array([], dtype=np.int16)
            self.sent_overlap = len(self.buffer)

            # Convert back to bytes
            return output_chunk.tobytes()

        # If buffer not full, return empty bytes
        return b''

Now we create our streaming client and NoiseReducedMicrophoneStream, connect, and start streaming.

def main():
    """Stream denoised microphone audio to AssemblyAI and print transcripts."""
    client = StreamingClient(
        StreamingClientOptions(
            api_key=api_key,
            api_host="streaming.assemblyai.com",
        )
    )

    # Wire up the event handlers defined above.
    event_handlers = {
        StreamingEvents.Begin: on_begin,
        StreamingEvents.Turn: on_turn,
        StreamingEvents.Termination: on_terminated,
        StreamingEvents.Error: on_error,
    }
    for streaming_event, handler in event_handlers.items():
        client.on(streaming_event, handler)

    client.connect(
        StreamingParameters(
            sample_rate=16000,
            format_turns=True,
        )
    )

    try:
        # Feed the noise-reduced microphone stream to the service.
        client.stream(NoiseReducedMicrophoneStream(sample_rate=16000))
    finally:
        # Always tear the session down, even on Ctrl+C.
        client.disconnect(terminate=True)

if __name__ == "__main__":
    main()

You can press Ctrl+C to stop the transcription.