Apply Noise Reduction to Audio for Streaming Speech-to-Text

This guide demonstrates how to implement a noise reduction system for real-time audio transcription using AssemblyAI’s Streaming STT and the noisereduce library. You’ll learn how to create a custom audio stream that preprocesses incoming audio to remove background noise before it reaches the transcription service.

This solution is particularly valuable for:

  • Voice assistants operating in noisy environments
  • Customer service applications processing calls
  • Meeting transcription tools
  • Voice-enabled applications requiring high accuracy

The implementation uses Python and combines proven audio processing techniques with AssemblyAI’s powerful transcription capabilities. While our example focuses on microphone input, the principles can be applied to any real-time audio stream.

Quickstart

1import logging
2import numpy as np
3import noisereduce as nr
4import assemblyai as aai
5from typing import Type
6from assemblyai.streaming.v3 import (
7 BeginEvent,
8 StreamingClient,
9 StreamingClientOptions,
10 StreamingError,
11 StreamingEvents,
12 StreamingParameters,
13 StreamingSessionParameters,
14 TerminationEvent,
15 TurnEvent,
16)
17
18logging.basicConfig(level=logging.INFO)
19
20api_key = "<YOUR_API_KEY>"
21
# --- Noise-reduced microphone stream ---
class NoiseReducedMicrophoneStream:
    """Iterator wrapping the microphone stream that denoises audio chunks.

    Raw 16-bit PCM from the microphone is accumulated into a ~0.5 s
    buffer; once full, the buffer is denoised with noisereduce and
    emitted as 16-bit PCM bytes. A small overlap is kept between chunks
    as context for the noise estimate, but overlap samples are emitted
    only once so no audio is duplicated in the stream.
    """

    OVERLAP = 1024  # samples of context carried over between chunks

    def __init__(self, sample_rate):
        self.sample_rate = sample_rate
        # int16 keeps the buffer dtype consistent with the raw PCM data.
        self.buffer = np.array([], dtype=np.int16)
        self.buffer_size = int(sample_rate * 0.5)  # 0.5 second buffer
        self.mic = aai.extras.MicrophoneStream(sample_rate=sample_rate)
        # Leading samples of the buffer already sent with the last chunk.
        self._sent_overlap = 0

    def __iter__(self):
        return self

    def __next__(self):
        raw_audio = next(self.mic)
        audio_data = np.frombuffer(raw_audio, dtype=np.int16)
        self.buffer = np.append(self.buffer, audio_data)

        if len(self.buffer) >= self.buffer_size:
            # noisereduce expects float audio in [-1.0, 1.0].
            float_audio = self.buffer.astype(np.float32) / 32768.0
            denoised = nr.reduce_noise(
                y=float_audio,
                sr=self.sample_rate,
                prop_decrease=0.75,
                n_fft=1024,
            )
            # Clip before casting back: samples at or above 1.0 would
            # wrap around int16 and produce loud clicks.
            int_audio = np.clip(
                denoised * 32768.0, -32768, 32767
            ).astype(np.int16)
            # Skip the samples that were already emitted as part of the
            # previous chunk's overlap; they were kept only as context.
            out = int_audio[self._sent_overlap:]
            keep = min(self.OVERLAP, len(self.buffer))
            self.buffer = self.buffer[-keep:]  # keep some overlap
            self._sent_overlap = keep
            return out.tobytes()

        # Buffer not full yet: yield an empty payload and keep collecting.
        return b''
51
52# --- Event Handlers ---
def on_begin(self: StreamingClient, event: BeginEvent):
    """Log the session id when the streaming session opens.

    `self` is the StreamingClient *instance* invoking the handler, so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f" Session started: {event.id}")
55
def on_turn(self: StreamingClient, event: TurnEvent):
    """Print each turn's transcript; request formatted turns once a turn ends.

    `self` is the StreamingClient instance (it exposes `set_params`), so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f"{event.transcript} ({event.end_of_turn})")

    # Ask the service to send a formatted version of this finished turn.
    if event.end_of_turn and not event.turn_is_formatted:
        self.set_params(StreamingSessionParameters(format_turns=True))
61
def on_terminated(self: StreamingClient, event: TerminationEvent):
    """Report how much audio was processed when the session closes."""
    print(f" Session terminated after {event.audio_duration_seconds} seconds")
64
def on_error(self: StreamingClient, error: StreamingError):
    """Surface streaming errors on stdout."""
    print(f" Error occurred: {error}")
67
# --- Main Function ---
def main():
    """Stream denoised microphone audio to AssemblyAI and print transcripts."""
    client = StreamingClient(
        StreamingClientOptions(
            api_key=api_key,
            api_host="streaming.assemblyai.com",
        )
    )

    # Wire up the event handlers defined above.
    event_handlers = {
        StreamingEvents.Begin: on_begin,
        StreamingEvents.Turn: on_turn,
        StreamingEvents.Termination: on_terminated,
        StreamingEvents.Error: on_error,
    }
    for streaming_event, handler in event_handlers.items():
        client.on(streaming_event, handler)

    client.connect(
        StreamingParameters(
            sample_rate=16000,
            format_turns=True,
        )
    )

    try:
        # Feed the noise-reduced microphone stream to the service.
        client.stream(NoiseReducedMicrophoneStream(sample_rate=16000))
    finally:
        # Always tear the session down, even on Ctrl+C.
        client.disconnect(terminate=True)

if __name__ == "__main__":
    main()

Step-by-step guide

First, install the following packages: assemblyai[extras] (the extras provide the microphone support used below), noisereduce, and numpy

$ pip install "assemblyai[extras]" noisereduce numpy
1import logging
2import numpy as np
3import noisereduce as nr
4import assemblyai as aai
5from typing import Type
6from assemblyai.streaming.v3 import (
7 BeginEvent,
8 StreamingClient,
9 StreamingClientOptions,
10 StreamingError,
11 StreamingEvents,
12 StreamingParameters,
13 StreamingSessionParameters,
14 TerminationEvent,
15 TurnEvent,
16)

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for a free account and get your API key from your dashboard. Please note that Streaming Speech-to-text is available for upgraded accounts only. If you’re on the free plan, you’ll need to upgrade your account by adding a credit card.

1api_key = "<YOUR_API_KEY>"

Make sure not to share this key with anyone — it is a private key uniquely associated with your account.

Create functions to handle different events during transcription.

def on_begin(self: StreamingClient, event: BeginEvent):
    """Log the session id when the streaming session opens.

    `self` is the StreamingClient *instance* invoking the handler, so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f" Session started: {event.id}")
3
def on_turn(self: StreamingClient, event: TurnEvent):
    """Print each turn's transcript; request formatted turns once a turn ends.

    `self` is the StreamingClient instance (it exposes `set_params`), so it
    is annotated as `StreamingClient` rather than `Type[StreamingClient]`.
    """
    print(f"{event.transcript} ({event.end_of_turn})")

    # Ask the service to send a formatted version of this finished turn.
    if event.end_of_turn and not event.turn_is_formatted:
        self.set_params(StreamingSessionParameters(format_turns=True))
9
def on_terminated(self: StreamingClient, event: TerminationEvent):
    """Report how much audio was processed when the session closes."""
    print(f" Session terminated after {event.audio_duration_seconds} seconds")
12
def on_error(self: StreamingClient, error: StreamingError):
    """Surface streaming errors on stdout."""
    print(f" Error occurred: {error}")

Create a custom stream class that includes noise reduction.

class NoiseReducedMicrophoneStream:
    """Microphone stream that denoises audio before it is transcribed.

    Accumulates raw 16-bit PCM into a ~0.5 s buffer, denoises it with
    noisereduce, and yields 16-bit PCM bytes. A small overlap is kept
    between chunks as context for the noise estimate, and is emitted
    only once so no audio is duplicated in the stream.
    """

    def __init__(self, sample_rate):
        self.microphone_stream = aai.extras.MicrophoneStream(sample_rate=sample_rate)
        self.sample_rate = sample_rate
        # Keep the buffer in int16 so it matches the raw PCM samples
        self.buffer = np.array([], dtype=np.int16)
        self.buffer_size = int(sample_rate * 0.5)  # 0.5 seconds buffer
        # Leading samples of the buffer already sent with the last chunk
        self.sent_overlap = 0

    def __iter__(self):
        return self

    def __next__(self):
        # Get audio chunk from microphone
        audio_chunk = next(self.microphone_stream)

        # Convert bytes to numpy array
        audio_data = np.frombuffer(audio_chunk, dtype=np.int16)

        # Add to buffer
        self.buffer = np.append(self.buffer, audio_data)

        # Process when buffer is full
        if len(self.buffer) >= self.buffer_size:
            # Convert to float32 in [-1.0, 1.0] for noise reduction
            float_buffer = self.buffer.astype(np.float32) / 32768.0

            # Apply noise reduction
            # You can tweak these parameters to change the aggressiveness of the noise reduction
            reduced_noise = nr.reduce_noise(
                y=float_buffer,
                sr=self.sample_rate,
                prop_decrease=0.75,
                n_fft=1024
            )

            # Convert back to int16, clipping first so samples at the
            # extremes cannot wrap around and produce loud clicks
            processed_chunk = np.clip(
                reduced_noise * 32768.0, -32768, 32767
            ).astype(np.int16)

            # Drop the overlap that was already sent with the previous
            # chunk; it is kept only as context for the noise estimate
            output_chunk = processed_chunk[self.sent_overlap:]

            # Clear buffer but keep a small overlap
            overlap = 1024
            if len(self.buffer) > overlap:
                self.buffer = self.buffer[-overlap:]
            else:
                self.buffer = np.array([], dtype=np.int16)
            self.sent_overlap = len(self.buffer)

            # Convert back to bytes
            return output_chunk.tobytes()

        # If buffer not full, return empty bytes
        return b''

Now we create our streaming client and NoiseReducedMicrophoneStream, connect, and start streaming.

def main():
    """Stream denoised microphone audio to AssemblyAI and print transcripts."""
    client = StreamingClient(
        StreamingClientOptions(
            api_key=api_key,
            api_host="streaming.assemblyai.com",
        )
    )

    # Wire up the event handlers defined above.
    event_handlers = {
        StreamingEvents.Begin: on_begin,
        StreamingEvents.Turn: on_turn,
        StreamingEvents.Termination: on_terminated,
        StreamingEvents.Error: on_error,
    }
    for streaming_event, handler in event_handlers.items():
        client.on(streaming_event, handler)

    client.connect(
        StreamingParameters(
            sample_rate=16000,
            format_turns=True,
        )
    )

    try:
        # Feed the noise-reduced microphone stream to the service.
        client.stream(NoiseReducedMicrophoneStream(sample_rate=16000))
    finally:
        # Always tear the session down, even on Ctrl+C.
        client.disconnect(terminate=True)

if __name__ == "__main__":
    main()

You can press Ctrl+C to stop the transcription.