Set Up a Speaker Identification System Using Pinecone & NVIDIA TitaNet
This guide will demonstrate how to build an advanced speaker recognition and diarization system that you can use to identify speakers across multiple audio files. It will use:
- AssemblyAI for transcription and initial diarization.
- NVIDIA’s TitaNet model for speaker embedding generation.
- Pinecone for efficient similarity search of speaker embeddings.
Quickstart
```python
from pinecone import Pinecone, ServerlessSpec
import assemblyai as aai
import requests
import os
from pydub import AudioSegment
import mimetypes
import wave
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import torch
import numpy as np
import uuid
from sklearn.metrics.pairwise import cosine_similarity

# Obtain from your Pinecone dashboard.
pc = Pinecone(api_key="PINECONE_KEY_HERE")

# Obtain from your AssemblyAI dashboard.
aai.settings.api_key = "AAI_KEY_HERE"


def transcribe(file_url):
    config = aai.TranscriptionConfig(speaker_labels=True)  # Speaker labels must be enabled for this Cookbook.

    transcriber = aai.Transcriber(config=config)

    transcript = transcriber.transcribe(file_url)

    return transcript.json_response


def download_and_convert_to_wav(url, output_dir="./content/converted_audio"):
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename from URL.
    filename = url.split("/")[-1].split("?")[0]
    base_filename, file_extension = os.path.splitext(filename)

    # Download the file.
    response = requests.get(url)
    if response.status_code == 200:
        # Determine the file type.
        content_type = response.headers.get("content-type")
        if content_type:
            guessed_extension = mimetypes.guess_extension(content_type)
            if guessed_extension:
                file_extension = guessed_extension

        # Save the downloaded file.
        downloaded_file = os.path.join(output_dir, filename)
        with open(downloaded_file, "wb") as f:
            f.write(response.content)

        # Generate the WAV file name.
        wav_filename = f"{base_filename}.wav"
        wav_file = os.path.join(output_dir, wav_filename)

        # Load the audio file.
        audio = AudioSegment.from_file(downloaded_file)

        # Convert to mono if it's stereo.
        if audio.channels > 1:
            print("Setting channels to 1.")
            audio = audio.set_channels(1)

        # Export as WAV.
        audio.export(wav_file, format="wav")
        print(f"File converted and saved as: {wav_file}")

        # Remove the original downloaded file if it's different from the WAV file.
        if downloaded_file != wav_file:
            os.remove(downloaded_file)

        # Ensure the WAV file is single channel.
        with wave.open(wav_file, "rb") as wf:
            n_channels = wf.getnchannels()
            if n_channels > 1:
                print(f"Converting {n_channels} channels to mono...")
                # Read the frames.
                frames = wf.readframes(wf.getnframes())
                # Get other parameters.
                params = wf.getparams()
                # Close the file.
                wf.close()
                # Convert to mono.
                mono_frames = b"".join([frames[i::n_channels] for i in range(n_channels)])
                # Write the mono WAV file.
                with wave.open(wav_file, "wb") as wf:
                    wf.setparams(
                        (1, params.sampwidth, params.framerate, params.nframes, params.comptype, params.compname)
                    )
                    wf.writeframes(mono_frames)
                print("Conversion to mono complete.")

        return wav_file
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return None


def add_speaker_embedding_to_pinecone(speaker_name, speaker_embedding, unique_id=None):
    # Ensure the embedding is a 1D numpy array.
    if isinstance(speaker_embedding, torch.Tensor):
        embedding_np = speaker_embedding.squeeze().cpu().numpy()
    elif isinstance(speaker_embedding, np.ndarray):
        embedding_np = speaker_embedding.squeeze()
    else:
        raise ValueError("Unsupported embedding type. Expected torch.Tensor or numpy.ndarray")

    # Ensure the embedding is the correct shape.
    if embedding_np.shape != (192,):
        raise ValueError(f"Expected embedding of shape (192,), but got {embedding_np.shape}")

    # Convert to list for Pinecone.
    embedding_list = embedding_np.tolist()

    # Generate a unique ID if not provided.
    if unique_id is None:
        unique_id = f"speaker_{speaker_name}_{uuid.uuid4().hex[:8]}"

    # Create the metadata dictionary.
    metadata = {"speaker_name": speaker_name}

    # Upsert the vector to Pinecone.
    upsert_response = index.upsert(vectors=[(unique_id, embedding_list, metadata)])

    print(f"Upserted embedding for speaker {speaker_name} with ID {unique_id}")
    return unique_id


def find_closest_speaker(utterance_embedding, local_embeddings=None, local_only=False, threshold=0.5):
    def cosine_sim(a, b):
        return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

    best_match = {"speaker_name": "No match found", "score": 0}

    # Local embeddings processing.
    if local_embeddings is not None:
        for speaker_name, embedding in local_embeddings.items():
            score = cosine_sim(utterance_embedding, embedding)
            if score > best_match["score"]:
                print("Identified speaker " + speaker_name + " confidence " + str(score))
                best_match = {"speaker_name": speaker_name, "score": score}

    # Pinecone query (if not local_only and local_embeddings is empty or not provided).
    if not local_only and (local_embeddings is None or len(local_embeddings) == 0):
        results = index.query(vector=utterance_embedding.tolist(), top_k=1, include_metadata=True)
        if results["matches"]:
            pinecone_match = results["matches"][0]
            pinecone_score = pinecone_match["score"]
            if pinecone_score > best_match["score"]:
                best_match = {"speaker_name": pinecone_match["metadata"]["speaker_name"], "score": pinecone_score}

    # Check if the best match meets the threshold.
    if best_match["score"] < threshold:
        return "No match found", 0

    return best_match["speaker_name"], best_match["score"]


def identify_speakers_from_utterances(transcript, wav_file, min_utterance_length=5000, match_all_utterances=False):
    utterances = transcript["utterances"]
    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

    known_speakers = {}
    unknown_speakers = {}
    unknown_count = 0

    unknown_folder = "unknown_speaker_utterances"
    os.makedirs(unknown_folder, exist_ok=True)

    audio_file_name = os.path.basename(wav_file)
    full_audio = AudioSegment.from_wav(wav_file)

    def get_suitable_utterance(speaker, min_length):
        suitable_utterances = [
            u for u in utterances if u["speaker"] == speaker and (u["end"] - u["start"]) >= min_length
        ]
        if suitable_utterances:
            return max(suitable_utterances, key=lambda u: u["end"] - u["start"])
        return max((u for u in utterances if u["speaker"] == speaker), key=lambda u: u["end"] - u["start"])

    # First pass: Identify speakers.
    for speaker in set(u["speaker"] for u in utterances):
        if speaker not in known_speakers and speaker not in unknown_speakers:
            suitable_utterance = get_suitable_utterance(speaker, min_utterance_length)

            start_ms = suitable_utterance["start"]
            end_ms = suitable_utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            speaker_name, score = find_closest_speaker(embedding)
            print(f"Speaker: {speaker}, Closest match: {speaker_name}, Score: {score}")

            if score > 0.5:  # Adjust threshold as needed.
                known_speakers[speaker] = speaker_name
                print(f"Identified as known speaker: {speaker}")
            else:
                unknown_count += 1
                unknown_name = f"Unknown Speaker {chr(64 + unknown_count)}"
                unknown_wav = f"{unknown_folder}/unknown_speaker_{unknown_count}_from_{audio_file_name}"
                utterance_audio.export(unknown_wav, format="wav")
                unknown_speakers[speaker] = {
                    "name": unknown_name,
                    "wav_file": unknown_wav,
                    "duration": end_ms - start_ms,
                }
                print(f"New unknown speaker detected: {unknown_name} (Duration: {(end_ms - start_ms)/1000:.2f}s)")

    # Second pass: Replace speaker names.
    for utterance in utterances:
        if utterance["speaker"] in known_speakers:
            utterance["speaker"] = known_speakers[utterance["speaker"]]
        elif utterance["speaker"] in unknown_speakers:
            utterance["speaker"] = unknown_speakers[utterance["speaker"]]["name"]

    # Third pass: Match all utterances if requested.
    if match_all_utterances:
        print("Matching all utterances individually...")
        for utterance in utterances:
            start_ms = utterance["start"]
            end_ms = utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            new_speaker_name, score = find_closest_speaker(embedding)

            if score > 0.5 and new_speaker_name != utterance["speaker"]:
                print(f"Speaker change detected: '{utterance['speaker']}' -> '{new_speaker_name}' (Score: {score})")
                print(f"Utterance: {utterance['text'][:50]}...")
                utterance["speaker"] = new_speaker_name

    return utterances, unknown_speakers


pc.create_index(
    name="speaker-embeddings",
    dimension=192,  # Replace with model-specific dimensions - 192 is for TitaNet-Large.
    metric="cosine",  # Replace with your model metric.
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Connect to our new index.
index = pc.Index("speaker-embeddings")

speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

elon_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
altman_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/sam_altman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L3NhbV9hbHRtYW5fZmluZ2VycHJpbnQubXAzIiwiaWF0IjoxNzAwNjY5NjI0LCJleHAiOjE3MzIyMDU2MjR9._1yuMGzBhFcHr7xv76160Hb_SC-mH_Wv3_qX-S7XsTU&t=2023-11-22T16%3A13%3A44.103Z"
)
lex_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/lex_fridman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2xleF9mcmlkbWFuX2ZpbmdlcnByaW50Lm1wMyIsImlhdCI6MTcwMDY2OTc4OSwiZXhwIjoxNzMyMjA1Nzg5fQ.PUWTOLIHl4dcrWjh2ZJ_2TBaxMXpcU-x6OcvUDe6ZXQ&t=2023-11-22T16%3A16%3A29.697Z"
)

known_speakers = {"Elon Musk": elon_fingerprint, "Sam Altman": altman_fingerprint, "Lex Fridman": lex_fingerprint}

# Upload the known speakers.
for speaker, audio_file in known_speakers.items():
    print("***")
    print(speaker)
    print(audio_file)
    embedding = speaker_model.get_embedding(audio_file)
    add_speaker_embedding_to_pinecone(speaker, embedding)

audio_file = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
utterance_embedding = speaker_model.get_embedding(audio_file)

results = index.query(vector=utterance_embedding.tolist(), top_k=3, include_metadata=True)
print(results)

# Example: Conversation Between Sam Altman and Elon Musk
transcript_obj = transcribe("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
wav_file = download_and_convert_to_wav("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
identified_utterances, unknown_speakers = identify_speakers_from_utterances(transcript_obj, wav_file)

for utterance in identified_utterances:
    print(f"{utterance['speaker']}: {utterance['text']}")
```
Initial Setup
First, you’ll need to sign up for an AssemblyAI account and obtain your API key from your account dashboard. Then, sign up for a Pinecone account and obtain your API key from “API Keys” in the sidebar of your dashboard. Also note that any audio you use with this Cookbook should be in WAV format: AssemblyAI accepts other formats, but TitaNet requires WAV.
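If you have local recordings in another format, you can convert them to mono WAV up front with pydub (installed in the next step). This is a minimal sketch with placeholder file names; swap in your own paths.

```python
from pydub import AudioSegment

# Placeholder file names - replace with your own local recordings.
# Convert any non-WAV audio to mono WAV before generating TitaNet embeddings.
audio = AudioSegment.from_file("interview.mp3")
audio.set_channels(1).export("interview.wav", format="wav")
```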
Installing Dependencies
Now we’ll need to install the necessary libraries and frameworks for this project. Please note that this process can take several minutes to complete.
$ pip install -U Cython torch nemo_toolkit ffmpeg pydub pinecone-client assemblyai hydra-core pytorch_lightning huggingface_hub==0.23.5 librosa transformers pandas inflect webdataset sentencepiece youtokentome pyannote-audio editdistance jiwer lhotse datasets
Pinecone Setup
In this section, we’ll import Pinecone, create a new index for our speaker embeddings, and connect to the index. Please enter your Pinecone API key in the placeholder below.
```python
from pinecone import Pinecone, ServerlessSpec

# Obtain from your Pinecone dashboard.
pc = Pinecone(api_key="PINECONE_KEY_HERE")

pc.create_index(
    name="speaker-embeddings",
    dimension=192,  # Replace with model-specific dimensions - 192 is for TitaNet-Large.
    metric="cosine",  # Replace with your model metric.
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Connect to our new index.
index = pc.Index("speaker-embeddings")
```
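Note that create_index will raise an error if you re-run this cell after the index already exists. If you want the cell to be safely re-runnable, a guard like the sketch below works with recent versions of the Pinecone Python client (the exact helper for listing index names can vary between client versions).

```python
# Only create the index if it doesn't already exist, so the cell can be re-run safely.
if "speaker-embeddings" not in pc.list_indexes().names():
    pc.create_index(
        name="speaker-embeddings",
        dimension=192,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index("speaker-embeddings")
```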
AssemblyAI Setup
Now we’ll set up AssemblyAI for transcription and diarization. We’ll import the necessary modules and create a function to transcribe our audio files with speaker labels enabled. Please enter your AssemblyAI API key in the cell below.
```python
import assemblyai as aai

aai.settings.api_key = "AAI_KEY_HERE"


def transcribe(file_url):
    config = aai.TranscriptionConfig(speaker_labels=True)  # Speaker labels must be enabled for this Cookbook.

    transcriber = aai.Transcriber(config=config)

    transcript = transcriber.transcribe(file_url)

    return transcript.json_response
```
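As a quick sanity check, you can call transcribe on any publicly accessible audio URL (the URL below is a placeholder) and confirm that the response contains diarized utterances.

```python
# Placeholder URL - any direct link to an audio file, or a local file path, works with the SDK.
transcript_json = transcribe("https://example.com/two_speaker_interview.wav")

# With speaker_labels enabled, the JSON response includes an "utterances" list.
for utterance in transcript_json["utterances"][:3]:
    print(utterance["speaker"], utterance["text"][:60])
```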
We’ll also need to create a download_and_convert_to_wav helper function. This function takes a file URL, downloads the file, and converts it to WAV format; if the file is already a WAV, it’s simply downloaded. The files must be in WAV format to work properly with TitaNet.
```python
import requests
import os
from pydub import AudioSegment
import mimetypes
import wave


def download_and_convert_to_wav(url, output_dir="./content/converted_audio"):
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename from URL.
    filename = url.split("/")[-1].split("?")[0]
    base_filename, file_extension = os.path.splitext(filename)

    # Download the file.
    response = requests.get(url)
    if response.status_code == 200:
        # Determine the file type.
        content_type = response.headers.get("content-type")
        if content_type:
            guessed_extension = mimetypes.guess_extension(content_type)
            if guessed_extension:
                file_extension = guessed_extension

        # Save the downloaded file.
        downloaded_file = os.path.join(output_dir, filename)
        with open(downloaded_file, "wb") as f:
            f.write(response.content)

        # Generate the WAV file name.
        wav_filename = f"{base_filename}.wav"
        wav_file = os.path.join(output_dir, wav_filename)

        # Load the audio file.
        audio = AudioSegment.from_file(downloaded_file)

        # Convert to mono if it's stereo.
        if audio.channels > 1:
            print("Setting channels to 1.")
            audio = audio.set_channels(1)

        # Export as WAV.
        audio.export(wav_file, format="wav")
        print(f"File converted and saved as: {wav_file}")

        # Remove the original downloaded file if it's different from the WAV file.
        if downloaded_file != wav_file:
            os.remove(downloaded_file)

        # Ensure the WAV file is single channel.
        with wave.open(wav_file, "rb") as wf:
            n_channels = wf.getnchannels()
            if n_channels > 1:
                print(f"Converting {n_channels} channels to mono...")
                # Read the frames.
                frames = wf.readframes(wf.getnframes())
                # Get other parameters.
                params = wf.getparams()
                # Close the file.
                wf.close()
                # Convert to mono.
                mono_frames = b"".join([frames[i::n_channels] for i in range(n_channels)])
                # Write the mono WAV file.
                with wave.open(wav_file, "wb") as wf:
                    wf.setparams(
                        (1, params.sampwidth, params.framerate, params.nframes, params.comptype, params.compname)
                    )
                    wf.writeframes(mono_frames)
                print("Conversion to mono complete.")

        return wav_file
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return None
```
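Usage looks like this; the URL is a placeholder, and the returned path points into the output directory.

```python
# Placeholder URL for illustration.
wav_path = download_and_convert_to_wav("https://example.com/interview.mp3")
print(wav_path)  # e.g. ./content/converted_audio/interview.wav
```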
NVIDIA’s TitaNet Model Setup
Next we’ll import NeMo’s EncDecSpeakerLabelModel class and load NVIDIA’s TitaNet model. This model allows us to generate the speaker embeddings we use as speaker fingerprints, and to convert individual utterances into embeddings for comparison against those fingerprints.
```python
from nemo.collections.asr.models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")
```
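To confirm the model loaded correctly, you can embed any short WAV clip (the file name below is a placeholder) and check the shape; TitaNet-Large produces 192-dimensional embeddings.

```python
# Placeholder path - use any short mono WAV clip you have on disk.
test_embedding = speaker_model.get_embedding("sample_speaker.wav")
print(type(test_embedding))            # a torch.Tensor
print(test_embedding.squeeze().shape)  # expected: torch.Size([192])
```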
We’ll now define an add_speaker_embedding_to_pinecone function to add our speaker embeddings to the Pinecone database.
```python
import torch
import numpy as np
import uuid


def add_speaker_embedding_to_pinecone(speaker_name, speaker_embedding, unique_id=None):
    # Ensure the embedding is a 1D numpy array.
    if isinstance(speaker_embedding, torch.Tensor):
        embedding_np = speaker_embedding.squeeze().cpu().numpy()
    elif isinstance(speaker_embedding, np.ndarray):
        embedding_np = speaker_embedding.squeeze()
    else:
        raise ValueError("Unsupported embedding type. Expected torch.Tensor or numpy.ndarray")

    # Ensure the embedding is the correct shape.
    if embedding_np.shape != (192,):
        raise ValueError(f"Expected embedding of shape (192,), but got {embedding_np.shape}")

    # Convert to list for Pinecone.
    embedding_list = embedding_np.tolist()

    # Generate a unique ID if not provided.
    if unique_id is None:
        unique_id = f"speaker_{speaker_name}_{uuid.uuid4().hex[:8]}"

    # Create the metadata dictionary.
    metadata = {"speaker_name": speaker_name}

    # Upsert the vector to Pinecone.
    upsert_response = index.upsert(vectors=[(unique_id, embedding_list, metadata)])

    print(f"Upserted embedding for speaker {speaker_name} with ID {unique_id}")
    return unique_id
```
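If you want to double-check a single record right after upserting it, you can fetch it back by the returned ID. This is a hedged sketch with a placeholder speaker and fingerprint file; the exact response shape of index.fetch can vary slightly between client versions.

```python
# Placeholder speaker name and fingerprint path for illustration.
embedding = speaker_model.get_embedding("jane_doe_fingerprint.wav")
record_id = add_speaker_embedding_to_pinecone("Jane Doe", embedding)

# Fetch the record back by ID to confirm the vector and metadata were stored.
fetched = index.fetch(ids=[record_id])
print(fetched)
```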
Add Fingerprints to Our Pinecone Database
Below we’ll use chunks of the speakers’ conversations to generate speaker embeddings and add them to our vector database. Later on, we’ll show how to take an audio file containing speakers who aren’t yet in the database and generate new speaker fingerprints to upload to Pinecone.
```python
elon_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
altman_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/sam_altman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L3NhbV9hbHRtYW5fZmluZ2VycHJpbnQubXAzIiwiaWF0IjoxNzAwNjY5NjI0LCJleHAiOjE3MzIyMDU2MjR9._1yuMGzBhFcHr7xv76160Hb_SC-mH_Wv3_qX-S7XsTU&t=2023-11-22T16%3A13%3A44.103Z"
)
lex_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/lex_fridman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2xleF9mcmlkbWFuX2ZpbmdlcnByaW50Lm1wMyIsImlhdCI6MTcwMDY2OTc4OSwiZXhwIjoxNzMyMjA1Nzg5fQ.PUWTOLIHl4dcrWjh2ZJ_2TBaxMXpcU-x6OcvUDe6ZXQ&t=2023-11-22T16%3A16%3A29.697Z"
)

known_speakers = {"Elon Musk": elon_fingerprint, "Sam Altman": altman_fingerprint, "Lex Fridman": lex_fingerprint}

# Upload the known speakers.
for speaker, audio_file in known_speakers.items():
    print("***")
    print(speaker)
    print(audio_file)
    embedding = speaker_model.get_embedding(audio_file)
    add_speaker_embedding_to_pinecone(speaker, embedding)
```
Now we can query our Pinecone database to ensure that our embeddings were uploaded successfully.
```python
audio_file = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
utterance_embedding = speaker_model.get_embedding(audio_file)

results = index.query(vector=utterance_embedding.tolist(), top_k=3, include_metadata=True)

print(results)
```
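You can also confirm how many fingerprints the index holds with describe_index_stats. Keep in mind that serverless indexes are eventually consistent, so the count may lag a few seconds behind an upsert.

```python
# Check how many vectors the index currently holds.
stats = index.describe_index_stats()
print(stats)  # expect a total vector count of 3 after the three uploads above
```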
Creating Functions to Find the Closest Speaker and Identify Speakers of Utterances
Speaker Identification Function
The find_closest_speaker function is a crucial component of our speaker identification system. It compares a given utterance embedding to known speaker embeddings and identifies the closest match.
```python
from sklearn.metrics.pairwise import cosine_similarity


def find_closest_speaker(utterance_embedding, local_embeddings=None, local_only=False, threshold=0.5):
    def cosine_sim(a, b):
        return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

    best_match = {"speaker_name": "No match found", "score": 0}

    # Local embeddings processing.
    if local_embeddings is not None:
        for speaker_name, embedding in local_embeddings.items():
            score = cosine_sim(utterance_embedding, embedding)
            if score > best_match["score"]:
                print("Identified speaker " + speaker_name + " confidence " + str(score))
                best_match = {"speaker_name": speaker_name, "score": score}

    # Pinecone query (if not local_only and local_embeddings is empty or not provided).
    if not local_only and (local_embeddings is None or len(local_embeddings) == 0):
        results = index.query(vector=utterance_embedding.tolist(), top_k=1, include_metadata=True)
        if results["matches"]:
            pinecone_match = results["matches"][0]
            pinecone_score = pinecone_match["score"]
            if pinecone_score > best_match["score"]:
                best_match = {"speaker_name": pinecone_match["metadata"]["speaker_name"], "score": pinecone_score}

    # Check if the best match meets the threshold.
    if best_match["score"] < threshold:
        return "No match found", 0

    return best_match["speaker_name"], best_match["score"]
```
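The local_embeddings path is useful when you want to match against a small in-memory set of speakers without querying Pinecone. Here’s a hedged sketch that assumes you keep NumPy copies of the enrollment embeddings created earlier.

```python
# Build an in-memory cache of (192,)-shaped NumPy embeddings for a few known speakers.
local_cache = {
    "Elon Musk": speaker_model.get_embedding(elon_fingerprint).squeeze().cpu().numpy(),
    "Sam Altman": speaker_model.get_embedding(altman_fingerprint).squeeze().cpu().numpy(),
}

# Match a query embedding against the local cache only (no Pinecone round trip).
query = speaker_model.get_embedding(elon_fingerprint).squeeze().cpu().numpy()
name, score = find_closest_speaker(query, local_embeddings=local_cache, local_only=True)
print(name, score)  # should identify "Elon Musk" with a high score
```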
Speaker Identification from Utterances
The identify_speakers_from_utterances function is the core of our speaker identification system. It processes a transcript with utterances and identifies speakers, handling both known and unknown voices.
```python
def identify_speakers_from_utterances(transcript, wav_file, min_utterance_length=5000, match_all_utterances=False):
    utterances = transcript["utterances"]
    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

    known_speakers = {}
    unknown_speakers = {}
    unknown_count = 0

    unknown_folder = "unknown_speaker_utterances"
    os.makedirs(unknown_folder, exist_ok=True)

    audio_file_name = os.path.basename(wav_file)
    full_audio = AudioSegment.from_wav(wav_file)

    def get_suitable_utterance(speaker, min_length):
        suitable_utterances = [
            u for u in utterances if u["speaker"] == speaker and (u["end"] - u["start"]) >= min_length
        ]
        if suitable_utterances:
            return max(suitable_utterances, key=lambda u: u["end"] - u["start"])
        return max((u for u in utterances if u["speaker"] == speaker), key=lambda u: u["end"] - u["start"])

    # First pass: Identify speakers.
    for speaker in set(u["speaker"] for u in utterances):
        if speaker not in known_speakers and speaker not in unknown_speakers:
            suitable_utterance = get_suitable_utterance(speaker, min_utterance_length)

            start_ms = suitable_utterance["start"]
            end_ms = suitable_utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            speaker_name, score = find_closest_speaker(embedding)
            print(f"Speaker: {speaker}, Closest match: {speaker_name}, Score: {score}")

            if score > 0.5:  # Adjust threshold as needed.
                known_speakers[speaker] = speaker_name
                print(f"Identified as known speaker: {speaker}")
            else:
                unknown_count += 1
                unknown_name = f"Unknown Speaker {chr(64 + unknown_count)}"
                unknown_wav = f"{unknown_folder}/unknown_speaker_{unknown_count}_from_{audio_file_name}"
                utterance_audio.export(unknown_wav, format="wav")
                unknown_speakers[speaker] = {
                    "name": unknown_name,
                    "wav_file": unknown_wav,
                    "duration": end_ms - start_ms,
                }
                print(f"New unknown speaker detected: {unknown_name} (Duration: {(end_ms - start_ms)/1000:.2f}s)")

    # Second pass: Replace speaker names.
    for utterance in utterances:
        if utterance["speaker"] in known_speakers:
            utterance["speaker"] = known_speakers[utterance["speaker"]]
        elif utterance["speaker"] in unknown_speakers:
            utterance["speaker"] = unknown_speakers[utterance["speaker"]]["name"]

    # Third pass: Match all utterances if requested.
    if match_all_utterances:
        print("Matching all utterances individually...")
        for utterance in utterances:
            start_ms = utterance["start"]
            end_ms = utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            new_speaker_name, score = find_closest_speaker(embedding)

            if score > 0.5 and new_speaker_name != utterance["speaker"]:
                print(f"Speaker change detected: '{utterance['speaker']}' -> '{new_speaker_name}' (Score: {score})")
                print(f"Utterance: {utterance['text'][:50]}...")
                utterance["speaker"] = new_speaker_name

    return utterances, unknown_speakers
```
Examples: Speaker Identification and Diarization
To demonstrate the capabilities of our speaker identification and diarization system, we’ll cover several examples. We’ll start with a straightforward case and progressively move to more complex scenarios.
Example 1: Conversation Between Sam Altman and Elon Musk
Our first example is a simple conversation between two well-known figures: Elon Musk and Sam Altman. This example will showcase how our system performs with clear, distinct voices in a controlled setting.
```python
transcript_obj = transcribe("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
wav_file = download_and_convert_to_wav("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
identified_utterances, unknown_speakers = identify_speakers_from_utterances(transcript_obj, wav_file)

for utterance in identified_utterances:
    print(f"{utterance['speaker']}: {utterance['text']}")
```
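If a run surfaces unknown speakers, the clips saved to unknown_speaker_utterances/ can be used to enroll them so future files match them automatically. Here’s a sketch, with the real name left as a placeholder you fill in after listening to each clip.

```python
# Enroll any unknown speakers found above so they're recognized in future audio files.
for diarization_label, info in unknown_speakers.items():
    real_name = "REPLACE_AFTER_LISTENING"  # placeholder - identify the voice in info["wav_file"] first
    embedding = speaker_model.get_embedding(info["wav_file"])
    add_speaker_embedding_to_pinecone(real_name, embedding)
```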