Set Up a Speaker Identification System Using Pinecone & NVIDIA TitaNet
This guide will demonstrate how to build an advanced speaker recognition and diarization system that you can use to identify speakers across multiple audio files. It will use:
- AssemblyAI for transcription and initial diarization.
- NVIDIA’s TitaNet model for speaker embedding generation.
- Pinecone for efficient similarity search of speaker embeddings.
Quickstart
```python
from pinecone import Pinecone, ServerlessSpec
import assemblyai as aai
import requests
import os
from pydub import AudioSegment
import mimetypes
import wave
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import torch
import numpy as np
import uuid
from sklearn.metrics.pairwise import cosine_similarity

# Obtain from your Pinecone dashboard.
pc = Pinecone(api_key="PINECONE_KEY_HERE")

# Obtain from your AssemblyAI dashboard.
aai.settings.api_key = "AAI_KEY_HERE"


def transcribe(file_url):
    config = aai.TranscriptionConfig(speaker_labels=True)  # Speaker labels must be enabled for this Cookbook.

    transcriber = aai.Transcriber(config=config)

    transcript = transcriber.transcribe(file_url)

    return transcript.json_response


def download_and_convert_to_wav(url, output_dir="./content/converted_audio"):
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename from URL.
    filename = url.split("/")[-1].split("?")[0]
    base_filename, file_extension = os.path.splitext(filename)

    # Download the file.
    response = requests.get(url)
    if response.status_code == 200:
        # Determine the file type.
        content_type = response.headers.get("content-type")
        if content_type:
            guessed_extension = mimetypes.guess_extension(content_type)
            if guessed_extension:
                file_extension = guessed_extension

        # Save the downloaded file.
        downloaded_file = os.path.join(output_dir, filename)
        with open(downloaded_file, "wb") as f:
            f.write(response.content)

        # Generate the WAV file name.
        wav_filename = f"{base_filename}.wav"
        wav_file = os.path.join(output_dir, wav_filename)

        # Load the audio file.
        audio = AudioSegment.from_file(downloaded_file)

        # Convert to mono if it's stereo.
        if audio.channels > 1:
            print("Setting channels to 1.")
            audio = audio.set_channels(1)

        # Export as WAV.
        audio.export(wav_file, format="wav")
        print(f"File converted and saved as: {wav_file}")

        # Remove the original downloaded file if it's different from the WAV file.
        if downloaded_file != wav_file:
            os.remove(downloaded_file)

        # Ensure the WAV file is single channel.
        with wave.open(wav_file, "rb") as wf:
            n_channels = wf.getnchannels()
            if n_channels > 1:
                print(f"Converting {n_channels} channels to mono...")
                # Read the frames.
                frames = wf.readframes(wf.getnframes())
                # Get other parameters.
                params = wf.getparams()
                # Close the file.
                wf.close()
                # Convert to mono.
                mono_frames = b"".join([frames[i::n_channels] for i in range(n_channels)])
                # Write the mono WAV file.
                with wave.open(wav_file, "wb") as wf:
                    wf.setparams(
                        (1, params.sampwidth, params.framerate, params.nframes, params.comptype, params.compname)
                    )
                    wf.writeframes(mono_frames)
                print("Conversion to mono complete.")

        return wav_file
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return None


def add_speaker_embedding_to_pinecone(speaker_name, speaker_embedding, unique_id=None):
    # Ensure the embedding is a 1D numpy array.
    if isinstance(speaker_embedding, torch.Tensor):
        embedding_np = speaker_embedding.squeeze().cpu().numpy()
    elif isinstance(speaker_embedding, np.ndarray):
        embedding_np = speaker_embedding.squeeze()
    else:
        raise ValueError("Unsupported embedding type. Expected torch.Tensor or numpy.ndarray")

    # Ensure the embedding is the correct shape.
    if embedding_np.shape != (192,):
        raise ValueError(f"Expected embedding of shape (192,), but got {embedding_np.shape}")

    # Convert to list for Pinecone.
    embedding_list = embedding_np.tolist()

    # Generate a unique ID if not provided.
    if unique_id is None:
        unique_id = f"speaker_{speaker_name}_{uuid.uuid4().hex[:8]}"

    # Create the metadata dictionary.
    metadata = {"speaker_name": speaker_name}

    # Upsert the vector to Pinecone.
    upsert_response = index.upsert(vectors=[(unique_id, embedding_list, metadata)])

    print(f"Upserted embedding for speaker {speaker_name} with ID {unique_id}")
    return unique_id


def find_closest_speaker(utterance_embedding, local_embeddings=None, local_only=False, threshold=0.5):
    def cosine_sim(a, b):
        return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

    best_match = {"speaker_name": "No match found", "score": 0}

    # Local embeddings processing.
    if local_embeddings is not None:
        for speaker_name, embedding in local_embeddings.items():
            score = cosine_sim(utterance_embedding, embedding)
            if score > best_match["score"]:
                print("Identified speaker " + speaker_name + " confidence " + str(score))
                best_match = {"speaker_name": speaker_name, "score": score}

    # Pinecone query (if not local_only and local_embeddings is empty or not provided).
    if not local_only and (local_embeddings is None or len(local_embeddings) == 0):
        results = index.query(vector=utterance_embedding.tolist(), top_k=1, include_metadata=True)
        if results["matches"]:
            pinecone_match = results["matches"][0]
            pinecone_score = pinecone_match["score"]
            if pinecone_score > best_match["score"]:
                best_match = {"speaker_name": pinecone_match["metadata"]["speaker_name"], "score": pinecone_score}

    # Check if the best match meets the threshold.
    if best_match["score"] < threshold:
        return "No match found", 0

    return best_match["speaker_name"], best_match["score"]


def identify_speakers_from_utterances(transcript, wav_file, min_utterance_length=5000, match_all_utterances=False):
    utterances = transcript["utterances"]
    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

    known_speakers = {}
    unknown_speakers = {}
    unknown_count = 0

    unknown_folder = "unknown_speaker_utterances"
    os.makedirs(unknown_folder, exist_ok=True)

    audio_file_name = os.path.basename(wav_file)
    full_audio = AudioSegment.from_wav(wav_file)

    def get_suitable_utterance(speaker, min_length):
        suitable_utterances = [
            u for u in utterances if u["speaker"] == speaker and (u["end"] - u["start"]) >= min_length
        ]
        if suitable_utterances:
            return max(suitable_utterances, key=lambda u: u["end"] - u["start"])
        return max((u for u in utterances if u["speaker"] == speaker), key=lambda u: u["end"] - u["start"])

    # First pass: Identify speakers.
    for speaker in set(u["speaker"] for u in utterances):
        if speaker not in known_speakers and speaker not in unknown_speakers:
            suitable_utterance = get_suitable_utterance(speaker, min_utterance_length)

            start_ms = suitable_utterance["start"]
            end_ms = suitable_utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            speaker_name, score = find_closest_speaker(embedding)
            print(f"Speaker: {speaker}, Closest match: {speaker_name}, Score: {score}")

            if score > 0.5:  # Adjust threshold as needed.
                known_speakers[speaker] = speaker_name
                print(f"Identified as known speaker: {speaker}")
            else:
                unknown_count += 1
                unknown_name = f"Unknown Speaker {chr(64 + unknown_count)}"
                unknown_wav = f"{unknown_folder}/unknown_speaker_{unknown_count}_from_{audio_file_name}"
                utterance_audio.export(unknown_wav, format="wav")
                unknown_speakers[speaker] = {
                    "name": unknown_name,
                    "wav_file": unknown_wav,
                    "duration": end_ms - start_ms,
                }
                print(f"New unknown speaker detected: {unknown_name} (Duration: {(end_ms - start_ms)/1000:.2f}s)")

    # Second pass: Replace speaker names.
    for utterance in utterances:
        if utterance["speaker"] in known_speakers:
            utterance["speaker"] = known_speakers[utterance["speaker"]]
        elif utterance["speaker"] in unknown_speakers:
            utterance["speaker"] = unknown_speakers[utterance["speaker"]]["name"]

    # Third pass: Match all utterances if requested.
    if match_all_utterances:
        print("Matching all utterances individually...")
        for utterance in utterances:
            start_ms = utterance["start"]
            end_ms = utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            new_speaker_name, score = find_closest_speaker(embedding)

            if score > 0.5 and new_speaker_name != utterance["speaker"]:
                print(f"Speaker change detected: '{utterance['speaker']}' -> '{new_speaker_name}' (Score: {score})")
                print(f"Utterance: {utterance['text'][:50]}...")
                utterance["speaker"] = new_speaker_name

    return utterances, unknown_speakers


pc.create_index(
    name="speaker-embeddings",
    dimension=192,  # Replace with model-specific dimensions - 192 is for TitaNet-Large.
    metric="cosine",  # Replace with your model metric.
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Connect to our new index.
index = pc.Index("speaker-embeddings")

speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

elon_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
altman_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/sam_altman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L3NhbV9hbHRtYW5fZmluZ2VycHJpbnQubXAzIiwiaWF0IjoxNzAwNjY5NjI0LCJleHAiOjE3MzIyMDU2MjR9._1yuMGzBhFcHr7xv76160Hb_SC-mH_Wv3_qX-S7XsTU&t=2023-11-22T16%3A13%3A44.103Z"
)
lex_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/lex_fridman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2xleF9mcmlkbWFuX2ZpbmdlcnByaW50Lm1wMyIsImlhdCI6MTcwMDY2OTc4OSwiZXhwIjoxNzMyMjA1Nzg5fQ.PUWTOLIHl4dcrWjh2ZJ_2TBaxMXpcU-x6OcvUDe6ZXQ&t=2023-11-22T16%3A16%3A29.697Z"
)

known_speakers = {"Elon Musk": elon_fingerprint, "Sam Altman": altman_fingerprint, "Lex Fridman": lex_fingerprint}

# Upload the known speakers.
for speaker, audio_file in known_speakers.items():
    print("***")
    print(speaker)
    print(audio_file)
    embedding = speaker_model.get_embedding(audio_file)
    add_speaker_embedding_to_pinecone(speaker, embedding)

audio_file = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
utterance_embedding = speaker_model.get_embedding(audio_file)

results = index.query(vector=utterance_embedding.tolist(), top_k=3, include_metadata=True)
print(results)

# Example: Conversation Between Sam Altman and Elon Musk
transcript_obj = transcribe("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
wav_file = download_and_convert_to_wav("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
identified_utterances, unknown_speakers = identify_speakers_from_utterances(transcript_obj, wav_file)

for utterance in identified_utterances:
    print(f"{utterance['speaker']}: {utterance['text']}")
```
Initial Setup
First, you’ll need to sign up for an AssemblyAI account and obtain your API key from your account dashboard. Then, sign up for a Pinecone account and obtain your API key from “API Keys” in the sidebar of your dashboard. Also note that any audio you use with this Cookbook should be in WAV format: AssemblyAI accepts other formats, but TitaNet requires WAV.
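If you have local recordings in another format, you can convert them to mono WAV up front with pydub (installed in the next step). This is a minimal sketch with placeholder file names; swap in your own paths.

```python
from pydub import AudioSegment

# Placeholder file names - replace with your own local recordings.
# Convert any non-WAV audio to mono WAV before generating TitaNet embeddings.
audio = AudioSegment.from_file("interview.mp3")
audio.set_channels(1).export("interview.wav", format="wav")
```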
Installing Dependencies
Now we’ll need to install the necessary libraries and frameworks for this project. Please note that this process can take several minutes to complete.
$ pip install -U Cython torch nemo_toolkit ffmpeg pydub pinecone-client assemblyai hydra-core pytorch_lightning huggingface_hub==0.23.5 librosa transformers pandas inflect webdataset sentencepiece youtokentome pyannote-audio editdistance jiwer lhotse datasets
Pinecone Setup
In this section, we’ll import Pinecone, create a new index for our speaker embeddings, and connect to the index. Please enter your Pinecone API key in the placeholder below.
```python
from pinecone import Pinecone, ServerlessSpec

# Obtain from your Pinecone dashboard.
pc = Pinecone(api_key="PINECONE_KEY_HERE")

pc.create_index(
    name="speaker-embeddings",
    dimension=192,  # Replace with model-specific dimensions - 192 is for TitaNet-Large.
    metric="cosine",  # Replace with your model metric.
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Connect to our new index.
index = pc.Index("speaker-embeddings")
```
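Note that create_index will raise an error if you re-run this cell after the index already exists. If you want the cell to be safely re-runnable, a guard like the sketch below works with recent versions of the Pinecone Python client (the exact helper for listing index names can vary between client versions).

```python
# Only create the index if it doesn't already exist, so the cell can be re-run safely.
if "speaker-embeddings" not in pc.list_indexes().names():
    pc.create_index(
        name="speaker-embeddings",
        dimension=192,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index("speaker-embeddings")
```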
AssemblyAI Setup
Now we’ll set up AssemblyAI for transcription and diarization. We’ll import the necessary modules and create a function to transcribe our audio files with speaker labels enabled. Please enter your AssemblyAI API key in the cell below.
```python
import assemblyai as aai

aai.settings.api_key = "AAI_KEY_HERE"


def transcribe(file_url):
    config = aai.TranscriptionConfig(speaker_labels=True)  # Speaker labels must be enabled for this Cookbook.

    transcriber = aai.Transcriber(config=config)

    transcript = transcriber.transcribe(file_url)

    return transcript.json_response
```
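As a quick sanity check, you can call transcribe on any publicly accessible audio URL (the URL below is a placeholder) and confirm that the response contains diarized utterances.

```python
# Placeholder URL - any direct link to an audio file, or a local file path, works with the SDK.
transcript_json = transcribe("https://example.com/two_speaker_interview.wav")

# With speaker_labels enabled, the JSON response includes an "utterances" list.
for utterance in transcript_json["utterances"][:3]:
    print(utterance["speaker"], utterance["text"][:60])
```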
We’ll also need to create a download_and_convert_to_wav helper function. This function takes a file URL, downloads the file, and converts it to WAV format; if the file is already a WAV, it’s simply downloaded. The files must be in WAV format to work properly with TitaNet.
```python
import requests
import os
from pydub import AudioSegment
import mimetypes
import wave


def download_and_convert_to_wav(url, output_dir="./content/converted_audio"):
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename from URL.
    filename = url.split("/")[-1].split("?")[0]
    base_filename, file_extension = os.path.splitext(filename)

    # Download the file.
    response = requests.get(url)
    if response.status_code == 200:
        # Determine the file type.
        content_type = response.headers.get("content-type")
        if content_type:
            guessed_extension = mimetypes.guess_extension(content_type)
            if guessed_extension:
                file_extension = guessed_extension

        # Save the downloaded file.
        downloaded_file = os.path.join(output_dir, filename)
        with open(downloaded_file, "wb") as f:
            f.write(response.content)

        # Generate the WAV file name.
        wav_filename = f"{base_filename}.wav"
        wav_file = os.path.join(output_dir, wav_filename)

        # Load the audio file.
        audio = AudioSegment.from_file(downloaded_file)

        # Convert to mono if it's stereo.
        if audio.channels > 1:
            print("Setting channels to 1.")
            audio = audio.set_channels(1)

        # Export as WAV.
        audio.export(wav_file, format="wav")
        print(f"File converted and saved as: {wav_file}")

        # Remove the original downloaded file if it's different from the WAV file.
        if downloaded_file != wav_file:
            os.remove(downloaded_file)

        # Ensure the WAV file is single channel.
        with wave.open(wav_file, "rb") as wf:
            n_channels = wf.getnchannels()
            if n_channels > 1:
                print(f"Converting {n_channels} channels to mono...")
                # Read the frames.
                frames = wf.readframes(wf.getnframes())
                # Get other parameters.
                params = wf.getparams()
                # Close the file.
                wf.close()
                # Convert to mono.
                mono_frames = b"".join([frames[i::n_channels] for i in range(n_channels)])
                # Write the mono WAV file.
                with wave.open(wav_file, "wb") as wf:
                    wf.setparams(
                        (1, params.sampwidth, params.framerate, params.nframes, params.comptype, params.compname)
                    )
                    wf.writeframes(mono_frames)
                print("Conversion to mono complete.")

        return wav_file
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return None
```
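Usage looks like this; the URL is a placeholder, and the returned path points into the output directory.

```python
# Placeholder URL for illustration.
wav_path = download_and_convert_to_wav("https://example.com/interview.mp3")
print(wav_path)  # e.g. ./content/converted_audio/interview.wav
```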
NVIDIA’s TitaNet Model Setup
Next we’ll import NeMo’s EncDecSpeakerLabelModel class and load NVIDIA’s TitaNet model. This model allows us to generate the speaker embeddings we use as speaker fingerprints, and to convert individual utterances into embeddings for comparison against those fingerprints.
```python
from nemo.collections.asr.models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")
```
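To confirm the model loaded correctly, you can embed any short WAV clip (the file name below is a placeholder) and check the shape; TitaNet-Large produces 192-dimensional embeddings.

```python
# Placeholder path - use any short mono WAV clip you have on disk.
test_embedding = speaker_model.get_embedding("sample_speaker.wav")
print(type(test_embedding))            # a torch.Tensor
print(test_embedding.squeeze().shape)  # expected: torch.Size([192])
```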
We’ll now define an add_speaker_embedding_to_pinecone function to add our speaker embeddings to the Pinecone database.
```python
import torch
import numpy as np
import uuid


def add_speaker_embedding_to_pinecone(speaker_name, speaker_embedding, unique_id=None):
    # Ensure the embedding is a 1D numpy array.
    if isinstance(speaker_embedding, torch.Tensor):
        embedding_np = speaker_embedding.squeeze().cpu().numpy()
    elif isinstance(speaker_embedding, np.ndarray):
        embedding_np = speaker_embedding.squeeze()
    else:
        raise ValueError("Unsupported embedding type. Expected torch.Tensor or numpy.ndarray")

    # Ensure the embedding is the correct shape.
    if embedding_np.shape != (192,):
        raise ValueError(f"Expected embedding of shape (192,), but got {embedding_np.shape}")

    # Convert to list for Pinecone.
    embedding_list = embedding_np.tolist()

    # Generate a unique ID if not provided.
    if unique_id is None:
        unique_id = f"speaker_{speaker_name}_{uuid.uuid4().hex[:8]}"

    # Create the metadata dictionary.
    metadata = {"speaker_name": speaker_name}

    # Upsert the vector to Pinecone.
    upsert_response = index.upsert(vectors=[(unique_id, embedding_list, metadata)])

    print(f"Upserted embedding for speaker {speaker_name} with ID {unique_id}")
    return unique_id
```
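If you want to double-check a single record right after upserting it, you can fetch it back by the returned ID. This is a hedged sketch with a placeholder speaker and fingerprint file; the exact response shape of index.fetch can vary slightly between client versions.

```python
# Placeholder speaker name and fingerprint path for illustration.
embedding = speaker_model.get_embedding("jane_doe_fingerprint.wav")
record_id = add_speaker_embedding_to_pinecone("Jane Doe", embedding)

# Fetch the record back by ID to confirm the vector and metadata were stored.
fetched = index.fetch(ids=[record_id])
print(fetched)
```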
Add Fingerprints to Our Pinecone Database
Below we’ll use chunks of the speakers’ conversations to generate speaker embeddings and add them to our vector database. Later on, we’ll show how to take an audio file containing speakers who aren’t yet in the database and generate new speaker fingerprints to upload to Pinecone.
```python
elon_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
altman_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/sam_altman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L3NhbV9hbHRtYW5fZmluZ2VycHJpbnQubXAzIiwiaWF0IjoxNzAwNjY5NjI0LCJleHAiOjE3MzIyMDU2MjR9._1yuMGzBhFcHr7xv76160Hb_SC-mH_Wv3_qX-S7XsTU&t=2023-11-22T16%3A13%3A44.103Z"
)
lex_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/lex_fridman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2xleF9mcmlkbWFuX2ZpbmdlcnByaW50Lm1wMyIsImlhdCI6MTcwMDY2OTc4OSwiZXhwIjoxNzMyMjA1Nzg5fQ.PUWTOLIHl4dcrWjh2ZJ_2TBaxMXpcU-x6OcvUDe6ZXQ&t=2023-11-22T16%3A16%3A29.697Z"
)

known_speakers = {"Elon Musk": elon_fingerprint, "Sam Altman": altman_fingerprint, "Lex Fridman": lex_fingerprint}

# Upload the known speakers.
for speaker, audio_file in known_speakers.items():
    print("***")
    print(speaker)
    print(audio_file)
    embedding = speaker_model.get_embedding(audio_file)
    add_speaker_embedding_to_pinecone(speaker, embedding)
```
Now we can query our Pinecone database to ensure that our embeddings were uploaded successfully.
```python
audio_file = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
utterance_embedding = speaker_model.get_embedding(audio_file)

results = index.query(vector=utterance_embedding.tolist(), top_k=3, include_metadata=True)

print(results)
```
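You can also confirm how many fingerprints the index holds with describe_index_stats. Keep in mind that serverless indexes are eventually consistent, so the count may lag a few seconds behind an upsert.

```python
# Check how many vectors the index currently holds.
stats = index.describe_index_stats()
print(stats)  # expect a total vector count of 3 after the three uploads above
```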
Creating Functions to Find the Closest Speaker and Identify Speakers of Utterances
Speaker Identification Function
The find_closest_speaker function is a crucial component of our speaker identification system. It compares a given utterance embedding to known speaker embeddings and identifies the closest match.
```python
from sklearn.metrics.pairwise import cosine_similarity


def find_closest_speaker(utterance_embedding, local_embeddings=None, local_only=False, threshold=0.5):
    def cosine_sim(a, b):
        return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

    best_match = {"speaker_name": "No match found", "score": 0}

    # Local embeddings processing.
    if local_embeddings is not None:
        for speaker_name, embedding in local_embeddings.items():
            score = cosine_sim(utterance_embedding, embedding)
            if score > best_match["score"]:
                print("Identified speaker " + speaker_name + " confidence " + str(score))
                best_match = {"speaker_name": speaker_name, "score": score}

    # Pinecone query (if not local_only and local_embeddings is empty or not provided).
    if not local_only and (local_embeddings is None or len(local_embeddings) == 0):
        results = index.query(vector=utterance_embedding.tolist(), top_k=1, include_metadata=True)
        if results["matches"]:
            pinecone_match = results["matches"][0]
            pinecone_score = pinecone_match["score"]
            if pinecone_score > best_match["score"]:
                best_match = {"speaker_name": pinecone_match["metadata"]["speaker_name"], "score": pinecone_score}

    # Check if the best match meets the threshold.
    if best_match["score"] < threshold:
        return "No match found", 0

    return best_match["speaker_name"], best_match["score"]
```
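The local_embeddings path is useful when you want to match against a small in-memory set of speakers without querying Pinecone. Here’s a hedged sketch that assumes you keep NumPy copies of the enrollment embeddings created earlier.

```python
# Build an in-memory cache of (192,)-shaped NumPy embeddings for a few known speakers.
local_cache = {
    "Elon Musk": speaker_model.get_embedding(elon_fingerprint).squeeze().cpu().numpy(),
    "Sam Altman": speaker_model.get_embedding(altman_fingerprint).squeeze().cpu().numpy(),
}

# Match a query embedding against the local cache only (no Pinecone round trip).
query = speaker_model.get_embedding(elon_fingerprint).squeeze().cpu().numpy()
name, score = find_closest_speaker(query, local_embeddings=local_cache, local_only=True)
print(name, score)  # should identify "Elon Musk" with a high score
```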
Speaker Identification from Utterances
The identify_speakers_from_utterances function is the core of our speaker identification system. It processes a transcript with utterances and identifies speakers, handling both known and unknown voices.
```python
def identify_speakers_from_utterances(transcript, wav_file, min_utterance_length=5000, match_all_utterances=False):
    utterances = transcript["utterances"]
    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

    known_speakers = {}
    unknown_speakers = {}
    unknown_count = 0

    unknown_folder = "unknown_speaker_utterances"
    os.makedirs(unknown_folder, exist_ok=True)

    audio_file_name = os.path.basename(wav_file)
    full_audio = AudioSegment.from_wav(wav_file)

    def get_suitable_utterance(speaker, min_length):
        suitable_utterances = [
            u for u in utterances if u["speaker"] == speaker and (u["end"] - u["start"]) >= min_length
        ]
        if suitable_utterances:
            return max(suitable_utterances, key=lambda u: u["end"] - u["start"])
        return max((u for u in utterances if u["speaker"] == speaker), key=lambda u: u["end"] - u["start"])

    # First pass: Identify speakers.
    for speaker in set(u["speaker"] for u in utterances):
        if speaker not in known_speakers and speaker not in unknown_speakers:
            suitable_utterance = get_suitable_utterance(speaker, min_utterance_length)

            start_ms = suitable_utterance["start"]
            end_ms = suitable_utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            speaker_name, score = find_closest_speaker(embedding)
            print(f"Speaker: {speaker}, Closest match: {speaker_name}, Score: {score}")

            if score > 0.5:  # Adjust threshold as needed.
                known_speakers[speaker] = speaker_name
                print(f"Identified as known speaker: {speaker}")
            else:
                unknown_count += 1
                unknown_name = f"Unknown Speaker {chr(64 + unknown_count)}"
                unknown_wav = f"{unknown_folder}/unknown_speaker_{unknown_count}_from_{audio_file_name}"
                utterance_audio.export(unknown_wav, format="wav")
                unknown_speakers[speaker] = {
                    "name": unknown_name,
                    "wav_file": unknown_wav,
                    "duration": end_ms - start_ms,
                }
                print(f"New unknown speaker detected: {unknown_name} (Duration: {(end_ms - start_ms)/1000:.2f}s)")

    # Second pass: Replace speaker names.
    for utterance in utterances:
        if utterance["speaker"] in known_speakers:
            utterance["speaker"] = known_speakers[utterance["speaker"]]
        elif utterance["speaker"] in unknown_speakers:
            utterance["speaker"] = unknown_speakers[utterance["speaker"]]["name"]

    # Third pass: Match all utterances if requested.
    if match_all_utterances:
        print("Matching all utterances individually...")
        for utterance in utterances:
            start_ms = utterance["start"]
            end_ms = utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            new_speaker_name, score = find_closest_speaker(embedding)

            if score > 0.5 and new_speaker_name != utterance["speaker"]:
                print(f"Speaker change detected: '{utterance['speaker']}' -> '{new_speaker_name}' (Score: {score})")
                print(f"Utterance: {utterance['text'][:50]}...")
                utterance["speaker"] = new_speaker_name

    return utterances, unknown_speakers
```
Examples: Speaker Identification and Diarization
To demonstrate the capabilities of our speaker identification and diarization system, we’ll cover several examples. We’ll start with a straightforward case and progressively move to more complex scenarios.
Example 1: Conversation Between Sam Altman and Elon Musk
Our first example is a simple conversation between two well-known figures: Elon Musk and Sam Altman. This example will showcase how our system performs with clear, distinct voices in a controlled setting.
```python
transcript_obj = transcribe("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
wav_file = download_and_convert_to_wav("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
identified_utterances, unknown_speakers = identify_speakers_from_utterances(transcript_obj, wav_file)

for utterance in identified_utterances:
    print(f"{utterance['speaker']}: {utterance['text']}")
```
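If a run surfaces unknown speakers, the clips saved to unknown_speaker_utterances/ can be used to enroll them so future files match them automatically. Here’s a sketch, with the real name left as a placeholder you fill in after listening to each clip.

```python
# Enroll any unknown speakers found above so they're recognized in future audio files.
for diarization_label, info in unknown_speakers.items():
    real_name = "REPLACE_AFTER_LISTENING"  # placeholder - identify the voice in info["wav_file"] first
    embedding = speaker_model.get_embedding(info["wav_file"])
    add_speaker_embedding_to_pinecone(real_name, embedding)
```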