Set Up a Speaker Identification System Using Pinecone & Nvidia TitaNet

This guide will demonstrate how to build an advanced speaker recognition and diarization system that you can use to identify speakers across multiple audio files. It will use:

  • AssemblyAI for transcription and initial diarization.
  • Nvidia’s TitaNet model for speaker embedding generation.
  • Pinecone for efficient similarity search of speaker embeddings.

Quickstart

from pinecone import Pinecone, ServerlessSpec
import assemblyai as aai
import requests
import os
from pydub import AudioSegment
import mimetypes
import wave
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import torch
import numpy as np
import uuid
from sklearn.metrics.pairwise import cosine_similarity

# Obtain from your Pinecone dashboard.
pc = Pinecone(api_key="PINECONE_KEY_HERE")

# Obtain from your AssemblyAI dashboard.
aai.settings.api_key = "AAI_KEY_HERE"

def transcribe(file_url):
    config = aai.TranscriptionConfig(speaker_labels=True)  # Speaker labels must be enabled for this Cookbook.

    transcriber = aai.Transcriber(config=config)

    transcript = transcriber.transcribe(file_url)

    return transcript.json_response

def download_and_convert_to_wav(url, output_dir="./content/converted_audio"):
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename from URL.
    filename = url.split("/")[-1].split("?")[0]
    base_filename, file_extension = os.path.splitext(filename)

    # Download the file.
    response = requests.get(url)
    if response.status_code == 200:
        # Determine the file type.
        content_type = response.headers.get("content-type")
        if content_type:
            guessed_extension = mimetypes.guess_extension(content_type)
            if guessed_extension:
                file_extension = guessed_extension

        # Save the downloaded file.
        downloaded_file = os.path.join(output_dir, filename)
        with open(downloaded_file, "wb") as f:
            f.write(response.content)

        # Generate the WAV file name.
        wav_filename = f"{base_filename}.wav"
        wav_file = os.path.join(output_dir, wav_filename)

        # Load the audio file.
        audio = AudioSegment.from_file(downloaded_file)

        # Convert to mono if it's stereo.
        if audio.channels > 1:
            print("Setting channels to 1.")
            audio = audio.set_channels(1)

        # Export as WAV.
        audio.export(wav_file, format="wav")
        print(f"File converted and saved as: {wav_file}")

        # Remove the original downloaded file if it's different from the WAV file.
        if downloaded_file != wav_file:
            os.remove(downloaded_file)

        # Ensure the WAV file is single channel.
        with wave.open(wav_file, "rb") as wf:
            n_channels = wf.getnchannels()
            if n_channels > 1:
                print(f"Converting {n_channels} channels to mono...")
                # Read the frames.
                frames = wf.readframes(wf.getnframes())
                # Get other parameters.
                params = wf.getparams()
                # Close the file.
                wf.close()
                # Convert to mono.
                mono_frames = b"".join([frames[i::n_channels] for i in range(n_channels)])
                # Write the mono WAV file.
                with wave.open(wav_file, "wb") as wf:
                    wf.setparams(
                        (1, params.sampwidth, params.framerate, params.nframes, params.comptype, params.compname)
                    )
                    wf.writeframes(mono_frames)
                print("Conversion to mono complete.")

        return wav_file
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return None

def add_speaker_embedding_to_pinecone(speaker_name, speaker_embedding, unique_id=None):
    # Ensure the embedding is a 1D numpy array.
    if isinstance(speaker_embedding, torch.Tensor):
        embedding_np = speaker_embedding.squeeze().cpu().numpy()
    elif isinstance(speaker_embedding, np.ndarray):
        embedding_np = speaker_embedding.squeeze()
    else:
        raise ValueError("Unsupported embedding type. Expected torch.Tensor or numpy.ndarray")

    # Ensure the embedding is the correct shape.
    if embedding_np.shape != (192,):
        raise ValueError(f"Expected embedding of shape (192,), but got {embedding_np.shape}")

    # Convert to list for Pinecone.
    embedding_list = embedding_np.tolist()

    # Generate a unique ID if not provided.
    if unique_id is None:
        unique_id = f"speaker_{speaker_name}_{uuid.uuid4().hex[:8]}"

    # Create the metadata dictionary.
    metadata = {"speaker_name": speaker_name}

    # Upsert the vector to Pinecone.
    upsert_response = index.upsert(vectors=[(unique_id, embedding_list, metadata)])

    print(f"Upserted embedding for speaker {speaker_name} with ID {unique_id}")
    return unique_id

def find_closest_speaker(utterance_embedding, local_embeddings=None, local_only=False, threshold=0.5):
    def cosine_sim(a, b):
        return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

    best_match = {"speaker_name": "No match found", "score": 0}

    # Local embeddings processing.
    if local_embeddings is not None:
        for speaker_name, embedding in local_embeddings.items():
            score = cosine_sim(utterance_embedding, embedding)
            if score > best_match["score"]:
                print("Identified speaker " + speaker_name + " confidence " + str(score))
                best_match = {"speaker_name": speaker_name, "score": score}

    # Pinecone query (if not local_only and local_embeddings is empty or not provided).
    if not local_only and (local_embeddings is None or len(local_embeddings) == 0):
        results = index.query(vector=utterance_embedding.tolist(), top_k=1, include_metadata=True)
        if results["matches"]:
            pinecone_match = results["matches"][0]
            pinecone_score = pinecone_match["score"]
            if pinecone_score > best_match["score"]:
                best_match = {"speaker_name": pinecone_match["metadata"]["speaker_name"], "score": pinecone_score}

    # Check if the best match meets the threshold.
    if best_match["score"] < threshold:
        return "No match found", 0

    return best_match["speaker_name"], best_match["score"]

def identify_speakers_from_utterances(transcript, wav_file, min_utterance_length=5000, match_all_utterances=False):
    utterances = transcript["utterances"]
    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

    known_speakers = {}
    unknown_speakers = {}
    unknown_count = 0

    unknown_folder = "unknown_speaker_utterances"
    os.makedirs(unknown_folder, exist_ok=True)

    audio_file_name = os.path.basename(wav_file)
    full_audio = AudioSegment.from_wav(wav_file)

    def get_suitable_utterance(speaker, min_length):
        suitable_utterances = [
            u for u in utterances if u["speaker"] == speaker and (u["end"] - u["start"]) >= min_length
        ]
        if suitable_utterances:
            return max(suitable_utterances, key=lambda u: u["end"] - u["start"])
        return max((u for u in utterances if u["speaker"] == speaker), key=lambda u: u["end"] - u["start"])

    # First pass: Identify speakers.
    for speaker in set(u["speaker"] for u in utterances):
        if speaker not in known_speakers and speaker not in unknown_speakers:
            suitable_utterance = get_suitable_utterance(speaker, min_utterance_length)

            start_ms = suitable_utterance["start"]
            end_ms = suitable_utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            speaker_name, score = find_closest_speaker(embedding)
            print(f"Speaker: {speaker}, Closest match: {speaker_name}, Score: {score}")

            if score > 0.5:  # Adjust threshold as needed.
                known_speakers[speaker] = speaker_name
                print(f"Identified as known speaker: {speaker}")
            else:
                unknown_count += 1
                unknown_name = f"Unknown Speaker {chr(64 + unknown_count)}"
                unknown_wav = f"{unknown_folder}/unknown_speaker_{unknown_count}_from_{audio_file_name}"
                utterance_audio.export(unknown_wav, format="wav")
                unknown_speakers[speaker] = {
                    "name": unknown_name,
                    "wav_file": unknown_wav,
                    "duration": end_ms - start_ms,
                }
                print(f"New unknown speaker detected: {unknown_name} (Duration: {(end_ms - start_ms)/1000:.2f}s)")

    # Second pass: Replace speaker names.
    for utterance in utterances:
        if utterance["speaker"] in known_speakers:
            utterance["speaker"] = known_speakers[utterance["speaker"]]
        elif utterance["speaker"] in unknown_speakers:
            utterance["speaker"] = unknown_speakers[utterance["speaker"]]["name"]

    # Third pass: Match all utterances if requested.
    if match_all_utterances:
        print("Matching all utterances individually...")
        for utterance in utterances:
            start_ms = utterance["start"]
            end_ms = utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            new_speaker_name, score = find_closest_speaker(embedding)

            if score > 0.5 and new_speaker_name != utterance["speaker"]:
                print(f"Speaker change detected: '{utterance['speaker']}' -> '{new_speaker_name}' (Score: {score})")
                print(f"Utterance: {utterance['text'][:50]}...")
                utterance["speaker"] = new_speaker_name

    return utterances, unknown_speakers

pc.create_index(
    name="speaker-embeddings",
    dimension=192,  # Replace with model-specific dimensions - 192 is for TitaNet-Large.
    metric="cosine",  # Replace with your model metric.
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Connect to our new index.
index = pc.Index("speaker-embeddings")

speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

elon_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
altman_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/sam_altman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L3NhbV9hbHRtYW5fZmluZ2VycHJpbnQubXAzIiwiaWF0IjoxNzAwNjY5NjI0LCJleHAiOjE3MzIyMDU2MjR9._1yuMGzBhFcHr7xv76160Hb_SC-mH_Wv3_qX-S7XsTU&t=2023-11-22T16%3A13%3A44.103Z"
)
lex_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/lex_fridman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2xleF9mcmlkbWFuX2ZpbmdlcnByaW50Lm1wMyIsImlhdCI6MTcwMDY2OTc4OSwiZXhwIjoxNzMyMjA1Nzg5fQ.PUWTOLIHl4dcrWjh2ZJ_2TBaxMXpcU-x6OcvUDe6ZXQ&t=2023-11-22T16%3A16%3A29.697Z"
)

known_speakers = {"Elon Musk": elon_fingerprint, "Sam Altman": altman_fingerprint, "Lex Fridman": lex_fingerprint}

# Upload the known speakers.
for speaker, audio_file in known_speakers.items():
    print("***")
    print(speaker)
    print(audio_file)
    embedding = speaker_model.get_embedding(audio_file)
    add_speaker_embedding_to_pinecone(speaker, embedding)

audio_file = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
utterance_embedding = speaker_model.get_embedding(audio_file)

results = index.query(vector=utterance_embedding.tolist(), top_k=3, include_metadata=True)
print(results)

# Example: Conversation Between Sam Altman and Elon Musk
transcript_obj = transcribe("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
wav_file = download_and_convert_to_wav("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
identified_utterances, unknown_speakers = identify_speakers_from_utterances(transcript_obj, wav_file)

for utterance in identified_utterances:
    print(f"{utterance['speaker']}: {utterance['text']}")

Initial Setup

First, you’ll need to sign up for an AssemblyAI account and obtain your API key from your account dashboard. Then sign up for a Pinecone account and obtain your API key from the “API Keys” page in the sidebar of your dashboard. Also note that any audio files you use for this Cookbook should be in WAV format: AssemblyAI accepts other formats, but TitaNet requires WAV.
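
If you’d rather not paste keys directly into the cells below, one option is to read them from environment variables. This is a minimal sketch; the variable names are just examples, not anything required by either SDK.

import os

# Hypothetical variable names -- export whichever names you prefer in your shell.
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "PINECONE_KEY_HERE")
assemblyai_api_key = os.environ.get("ASSEMBLYAI_API_KEY", "AAI_KEY_HERE")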

Installing Dependencies

Now we’ll need to install the necessary libraries and frameworks for this project. Please note that this process can take several minutes to complete.

$ pip install -U Cython torch nemo_toolkit ffmpeg pydub pinecone-client assemblyai hydra-core pytorch_lightning huggingface_hub==0.23.5 librosa transformers pandas inflect webdataset sentencepiece youtokentome pyannote-audio editdistance jiwer lhotse datasets

Pinecone Setup

In this section, we’ll import Pinecone, create a new index for our speaker embeddings, and connect to the index. Please enter your Pinecone API key in the placeholder below.

from pinecone import Pinecone, ServerlessSpec

# Obtain from your Pinecone dashboard.
pc = Pinecone(api_key="PINECONE_KEY_HERE")

pc.create_index(
    name="speaker-embeddings",
    dimension=192,  # Replace with model-specific dimensions - 192 is for TitaNet-Large.
    metric="cosine",  # Replace with your model metric.
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

# Connect to our new index.
index = pc.Index("speaker-embeddings")
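
One note on re-running this cell: create_index raises an error if an index with that name already exists. A small guard like the sketch below keeps the setup idempotent; it assumes the list_indexes().names() helper available in recent versions of the Pinecone Python client.

# Only create the index if it doesn't already exist (useful when re-running this cell).
if "speaker-embeddings" not in pc.list_indexes().names():
    pc.create_index(
        name="speaker-embeddings",
        dimension=192,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index("speaker-embeddings")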

AssemblyAI Setup

Now we’ll set up AssemblyAI for transcription and diarization. We’ll import the necessary modules and create a function to transcribe our audio files with speaker labels enabled. Please enter your AssemblyAI API key in the cell below.

import assemblyai as aai

aai.settings.api_key = "AAI_KEY_HERE"


def transcribe(file_url):
    config = aai.TranscriptionConfig(speaker_labels=True)  # Speaker labels must be enabled for this Cookbook.

    transcriber = aai.Transcriber(config=config)

    transcript = transcriber.transcribe(file_url)

    return transcript.json_response
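
The json_response dictionary includes an utterances list, where each utterance carries its speaker label plus start and end timestamps in milliseconds. As a quick sanity check (the URL here is just a placeholder), you can print them like this:

transcript_obj = transcribe("https://example.com/your_audio_file.wav")  # Placeholder URL.

# Each utterance has "speaker", "start", "end" (milliseconds), and "text" fields.
for utterance in transcript_obj["utterances"]:
    print(f"{utterance['speaker']} [{utterance['start']}ms - {utterance['end']}ms]: {utterance['text']}")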

We’ll also need a download_and_convert_to_wav helper function. It takes a file URL, downloads the file, and converts it to mono WAV format; if the URL already points to a WAV file, it’s simply downloaded (and downmixed to mono if needed). The files must be in WAV format to work properly with TitaNet.

import requests
import os
from pydub import AudioSegment
import mimetypes
import wave


def download_and_convert_to_wav(url, output_dir="./content/converted_audio"):
    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename from URL.
    filename = url.split("/")[-1].split("?")[0]
    base_filename, file_extension = os.path.splitext(filename)

    # Download the file.
    response = requests.get(url)
    if response.status_code == 200:
        # Determine the file type.
        content_type = response.headers.get("content-type")
        if content_type:
            guessed_extension = mimetypes.guess_extension(content_type)
            if guessed_extension:
                file_extension = guessed_extension

        # Save the downloaded file.
        downloaded_file = os.path.join(output_dir, filename)
        with open(downloaded_file, "wb") as f:
            f.write(response.content)

        # Generate the WAV file name.
        wav_filename = f"{base_filename}.wav"
        wav_file = os.path.join(output_dir, wav_filename)

        # Load the audio file.
        audio = AudioSegment.from_file(downloaded_file)

        # Convert to mono if it's stereo.
        if audio.channels > 1:
            print("Setting channels to 1.")
            audio = audio.set_channels(1)

        # Export as WAV.
        audio.export(wav_file, format="wav")
        print(f"File converted and saved as: {wav_file}")

        # Remove the original downloaded file if it's different from the WAV file.
        if downloaded_file != wav_file:
            os.remove(downloaded_file)

        # Ensure the WAV file is single channel.
        with wave.open(wav_file, "rb") as wf:
            n_channels = wf.getnchannels()
            if n_channels > 1:
                print(f"Converting {n_channels} channels to mono...")
                # Read the frames.
                frames = wf.readframes(wf.getnframes())
                # Get other parameters.
                params = wf.getparams()
                # Close the file.
                wf.close()
                # Convert to mono.
                mono_frames = b"".join([frames[i::n_channels] for i in range(n_channels)])
                # Write the mono WAV file.
                with wave.open(wav_file, "wb") as wf:
                    wf.setparams(
                        (1, params.sampwidth, params.framerate, params.nframes, params.comptype, params.compname)
                    )
                    wf.writeframes(mono_frames)
                print("Conversion to mono complete.")

        return wav_file
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return None
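
As a quick usage sketch (the URL below is a placeholder, not one of the Cookbook files), a call looks like this:

# Hypothetical example: any publicly accessible audio URL works; non-WAV files are converted automatically.
sample_wav = download_and_convert_to_wav("https://example.com/interview_clip.mp3")
print(sample_wav)  # e.g. ./content/converted_audio/interview_clip.wav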

NVIDIA’s TitaNet Model Setup

Next we’ll import NeMo and load NVIDIA’s TitaNet model. This model lets us generate the speaker embeddings that serve as speaker fingerprints, and it also converts individual utterances into embeddings we can compare against those fingerprints.

from nemo.collections.asr.models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")
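
Once the model is loaded, get_embedding turns a mono WAV file into a fixed-length vector. As a quick sanity check (the path below is a placeholder), you can confirm that it squeezes down to the 192 dimensions our Pinecone index expects:

# Placeholder path -- point this at any mono WAV file on disk.
embedding = speaker_model.get_embedding("./content/converted_audio/example.wav")
print(embedding.squeeze().shape)  # Expected: 192 dimensions for TitaNet-Large.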

We’ll now define an add_speaker_embedding_to_pinecone function to add our speaker embeddings to the Pinecone database.

import torch
import numpy as np
import uuid


def add_speaker_embedding_to_pinecone(speaker_name, speaker_embedding, unique_id=None):
    # Ensure the embedding is a 1D numpy array.
    if isinstance(speaker_embedding, torch.Tensor):
        embedding_np = speaker_embedding.squeeze().cpu().numpy()
    elif isinstance(speaker_embedding, np.ndarray):
        embedding_np = speaker_embedding.squeeze()
    else:
        raise ValueError("Unsupported embedding type. Expected torch.Tensor or numpy.ndarray")

    # Ensure the embedding is the correct shape.
    if embedding_np.shape != (192,):
        raise ValueError(f"Expected embedding of shape (192,), but got {embedding_np.shape}")

    # Convert to list for Pinecone.
    embedding_list = embedding_np.tolist()

    # Generate a unique ID if not provided.
    if unique_id is None:
        unique_id = f"speaker_{speaker_name}_{uuid.uuid4().hex[:8]}"

    # Create the metadata dictionary.
    metadata = {"speaker_name": speaker_name}

    # Upsert the vector to Pinecone.
    upsert_response = index.upsert(vectors=[(unique_id, embedding_list, metadata)])

    print(f"Upserted embedding for speaker {speaker_name} with ID {unique_id}")
    return unique_id

Add Fingerprints to Our Pinecone Database

Below, we’ll use clips of each speaker’s speech to generate speaker embeddings and add them to our vector database. Later on, we’ll show how to take an audio file containing speakers who aren’t yet in the database and capture the audio needed to generate new speaker fingerprints for upload to Pinecone.

elon_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
altman_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/sam_altman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L3NhbV9hbHRtYW5fZmluZ2VycHJpbnQubXAzIiwiaWF0IjoxNzAwNjY5NjI0LCJleHAiOjE3MzIyMDU2MjR9._1yuMGzBhFcHr7xv76160Hb_SC-mH_Wv3_qX-S7XsTU&t=2023-11-22T16%3A13%3A44.103Z"
)
lex_fingerprint = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/lex_fridman_fingerprint.mp3?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2xleF9mcmlkbWFuX2ZpbmdlcnByaW50Lm1wMyIsImlhdCI6MTcwMDY2OTc4OSwiZXhwIjoxNzMyMjA1Nzg5fQ.PUWTOLIHl4dcrWjh2ZJ_2TBaxMXpcU-x6OcvUDe6ZXQ&t=2023-11-22T16%3A16%3A29.697Z"
)

known_speakers = {"Elon Musk": elon_fingerprint, "Sam Altman": altman_fingerprint, "Lex Fridman": lex_fingerprint}

# Upload the known speakers.
for speaker, audio_file in known_speakers.items():
    print("***")
    print(speaker)
    print(audio_file)
    embedding = speaker_model.get_embedding(audio_file)
    add_speaker_embedding_to_pinecone(speaker, embedding)

Now we can query our Pinecone database to ensure that our embeddings were uploaded successfully.

audio_file = download_and_convert_to_wav(
    "https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/musk_fingerprinting.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L211c2tfZmluZ2VycHJpbnRpbmcud2F2IiwiaWF0IjoxNzAwNDM4OTAwLCJleHAiOjE3MzE5NzQ5MDB9.O3QOJSBqFNb1sg4nurwSFA13xIPHyKuon3UfHcFYit0&t=2023-11-20T00%3A08%3A20.071Z"
)
utterance_embedding = speaker_model.get_embedding(audio_file)

results = index.query(vector=utterance_embedding.tolist(), top_k=3, include_metadata=True)

print(results)

Creating Functions to Find the Closest Speaker and Identify Speakers of Utterances

Speaker Identification Function

The find_closest_speaker function is a crucial component of our speaker identification system. It compares a given utterance embedding to known speaker embeddings and identifies the closest match.

from sklearn.metrics.pairwise import cosine_similarity


def find_closest_speaker(utterance_embedding, local_embeddings=None, local_only=False, threshold=0.5):
    def cosine_sim(a, b):
        return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

    best_match = {"speaker_name": "No match found", "score": 0}

    # Local embeddings processing.
    if local_embeddings is not None:
        for speaker_name, embedding in local_embeddings.items():
            score = cosine_sim(utterance_embedding, embedding)
            if score > best_match["score"]:
                print("Identified speaker " + speaker_name + " confidence " + str(score))
                best_match = {"speaker_name": speaker_name, "score": score}

    # Pinecone query (if not local_only and local_embeddings is empty or not provided).
    if not local_only and (local_embeddings is None or len(local_embeddings) == 0):
        results = index.query(vector=utterance_embedding.tolist(), top_k=1, include_metadata=True)
        if results["matches"]:
            pinecone_match = results["matches"][0]
            pinecone_score = pinecone_match["score"]
            if pinecone_score > best_match["score"]:
                best_match = {"speaker_name": pinecone_match["metadata"]["speaker_name"], "score": pinecone_score}

    # Check if the best match meets the threshold.
    if best_match["score"] < threshold:
        return "No match found", 0

    return best_match["speaker_name"], best_match["score"]
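
Besides querying Pinecone, find_closest_speaker also accepts a local_embeddings dictionary, which is handy when you want to compare against a handful of embeddings held in memory without a round trip to the index. A minimal sketch, reusing the fingerprint files from earlier:

# Sketch: compare against in-memory embeddings only (no Pinecone query when local_only=True).
local_embeddings = {
    "Elon Musk": speaker_model.get_embedding(elon_fingerprint).squeeze().cpu().numpy(),
    "Sam Altman": speaker_model.get_embedding(altman_fingerprint).squeeze().cpu().numpy(),
}

test_embedding = speaker_model.get_embedding(elon_fingerprint).squeeze().cpu().numpy()
name, score = find_closest_speaker(test_embedding, local_embeddings=local_embeddings, local_only=True)
print(name, score)  # Expect "Elon Musk" with a high similarity score.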

Speaker Identification from Utterances

The identify_speakers_from_utterances function is the core of our speaker identification system. It processes a transcript with utterances and identifies speakers, handling both known and unknown voices.

def identify_speakers_from_utterances(transcript, wav_file, min_utterance_length=5000, match_all_utterances=False):
    utterances = transcript["utterances"]
    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large")

    known_speakers = {}
    unknown_speakers = {}
    unknown_count = 0

    unknown_folder = "unknown_speaker_utterances"
    os.makedirs(unknown_folder, exist_ok=True)

    audio_file_name = os.path.basename(wav_file)
    full_audio = AudioSegment.from_wav(wav_file)

    def get_suitable_utterance(speaker, min_length):
        suitable_utterances = [
            u for u in utterances if u["speaker"] == speaker and (u["end"] - u["start"]) >= min_length
        ]
        if suitable_utterances:
            return max(suitable_utterances, key=lambda u: u["end"] - u["start"])
        return max((u for u in utterances if u["speaker"] == speaker), key=lambda u: u["end"] - u["start"])

    # First pass: Identify speakers.
    for speaker in set(u["speaker"] for u in utterances):
        if speaker not in known_speakers and speaker not in unknown_speakers:
            suitable_utterance = get_suitable_utterance(speaker, min_utterance_length)

            start_ms = suitable_utterance["start"]
            end_ms = suitable_utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            speaker_name, score = find_closest_speaker(embedding)
            print(f"Speaker: {speaker}, Closest match: {speaker_name}, Score: {score}")

            if score > 0.5:  # Adjust threshold as needed.
                known_speakers[speaker] = speaker_name
                print(f"Identified as known speaker: {speaker}")
            else:
                unknown_count += 1
                unknown_name = f"Unknown Speaker {chr(64 + unknown_count)}"
                unknown_wav = f"{unknown_folder}/unknown_speaker_{unknown_count}_from_{audio_file_name}"
                utterance_audio.export(unknown_wav, format="wav")
                unknown_speakers[speaker] = {
                    "name": unknown_name,
                    "wav_file": unknown_wav,
                    "duration": end_ms - start_ms,
                }
                print(f"New unknown speaker detected: {unknown_name} (Duration: {(end_ms - start_ms)/1000:.2f}s)")

    # Second pass: Replace speaker names.
    for utterance in utterances:
        if utterance["speaker"] in known_speakers:
            utterance["speaker"] = known_speakers[utterance["speaker"]]
        elif utterance["speaker"] in unknown_speakers:
            utterance["speaker"] = unknown_speakers[utterance["speaker"]]["name"]

    # Third pass: Match all utterances if requested.
    if match_all_utterances:
        print("Matching all utterances individually...")
        for utterance in utterances:
            start_ms = utterance["start"]
            end_ms = utterance["end"]
            utterance_audio = full_audio[start_ms:end_ms]

            temp_wav = "temp_utterance.wav"
            utterance_audio.export(temp_wav, format="wav")
            embedding = speaker_model.get_embedding(temp_wav)
            os.remove(temp_wav)

            new_speaker_name, score = find_closest_speaker(embedding)

            if score > 0.5 and new_speaker_name != utterance["speaker"]:
                print(f"Speaker change detected: '{utterance['speaker']}' -> '{new_speaker_name}' (Score: {score})")
                print(f"Utterance: {utterance['text'][:50]}...")
                utterance["speaker"] = new_speaker_name

    return utterances, unknown_speakers

Examples: Speaker Identification and Diarization

To demonstrate the capabilities of our speaker identification and diarization system, we’ll cover several examples. We’ll start with a straightforward case and progressively move to more complex scenarios.

Example 1: Conversation Between Sam Altman and Elon Musk

Our first example is a simple conversation between two well-known figures: Elon Musk and Sam Altman. This example will showcase how our system performs with clear, distinct voices in a controlled setting.

transcript_obj = transcribe("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
wav_file = download_and_convert_to_wav("https://api.assemblyai-solutions.com/storage/v1/object/sign/sam_training_bucket/elon_altman_interview_clipped.wav?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1cmwiOiJzYW1fdHJhaW5pbmdfYnVja2V0L2Vsb25fYWx0bWFuX2ludGVydmlld19jbGlwcGVkLndhdiIsImlhdCI6MTcwMDY5MDU3OSwiZXhwIjoxNzMyMjI2NTc5fQ.4qZHvVRGhNGttfcpcfXDcJkJe_tbkc_2Bvs4i51SNSE&t=2023-11-22T22%3A02%3A59.429Z")
identified_utterances, unknown_speakers = identify_speakers_from_utterances(transcript_obj, wav_file)

for utterance in identified_utterances:
    print(f"{utterance['speaker']}: {utterance['text']}")
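
If the interview contained a voice that isn’t in the index yet, identify_speakers_from_utterances labels it as an unknown speaker and saves a sample utterance to disk. Once you’ve listened to the clip and know who it is, you can promote it to a known fingerprint; the sketch below assumes you’ve identified the voice and chosen the real name yourself. You can also re-run the function with match_all_utterances=True to re-check every individual utterance against the index.

# Hypothetical follow-up: promote an unknown speaker to a known fingerprint.
for original_label, info in unknown_speakers.items():
    print(f"{info['name']} sample saved at {info['wav_file']} ({info['duration'] / 1000:.2f}s)")

    real_name = "Speaker Name Here"  # Replace after listening to the saved clip.
    embedding = speaker_model.get_embedding(info["wav_file"])
    add_speaker_embedding_to_pinecone(real_name, embedding)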