Extract Quotes with Timestamps Using LLM Gateway + Semantic Search

This guide will demonstrate how to use AssemblyAI’s LLM Gateway framework to process an audio file and find the best quotes included in it through Semantic Search.

Quickstart

1import datetime
2import numpy as np
3import requests
4import time
5from sklearn.neighbors import NearestNeighbors
6from sentence_transformers import SentenceTransformer
7
# Configuration
api_key = "<YOUR_API_KEY>"  # your AssemblyAI API key (from your dashboard)
base_url = "https://api.assemblyai.com"  # AssemblyAI REST API root
headers = {"authorization": api_key}  # auth header sent with every request
12
def upload_file(file_path):
    """Upload a local audio file to AssemblyAI and return its temporary upload URL.

    Args:
        file_path: Path to a local audio file readable in binary mode.

    Returns:
        The ``upload_url`` string AssemblyAI assigns to the uploaded media.

    Raises:
        requests.HTTPError: If the upload request does not return HTTP 200.
    """
    with open(file_path, "rb") as audio_file:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=audio_file)

    if response.status_code != 200:
        print(f"Error uploading: {response.status_code}, {response.text}")
        response.raise_for_status()

    return response.json()["upload_url"]
21
def transcribe_audio(audio_url):
    """Submit audio for transcription with sentences enabled and poll until complete.

    Args:
        audio_url: Publicly reachable URL (or AssemblyAI upload URL) of the audio.

    Returns:
        The completed transcript JSON as a dict.

    Raises:
        requests.HTTPError: If the submission request fails.
        RuntimeError: If the transcription job ends in the "error" status.
    """
    payload = {
        "audio_url": audio_url,
        "speech_models": ["universal-3-pro"],
        "auto_highlights": False,
        "sentiment_analysis": False,
        "entity_detection": False,
    }

    submit_response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)

    if submit_response.status_code != 200:
        print(f"Error submitting transcription: {submit_response.status_code}, {submit_response.text}")
        submit_response.raise_for_status()

    transcript_id = submit_response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    print("Transcribing...")
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        status = transcript["status"]
        if status == "completed":
            print("Transcription completed!")
            return transcript
        if status == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        # Job still queued/processing; wait before polling again.
        time.sleep(3)
51
def get_sentences(transcript_id):
    """Fetch the sentence-level breakdown of a completed transcript.

    Args:
        transcript_id: Identifier of a completed AssemblyAI transcript.

    Returns:
        A list of sentence dicts (each with ``text``, ``start``, ``end``).

    Raises:
        requests.HTTPError: If the request does not return HTTP 200.
    """
    response = requests.get(
        f"{base_url}/v2/transcript/{transcript_id}/sentences", headers=headers
    )

    if response.status_code != 200:
        print(f"Error getting sentences: {response.status_code}, {response.text}")
        response.raise_for_status()

    return response.json()["sentences"]
62
def process_with_llm_gateway(transcript_text, question, context=""):
    """Send transcript to LLM Gateway for question answering.

    Args:
        transcript_text: Full transcript text to ground the answer in.
        question: The question to ask about the transcript.
        context: Optional extra instructions appended to the prompt.

    Returns:
        The model's answer text (first choice message content).

    Raises:
        requests.HTTPError: If the gateway returns a non-200 HTTP status.
        RuntimeError: If the gateway response body carries an "error" field.
    """
    prompt = f"""Based on the following transcript, please answer this question:
Question: {question}
Context: {context}
Transcript: {transcript_text}
Please provide a clear and specific answer."""

    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 2000
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    # Fail fast on transport-level errors before parsing the body, consistent
    # with the other API helpers; previously a non-200 response went straight
    # into .json(), producing a confusing decode or KeyError instead.
    if response.status_code != 200:
        print(f"Error calling LLM Gateway: {response.status_code}, {response.text}")
        response.raise_for_status()

    result = response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']
94
def sliding_window(elements, distance, stride):
    """Create overlapping windows of `distance` elements, advancing by `distance - stride`.

    Fixes over the naive version: a sequence of exactly `distance` elements now
    yields one window instead of none (the original `<` comparison dropped it,
    which left `embeddings` empty downstream); trailing elements that don't fill
    a whole step are covered by one final window of the last `distance` elements;
    and `stride >= distance` is rejected instead of looping forever.

    Args:
        elements: Sequence to window over.
        distance: Number of elements per window (must be positive).
        stride: Number of elements shared between consecutive windows
            (must be smaller than `distance`).

    Returns:
        A list of windows (each a list slice of `elements`) that together
        cover every element. Empty input yields an empty list.

    Raises:
        ValueError: If `distance` is not positive or `stride >= distance`.
    """
    if distance <= 0:
        raise ValueError("distance must be a positive integer")
    if stride >= distance:
        raise ValueError("stride must be smaller than distance to avoid an infinite loop")

    if not elements:
        return []
    if len(elements) <= distance:
        # Shorter than one full window: return everything as a single window.
        return [list(elements)]

    step = distance - stride
    results = []
    idx = 0
    while idx + distance <= len(elements):
        results.append(elements[idx:idx + distance])
        idx += step

    # Cover any trailing elements the last full step skipped.
    if idx < len(elements):
        results.append(elements[-distance:])

    return results
103
# Main execution
# If using a local file (placeholder fixed: was "<YOUR_AUDIO FILE>" with a
# stray space, inconsistent with the "<YOUR_AUDIO_URL>" style below):
audio_url = upload_file("<YOUR_AUDIO_FILE>")

# If using a public URL:
# audio_url = "<YOUR_AUDIO_URL>"

# Transcribe audio; keep the full text and the transcript id for later calls.
transcript = transcribe_audio(audio_url)
transcript_text = transcript["text"]
transcript_id = transcript["id"]

# Get sentence-level segments, used below to build quote windows.
print("Getting sentences...")
sentences = get_sentences(transcript_id)

# Initialize embedder (model fine-tuned for semantic search).
embedder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
embeddings = {}

# Slide a window over the sentences (5 per group, 2 overlapping) and embed
# each group; the dict key carries the metadata needed to locate the quote.
print("Creating embeddings...")
sentence_groups = sliding_window(sentences, 5, 2)

for sentence_group in sentence_groups:
    combined_text = " ".join(sentence["text"] for sentence in sentence_group)
    start = sentence_group[0]["start"]  # ms offset of the window's first sentence
    end = sentence_group[-1]["end"]     # ms offset of the window's last sentence

    embeddings[(start, end, transcript_id, combined_text)] = embedder.encode(combined_text)

# Use LLM Gateway to find the best quotes.
print("Asking LLM Gateway for best quotes...")
question = "What are the 3 best quotes from this video?"
context = "Please provide exactly 3 quotes."

llm_answer = process_with_llm_gateway(transcript_text, question, context)
print(f"\nLLM Gateway Response:\n{llm_answer}\n")

# Embed the LLM output so it can be compared against the transcript windows.
llm_gateway_embedding = embedder.encode(llm_answer)

# Stack window embeddings into a matrix; metadata stays aligned by index.
np_embeddings = np.array(list(embeddings.values()))
metadata = list(embeddings.keys())

# Find the top 3 most similar windows (cosine distance; confidence = 1 - distance).
print("Finding matching quotes in transcript...")
knn = NearestNeighbors(n_neighbors=3, metric="cosine")
knn.fit(np_embeddings)
distances, indices = knn.kneighbors([llm_gateway_embedding])

matches = []
for distance, index in zip(distances[0], indices[0]):
    window_start, window_end, window_transcript_id, window_text = metadata[index]
    matches.append(
        {
            "start_timestamp": window_start,
            "end_timestamp": window_end,
            "transcript_id": window_transcript_id,
            "text": window_text,
            "confidence": 1 - distance,
        }
    )

# Display results.
print("\n" + "=" * 80)
print("BEST MATCHING QUOTES FROM TRANSCRIPT:")
print("=" * 80 + "\n")

for index, m in enumerate(matches):
    print('QUOTE #{}: "{}"'.format(index + 1, m['text']))
    # Timestamps are in milliseconds; convert to h:mm:ss via timedelta.
    print('START TIMESTAMP:', str(datetime.timedelta(seconds=m['start_timestamp'] / 1000)))
    print('END TIMESTAMP:', str(datetime.timedelta(seconds=m['end_timestamp'] / 1000)))
    print('CONFIDENCE:', m['confidence'])
    print()

Getting Started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an AssemblyAI account and get your API key from your dashboard.

Step-by-Step Instructions

Install the required packages:

You’ll need to install a few libraries that this code depends on:

$pip install -U requests numpy scikit-learn sentence-transformers

Then import all of these libraries and set our AssemblyAI API key, headers, and base URL.

1import datetime
2import numpy as np
3import requests
4import time
5from sklearn.neighbors import NearestNeighbors
6from sentence_transformers import SentenceTransformer
7
# Configuration
api_key = "<YOUR_API_KEY>"  # your AssemblyAI API key (from your dashboard)
base_url = "https://api.assemblyai.com"  # AssemblyAI REST API root
headers = {"authorization": api_key}  # auth header sent with every request

Next, define functions to upload and transcribe files using AssemblyAI’s Async API, as well as request sentences.

def upload_file(file_path):
    """Upload a local audio file to AssemblyAI and return its temporary upload URL.

    Raises:
        requests.HTTPError: If the upload request does not return HTTP 200.
    """
    with open(file_path, "rb") as audio_file:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=audio_file)

    if response.status_code != 200:
        print(f"Error uploading: {response.status_code}, {response.text}")
        response.raise_for_status()

    return response.json()["upload_url"]

def transcribe_audio(audio_url):
    """Submit audio for transcription with sentences enabled and poll until complete.

    Returns the completed transcript JSON as a dict; raises RuntimeError if the
    job ends in the "error" status.
    """
    payload = {
        "audio_url": audio_url,
        "speech_models": ["universal-3-pro"],
        "auto_highlights": False,
        "sentiment_analysis": False,
        "entity_detection": False,
    }

    submit_response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)

    if submit_response.status_code != 200:
        print(f"Error submitting transcription: {submit_response.status_code}, {submit_response.text}")
        submit_response.raise_for_status()

    transcript_id = submit_response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    print("Transcribing...")
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        status = transcript["status"]
        if status == "completed":
            print("Transcription completed!")
            return transcript
        if status == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        # Job still queued/processing; wait before polling again.
        time.sleep(3)

def get_sentences(transcript_id):
    """Fetch the sentence-level breakdown of a completed transcript."""
    response = requests.get(
        f"{base_url}/v2/transcript/{transcript_id}/sentences", headers=headers
    )

    if response.status_code != 200:
        print(f"Error getting sentences: {response.status_code}, {response.text}")
        response.raise_for_status()

    return response.json()["sentences"]

Then define a function to process each transcript text with LLM Gateway.

def process_with_llm_gateway(transcript_text, question, context=""):
    """Send transcript to LLM Gateway for question answering.

    Returns the model's answer text (first choice message content). Raises
    requests.HTTPError on a non-200 HTTP status and RuntimeError if the
    response body carries an "error" field.
    """
    prompt = f"""Based on the following transcript, please answer this question:
Question: {question}
Context: {context}
Transcript: {transcript_text}
Please provide a clear and specific answer."""

    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 2000
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    # Fail fast on transport-level errors before parsing the body, consistent
    # with the other API helpers; previously a non-200 response went straight
    # into .json(), producing a confusing decode or KeyError instead.
    if response.status_code != 200:
        print(f"Error calling LLM Gateway: {response.status_code}, {response.text}")
        response.raise_for_status()

    result = response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']

Define a function to implement a sliding window, which allows us to group sentences together in different combinations to retain their semantic meaning and context while also enabling us to customize the length (and thus duration) of the quotes.

def sliding_window(elements, distance, stride):
    """Create overlapping windows of `distance` elements, advancing by `distance - stride`.

    Fixes over the naive version: a sequence of exactly `distance` elements now
    yields one window instead of none (the original `<` comparison dropped it,
    which left `embeddings` empty downstream); trailing elements that don't fill
    a whole step are covered by one final window of the last `distance` elements;
    and `stride >= distance` is rejected instead of looping forever.

    Raises:
        ValueError: If `distance` is not positive or `stride >= distance`.
    """
    if distance <= 0:
        raise ValueError("distance must be a positive integer")
    if stride >= distance:
        raise ValueError("stride must be smaller than distance to avoid an infinite loop")

    if not elements:
        return []
    if len(elements) <= distance:
        # Shorter than one full window: return everything as a single window.
        return [list(elements)]

    step = distance - stride
    results = []
    idx = 0
    while idx + distance <= len(elements):
        results.append(elements[idx:idx + distance])
        idx += step

    # Cover any trailing elements the last full step skipped.
    if idx < len(elements):
        results.append(elements[-distance:])

    return results

Execute all upload and transcription functions.

# Main execution
# If using a local file (placeholder fixed: was "<YOUR_AUDIO FILE>" with a
# stray space, inconsistent with the "<YOUR_AUDIO_URL>" style below):
audio_url = upload_file("<YOUR_AUDIO_FILE>")

# If using a public URL:
# audio_url = "<YOUR_AUDIO_URL>"

# Transcribe audio; keep the full text and the transcript id for later calls.
transcript = transcribe_audio(audio_url)
transcript_text = transcript["text"]
transcript_id = transcript["id"]

# Get sentence-level segments, used below to build quote windows.
print("Getting sentences...")
sentences = get_sentences(transcript_id)

Now we can iterate over all of the sentences in our transcript and create embeddings for them to use as part of our Semantic Search later.

We’ll be relying on SentenceTransformer’s multi-qa-mpnet-base-dot-v1 model, which has been fine-tuned specifically for Semantic Search, and is their highest-performing model for this task.

By default, we’ll group 5 sentences together while having 2 of them overlap when the window moves. This should give us quotes around 30 seconds in length at most.

# Initialize the sentence embedder (model fine-tuned for semantic search).
embedder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
embeddings = {}

# Slide a window over the sentences and embed each group; the dict key carries
# the metadata (timestamps, transcript id, text) needed to locate the quote.
print("Creating embeddings...")
sentence_groups = sliding_window(sentences, 5, 2)

for group in sentence_groups:
    group_text = " ".join(entry["text"] for entry in group)
    window_start = group[0]["start"]
    window_end = group[-1]["end"]

    embeddings[(window_start, window_end, transcript_id, group_text)] = embedder.encode(group_text)

Now we can query LLM Gateway to provide the type of quotes we want. In this case, let’s prompt LLM Gateway to find the best 3 quotes out of a video that we transcribed.

# Ask the LLM for the quotes we want to locate in the transcript.
print("Asking LLM Gateway for best quotes...")
question = "What are the 3 best quotes from this video?"
context = "Please provide exactly 3 quotes."

llm_answer = process_with_llm_gateway(transcript_text, question, context)
print("\nLLM Gateway Response:\n" + llm_answer + "\n")

Now we can take the embeddings from the transcript text, as well as the embeddings from LLM Gateway’s output, and use them in our k-nearest neighbors algorithm to determine their similarity. The most similar quotes to what LLM Gateway identified will be surfaced as our 3 best quotes, along with their timestamps and confidence scores.

We’ll be relying on cosine similarity rather than the default Euclidean distance metric since it compares only the direction of our vectors while ignoring their magnitude, which makes it better suited to comparing text embeddings of passages with different lengths.

# Embed the LLM output so it can be compared against the transcript windows.
llm_gateway_embedding = embedder.encode(llm_answer)

# Stack the window embeddings into a matrix; metadata stays aligned by index.
np_embeddings = np.array(list(embeddings.values()))
metadata = list(embeddings.keys())

# Retrieve the 3 windows closest to the LLM's answer (cosine distance;
# confidence is reported as 1 - distance).
print("Finding matching quotes in transcript...")
knn = NearestNeighbors(n_neighbors=3, metric="cosine")
knn.fit(np_embeddings)
distances, indices = knn.kneighbors([llm_gateway_embedding])

matches = [
    {
        "start_timestamp": metadata[idx][0],
        "end_timestamp": metadata[idx][1],
        "transcript_id": metadata[idx][2],
        "text": metadata[idx][3],
        "confidence": 1 - dist,
    }
    for dist, idx in zip(distances[0], indices[0])
]

# Display results.
separator = "=" * 80
print("\n" + separator)
print("BEST MATCHING QUOTES FROM TRANSCRIPT:")
print(separator + "\n")

for rank, match in enumerate(matches, start=1):
    print('QUOTE #{}: "{}"'.format(rank, match["text"]))
    # Timestamps are in milliseconds; convert to h:mm:ss via timedelta.
    print('START TIMESTAMP:', str(datetime.timedelta(seconds=match["start_timestamp"] / 1000)))
    print('END TIMESTAMP:', str(datetime.timedelta(seconds=match["end_timestamp"] / 1000)))
    print('CONFIDENCE:', match["confidence"])
    print()