| 1 | import datetime |
| 2 | import numpy as np |
| 3 | import requests |
| 4 | import time |
| 5 | from sklearn.neighbors import NearestNeighbors |
| 6 | from sentence_transformers import SentenceTransformer |
| 7 | |
# Configuration
# API key is sent as-is in the "authorization" header on every request,
# both to the AssemblyAI REST API and to the LLM Gateway.
api_key = "<YOUR_API_KEY>"  # placeholder — replace before running
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}
| 12 | |
def upload_file(file_path):
    """Upload a local audio file to AssemblyAI and return its hosted URL.

    Args:
        file_path: path to the audio file on disk.

    Returns:
        The ``upload_url`` string AssemblyAI assigns to the uploaded file.

    Raises:
        requests.HTTPError: if the upload endpoint returns a non-200 status
            (the status and body are printed first for easier debugging).
    """
    with open(file_path, "rb") as audio_stream:
        upload_response = requests.post(
            f"{base_url}/v2/upload", headers=headers, data=audio_stream
        )
    if upload_response.status_code != 200:
        print(f"Error uploading: {upload_response.status_code}, {upload_response.text}")
        upload_response.raise_for_status()
    return upload_response.json()["upload_url"]
| 21 | |
def transcribe_audio(audio_url):
    """Submit an audio URL for transcription and block until it finishes.

    Polls the transcript endpoint every 3 seconds until the job reaches a
    terminal state.

    Args:
        audio_url: publicly reachable (or AssemblyAI-hosted) audio URL.

    Returns:
        The completed transcript JSON (dict) including ``text`` and ``id``.

    Raises:
        requests.HTTPError: if the submission request is rejected.
        RuntimeError: if the transcription job ends in the "error" state.
    """
    payload = {
        "audio_url": audio_url,
        # NOTE(review): AssemblyAI's documented request field is the singular
        # "speech_model" — confirm this plural/list form is accepted.
        "speech_models": ["universal-3-pro"],
        "auto_highlights": False,
        "sentiment_analysis": False,
        "entity_detection": False,
    }

    submission = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)
    if submission.status_code != 200:
        print(f"Error submitting transcription: {submission.status_code}, {submission.text}")
        submission.raise_for_status()

    transcript_id = submission.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    print("Transcribing...")
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        status = transcript["status"]
        if status == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        if status == "completed":
            print("Transcription completed!")
            return transcript
        time.sleep(3)
| 51 | |
def get_sentences(transcript_id):
    """Fetch the sentence-level breakdown of a completed transcript.

    Args:
        transcript_id: id of a transcript that has already completed.

    Returns:
        The list of sentence dicts (each with ``text``, ``start``, ``end``).

    Raises:
        requests.HTTPError: if the sentences endpoint returns a non-200
            status (printed first for easier debugging).
    """
    endpoint = f"{base_url}/v2/transcript/{transcript_id}/sentences"
    sentences_response = requests.get(endpoint, headers=headers)
    if sentences_response.status_code != 200:
        print(f"Error getting sentences: {sentences_response.status_code}, {sentences_response.text}")
        sentences_response.raise_for_status()
    return sentences_response.json()["sentences"]
| 62 | |
def process_with_llm_gateway(transcript_text, question, context=""):
    """Ask the LLM Gateway a question about a transcript and return the answer.

    Args:
        transcript_text: full transcript text to reason over.
        question: the question to answer.
        context: optional extra instructions appended to the prompt.

    Returns:
        The assistant message content (str) from the first choice.

    Raises:
        RuntimeError: if the gateway response body contains an "error" key.
    """
    prompt = f"""Based on the following transcript, please answer this question:
Question: {question}
Context: {context}
Transcript: {transcript_text}
Please provide a clear and specific answer."""

    request_body = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 2000,
    }

    gateway_response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=request_body,
    )
    result = gateway_response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']
| 94 | |
def sliding_window(elements, distance, stride):
    """Split *elements* into overlapping windows.

    Each window holds up to ``distance`` consecutive elements, and the
    window start advances by ``distance - stride`` each step, so consecutive
    windows share ``stride`` elements.

    Bug fixes vs. the previous version:
      * The final window is always emitted — even when shorter than
        ``distance`` — so trailing elements are no longer silently dropped.
        (Previously an input of exactly ``distance`` elements produced NO
        windows at all, which left the downstream embeddings dict empty,
        and quotes near the end of the audio could never be matched.)
      * ``stride >= distance`` now raises instead of looping forever.

    Args:
        elements: sequence to window over.
        distance: window length; must be greater than ``stride``.
        stride: number of elements shared between consecutive windows.

    Returns:
        List of windows (each a slice of ``elements``); ``[]`` for empty
        input.

    Raises:
        ValueError: if ``stride >= distance`` (the start index would never
            advance).
    """
    step = distance - stride
    if step <= 0:
        raise ValueError("stride must be smaller than distance")
    if not elements:
        return []

    results = []
    idx = 0
    total = len(elements)
    while True:
        results.append(elements[idx:idx + distance])
        # Stop once this window already reaches the end of the sequence.
        if idx + distance >= total:
            break
        idx += step
    return results
| 103 | |
# Main execution
# Pipeline: upload -> transcribe -> sentence windows -> embeddings ->
# LLM picks quotes -> KNN maps the LLM's answer back to timestamped spans.

# If using a local file:
audio_url = upload_file("<YOUR_AUDIO FILE>")  # placeholder — replace before running

# If using a public URL:
# audio_url = "<YOUR_AUDIO_URL>"

# Transcribe audio; keep the full text (for the LLM prompt) and the id
# (needed to fetch sentence timestamps).
transcript = transcribe_audio(audio_url)
transcript_text = transcript["text"]
transcript_id = transcript["id"]

# Get sentences
print("Getting sentences...")
sentences = get_sentences(transcript_id)

# Initialize embedder
embedder = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
# Maps (start_ms, end_ms, transcript_id, window_text) -> embedding vector.
embeddings = {}

# Create sliding window of sentences and generate embeddings:
# 5-sentence windows whose starts advance by 3, so neighbors overlap by 2.
print("Creating embeddings...")
sentence_groups = sliding_window(sentences, 5, 2)

for sentence_group in sentence_groups:
    combined_text = " ".join([sentence["text"] for sentence in sentence_group])
    start = sentence_group[0]["start"]  # window start timestamp (ms)
    end = sentence_group[-1]["end"]  # window end timestamp (ms)

    # NOTE(review): identical key tuples would overwrite each other; in
    # practice each window's (start, end, text) combination is unique.
    embeddings[(start, end, transcript_id, combined_text)] = embedder.encode(combined_text)

# Use LLM Gateway to find the best quotes
print("Asking LLM Gateway for best quotes...")
question = "What are the 3 best quotes from this video?"
context = "Please provide exactly 3 quotes."

llm_answer = process_with_llm_gateway(transcript_text, question, context)
print(f"\nLLM Gateway Response:\n{llm_answer}\n")

# Embed the LLM output with the same model so distances are comparable.
llm_gateway_embedding = embedder.encode(llm_answer)

# Vectorize transcript embeddings; metadata stays index-aligned with the
# embedding matrix because both come from the same dict.
np_embeddings = np.array(list(embeddings.values()))
metadata = list(embeddings.keys())

# Find the top 3 most similar quotes.
# NOTE(review): kneighbors raises if fewer than 3 windows exist — very
# short transcripts would need n_neighbors clamped to len(metadata).
print("Finding matching quotes in transcript...")
knn = NearestNeighbors(n_neighbors=3, metric="cosine")
knn.fit(np_embeddings)
distances, indices = knn.kneighbors([llm_gateway_embedding])

matches = []
for distance, index in zip(distances[0], indices[0]):
    result_metadata = metadata[index]
    matches.append(
        {
            "start_timestamp": result_metadata[0],
            "end_timestamp": result_metadata[1],
            "transcript_id": result_metadata[2],
            "text": result_metadata[3],
            # cosine distance -> similarity; 1.0 means an exact match
            "confidence": 1 - distance,
        }
    )

# Display results
print("\n" + "="*80)
print("BEST MATCHING QUOTES FROM TRANSCRIPT:")
print("="*80 + "\n")

for index, m in enumerate(matches):
    print('QUOTE #{}: "{}"'.format(index + 1, m['text']))
    # Timestamps are milliseconds; timedelta renders them as H:MM:SS.ffffff.
    print('START TIMESTAMP:', str(datetime.timedelta(seconds=m['start_timestamp']/1000)))
    print('END TIMESTAMP:', str(datetime.timedelta(seconds=m['end_timestamp']/1000)))
    print('CONFIDENCE:', m['confidence'])
    print()