Extract Dialogue Data with LLM Gateway and JSON

In this guide, we’ll show you how to use AssemblyAI’s LLM Gateway framework to process several audio files and then format the results as JSON (JavaScript Object Notation).

JSON allows you to programmatically format, parse, and transfer responses from LLM Gateway, which is useful for integrating LLM Gateway with a wide range of other applications.

In this example, we’ll use JSON formatting to create a .csv file from a directory of audio files that are transcribed and then submitted to LLM Gateway. You can apply the same concepts to generate a JSON-formatted response that updates a database table or interacts with other APIs.
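For example, if you would rather store the extracted fields in a database than in a .csv file, you can insert the parsed JSON straight into a table. The snippet below is a minimal sketch, assuming a local SQLite database and a hypothetical candidates table; it reuses the example response shown at the end of this guide and is not part of the main script.

import sqlite3

# Hypothetical example: store one parsed LLM Gateway response in SQLite
# instead of writing it to a .csv file.
interviewee_data = {
    "Name": "John Smith",
    "Position": "software engineer",
    "Past experience": "three years of experience at Google",
}

conn = sqlite3.connect("candidates.db")
conn.execute(
    "CREATE TABLE IF NOT EXISTS candidates (name TEXT, position TEXT, past_experience TEXT)"
)
conn.execute(
    "INSERT INTO candidates (name, position, past_experience) VALUES (?, ?, ?)",
    (
        interviewee_data["Name"],
        interviewee_data["Position"],
        interviewee_data["Past experience"],
    ),
)
conn.commit()
conn.close()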

Quickstart

import requests
import json
import os
import csv
import time
import re

# Configuration
api_key = "<YOUR_API_KEY>"
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}
output_filename = "profiles.csv"

def extract_json(text):
    """Extract JSON from text that might contain markdown or extra text"""
    # First, try to remove markdown code blocks
    text = text.strip()

    # Remove ```json and ``` markers
    if text.startswith("```"):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)

    # Find the first { and last } to extract just the JSON object
    first_brace = text.find('{')
    last_brace = text.rfind('}')

    if first_brace != -1 and last_brace != -1:
        json_str = text[first_brace:last_brace + 1]
        return json.loads(json_str)

    # If that didn't work, try parsing the whole thing
    return json.loads(text)

def upload_file(file_path):
    """Upload a local audio file to AssemblyAI"""
    with open(file_path, "rb") as f:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=f)
    if response.status_code != 200:
        print(f"Error uploading {file_path}: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]

def transcribe_audio(audio_url):
    """Submit audio for transcription and poll until complete"""
    # Submit transcription request
    data = {"audio_url": audio_url}
    response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=data)

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    # Poll for completion
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        if transcript["status"] == "completed":
            return transcript["text"]
        elif transcript["status"] == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        else:
            time.sleep(3)

def process_with_llm_gateway(transcript_text, prompt):
    """Send transcript to LLM Gateway for processing"""
    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": f"{prompt}\n\nTranscript:\n\n{transcript_text}"
            }
        ],
        "max_tokens": 1500
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    result = response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']

# Main execution
prompt = """
    You are an HR executive scanning through an interview transcript to extract information about a candidate.
    You are required to create a JSON response with key information about the candidate.
    You will use this template for your answer:
    {
        "Name": "<candidate-name>",
        "Position": "<job position that candidate is applying for>",
        "Past experience": "<A short phrase describing the candidate's relevant past experience for the role>"
    }
    Do not include any other text in your response. Only respond in JSON format that is not surrounded by markdown code, as your response will be parsed programmatically as JSON.
    """

# Get all files from interviews directory
interview_files = [os.path.join("interviews", file) for file in os.listdir("interviews")]

with open(output_filename, "w", newline="") as file:
    writer = csv.writer(file)
    header = ["Name", "Position", "Past Experience"]
    writer.writerow(header)

    print(f"Processing {len(interview_files)} interview files...")

    for interview_file in interview_files:
        print(f"\nProcessing: {interview_file}")

        # Upload file and get URL
        print(" Uploading file...")
        audio_url = upload_file(interview_file)

        # Transcribe audio
        print(" Transcribing...")
        transcript_text = transcribe_audio(audio_url)

        # Process with LLM Gateway
        print(" Analyzing with LLM Gateway...")
        llm_response = process_with_llm_gateway(transcript_text, prompt)

        # Parse JSON response
        interviewee_data = extract_json(llm_response)
        writer.writerow(interviewee_data.values())
        print(f" Completed: {interviewee_data['Name']}")

print(f"\nCreated .csv file {output_filename}")

Get Started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an AssemblyAI account and get your API key from your dashboard.
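Hardcoding the key is fine for a quick test, but you may prefer to load it from an environment variable so it never ends up in source control. A minimal sketch, assuming you have exported a variable named ASSEMBLYAI_API_KEY:

import os

# Read the API key from an environment variable (assumed to already be set)
api_key = os.environ.get("ASSEMBLYAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the ASSEMBLYAI_API_KEY environment variable first")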

Step-by-Step Instructions

In this guide, we will ask LLM Gateway the same questions about multiple files and then collate the answers into a .csv file.

Import the necessary libraries for making HTTP requests, and set your API key, headers, base URL, and output filename.

import requests
import json
import os
import csv
import time
import re

# Configuration
api_key = "<YOUR_API_KEY>"
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}
output_filename = "profiles.csv"

Define a function to extract the JSON object from the LLM Gateway response. The prompt asks for plain JSON, but this helper also strips markdown code fences and surrounding text in case the model includes them.

def extract_json(text):
    """Extract JSON from text that might contain markdown or extra text"""
    # First, try to remove markdown code blocks
    text = text.strip()

    # Remove ```json and ``` markers
    if text.startswith("```"):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)

    # Find the first { and last } to extract just the JSON object
    first_brace = text.find('{')
    last_brace = text.rfind('}')

    if first_brace != -1 and last_brace != -1:
        json_str = text[first_brace:last_brace + 1]
        return json.loads(json_str)

    # If that didn't work, try parsing the whole thing
    return json.loads(text)
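To see how the helper behaves, here is a short usage sketch with a response that ignores the prompt's instructions and wraps the JSON in a markdown code fence; the candidate values come from the example response at the end of this guide:

# The helper still parses a response wrapped in a markdown code fence
raw_response = """```json
{"Name": "John Smith", "Position": "software engineer", "Past experience": "three years of experience at Google"}
```"""

parsed = extract_json(raw_response)
print(parsed["Name"])  # John Smith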

Define functions to upload and transcribe each file using AssemblyAI’s Async API.

def upload_file(file_path):
    """Upload a local audio file to AssemblyAI"""
    with open(file_path, "rb") as f:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=f)
    if response.status_code != 200:
        print(f"Error uploading {file_path}: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]

def transcribe_audio(audio_url):
    """Submit audio for transcription and poll until complete"""
    # Submit transcription request
    data = {"audio_url": audio_url}
    response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=data)

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    # Poll for completion
    while True:
        transcript = requests.get(polling_endpoint, headers=headers).json()
        if transcript["status"] == "completed":
            return transcript["text"]
        elif transcript["status"] == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        else:
            time.sleep(3)
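If your audio is already hosted at a publicly accessible URL, you can skip upload_file and pass that URL straight to transcribe_audio. A short usage sketch (both the path and the URL below are placeholders):

# Local file: upload it first, then transcribe the returned URL
audio_url = upload_file("interviews/candidate_1.mp3")  # placeholder path
transcript_text = transcribe_audio(audio_url)

# Remote file: pass a publicly accessible URL directly
transcript_text = transcribe_audio("https://example.com/interview.mp3")  # placeholder URL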

Define a function to process each transcript text with LLM Gateway.

def process_with_llm_gateway(transcript_text, prompt):
    """Send transcript to LLM Gateway for processing"""
    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": f"{prompt}\n\nTranscript:\n\n{transcript_text}"
            }
        ],
        "max_tokens": 1500
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    result = response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']
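Note that the request reuses the same headers dictionary, so your AssemblyAI API key also authenticates the call to the LLM Gateway endpoint. A quick usage sketch with a made-up transcript snippet (prompt is defined in the next step):

# Hypothetical transcript snippet, used only to illustrate the call
sample_transcript = "Interviewer: Tell me about yourself. Candidate: My name is John Smith..."

answer = process_with_llm_gateway(sample_transcript, prompt)
print(answer)  # expected to be a JSON string matching the template in the prompt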

Define your LLM Gateway request prompt.

prompt = """
    You are an HR executive scanning through an interview transcript to extract information about a candidate.
    You are required to create a JSON response with key information about the candidate.
    You will use this template for your answer:
    {
        "Name": "<candidate-name>",
        "Position": "<job position that candidate is applying for>",
        "Past experience": "<A short phrase describing the candidate's relevant past experience for the role>"
    }
    Do not include any other text in your response. Only respond in JSON format that is not surrounded by markdown code, as your response will be parsed programmatically as JSON.
    """

Retrieve and process each file in the interviews folder and create a .csv file with the results.

interview_files = [os.path.join("interviews", file) for file in os.listdir("interviews")]

with open(output_filename, "w", newline="") as file:
    writer = csv.writer(file)
    header = ["Name", "Position", "Past Experience"]
    writer.writerow(header)

    print(f"Processing {len(interview_files)} interview files...")

    for interview_file in interview_files:
        print(f"\nProcessing: {interview_file}")

        # Upload file and get URL
        print(" Uploading file...")
        audio_url = upload_file(interview_file)

        # Transcribe audio
        print(" Transcribing...")
        transcript_text = transcribe_audio(audio_url)

        # Process with LLM Gateway
        print(" Analyzing with LLM Gateway...")
        llm_response = process_with_llm_gateway(transcript_text, prompt)

        # Parse JSON response
        interviewee_data = extract_json(llm_response)
        writer.writerow(interviewee_data.values())
        print(f" Completed: {interviewee_data['Name']}")

print(f"\nCreated .csv file {output_filename}")

For context, here is an example response from LLM Gateway to this prompt:

{
    "Name": "John Smith",
    "Position": "software engineer",
    "Past experience": "three years of experience at Google"
}

You can now run your Python script, and you should see a profiles.csv file generated in your working directory.
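To spot-check the output, you can read the file back with csv.DictReader; a minimal sketch:

import csv

# Print each extracted candidate profile from the generated .csv file
with open("profiles.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["Name"], "|", row["Position"], "|", row["Past Experience"])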