Extract Dialogue Data with LLM Gateway and JSON

In this guide, we’ll show you how to use AssemblyAI’s LLM Gateway framework to process several audio files, and then format your results in JSON (JavaScript Object Notation) format.

JSON allows you to programmatically format, parse, and transfer responses from LLM Gateway, which is useful for implementing LLM Gateway with a wide range of other applications.

In this example, we will leverage the JSON formatting to create a .csv file from a directory of files that must be transcribed and submitted to LLM Gateway. However, you can use the same concepts in this guide to generate a JSON-formatted response, which you can then use to update a database table or interact with other APIs.

Quickstart

1import requests
2import json
3import os
4import csv
5import time
6import re
7
# Configuration
api_key = "<YOUR_API_KEY>"  # from your AssemblyAI dashboard
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}
output_filename = "profiles.csv"
13
def extract_json(text):
    """Parse a JSON object out of an LLM response.

    The model is asked to reply with bare JSON, but responses may still be
    wrapped in markdown code fences or surrounded by extra prose. Strip the
    fences, then try the span between the first '{' and the last '}'; if that
    span is not valid JSON, fall back to parsing the whole cleaned text.

    Raises json.JSONDecodeError if no parse attempt succeeds.
    """
    text = text.strip()

    # Remove ```json ... ``` markdown fences if present.
    if text.startswith("```"):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)

    # Prefer the outermost brace-delimited span, which skips any
    # leading/trailing prose the model added around the JSON object.
    first_brace = text.find('{')
    last_brace = text.rfind('}')

    if first_brace != -1 and last_brace != -1:
        try:
            return json.loads(text[first_brace:last_brace + 1])
        except json.JSONDecodeError:
            # The brace-delimited span wasn't valid JSON (e.g. braces inside
            # a quoted string); fall through and try the whole text instead
            # of raising here, which made the fallback below unreachable.
            pass

    # If that didn't work, try parsing the whole thing
    return json.loads(text)
34
def upload_file(file_path):
    """Stream a local audio file to AssemblyAI and return its upload URL."""
    with open(file_path, "rb") as audio:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=audio)
    if response.status_code != 200:
        print(f"Error uploading {file_path}: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]
43
def transcribe_audio(audio_url):
    """Request a transcription and block until it finishes.

    Returns the transcript ID on success; raises RuntimeError if the
    transcription ends in an error state.
    """
    payload = {"audio_url": audio_url, "speech_models": ["universal-3-pro"]}
    response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    # Poll every 3 seconds until the job leaves the queued/processing states.
    while True:
        status_payload = requests.get(polling_endpoint, headers=headers).json()
        status = status_payload["status"]
        if status == "completed":
            return transcript_id
        if status == "error":
            raise RuntimeError(f"Transcription failed: {status_payload['error']}")
        time.sleep(3)
66
def process_with_llm_gateway(transcript_id, prompt):
    """Send a completed transcript to LLM Gateway and return the model's reply.

    The "{{ transcript }}" placeholder in the message is presumably expanded
    by the gateway from the referenced transcript_id — only the ID travels
    with the request.

    Raises RuntimeError if the gateway reports an error.
    """
    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": f"{prompt}\n\n{{{{ transcript }}}}"
            }
        ],
        "transcript_id": transcript_id,
        "max_tokens": 1500
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    # Match the error handling in upload_file/transcribe_audio: surface HTTP
    # failures before attempting to decode the body, so a non-200 error page
    # doesn't turn into an opaque JSON decode exception.
    if response.status_code != 200:
        print(f"Error from LLM Gateway: {response.status_code}, {response.text}")
        response.raise_for_status()

    result = response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']
93
# Main execution
prompt = """
    You are an HR executive scanning through an interview transcript to extract information about a candidate.
    You are required to create a JSON response with key information about the candidate.
    You will use this template for your answer:
    {
        "Name": "<candidate-name>",
        "Position": "<job position that candidate is applying for>",
        "Past experience": "<A short phrase describing the candidate's relevant past experience for the role>"
    }
    Do not include any other text in your response. Only respond in JSON format that is not surrounded by markdown code, as your response will be parsed programmatically as JSON.
    """

# Columns are filled by explicit key lookup (not dict iteration order) so a
# model response with extra or reordered keys still lands in the right
# columns. NOTE: these keys must match the JSON template in the prompt above.
csv_columns = ["Name", "Position", "Past experience"]

# Get all files from the interviews directory
interview_files = [os.path.join("interviews", entry) for entry in os.listdir("interviews")]

with open(output_filename, "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Name", "Position", "Past Experience"])

    print(f"Processing {len(interview_files)} interview files...")

    for interview_file in interview_files:
        print(f"\nProcessing: {interview_file}")

        # Upload file and get URL
        print(" Uploading file...")
        audio_url = upload_file(interview_file)

        # Transcribe audio
        print(" Transcribing...")
        transcript_id = transcribe_audio(audio_url)

        # Process with LLM Gateway
        print(" Analyzing with LLM Gateway...")
        llm_response = process_with_llm_gateway(transcript_id, prompt)

        # Parse the JSON response and write one row per interview, selecting
        # values by key so column order is guaranteed and a missing key
        # yields an empty cell instead of a misaligned row or a KeyError.
        interviewee_data = extract_json(llm_response)
        writer.writerow([interviewee_data.get(column, "") for column in csv_columns])
        print(f" Completed: {interviewee_data.get('Name', 'unknown')}")

print(f"\nCreated .csv file {output_filename}")

Get Started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an AssemblyAI account and get your API key from your dashboard.

Step-by-Step Instructions

In this guide, we will ask the same questions to LLM Gateway about multiple files. Then, we will collate the answers in a .csv file.

Install the required packages:

$pip install requests

Import the necessary libraries, then set your API key, request headers, and base URL.

1import requests
2import json
3import os
4import csv
5import time
6import re
7
# Configuration
api_key = "<YOUR_API_KEY>"  # from your AssemblyAI dashboard
base_url = "https://api.assemblyai.com"
headers = {"authorization": api_key}
output_filename = "profiles.csv"

Define a function to extract the JSON text from the response from LLM Gateway.

def extract_json(text):
    """Parse a JSON object out of an LLM response.

    The model is asked to reply with bare JSON, but responses may still be
    wrapped in markdown code fences or surrounded by extra prose. Strip the
    fences, then try the span between the first '{' and the last '}'; if that
    span is not valid JSON, fall back to parsing the whole cleaned text.

    Raises json.JSONDecodeError if no parse attempt succeeds.
    """
    text = text.strip()

    # Remove ```json ... ``` markdown fences if present.
    if text.startswith("```"):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)

    # Prefer the outermost brace-delimited span, which skips any
    # leading/trailing prose the model added around the JSON object.
    first_brace = text.find('{')
    last_brace = text.rfind('}')

    if first_brace != -1 and last_brace != -1:
        try:
            return json.loads(text[first_brace:last_brace + 1])
        except json.JSONDecodeError:
            # The brace-delimited span wasn't valid JSON (e.g. braces inside
            # a quoted string); fall through and try the whole text instead
            # of raising here, which made the fallback below unreachable.
            pass

    # If that didn't work, try parsing the whole thing
    return json.loads(text)

Define functions to upload and transcribe each file using AssemblyAI’s Async API.

def upload_file(file_path):
    """Stream a local audio file to AssemblyAI and return its upload URL."""
    with open(file_path, "rb") as audio:
        response = requests.post(f"{base_url}/v2/upload", headers=headers, data=audio)
    if response.status_code != 200:
        print(f"Error uploading {file_path}: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]
9
def transcribe_audio(audio_url):
    """Request a transcription and block until it finishes.

    Returns the transcript ID on success; raises RuntimeError if the
    transcription ends in an error state.
    """
    payload = {"audio_url": audio_url, "speech_models": ["universal-3-pro"]}
    response = requests.post(f"{base_url}/v2/transcript", headers=headers, json=payload)

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    # Poll every 3 seconds until the job leaves the queued/processing states.
    while True:
        status_payload = requests.get(polling_endpoint, headers=headers).json()
        status = status_payload["status"]
        if status == "completed":
            return transcript_id
        if status == "error":
            raise RuntimeError(f"Transcription failed: {status_payload['error']}")
        time.sleep(3)

Define a function to process each transcript with LLM Gateway (the request carries only the transcript ID).

def process_with_llm_gateway(transcript_id, prompt):
    """Send a completed transcript to LLM Gateway and return the model's reply.

    The "{{ transcript }}" placeholder in the message is presumably expanded
    by the gateway from the referenced transcript_id — only the ID travels
    with the request.

    Raises RuntimeError if the gateway reports an error.
    """
    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": f"{prompt}\n\n{{{{ transcript }}}}"
            }
        ],
        "transcript_id": transcript_id,
        "max_tokens": 1500
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data
    )

    # Match the error handling in upload_file/transcribe_audio: surface HTTP
    # failures before attempting to decode the body, so a non-200 error page
    # doesn't turn into an opaque JSON decode exception.
    if response.status_code != 200:
        print(f"Error from LLM Gateway: {response.status_code}, {response.text}")
        response.raise_for_status()

    result = response.json()

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    return result['choices'][0]['message']['content']

Define your LLM Gateway request prompt.

# Prompt sent with every transcript: asks the model for a bare-JSON candidate
# profile matching the template below, with no surrounding markdown or prose.
prompt = """
    You are an HR executive scanning through an interview transcript to extract information about a candidate.
    You are required to create a JSON response with key information about the candidate.
    You will use this template for your answer:
    {
        "Name": "<candidate-name>",
        "Position": "<job position that candidate is applying for>",
        "Past experience": "<A short phrase describing the candidate's relevant past experience for the role>"
    }
    Do not include any other text in your response. Only respond in JSON format that is not surrounded by markdown code, as your response will be parsed programmatically as JSON.
    """

Retrieve and process each file in the interviews folder and create a .csv file with the results.

# Columns are filled by explicit key lookup (not dict iteration order) so a
# model response with extra or reordered keys still lands in the right
# columns. NOTE: these keys must match the JSON template in the prompt.
csv_columns = ["Name", "Position", "Past experience"]

# Get all files from the interviews directory
interview_files = [os.path.join("interviews", entry) for entry in os.listdir("interviews")]

with open(output_filename, "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Name", "Position", "Past Experience"])

    print(f"Processing {len(interview_files)} interview files...")

    for interview_file in interview_files:
        print(f"\nProcessing: {interview_file}")

        # Upload file and get URL
        print(" Uploading file...")
        audio_url = upload_file(interview_file)

        # Transcribe audio
        print(" Transcribing...")
        transcript_id = transcribe_audio(audio_url)

        # Process with LLM Gateway
        print(" Analyzing with LLM Gateway...")
        llm_response = process_with_llm_gateway(transcript_id, prompt)

        # Parse the JSON response and write one row per interview, selecting
        # values by key so column order is guaranteed and a missing key
        # yields an empty cell instead of a misaligned row or a KeyError.
        interviewee_data = extract_json(llm_response)
        writer.writerow([interviewee_data.get(column, "") for column in csv_columns])
        print(f" Completed: {interviewee_data.get('Name', 'unknown')}")

print(f"\nCreated .csv file {output_filename}")

For context, here is an example response from LLM Gateway to our prompt.

1{
2 "Name": "John Smith",
3 "Position": "software engineer",
4 "Past experience": "three years of experience at Google"
5}

You can now run your Python script and you should see that a profiles.csv file is generated.