Redact PII from Text Using LLM Gateway | AssemblyAI

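This guide shows you how to use AssemblyAI’s LLM Gateway to redact personally identifiable information (PII) from text. The example transcribes an audio file and then redacts names, organizations, email addresses, phone numbers, and addresses from the resulting transcript.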

Quickstart

1 import requests
2 import time
3 import json
4 import re
5 
6 base_url = "https://api.assemblyai.com"
7 headers = {"authorization": "<YOUR_API_KEY>"}
8 
def _parse_named_entities(raw_content):
    """Pull the ``named_entities`` list out of the model's raw reply text.

    Args:
        raw_content: The ``content`` string of the model's reply. Expected to
            be a JSON object with a ``named_entities`` array of strings.

    Returns:
        A list of non-empty entity strings. An empty list is returned when the
        reply is not valid JSON or does not have the expected shape, which
        preserves the original best-effort behaviour.
    """
    if not isinstance(raw_content, str):
        return []

    text = raw_content.strip()
    # Models frequently wrap JSON in a Markdown code fence (```json ... ```)
    # even when told not to. Without this, json.loads fails and the caller
    # would silently redact nothing.
    if text.startswith("```"):
        text = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", text)

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []

    if not isinstance(parsed, dict):
        return []

    entities = parsed.get('named_entities', [])
    if not isinstance(entities, list):
        return []

    # Callers pass each entity to str.replace, so drop anything that is not a
    # usable, non-empty string.
    return [entity for entity in entities if isinstance(entity, str) and entity]


def generate_ner(transcript_text):
    """Ask the LLM Gateway for the named entities contained in a piece of text.

    Sends ``transcript_text`` together with an extraction prompt to the chat
    completions endpoint and parses the JSON reply.

    Args:
        transcript_text: The text to scan for person names, organization
            names, email addresses, phone numbers and full addresses.

    Returns:
        A list of entity strings as reported by the model (empty if the reply
        could not be parsed).

    Raises:
        requests.HTTPError: If the gateway answers with a 4xx/5xx status.
        requests.Timeout: If the gateway does not respond in time.
    """
    prompt = '''
    You will be given a transcript of a conversation or text. Your task is to generate named entities from the given transcript text.

    Please identify and extract the following named entities from the transcript:

    1. Person names
    2. Organization names
    3. Email addresses
    4. Phone numbers
    5. Full addresses

    When extracting these entities, make sure to return the exact spelling and formatting as they appear in the transcript. Do not modify or standardize the entities in any way.

    Present your results in a JSON format with a single field named "named_entities". This field should contain an array of strings, where each string is a named entity you've identified. For example:
    {
      "named_entities": ["John Doe", "Acme Corp", "john.doe@example.com", "123-456-7890", "123 Main St, Anytown, USA 12345"]
    }

    Important: Do not include any other information, explanations, or text in your response. Your output should consist solely of the JSON object containing the named entities.

    If you do not find any named entities of a particular type, simply return an empty array for the "named_entities" field.
    '''

    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {"role": "user", "content": f"{prompt}\n\nTranscript: {transcript_text}"}
        ],
        "max_tokens": 1000,
        "temperature": 0.0
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data,
        timeout=60,
    )
    # Surface auth/quota/server errors as an HTTP error instead of an opaque
    # KeyError on "choices" below.
    response.raise_for_status()

    result = response.json()["choices"][0]["message"]["content"]
    return _parse_named_entities(result)
57 
# Step 1: Transcribe audio
with open("./my-audio.mp3", "rb") as f:
    response = requests.post(base_url + "/v2/upload", headers=headers, data=f)
# Fail fast on a rejected request (e.g. invalid API key) rather than hitting
# a KeyError when the expected field is missing from the error payload.
response.raise_for_status()

upload_url = response.json()["upload_url"]
data = {"audio_url": upload_url}

response = requests.post(base_url + "/v2/transcript", json=data, headers=headers)
response.raise_for_status()
transcript_id = response.json()['id']
polling_endpoint = base_url + "/v2/transcript/" + transcript_id

# Poll until the transcript reaches a terminal state.
while True:
    poll_response = requests.get(polling_endpoint, headers=headers)
    poll_response.raise_for_status()
    transcription_result = poll_response.json()
    if transcription_result['status'] == 'completed':
        break
    elif transcription_result['status'] == 'error':
        raise RuntimeError(f"Transcription failed: {transcription_result['error']}")
    else:
        time.sleep(3)

# Step 2: Split transcript into sentences and redact PII
# Guard against a null/empty transcript text so re.split does not raise.
transcript_text = transcription_result['text'] or ''
# Split only where sentence-ending punctuation is followed by whitespace.
# Splitting on every '.' would cut emails ("john.doe@example.com") and dotted
# phone numbers into pieces, so the model would never see the full entity.
# The lookbehind keeps the original punctuation attached to each sentence.
sentences = re.split(r'(?<=[.!?])\s+', transcript_text)
redacted_sentences = []

for sentence in sentences:
    sentence = sentence.strip()
    if not sentence:
        continue

    generated_entities = generate_ner(sentence)
    redacted_sentence = sentence

    # Replace longer entities first: if "John" were masked before
    # "John Doe", the longer match would fail and "Doe" would leak.
    entities = [e for e in generated_entities if isinstance(e, str) and e]
    for entity in sorted(entities, key=len, reverse=True):
        redacted_sentence = redacted_sentence.replace(entity, '#' * len(entity))

    redacted_sentences.append(redacted_sentence)
    print(redacted_sentence)

redacted_transcript = ' '.join(redacted_sentences)

print('\nFull redacted transcript:')
print(redacted_transcript)

Get Started

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an account and get your API key from your dashboard.

Step-by-Step Instructions

Install the required packages:

$ pip install requests

Import the required packages, then set the API base URL and the authorization header that is sent with every request:

1 import requests
2 import time
3 import json
4 import re
5 
6 base_url = "https://api.assemblyai.com"
7 headers = {"authorization": "<YOUR_API_KEY>"}

Define a function generate_ner that uses LLM Gateway to identify named entities (person names, organizations, emails, phone numbers, addresses) in a given text.

def _parse_named_entities(raw_content):
    """Pull the ``named_entities`` list out of the model's raw reply text.

    Args:
        raw_content: The ``content`` string of the model's reply. Expected to
            be a JSON object with a ``named_entities`` array of strings.

    Returns:
        A list of non-empty entity strings. An empty list is returned when the
        reply is not valid JSON or does not have the expected shape, which
        preserves the original best-effort behaviour.
    """
    if not isinstance(raw_content, str):
        return []

    text = raw_content.strip()
    # Models frequently wrap JSON in a Markdown code fence (```json ... ```)
    # even when told not to. Without this, json.loads fails and the caller
    # would silently redact nothing.
    if text.startswith("```"):
        text = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", text)

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []

    if not isinstance(parsed, dict):
        return []

    entities = parsed.get('named_entities', [])
    if not isinstance(entities, list):
        return []

    # Callers pass each entity to str.replace, so drop anything that is not a
    # usable, non-empty string.
    return [entity for entity in entities if isinstance(entity, str) and entity]


def generate_ner(transcript_text):
    """Ask the LLM Gateway for the named entities contained in a piece of text.

    Sends ``transcript_text`` together with an extraction prompt to the chat
    completions endpoint and parses the JSON reply.

    Args:
        transcript_text: The text to scan for person names, organization
            names, email addresses, phone numbers and full addresses.

    Returns:
        A list of entity strings as reported by the model (empty if the reply
        could not be parsed).

    Raises:
        requests.HTTPError: If the gateway answers with a 4xx/5xx status.
        requests.Timeout: If the gateway does not respond in time.
    """
    prompt = '''
    You will be given a transcript of a conversation or text. Your task is to generate named entities from the given transcript text.

    Please identify and extract the following named entities from the transcript:

    1. Person names
    2. Organization names
    3. Email addresses
    4. Phone numbers
    5. Full addresses

    When extracting these entities, make sure to return the exact spelling and formatting as they appear in the transcript. Do not modify or standardize the entities in any way.

    Present your results in a JSON format with a single field named "named_entities". This field should contain an array of strings, where each string is a named entity you've identified. For example:
    {
      "named_entities": ["John Doe", "Acme Corp", "john.doe@example.com", "123-456-7890", "123 Main St, Anytown, USA 12345"]
    }

    Important: Do not include any other information, explanations, or text in your response. Your output should consist solely of the JSON object containing the named entities.

    If you do not find any named entities of a particular type, simply return an empty array for the "named_entities" field.
    '''

    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {"role": "user", "content": f"{prompt}\n\nTranscript: {transcript_text}"}
        ],
        "max_tokens": 1000,
        "temperature": 0.0
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data,
        timeout=60,
    )
    # Surface auth/quota/server errors as an HTTP error instead of an opaque
    # KeyError on "choices" below.
    response.raise_for_status()

    result = response.json()["choices"][0]["message"]["content"]
    return _parse_named_entities(result)

Transcribe an audio file using the AssemblyAI API:

# Upload the local audio file; the API answers with a URL for the uploaded media.
with open("./my-audio.mp3", "rb") as f:
    response = requests.post(base_url + "/v2/upload", headers=headers, data=f)
# Fail fast on a rejected request (e.g. invalid API key) rather than hitting
# a KeyError when the expected field is missing from the error payload.
response.raise_for_status()

upload_url = response.json()["upload_url"]
data = {"audio_url": upload_url}  # You can also use a URL to an audio or video file on the web

# Submit the transcription job and build the URL used to poll for its result.
response = requests.post(base_url + "/v2/transcript", json=data, headers=headers)
response.raise_for_status()
transcript_id = response.json()['id']
polling_endpoint = base_url + "/v2/transcript/" + transcript_id

# Poll every 3 seconds until the transcript reaches a terminal state.
while True:
    poll_response = requests.get(polling_endpoint, headers=headers)
    poll_response.raise_for_status()
    transcription_result = poll_response.json()
    if transcription_result['status'] == 'completed':
        break
    elif transcription_result['status'] == 'error':
        raise RuntimeError(f"Transcription failed: {transcription_result['error']}")
    else:
        time.sleep(3)

Split the transcript into sentences, identify named entities using generate_ner, and replace them with # characters:

# Guard against a null/empty transcript text so re.split does not raise.
transcript_text = transcription_result['text'] or ''
# Split only where sentence-ending punctuation is followed by whitespace.
# Splitting on every '.' would cut emails ("john.doe@example.com") and dotted
# phone numbers into pieces, so the model would never see the full entity.
# The lookbehind keeps the original punctuation attached to each sentence.
sentences = re.split(r'(?<=[.!?])\s+', transcript_text)
redacted_sentences = []

for sentence in sentences:
    sentence = sentence.strip()
    if not sentence:
        continue

    generated_entities = generate_ner(sentence)
    redacted_sentence = sentence

    # Replace longer entities first: if "John" were masked before
    # "John Doe", the longer match would fail and "Doe" would leak.
    entities = [e for e in generated_entities if isinstance(e, str) and e]
    for entity in sorted(entities, key=len, reverse=True):
        redacted_sentence = redacted_sentence.replace(entity, '#' * len(entity))

    redacted_sentences.append(redacted_sentence)
    print(redacted_sentence)

# Sentences keep their own punctuation, so a single space rejoins them.
redacted_transcript = ' '.join(redacted_sentences)

Print the fully redacted transcript:

# Emit the heading (preceded by a blank line) and the redacted transcript,
# each on its own line.
print('\nFull redacted transcript:', redacted_transcript, sep='\n')