Create Subtitles with Speaker Labels | AssemblyAI

Quickstart

1 import assemblyai as aai
2 
# SETTINGS
aai.settings.api_key = "YOUR-API-KEY"
filename = "YOUR-FILE-NAME"

# speaker_labels=True enables Speaker Diarization, so each sentence carries a speaker code
transcriber = aai.Transcriber(config=aai.TranscriptionConfig(speaker_labels=True))
transcript = transcriber.transcribe(filename)

# Stop early with a clear message if the transcription failed,
# instead of failing later when the sentences are requested
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

# Maximum number of words per subtitle
max_words_per_subtitle = 6
11 
# Color assignments for speakers: speaker code -> color name used in the <font> tag
speaker_colors = dict(
    A="red",
    B="orange",
    C="yellow",
    D="yellowgreen",
    E="green",
    F="lightskyblue",
    G="purple",
    H="mediumpurple",
    I="pink",
    J="brown",
)
25 
# Process transcription segments
def process_segments(segments):
    """Build the full SRT text for a sequence of transcript segments.

    Every segment is split into chunks of at most ``max_words_per_subtitle``
    words, and each chunk becomes one numbered subtitle colored by speaker.

    Args:
        segments: Iterable of objects with a ``speaker`` attribute and a
            ``words`` sequence; each word needs ``start``, ``end`` and
            ``text`` attributes (times in milliseconds).

    Returns:
        The concatenated SRT entries as a single string ("" if there are
        no words at all).
    """
    subtitles = []
    for segment in segments:
        # Speakers without an entry in speaker_colors fall back to black
        color = speaker_colors.get(segment.speaker, "black")

        # Group the segment's words into chunks of max_words_per_subtitle
        words = segment.words
        for i in range(0, len(words), max_words_per_subtitle):
            chunk = words[i:i + max_words_per_subtitle]
            # A subtitle spans from its first word's start to its last word's end;
            # indices are 1-based and run continuously across segments
            subtitles.append(
                create_subtitle(len(subtitles) + 1, chunk[0].start, chunk[-1].end, chunk, color)
            )

    return "".join(subtitles)
44 
45 
# Create a single subtitle
def create_subtitle(index, start_time, end_time, words, color):
    """Render one SRT entry.

    Args:
        index: Sequence number printed on the first line of the entry.
        start_time: Start of the subtitle in milliseconds.
        end_time: End of the subtitle in milliseconds.
        words: Iterable of objects whose ``text`` attribute holds one word.
        color: Color name placed in the ``<font color="...">`` tag.

    Returns:
        The entry as text: index line, time-range line, colored text line,
        followed by a blank line.
    """
    # Join with single spaces so no trailing space ends up before </font>
    text = " ".join(word.text for word in words)
    start_srt = format_time(start_time)
    end_srt = format_time(end_time)
    return f"{index}\n{start_srt} --> {end_srt}\n<font color=\"{color}\">{text}</font>\n\n"
54 
# Format time in SRT style
def format_time(milliseconds):
    """Convert a duration in milliseconds to an ``HH:MM:SS,mmm`` timestamp."""
    # Peel off the smallest unit first and carry the rest upwards
    total_seconds, millis = divmod(milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hrs, mins = divmod(total_minutes, 60)
    return f"{int(hrs):02}:{int(mins):02}:{int(secs):02},{int(millis):03}"
61 
# Generate SRT content from the transcript's sentences
sentences = transcript.get_sentences()
srt_content = process_segments(sentences)

# Save to SRT file. The encoding is set explicitly so non-ASCII characters
# in the transcript do not depend on the platform's default encoding.
with open(filename + '.srt', 'w', encoding='utf-8') as file:
    file.write(srt_content)

print(f"SRT file generated: {filename}.srt")

This Colab demonstrates how to use AssemblyAI’s Speaker Diarization model to format subtitles according to their respective speakers.

Step-by-step guide

Before we begin, make sure you have an AssemblyAI account and an API key. You can sign up for an AssemblyAI account and get your API key from your dashboard.

$ pip install assemblyai

First, we will configure our API key as well as our file to be transcribed. Then, we decide on a number of words we want to have per subtitle.

Lastly, we transcribe our file.

1 import assemblyai as aai
2 
# SETTINGS
aai.settings.api_key = "YOUR-API-KEY"
filename = "YOUR-FILE-NAME"

# speaker_labels=True enables Speaker Diarization, so each sentence carries a speaker code
transcriber = aai.Transcriber(config=aai.TranscriptionConfig(speaker_labels=True))
transcript = transcriber.transcribe(filename)

# Stop early with a clear message if the transcription failed,
# instead of failing later when the sentences are requested
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

# Maximum number of words per subtitle
max_words_per_subtitle = 6

How the code works

speaker_colors is a dictionary that maps speaker identifiers (like “A”, “B”, “C”, etc.) to specific colors. Each of the first ten speakers (“A” through “J”) in the transcription is given a unique color in the subtitles; any other speaker falls back to black.

When Speaker Diarization is enabled, sentences in our API response have a speaker code under the speaker key. We use the speaker code to determine the color of the subtitle text.

# Color assignments for speakers: speaker code -> color name used in the <font> tag
speaker_colors = dict(
    A="red",
    B="orange",
    C="yellow",
    D="yellowgreen",
    E="green",
    F="lightskyblue",
    G="purple",
    H="mediumpurple",
    I="pink",
    J="brown",
)
14 
# Process transcription segments
def process_segments(segments):
    """Build the full SRT text for a sequence of transcript segments.

    Every segment is split into chunks of at most ``max_words_per_subtitle``
    words, and each chunk becomes one numbered subtitle colored by speaker.

    Args:
        segments: Iterable of objects with a ``speaker`` attribute and a
            ``words`` sequence; each word needs ``start``, ``end`` and
            ``text`` attributes (times in milliseconds).

    Returns:
        The concatenated SRT entries as a single string ("" if there are
        no words at all).
    """
    subtitles = []
    for segment in segments:
        # Speakers without an entry in speaker_colors fall back to black
        color = speaker_colors.get(segment.speaker, "black")

        # Group the segment's words into chunks of max_words_per_subtitle
        words = segment.words
        for i in range(0, len(words), max_words_per_subtitle):
            chunk = words[i:i + max_words_per_subtitle]
            # A subtitle spans from its first word's start to its last word's end;
            # indices are 1-based and run continuously across segments
            subtitles.append(
                create_subtitle(len(subtitles) + 1, chunk[0].start, chunk[-1].end, chunk, color)
            )

    return "".join(subtitles)
33 
# Create a single subtitle
def create_subtitle(index, start_time, end_time, words, color):
    """Render one SRT entry.

    Args:
        index: Sequence number printed on the first line of the entry.
        start_time: Start of the subtitle in milliseconds.
        end_time: End of the subtitle in milliseconds.
        words: Iterable of objects whose ``text`` attribute holds one word.
        color: Color name placed in the ``<font color="...">`` tag.

    Returns:
        The entry as text: index line, time-range line, colored text line,
        followed by a blank line.
    """
    # Join with single spaces so no trailing space ends up before </font>
    text = " ".join(word.text for word in words)
    start_srt = format_time(start_time)
    end_srt = format_time(end_time)
    return f"{index}\n{start_srt} --> {end_srt}\n<font color=\"{color}\">{text}</font>\n\n"
42 
# Format time in SRT style
def format_time(milliseconds):
    """Convert a duration in milliseconds to an ``HH:MM:SS,mmm`` timestamp."""
    # Peel off the smallest unit first and carry the rest upwards
    total_seconds, millis = divmod(milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hrs, mins = divmod(total_minutes, 60)
    return f"{int(hrs):02}:{int(mins):02}:{int(secs):02},{int(millis):03}"

Our last step is to generate and save our subtitle file!

# Generate SRT content from the transcript's sentences
sentences = transcript.get_sentences()
srt_content = process_segments(sentences)

# Save to SRT file. The encoding is set explicitly so non-ASCII characters
# in the transcript do not depend on the platform's default encoding.
with open(filename + '.srt', 'w', encoding='utf-8') as file:
    file.write(srt_content)

print(f"SRT file generated: {filename}.srt")