| 1 | import requests |
| 2 | import json |
| 3 | import os |
| 4 | import csv |
| 5 | import time |
| 6 | import re |
| 7 | |
# Configuration
# The API key is taken from the ASSEMBLYAI_API_KEY environment variable when it
# is set and non-empty, so the secret does not have to be written into this
# file. Otherwise the placeholder below is used and must be replaced by hand.
api_key = os.environ.get("ASSEMBLYAI_API_KEY") or "<YOUR_API_KEY>"
base_url = "https://api.assemblyai.com"
# Every request in this script sends the key in the "authorization" header.
headers = {"authorization": api_key}
# Path of the CSV file the script writes its results to.
output_filename = "profiles.csv"
| 13 | |
def extract_json(text):
    """Extract a JSON value from text that may contain markdown or extra text.

    The text is stripped, a surrounding markdown code fence (``` or ```json)
    is removed, and the JSON value that starts at the first ``{`` is decoded.
    Anything after that value (explanations, stray braces) is ignored. If no
    value can be decoded from the first ``{``, the whole cleaned text is parsed
    instead.

    Args:
        text: The raw string to search for JSON.

    Returns:
        The decoded JSON value (a dict when the text contains a JSON object).

    Raises:
        json.JSONDecodeError: If neither attempt yields valid JSON.
    """
    text = text.strip()

    # Remove ```json and ``` markers
    if text.startswith("```"):
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)

    first_brace = text.find('{')
    if first_brace != -1:
        try:
            # raw_decode stops at the end of the first complete value, so
            # trailing prose (even prose containing '}') does not break parsing.
            value, _end = json.JSONDecoder().raw_decode(text, first_brace)
            return value
        except json.JSONDecodeError:
            # Fall through to parsing the whole text below.
            pass

    # If that didn't work, try parsing the whole thing
    return json.loads(text)
| 34 | |
def upload_file(file_path, timeout=(10, 300)):
    """Upload a local audio file to AssemblyAI and return its upload URL.

    Args:
        file_path: Path of the local file; it is opened in binary mode and
            streamed as the request body.
        timeout: Timeout passed to ``requests.post``, either a single number
            or a ``(connect, read)`` tuple in seconds. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        The value of the ``upload_url`` field of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    with open(file_path, "rb") as f:
        response = requests.post(
            f"{base_url}/v2/upload",
            headers=headers,
            data=f,
            timeout=timeout,
        )

    # The response body is fully read at this point, so the file can already
    # be closed before the status is inspected.
    if response.status_code != 200:
        print(f"Error uploading {file_path}: {response.status_code}, {response.text}")
        response.raise_for_status()
    return response.json()["upload_url"]
| 43 | |
def transcribe_audio(audio_url, poll_interval=3, request_timeout=30):
    """Submit audio for transcription and poll until it completes.

    Args:
        audio_url: URL of the audio to transcribe (for example the value
            returned by the upload endpoint).
        poll_interval: Seconds to sleep between two status checks.
        request_timeout: Timeout in seconds applied to every HTTP request so
            a stalled connection cannot block the script forever.

    Returns:
        The ``text`` field of the completed transcript.

    Raises:
        requests.HTTPError: If submitting or polling returns a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``request_timeout``.
        RuntimeError: If the transcript status becomes ``"error"``.
    """
    # Submit transcription request
    data = {"audio_url": audio_url, "speech_models": ["universal-3-pro"]}
    response = requests.post(
        f"{base_url}/v2/transcript",
        headers=headers,
        json=data,
        timeout=request_timeout,
    )

    if response.status_code != 200:
        print(f"Error submitting transcription: {response.status_code}, {response.text}")
        response.raise_for_status()

    transcript_id = response.json()["id"]
    polling_endpoint = f"{base_url}/v2/transcript/{transcript_id}"

    # Poll for completion
    while True:
        poll_response = requests.get(
            polling_endpoint, headers=headers, timeout=request_timeout
        )
        # Surface HTTP failures directly; an error payload has no guaranteed
        # "status" field and would otherwise end in a confusing KeyError.
        poll_response.raise_for_status()
        transcript = poll_response.json()

        status = transcript["status"]
        if status == "completed":
            return transcript["text"]
        if status == "error":
            raise RuntimeError(f"Transcription failed: {transcript['error']}")
        time.sleep(poll_interval)
| 66 | |
def process_with_llm_gateway(transcript_text, prompt, timeout=120):
    """Send a transcript plus instructions to the LLM Gateway.

    Args:
        transcript_text: Transcript text appended after the prompt.
        prompt: Instructions placed before the transcript in the user message.
        timeout: Timeout in seconds passed to ``requests.post`` so a stalled
            connection cannot block the script forever.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        RuntimeError: If the JSON response contains an ``error`` field.
        requests.HTTPError: If the response has a 4xx/5xx status and carries
            no ``error`` field (or is not JSON at all).
        requests.Timeout: If the gateway does not respond within ``timeout``.
    """
    llm_gateway_data = {
        "model": "claude-sonnet-4-5-20250929",
        "messages": [
            {
                "role": "user",
                "content": f"{prompt}\n\nTranscript:\n\n{transcript_text}"
            }
        ],
        "max_tokens": 1500
    }

    response = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data,
        timeout=timeout,
    )

    try:
        result = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): report the HTTP status
        # rather than a JSON parse error; re-raise if the status was fine.
        response.raise_for_status()
        raise

    if "error" in result:
        raise RuntimeError(f"LLM Gateway error: {result['error']}")

    # A failing status without an "error" field would otherwise surface as a
    # KeyError on 'choices' below.
    response.raise_for_status()

    return result['choices'][0]['message']['content']
| 92 | |
# Main execution
# Instructions sent to the LLM together with each transcript. The keys of the
# JSON template correspond (case-insensitively) to the CSV columns in `header`.
prompt = """
You are an HR executive scanning through an interview transcript to extract information about a candidate.
You are required to create a JSON response with key information about the candidate.
You will use this template for your answer:
{
"Name": "<candidate-name>",
"Position": "<job position that candidate is applying for>",
"Past experience": "<A short phrase describing the candidate's relevant past experience for the role>"
}
Do not include any other text in your response. Only respond in JSON format that is not surrounded by markdown code, as your response will be parsed programmatically as JSON.
"""

# CSV columns, in output order.
header = ["Name", "Position", "Past Experience"]

# Get all regular, non-hidden files from the interviews directory. Hidden
# entries (e.g. .DS_Store) and sub-directories are not audio files, and
# sorting makes the row order reproducible across runs.
interview_files = sorted(
    os.path.join("interviews", name)
    for name in os.listdir("interviews")
    if not name.startswith(".")
    and os.path.isfile(os.path.join("interviews", name))
)

# utf-8 so that names with non-ASCII characters can always be written,
# independent of the platform's default encoding.
with open(output_filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)

    print(f"Processing {len(interview_files)} interview files...")

    for interview_file in interview_files:
        print(f"\nProcessing: {interview_file}")

        # Upload file and get URL
        print(" Uploading file...")
        audio_url = upload_file(interview_file)

        # Transcribe audio
        print(" Transcribing...")
        transcript_text = transcribe_audio(audio_url)

        # Process with LLM Gateway
        print(" Analyzing with LLM Gateway...")
        llm_response = process_with_llm_gateway(transcript_text, prompt)

        # Parse JSON response
        interviewee_data = extract_json(llm_response)
        if not isinstance(interviewee_data, dict):
            raise RuntimeError(
                f"Expected a JSON object from the LLM, got: {llm_response!r}"
            )

        # Build the row by column name instead of relying on the order and
        # number of keys the LLM happened to return; the lookup ignores case
        # and surrounding whitespace, and missing fields become empty cells.
        fields = {
            str(key).strip().lower(): value
            for key, value in interviewee_data.items()
        }
        row = [fields.get(column.lower(), "") for column in header]
        writer.writerow(row)
        print(f" Completed: {row[0]}")

print(f"\nCreated .csv file {output_filename}")