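# Page header captured together with this file (repository path, avatar
# caption, commit message, revision):
#   Agents_Course_Final_Assignment / youtube_utils.py
#   vlapparov's picture
#   Update youtube_utils.py
#   9ffb7e0 verified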
import re
from typing import Optional, Dict, Any, List
from youtube_transcript_api import YouTubeTranscriptApi
from smolagents import Tool
class YouTubeTranscriptTool(Tool):
    """
    A smolagents tool that fetches the transcript of a YouTube video.

    The tool resolves a video ID from a URL (or accepts a bare ID), picks a
    transcript language, downloads the transcript through
    ``YouTubeTranscriptApi`` and returns it as plain text, optionally
    cleaned up or prefixed with ``[MM:SS]`` timestamps. Failures are
    reported as strings starting with ``"Error"`` rather than raised, so the
    calling agent always receives a string.
    """

    name = "youtube_transcript"
    # Built from adjacent literals so the runtime value does not depend on
    # how the source file happens to be indented.
    description = (
        "\n"
        "Fetches the transcript/captions from a YouTube video.\n"
        "Input: YouTube URL or video ID\n"
        "Output: Clean transcript text with optional timestamps\n"
        "Supports:\n"
        "- Auto-generated and manual captions\n"
        "- Multiple languages\n"
        "- Timestamp formatting options\n"
        "- Text cleaning and formatting\n"
    )
    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()

    def _entry_value(self, entry: Any, key: str) -> Any:
        """Read ``key`` from a transcript entry given as a dict or an object.

        NOTE(review): newer youtube-transcript-api releases appear to return
        snippet objects with attributes instead of plain dicts - confirm
        against the installed version. Accepting both keeps the formatters
        independent of that detail.
        """
        if isinstance(entry, dict):
            return entry[key]
        return getattr(entry, key)

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a URL or a bare ID.

        Args:
            url: A YouTube URL (``watch?v=``, ``youtu.be/``, ``embed/``,
                ``shorts/``, ``live/``, ``v/``) or the video ID itself.

        Returns:
            The video ID, or ``None`` when nothing that looks like an ID
            could be found (including empty or non-string input).
        """
        if not url or not isinstance(url, str):
            return None
        url = url.strip()

        # Bare video ID. IDs may contain '-' and '_', which str.isalnum()
        # rejects, so match the full ID alphabet explicitly.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
            return url

        patterns = [
            # v= as the first query parameter, short links and embeds
            r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
            # v= appearing after other query parameters
            r'youtube\.com/watch\?.*v=([a-zA-Z0-9_-]{11})',
            # Other path-style URLs carrying the ID as a path segment
            r'youtube\.com/(?:shorts|live|v)/([a-zA-Z0-9_-]{11})',
            r'youtube-nocookie\.com/embed/([a-zA-Z0-9_-]{11})',
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """Join transcript entries into one cleaned-up string.

        Bracketed and parenthesised caption artifacts (``[Music]``,
        ``(inaudible)``) are removed, runs of whitespace are collapsed, and
        spacing around punctuation is normalised.

        Args:
            transcript: Iterable of entries exposing a ``text`` field.

        Returns:
            The cleaned text; empty string when no entry has any text left.
        """
        text_parts = []
        for entry in transcript:
            # Collapse whitespace first so an artifact split over several
            # lines is still matched by the '.'-based patterns below.
            text = re.sub(r'\s+', ' ', self._entry_value(entry, 'text').strip())
            text = re.sub(r'\[.*?\]', '', text)  # Remove [Music], [Applause], etc.
            text = re.sub(r'\(.*?\)', '', text)  # Remove (inaudible), etc.
            # Removing an artifact can leave doubled or dangling spaces.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        # No whitespace before punctuation ...
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # ... and exactly one space between sentence punctuation and a
        # following lowercase letter.
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)
        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """Render the transcript as one ``[MM:SS] text`` line per entry.

        Args:
            transcript: Iterable of entries exposing ``start`` (seconds)
                and ``text`` fields.

        Returns:
            Newline-joined lines; entries without text are skipped. Minutes
            are not wrapped into hours, so long videos show e.g. ``[75:03]``.
        """
        formatted_parts = []
        for entry in transcript:
            start_time = self._entry_value(entry, 'start')
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)
            timestamp = f"[{minutes:02d}:{seconds:02d}]"
            # Captions may contain embedded newlines; flatten them so each
            # entry stays on a single output line.
            text = re.sub(r'\s+', ' ', self._entry_value(entry, 'text')).strip()
            if text:
                formatted_parts.append(f"{timestamp} {text}")
        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """Return the language codes of all transcripts listed for a video.

        Best effort: any failure while listing transcripts yields an empty
        list instead of an exception.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    def _fetch_fallback_transcript(self, video_id: str) -> Optional[tuple]:
        """Try alternative transcripts after the requested one failed.

        Order: auto-generated English, manually created English, then any
        transcript in the listing.

        Returns:
            ``(transcript, language_code)`` for the first transcript that
            could be fetched, or ``None`` when every attempt failed.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            return None

        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for find in finders:
            try:
                candidate = find(['en'])
                return candidate.fetch(), candidate.language_code
            except Exception:
                continue

        # Last resort: take whatever transcript can actually be fetched.
        for candidate in transcript_list:
            try:
                return candidate.fetch(), candidate.language_code
            except Exception:
                continue
        return None

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format a YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID.
            language: Language code for the transcript. ``"auto"`` (also
                used for ``None``/empty) prefers English and otherwise takes
                the first listed language.
            include_timestamps: When true, emit ``[MM:SS] text`` lines;
                ``clean_text`` is not applied in that mode.
            clean_text: When true (and timestamps are off), clean and
                normalise the text; otherwise entries are joined verbatim.

        Returns:
            A metadata header (video ID, language actually fetched, length)
            followed by the transcript, or a string starting with
            ``"Error"`` describing what went wrong. Never raises.
        """
        # The inputs are declared nullable, so map None back to the defaults.
        if not language:
            language = "auto"
        if include_timestamps is None:
            include_timestamps = False
        if clean_text is None:
            clean_text = True

        try:
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."
                # Prefer English, then first available
                language = 'en' if 'en' in available_languages else available_languages[0]

            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            except Exception as e:
                fallback = self._fetch_fallback_transcript(video_id)
                if fallback is None:
                    return f"Error: Could not fetch transcript. {str(e)}"
                # Report the language that was really fetched, not the one
                # that failed.
                transcript, language = fallback

            if not transcript:
                return "Error: No transcript content found."

            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join(self._entry_value(entry, 'text') for entry in transcript)

            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"
            return metadata + result
        except Exception as e:
            # Top-level boundary of the tool: always hand a string back to
            # the agent instead of propagating the exception.
            return f"Error fetching transcript: {str(e)}"
# Manual smoke test: run this module directly to print transcript previews.
if __name__ == "__main__":
    tool = YouTubeTranscriptTool()
    sample_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    preview_limit = 500  # only the first part of each result is printed
    separator = "=" * 50

    print("Testing YouTube Transcript Tool...")
    print(separator)

    # Default options: no timestamps, cleaned text.
    plain = tool.forward(sample_url)
    print("Basic transcript:")
    if len(plain) > preview_limit:
        print(plain[:preview_limit] + "...")
    else:
        print(plain)
    print("\n" + separator + "\n")

    # Same video again, this time with timestamps.
    stamped = tool.forward(sample_url, include_timestamps=True)
    print("With timestamps:")
    if len(stamped) > preview_limit:
        print(stamped[:preview_limit] + "...")
    else:
        print(stamped)
# Installation requirements:
# pip install youtube-transcript-api smolagents