import re
from typing import Optional, Dict, Any, List

from youtube_transcript_api import YouTubeTranscriptApi
from smolagents import Tool


class YouTubeTranscriptTool(Tool):
    """
    A tool to fetch transcripts from YouTube videos.

    This tool can extract transcripts in various languages and formats,
    providing clean text output for further processing by AI agents.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.

    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps

    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """
    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a URL or a bare ID.

        Args:
            url: A bare video ID, or a YouTube URL in ``watch?v=``,
                ``youtu.be/``, ``/embed/``, ``/shorts/``, ``/live/`` or
                ``/v/`` form. Surrounding whitespace is ignored.

        Returns:
            The video ID, or ``None`` when no ID can be recognised (including
            when ``url`` is empty or not a string).
        """
        if not url or not isinstance(url, str):
            return None
        candidate = url.strip()

        # A bare ID may contain '-' and '_', which str.isalnum() rejects, so
        # match against the same alphabet the URL patterns use.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
            return candidate

        patterns = [
            # watch URLs, with "v" as the first or a later query parameter
            r'youtube\.com/watch\?(?:.*&)?v=([a-zA-Z0-9_-]{11})',
            # short links
            r'youtu\.be/([a-zA-Z0-9_-]{11})',
            # path-style URLs
            r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/([a-zA-Z0-9_-]{11})',
        ]
        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)
        return None

    def clean_transcript_text(self, transcript: List[Dict]) -> str:
        """Join transcript entries into one cleaned-up string.

        Bracketed and parenthesised caption artifacts such as ``[Music]`` or
        ``(inaudible)`` are dropped, whitespace is normalised to single
        spaces, and spacing around punctuation is tidied.

        Args:
            transcript: Entries that each provide a ``'text'`` key.

        Returns:
            The cleaned text; an empty string if nothing remains.
        """
        text_parts = []
        for entry in transcript:
            # Collapse whitespace first so an artifact that spans a line
            # break is still matched ('.' does not match a newline).
            text = re.sub(r'\s+', ' ', entry['text'].strip())
            text = re.sub(r'\[.*?\]', '', text)  # [Music], [Applause], etc.
            text = re.sub(r'\(.*?\)', '', text)  # (inaudible), etc.
            # Removing an artifact can leave a double or trailing space.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        # No space before punctuation ...
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # ... and exactly one space after sentence-ending punctuation that is
        # followed by a lowercase letter. This is a heuristic: it also splits
        # tokens such as "example.com".
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)
        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict]) -> str:
        """Render one ``[MM:SS] text`` line per non-empty transcript entry.

        Minutes are not wrapped at 60, so an entry 75 minutes in is shown as
        ``[75:00]``.

        Args:
            transcript: Entries that each provide ``'start'`` (seconds) and
                ``'text'`` keys.

        Returns:
            The formatted lines joined with newlines.
        """
        formatted_parts = []
        for entry in transcript:
            start_time = entry['start']
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)
            # Captions may contain line breaks; flatten them so every output
            # line starts with a timestamp.
            text = ' '.join(entry['text'].split())
            if text:
                formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")
        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """Return the language codes of the transcripts listed for a video.

        This is best-effort: any failure while listing transcripts yields an
        empty list rather than an exception.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    @staticmethod
    def _to_raw_entries(transcript):
        """Return the transcript as plain entries where possible.

        If the object exposes a callable ``to_raw_data`` (as the fetched
        transcript objects of newer youtube-transcript-api releases are
        documented to), its result is returned; otherwise the object is
        returned unchanged.
        """
        to_raw = getattr(transcript, "to_raw_data", None)
        return to_raw() if callable(to_raw) else transcript

    def _fetch_english_fallback(self, video_id: str):
        """Fetch an English transcript, trying generated before manual.

        Returns:
            The transcript entries, or ``None`` if the transcripts cannot be
            listed or neither English variant can be fetched.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            return None

        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for finder in finders:
            try:
                return self._to_raw_entries(finder(['en']).fetch())
            except Exception:
                continue
        return None

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """
        Fetch and format YouTube video transcript.

        Failures are reported as a string starting with ``"Error"``; this
        method does not raise.

        Args:
            video_url: YouTube URL or video ID
            language: Language code for transcript (default: auto-detect).
                ``None`` or an empty string is treated as ``"auto"``.
            include_timestamps: Whether to include timestamps. When true,
                ``clean_text`` has no effect. ``None`` is treated as False.
            clean_text: Whether to clean and format the text. ``None`` is
                treated as the default, True.

        Returns:
            A metadata header (video ID, language actually used, length)
            followed by the transcript text, or an error message.
        """
        # The inputs are declared nullable, so map None onto the defaults.
        if not language:
            language = "auto"
        if include_timestamps is None:
            include_timestamps = False
        if clean_text is None:
            clean_text = True

        try:
            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."
                # Prefer English, then the first listed language.
                language = 'en' if 'en' in available_languages else available_languages[0]

            try:
                transcript = self._to_raw_entries(
                    YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
                )
            except Exception as e:
                # The requested language could not be fetched; fall back to
                # an English transcript.
                transcript = self._fetch_english_fallback(video_id)
                if transcript is None:
                    return f"Error: Could not fetch transcript. {str(e)}"
                # Report the language that was actually fetched.
                language = 'en'

            if not transcript:
                return "Error: No transcript content found."

            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join([entry['text'] for entry in transcript])

            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"
            return metadata + result
        except Exception as e:
            return f"Error fetching transcript: {str(e)}"


# Example usage and testing
if __name__ == "__main__":
    # Initialize the tool
    transcript_tool = YouTubeTranscriptTool()

    # Test with a sample video
    test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    print("Testing YouTube Transcript Tool...")
    print("=" * 50)

    # Test basic transcript
    result = transcript_tool.forward(test_url)
    print("Basic transcript:")
    print(result[:500] + "..." if len(result) > 500 else result)

    print("\n" + "=" * 50 + "\n")

    # Test with timestamps
    result_with_timestamps = transcript_tool.forward(
        test_url, include_timestamps=True
    )
    print("With timestamps:")
    print(result_with_timestamps[:500] + "..." if len(
        result_with_timestamps) > 500 else result_with_timestamps)

# Installation requirements:
# pip install youtube-transcript-api smolagents