import re
from typing import Optional, Dict, Any, List

from smolagents import Tool
from youtube_transcript_api import YouTubeTranscriptApi


class YouTubeTranscriptTool(Tool):
    """Agent tool that fetches the transcript (captions) of a YouTube video.

    The tool accepts either a full YouTube URL or a bare 11-character video
    ID, downloads the transcript through ``YouTubeTranscriptApi`` and returns
    it as a single string prefixed with a short metadata header. Failures
    are reported as ``"Error: ..."`` strings rather than raised, so a calling
    agent always receives text.
    """

    name = "youtube_transcript"
    description = """
    Fetches the transcript/captions from a YouTube video.

    Input: YouTube URL or video ID
    Output: Clean transcript text with optional timestamps

    Supports:
    - Auto-generated and manual captions
    - Multiple languages
    - Timestamp formatting options
    - Text cleaning and formatting
    """

    inputs = {
        "video_url": {
            "type": "string",
            "description": "YouTube video URL or video ID"
        },
        "language": {
            "type": "string",
            "description": "Language code (e.g., 'en', 'es', 'fr'). Optional, defaults to auto-detect",
            "default": "auto",
            "nullable": True,
        },
        "include_timestamps": {
            "type": "boolean",
            "description": "Whether to include timestamps in the output",
            "default": False,
            "nullable": True,
        },
        "clean_text": {
            "type": "boolean",
            "description": "Whether to clean and format the text (remove extra spaces, fix punctuation)",
            "default": True,
            "nullable": True,
        }
    }

    output_type = "string"

    def __init__(self):
        super().__init__()

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the 11-character video ID from a URL or bare ID.

        Args:
            url: A YouTube URL (watch, youtu.be, embed, shorts, live, /v/)
                or a bare video ID.

        Returns:
            The video ID, or ``None`` when no ID can be recognised
            (including empty or non-string input).
        """
        if not url or not isinstance(url, str):
            return None

        candidate = url.strip()

        # A bare ID uses the same alphabet as the URL patterns below, which
        # includes '_' and '-'; str.isalnum() would wrongly reject those.
        if re.fullmatch(r'[a-zA-Z0-9_-]{11}', candidate):
            return candidate

        patterns = [
            r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})',
            r'youtube\.com\/watch\?.*v=([a-zA-Z0-9_-]{11})',
            r'(?:youtube\.com\/(?:shorts|live|v)\/|youtube-nocookie\.com\/embed\/)([a-zA-Z0-9_-]{11})',
        ]

        for pattern in patterns:
            match = re.search(pattern, candidate)
            if match:
                return match.group(1)

        return None

    def clean_transcript_text(self, transcript: List[Dict[str, Any]]) -> str:
        """Join transcript entries into one cleaned paragraph of text.

        Bracketed and parenthesised annotations (e.g. ``[Music]``,
        ``(applause)``) are removed, whitespace is normalised, and spacing
        around punctuation is tidied.

        Args:
            transcript: Entries, each a mapping with a ``'text'`` key.

        Returns:
            The cleaned text; an empty string when nothing remains.
        """
        text_parts = []

        for entry in transcript:
            # Collapse first so annotations broken across lines sit on a
            # single line and are matched by the '.'-based patterns below.
            text = re.sub(r'\s+', ' ', entry['text'].strip())
            text = re.sub(r'\[.*?\]', '', text)
            text = re.sub(r'\(.*?\)', '', text)
            # Removing an annotation can leave doubled or edge spaces behind,
            # so normalise once more after the removal.
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                text_parts.append(text)

        full_text = ' '.join(text_parts)
        # No space before punctuation ...
        full_text = re.sub(r'\s+([,.!?;:])', r'\1', full_text)
        # ... and exactly one space between sentence punctuation and a
        # following lowercase letter.
        full_text = re.sub(r'([.!?])\s*([a-z])', r'\1 \2', full_text)

        return full_text.strip()

    def format_with_timestamps(self, transcript: List[Dict[str, Any]]) -> str:
        """Render one ``[MM:SS] text`` line per non-empty transcript entry.

        Minutes are not wrapped at 60, so an entry starting at 1h15m30s is
        shown as ``[75:30]``.

        Args:
            transcript: Entries, each a mapping with ``'start'`` (seconds)
                and ``'text'`` keys.

        Returns:
            The formatted lines joined with newlines.
        """
        formatted_parts = []

        for entry in transcript:
            minutes, seconds = divmod(int(entry['start']), 60)
            # Collapse internal whitespace so a caption containing a newline
            # cannot break the one-entry-per-line layout.
            text = ' '.join(entry['text'].split())
            if text:
                formatted_parts.append(f"[{minutes:02d}:{seconds:02d}] {text}")

        return '\n'.join(formatted_parts)

    def get_available_languages(self, video_id: str) -> List[str]:
        """Return the language codes of all transcripts listed for a video.

        This is a best-effort lookup: any failure while listing transcripts
        yields an empty list instead of raising.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            return [transcript.language_code for transcript in transcript_list]
        except Exception:
            return []

    def _fetch_english_fallback(self, video_id: str) -> Optional[List[Dict[str, Any]]]:
        """Try the auto-generated, then the manual, English transcript.

        Returns the fetched entries, or ``None`` when neither can be fetched.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        except Exception:
            return None

        finders = (
            transcript_list.find_generated_transcript,
            transcript_list.find_manually_created_transcript,
        )
        for find in finders:
            try:
                fetched = find(['en']).fetch()
            except Exception:
                continue
            # NOTE(review): newer youtube-transcript-api releases appear to
            # return an object exposing to_raw_data() from fetch() instead of
            # a list of dicts -- confirm against the pinned version.
            to_raw = getattr(fetched, 'to_raw_data', None)
            return to_raw() if callable(to_raw) else fetched

        return None

    def forward(self, video_url: str, language: str = "auto",
                include_timestamps: bool = False, clean_text: bool = True) -> str:
        """Fetch and format a YouTube video transcript.

        Args:
            video_url: YouTube URL or video ID.
            language: Language code for the transcript. ``"auto"`` (also
                used when ``None`` or blank is passed) prefers English and
                otherwise takes the first listed language.
            include_timestamps: When true, return ``[MM:SS] text`` lines;
                ``clean_text`` is not applied in this mode.
            clean_text: When true (and timestamps are off), clean and join
                the text; otherwise join the raw entry texts with spaces.

        Returns:
            A metadata header followed by the transcript, or a string
            starting with ``"Error"`` describing what went wrong.
        """
        try:
            # The input schema marks these as nullable, so a caller may pass
            # None; fall back to the declared defaults in that case.
            if language is None or not str(language).strip():
                language = "auto"
            if include_timestamps is None:
                include_timestamps = False
            if clean_text is None:
                clean_text = True

            video_id = self.extract_video_id(video_url)
            if not video_id:
                return "Error: Invalid YouTube URL or video ID provided."

            if language == "auto":
                available_languages = self.get_available_languages(video_id)
                if not available_languages:
                    return "Error: No transcripts available for this video."

                language = 'en' if 'en' in available_languages else available_languages[0]

            # NOTE(review): get_transcript/list_transcripts are the static
            # helpers of older youtube-transcript-api releases -- confirm the
            # installed version still provides them.
            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            except Exception as e:
                transcript = self._fetch_english_fallback(video_id)
                if transcript is None:
                    return f"Error: Could not fetch transcript. {str(e)}"
                # The fallback always fetches English, so report that in the
                # header rather than the language that failed.
                language = 'en'

            if not transcript:
                return "Error: No transcript content found."

            if include_timestamps:
                result = self.format_with_timestamps(transcript)
            elif clean_text:
                result = self.clean_transcript_text(transcript)
            else:
                result = ' '.join(entry['text'] for entry in transcript)

            metadata = f"YouTube Video ID: {video_id}\n"
            metadata += f"Language: {language}\n"
            metadata += f"Transcript Length: {len(result)} characters\n"
            metadata += "-" * 50 + "\n\n"

            return metadata + result

        except Exception as e:
            # Tool boundary: report any unexpected failure as text so the
            # calling agent receives a string instead of an exception.
            return f"Error fetching transcript: {str(e)}"


if __name__ == "__main__":
    # Manual smoke test. Running this performs live requests for the
    # transcript of the sample video below.
    PREVIEW_CHARS = 500
    separator = "=" * 50
    sample_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    tool = YouTubeTranscriptTool()

    print("Testing YouTube Transcript Tool...")
    print(separator)

    # Default options: no timestamps, cleaned text.
    plain_output = tool.forward(sample_url)
    print("Basic transcript:")
    if len(plain_output) > PREVIEW_CHARS:
        print(plain_output[:PREVIEW_CHARS] + "...")
    else:
        print(plain_output)
    print("\n" + separator + "\n")

    # Same video again, this time with a timestamp in front of each entry.
    timestamped_output = tool.forward(sample_url, include_timestamps=True)
    print("With timestamps:")
    if len(timestamped_output) > PREVIEW_CHARS:
        print(timestamped_output[:PREVIEW_CHARS] + "...")
    else:
        print(timestamped_output)