import os
import io
import base64

import fitz  # PyMuPDF
import docx
import pandas as pd
from openai import OpenAI
from pptx import Presentation

# Extensions that are eligible for the vision pipeline when use_vision=True.
_VISION_EXTENSIONS = (".pdf", ".pptx", ".ppt")

# Settings for the per-page vision transcription request.
_VISION_MODEL = "gpt-4o"
_VISION_MAX_TOKENS = 1000
_VISION_PROMPT = (
    "Analyze this slide/page. Transcribe the content into a structured, "
    "hierarchical outline using markdown bullets (-). "
    "If there are tables, convert each row into a bullet point describing "
    "the data (e.g., '- The LM2500 has a weight of 4.7 tons'). "
    "If there are diagrams, describe the relationships labeled."
)


def extract_text_from_file(uploaded_file, use_vision=False, api_key=None):
    """Extract text from an uploaded file, dispatching on its extension.

    Args:
        uploaded_file: File-like object exposing ``.name`` and ``.read()``.
        use_vision: When true, PDF/PPT/PPTX files are routed to the vision
            pipeline instead of the plain-text extractors.
        api_key: OpenAI API key; required only for the vision pipeline.

    Returns:
        The extracted text. If vision mode is requested without an API key,
        the string ``"[ERROR: Vision Mode requires an API Key]"`` is returned
        instead of raising.

    Raises:
        ValueError: If the file extension is not supported.
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # 1. Vision path (only for visual formats: PDF/PPT).
    if use_vision and file_ext in _VISION_EXTENSIONS:
        if not api_key:
            return "[ERROR: Vision Mode requires an API Key]"
        return _extract_with_vision_model(uploaded_file, file_ext, api_key)

    # 2. Standard text path (no API calls).
    # NOTE(review): python-docx / python-pptx target the OOXML formats;
    # legacy binary .doc / .ppt files will likely fail inside those
    # libraries -- confirm whether these extensions should stay listed.
    if file_ext == ".pdf":
        return _extract_pdf(uploaded_file)
    elif file_ext in (".docx", ".doc"):
        return _extract_docx(uploaded_file)
    elif file_ext in (".pptx", ".ppt"):
        return _extract_pptx(uploaded_file)
    elif file_ext in (".xlsx", ".xls", ".csv"):
        return _extract_excel(uploaded_file)
    elif file_ext in (".txt", ".md"):
        # Replace undecodable bytes rather than failing the whole upload on
        # a file that is not strictly UTF-8.
        return uploaded_file.read().decode("utf-8", errors="replace")
    else:
        raise ValueError(f"Unsupported file type: {file_ext}")


# --- VISION EXTRACTION ---

def _render_pdf_pages_as_base64(uploaded_file):
    """Render every PDF page to a PNG and return them as base64 strings."""
    zoom = fitz.Matrix(2, 2)  # 2x zoom for clarity
    # The context manager closes the document once rendering is done.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return [
            base64.b64encode(
                page.get_pixmap(matrix=zoom).tobytes("png")
            ).decode("utf-8")
            for page in doc
        ]


def _extract_with_vision_model(uploaded_file, file_ext, api_key):
    """Transcribe a document page by page with a vision-capable chat model.

    PDF pages are rendered to PNG images and sent one per request; each
    response is prefixed with a ``--- Page N ---`` header. PPT/PPTX files are
    not rendered: they fall back to text extraction, prefixed with a system
    note explaining why.

    Args:
        uploaded_file: File-like object exposing ``.read()``.
        file_ext: Lower-cased file extension including the leading dot.
        api_key: OpenAI API key used to build the client.

    Returns:
        The page transcriptions joined by newlines (empty string when there
        are no pages), or the PPT fallback text.
    """
    # PPT vision would need slides rendered to images first; fall back to
    # text mode before creating an API client that would go unused.
    if file_ext in (".pptx", ".ppt"):
        return (
            "[System Note: Direct PPT Vision requires server-side rendering "
            "tools. Using Text Mode instead.]\n"
            + _extract_pptx(uploaded_file)
        )

    # 1. Convert the file to a list of base64-encoded page images.
    images = []
    if file_ext == ".pdf":
        images = _render_pdf_pages_as_base64(uploaded_file)

    # 2. One API call per page: slower, but keeps context scoped to a page.
    client = OpenAI(api_key=api_key)
    full_text = []
    for page_number, b64_img in enumerate(images, start=1):
        response = client.chat.completions.create(
            model=_VISION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _VISION_PROMPT},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{b64_img}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=_VISION_MAX_TOKENS,
        )
        # The message content may be None; avoid emitting the text "None".
        content = response.choices[0].message.content or ""
        full_text.append(f"--- Page {page_number} ---\n{content}")

    return "\n".join(full_text)


# --- STANDARD EXTRACTORS ---

def _extract_pdf(uploaded_file):
    """Return the text of every PDF page, joined by newlines."""
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)


def _extract_docx(uploaded_file):
    """Return non-empty paragraphs, then table rows, joined by newlines.

    Each table row is emitted as its non-empty cell texts joined by " | ".
    """
    doc = docx.Document(uploaded_file)
    full_text = [para.text for para in doc.paragraphs if para.text.strip()]

    for table in doc.tables:
        for row in table.rows:
            row_text = [cell.text for cell in row.cells if cell.text.strip()]
            if row_text:
                full_text.append(" | ".join(row_text))

    return "\n".join(full_text)


def _extract_pptx(uploaded_file):
    """Return slide shape text and speaker notes, joined by newlines.

    Speaker notes are prefixed with ``[SPEAKER NOTES]: `` and follow the
    text of the slide they belong to.
    """
    prs = Presentation(uploaded_file)
    full_text = []

    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                full_text.append(shape.text)

        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that the same as empty notes.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                full_text.append(f"[SPEAKER NOTES]: {notes}")

    return "\n".join(full_text)


def _extract_excel(uploaded_file):
    """Return a CSV or Excel upload rendered as a text table.

    Uses a markdown table when possible and falls back to the plain
    ``DataFrame.to_string`` layout otherwise.
    """
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    try:
        return df.to_markdown(index=False)
    except ImportError:
        # to_markdown needs the optional 'tabulate' package.
        return df.to_string(index=False)