# Spaces: NavyDevilDoc / Notes_Converter (Sleeping)
#
# File size: 5,110 Bytes

import os
import io
import fitz  # PyMuPDF
import docx
from pptx import Presentation
import pandas as pd
import base64
from openai import OpenAI

def extract_text_from_file(uploaded_file, use_vision=False, api_key=None):
    """
    Traffic Cop function: route an uploaded file to the matching extractor.

    The extension of ``uploaded_file.name`` (case-insensitive) selects the
    extractor. If use_vision=True, PDF/PowerPoint files are routed to the
    Vision pipeline instead of the plain-text extractors.

    Args:
        uploaded_file: File-like object exposing ``.name`` and ``.read()``.
        use_vision: Send PDF/PPT files through the vision model when True.
        api_key: API key forwarded to the vision pipeline; required whenever
            the vision path is taken.

    Returns:
        The extracted text. When the vision path is requested without an
        API key, an ``"[ERROR: ...]"`` string is returned instead of raising.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # 1. Vision Path (Only for visual formats: PDF/PPT)
    if use_vision and file_ext in (".pdf", ".pptx", ".ppt"):
        if not api_key:
            return "[ERROR: Vision Mode requires an API Key]"
        return _extract_with_vision_model(uploaded_file, file_ext, api_key)

    # 2. Standard Text Path (Fast, Free)
    # NOTE(review): legacy ".doc"/".ppt" files are routed to the same
    # extractors as their OOXML counterparts -- confirm those libraries can
    # actually open the legacy binary formats.
    if file_ext == ".pdf":
        return _extract_pdf(uploaded_file)
    if file_ext in (".docx", ".doc"):
        return _extract_docx(uploaded_file)
    if file_ext in (".pptx", ".ppt"):
        return _extract_pptx(uploaded_file)
    if file_ext in (".xlsx", ".xls", ".csv"):
        return _extract_excel(uploaded_file)
    if file_ext in (".txt", ".md"):
        # "utf-8-sig" strips a leading byte-order mark (common in files saved
        # by Windows editors); errors="replace" keeps a stray non-UTF-8 byte
        # from aborting the whole extraction.
        return uploaded_file.read().decode("utf-8-sig", errors="replace")

    raise ValueError(f"Unsupported file type: {file_ext}")

# --- VISION EXTRACTION (The Heavy Lifter) ---

def _extract_with_vision_model(uploaded_file, file_ext, api_key):
    """
    Converts PDF pages to images and asks GPT-4o to transcribe each one
    into a markdown-bullet outline (per the original author's note, a format
    compatible with the OutlineProcessor, which is not visible in this file).

    PowerPoint files are not rendered to images: they fall back to the
    standard text extractor, prefixed with a system note.

    Args:
        uploaded_file: File-like object exposing ``.read()``.
        file_ext: Lower-cased extension including the dot (e.g. ``".pdf"``).
        api_key: API key used to build the OpenAI client.

    Returns:
        One ``"--- Page N ---"`` section per page joined with newlines, or
        the text-mode fallback for PowerPoint files. Any other extension
        yields an empty string because no pages are produced.
    """
    # PowerPoint vision would need slide-to-image rendering, which is not
    # available here. Fall back to text mode before creating an API client
    # that would never be used.
    if file_ext in (".pptx", ".ppt"):
        return "[System Note: Direct PPT Vision requires server-side rendering tools. Using Text Mode instead.]\n" + _extract_pptx(uploaded_file)

    # 1. Convert File to Image List (base64-encoded PNG strings)
    images = []
    if file_ext == ".pdf":
        # Load PDF from memory; the context manager closes the document once
        # every page has been rasterized.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            for page in doc:
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for clarity
                images.append(base64.b64encode(pix.tobytes("png")).decode("utf-8"))

    # 2. Process Batch (One API call per page to ensure accuracy)
    # This is slower but handles context per page better.
    client = OpenAI(api_key=api_key)
    full_text = []
    for page_no, b64_img in enumerate(images, start=1):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Analyze this slide/page. Transcribe the content into a structured, hierarchical outline using markdown bullets (-). If there are tables, convert each row into a bullet point describing the data (e.g., '- The LM2500 has a weight of 4.7 tons'). If there are diagrams, describe the relationships labeled."},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_img}"}}
                    ],
                }
            ],
            max_tokens=1000
        )
        # The message content may be None; emit an empty section rather than
        # the literal text "None".
        content = response.choices[0].message.content or ""
        full_text.append(f"--- Page {page_no} ---\n{content}")

    return "\n".join(full_text)

# --- STANDARD EXTRACTORS (Existing Code) ---

def _extract_pdf(uploaded_file):
    """
    Return the text of every page of a PDF, joined with newlines.

    Args:
        uploaded_file: File-like object whose ``.read()`` yields the PDF bytes.

    Returns:
        The per-page ``get_text()`` results joined by ``"\\n"``.
    """
    # Open from memory; the context manager closes the document (the
    # original left it open) after the join has consumed every page.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)

def _extract_docx(uploaded_file):
    """
    Collect the text of a Word document as newline-separated lines.

    All non-blank paragraphs come first, followed by one line per table row
    whose non-blank cell texts are joined with " | ". Rows with no non-blank
    cells are skipped.
    """
    document = docx.Document(uploaded_file)

    # Body paragraphs, skipping whitespace-only ones.
    lines = [p.text for p in document.paragraphs if p.text.strip()]

    # Tables are appended after all paragraphs, one line per non-empty row.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            cell_texts = [c.text for c in tbl_row.cells if c.text.strip()]
            if not cell_texts:
                continue
            lines.append(" | ".join(cell_texts))

    return "\n".join(lines)

def _extract_pptx(uploaded_file):
    """
    Collect slide text and speaker notes from a PowerPoint presentation.

    For each slide, the text of every shape that exposes a non-blank
    ``text`` attribute is emitted, followed by the slide's speaker notes
    (prefixed with "[SPEAKER NOTES]: ") when present and non-blank.

    Args:
        uploaded_file: Path or file-like object accepted by ``Presentation``.

    Returns:
        All collected pieces joined by newlines.
    """
    prs = Presentation(uploaded_file)
    full_text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            # Only shapes that expose a ``text`` attribute contribute here.
            if hasattr(shape, "text") and shape.text.strip():
                full_text.append(shape.text)
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that as "no notes" instead of crashing.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                full_text.append(f"[SPEAKER NOTES]: {notes}")
    return "\n".join(full_text)

def _extract_excel(uploaded_file):
    """
    Load a spreadsheet or CSV and render it as a text table.

    Files whose name ends in ".csv" (case-insensitive) are read with
    ``pd.read_csv``; everything else goes through ``pd.read_excel``.

    Args:
        uploaded_file: File-like object exposing ``.name``, readable by pandas.

    Returns:
        A markdown table when ``DataFrame.to_markdown`` succeeds, otherwise
        the plain ``DataFrame.to_string`` rendering (both without the index).
    """
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    try:
        return df.to_markdown(index=False)
    except Exception:
        # to_markdown relies on the optional "tabulate" package. Keep the
        # best-effort fallback, but no longer swallow KeyboardInterrupt or
        # SystemExit as the previous bare ``except:`` did.
        return df.to_string(index=False)