File size: 5,110 Bytes
32445ea
 
c07858c
32445ea
 
 
c07858c
 
32445ea
c07858c
32445ea
c07858c
 
32445ea
 
 
c07858c
 
 
 
 
 
 
32445ea
 
 
 
 
 
 
 
 
 
 
 
 
c07858c
32445ea
c07858c
32445ea
c07858c
 
32445ea
c07858c
32445ea
 
c07858c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32445ea
 
 
 
 
 
 
 
c07858c
32445ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import io
import fitz  # PyMuPDF
import docx
from pptx import Presentation
import pandas as pd
import base64
from openai import OpenAI

def extract_text_from_file(uploaded_file, use_vision=False, api_key=None):
    """
    Route an uploaded file to the extractor that matches its extension.

    Args:
        uploaded_file: File-like object exposing ``.name`` (used only to read
            the extension, case-insensitively) and ``.read()``.
        use_vision: When True, ".pdf", ".pptx" and ".ppt" files are handed to
            the vision pipeline instead of the plain-text extractors. Other
            extensions ignore this flag.
        api_key: Forwarded to the vision pipeline; must be truthy whenever
            that pipeline is selected.

    Returns:
        The extracted text. If the vision pipeline is selected without an
        ``api_key``, the literal marker
        "[ERROR: Vision Mode requires an API Key]" is returned instead of
        raising.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # 1. Vision path (only for visual formats: PDF/PPT).
    if use_vision and file_ext in (".pdf", ".pptx", ".ppt"):
        if not api_key:
            return "[ERROR: Vision Mode requires an API Key]"
        return _extract_with_vision_model(uploaded_file, file_ext, api_key)

    # 2. Standard text path (no API calls).
    # NOTE(review): legacy binary formats (.doc, .ppt, .xls) are routed to the
    # same extractors as their XML-based successors; whether the underlying
    # libraries can actually open them should be confirmed.
    if file_ext == ".pdf":
        return _extract_pdf(uploaded_file)
    if file_ext in (".docx", ".doc"):
        return _extract_docx(uploaded_file)
    if file_ext in (".pptx", ".ppt"):
        return _extract_pptx(uploaded_file)
    if file_ext in (".xlsx", ".xls", ".csv"):
        return _extract_excel(uploaded_file)
    if file_ext in (".txt", ".md"):
        raw = uploaded_file.read()
        if isinstance(raw, str):
            # Already decoded (file opened in text mode); nothing to do.
            return raw
        # "utf-8-sig" drops a leading byte-order mark (common in files saved
        # by Windows editors) so it does not leak into the text as U+FEFF.
        # errors="replace" keeps one stray non-UTF-8 byte from aborting the
        # whole extraction; such bytes become U+FFFD.
        return raw.decode("utf-8-sig", errors="replace")

    raise ValueError(f"Unsupported file type: {file_ext}")

# --- VISION EXTRACTION (The Heavy Lifter) ---

def _extract_with_vision_model(uploaded_file, file_ext, api_key):
    """
    Render each PDF page to a PNG and ask GPT-4o to transcribe it as an outline.

    Args:
        uploaded_file: File-like object; ``.read()`` must yield the raw bytes
            of the document.
        file_ext: Lower-cased extension including the dot (e.g. ".pdf").
        api_key: OpenAI API key used to build the client.

    Returns:
        For ".pdf": the per-page transcriptions, each prefixed with
        "--- Page N ---" and joined by newlines (an empty string when the PDF
        has no pages). For ".pptx"/".ppt": a system note followed by the
        plain-text extraction, because slides are not rendered to images
        here. Any other extension produces an empty string.

    The original author's note says the output is meant to be compatible with
    an ``OutlineProcessor``; that consumer is not visible from this function.
    """
    # PPT/PPTX: no slide rendering is available in this prototype, so fall back
    # to text extraction. Handled first so no API client is built needlessly.
    if file_ext in (".pptx", ".ppt"):
        return "[System Note: Direct PPT Vision requires server-side rendering tools. Using Text Mode instead.]\n" + _extract_pptx(uploaded_file)

    # 1. Convert the file to a list of base64-encoded PNG pages.
    images = []
    if file_ext == ".pdf":
        # The context manager closes the document once every page has been
        # rasterised, instead of leaking it until garbage collection.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            zoom = fitz.Matrix(2, 2)  # 2x zoom for clarity
            for page in doc:
                png_bytes = page.get_pixmap(matrix=zoom).tobytes("png")
                images.append(base64.b64encode(png_bytes).decode("utf-8"))

    # 2. One API call per page: slower, but each page gets its own context.
    client = OpenAI(api_key=api_key)
    prompt = "Analyze this slide/page. Transcribe the content into a structured, hierarchical outline using markdown bullets (-). If there are tables, convert each row into a bullet point describing the data (e.g., '- The LM2500 has a weight of 4.7 tons'). If there are diagrams, describe the relationships labeled."
    full_text = []
    for page_no, b64_img in enumerate(images, start=1):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_img}"}},
                    ],
                }
            ],
            max_tokens=1000,
        )
        # The message content can be None (e.g. when the model returns no
        # text); emit an empty page body rather than the literal "None".
        content = response.choices[0].message.content or ""
        full_text.append(f"--- Page {page_no} ---\n{content}")

    return "\n".join(full_text)

# --- STANDARD EXTRACTORS (Existing Code) ---

def _extract_pdf(uploaded_file):
    """
    Return the text layer of every page of a PDF, joined by newlines.

    Args:
        uploaded_file: File-like object; ``.read()`` must yield the PDF bytes.

    Returns:
        One string with each page's ``get_text()`` output separated by "\n".
    """
    # Open from memory and close deterministically once the text is collected;
    # the join runs inside the ``with`` so pages are read while the document
    # is still open.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)

def _extract_docx(uploaded_file):
    """
    Pull text out of a Word document: body paragraphs first, then table rows.

    Blank paragraphs and blank cells are skipped. Each table row that has at
    least one non-blank cell becomes a single line with its cells joined by
    " | ". All table rows are emitted after all paragraphs.
    """
    document = docx.Document(uploaded_file)

    # Non-blank paragraphs, in the order the library yields them.
    lines = [p.text for p in document.paragraphs if p.text.strip()]

    # Table content is appended after the paragraphs, one line per row.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            cells = [c.text for c in tbl_row.cells if c.text.strip()]
            if not cells:
                continue
            lines.append(" | ".join(cells))

    return "\n".join(lines)

def _extract_pptx(uploaded_file):
    """
    Collect the text of every slide shape plus any speaker notes.

    Args:
        uploaded_file: File-like object (or path) accepted by ``Presentation``.

    Returns:
        A newline-joined string. For each slide, the text of every shape that
        exposes a non-blank ``text`` attribute comes first, followed by a
        "[SPEAKER NOTES]: ..." line when the slide has non-blank notes.
    """
    prs = Presentation(uploaded_file)
    full_text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            # Not every shape type carries text (pictures, for instance).
            if hasattr(shape, "text") and shape.text.strip():
                full_text.append(shape.text)

        if slide.has_notes_slide:
            # A notes slide can exist without a notes placeholder, in which
            # case notes_text_frame is None; treat that as "no notes" rather
            # than failing on ``None.text``.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                full_text.append(f"[SPEAKER NOTES]: {notes}")
    return "\n".join(full_text)

def _extract_excel(uploaded_file):
    """
    Load a spreadsheet or CSV into a DataFrame and render it as text.

    Args:
        uploaded_file: File-like object with a ``.name``; a name ending in
            ".csv" (case-insensitive) is parsed with ``pd.read_csv``, anything
            else with ``pd.read_excel`` using its defaults.

    Returns:
        A Markdown table without the index column when ``to_markdown``
        succeeds, otherwise the plain ``to_string`` rendering (also without
        the index).
    """
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    try:
        return df.to_markdown(index=False)
    except Exception:
        # Best-effort fallback: to_markdown needs the optional "tabulate"
        # package and raises ImportError without it. Catching Exception rather
        # than using a bare except keeps that fallback while letting
        # KeyboardInterrupt and SystemExit propagate.
        return df.to_string(index=False)