Spaces:
Sleeping
Sleeping
File size: 5,110 Bytes
32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea c07858c 32445ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import os
import io
import fitz # PyMuPDF
import docx
from pptx import Presentation
import pandas as pd
import base64
from openai import OpenAI
def extract_text_from_file(uploaded_file, use_vision=False, api_key=None):
    """
    Route an uploaded file to the extractor that matches its extension.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute and a
            ``read()`` method that returns bytes.
        use_vision: When True, PDF and PPT/PPTX files are routed to the
            vision pipeline instead of the plain-text extractors.
        api_key: API key forwarded to the vision pipeline. Only consulted
            when the vision path is taken.

    Returns:
        The extracted text. When the vision path is requested without an API
        key, the marker string "[ERROR: Vision Mode requires an API Key]" is
        returned instead of raising.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # 1. Vision path (only for visual formats: PDF/PPT).
    if use_vision and file_ext in (".pdf", ".pptx", ".ppt"):
        if not api_key:
            return "[ERROR: Vision Mode requires an API Key]"
        return _extract_with_vision_model(uploaded_file, file_ext, api_key)

    # 2. Standard text path (no API calls).
    if file_ext == ".pdf":
        return _extract_pdf(uploaded_file)
    if file_ext in (".docx", ".doc"):
        return _extract_docx(uploaded_file)
    if file_ext in (".pptx", ".ppt"):
        return _extract_pptx(uploaded_file)
    if file_ext in (".xlsx", ".xls", ".csv"):
        return _extract_excel(uploaded_file)
    if file_ext in (".txt", ".md"):
        # "utf-8-sig" strips a leading byte-order mark if one is present, and
        # errors="replace" keeps a file with a few stray non-UTF-8 bytes
        # readable instead of aborting the whole upload.
        return uploaded_file.read().decode("utf-8-sig", errors="replace")

    raise ValueError(f"Unsupported file type: {file_ext}")
# --- VISION EXTRACTION (The Heavy Lifter) ---
def _extract_with_vision_model(uploaded_file, file_ext, api_key):
    """
    Transcribe a document page by page using a vision-capable chat model.

    PDF pages are rendered to PNG images and sent to the "gpt-4o" model, one
    request per page, with a prompt asking for a markdown bullet outline.
    PPT/PPTX files are not rendered to images here; they fall back to
    `_extract_pptx` with a leading system note.

    Args:
        uploaded_file: File-like object; for ".pdf" its ``read()`` must
            return the PDF bytes.
        file_ext: Lower-cased file extension including the leading dot.
        api_key: API key used to construct the OpenAI client.

    Returns:
        The per-page transcriptions, each prefixed with "--- Page N ---",
        joined by newlines. An empty string if no page images were produced.
    """
    # PPTX vision support would require rendering slides to images with
    # extra server-side tooling, so this prototype uses text extraction.
    # Handled first so no API client is built for a path that never uses it.
    if file_ext in (".pptx", ".ppt"):
        return "[System Note: Direct PPT Vision requires server-side rendering tools. Using Text Mode instead.]\n" + _extract_pptx(uploaded_file)

    client = OpenAI(api_key=api_key)

    # 1. Convert the file to a list of base64-encoded PNG page images.
    images = []
    if file_ext == ".pdf":
        # Load the PDF from memory; close it even if rendering fails.
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        try:
            for page in doc:
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for clarity
                img_bytes = pix.tobytes("png")
                images.append(base64.b64encode(img_bytes).decode("utf-8"))
        finally:
            doc.close()

    # 2. One API call per page: slower, but each page gets its own context.
    prompt = "Analyze this slide/page. Transcribe the content into a structured, hierarchical outline using markdown bullets (-). If there are tables, convert each row into a bullet point describing the data (e.g., '- The LM2500 has a weight of 4.7 tons'). If there are diagrams, describe the relationships labeled."
    full_text = []
    for page_number, b64_img in enumerate(images, start=1):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_img}"}},
                    ],
                }
            ],
            max_tokens=1000,
        )
        # The message content may be None; emit an empty page body rather
        # than the literal text "None".
        content = response.choices[0].message.content or ""
        full_text.append(f"--- Page {page_number} ---\n{content}")

    return "\n".join(full_text)
# --- STANDARD EXTRACTORS (Existing Code) ---
def _extract_pdf(uploaded_file):
    """
    Extract the plain text of every page of a PDF.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The text of each page, in page order, joined by newlines.
    """
    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
    try:
        return "\n".join(page.get_text() for page in doc)
    finally:
        # Release the document even if text extraction raises.
        doc.close()
def _extract_docx(uploaded_file):
    """
    Extract text from a Word document.

    Non-blank paragraphs are emitted first, one per line. Table rows follow,
    each rendered as its non-blank cell texts joined with " | ".

    Args:
        uploaded_file: File-like object (or path) accepted by ``docx.Document``.

    Returns:
        All collected lines joined by newlines.
    """
    document = docx.Document(uploaded_file)

    # Body paragraphs, skipping whitespace-only ones.
    lines = [p.text for p in document.paragraphs if p.text.strip()]

    # Tables are appended after all paragraphs, one line per non-empty row.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            cell_texts = [c.text for c in tbl_row.cells if c.text.strip()]
            if cell_texts:
                lines.append(" | ".join(cell_texts))

    return "\n".join(lines)
def _extract_pptx(uploaded_file):
    """
    Extract slide text and speaker notes from a PowerPoint presentation.

    For each slide, the text of every shape that exposes a non-blank ``text``
    attribute is collected, followed by the slide's speaker notes (if any)
    prefixed with "[SPEAKER NOTES]: ".

    Args:
        uploaded_file: File-like object (or path) accepted by ``Presentation``.

    Returns:
        All collected text fragments joined by newlines.
    """
    prs = Presentation(uploaded_file)
    full_text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                full_text.append(shape.text)

        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that as "no notes" instead of crashing.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                full_text.append(f"[SPEAKER NOTES]: {notes}")

    return "\n".join(full_text)
def _extract_excel(uploaded_file):
    """
    Render a spreadsheet or CSV file as a text table.

    Files whose name ends in ".csv" (case-insensitive) are parsed with
    ``pandas.read_csv``; everything else goes through ``pandas.read_excel``.

    Args:
        uploaded_file: File-like object exposing ``name`` and readable by
            pandas.

    Returns:
        The table as markdown when ``DataFrame.to_markdown`` succeeds,
        otherwise the plain ``DataFrame.to_string`` rendering. The index is
        omitted in both cases.
    """
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    try:
        return df.to_markdown(index=False)
    except Exception:
        # Best-effort fallback (e.g. the optional 'tabulate' dependency that
        # to_markdown needs is missing). Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate.
        return df.to_string(index=False)