import io
import os
import re
import zipfile

import streamlit as st
from openai import OpenAI

import doc_loader  # FIX 1: Import the new loader module (PDF/DOCX/PPTX/etc. text extraction)

# --- CONFIGURATION ---
st.set_page_config(page_title="Context Flattener", page_icon="📄", layout="centered")


# --- CORE LOGIC: THE OUTLINE PARSER ---
class OutlineProcessor:
    """
    Parses a raw text file into (context, target) pairs.

    Includes a pre-processing step to stitch multi-line items together,
    then uses leading-whitespace depth to reconstruct the outline hierarchy.
    """

    # Matches "1.", "a.", "A.", "-" or "*" list markers.
    # Compiled once at class level instead of on every line.
    _LIST_ITEM_RE = re.compile(r"^\s*(\d+\.|[a-zA-Z]\.|-|\*)\s+")

    def __init__(self, file_content):
        self.raw_lines = file_content.split('\n')

    def _is_list_item(self, line):
        """Return True when *line* starts with a recognised list marker."""
        return bool(self._LIST_ITEM_RE.match(line))

    def _merge_multiline_items(self):
        """Join wrapped continuation lines onto their preceding list item."""
        merged_lines = []
        for line in self.raw_lines:
            stripped = line.strip()
            if not stripped:
                continue  # drop blank lines entirely
            if not merged_lines:
                merged_lines.append(line)
                continue
            if self._is_list_item(line):
                merged_lines.append(line)
            else:
                # Not a new item: it is the continuation of the previous one.
                merged_lines[-1] = merged_lines[-1].rstrip() + " " + stripped
        return merged_lines

    def parse(self):
        """
        Return a list of dicts with 'context', 'target' and 'full_path' keys.

        'context' is the " > "-joined chain of ancestor items, or "ROOT"
        for top-level items. Hierarchy depth is inferred from indentation.
        """
        clean_lines = self._merge_multiline_items()
        stack = []    # ancestors of the current line: [{'indent': int, 'text': str}]
        results = []
        for line in clean_lines:
            stripped = line.strip()
            indent = len(line) - len(line.lstrip())
            # Pop siblings and deeper entries until the stack top is an ancestor.
            while stack and stack[-1]['indent'] >= indent:
                stack.pop()
            stack.append({'indent': indent, 'text': stripped})
            if len(stack) > 1:
                context_str = " > ".join(item['text'] for item in stack[:-1])
            else:
                context_str = "ROOT"
            results.append({
                "context": context_str,
                "target": stripped,
                "full_path": " > ".join(item['text'] for item in stack),
            })
        return results


# --- CORE LOGIC: THE LLM WRITER ---
def flatten_sentence(context, target, api_key, model="gpt-4o"):
    """
    Rewrite *target* into a single self-contained sentence using *context*.

    Returns the rewritten sentence, or a bracketed error string when the key
    is missing or the API call fails — callers embed the result directly in
    the output file, so this function never raises.
    """
    if not api_key:
        return "[ERROR: No API Key]"
    client = OpenAI(api_key=api_key)
    prompt = (
        "You are a Technical Data Specialist. "
        "Your task is to rewrite the 'Target' text into a single, self-contained sentence "
        "that explicitly incorporates information from the 'Context'.\n"
        "Rules:\n"
        "1. Do not use pronouns like 'it', 'this', or 'they'—use specific nouns from the context.\n"
        "2. The output must stand alone without needing the previous sentences.\n"
        "3. Do not add new facts, just merge the context.\n\n"
        f"### CONTEXT: {context}\n"
        f"### TARGET: {target}\n\n"
        "### FLATTENED STATEMENT:"
    )
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,   # deterministic rewrites
            max_tokens=300,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Surface API failures inline rather than aborting the whole batch.
        return f"[Error: {e}]"


# --- STREAMLIT UI ---
st.title("📄 Batch Context Flattener")
st.caption("Convert nested outlines (PDF, Word, PPT, Text) into RAG-optimized flat statements.")

# 1. Credentials & Settings
with st.sidebar:
    st.header("Settings")
    # FIX: fall back to a manual password input so the "enter an API Key in
    # the sidebar" error shown later is actually actionable when the
    # OPENAI_API_KEY environment variable is not set.
    api_key = os.getenv("OPENAI_API_KEY") or st.text_input("OpenAI API Key", type="password")
    model_choice = st.selectbox("Model", ["gpt-4o", "gpt-4-turbo", "gpt-3.5-turbo"])
    st.divider()
    st.subheader("Ingestion Mode")
    use_vision = st.toggle(
        "Enable Vision Mode",
        value=False,
        help="Uses GPT-4o to 'read' slides as images. Slower/Costlier, but captures tables and diagrams.",
    )

# 2. Upload
uploaded_files = st.file_uploader(
    "Upload Documents",
    type=["txt", "md", "pdf", "docx", "pptx", "xlsx", "csv"],
    accept_multiple_files=True,
)

if uploaded_files:
    total_files = len(uploaded_files)
    st.write(f"**Queued {total_files} files for processing.**")

    # 3. Preview Logic (updates based on the Vision toggle)
    first_file = uploaded_files[0]
    try:
        if use_vision and not api_key:
            st.warning("⚠️ Vision Mode selected but no API Key provided. Preview will fail.")
        else:
            # FIX: all preview processing lives inside this branch so we never
            # touch preview_text when extraction was skipped (NameError before).
            with st.spinner(f"Extracting Preview ({'Vision Mode' if use_vision else 'Fast Mode'})..."):
                # FIX: Pass the vision flag and api key here
                preview_text = doc_loader.extract_text_from_file(
                    first_file, use_vision=use_vision, api_key=api_key
                )
            # Reset pointer so the batch run re-reads the file from the start.
            first_file.seek(0)
            st.info(f"Preview (from {first_file.name}): Extracted {len(preview_text)} characters.")
            preview_items = OutlineProcessor(preview_text).parse()
            with st.expander("👁️ Preview Logic"):
                # Safety check for empty parse results
                if not preview_items:
                    st.warning("No structured items found. Vision mode output might need different formatting.")
                    st.text(preview_text[:500])  # Show raw text for debugging
                else:
                    for item in preview_items[:3]:
                        st.code(f"{item['context']} -> {item['target']}")
    except Exception as e:
        st.error(f"Preview failed: {e}")

    # 4. Process Batch Action
    if st.button(f"🚀 Flatten All {total_files} Files", type="primary"):
        if not api_key:
            st.error("Please enter an API Key in the sidebar.")
            st.stop()

        progress_bar = st.progress(0)
        status_text = st.empty()
        zip_buffer = io.BytesIO()

        with zipfile.ZipFile(zip_buffer, "w") as zf:
            for file_idx, uploaded_file in enumerate(uploaded_files):
                file_name = uploaded_file.name
                status_text.text(f"Processing {file_idx + 1}/{total_files}: {file_name}...")
                try:
                    # FIX: Pass vision/api_key here too
                    string_data = doc_loader.extract_text_from_file(
                        uploaded_file, use_vision=use_vision, api_key=api_key
                    )
                    # Parse
                    parsed_items = OutlineProcessor(string_data).parse()

                    # Flatten
                    file_output = []
                    if not parsed_items:
                        # Fallback if parsing fails: just flatten the whole chunk
                        file_output.append("[WARNING: No outline structure detected. Content below:]")
                        file_output.append(string_data)
                    else:
                        for item_idx, item in enumerate(parsed_items):
                            flat_text = flatten_sentence(item['context'], item['target'], api_key, model_choice)
                            file_output.append(flat_text)
                            # Granular progress bar: item fraction within file fraction.
                            total_progress = (file_idx + (item_idx / len(parsed_items))) / total_files
                            progress_bar.progress(min(total_progress, 1.0))

                    base_name = os.path.splitext(file_name)[0]
                    zf.writestr(f"{base_name}_flattened.txt", "\n".join(file_output))
                except Exception as e:
                    st.error(f"Error processing {file_name}: {e}")

        progress_bar.progress(1.0)
        status_text.text("✅ Batch Processing Complete!")
        st.divider()
        st.download_button(
            label="Download All Files (ZIP)",
            data=zip_buffer.getvalue(),
            file_name="flattened_batch.zip",
            mime="application/zip",
        )