| import gradio as gr |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| import torch |
| from threading import Thread |
|
|
| |
# Checkpoint identifier shared by the tokenizer and model loaders below.
model_id = "LiquidAI/LFM2-700M"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load options: float32 weights, everything mapped to the CPU.
_load_options = {"dtype": torch.float32, "device_map": "cpu"}
model = AutoModelForCausalLM.from_pretrained(model_id, **_load_options)
print("Model loaded!")
|
|
def _content_to_text(content):
    """Reduce a history ``content`` value to plain text for the chat template.

    Strings pass through unchanged. A list is flattened by joining the
    ``"text"`` field of every dict part that carries one. Anything else
    yields an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): assumes list content is a sequence of typed parts such
        # as {"type": "text", "text": ...}; confirm against the installed
        # Gradio version. Parts without a string "text" field are dropped.
        return "\n".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_messages(history):
    """Convert Gradio history entries into ``{"role", "content"}`` dicts."""
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            # Keep only the keys the chat template needs, so extra entry keys
            # and non-text content never reach the tokenizer.
            role = entry.get("role")
            text = _content_to_text(entry.get("content"))
            if role and text:
                messages.append({"role": role, "content": text})
        elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
            # Pair-style entry: (user_text, assistant_text).
            messages.append({"role": "user", "content": entry[0]})
            if entry[1]:
                messages.append({"role": "assistant", "content": entry[1]})
    return messages


def chat(message, history):
    """Stream a model reply for one Gradio chat turn.

    Args:
        message: The latest user message.
        history: Earlier turns as passed in by Gradio, either role/content
            dicts or ``(user, assistant)`` pairs. May be empty or ``None``.

    Yields:
        The reply text accumulated so far; each yield extends the previous
        one with the newly decoded text.

    Raises:
        Exception: Any error raised by ``model.generate`` in the worker
            thread, re-raised here after the stream has been drained.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # return_dict=True makes the tokenizer return the attention mask together
    # with the token ids, so generate() does not have to infer the mask.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
        "streamer": streamer,
    }

    failures = []

    def _generate():
        # Runs in the worker thread. If generation fails, the streamer would
        # never receive its end-of-stream signal and the consumer loop below
        # would block forever, so close the stream explicitly and keep the
        # error for the consumer to re-raise.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    thread.join()
    if failures:
        raise failures[0]
|
|
| |
# Example prompts handed to the interface's `examples` option.
_example_prompts = ["Hello!", "Explain AI", "Write a Python function"]

# Chat UI backed by the streaming `chat` generator.
demo = gr.ChatInterface(
    fn=chat,
    examples=_example_prompts,
    title="LFM2-700M Chatbot (Streaming)",
    description="Chat with Liquid AI's LFM2-700M - balanced speed and quality",
)


# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|