import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
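# `spaces` is the Hugging Face Spaces helper package; the @spaces.GPU
# decorator below requests a (Zero)GPU for the duration of each decorated call.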
MODEL_ID = "GhostScientist/qwen25-coder-1.5b-codealpaca-sft"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
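# MODEL_ID appears to be a LoRA/PEFT adapter (it is loaded with PeftModel
# below), fine-tuned on CodeAlpaca-style data on top of the Qwen2.5 Coder
# 1.5B Instruct base model, which also supplies the tokenizer.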
# Load tokenizer at startup (CPU)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Global model variable - will be loaded on first GPU call
model = None
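# On ZeroGPU Spaces a GPU is only attached inside @spaces.GPU-decorated
# functions, so the model is loaded lazily on the first request rather
# than at import time.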
def load_model():
    """Load the base model and merge the fine-tuned adapter into it."""
    global model
    if model is None:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(base_model, MODEL_ID)
        model = model.merge_and_unload()
    return model
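# merge_and_unload() folds the LoRA weights into the base model's linear
# layers, so generation runs on a plain transformers model without PEFT
# overhead. To skip re-merging on every cold start, one untested option is
# to persist the merged weights once and load those directly:
#
#     model.save_pretrained("merged-model")      # hypothetical local path
#     tokenizer.save_pretrained("merged-model")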
@spaces.GPU(duration=120)
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """Generate response using the fine-tuned Qwen coder model."""
    # Load model on GPU
    model = load_model()
messages = [{"role": "system", "content": system_message}]
for item in history:
if isinstance(item, (list, tuple)) and len(item) == 2:
user_msg, assistant_msg = item
if user_msg:
messages.append({"role": "user", "content": user_msg})
if assistant_msg:
messages.append({"role": "assistant", "content": assistant_msg})
messages.append({"role": "user", "content": message})
    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
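    # For Qwen2.5 the rendered prompt is ChatML-style, roughly:
    #   <|im_start|>system\n{system_message}<|im_end|>
    #   <|im_start|>user\n{message}<|im_end|>
    #   <|im_start|>assistant\n
    # add_generation_prompt=True appends that trailing assistant header so
    # the model continues as the assistant.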
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response
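# A minimal smoke test of the handler, bypassing Gradio (hypothetical call;
# on a ZeroGPU Space the GPU is still only attached via the decorator):
#
#     print(generate_response(
#         "Write a Python function to check if a number is prime",
#         [], SYSTEM_PROMPT, 256, 0.7, 0.95,
#     ))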
SYSTEM_PROMPT = """You are an expert coding assistant. You help users write, debug, explain, and improve code.
You provide clear, concise, and accurate responses with well-formatted code examples when appropriate.
Always explain your reasoning and suggest best practices."""
EXAMPLES = [
    ["Write a Python function to check if a number is prime"],
    ["Explain the difference between a list and a tuple in Python"],
    ["How do I reverse a string in JavaScript?"],
    ["Write a SQL query to find duplicate records in a table"],
    ["Debug this code: def add(a, b): return a - b"],
]
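# Each example is a single-element list: ChatInterface fills only the
# message box and presumably leaves the additional inputs at their defaults.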
demo = gr.ChatInterface(
    fn=generate_response,
    title="Qwen 2.5 Coder Assistant",
    description="""A fine-tuned Qwen 2.5 Coder 1.5B model for code assistance.
Ask me to write code, explain concepts, debug issues, or help with any programming task!

**Model:** [GhostScientist/qwen25-coder-1.5b-codealpaca-sft](https://huggingface.co/GhostScientist/qwen25-coder-1.5b-codealpaca-sft)
""",
    additional_inputs=[
        gr.Textbox(
            value=SYSTEM_PROMPT,
            label="System Prompt",
            lines=3
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max Tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p"
        ),
    ],
    examples=EXAMPLES,
)
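# The additional_inputs above are passed positionally to generate_response
# after (message, history): system_message, max_tokens, temperature, top_p.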
if __name__ == "__main__":
    demo.launch()