import gradio as gr
from transformers import pipeline, TextIteratorStreamer
from threading import Thread
# Build the text-generation pipeline once at import time so every chat
# request reuses the same loaded SmolLM2-135M-Instruct model; device
# placement is delegated to the library through device_map="auto".
pipe = pipeline(
    task="text-generation",
    model="HuggingFaceTB/SmolLM2-135M-Instruct",
    device_map="auto",
)
def _content_to_text(content):
    """Return the plain text carried by a history entry's ``content`` field.

    ``content`` is either a plain string or a list of content blocks (dicts
    with a ``"text"`` key for textual parts). Blocks without text are skipped
    and the remaining text parts are joined with newlines.
    """
    if isinstance(content, str):
        return content
    return "\n".join(
        part["text"]
        for part in content
        if isinstance(part, dict) and "text" in part
    )


def chat(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The newest user message, added as the last ``"user"`` turn.
        history: Earlier turns as a list of dicts with ``"role"`` and
            ``"content"`` keys. ``"content"`` may be a plain string or a
            list of content blocks such as ``{"type": "text", "text": ...}``.

    Yields:
        The reply text accumulated so far; each yielded value extends the
        previous one with the newly generated chunk.

    Raises:
        Exception: Whatever the generation call raised in the worker thread
            is re-raised here once streaming has stopped.
    """
    # Convert history to the role/content format the pipeline expects. Any
    # role other than "user" is treated as an assistant turn.
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    messages.extend(
        {
            "role": "user" if turn["role"] == "user" else "assistant",
            "content": _content_to_text(turn["content"]),
        }
        for turn in history
    )
    # Add current message
    messages.append({"role": "user", "content": message})

    # skip_prompt=True keeps the rendered prompt (system + earlier turns) out
    # of the streamed text so only the newly generated reply is shown.
    streamer = TextIteratorStreamer(
        pipe.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    errors = []

    def _generate():
        # The conversation is the pipeline's first positional input; it is
        # not accepted under a ``messages=`` keyword.
        try:
            pipe(messages, max_new_tokens=256, streamer=streamer)
        except Exception as exc:  # thread boundary: handed to the consumer below
            errors.append(exc)
            # Push the stop signal so the consuming loop below cannot block
            # forever on a generation that has already failed.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    partial_response = ""
    for chunk in streamer:
        partial_response += chunk
        yield partial_response

    worker.join()
    if errors:
        raise errors[0]
# Chat UI whose message handler is the streaming `chat` generator.
demo = gr.ChatInterface(chat, title="SmolLM2 Chatbot")


if __name__ == "__main__":
    # Launch the app only when this file is executed as a script.
    demo.launch()