Frontend Reasoning - SGLang

Launch A Server

Launch the server with a reasoning model (Qwen3-4B) and the matching reasoning parser.

python

from sglang import separate_reasoning, assistant_begin, assistant_end
from sglang import assistant, function, gen, system, user
from sglang import image
from sglang import RuntimeEndpoint, set_default_backend
from sglang.srt.utils import load_image
from sglang.test.test_utils import is_in_ci
from sglang.utils import print_highlight, terminate_process, wait_for_server

if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd


# Start the SGLang server as a subprocess; the helper hands back the process
# handle together with the port used in the URLs below.
launch_command = (
    "python3 -m sglang.launch_server --model-path Qwen/Qwen3-4B --reasoning-parser qwen3 --host 0.0.0.0"
)
server_process, port = launch_server_cmd(launch_command)

# Wait for the server before any request is sent to it.
server_url = f"http://localhost:{port}"
wait_for_server(server_url, process=server_process)
print(f"Server started on {server_url}")

Set the default backend. Note: you can set chat_template_name in RuntimeEndpoint.

python

# Build an endpoint for the local server on `port`, selecting the "qwen" chat
# template, and register it as the default backend.
endpoint = RuntimeEndpoint(f"http://localhost:{port}", chat_template_name="qwen")
set_default_backend(endpoint)

Let's start with a basic question-answering task and see how the reasoning content is generated.

python

@function
def basic_qa(s, question):
    """Ask a single question and generate one assistant reply.

    Args:
        s: The program state that the prompt parts are appended to.
        question: Text placed in the user turn.

    The generated text is stored in the state under the key "answer".
    """
    # Plain literal: the prompt has no placeholders, so no f-prefix is needed.
    s += system("You are a helpful assistant that can answer questions.")
    s += user(question)
    # The assistant turn is opened and closed explicitly around the generation.
    s += assistant_begin()
    s += gen("answer", max_tokens=512)
    s += assistant_end()


state = basic_qa("List 3 countries and their capitals.")
print_highlight(state["answer"])

With separate_reasoning, you can move the reasoning content to {param_name}_reasoning_content in the state.

python

@function
def basic_qa_separate_reasoning(s, question):
    """Ask a single question with the generation wrapped in `separate_reasoning`.

    Args:
        s: The program state that the prompt parts are appended to.
        question: Text placed in the user turn.

    The reply is read back from the state as "answer" and the reasoning as
    "answer_reasoning_content".
    """
    # Plain literal: the prompt has no placeholders, so no f-prefix is needed.
    s += system("You are a helpful assistant that can answer questions.")
    s += user(question)
    s += assistant_begin()
    s += separate_reasoning(gen("answer", max_tokens=512), model_type="qwen3")
    s += assistant_end()


reasoning_state = basic_qa_separate_reasoning("List 3 countries and their capitals.")

# Show which variables the run produced, then each piece in turn.
print_highlight(reasoning_state.stream_executor.variable_event.keys())
print_highlight(
    f"\nSeparated Reasoning Content:\n{reasoning_state['answer_reasoning_content']}"
)

print_highlight(f"\n\nContent:\n{reasoning_state['answer']}")
print_highlight(f"\n\nMessages:\n{reasoning_state.messages()[-1]}")

separate_reasoning can also be used in multi-turn conversations.

python

@function
def multi_turn_qa(s):
    """Run a two-turn conversation, wrapping each reply in `separate_reasoning`.

    Args:
        s: The program state that the prompt parts are appended to.

    Returns:
        The same state object `s`.

    The replies are read back as "first_answer" and "second_answer", and the
    reasoning as "<name>_reasoning_content" for each of them.
    """
    # Plain literal: the prompt has no placeholders, so no f-prefix is needed.
    s += system("You are a helpful assistant that can answer questions.")
    s += user("Please give me a list of 3 countries and their capitals.")
    s += assistant(
        separate_reasoning(gen("first_answer", max_tokens=512), model_type="qwen3")
    )
    s += user("Please give me another list of 3 countries and their capitals.")
    s += assistant(
        separate_reasoning(gen("second_answer", max_tokens=512), model_type="qwen3")
    )
    return s


reasoning_state = multi_turn_qa()

# Each variable is printed under its own name, in conversation order.
for variable_name in (
    "first_answer",
    "first_answer_reasoning_content",
    "second_answer",
    "second_answer_reasoning_content",
):
    print_highlight(f"\n\n{variable_name}:\n{reasoning_state[variable_name]}")

Using no-thinking mode, one of Qwen 3's advanced features

sglang separate_reasoning is particularly useful when combined with Qwen 3's advanced feature.

Qwen 3's advanced usages

python

# Reuse basic_qa_separate_reasoning, this time with "/no_think" appended to the prompt.
# NOTE(review): presumably this is Qwen 3's switch for skipping the thinking
# phase — confirm against the Qwen 3 documentation.
no_think_question = "List 3 countries and their capitals. /no_think"
reasoning_state = basic_qa_separate_reasoning(no_think_question)

for heading, variable_name in (
    ("Reasoning Content", "answer_reasoning_content"),
    ("Content", "answer"),
):
    print_highlight(f"{heading}:\n{reasoning_state[variable_name]}")

separate_reasoning can also be used with regular-expression-constrained generation.

python

@function
def regular_expression_gen(s):
    """Ask for an IP address and constrain the reply with an IPv4 regex.

    Args:
        s: The program state that the prompt parts are appended to.

    The reply is read back from the state as "answer" and the reasoning as
    "answer_reasoning_content".
    """
    s += user(
        "What is the IP address of the Google DNS servers? just provide the answer"
    )
    s += assistant(
        separate_reasoning(
            gen(
                "answer",
                temperature=0,
                # Three "octet." groups followed by a final octet. The dot is
                # escaped so that only a literal "." separates the octets; an
                # unescaped "." would accept any character there.
                regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
                max_tokens=512,
            ),
            model_type="qwen3",
        ),
    )


reasoning_state = regular_expression_gen()

python

# Print the constrained answer first, then the reasoning that was split from it.
for heading, variable_name in (
    ("Answer:\n", "answer"),
    ("\n\nReasoning Content:\n", "answer_reasoning_content"),
):
    print_highlight(f"{heading}{reasoning_state[variable_name]}")