diff --git a/docs/CN/source/index.rst b/docs/CN/source/index.rst
index 348ec0238..b97b2c759 100755
--- a/docs/CN/source/index.rst
+++ b/docs/CN/source/index.rst
@@ -53,6 +53,8 @@ Lightllm 整合了众多的开源方案的优点,包括但不限于 FasterTran
    多模态部署
    奖励模型部署
    OpenAI 接口使用
+   工具调用(Function Calling)
+   思考解析(Reasoning Parser)
    APIServer 参数详解
    lightllm api介绍
diff --git a/docs/CN/source/tutorial/function_calling.rst b/docs/CN/source/tutorial/function_calling.rst
new file mode 100644
index 000000000..afa588572
--- /dev/null
+++ b/docs/CN/source/tutorial/function_calling.rst
@@ -0,0 +1,272 @@
+.. _function_calling:
+
+工具调用(Function Calling)
+============================
+
+LightLLM 支持多种主流模型的工具调用功能,提供 OpenAI 兼容的 API。
+
+支持的模型
+----------
+
+Qwen2.5/Qwen3
+~~~~~~~~~~~~~
+
+**解析器**: ``qwen25``
+
+**格式**:
+
+.. code-block:: xml
+
+    <tool_call>
+    {"name": "function_name", "arguments": {"param": "value"}}
+    </tool_call>
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/qwen2.5 \
+        --tool_call_parser qwen25 \
+        --tp 1
+
+Llama 3.2
+~~~~~~~~~
+
+**解析器**: ``llama3``
+
+**格式**: ``<|python_tag|>{"name": "func", "arguments": {...}}``
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/llama-3.2 \
+        --tool_call_parser llama3 \
+        --tp 1
+
+Mistral
+~~~~~~~
+
+**解析器**: ``mistral``
+
+**格式**: ``[TOOL_CALLS] [{"name": "func", "arguments": {...}}, ...]``
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**解析器**: ``deepseekv3``
+
+**格式**:
+
+.. code-block:: xml
+
+    <|tool▁calls▁begin|>
+    <|tool▁call▁begin|>function<|tool▁sep|>func_name
+    ```json
+    {"param": "value"}
+    ```
+    <|tool▁call▁end|>
+    <|tool▁calls▁end|>
+
+DeepSeek-V3.1
+~~~~~~~~~~~~~
+
+**解析器**: ``deepseekv31``
+
+**格式**: 简化的 V3 格式,参数直接内联,无代码块包围
+
+基本使用
+--------
+
+定义工具
+~~~~~~~~
+
+.. code-block:: python
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "获取指定城市的天气信息",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description": "城市名称"
+                        }
+                    },
+                    "required": ["city"]
+                }
+            }
+        }
+    ]
+
+非流式调用
+~~~~~~~~~~
+
+.. code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "model_name",
+        "messages": [
+            {"role": "user", "content": "北京今天天气怎么样?"}
+        ],
+        "tools": tools,
+        "tool_choice": "auto"  # "auto" | "none" | "required"
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    if message.get("tool_calls"):
+        for tc in message["tool_calls"]:
+            print(f"工具: {tc['function']['name']}")
+            print(f"参数: {tc['function']['arguments']}")
+
+流式调用
+~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "model_name",
+        "messages": [{"role": "user", "content": "查询北京和上海的天气"}],
+        "tools": tools,
+        "stream": True
+    }
+
+    response = requests.post(url, json=data, stream=True)
+    tool_calls = {}
+
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: "):
+            data_str = line[6:].decode("utf-8")
+            if data_str == "[DONE]":
+                break
+
+            chunk = json.loads(data_str)
+            delta = chunk["choices"][0]["delta"]
+
+            if delta.get("tool_calls"):
+                for tc in delta["tool_calls"]:
+                    idx = tc.get("index", 0)
+                    if idx not in tool_calls:
+                        tool_calls[idx] = {"function": {"name": "", "arguments": ""}}
+
+                    if tc["function"].get("name"):
+                        tool_calls[idx]["function"]["name"] = tc["function"]["name"]
+                    if tc["function"].get("arguments"):
+                        tool_calls[idx]["function"]["arguments"] += tc["function"]["arguments"]
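+
+流式结束后,``tool_calls`` 中已经按 ``index`` 组装出完整的调用,可以直接分发给本地函数执行。下面是一个简单的分发示例(其中 ``get_weather`` 的本地实现和 ``AVAILABLE_TOOLS`` 注册表均为演示虚构):
+
+.. code-block:: python
+
+    # 本地工具实现(仅为演示虚构)
+    def get_weather(city):
+        return {"city": city, "temperature": 15, "condition": "晴朗"}
+
+    AVAILABLE_TOOLS = {"get_weather": get_weather}
+
+    # 依次执行组装好的工具调用
+    for idx in sorted(tool_calls):
+        func = tool_calls[idx]["function"]
+        args = json.loads(func["arguments"])  # arguments 是 JSON 字符串
+        result = AVAILABLE_TOOLS[func["name"]](**args)
+        print(f"{func['name']} -> {result}")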
+
+多轮对话
+~~~~~~~~
+
+.. code-block:: python
+
+    # 1. 用户提问
+    messages = [{"role": "user", "content": "北京天气如何?"}]
+
+    # 2. 模型调用工具
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "tools": tools
+    }).json()
+
+    tool_call = response1["choices"][0]["message"]["tool_calls"][0]
+    messages.append(response1["choices"][0]["message"])
+
+    # 3. 返回工具结果
+    weather_result = {"temperature": 15, "condition": "晴朗"}
+    messages.append({
+        "role": "tool",
+        "tool_call_id": tool_call["id"],
+        "name": tool_call["function"]["name"],
+        "content": json.dumps(weather_result, ensure_ascii=False)
+    })
+
+    # 4. 生成最终回答
+    response2 = requests.post(url, json={"messages": messages}).json()
+    print(response2["choices"][0]["message"]["content"])
+
+高级功能
+--------
+
+并行工具调用
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "messages": messages,
+        "tools": tools,
+        "parallel_tool_calls": True  # 启用并行调用
+    }
+
+强制调用特定工具
+~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "tools": tools,
+        "tool_choice": {
+            "type": "function",
+            "function": {"name": "get_weather"}
+        }
+    }
+
+与推理模型集成
+~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "tools": tools,
+        "chat_template_kwargs": {"enable_thinking": True},
+        "separate_reasoning": True  # 分离推理内容
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("推理:", message.get("reasoning_content"))
+    print("工具调用:", message.get("tool_calls"))
+
+常见问题
+--------
+
+**工具调用未触发**
+    检查 ``--tool_call_parser`` 参数和工具描述是否清晰
+
+**参数解析错误**
+    确认使用了正确的解析器,检查模型输出格式
+
+**流式模式不完整**
+    正确处理所有 chunks,使用 ``index`` 字段组装多个工具调用
+
+**与推理模型集成失败**
+    确保使用最新版本,正确配置 ``separate_reasoning`` 和 ``chat_template_kwargs``
+
+技术细节
+--------
+
+**核心文件**:
+
+- ``lightllm/server/function_call_parser.py`` - 解析器实现
+- ``lightllm/server/api_openai.py`` - API 集成
+- ``lightllm/server/build_prompt.py`` - 工具注入
+- ``test/test_api/test_openai_api.py`` - 测试示例
+
+**相关 PR**:
+
+- PR #1158: 支持推理内容中的函数调用
+
+参考资料
+--------
+
+- OpenAI Function Calling: https://platform.openai.com/docs/guides/function-calling
+- JSON Schema: https://json-schema.org/
+- LightLLM GitHub: https://github.com/ModelTC/lightllm
diff --git a/docs/CN/source/tutorial/reasoning_parser.rst b/docs/CN/source/tutorial/reasoning_parser.rst
new file mode 100644
index 000000000..547eb05d1
--- /dev/null
+++ b/docs/CN/source/tutorial/reasoning_parser.rst
@@ -0,0 +1,320 @@
+.. _reasoning_parser:
+
+思考解析(Reasoning Parser)
+=============================
+
+LightLLM 支持解析推理模型的思考过程,将模型的内部推理与最终答案分离,提高 AI 系统的透明度。
+
+支持的模型
+----------
+
+DeepSeek-R1
+~~~~~~~~~~~
+
+**解析器**: ``deepseek-r1``
+
+**格式**:
+
+.. code-block:: text
+
+    <think>
+    推理过程...
+    </think>
+    最终答案
+
+**特点**: 强制推理模式,部分变体可能省略 ``<think>`` 起始标签
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-R1 \
+        --reasoning_parser deepseek-r1 \
+        --tp 8 \
+        --enable_fa3
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**解析器**: ``deepseek-v3``
+
+**格式**: 与 Qwen3 相同
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-V3 \
+        --reasoning_parser deepseek-v3 \
+        --tp 8
+
+**请求配置**:
+
+.. code-block:: python
+
+    data = {
+        "chat_template_kwargs": {"thinking": True}  # 启用推理
+    }
+
+Qwen3
+~~~~~
+
+**解析器**: ``qwen3``
+
+**格式**: ``<think>推理内容</think>回答``
+
+**特点**: 可选推理模式,支持动态切换
+
+.. code-block:: python
+
+    # 启用推理
+    data = {"chat_template_kwargs": {"enable_thinking": True}}
+
+GPT-OSS
+~~~~~~~
+
+**解析器**: ``gpt-oss``
+
+**格式**:
+
+.. code-block:: xml
+
+    <|start|><|channel|>analysis<|message|>
+    推理分析...
+    <|end|>
+    <|channel|>final<|message|>
+    最终回答
+    <|return|>
+
+**特点**: 复杂状态机解析,支持多通道(analysis, commentary, final)
+
+基本使用
+--------
+
+非流式
+~~~~~~
+
+.. code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "deepseek-r1",
+        "messages": [
+            {"role": "user", "content": "单词 'strawberry' 中有多少个字母 'r'?"}
+        ],
+        "max_tokens": 2000,
+        "separate_reasoning": True,  # 分离推理内容
+        "chat_template_kwargs": {"enable_thinking": True}
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("推理:", message.get("reasoning_content"))
+    print("答案:", message.get("content"))
+
+流式
+~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "messages": [{"role": "user", "content": "解释量子纠缠"}],
+        "stream": True,
+        "separate_reasoning": True,
+        "stream_reasoning": True  # 实时流式传输推理内容
+    }
+
+    response = requests.post(url, json=data, stream=True)
+
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: "):
+            data_str = line[6:].decode('utf-8')
+            if data_str == '[DONE]':
+                break
+
+            chunk = json.loads(data_str)
+            delta = chunk["choices"][0]["delta"]
+
+            # 推理内容
+            if "reasoning_content" in delta:
+                print(delta["reasoning_content"], end="", flush=True)
+
+            # 答案内容
+            if "content" in delta:
+                print(delta["content"], end="", flush=True)
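+
+流式输出时,可以在收到第一个 ``content`` 片段时打印一个分隔符,在界面上把推理阶段和答案阶段分开。下面是对上面循环的一个小改动(仅为演示写法):
+
+.. code-block:: python
+
+    in_reasoning = True  # 当前是否仍处于推理阶段
+
+    for line in response.iter_lines():
+        if not (line and line.startswith(b"data: ")):
+            continue
+        data_str = line[6:].decode('utf-8')
+        if data_str == '[DONE]':
+            break
+
+        delta = json.loads(data_str)["choices"][0]["delta"]
+        if delta.get("reasoning_content"):
+            print(delta["reasoning_content"], end="", flush=True)
+        if delta.get("content"):
+            if in_reasoning:
+                print("\n--- 答案 ---")  # 首个答案片段,推理阶段结束
+                in_reasoning = False
+            print(delta["content"], end="", flush=True)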
+
+响应格式
+--------
+
+**非流式**:
+
+.. code-block:: json
+
+    {
+        "choices": [{
+            "message": {
+                "content": "最终答案",
+                "reasoning_content": "推理过程"
+            }
+        }]
+    }
+
+**流式**:
+
+.. code-block:: json
+
+    // 推理块
+    {"choices": [{"delta": {"reasoning_content": "推理片段"}}]}
+
+    // 答案块
+    {"choices": [{"delta": {"content": "答案片段"}}]}
+
+高级功能
+--------
+
+动态切换推理模式
+~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # 启用推理
+    data = {
+        "chat_template_kwargs": {"enable_thinking": True},
+        "separate_reasoning": True
+    }
+
+    # 禁用推理
+    data = {
+        "chat_template_kwargs": {"enable_thinking": False}
+    }
+
+控制推理显示
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # 隐藏推理流式传输
+    data = {
+        "separate_reasoning": True,
+        "stream_reasoning": False  # reasoning_content 字段仍存在
+    }
+
+    # 合并推理和答案
+    data = {
+        "separate_reasoning": False  # 推理和答案合并在 content 中
+    }
+
+与工具调用集成
+~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "tools": tools,
+        "tool_choice": "auto",
+        "separate_reasoning": True,
+        "chat_template_kwargs": {"enable_thinking": True}
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    # 同时获得推理、工具调用和答案
+    print("推理:", message.get("reasoning_content"))
+    print("工具:", message.get("tool_calls"))
+    print("答案:", message.get("content"))
+
+多轮推理对话
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    messages = [{"role": "user", "content": "什么是质数?"}]
+
+    # 第一轮
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "separate_reasoning": True
+    }).json()
+
+    message1 = response1["choices"][0]["message"]
+    messages.append({
+        "role": "assistant",
+        "content": message1["content"],
+        "reasoning_content": message1.get("reasoning_content")
+    })
+
+    # 第二轮
+    messages.append({"role": "user", "content": "17 是质数吗?"})
+    response2 = requests.post(url, json={
+        "messages": messages,
+        "separate_reasoning": True
+    }).json()
+
+配置参数
+--------
+
+**separate_reasoning** (布尔, 默认 True)
+    是否分离推理内容到 ``reasoning_content`` 字段
+
+**stream_reasoning** (布尔, 默认 False)
+    是否实时流式传输推理内容
+
+**chat_template_kwargs** (对象)
+    - ``enable_thinking``: 启用推理(Qwen3, GLM45)
+    - ``thinking``: 启用推理(DeepSeek-V3)
+
+**--reasoning_parser** (启动参数)
+    指定解析器类型:``deepseek-r1``, ``qwen3``, ``glm45``, ``gpt-oss`` 等
+
+常见问题
+--------
+
+**推理内容未分离**
+    检查 ``--reasoning_parser``, ``separate_reasoning: true``, ``chat_template_kwargs``
+
+**模型不生成推理**
+    确认模型支持推理模式,检查是否启用了推理参数
+
+**流式模式不完整**
+    处理所有 chunks,等待 ``[DONE]`` 信号
+
+**与工具调用冲突**
+    使用最新版本(包含 PR #1158),正确配置参数
+
+性能考虑
+--------
+
+**Token 消耗**: 推理模式可能增加 3-5 倍 token 消耗
+
+**延迟影响**: TTFB 可能从 200ms 增加到 800ms
+
+**优化建议**:
+
+- 使用 ``stream_reasoning: true`` 降低感知延迟
+- 非关键任务禁用推理模式
+
+技术细节
+--------
+
+**核心文件**:
+
+- ``lightllm/server/reasoning_parser.py`` - 解析器实现
+- ``lightllm/server/api_openai.py`` - API 集成
+- ``test/test_api/test_openai_api.py`` - 测试示例
+
+**相关 PR**:
+
+- PR #1154: 添加推理解析器
+- PR #1158: 推理内容中的函数调用支持
+
+参考资料
+--------
+
+- DeepSeek-R1 技术报告
+- LightLLM GitHub: https://github.com/ModelTC/lightllm
diff --git a/docs/EN/source/index.rst b/docs/EN/source/index.rst
index 9aa65038c..07eaaa42e 100755
--- a/docs/EN/source/index.rst
+++ b/docs/EN/source/index.rst
@@ -52,6 +52,8 @@ Documentation List
    Multimodal Deployment
    Reward Model Deployment
    OpenAI api Usage
+   Function Calling
+   Reasoning Parser
    APIServer Parameters
    Lightllm API Introduction
diff --git a/docs/EN/source/tutorial/function_calling.rst b/docs/EN/source/tutorial/function_calling.rst
new file mode 100644
index 000000000..d038eb902
--- /dev/null
+++ b/docs/EN/source/tutorial/function_calling.rst
@@ -0,0 +1,272 @@
+.. _function_calling:
+
+Function Calling
+================
+
+LightLLM supports function calling for multiple mainstream models and provides an OpenAI-compatible API.
+
+Supported Models
+----------------
+
+Qwen2.5/Qwen3
+~~~~~~~~~~~~~
+
+**Parser**: ``qwen25``
+
+**Format**:
+
+.. code-block:: xml
+
+    <tool_call>
+    {"name": "function_name", "arguments": {"param": "value"}}
+    </tool_call>
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/qwen2.5 \
+        --tool_call_parser qwen25 \
+        --tp 1
+
+Llama 3.2
+~~~~~~~~~
+
+**Parser**: ``llama3``
+
+**Format**: ``<|python_tag|>{"name": "func", "arguments": {...}}``
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/llama-3.2 \
+        --tool_call_parser llama3 \
+        --tp 1
+
+Mistral
+~~~~~~~
+
+**Parser**: ``mistral``
+
+**Format**: ``[TOOL_CALLS] [{"name": "func", "arguments": {...}}, ...]``
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**Parser**: ``deepseekv3``
+
+**Format**:
+
+.. code-block:: xml
+
+    <|tool▁calls▁begin|>
+    <|tool▁call▁begin|>function<|tool▁sep|>func_name
+    ```json
+    {"param": "value"}
+    ```
+    <|tool▁call▁end|>
+    <|tool▁calls▁end|>
+
+DeepSeek-V3.1
+~~~~~~~~~~~~~
+
+**Parser**: ``deepseekv31``
+
+**Format**: Simplified V3 format, parameters directly inlined without code blocks
+
+Basic Usage
+-----------
+
+Define Tools
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get weather information for a city",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description": "City name"
+                        }
+                    },
+                    "required": ["city"]
+                }
+            }
+        }
+    ]
+
+Non-Streaming
+~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "model_name",
+        "messages": [
+            {"role": "user", "content": "What's the weather in Beijing?"}
+        ],
+        "tools": tools,
+        "tool_choice": "auto"  # "auto" | "none" | "required"
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    if message.get("tool_calls"):
+        for tc in message["tool_calls"]:
+            print(f"Tool: {tc['function']['name']}")
+            print(f"Args: {tc['function']['arguments']}")
+
+Streaming
+~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "model_name",
+        "messages": [{"role": "user", "content": "Check weather for Beijing and Shanghai"}],
+        "tools": tools,
+        "stream": True
+    }
+
+    response = requests.post(url, json=data, stream=True)
+    tool_calls = {}
+
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: "):
+            data_str = line[6:].decode("utf-8")
+            if data_str == "[DONE]":
+                break
+
+            chunk = json.loads(data_str)
+            delta = chunk["choices"][0]["delta"]
+
+            if delta.get("tool_calls"):
+                for tc in delta["tool_calls"]:
+                    idx = tc.get("index", 0)
+                    if idx not in tool_calls:
+                        tool_calls[idx] = {"function": {"name": "", "arguments": ""}}
+
+                    if tc["function"].get("name"):
+                        tool_calls[idx]["function"]["name"] = tc["function"]["name"]
+                    if tc["function"].get("arguments"):
+                        tool_calls[idx]["function"]["arguments"] += tc["function"]["arguments"]
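+
+Once streaming finishes, the calls assembled in ``tool_calls`` can be dispatched to local functions. Below is a minimal dispatch sketch (the local ``get_weather`` implementation and the ``AVAILABLE_TOOLS`` registry are invented for this demo):
+
+.. code-block:: python
+
+    # Local tool implementation (demo only)
+    def get_weather(city):
+        return {"city": city, "temperature": 15, "condition": "sunny"}
+
+    AVAILABLE_TOOLS = {"get_weather": get_weather}
+
+    # Execute the assembled tool calls in order
+    for idx in sorted(tool_calls):
+        func = tool_calls[idx]["function"]
+        args = json.loads(func["arguments"])  # arguments is a JSON string
+        result = AVAILABLE_TOOLS[func["name"]](**args)
+        print(f"{func['name']} -> {result}")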
+
+Multi-Turn Conversation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # 1. User question
+    messages = [{"role": "user", "content": "How's the weather in Beijing?"}]
+
+    # 2. Model calls tool
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "tools": tools
+    }).json()
+
+    tool_call = response1["choices"][0]["message"]["tool_calls"][0]
+    messages.append(response1["choices"][0]["message"])
+
+    # 3. Return tool result
+    weather_result = {"temperature": 15, "condition": "sunny"}
+    messages.append({
+        "role": "tool",
+        "tool_call_id": tool_call["id"],
+        "name": tool_call["function"]["name"],
+        "content": json.dumps(weather_result)
+    })
+
+    # 4. Generate final answer
+    response2 = requests.post(url, json={"messages": messages}).json()
+    print(response2["choices"][0]["message"]["content"])
+
+Advanced Features
+-----------------
+
+Parallel Tool Calls
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "messages": messages,
+        "tools": tools,
+        "parallel_tool_calls": True  # Enable parallel calls
+    }
+
+Force Specific Tool
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "tools": tools,
+        "tool_choice": {
+            "type": "function",
+            "function": {"name": "get_weather"}
+        }
+    }
+
+Integration with Reasoning Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "tools": tools,
+        "chat_template_kwargs": {"enable_thinking": True},
+        "separate_reasoning": True  # Separate reasoning content
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("Reasoning:", message.get("reasoning_content"))
+    print("Tool calls:", message.get("tool_calls"))
+
+Common Issues
+-------------
+
+**Tool calls not triggered**
+    Check the ``--tool_call_parser`` argument and make sure the tool descriptions are clear
+
+**Parameter parsing errors**
+    Confirm the correct parser is used, check the model output format
+
+**Incomplete streaming**
+    Process all chunks correctly, use the ``index`` field to assemble multiple calls
+
+**Integration with reasoning models fails**
+    Use the latest version, configure ``separate_reasoning`` and ``chat_template_kwargs``
+
+Technical Details
+-----------------
+
+**Core Files**:
+
+- ``lightllm/server/function_call_parser.py`` - Parser implementation
+- ``lightllm/server/api_openai.py`` - API integration
+- ``lightllm/server/build_prompt.py`` - Tool injection
+- ``test/test_api/test_openai_api.py`` - Test examples
+
+**Related PRs**:
+
+- PR #1158: Function call in reasoning content support
+
+References
+----------
+
+- OpenAI Function Calling: https://platform.openai.com/docs/guides/function-calling
+- JSON Schema: https://json-schema.org/
+- LightLLM GitHub: https://github.com/ModelTC/lightllm
diff --git a/docs/EN/source/tutorial/reasoning_parser.rst b/docs/EN/source/tutorial/reasoning_parser.rst
new file mode 100644
index 000000000..e76e093d6
--- /dev/null
+++ b/docs/EN/source/tutorial/reasoning_parser.rst
@@ -0,0 +1,320 @@
+.. _reasoning_parser:
+
+Reasoning Parser
+================
+
+LightLLM supports parsing a reasoning model's thinking process, separating internal reasoning from the final answer to improve the transparency of AI systems.
+
+Supported Models
+----------------
+
+DeepSeek-R1
+~~~~~~~~~~~
+
+**Parser**: ``deepseek-r1``
+
+**Format**:
+
+.. code-block:: text
+
+    <think>
+    Reasoning process...
+    </think>
+    Final answer
+
+**Features**: Forced reasoning mode; some variants may omit the ``<think>`` opening tag
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-R1 \
+        --reasoning_parser deepseek-r1 \
+        --tp 8 \
+        --enable_fa3
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**Parser**: ``deepseek-v3``
+
+**Format**: Same as Qwen3
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-V3 \
+        --reasoning_parser deepseek-v3 \
+        --tp 8
+
+**Request Config**:
+
+.. code-block:: python
+
+    data = {
+        "chat_template_kwargs": {"thinking": True}  # Enable reasoning
+    }
+
+Qwen3
+~~~~~
+
+**Parser**: ``qwen3``
+
+**Format**: ``<think>Reasoning content</think>Answer``
+
+**Features**: Optional reasoning mode, supports dynamic switching
+
+.. code-block:: python
+
+    # Enable reasoning
+    data = {"chat_template_kwargs": {"enable_thinking": True}}
+
+GPT-OSS
+~~~~~~~
+
+**Parser**: ``gpt-oss``
+
+**Format**:
+
+.. code-block:: xml
+
+    <|start|><|channel|>analysis<|message|>
+    Reasoning analysis...
+    <|end|>
+    <|channel|>final<|message|>
+    Final answer
+    <|return|>
+
+**Features**: Complex state machine parsing, supports multiple channels (analysis, commentary, final)
+
+Basic Usage
+-----------
+
+Non-Streaming
+~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "deepseek-r1",
+        "messages": [
+            {"role": "user", "content": "How many 'r's in 'strawberry'?"}
+        ],
+        "max_tokens": 2000,
+        "separate_reasoning": True,  # Separate reasoning content
+        "chat_template_kwargs": {"enable_thinking": True}
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("Reasoning:", message.get("reasoning_content"))
+    print("Answer:", message.get("content"))
+
+Streaming
+~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "messages": [{"role": "user", "content": "Explain quantum entanglement"}],
+        "stream": True,
+        "separate_reasoning": True,
+        "stream_reasoning": True  # Stream reasoning content in real-time
+    }
+
+    response = requests.post(url, json=data, stream=True)
+
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: "):
+            data_str = line[6:].decode('utf-8')
+            if data_str == '[DONE]':
+                break
+
+            chunk = json.loads(data_str)
+            delta = chunk["choices"][0]["delta"]
+
+            # Reasoning content
+            if "reasoning_content" in delta:
+                print(delta["reasoning_content"], end="", flush=True)
+
+            # Answer content
+            if "content" in delta:
+                print(delta["content"], end="", flush=True)
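+
+When streaming, you can print a separator on the first ``content`` chunk to visually split the reasoning phase from the answer phase. A small variation of the loop above (demo only):
+
+.. code-block:: python
+
+    in_reasoning = True  # still in the reasoning phase
+
+    for line in response.iter_lines():
+        if not (line and line.startswith(b"data: ")):
+            continue
+        data_str = line[6:].decode('utf-8')
+        if data_str == '[DONE]':
+            break
+
+        delta = json.loads(data_str)["choices"][0]["delta"]
+        if delta.get("reasoning_content"):
+            print(delta["reasoning_content"], end="", flush=True)
+        if delta.get("content"):
+            if in_reasoning:
+                print("\n--- Answer ---")  # first answer chunk, reasoning is done
+                in_reasoning = False
+            print(delta["content"], end="", flush=True)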
+
+Response Format
+---------------
+
+**Non-Streaming**:
+
+.. code-block:: json
+
+    {
+        "choices": [{
+            "message": {
+                "content": "Final answer",
+                "reasoning_content": "Reasoning process"
+            }
+        }]
+    }
+
+**Streaming**:
+
+.. code-block:: json
+
+    // Reasoning chunk
+    {"choices": [{"delta": {"reasoning_content": "Reasoning fragment"}}]}
+
+    // Answer chunk
+    {"choices": [{"delta": {"content": "Answer fragment"}}]}
+
+Advanced Features
+-----------------
+
+Dynamic Reasoning Mode
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # Enable reasoning
+    data = {
+        "chat_template_kwargs": {"enable_thinking": True},
+        "separate_reasoning": True
+    }
+
+    # Disable reasoning
+    data = {
+        "chat_template_kwargs": {"enable_thinking": False}
+    }
+
+Control Reasoning Display
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # Hide reasoning streaming
+    data = {
+        "separate_reasoning": True,
+        "stream_reasoning": False  # reasoning_content field still exists
+    }
+
+    # Merge reasoning and answer
+    data = {
+        "separate_reasoning": False  # Merged in content field
+    }
+
+Integration with Tool Calling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "tools": tools,
+        "tool_choice": "auto",
+        "separate_reasoning": True,
+        "chat_template_kwargs": {"enable_thinking": True}
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    # Get reasoning, tool calls, and answer simultaneously
+    print("Reasoning:", message.get("reasoning_content"))
+    print("Tools:", message.get("tool_calls"))
+    print("Answer:", message.get("content"))
+
+Multi-Turn Reasoning
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    messages = [{"role": "user", "content": "What is a prime number?"}]
+
+    # First turn
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "separate_reasoning": True
+    }).json()
+
+    message1 = response1["choices"][0]["message"]
+    messages.append({
+        "role": "assistant",
+        "content": message1["content"],
+        "reasoning_content": message1.get("reasoning_content")
+    })
+
+    # Second turn
+    messages.append({"role": "user", "content": "Is 17 a prime number?"})
+    response2 = requests.post(url, json={
+        "messages": messages,
+        "separate_reasoning": True
+    }).json()
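+
+The example above sends ``reasoning_content`` back as part of the assistant message. Since reasoning can be several times longer than the answer, you may prefer to drop it from the history before resending; below is a minimal helper for that (``strip_reasoning`` is a name invented for this sketch, and whether the field is used in later turns at all depends on the model's chat template):
+
+.. code-block:: python
+
+    def strip_reasoning(messages):
+        """Return a copy of the history without reasoning_content fields."""
+        return [
+            {k: v for k, v in m.items() if k != "reasoning_content"}
+            for m in messages
+        ]
+
+    # Second turn, resending only roles and contents
+    response2 = requests.post(url, json={
+        "messages": strip_reasoning(messages),
+        "separate_reasoning": True
+    }).json()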
+
+Configuration
+-------------
+
+**separate_reasoning** (bool, default: True)
+    Whether to separate reasoning content into the ``reasoning_content`` field
+
+**stream_reasoning** (bool, default: False)
+    Whether to stream reasoning content in real-time
+
+**chat_template_kwargs** (object)
+    - ``enable_thinking``: Enable reasoning (Qwen3, GLM45)
+    - ``thinking``: Enable reasoning (DeepSeek-V3)
+
+**--reasoning_parser** (startup parameter)
+    Specify the parser type: ``deepseek-r1``, ``qwen3``, ``glm45``, ``gpt-oss``, etc.
+
+Common Issues
+-------------
+
+**Reasoning content not separated**
+    Check ``--reasoning_parser``, ``separate_reasoning: true``, and ``chat_template_kwargs``
+
+**Model not generating reasoning**
+    Confirm the model supports reasoning mode, check whether the reasoning parameters are enabled
+
+**Incomplete streaming**
+    Process all chunks, wait for the ``[DONE]`` signal
+
+**Conflict with tool calling**
+    Use the latest version (includes PR #1158), configure parameters correctly
+
+Performance
+-----------
+
+**Token Consumption**: Reasoning mode may increase token usage by 3-5x
+
+**Latency Impact**: TTFB may increase from 200ms to 800ms
+
+**Optimization**:
+
+- Use ``stream_reasoning: true`` to reduce perceived latency
+- Disable reasoning mode for non-critical tasks
+
+Technical Details
+-----------------
+
+**Core Files**:
+
+- ``lightllm/server/reasoning_parser.py`` - Parser implementation
+- ``lightllm/server/api_openai.py`` - API integration
+- ``test/test_api/test_openai_api.py`` - Test examples
+
+**Related PRs**:
+
+- PR #1154: Add reasoning parser
+- PR #1158: Function call in reasoning content support
+
+References
+----------
+
+- DeepSeek-R1 Technical Report
+- LightLLM GitHub: https://github.com/ModelTC/lightllm