From 52819205eb1ac0f5b3f0557ed3069af1c477e605 Mon Sep 17 00:00:00 2001
From: sufubao
Date: Tue, 30 Dec 2025 11:21:51 +0000
Subject: [PATCH 1/6] add function call and reasoning docs

---
 docs/CN/source/index.rst                     |   2 +
 docs/CN/source/tutorial/function_calling.rst | 287 ++++++++++++++++
 docs/CN/source/tutorial/reasoning_parser.rst | 342 +++++++++++++++++++
 docs/EN/source/index.rst                     |   2 +
 docs/EN/source/tutorial/function_calling.rst | 287 ++++++++++++++++
 docs/EN/source/tutorial/reasoning_parser.rst | 342 +++++++++++++++++++
 6 files changed, 1262 insertions(+)
 create mode 100644 docs/CN/source/tutorial/function_calling.rst
 create mode 100644 docs/CN/source/tutorial/reasoning_parser.rst
 create mode 100644 docs/EN/source/tutorial/function_calling.rst
 create mode 100644 docs/EN/source/tutorial/reasoning_parser.rst

diff --git a/docs/CN/source/index.rst b/docs/CN/source/index.rst
index 348ec0238..b97b2c759 100755
--- a/docs/CN/source/index.rst
+++ b/docs/CN/source/index.rst
@@ -53,6 +53,8 @@ Lightllm 整合了众多的开源方案的优点,包括但不限于 FasterTran
    多模态部署
    奖励模型部署
    OpenAI 接口使用
+   工具调用(Function Calling)
+   思考解析(Reasoning Parser)
    APIServer 参数详解
    lightllm api介绍
 
diff --git a/docs/CN/source/tutorial/function_calling.rst b/docs/CN/source/tutorial/function_calling.rst
new file mode 100644
index 000000000..728b95d86
--- /dev/null
+++ b/docs/CN/source/tutorial/function_calling.rst
@@ -0,0 +1,287 @@
+.. _function_calling:
+
+工具调用(Function Calling)
+============================
+
+LightLLM 支持多种主流模型的工具调用功能,提供 OpenAI 兼容的 API。
+
+支持的模型
+----------
+
+Qwen2.5/Qwen3
+~~~~~~~~~~~~~
+
+**解析器**: ``qwen25``
+
+**格式**:
+
+.. code-block:: xml
+
+    <tool_call>
+    {"name": "function_name", "arguments": {"param": "value"}}
+    </tool_call>
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/qwen2.5 \
+        --tool_call_parser qwen25 \
+        --tp 1
+
+Llama 3.2
+~~~~~~~~~
+
+**解析器**: ``llama3``
+
+**格式**: ``<|python_tag|>{"name": "func", "arguments": {...}}``
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/llama-3.2 \
+        --tool_call_parser llama3 \
+        --tp 1
+
+Mistral
+~~~~~~~
+
+**解析器**: ``mistral``
+
+**格式**: ``[TOOL_CALLS] [{"name": "func", "arguments": {...}}, ...]``
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**解析器**: ``deepseekv3``
+
+**格式**:
+
+.. code-block:: xml
+
+    <|tool▁calls▁begin|>
+    <|tool▁call▁begin|>function<|tool▁sep|>func_name
+    ```json
+    {"param": "value"}
+    ```
+    <|tool▁call▁end|>
+    <|tool▁calls▁end|>
+
+DeepSeek-V3.1
+~~~~~~~~~~~~~
+
+**解析器**: ``deepseekv31``
+
+**格式**: 简化的 V3 格式,参数直接内联,无代码块包围
+
+Kimi K2
+~~~~~~~
+
+**解析器**: ``kimi_k2``
+
+**格式**:
+
+.. code-block:: xml
+
+    <|tool_calls_section_begin|>
+    <|tool_call_begin|>functions.func_name:0
+    <|tool_call_argument_begin|>{"param": "value"}
+    <|tool_call_end|>
+    <|tool_calls_section_end|>
+
+基本使用
+--------
+
+定义工具
+~~~~~~~~
+
+.. code-block:: python
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "获取指定城市的天气信息",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description": "城市名称"
+                        }
+                    },
+                    "required": ["city"]
+                }
+            }
+        }
+    ]
+
+非流式调用
+~~~~~~~~~~
+
+.. 
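code-block:: python
+
+    # 示意代码(假设性实现,非 LightLLM 提供的接口):一个本地的
+    # get_weather 实现与简单的派发函数;下文示例收到 tool_calls 后,
+    # 即可按 name 字段派发到这里执行。
+    import json
+
+    def get_weather(city: str) -> dict:
+        # 假设的实现:实际应用中应调用真实的天气服务
+        return {"city": city, "temperature": 15, "condition": "晴朗"}
+
+    TOOL_REGISTRY = {"get_weather": get_weather}
+
+    def dispatch_tool_call(tool_call: dict) -> str:
+        # arguments 是 JSON 字符串,需先反序列化再调用本地函数
+        func = TOOL_REGISTRY[tool_call["function"]["name"]]
+        args = json.loads(tool_call["function"]["arguments"])
+        return json.dumps(func(**args), ensure_ascii=False)
+
+.. 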
code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "model_name",
+        "messages": [
+            {"role": "user", "content": "北京今天天气怎么样?"}
+        ],
+        "tools": tools,
+        "tool_choice": "auto"  # "auto" | "none" | "required"
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    if message.get("tool_calls"):
+        for tc in message["tool_calls"]:
+            print(f"工具: {tc['function']['name']}")
+            print(f"参数: {tc['function']['arguments']}")
+
+流式调用
+~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "model_name",
+        "messages": [{"role": "user", "content": "查询北京和上海的天气"}],
+        "tools": tools,
+        "stream": True
+    }
+
+    response = requests.post(url, json=data, stream=True)
+    tool_calls = {}
+
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: ") and line[6:] != b"[DONE]":
+            chunk = json.loads(line[6:])
+            delta = chunk["choices"][0]["delta"]
+
+            if delta.get("tool_calls"):
+                for tc in delta["tool_calls"]:
+                    idx = tc.get("index", 0)
+                    if idx not in tool_calls:
+                        tool_calls[idx] = {"function": {"name": "", "arguments": ""}}
+
+                    if tc["function"].get("name"):
+                        tool_calls[idx]["function"]["name"] = tc["function"]["name"]
+                    if tc["function"].get("arguments"):
+                        tool_calls[idx]["function"]["arguments"] += tc["function"]["arguments"]
+
+多轮对话
+~~~~~~~~
+
+.. code-block:: python
+
+    # 1. 用户提问
+    messages = [{"role": "user", "content": "北京天气如何?"}]
+
+    # 2. 模型调用工具
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "tools": tools
+    }).json()
+
+    tool_call = response1["choices"][0]["message"]["tool_calls"][0]
+    messages.append(response1["choices"][0]["message"])
+
+    # 3. 返回工具结果
+    weather_result = {"temperature": 15, "condition": "晴朗"}
+    messages.append({
+        "role": "tool",
+        "tool_call_id": tool_call["id"],
+        "name": tool_call["function"]["name"],
+        "content": json.dumps(weather_result, ensure_ascii=False)
+    })
+
+    # 4. 生成最终回答
+    response2 = requests.post(url, json={"messages": messages}).json()
+    print(response2["choices"][0]["message"]["content"])
+
+高级功能
+--------
+
+并行工具调用
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "messages": messages,
+        "tools": tools,
+        "parallel_tool_calls": True  # 启用并行调用
+    }
+
+强制调用特定工具
+~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "tools": tools,
+        "tool_choice": {
+            "type": "function",
+            "function": {"name": "get_weather"}
+        }
+    }
+
+与推理模型集成
+~~~~~~~~~~~~~~
+
+.. 
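code-block:: python
+
+    # 示意:推理模型返回的 tool_calls 结构与普通模型一致,
+    # 前文流式示例组装出的 tool_calls 字典可按 index 排序后统一派发
+    # (沿用上文假设的 dispatch_tool_call);下文示例演示如何同时
+    # 取得推理内容与工具调用。
+    def run_streamed_tool_calls(tool_calls: dict) -> list:
+        return [dispatch_tool_call(tool_calls[idx]) for idx in sorted(tool_calls)]
+
+.. 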
code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "tools": tools,
+        "chat_template_kwargs": {"enable_thinking": True},
+        "separate_reasoning": True  # 分离推理内容
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("推理:", message.get("reasoning_content"))
+    print("工具调用:", message.get("tool_calls"))
+
+常见问题
+--------
+
+**工具调用未触发**
+    检查 ``--tool_call_parser`` 参数和工具描述是否清晰
+
+**参数解析错误**
+    确认使用了正确的解析器,检查模型输出格式
+
+**流式模式不完整**
+    正确处理所有 chunks,使用 ``index`` 字段组装多个工具调用
+
+**与推理模型集成失败**
+    确保使用最新版本,正确配置 ``separate_reasoning`` 和 ``chat_template_kwargs``
+
+技术细节
+--------
+
+**核心文件**:
+- ``lightllm/server/function_call_parser.py`` - 解析器实现(1267行)
+- ``lightllm/server/api_openai.py`` - API 集成
+- ``lightllm/server/build_prompt.py`` - 工具注入
+- ``test/test_api/test_openai_api.py`` - 测试示例
+
+**相关 PR**:
+- PR #1158: 支持推理内容中的函数调用
+
+参考资料
+--------
+
+- OpenAI Function Calling: https://platform.openai.com/docs/guides/function-calling
+- JSON Schema: https://json-schema.org/
+- LightLLM GitHub: https://github.com/ModelTC/lightllm
diff --git a/docs/CN/source/tutorial/reasoning_parser.rst b/docs/CN/source/tutorial/reasoning_parser.rst
new file mode 100644
index 000000000..7176dc697
--- /dev/null
+++ b/docs/CN/source/tutorial/reasoning_parser.rst
@@ -0,0 +1,342 @@
+.. _reasoning_parser:
+
+思考解析(Reasoning Parser)
+=============================
+
+LightLLM 支持推理模型的思考过程解析,将模型内部推理与最终答案分离,提高 AI 系统透明度。
+
+支持的模型
+----------
+
+DeepSeek-R1
+~~~~~~~~~~~
+
+**解析器**: ``deepseek-r1``
+
+**格式**:
+
+.. code-block:: text
+
+    <think>
+    推理过程...
+    </think>
+    最终答案
+
+**特点**: 强制推理模式,部分变体可能省略 ``<think>`` 起始标签
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-R1 \
+        --reasoning_parser deepseek-r1 \
+        --tp 8 \
+        --enable_fa3
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**解析器**: ``deepseek-v3``
+
+**格式**: 与 Qwen3 相同
+
+**启动**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-V3 \
+        --reasoning_parser deepseek-v3 \
+        --tp 8
+
+**请求配置**:
+
+.. code-block:: python
+
+    data = {
+        "chat_template_kwargs": {"thinking": True}  # 启用推理
+    }
+
+Qwen3
+~~~~~
+
+**解析器**: ``qwen3``
+
+**格式**: ``<think>推理内容</think>回答``
+
+**特点**: 可选推理模式,支持动态切换
+
+.. code-block:: python
+
+    # 启用推理
+    data = {"chat_template_kwargs": {"enable_thinking": True}}
+
+GLM-4.5
+~~~~~~~
+
+**解析器**: ``glm45``
+
+**格式**: 同 Qwen3
+
+Kimi
+~~~~
+
+**Kimi Thinking**: ``kimi`` - 使用 ``◁think▷`` 和 ``◁/think▷`` 标记
+
+**Kimi K2**: ``kimi_k2`` - 使用 DeepSeek-R1 格式
+
+GPT-OSS
+~~~~~~~
+
+**解析器**: ``gpt-oss``
+
+**格式**:
+
+.. code-block:: xml
+
+    <|start|><|channel|>analysis<|message|>
+    推理分析...
+    <|end|>
+    <|channel|>final<|message|>
+    最终回答
+    <|return|>
+
+**特点**: 复杂状态机解析,支持多通道(analysis, commentary, final)
+
+其他模型
+~~~~~~~~
+
+- **MiniMax**: ``minimax``, ``minimax-append-think``
+- **Step3**: ``step3``
+- **NanoV3**: ``nano_v3``
+- **InternS1**: ``interns1``
+
+基本使用
+--------
+
+非流式
+~~~~~~
+
+.. code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "deepseek-r1",
+        "messages": [
+            {"role": "user", "content": "单词 'strawberry' 中有多少个字母 'r'?"}
+        ],
+        "max_tokens": 2000,
+        "separate_reasoning": True,  # 分离推理内容
+        "chat_template_kwargs": {"enable_thinking": True}
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("推理:", message.get("reasoning_content"))
+    print("答案:", message.get("content"))
+
+流式
+~~~~
+
+.. 
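code-block:: python
+
+    # 示意:一个小的 SSE 行解析辅助函数(假设性封装,非 LightLLM 提供)。
+    # 下文的流式示例按同样的约定逐行解析 "data: " 前缀的事件。
+    import json
+
+    def parse_sse_line(line: bytes):
+        """解析一行 SSE 数据;遇到 [DONE] 或非数据行时返回 None。"""
+        if not line or not line.startswith(b"data: "):
+            return None
+        payload = line[6:].decode("utf-8")
+        if payload == "[DONE]":
+            return None
+        return json.loads(payload)
+
+.. 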
code-block:: python + + data = { + "model": "deepseek-r1", + "messages": [{"role": "user", "content": "解释量子纠缠"}], + "stream": True, + "separate_reasoning": True, + "stream_reasoning": True # 实时流式传输推理内容 + } + + response = requests.post(url, json=data, stream=True) + + for line in response.iter_lines(): + if line and line.startswith(b"data: "): + data_str = line[6:].decode('utf-8') + if data_str == '[DONE]': + break + + chunk = json.loads(data_str) + delta = chunk["choices"][0]["delta"] + + # 推理内容 + if "reasoning_content" in delta: + print(delta["reasoning_content"], end="", flush=True) + + # 答案内容 + if "content" in delta: + print(delta["content"], end="", flush=True) + +响应格式 +-------- + +**非流式**: + +.. code-block:: json + + { + "choices": [{ + "message": { + "content": "最终答案", + "reasoning_content": "推理过程" + } + }] + } + +**流式**: + +.. code-block:: json + + // 推理块 + {"choices": [{"delta": {"reasoning_content": "推理片段"}}]} + + // 答案块 + {"choices": [{"delta": {"content": "答案片段"}}]} + +高级功能 +-------- + +动态切换推理模式 +~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # 启用推理 + data = { + "chat_template_kwargs": {"enable_thinking": True}, + "separate_reasoning": True + } + + # 禁用推理 + data = { + "chat_template_kwargs": {"enable_thinking": False} + } + +控制推理显示 +~~~~~~~~~~~~ + +.. code-block:: python + + # 隐藏推理流式传输 + data = { + "separate_reasoning": True, + "stream_reasoning": False # reasoning_content 字段仍存在 + } + + # 合并推理和答案 + data = { + "separate_reasoning": False # 推理和答案合并在 content 中 + } + +与工具调用集成 +~~~~~~~~~~~~~~ + +.. code-block:: python + + data = { + "model": "deepseek-r1", + "tools": tools, + "tool_choice": "auto", + "separate_reasoning": True, + "chat_template_kwargs": {"enable_thinking": True} + } + + response = requests.post(url, json=data).json() + message = response["choices"][0]["message"] + + # 同时获得推理、工具调用和答案 + print("推理:", message.get("reasoning_content")) + print("工具:", message.get("tool_calls")) + print("答案:", message.get("content")) + +多轮推理对话 +~~~~~~~~~~~~ + +.. 
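code-block:: python
+
+    # 示意:把一轮响应折叠进对话历史的辅助函数(假设性写法),
+    # 下文的多轮示例手工展开了同样的步骤。
+    def append_assistant_turn(messages: list, response: dict) -> None:
+        message = response["choices"][0]["message"]
+        messages.append({
+            "role": "assistant",
+            "content": message["content"],
+            "reasoning_content": message.get("reasoning_content"),
+        })
+
+.. 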
code-block:: python
+
+    messages = [{"role": "user", "content": "什么是质数?"}]
+
+    # 第一轮
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "separate_reasoning": True
+    }).json()
+
+    message1 = response1["choices"][0]["message"]
+    messages.append({
+        "role": "assistant",
+        "content": message1["content"],
+        "reasoning_content": message1.get("reasoning_content")
+    })
+
+    # 第二轮
+    messages.append({"role": "user", "content": "17 是质数吗?"})
+    response2 = requests.post(url, json={
+        "messages": messages,
+        "separate_reasoning": True
+    }).json()
+
+配置参数
+--------
+
+**separate_reasoning** (布尔, 默认 True)
+    是否分离推理内容到 ``reasoning_content`` 字段
+
+**stream_reasoning** (布尔, 默认 False)
+    是否实时流式传输推理内容
+
+**chat_template_kwargs** (对象)
+    - ``enable_thinking``: 启用推理(Qwen3, GLM45)
+    - ``thinking``: 启用推理(DeepSeek-V3)
+
+**--reasoning_parser** (启动参数)
+    指定解析器类型:``deepseek-r1``, ``qwen3``, ``glm45``, ``gpt-oss`` 等
+
+常见问题
+--------
+
+**推理内容未分离**
+    检查 ``--reasoning_parser``, ``separate_reasoning: true``, ``chat_template_kwargs``
+
+**模型不生成推理**
+    确认模型支持推理模式,检查是否启用了推理参数
+
+**流式模式不完整**
+    处理所有 chunks,等待 ``[DONE]`` 信号
+
+**与工具调用冲突**
+    使用最新版本(包含 PR #1158),正确配置参数
+
+性能考虑
+--------
+
+**Token 消耗**: 推理模式可能增加 3-5 倍 token 消耗
+
+**延迟影响**: TTFB 可能从 200ms 增加到 800ms
+
+**优化建议**:
+- 使用 ``stream_reasoning: true`` 降低感知延迟
+- 非关键任务禁用推理模式
+
+技术细节
+--------
+
+**核心文件**:
+- ``lightllm/server/reasoning_parser.py`` - 解析器实现(910行)
+- ``lightllm/server/api_openai.py`` - API 集成
+- ``test/test_api/test_openai_api.py`` - 测试示例(752-794行)
+
+**相关 PR**:
+- PR #1154: 添加推理解析器
+- PR #1158: 推理内容中的函数调用支持
+
+参考资料
+--------
+
+- DeepSeek-R1 技术报告
+- LightLLM GitHub: https://github.com/ModelTC/lightllm
diff --git a/docs/EN/source/index.rst b/docs/EN/source/index.rst
index 9aa65038c..07eaaa42e 100755
--- a/docs/EN/source/index.rst
+++ b/docs/EN/source/index.rst
@@ -52,6 +52,8 @@ Documentation List
    Multimodal Deployment
    Reward Model Deployment
    OpenAI api Usage
+   Function Calling
+   Reasoning Parser
    APIServer Parameters
    Lightllm API Introduction
 
diff --git a/docs/EN/source/tutorial/function_calling.rst b/docs/EN/source/tutorial/function_calling.rst
new file mode 100644
index 000000000..42f3d5b3d
--- /dev/null
+++ b/docs/EN/source/tutorial/function_calling.rst
@@ -0,0 +1,287 @@
+.. _function_calling:
+
+Function Calling
+================
+
+LightLLM supports function calling for multiple mainstream models, providing an OpenAI-compatible API.
+
+Supported Models
+----------------
+
+Qwen2.5/Qwen3
+~~~~~~~~~~~~~
+
+**Parser**: ``qwen25``
+
+**Format**:
+
+.. code-block:: xml
+
+    <tool_call>
+    {"name": "function_name", "arguments": {"param": "value"}}
+    </tool_call>
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/qwen2.5 \
+        --tool_call_parser qwen25 \
+        --tp 1
+
+Llama 3.2
+~~~~~~~~~
+
+**Parser**: ``llama3``
+
+**Format**: ``<|python_tag|>{"name": "func", "arguments": {...}}``
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/llama-3.2 \
+        --tool_call_parser llama3 \
+        --tp 1
+
+Mistral
+~~~~~~~
+
+**Parser**: ``mistral``
+
+**Format**: ``[TOOL_CALLS] [{"name": "func", "arguments": {...}}, ...]``
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**Parser**: ``deepseekv3``
+
+**Format**:
+
+.. 
code-block:: xml
+
+    <|tool▁calls▁begin|>
+    <|tool▁call▁begin|>function<|tool▁sep|>func_name
+    ```json
+    {"param": "value"}
+    ```
+    <|tool▁call▁end|>
+    <|tool▁calls▁end|>
+
+DeepSeek-V3.1
+~~~~~~~~~~~~~
+
+**Parser**: ``deepseekv31``
+
+**Format**: Simplified V3 format, parameters directly inlined without code blocks
+
+Kimi K2
+~~~~~~~
+
+**Parser**: ``kimi_k2``
+
+**Format**:
+
+.. code-block:: xml
+
+    <|tool_calls_section_begin|>
+    <|tool_call_begin|>functions.func_name:0
+    <|tool_call_argument_begin|>{"param": "value"}
+    <|tool_call_end|>
+    <|tool_calls_section_end|>
+
+Basic Usage
+-----------
+
+Define Tools
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get weather information for a city",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description": "City name"
+                        }
+                    },
+                    "required": ["city"]
+                }
+            }
+        }
+    ]
+
+Non-Streaming
+~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    import requests
+    import json
+
+    url = "http://localhost:8088/v1/chat/completions"
+    data = {
+        "model": "model_name",
+        "messages": [
+            {"role": "user", "content": "What's the weather in Beijing?"}
+        ],
+        "tools": tools,
+        "tool_choice": "auto"  # "auto" | "none" | "required"
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    if message.get("tool_calls"):
+        for tc in message["tool_calls"]:
+            print(f"Tool: {tc['function']['name']}")
+            print(f"Args: {tc['function']['arguments']}")
+
+Streaming
+~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "model_name",
+        "messages": [{"role": "user", "content": "Check weather for Beijing and Shanghai"}],
+        "tools": tools,
+        "stream": True
+    }
+
+    response = requests.post(url, json=data, stream=True)
+    tool_calls = {}
+
+    for line in response.iter_lines():
+        if line and line.startswith(b"data: ") and line[6:] != b"[DONE]":
+            chunk = json.loads(line[6:])
+            delta = chunk["choices"][0]["delta"]
+
+            if delta.get("tool_calls"):
+                for tc in delta["tool_calls"]:
+                    idx = tc.get("index", 0)
+                    if idx not in tool_calls:
+                        tool_calls[idx] = {"function": {"name": "", "arguments": ""}}
+
+                    if tc["function"].get("name"):
+                        tool_calls[idx]["function"]["name"] = tc["function"]["name"]
+                    if tc["function"].get("arguments"):
+                        tool_calls[idx]["function"]["arguments"] += tc["function"]["arguments"]
+
+Multi-Turn Conversation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    # 1. User question
+    messages = [{"role": "user", "content": "How's the weather in Beijing?"}]
+
+    # 2. Model calls tool
+    response1 = requests.post(url, json={
+        "messages": messages,
+        "tools": tools
+    }).json()
+
+    tool_call = response1["choices"][0]["message"]["tool_calls"][0]
+    messages.append(response1["choices"][0]["message"])
+
+    # 3. Return tool result
+    weather_result = {"temperature": 15, "condition": "sunny"}
+    messages.append({
+        "role": "tool",
+        "tool_call_id": tool_call["id"],
+        "name": tool_call["function"]["name"],
+        "content": json.dumps(weather_result)
+    })
+
+    # 4. Generate final answer
+    response2 = requests.post(url, json={"messages": messages}).json()
+    print(response2["choices"][0]["message"]["content"])
+
+Advanced Features
+-----------------
+
+Parallel Tool Calls
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "messages": messages,
+        "tools": tools,
+        "parallel_tool_calls": True  # Enable parallel calls
+    }
+
+Force Specific Tool
+~~~~~~~~~~~~~~~~~~~
+
+.. 
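code-block:: python
+
+    # Illustrative sketch (an assumption, not a LightLLM API): when a
+    # specific tool is forced as in the request below, the response can
+    # be sanity-checked before anything is executed.
+    def assert_forced_tool(response: dict, expected_name: str) -> dict:
+        tool_calls = response["choices"][0]["message"].get("tool_calls") or []
+        assert tool_calls, "model returned no tool call"
+        assert tool_calls[0]["function"]["name"] == expected_name
+        return tool_calls[0]
+
+.. 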
code-block:: python
+
+    data = {
+        "tools": tools,
+        "tool_choice": {
+            "type": "function",
+            "function": {"name": "get_weather"}
+        }
+    }
+
+Integration with Reasoning Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    data = {
+        "model": "deepseek-r1",
+        "tools": tools,
+        "chat_template_kwargs": {"enable_thinking": True},
+        "separate_reasoning": True  # Separate reasoning content
+    }
+
+    response = requests.post(url, json=data).json()
+    message = response["choices"][0]["message"]
+
+    print("Reasoning:", message.get("reasoning_content"))
+    print("Tool calls:", message.get("tool_calls"))
+
+Common Issues
+-------------
+
+**Tool calls not triggered**
+    Check ``--tool_call_parser`` parameter and tool descriptions
+
+**Parameter parsing errors**
+    Confirm correct parser is used, check model output format
+
+**Incomplete streaming**
+    Process all chunks correctly, use ``index`` field to assemble multiple calls
+
+**Integration with reasoning models fails**
+    Use latest version, configure ``separate_reasoning`` and ``chat_template_kwargs``
+
+Technical Details
+-----------------
+
+**Core Files**:
+- ``lightllm/server/function_call_parser.py`` - Parser implementation (1267 lines)
+- ``lightllm/server/api_openai.py`` - API integration
+- ``lightllm/server/build_prompt.py`` - Tool injection
+- ``test/test_api/test_openai_api.py`` - Test examples
+
+**Related PRs**:
+- PR #1158: Function call in reasoning content support
+
+References
+----------
+
+- OpenAI Function Calling: https://platform.openai.com/docs/guides/function-calling
+- JSON Schema: https://json-schema.org/
+- LightLLM GitHub: https://github.com/ModelTC/lightllm
diff --git a/docs/EN/source/tutorial/reasoning_parser.rst b/docs/EN/source/tutorial/reasoning_parser.rst
new file mode 100644
index 000000000..12682f83b
--- /dev/null
+++ b/docs/EN/source/tutorial/reasoning_parser.rst
@@ -0,0 +1,342 @@
+.. _reasoning_parser:
+
+Reasoning Parser
+================
+
+LightLLM supports parsing a reasoning model's thinking process, separating internal reasoning from the final answer to improve AI system transparency.
+
+Supported Models
+----------------
+
+DeepSeek-R1
+~~~~~~~~~~~
+
+**Parser**: ``deepseek-r1``
+
+**Format**:
+
+.. code-block:: text
+
+    <think>
+    Reasoning process...
+    </think>
+    Final answer
+
+**Features**: Forced reasoning mode, some variants may omit the ``<think>`` opening tag
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-R1 \
+        --reasoning_parser deepseek-r1 \
+        --tp 8 \
+        --enable_fa3
+
+DeepSeek-V3
+~~~~~~~~~~~
+
+**Parser**: ``deepseek-v3``
+
+**Format**: Same as Qwen3
+
+**Startup**:
+
+.. code-block:: bash
+
+    python -m lightllm.server.api_server \
+        --model_dir /path/to/DeepSeek-V3 \
+        --reasoning_parser deepseek-v3 \
+        --tp 8
+
+**Request Config**:
+
+.. code-block:: python
+
+    data = {
+        "chat_template_kwargs": {"thinking": True}  # Enable reasoning
+    }
+
+Qwen3
+~~~~~
+
+**Parser**: ``qwen3``
+
+**Format**: ``<think>Reasoning content</think>Answer``
+
+**Features**: Optional reasoning mode, supports dynamic switching
+
+.. code-block:: python
+
+    # Enable reasoning
+    data = {"chat_template_kwargs": {"enable_thinking": True}}
+
+GLM-4.5
+~~~~~~~
+
+**Parser**: ``glm45``
+
+**Format**: Same as Qwen3
+
+Kimi
+~~~~
+
+**Kimi Thinking**: ``kimi`` - Uses ``◁think▷`` and ``◁/think▷`` tokens
+
+**Kimi K2**: ``kimi_k2`` - Uses DeepSeek-R1 format
+
+GPT-OSS
+~~~~~~~
+
+**Parser**: ``gpt-oss``
+
+**Format**:
+
+.. 
code-block:: xml + + <|start|><|channel|>analysis<|message|> + Reasoning analysis... + <|end|> + <|channel|>final<|message|> + Final answer + <|return|> + +**Features**: Complex state machine parsing, supports multiple channels (analysis, commentary, final) + +Other Models +~~~~~~~~~~~~ + +- **MiniMax**: ``minimax``, ``minimax-append-think`` +- **Step3**: ``step3`` +- **NanoV3**: ``nano_v3`` +- **InternS1**: ``interns1`` + +Basic Usage +----------- + +Non-Streaming +~~~~~~~~~~~~~ + +.. code-block:: python + + import requests + import json + + url = "http://localhost:8088/v1/chat/completions" + data = { + "model": "deepseek-r1", + "messages": [ + {"role": "user", "content": "How many 'r's in 'strawberry'?"} + ], + "max_tokens": 2000, + "separate_reasoning": True, # Separate reasoning content + "chat_template_kwargs": {"enable_thinking": True} + } + + response = requests.post(url, json=data).json() + message = response["choices"][0]["message"] + + print("Reasoning:", message.get("reasoning_content")) + print("Answer:", message.get("content")) + +Streaming +~~~~~~~~~ + +.. code-block:: python + + data = { + "model": "deepseek-r1", + "messages": [{"role": "user", "content": "Explain quantum entanglement"}], + "stream": True, + "separate_reasoning": True, + "stream_reasoning": True # Stream reasoning content in real-time + } + + response = requests.post(url, json=data, stream=True) + + for line in response.iter_lines(): + if line and line.startswith(b"data: "): + data_str = line[6:].decode('utf-8') + if data_str == '[DONE]': + break + + chunk = json.loads(data_str) + delta = chunk["choices"][0]["delta"] + + # Reasoning content + if "reasoning_content" in delta: + print(delta["reasoning_content"], end="", flush=True) + + # Answer content + if "content" in delta: + print(delta["content"], end="", flush=True) + +Response Format +--------------- + +**Non-Streaming**: + +.. code-block:: json + + { + "choices": [{ + "message": { + "content": "Final answer", + "reasoning_content": "Reasoning process" + } + }] + } + +**Streaming**: + +.. code-block:: json + + // Reasoning chunk + {"choices": [{"delta": {"reasoning_content": "Reasoning fragment"}}]} + + // Answer chunk + {"choices": [{"delta": {"content": "Answer fragment"}}]} + +Advanced Features +----------------- + +Dynamic Reasoning Mode +~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Enable reasoning + data = { + "chat_template_kwargs": {"enable_thinking": True}, + "separate_reasoning": True + } + + # Disable reasoning + data = { + "chat_template_kwargs": {"enable_thinking": False} + } + +Control Reasoning Display +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Hide reasoning streaming + data = { + "separate_reasoning": True, + "stream_reasoning": False # reasoning_content field still exists + } + + # Merge reasoning and answer + data = { + "separate_reasoning": False # Merged in content field + } + +Integration with Tool Calling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + data = { + "model": "deepseek-r1", + "tools": tools, + "tool_choice": "auto", + "separate_reasoning": True, + "chat_template_kwargs": {"enable_thinking": True} + } + + response = requests.post(url, json=data).json() + message = response["choices"][0]["message"] + + # Get reasoning, tool calls, and answer simultaneously + print("Reasoning:", message.get("reasoning_content")) + print("Tools:", message.get("tool_calls")) + print("Answer:", message.get("content")) + +Multi-Turn Reasoning +~~~~~~~~~~~~~~~~~~~~ + +.. 
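code-block:: python
+
+    # Illustrative helper (an assumption, not part of LightLLM): fold one
+    # response into the running history; the example below spells out the
+    # same steps inline.
+    def append_assistant_turn(messages: list, response: dict) -> None:
+        message = response["choices"][0]["message"]
+        messages.append({
+            "role": "assistant",
+            "content": message["content"],
+            "reasoning_content": message.get("reasoning_content"),
+        })
+
+.. 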
code-block:: python + + messages = [{"role": "user", "content": "What is a prime number?"}] + + # First turn + response1 = requests.post(url, json={ + "messages": messages, + "separate_reasoning": True + }).json() + + message1 = response1["choices"][0]["message"] + messages.append({ + "role": "assistant", + "content": message1["content"], + "reasoning_content": message1.get("reasoning_content") + }) + + # Second turn + messages.append({"role": "user", "content": "Is 17 a prime number?"}) + response2 = requests.post(url, json={ + "messages": messages, + "separate_reasoning": True + }).json() + +Configuration +------------- + +**separate_reasoning** (bool, default: True) + Whether to separate reasoning content into ``reasoning_content`` field + +**stream_reasoning** (bool, default: False) + Whether to stream reasoning content in real-time + +**chat_template_kwargs** (object) + - ``enable_thinking``: Enable reasoning (Qwen3, GLM45) + - ``thinking``: Enable reasoning (DeepSeek-V3) + +**--reasoning_parser** (startup parameter) + Specify parser type: ``deepseek-r1``, ``qwen3``, ``glm45``, ``gpt-oss``, etc. + +Common Issues +------------- + +**Reasoning content not separated** + Check ``--reasoning_parser``, ``separate_reasoning: true``, ``chat_template_kwargs`` + +**Model not generating reasoning** + Confirm model supports reasoning mode, check if reasoning parameters are enabled + +**Incomplete streaming** + Process all chunks, wait for ``[DONE]`` signal + +**Conflict with tool calling** + Use latest version (includes PR #1158), configure parameters correctly + +Performance +----------- + +**Token Consumption**: Reasoning mode may increase token usage by 3-5x + +**Latency Impact**: TTFB may increase from 200ms to 800ms + +**Optimization**: +- Use ``stream_reasoning: true`` to reduce perceived latency +- Disable reasoning mode for non-critical tasks + +Technical Details +----------------- + +**Core Files**: +- ``lightllm/server/reasoning_parser.py`` - Parser implementation (910 lines) +- ``lightllm/server/api_openai.py`` - API integration +- ``test/test_api/test_openai_api.py`` - Test examples (lines 752-794) + +**Related PRs**: +- PR #1154: Add reasoning parser +- PR #1158: Function call in reasoning content support + +References +---------- + +- DeepSeek-R1 Technical Report +- LightLLM GitHub: https://github.com/ModelTC/lightllm From c53c52a6d4f792902fa5118d489427409a8b89ac Mon Sep 17 00:00:00 2001 From: sufubao Date: Sun, 4 Jan 2026 03:52:54 +0000 Subject: [PATCH 2/6] clean --- docs/CN/source/tutorial/function_calling.rst | 15 ------------- docs/CN/source/tutorial/reasoning_parser.rst | 22 -------------------- docs/EN/source/tutorial/function_calling.rst | 15 ------------- docs/EN/source/tutorial/reasoning_parser.rst | 22 -------------------- 4 files changed, 74 deletions(-) diff --git a/docs/CN/source/tutorial/function_calling.rst b/docs/CN/source/tutorial/function_calling.rst index 728b95d86..eea2855f3 100644 --- a/docs/CN/source/tutorial/function_calling.rst +++ b/docs/CN/source/tutorial/function_calling.rst @@ -77,21 +77,6 @@ DeepSeek-V3.1 **格式**: 简化的 V3 格式,参数直接内联,无代码块包围 -Kimi K2 -~~~~~~~ - -**解析器**: ``kimi_k2`` - -**格式**: - -.. 
code-block:: xml - - <|tool_calls_section_begin|> - <|tool_call_begin|>functions.func_name:0 - <|tool_call_argument_begin|>{"param": "value"} - <|tool_call_end|> - <|tool_calls_section_end|> - 基本使用 -------- diff --git a/docs/CN/source/tutorial/reasoning_parser.rst b/docs/CN/source/tutorial/reasoning_parser.rst index 7176dc697..f80e06b90 100644 --- a/docs/CN/source/tutorial/reasoning_parser.rst +++ b/docs/CN/source/tutorial/reasoning_parser.rst @@ -72,20 +72,6 @@ Qwen3 # 启用推理 data = {"chat_template_kwargs": {"enable_thinking": True}} -GLM-4.5 -~~~~~~~ - -**解析器**: ``glm45`` - -**格式**: 同 Qwen3 - -Kimi -~~~~ - -**Kimi Thinking**: ``kimi`` - 使用 ``◁think▷`` 和 ``◁/think▷`` 标记 - -**Kimi K2**: ``kimi_k2`` - 使用 DeepSeek-R1 格式 - GPT-OSS ~~~~~~~ @@ -104,14 +90,6 @@ GPT-OSS **特点**: 复杂状态机解析,支持多通道(analysis, commentary, final) -其他模型 -~~~~~~~~ - -- **MiniMax**: ``minimax``, ``minimax-append-think`` -- **Step3**: ``step3`` -- **NanoV3**: ``nano_v3`` -- **InternS1**: ``interns1`` - 基本使用 -------- diff --git a/docs/EN/source/tutorial/function_calling.rst b/docs/EN/source/tutorial/function_calling.rst index 42f3d5b3d..d7f798140 100644 --- a/docs/EN/source/tutorial/function_calling.rst +++ b/docs/EN/source/tutorial/function_calling.rst @@ -77,21 +77,6 @@ DeepSeek-V3.1 **Format**: Simplified V3 format, parameters directly inlined without code blocks -Kimi K2 -~~~~~~~ - -**Parser**: ``kimi_k2`` - -**Format**: - -.. code-block:: xml - - <|tool_calls_section_begin|> - <|tool_call_begin|>functions.func_name:0 - <|tool_call_argument_begin|>{"param": "value"} - <|tool_call_end|> - <|tool_calls_section_end|> - Basic Usage ----------- diff --git a/docs/EN/source/tutorial/reasoning_parser.rst b/docs/EN/source/tutorial/reasoning_parser.rst index 12682f83b..539136633 100644 --- a/docs/EN/source/tutorial/reasoning_parser.rst +++ b/docs/EN/source/tutorial/reasoning_parser.rst @@ -72,20 +72,6 @@ Qwen3 # Enable reasoning data = {"chat_template_kwargs": {"enable_thinking": True}} -GLM-4.5 -~~~~~~~ - -**Parser**: ``glm45`` - -**Format**: Same as Qwen3 - -Kimi -~~~~ - -**Kimi Thinking**: ``kimi`` - Uses ``◁think▷`` and ``◁/think▷`` tokens - -**Kimi K2**: ``kimi_k2`` - Uses DeepSeek-R1 format - GPT-OSS ~~~~~~~ @@ -104,14 +90,6 @@ GPT-OSS **Features**: Complex state machine parsing, supports multiple channels (analysis, commentary, final) -Other Models -~~~~~~~~~~~~ - -- **MiniMax**: ``minimax``, ``minimax-append-think`` -- **Step3**: ``step3`` -- **NanoV3**: ``nano_v3`` -- **InternS1**: ``interns1`` - Basic Usage ----------- From fb9f071ca237839324818b3c257f5d613e222b33 Mon Sep 17 00:00:00 2001 From: shihaobai <42648726+shihaobai@users.noreply.github.com> Date: Sun, 4 Jan 2026 15:25:50 +0800 Subject: [PATCH 3/6] Update docs/CN/source/tutorial/function_calling.rst Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/CN/source/tutorial/function_calling.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/CN/source/tutorial/function_calling.rst b/docs/CN/source/tutorial/function_calling.rst index eea2855f3..afa588572 100644 --- a/docs/CN/source/tutorial/function_calling.rst +++ b/docs/CN/source/tutorial/function_calling.rst @@ -256,7 +256,7 @@ DeepSeek-V3.1 -------- **核心文件**: -- ``lightllm/server/function_call_parser.py`` - 解析器实现(1267行) +- ``lightllm/server/function_call_parser.py`` - 解析器实现 - ``lightllm/server/api_openai.py`` - API 集成 - ``lightllm/server/build_prompt.py`` - 工具注入 - ``test/test_api/test_openai_api.py`` - 测试示例 From 
45b6f1c1c61256c9e8d2b3d75df14d23639d8991 Mon Sep 17 00:00:00 2001 From: shihaobai <42648726+shihaobai@users.noreply.github.com> Date: Sun, 4 Jan 2026 15:26:59 +0800 Subject: [PATCH 4/6] Update docs/CN/source/tutorial/reasoning_parser.rst Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/CN/source/tutorial/reasoning_parser.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/CN/source/tutorial/reasoning_parser.rst b/docs/CN/source/tutorial/reasoning_parser.rst index f80e06b90..547eb05d1 100644 --- a/docs/CN/source/tutorial/reasoning_parser.rst +++ b/docs/CN/source/tutorial/reasoning_parser.rst @@ -305,9 +305,9 @@ GPT-OSS -------- **核心文件**: -- ``lightllm/server/reasoning_parser.py`` - 解析器实现(910行) +- ``lightllm/server/reasoning_parser.py`` - 解析器实现 - ``lightllm/server/api_openai.py`` - API 集成 -- ``test/test_api/test_openai_api.py`` - 测试示例(752-794行) +- ``test/test_api/test_openai_api.py`` - 测试示例 **相关 PR**: - PR #1154: 添加推理解析器 From 4e4d6cbd60a0d432a15f2315d018b9064b47b705 Mon Sep 17 00:00:00 2001 From: shihaobai <42648726+shihaobai@users.noreply.github.com> Date: Sun, 4 Jan 2026 15:27:06 +0800 Subject: [PATCH 5/6] Update docs/EN/source/tutorial/function_calling.rst Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/EN/source/tutorial/function_calling.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/EN/source/tutorial/function_calling.rst b/docs/EN/source/tutorial/function_calling.rst index d7f798140..d038eb902 100644 --- a/docs/EN/source/tutorial/function_calling.rst +++ b/docs/EN/source/tutorial/function_calling.rst @@ -256,7 +256,7 @@ Technical Details ----------------- **Core Files**: -- ``lightllm/server/function_call_parser.py`` - Parser implementation (1267 lines) +- ``lightllm/server/function_call_parser.py`` - Parser implementation - ``lightllm/server/api_openai.py`` - API integration - ``lightllm/server/build_prompt.py`` - Tool injection - ``test/test_api/test_openai_api.py`` - Test examples From d530914b6ad745474ebf44c7e4ed985584b72d9a Mon Sep 17 00:00:00 2001 From: shihaobai <42648726+shihaobai@users.noreply.github.com> Date: Sun, 4 Jan 2026 15:27:15 +0800 Subject: [PATCH 6/6] Update docs/EN/source/tutorial/reasoning_parser.rst Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/EN/source/tutorial/reasoning_parser.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/EN/source/tutorial/reasoning_parser.rst b/docs/EN/source/tutorial/reasoning_parser.rst index 539136633..e76e093d6 100644 --- a/docs/EN/source/tutorial/reasoning_parser.rst +++ b/docs/EN/source/tutorial/reasoning_parser.rst @@ -305,9 +305,9 @@ Technical Details ----------------- **Core Files**: -- ``lightllm/server/reasoning_parser.py`` - Parser implementation (910 lines) +- ``lightllm/server/reasoning_parser.py`` - Parser implementation - ``lightllm/server/api_openai.py`` - API integration -- ``test/test_api/test_openai_api.py`` - Test examples (lines 752-794) +- ``test/test_api/test_openai_api.py`` - Test examples **Related PRs**: - PR #1154: Add reasoning parser