Skip to content
22 changes: 9 additions & 13 deletions fastdeploy/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,15 @@ async def chat_completion_stream_generator(
output_speculate_metrics = res["metrics"].get("speculate_metrics", None)

delta_message = DeltaMessage(
reasoning_content="",
reasoning_content=output["reasoning_content"],
prompt_token_ids=None,
tool_calls=None,
tool_calls=output["tool_calls"],
completion_token_ids=None,
)

if output["tool_calls"] is not None:
tool_called[idx] = True

if response_processor.enable_multimodal_content():
delta_message.multimodal_content = output["multipart"]
else:
Expand All @@ -419,15 +422,8 @@ async def chat_completion_stream_generator(
if output.get("audio_content", None) is not None:
delta_message.audio_content = output["audio_content"]

if not res["finished"] and output["enable_parser"]:
delta_message_output = output["delta_message"]
if delta_message_output is None:
continue
delta_message.content = delta_message_output.content or ""
delta_message.reasoning_content = delta_message_output.reasoning_content or ""
if delta_message_output.tool_calls:
delta_message.tool_calls = delta_message_output.tool_calls
tool_called[idx] = True
if output["skipped"]:
continue

choice = ChatCompletionResponseStreamChoice(
index=idx,
Expand Down Expand Up @@ -758,7 +754,7 @@ async def _create_chat_completion_choice(
message = ChatMessage(
role="assistant",
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call"),
tool_calls=output.get("tool_calls"),
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
completion_token_ids=completion_token_ids if request.return_token_ids else None,
prompt_tokens=prompt_tokens if request.return_token_ids else None,
Expand Down Expand Up @@ -790,7 +786,7 @@ async def _create_chat_completion_choice(
finish_reason = "stop"
if previous_num_tokens != max_tokens:
finish_reason = "stop"
if output.get("tool_call", None):
if output.get("tool_calls"):
finish_reason = "tool_calls"
else:
finish_reason = "length"
Expand Down
23 changes: 10 additions & 13 deletions fastdeploy/entrypoints/openai/serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ async def _process_echo_logic(self, request, idx, res_outputs):

def calc_finish_reason(self, max_tokens, token_num, output, tool_called):
if max_tokens is None or token_num != max_tokens:
if tool_called or output.get("tool_call"):
if tool_called or output.get("tool_calls"):
return "tool_calls"
else:
return "stop"
Expand Down Expand Up @@ -554,9 +554,9 @@ async def completion_stream_generator(
text=output["text"],
prompt_token_ids=None,
completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
tool_calls=None,
tool_calls=output["tool_calls"],
completion_tokens=output.get("completion_tokens") if request.return_token_ids else None,
reasoning_content="",
reasoning_content=output["reasoning_content"],
arrival_time=arrival_time,
logprobs=logprobs_res,
prompt_logprobs=(
Expand All @@ -565,15 +565,12 @@ async def completion_stream_generator(
draft_logprobs=draft_logprobs_res,
speculate_metrics=output_speculate_metrics,
)
if not res["finished"] and output["enable_parser"]:
delta_message_output = output["delta_message"]
if delta_message_output is None:
continue
delta_message.text = delta_message_output.content or ""
delta_message.reasoning_content = delta_message_output.reasoning_content or ""
if delta_message_output.tool_calls:
delta_message.tool_calls = delta_message_output.tool_calls
tool_called[idx] = True

if output["tool_calls"] is not None:
tool_called[idx] = True

if output["skipped"]:
continue

choices.append(delta_message)

Expand Down Expand Up @@ -740,7 +737,7 @@ def request_output_to_completion_response(
else None
),
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call", None),
tool_calls=output.get("tool_calls"),
logprobs=aggregated_logprobs,
draft_logprobs=aggregated_draft_logprobs,
prompt_logprobs=clamp_prompt_logprobs(prompt_logprobs_res),
Expand Down
3 changes: 3 additions & 0 deletions fastdeploy/entrypoints/openai/tool_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
# limitations under the License.
"""

from fastdeploy.plugins import load_tool_parser_plugins

from .abstract_tool_parser import ToolParser, ToolParserManager
from .ernie_45_vl_thinking_tool_parser import Ernie45VLThinkingToolParser
from .ernie_x1_tool_parser import ErnieX1ToolParser

__all__ = ["ToolParser", "ToolParserManager", "ErnieX1ToolParser", "Ernie45VLThinkingToolParser"]
load_tool_parser_plugins()
37 changes: 21 additions & 16 deletions fastdeploy/input/ernie4_5_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
tool_parser = self.tool_parser_obj(self.tokenizer)
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
if tool_call_info.tools_called:
response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls
response_dict["outputs"]["text"] = tool_call_info.content
response_dict["outputs"]["completion_tokens"] = full_text
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
Expand Down Expand Up @@ -369,7 +369,11 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
response_dict["outputs"]["text"] = delta_text
response_dict["outputs"]["completion_tokens"] = delta_text
response_dict["outputs"]["skipped"] = False
response_dict["outputs"]["tool_calls"] = None
response_dict["outputs"]["reasoning_content"] = ""
if self.reasoning_parser:
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts,
Expand All @@ -380,19 +384,15 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
self.model_status_dict[req_id],
)
response_dict["outputs"]["enable_parser"] = True
response_dict["outputs"]["delta_message"] = reasoning_delta_message
reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
response_dict["outputs"]["reasoning_content"] = reasoning_content
response_dict["outputs"]["text"] = (
reasoning_delta_message.content or ""
if reasoning_delta_message and hasattr(reasoning_delta_message, "content")
else ""
)
else:
response_dict["outputs"]["text"] = delta_text
if reasoning_delta_message:
reasoning_content = reasoning_delta_message.reasoning_content
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
response_dict["outputs"]["reasoning_content"] = reasoning_content or ""
response_dict["outputs"]["text"] = reasoning_delta_message.content or ""
else:
if not is_end:
response_dict["outputs"]["skipped"] = True
if self.tool_parser_obj:
response_dict["outputs"]["enable_parser"] = True
if req_id not in self.tool_parser_dict:
Expand All @@ -407,8 +407,13 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
response_dict,
)
if tool_call_delta_message is None or tool_call_delta_message.tool_calls:
response_dict["outputs"]["delta_message"] = tool_call_delta_message
if tool_call_delta_message:
if tool_call_delta_message.tool_calls:
response_dict["outputs"]["text"] = tool_call_delta_message.content
response_dict["outputs"]["tool_calls"] = tool_call_delta_message.tool_calls
else:
if not is_end:
response_dict["outputs"]["skipped"] = True

if is_end:
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
Expand Down
32 changes: 23 additions & 9 deletions fastdeploy/input/text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
tool_parser = self.tool_parser_obj(self.tokenizer)
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
if tool_call_info.tools_called:
response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls
response_dict["outputs"]["text"] = tool_call_info.content
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
Expand All @@ -469,7 +469,11 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
if token_ids[-1] in self.eos_token_ids:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
response_dict["outputs"]["text"] = delta_text
response_dict["outputs"]["completion_tokens"] = delta_text
response_dict["outputs"]["skipped"] = False
response_dict["outputs"]["tool_calls"] = None
response_dict["outputs"]["reasoning_content"] = ""
if self.reasoning_parser:
response_dict["outputs"]["enable_parser"] = True
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
Expand All @@ -481,16 +485,21 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
self.model_status_dict[req_id],
)
response_dict["outputs"]["delta_message"] = reasoning_delta_message
reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
if reasoning_delta_message:
reasoning_content = reasoning_delta_message.reasoning_content
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
response_dict["outputs"]["reasoning_content"] = reasoning_content or ""
response_dict["outputs"]["text"] = reasoning_delta_message.content or ""
else:
if not is_end:
response_dict["outputs"]["skipped"] = True
if self.tool_parser_obj:
response_dict["outputs"]["enable_parser"] = True
if req_id not in self.tool_parser_dict:
self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
tool_parser = self.tool_parser_dict[req_id]
tool_call = tool_parser.extract_tool_calls_streaming(
tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
previous_texts,
previous_texts + delta_text,
delta_text,
Expand All @@ -499,9 +508,14 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
response_dict,
)
if tool_call is None or tool_call.tool_calls:
response_dict["outputs"]["delta_message"] = tool_call
response_dict["outputs"]["text"] = delta_text
if tool_call_delta_message:
if tool_call_delta_message.tool_calls:
response_dict["outputs"]["text"] = tool_call_delta_message.content
response_dict["outputs"]["tool_calls"] = tool_call_delta_message.tool_calls
else:
if not is_end:
response_dict["outputs"]["skipped"] = True

if is_end:
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
Expand Down
2 changes: 2 additions & 0 deletions fastdeploy/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
from .model_runner import load_model_runner_plugins
from .reasoning_parser import load_reasoning_parser_plugins
from .token_processor import load_token_processor_plugins
from .tool_parser import load_tool_parser_plugins

__all__ = [
"load_model_register_plugins",
"load_model_runner_plugins",
"load_input_processor_plugins",
"load_reasoning_parser_plugins",
"load_token_processor_plugins",
"load_tool_parser_plugins",
]
34 changes: 34 additions & 0 deletions fastdeploy/plugins/tool_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from fastdeploy.plugins.utils import load_plugins_by_group

# Guard flag so a single process loads tool-parser plugins only once,
# even if load_tool_parser_plugins() is called from multiple import paths.
plugins_loaded = False
# Entry-point group name external packages use to register tool parsers.
PLUGINS_GROUP = "fastdeploy.tool_parser_plugins"


def load_tool_parser_plugins():
    """Discover and activate all registered tool-parser plugins.

    Idempotent: the module-level ``plugins_loaded`` flag ensures the
    entry-point group is scanned at most once per process. Each loaded
    plugin is a callable that registers itself when invoked.
    """
    global plugins_loaded
    if plugins_loaded:
        return
    plugins_loaded = True

    # General plugins: activation is just calling each loaded function.
    for activate in load_plugins_by_group(group=PLUGINS_GROUP).values():
        activate()
Loading
Loading