Skip to content
22 changes: 9 additions & 13 deletions fastdeploy/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,15 @@ async def chat_completion_stream_generator(
output_speculate_metrics = res["metrics"].get("speculate_metrics", None)

delta_message = DeltaMessage(
reasoning_content="",
reasoning_content=output["reasoning_content"],
prompt_token_ids=None,
tool_calls=None,
tool_calls=output["tool_calls"],
completion_token_ids=None,
)

if output["tool_calls"] is not None:
tool_called[idx] = True

if response_processor.enable_multimodal_content():
delta_message.multimodal_content = output["multipart"]
else:
Expand All @@ -419,15 +422,8 @@ async def chat_completion_stream_generator(
if output.get("audio_content", None) is not None:
delta_message.audio_content = output["audio_content"]

if not res["finished"] and output["enable_parser"]:
delta_message_output = output["delta_message"]
if delta_message_output is None:
continue
delta_message.content = delta_message_output.content or ""
delta_message.reasoning_content = delta_message_output.reasoning_content or ""
if delta_message_output.tool_calls:
delta_message.tool_calls = delta_message_output.tool_calls
tool_called[idx] = True
if output["skipped"]:
continue

choice = ChatCompletionResponseStreamChoice(
index=idx,
Expand Down Expand Up @@ -758,7 +754,7 @@ async def _create_chat_completion_choice(
message = ChatMessage(
role="assistant",
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call"),
tool_calls=output.get("tool_calls"),
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
completion_token_ids=completion_token_ids if request.return_token_ids else None,
prompt_tokens=prompt_tokens if request.return_token_ids else None,
Expand Down Expand Up @@ -790,7 +786,7 @@ async def _create_chat_completion_choice(
finish_reason = "stop"
if previous_num_tokens != max_tokens:
finish_reason = "stop"
if output.get("tool_call", None):
if output.get("tool_calls"):
finish_reason = "tool_calls"
else:
finish_reason = "length"
Expand Down
23 changes: 10 additions & 13 deletions fastdeploy/entrypoints/openai/serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ async def _process_echo_logic(self, request, idx, res_outputs):

def calc_finish_reason(self, max_tokens, token_num, output, tool_called):
if max_tokens is None or token_num != max_tokens:
if tool_called or output.get("tool_call"):
if tool_called or output.get("tool_calls"):
return "tool_calls"
else:
return "stop"
Expand Down Expand Up @@ -554,9 +554,9 @@ async def completion_stream_generator(
text=output["text"],
prompt_token_ids=None,
completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
tool_calls=None,
tool_calls=output["tool_calls"],
completion_tokens=output.get("completion_tokens") if request.return_token_ids else None,
reasoning_content="",
reasoning_content=output["reasoning_content"],
arrival_time=arrival_time,
logprobs=logprobs_res,
prompt_logprobs=(
Expand All @@ -565,15 +565,12 @@ async def completion_stream_generator(
draft_logprobs=draft_logprobs_res,
speculate_metrics=output_speculate_metrics,
)
if not res["finished"] and output["enable_parser"]:
delta_message_output = output["delta_message"]
if delta_message_output is None:
continue
delta_message.text = delta_message_output.content or ""
delta_message.reasoning_content = delta_message_output.reasoning_content or ""
if delta_message_output.tool_calls:
delta_message.tool_calls = delta_message_output.tool_calls
tool_called[idx] = True

if output["tool_calls"] is not None:
tool_called[idx] = True

if output["skipped"]:
continue

choices.append(delta_message)

Expand Down Expand Up @@ -740,7 +737,7 @@ def request_output_to_completion_response(
else None
),
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call", None),
tool_calls=output.get("tool_calls"),
logprobs=aggregated_logprobs,
draft_logprobs=aggregated_draft_logprobs,
prompt_logprobs=clamp_prompt_logprobs(prompt_logprobs_res),
Expand Down
3 changes: 3 additions & 0 deletions fastdeploy/entrypoints/openai/tool_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
# limitations under the License.
"""

from fastdeploy.plugins import load_tool_parser_plugins

from .abstract_tool_parser import ToolParser, ToolParserManager
from .ernie_45_vl_thinking_tool_parser import Ernie45VLThinkingToolParser
from .ernie_x1_tool_parser import ErnieX1ToolParser

__all__ = ["ToolParser", "ToolParserManager", "ErnieX1ToolParser", "Ernie45VLThinkingToolParser"]
load_tool_parser_plugins()
37 changes: 21 additions & 16 deletions fastdeploy/input/ernie4_5_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
tool_parser = self.tool_parser_obj(self.tokenizer)
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
if tool_call_info.tools_called:
response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls
response_dict["outputs"]["text"] = tool_call_info.content
response_dict["outputs"]["completion_tokens"] = full_text
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
Expand Down Expand Up @@ -369,7 +369,11 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
response_dict["outputs"]["text"] = delta_text
response_dict["outputs"]["completion_tokens"] = delta_text
response_dict["outputs"]["skipped"] = False
response_dict["outputs"]["tool_calls"] = None
response_dict["outputs"]["reasoning_content"] = ""
if self.reasoning_parser:
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts,
Expand All @@ -380,19 +384,15 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
self.model_status_dict[req_id],
)
response_dict["outputs"]["enable_parser"] = True
response_dict["outputs"]["delta_message"] = reasoning_delta_message
reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
response_dict["outputs"]["reasoning_content"] = reasoning_content
response_dict["outputs"]["text"] = (
reasoning_delta_message.content or ""
if reasoning_delta_message and hasattr(reasoning_delta_message, "content")
else ""
)
else:
response_dict["outputs"]["text"] = delta_text
if reasoning_delta_message:
reasoning_content = reasoning_delta_message.reasoning_content
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
response_dict["outputs"]["reasoning_content"] = reasoning_content or ""
response_dict["outputs"]["text"] = reasoning_delta_message.content or ""
else:
if not is_end:
response_dict["outputs"]["skipped"] = True
if self.tool_parser_obj:
response_dict["outputs"]["enable_parser"] = True
if req_id not in self.tool_parser_dict:
Expand All @@ -407,8 +407,13 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
response_dict,
)
if tool_call_delta_message is None or tool_call_delta_message.tool_calls:
response_dict["outputs"]["delta_message"] = tool_call_delta_message
if tool_call_delta_message:
if tool_call_delta_message.tool_calls:
response_dict["outputs"]["text"] = tool_call_delta_message.content
response_dict["outputs"]["tool_calls"] = tool_call_delta_message.tool_calls
else:
if not is_end:
response_dict["outputs"]["skipped"] = True

if is_end:
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
Expand Down
32 changes: 23 additions & 9 deletions fastdeploy/input/text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
tool_parser = self.tool_parser_obj(self.tokenizer)
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
if tool_call_info.tools_called:
response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
response_dict["outputs"]["tool_calls"] = tool_call_info.tool_calls
response_dict["outputs"]["text"] = tool_call_info.content
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
Expand All @@ -469,7 +469,11 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
if token_ids[-1] in self.eos_token_ids:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
response_dict["outputs"]["text"] = delta_text
response_dict["outputs"]["completion_tokens"] = delta_text
response_dict["outputs"]["skipped"] = False
response_dict["outputs"]["tool_calls"] = None
response_dict["outputs"]["reasoning_content"] = ""
if self.reasoning_parser:
response_dict["outputs"]["enable_parser"] = True
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
Expand All @@ -481,16 +485,21 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
self.model_status_dict[req_id],
)
response_dict["outputs"]["delta_message"] = reasoning_delta_message
reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
if reasoning_delta_message:
reasoning_content = reasoning_delta_message.reasoning_content
reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
response_dict["outputs"]["reasoning_content"] = reasoning_content or ""
response_dict["outputs"]["text"] = reasoning_delta_message.content or ""
else:
if not is_end:
response_dict["outputs"]["skipped"] = True
if self.tool_parser_obj:
response_dict["outputs"]["enable_parser"] = True
if req_id not in self.tool_parser_dict:
self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
tool_parser = self.tool_parser_dict[req_id]
tool_call = tool_parser.extract_tool_calls_streaming(
tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
previous_texts,
previous_texts + delta_text,
delta_text,
Expand All @@ -499,9 +508,14 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids,
response_dict,
)
if tool_call is None or tool_call.tool_calls:
response_dict["outputs"]["delta_message"] = tool_call
response_dict["outputs"]["text"] = delta_text
if tool_call_delta_message:
if tool_call_delta_message.tool_calls:
response_dict["outputs"]["text"] = tool_call_delta_message.content
response_dict["outputs"]["tool_calls"] = tool_call_delta_message.tool_calls
else:
if not is_end:
response_dict["outputs"]["skipped"] = True

if is_end:
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
Expand Down
2 changes: 2 additions & 0 deletions fastdeploy/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
from .model_runner import load_model_runner_plugins
from .reasoning_parser import load_reasoning_parser_plugins
from .token_processor import load_token_processor_plugins
from .tool_parser import load_tool_parser_plugins

__all__ = [
"load_model_register_plugins",
"load_model_runner_plugins",
"load_input_processor_plugins",
"load_reasoning_parser_plugins",
"load_token_processor_plugins",
"load_tool_parser_plugins",
]
34 changes: 34 additions & 0 deletions fastdeploy/plugins/tool_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from fastdeploy.plugins.utils import load_plugins_by_group

# Guard flag so a single process loads tool-parser plugins only once,
# even if load_tool_parser_plugins() is called from multiple import paths.
plugins_loaded = False
# Entry-point group name external packages use to register tool parsers.
PLUGINS_GROUP = "fastdeploy.tool_parser_plugins"


def load_tool_parser_plugins():
    """Discover and activate all registered tool-parser plugins.

    Idempotent: the module-level ``plugins_loaded`` flag ensures the
    entry-point group is scanned at most once per process. Each loaded
    plugin is a callable that registers itself when invoked.
    """
    global plugins_loaded
    if plugins_loaded:
        return
    plugins_loaded = True

    # General plugins: activation is just calling each loaded function.
    for activate in load_plugins_by_group(group=PLUGINS_GROUP).values():
        activate()
Loading
Loading