feat: improve tools to include name and add tests (#1693)

This PR makes tool calling aware of the name of the function selected. Fixes: https://github.com/huggingface/text-generation-inference/issues/1657. Thank you @puppetm4st3r for the helpful snippets; large parts of this PR are simply refactors of the code shared 🙏 **Opening as a draft PR because small tweaks are needed before merging.**

feat: improve tools to include name and add tests (#1693)
This PR makes tool calling aware of the name of the function selected. Fixes: https://github.com/huggingface/text-generation-inference/issues/1657. Thank you @puppetm4st3r for the helpful snippets; large parts of this PR are simply refactors of the code shared 🙏 **Opening as a draft PR because small tweaks are needed before merging.**
7276d434 · drbh · GitHub · 88702d87 · 7276d434 · 7276d434
Unverified Commit 7276d434 authored Apr 16, 2024 by drbh Committed by GitHub Apr 16, 2024
11 changed files
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json
@@ -13,7 +13,7 @@
      "usage": null
    }
  ],
-  "created": 1710795556,
+  "created": 1712874856,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "text_completion",

--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
@@ -11,13 +11,12 @@
        "tool_calls": [
          {
            "function": {
-              "description": null,
-              "name": "tools",
-              "parameters": {
+              "arguments": {
                "format": "celsius",
-                "location": "New York, NY",
-                "num_days": 14
-              }
+                "location": "Brooklyn"
+              },
+              "description": null,
+              "name": "get_current_weather"
            },
            "id": 0,
            "type": "function"
@@ -27,14 +26,14 @@
      "usage": null
    }
  ],
-  "created": 1710795556,
+  "created": 1712782670,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "text_completion",
  "system_fingerprint": "2.0.0-native",
  "usage": {
-    "completion_tokens": 29,
-    "prompt_tokens": 316,
-    "total_tokens": 345
+    "completion_tokens": 37,
+    "prompt_tokens": 524,
+    "total_tokens": 561
  }
 }
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
@@ -11,13 +11,12 @@
        "tool_calls": [
          {
            "function": {
-              "description": null,
-              "name": "tools",
-              "parameters": {
+              "arguments": {
                "format": "celsius",
-                "location": "New York, NY",
-                "num_days": 14
-              }
+                "location": "Brooklyn"
+              },
+              "description": null,
+              "name": "get_current_weather"
            },
            "id": 0,
            "type": "function"
@@ -27,14 +26,14 @@
      "usage": null
    }
  ],
-  "created": 1710795557,
+  "created": 1712787937,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "text_completion",
  "system_fingerprint": "2.0.0-native",
  "usage": {
-    "completion_tokens": 29,
-    "prompt_tokens": 316,
-    "total_tokens": 345
+    "completion_tokens": 37,
+    "prompt_tokens": 524,
+    "total_tokens": 561
  }
 }
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
@@ -11,12 +11,12 @@
        "tool_calls": [
          {
            "function": {
-              "description": null,
-              "name": "tools",
-              "parameters": {
+              "arguments": {
                "format": "celsius",
                "location": "New York, NY"
-              }
+              },
+              "description": null,
+              "name": "get_current_weather"
            },
            "id": 0,
            "type": "function"
@@ -26,14 +26,14 @@
      "usage": null
    }
  ],
-  "created": 1710795557,
+  "created": 1712852394,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "text_completion",
  "system_fingerprint": "2.0.0-native",
  "usage": {
-    "completion_tokens": 21,
-    "prompt_tokens": 187,
-    "total_tokens": 208
+    "completion_tokens": 48,
+    "prompt_tokens": 320,
+    "total_tokens": 368
  }
 }
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": null,
+        "name": null,
+        "role": "assistant",
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": {
+                "error": "Cannot get current weather forecast from specified location and temperature unit. Please try again with different options."
+              },
+              "description": null,
+              "name": "notify_error"
+            },
+            "id": 0,
+            "type": "function"
+          }
+        ]
+      },
+      "usage": null
+    }
+  ],
+  "created": 1712852597,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "1.4.5-native",
+  "usage": {
+    "completion_tokens": 39,
+    "prompt_tokens": 496,
+    "total_tokens": 535
+  }
+}
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
@@ -19,7 +19,7 @@
      "logprobs": null
    }
  ],
-  "created": 1710795499,
+  "created": 1712788218,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "text_completion",

--- a/integration-tests/models/test_chat_llama.py
+++ b/integration-tests/models/test_chat_llama.py
+import pytest
+import json
+
+from text_generation.types import GrammarType
+
+
+@pytest.fixture(scope="module")
+def flash_llama_chat_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_shard=2, disable_grammar_support=False
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_chat(flash_llama_chat_handle):
+    await flash_llama_chat_handle.health(300)
+    return flash_llama_chat_handle.client
+
+
+@pytest.mark.private
+async def test_flash_llama_simple(flash_llama_chat, response_snapshot):
+    response = await flash_llama_chat.chat(
+        max_tokens=100,
+        seed=1,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Brooklyn, New York?",
+            },
+        ],
+    )
+
+    assert (
+        response.choices[0].message.content
+        == "As of today, there is a Update available for the Brooklyn, New York, area. According to the latest forecast, it's warm with high temperatures throughout the day. It's forecasted at 75°F for today and 77°F for tomorrow. However, in autumn, the weather typically changes drastically, becoming cooler and wetter. You can find the current weather forecast for the area through your local weather service. Additionally"
+    )
+    assert response == response_snapshot
--- a/integration-tests/models/test_tools_llama.py
+++ b/integration-tests/models/test_tools_llama.py
@@ -71,34 +71,7 @@ tools = [
 ]


-@pytest.mark.asyncio
-@pytest.mark.private
-async def test_flash_llama_grammar_no_tools(
-    flash_llama_grammar_tools, response_snapshot
-):
-    response = await flash_llama_grammar_tools.chat(
-        max_tokens=100,
-        seed=1,
-        messages=[
-            {
-                "role": "system",
-                "content": "Youre a helpful assistant! Answer the users question best you can.",
-            },
-            {
-                "role": "user",
-                "content": "What is the weather like in Brooklyn, New York?",
-            },
-        ],
-    )
-
-    assert (
-        response.choices[0].message.content
-        == "As of today, there is a Update available for the Brooklyn, New York, area. According to the latest forecast, it's warm with high temperatures throughout the day. It's forecasted at 75°F for today and 77°F for tomorrow. However, in autumn, the weather typically changes drastically, becoming cooler and wetter. You can find the current weather forecast for the area through your local weather service. Additionally"
-    )
-    assert response == response_snapshot
-
-
-@pytest.mark.skip
+@pytest.mark.skip(reason="Takes too long to run")
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_snapshot):
@@ -121,23 +94,19 @@ async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_sna
    assert response.choices[0].message.content == None
    assert response.choices[0].message.tool_calls == [
        {
+            "id": 0,
+            "type": "function",
            "function": {
                "description": None,
-                "name": "tools",
-                "parameters": {
-                    "format": "celsius",
-                    "location": "New York, NY",
-                    "num_days": 14,
-                },
+                "name": "get_current_weather",
+                "arguments": {"format": "celsius", "location": "New York, NY"},
            },
-            "id": 0,
-            "type": "function",
        }
    ]
    assert response == response_snapshot


-@pytest.mark.skip
+@pytest.mark.skip(reason="Takes too long to run")
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_auto(
@@ -163,23 +132,20 @@ async def test_flash_llama_grammar_tools_auto(
    assert response.choices[0].message.content == None
    assert response.choices[0].message.tool_calls == [
        {
+            "id": 0,
+            "type": "function",
            "function": {
                "description": None,
-                "name": "tools",
-                "parameters": {
-                    "format": "celsius",
-                    "location": "New York, NY",
-                    "num_days": 14,
-                },
+                "name": "get_current_weather",
+                "arguments": {"format": "celsius", "location": "New York, NY"},
            },
-            "id": 0,
-            "type": "function",
        }
    ]
+
    assert response == response_snapshot


-@pytest.mark.skip
+@pytest.mark.skip(reason="Takes too long to run")
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_choice(
@@ -209,15 +175,16 @@ async def test_flash_llama_grammar_tools_choice(
            "type": "function",
            "function": {
                "description": None,
-                "name": "tools",
-                "parameters": {"format": "celsius", "location": "New York, NY"},
+                "name": "get_current_weather",
+                "arguments": {"format": "celsius", "location": "New York, NY"},
            },
        }
    ]
+
    assert response == response_snapshot


-@pytest.mark.skip
+@pytest.mark.skip(reason="Takes too long to run")
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_stream(
@@ -246,5 +213,47 @@ async def test_flash_llama_grammar_tools_stream(
    async for response in responses:
        count += 1

-    assert count == 20
+    assert count == 38
    assert response == response_snapshot
+
+
+@pytest.mark.skip(reason="Takes too long to run")
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools_insufficient_information(
+    flash_llama_grammar_tools, response_snapshot
+):
+    responses = await flash_llama_grammar_tools.chat(
+        max_tokens=100,
+        seed=8,
+        tools=tools,
+        tool_choice="auto",
+        messages=[
+            {
+                "role": "system",
+                "content": "ONLY RESPOND IF THE USER ASKS A WEATHER RELATED QUESTION",
+            },
+            {
+                "role": "user",
+                "content": "Tell me a story about 3 sea creatures",
+            },
+        ],
+        stream=False,
+    )
+
+    assert responses.choices[0].message.content == None
+    assert responses.choices[0].message.tool_calls == [
+        {
+            "function": {
+                "arguments": {
+                    "error": "Cannot get current weather forecast from specified location and temperature unit. Please try again with different options."
+                },
+                "description": None,
+                "name": "notify_error",
+            },
+            "id": 0,
+            "type": "function",
+        }
+    ]
+
+    assert responses == response_snapshot
--- a/router/src/infer.rs
+++ b/router/src/infer.rs
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -79,7 +79,7 @@ impl HubTokenizerConfig {
    }
 }

-#[derive(Clone, Debug, Deserialize, ToSchema)]
+#[derive(Clone, Debug, Deserialize, ToSchema, Serialize)]
 #[serde(tag = "type", content = "value")]
 pub(crate) enum GrammarType {
    /// A string that represents a [JSON Schema](https://json-schema.org/).
@@ -669,7 +669,7 @@ pub(crate) struct ChatRequest {
    #[serde(default = "default_tool_prompt")]
    #[schema(
        nullable = true,
-        example = "\"Based on the conversation, please choose the most appropriate tool to use: \""
+        example = "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\""
    )]
    pub tool_prompt: Option<String>,

@@ -682,7 +682,7 @@ pub(crate) struct ChatRequest {

 fn default_tool_prompt() -> Option<String> {
    Some(
-        "\nBased on the conversation, please choose the most appropriate tool to use: ".to_string(),
+        "\nYou will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n".to_string(),
    )
 }
 #[derive(Clone, Deserialize, ToSchema, Serialize)]
@@ -727,26 +727,26 @@ mod deserialize_tool_choice {
    }
 }

-#[derive(Debug, Deserialize, Serialize, ToSchema)]
+#[derive(Debug, Deserialize, Serialize, ToSchema, PartialEq)]
 pub struct Tools {
    #[serde(flatten)]
    functions_map: FunctionsMap,
    properties: Properties,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq)]
 struct FunctionsMap {
    #[serde(rename = "$functions")]
    functions: std::collections::HashMap<String, serde_json::Value>,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq)]
 struct FunctionRef {
    #[serde(rename = "$ref")]
    ref_path: String,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq)]
 struct Properties {
    #[serde(serialize_with = "serialize_function")]
    function: Vec<FunctionRef>,
@@ -767,7 +767,8 @@ pub(crate) struct FunctionDefinition {
    #[serde(default)]
    pub description: Option<String>,
    pub name: String,
-    pub parameters: serde_json::Value,
+    #[serde(alias = "parameters")]
+    pub arguments: serde_json::Value,
 }

 #[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
@@ -779,12 +780,14 @@ pub(crate) struct Tool {
    pub function: FunctionDefinition,
 }

-#[derive(Clone, Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize, Default)]
 pub(crate) struct ChatTemplateInputs<'a> {
    messages: Vec<Message>,
    bos_token: Option<&'a str>,
    eos_token: Option<&'a str>,
    add_generation_prompt: bool,
+    tools: Option<&'a str>,
+    tools_prompt: Option<&'a str>,
 }

 #[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]

--- a/router/src/server.rs
+++ b/router/src/server.rs
 use crate::config::Config;
 /// HTTP Server logic
 use crate::health::Health;
-use crate::infer::{InferError, InferResponse, InferStreamResponse};
+use crate::infer::{InferError, InferResponse, InferStreamResponse, ToolGrammar};
 use crate::validation::ValidationError;
 use crate::{
    BestOfSequence, Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest,
@@ -15,7 +15,7 @@ use crate::{
    ChatRequest, CompatGenerateRequest, Completion, CompletionComplete, CompletionCompleteChunk,
    CompletionRequest, DeltaToolCall, Function, Tool, VertexRequest, VertexResponse,
 };
-use crate::{FunctionDefinition, FunctionRef, FunctionsMap, Properties, ToolCall, ToolType, Tools};
+use crate::{FunctionDefinition, ToolCall, ToolType};
 use axum::extract::Extension;
 use axum::http::{HeaderMap, Method, StatusCode};
 use axum::response::sse::{Event, KeepAlive, Sse};
@@ -29,7 +29,6 @@ use futures::Stream;
 use futures::TryStreamExt;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
 use serde_json::Value;
-use std::collections::HashMap;
 use std::convert::Infallible;
 use std::net::SocketAddr;
 use std::sync::atomic::AtomicBool;
@@ -757,19 +756,29 @@ async fn chat_completions(
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    metrics::increment_counter!("tgi_request_count");

-    let stream = req.stream;
-    let max_new_tokens = req.max_tokens.or(Some(100));
-    let repetition_penalty = req
-        .presence_penalty
-        // rescale repetition_penalty from (-2.0, 2.0) to (0.0, 4.0)
-        .map(|x| x + 2.0);
-    let logprobs = req.logprobs.unwrap_or(false);
-    let seed = req.seed;
-    let stop = req.stop.unwrap_or_default();
-
-    // apply chat template to flatten the request into a single input
-    let mut inputs = match infer.apply_chat_template(req.messages) {
-        Ok(inputs) => inputs,
+    let ChatRequest {
+        logprobs,
+        max_tokens,
+        messages,
+        presence_penalty,
+        seed,
+        stop,
+        stream,
+        tools,
+        tool_choice,
+        tool_prompt,
+        ..
+    } = req;
+
+    let repetition_penalty = presence_penalty.map(|x| x + 2.0);
+    let max_new_tokens = max_tokens.or(Some(100));
+    let logprobs = logprobs.unwrap_or(false);
+    let tool_prompt = tool_prompt.unwrap_or_default();
+    let stop = stop.unwrap_or_default();
+
+    // extract tool grammar if present
+    let tool_grammar = match ToolGrammar::apply(tools, tool_choice) {
+        Ok(grammar) => grammar,
        Err(err) => {
            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
            tracing::error!("{err}");
@@ -783,60 +792,28 @@ async fn chat_completions(
        }
    };

-    let tool_grammar = if let Some((req_tools, tool_choice)) = req.tools.zip(req.tool_choice) {
-        let tool_prompt = req.tool_prompt.unwrap_or_default();
-        let tools_to_use = match tool_choice {
-            ToolType::FunctionName(name) => {
-                vec![req_tools
-                    .iter()
-                    .find(|tool| tool.function.name == *name)
-                    .ok_or_else(|| {
-                        (
-                            StatusCode::UNPROCESSABLE_ENTITY,
-                            Json(ErrorResponse {
-                                error: "Tool choice not found in tool names".to_string(),
-                                error_type: "Tool not found".to_string(),
-                            }),
-                        )
-                    })?
-                    .clone()]
-            }
-            ToolType::OneOf => req_tools.to_owned(),
-        };
+    let grammar_with_prompt = tool_grammar
+        .as_ref()
+        .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt));

-        let functions: HashMap<String, Value> = tools_to_use
-            .iter()
-            .map(|tool| {
-                let func = tool.function.clone();
-                (func.name, func.parameters)
-            })
-            .collect();
+    let typed_grammar = grammar_with_prompt
+        .as_ref()
+        .map(|(grammar, _)| grammar.clone());

-        let tools = Tools {
-            functions_map: FunctionsMap { functions },
-            properties: Properties {
-                function: tools_to_use
-                    .iter()
-                    .map(|tool| FunctionRef {
-                        ref_path: format!("#/$functions/{}", tool.function.name.clone()),
-                    })
-                    .collect(),
-            },
-        };
-
-        let tools_str = serde_json::to_string(&tools).map_err(|e| {
-            (
+    // apply chat template to flatten the request into a single input
+    let inputs = match infer.apply_chat_template(messages, grammar_with_prompt) {
+        Ok(inputs) => inputs,
+        Err(err) => {
+            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+            tracing::error!("{err}");
+            return Err((
                StatusCode::UNPROCESSABLE_ENTITY,
                Json(ErrorResponse {
-                    error: e.to_string(),
-                    error_type: "Input validation error".to_string(),
+                    error: err.to_string(),
+                    error_type: err.error_type().to_string(),
                }),
-            )
-        })?;
-        inputs = format!("{inputs}{tool_prompt}{tools_str}");
-        Some(GrammarType::Json(serde_json::json!(tools)))
-    } else {
-        None
+            ));
+        }
    };

    // build the request passing some parameters
@@ -860,7 +837,7 @@ async fn chat_completions(
            decoder_input_details: !stream,
            seed,
            top_n_tokens: req.top_logprobs,
-            grammar: tool_grammar.clone(),
+            grammar: typed_grammar,
        },
    };

@@ -943,27 +920,28 @@ async fn chat_completions(
                        }),
                    )
                })?;
-
            let tool_calls = vec![ToolCall {
                id: 0,
                r#type: "function".to_string(),
                function: FunctionDefinition {
                    description: None,
-                    name: "tools".to_string(),
-                    parameters: gen_text_value.get("function").map_or_else(
-                        || {
-                            serde_json::from_str(&generation.generated_text).map_err(|e| {
-                                (
-                                    StatusCode::UNPROCESSABLE_ENTITY,
-                                    Json(ErrorResponse {
-                                        error: e.to_string(),
-                                        error_type: "Input validation error".to_string(),
-                                    }),
-                                )
+                    name: gen_text_value
+                        .get("function")
+                        .and_then(|f| f.get("_name"))
+                        .and_then(|name| name.as_str())
+                        .unwrap_or("default_function_name")
+                        .to_string(),
+                    // Serialize the JSON object obtained from "function" to an escaped JSON string
+                    arguments: gen_text_value
+                        .get("function")
+                        .map(|f| {
+                            let mut f_cloned = f.clone();
+                            if let Value::Object(ref mut props) = f_cloned {
+                                props.remove("_name");
+                            }
+                            f_cloned
                        })
-                        },
-                        |f| Ok(f.clone()),
-                    )?,
+                        .unwrap_or_default(),
                },
            }];
            (Some(tool_calls), None)
@@ -1539,6 +1517,7 @@ impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
            InferError::ValidationError(_) => StatusCode::UNPROCESSABLE_ENTITY,
            InferError::IncompleteGeneration => StatusCode::INTERNAL_SERVER_ERROR,
            InferError::TemplateError(_) => StatusCode::UNPROCESSABLE_ENTITY,
+            InferError::ToolError(_) => StatusCode::UNPROCESSABLE_ENTITY,
        };

        (