Add python backend support

- Modify dockerfile to include bitsandbytes, transformers and latest version of pytorch - Minor modifications in utils/codegen.py so that same client works with FT and Py-backend - Minor modifications in launch.sh (no need to name models by GPU) - Add installation script for adding a new python model (with super simple config_template) - Modify setup.sh so that it works with both FT and Python backend models Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>
2025-07-08 05:51:26 -07:00 · 2022-10-16 22:05:00 -05:00 · 2022-10-16 22:05:00 -05:00 · 01f1cbb629
commit 01f1cbb629
parent 9b2bc84670
9 changed files with 487 additions and 73 deletions
--- a/python_backend/config_template.pbtxt
+++ b/python_backend/config_template.pbtxt
@ -0,0 +1,180 @@
+# Triton model configuration template for the Python backend.
+# init_model.py renders this file with string.Template and writes the result
+# as config.pbtxt; the brace-delimited placeholders in the parameters section
+# at the bottom are filled in from its command-line arguments.
+# NOTE: do not use a dollar sign in comments here -- string.Template would
+# try to interpret it as a placeholder.
+name: "py-model"
+backend: "python"
+max_batch_size: 4
+# Request inputs. model.py reads input_ids, input_lengths, request_output_len,
+# runtime_top_k, runtime_top_p and temperature; the entries marked UNUSED are
+# declared but not read by model.py (the commit message states the goal is
+# that the same client works with both the FT and the Python backend).
+input [
+  {
+    name: "input_ids"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    # UNUSED
+    name: "start_id"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "end_id"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "input_lengths"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+  },
+  {
+    name: "request_output_len"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "runtime_top_k"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "temperature"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "len_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "repetition_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "random_seed"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "is_return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "beam_width"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "bad_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+  },
+  {
+    # UNUSED
+    name: "stop_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+  }
+]
+# Response outputs produced by model.py.
+output [
+  {
+    # Generated token ids. model.py inserts a middle dimension of size 1
+    # (the client expects batch x beam_width x seq_len).
+    name: "output_ids"
+    data_type: TYPE_INT32
+    dims: [ -1, -1 ]
+  },
+  {
+    # Per-sequence count of tokens that differ from the model's EOS token id.
+    name: "sequence_length"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  } #,
+# Following is currently unsupported, but should be supported in the future
+#  {
+#    name: "cum_log_probs"
+#    data_type: TYPE_FP32
+#    dims: [ -1 ]
+#  },
+#  {
+#    name: "output_log_probs"
+#    data_type: TYPE_FP32
+#    dims: [ -1, -1 ]
+#  }
+]
+# instance_group tells Triton how many execution instances of this model to
+# create and on which kind of device to place them; here: one CPU-kind instance.
+# NOTE(review): model.py passes device_map="auto" when use_auto_device_map is
+# enabled, so the weights may presumably still end up on a GPU -- confirm that
+# KIND_CPU is the intended setting.
+instance_group [
+  {
+    count: 1
+    kind: KIND_CPU
+  }
+]
+# Settings that model.py reads from model_config["parameters"].
+# init_model.py fills in every placeholder below when it renders this template.
+# use_half is now a placeholder as well: it used to be hard-coded to "1", which
+# silently ignored the --use_half option that init_model.py already passes in.
+parameters {
+  key: "use_half"
+  value: {
+    string_value: "${use_half}" # e.g. "0" or "1"
+  }
+}
+parameters {
+  key: "model_name"
+  value: {
+    string_value: "${model_name}" # e.g. "codegen-350M-multi"
+  }
+}
+parameters {
+  key: "org_name"
+  value: {
+    string_value: "${org_name}" # e.g. "Salesforce"
+  }
+}
+parameters {
+  key: "use_int8"
+  value: {
+    string_value: "${use_int8}" # e.g. "0" or "1"
+  }
+}
+parameters {
+  key: "use_auto_device_map"
+  value: {
+    string_value: "${use_auto_device_map}" # e.g. "0" or "1"
+  }
+}
--- a/python_backend/init_model.py
+++ b/python_backend/init_model.py
@ -0,0 +1,42 @@
+"""
+A simple script that sets up the model directory of a given model for Triton.
+"""
+
+import argparse
+import shutil
+from pathlib import Path
+from string import Template
+
+SCRIPT_DIR = Path(__file__).parent
+CONFIG_TEMPLATE_PATH = SCRIPT_DIR/'config_template.pbtxt'
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_dir", type=str, required=True)
+parser.add_argument("--model_name", type=str, required=True)
+parser.add_argument("--org_name", type=str, required=True)
+parser.add_argument("--use_half", type=str, default="1")
+parser.add_argument("--use_int8", type=str, default="0")
+parser.add_argument("--use_auto_device_map", type=str, default="1")
+args = parser.parse_args()
+
+
+# Step1: Make model directory
+model_dir_path = Path(args.model_dir)/f"py-{args.model_name}/py-model/1"
+model_dir_path.mkdir(parents=True, exist_ok=True)
+
+# Step 2: copy model.py
+shutil.copy(SCRIPT_DIR/'model.py', model_dir_path/'model.py')
+
+# Step 3: Generate config.pbtxt
+with open(CONFIG_TEMPLATE_PATH, 'r') as f:
+    template = Template(f.read())
+
+config = template.substitute(
+    org_name=args.org_name,
+    model_name=args.model_name,
+    use_half=args.use_half,
+    use_int8=args.use_int8,
+    use_auto_device_map=args.use_auto_device_map,
+)
+with open(model_dir_path/'../config.pbtxt', 'w') as f:
+    f.write(config)
--- a/python_backend/model.py
+++ b/python_backend/model.py
@ -0,0 +1,91 @@
+import json
+
+from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+import triton_python_backend_utils as pb_utils
+from torch.utils.dlpack import to_dlpack, from_dlpack
+import torch
+
+def pb2torch(request, name):
+    tensor = pb_utils.get_input_tensor_by_name(request, name)
+    return from_dlpack(tensor.to_dlpack())
+
+def torch2pb(name, tensor):
+    return pb_utils.Tensor.from_dlpack(name, to_dlpack(tensor))
+
+class TritonPythonModel:
+    def initialize(self, args):
+        self.model_config = model_config = json.loads(args["model_config"])
+        org_name = model_config["parameters"].get("org_name", {"string_value": "Salesforce"})["string_value"]
+        model_name = org_name + "/" + model_config["parameters"]["model_name"]["string_value"]
+
+        get_bool = lambda x: model_config["parameters"][x]["string_value"].lower() in ["1", "true"] 
+
+        is_half = get_bool("use_half")
+        int8 = get_bool("use_int8")  # this will make inference marginally slower, but will allow bigger models to fit in GPU
+        auto_device_map = get_bool("use_auto_device_map")
+
+        print(f"is_half: {is_half}, int8: {int8}, auto_device_map: {auto_device_map}")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if is_half else "auto",
+            load_in_8bit=int8,
+            device_map="auto" if auto_device_map else None,
+            low_cpu_mem_usage=True,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        print(f"Model {model_name} Loaded. Footprint: {self.model.get_memory_footprint()}")
+
+        # set max_batch_size
+        self.max_batch_size = 0 # model_config["max_batch_size"]
+    
+    def execute(self, requests):
+        # TODO: don't just loop over requests. batch them up
+
+        responses = []
+
+        for request in requests:
+            input_ids_torch = pb2torch(request, "input_ids")
+            input_lengths_torch = pb2torch(request, "input_lengths")
+            request_output_len_torch = pb2torch(request, "request_output_len")
+
+            # Attention mask
+            attention_mask = None
+            if input_lengths_torch.min() != input_lengths_torch.max():
+                attention_mask = torch.zeros(input_ids_torch.shape, dtype=torch.long)
+                for i, l in enumerate(input_lengths_torch):
+                    attention_mask[i, :l] = 1
+            
+            # Output length
+            max_new_tokens = request_output_len_torch[0][0]
+
+            top_k = pb_utils.get_input_tensor_by_name(request, "runtime_top_k").as_numpy().tolist()[0]
+            top_p = pb_utils.get_input_tensor_by_name(request, "runtime_top_p").as_numpy().tolist()[0]
+            temperature = pb_utils.get_input_tensor_by_name(request, "temperature").as_numpy().tolist()[0]
+            # n_samples = pb_utils.get_input_tensor_by_name(request, "n")
+            n_samples = 1  # TODO: client doesn't send this yet. instead it duplicates the request n times
+
+            # Generate
+            output_ids = self.model.generate(
+                input_ids=input_ids_torch, attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p, num_return_sequences=n_samples,
+                temperature=temperature,
+            )
+            # assert len(output_ids.shape) == 2, "huggingface format is batch x seq_len"
+            # assert output_ids.shape[0] == input_ids_torch.shape[0], "expecting batch size to match input"
+            output_ids = output_ids.unsqueeze(1) # client wants batch x beam_width x seq_len and we don't support beam_width yet
+
+            # create output tensors
+            out_tensor_pb = torch2pb("output_ids", output_ids)
+
+            # calculate sequence_length
+            sequence_length = torch.zeros(output_ids.shape[:2], dtype=torch.int32)
+            for i in range(output_ids.shape[0]):
+                sequence_length[i, 0] = torch.sum(output_ids[i, 0] != self.model.config.eos_token_id).item()
+            sequence_length_pb = torch2pb("sequence_length", sequence_length)
+
+            # create response
+            response = pb_utils.InferenceResponse([out_tensor_pb, sequence_length_pb])
+            responses.append(response)
+
+        return responses