Mirror of https://github.com/fauxpilot/fauxpilot.git, synced 2025-07-07 13:31:55 -07:00
Add python backend support

- Modify Dockerfile to include bitsandbytes, transformers and the latest version of PyTorch
- Minor modifications in utils/codegen.py so that the same client works with FT and the Py-backend (sketched below)
- Minor modifications in launch.sh (no need to name models by GPU)
- Add installation script for adding a new python model (with a super simple config_template)
- Modify setup.sh so that it works with both FT and Python backend models

Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>
This commit is contained in:
parent 9b2bc84670
commit 01f1cbb629
9 changed files with 487 additions and 73 deletions
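
The codegen.py change that lets one client talk to both backends is that the proxy now reads the target model from the request body instead of hard-coding "fastertransformer". A minimal sketch of what such a request might look like, assuming FauxPilot's OpenAI-style completions endpoint on localhost:5000 (the port, the path, and the model name py-codegen are illustrative assumptions, not taken from this commit):

import requests

API = "http://localhost:5000/v1/engines/codegen/completions"  # assumed endpoint

def complete(model_name: str, prompt: str) -> dict:
    # Identical payload for both backends; only the "model" field changes.
    # Names starting with "py-" are treated as python-backend models,
    # anything else (e.g. "fastertransformer") as FasterTransformer.
    resp = requests.post(API, json={
        "model": model_name,
        "prompt": prompt,
        "n": 1,
        "max_tokens": 16,
    })
    return resp.json()

print(complete("fastertransformer", "def fib(n):"))
print(complete("py-codegen", "def fib(n):"))  # hypothetical python-backend model

The diff below is where that routing happens.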
utils/codegen.py

@@ -70,17 +70,21 @@ class CodeGenProxy:
         return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
 
     def generate(self, data):
-        model_name = "fastertransformer"
         prompt = data['prompt']
         n = data.get('n', 1)
+        model_name = data["model"]
+        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
+        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
+        np_type = np.int32 if model_name.startswith("py-") else np.uint32
+
         input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
-        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
+        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
         prompt_len = input_start_ids.shape[1]
-        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         max_tokens = data.get('max_tokens', 16)
         if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
             raise ValueError("Max tokens + prompt length exceeds maximum model length")
-        output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
+        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
         num_logprobs = data.get('logprobs', -1)
         if num_logprobs is None:
             num_logprobs = 1
@@ -95,7 +99,7 @@ class CodeGenProxy:
 
         top_p = data.get('top_p', 1.0)
         frequency_penalty = data.get('frequency_penalty', 1.0)
-        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
@@ -103,9 +107,9 @@ class CodeGenProxy:
         len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
-        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
-        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
-        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
+        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
+        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
 
         stop_words = data.get('stop', [])
         if stop_words is None:
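
The np_type switch is what actually reaches Triton: the client library derives the wire datatype from the numpy dtype of the arrays it is handed, so casting to int32 versus uint32 on the client side is how each backend receives the tensor type it expects. A minimal sketch of that mechanism, assuming the tritonclient package is installed; prepare_tensor here mirrors the helper style used in codegen.py rather than reproducing it verbatim:

import numpy as np
import tritonclient.grpc as client_util
from tritonclient.utils import np_to_triton_dtype

def np_type_for(model_name: str):
    # HuggingFace models (python backend, "py-" prefix) want int32 token ids,
    # while FasterTransformer expects uint32 -- the "ugly hack" from the diff.
    return np.int32 if model_name.startswith("py-") else np.uint32

def prepare_tensor(name: str, arr: np.ndarray):
    # Triton infers the tensor's datatype from the numpy dtype, so the
    # astype(np_type) casts in generate() propagate all the way to the server.
    t = client_util.InferInput(name, list(arr.shape), np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t

ids = np.array([[17, 42, 99]]).astype(np_type_for("py-codegen"))
tensor = prepare_tensor("input_ids", ids)  # INT32 for the python backend

Doing the cast on the client means no dtype conversion has to happen inside the model, which is the inefficiency the diff's comment is avoiding.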