Mirror of https://github.com/fauxpilot/fauxpilot.git (synced 2025-08-20 21:34:13 -07:00)

Commit 7ebb56c551: Merge branch 'main' into main
37 changed files with 1156 additions and 121 deletions
copilot_proxy/Dockerfile
@@ -10,4 +10,4 @@ COPY . .
 
 EXPOSE 5000
 
-CMD [ "uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
+CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
copilot_proxy/app.py
@@ -1,11 +1,17 @@
+import logging
 import os
 
 import uvicorn
-from fastapi import FastAPI, Response
+from fastapi import FastAPI, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette.sse import EventSourceResponse
 
+from config.log_config import uvicorn_logger
 from models import OpenAIinput
 from utils.codegen import CodeGenProxy
+from utils.errors import FauxPilotException
 
+logging.config.dictConfig(uvicorn_logger)
+
 codegen = CodeGenProxy(
     host=os.environ.get("TRITON_HOST", "triton"),
@@ -21,24 +27,50 @@ app = FastAPI(
     swagger_ui_parameters={"defaultModelsExpandDepth": -1}
 )
 
+
+@app.exception_handler(FauxPilotException)
+async def fauxpilot_handler(request: Request, exc: FauxPilotException):
+    return JSONResponse(
+        status_code=400,
+        content=exc.json()
+    )
+
+
-@app.post("/v1/engines/codegen/completions", status_code=200)
-@app.post("/v1/completions", status_code=200)
+# Used to support copilot.vim
+@app.get("/copilot_internal/v2/token")
+def get_copilot_token():
+    content = {'token': '1', 'expires_at': 2600000000, 'refresh_in': 900}
+    return JSONResponse(
+        status_code=200,
+        content=content
+    )
+
+
+@app.post("/v1/engines/codegen/completions")
+# Used to support copilot.vim
+@app.post("/v1/engines/copilot-codex/completions")
+@app.post("/v1/completions")
 async def completions(data: OpenAIinput):
     data = data.dict()
     print(data)
+    try:
+        content = codegen(data=data)
+    except codegen.TokensExceedsMaximum as E:
+        raise FauxPilotException(
+            message=str(E),
+            type="invalid_request_error",
+            param=None,
+            code=None,
+        )
+
     if data.get("stream") is not None:
         return EventSourceResponse(
-            content=codegen(data=data),
+            content=content,
             status_code=200,
             media_type="text/event-stream"
         )
     else:
         return Response(
             status_code=200,
-            content=codegen(data=data),
+            content=content,
             media_type="application/json"
         )
 
 if __name__ == "__main__":
-    uvicorn.run("app:app", host=os.environ.get("API_HOST", "0.0.0.0"), port=os.environ.get("API_PORT", 5000))
+    uvicorn.run("app:app", host="0.0.0.0", port=5000)
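A note on the new error path above: a request that trips codegen.TokensExceedsMaximum now comes back as a structured 400 rather than an unhandled server error. Below is a minimal client-side sketch, assuming the proxy is reachable on localhost:5000 and using the third-party requests package; the oversized-prompt values are illustrative, not from this diff.

    import requests

    # An oversized prompt should exceed MAX_MODEL_LEN (2048 tokens) and
    # trigger the FauxPilotException -> JSONResponse path added above.
    resp = requests.post(
        "http://localhost:5000/v1/completions",
        json={"prompt": "x " * 3000, "max_tokens": 100},
    )
    print(resp.status_code)  # expected: 400
    print(resp.json())       # expected shape: {"error": {"message": ..., "type": "invalid_request_error", ...}}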
copilot_proxy/config/__init__.py (new file, 0 lines)
copilot_proxy/config/log_config.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# The uvicorn_logger is used to add timestamps
+
+uvicorn_logger = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "access": {
+            "()": "uvicorn.logging.AccessFormatter",
+            "fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
+            "use_colors": True
+        },
+    },
+    "handlers": {
+        "access": {
+            "formatter": "access",
+            "class": "logging.StreamHandler",
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "uvicorn.access": {
+            "handlers": ["access"],
+            # "level": "INFO",
+            "propagate": False
+        },
+    },
+}
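How this dict is consumed is shown by the logging.config.dictConfig(uvicorn_logger) call added to app.py above; the sketch below applies it standalone. The sample output line is illustrative, not captured from a real run.

    import logging.config

    from config.log_config import uvicorn_logger

    logging.config.dictConfig(uvicorn_logger)
    # uvicorn access-log lines now carry %(asctime)s, roughly:
    # INFO:     2023-05-01 12:00:00,000 :: 127.0.0.1:50404 - "POST /v1/completions HTTP/1.1" 200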
copilot_proxy/models.py
@@ -1,10 +1,10 @@
 from typing import Optional, Union
 
-from pydantic import BaseModel
+from pydantic import BaseModel, constr
 
 
 class OpenAIinput(BaseModel):
-    model: str
+    model: constr(regex="^(fastertransformer|py-model)$") = "fastertransformer"
     prompt: Optional[str]
     suffix: Optional[str]
     max_tokens: Optional[int] = 16
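The constr change turns a wrong model name into a pydantic validation error before the request ever reaches Triton. A quick sketch of the behavior, assuming pydantic v1 (the regex= keyword used here was renamed to pattern= in pydantic v2):

    from pydantic import ValidationError

    from models import OpenAIinput

    print(OpenAIinput(prompt="hi").model)              # "fastertransformer" (the default)
    print(OpenAIinput(prompt="hi", model="py-model"))  # accepted by the regex

    try:
        OpenAIinput(prompt="hi", model="gpt-4")
    except ValidationError as err:
        print(err)  # string does not match regex "^(fastertransformer|py-model)$"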
copilot_proxy/utils/codegen.py
@@ -6,14 +6,14 @@ import time
 import numpy as np
 import tritonclient.grpc as client_util
 from tokenizers import Tokenizer
-from tritonclient.utils import np_to_triton_dtype
+from tritonclient.utils import np_to_triton_dtype, InferenceServerException
 
 np.finfo(np.dtype("float32"))
 np.finfo(np.dtype("float64"))
 
 
 class CodeGenProxy:
-    def __init__(self, host: str = 'localhost', port: int = 8001, verbose: bool = False):
+    def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
         self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
         self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
         self.PAD_CHAR = 50256
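With the default host change above, a bare CodeGenProxy() resolves the docker-compose service name instead of localhost. A sketch; note it is only meaningful inside the container image, since the constructor loads the tokenizer from the hard-coded /python-docker path:

    from utils.codegen import CodeGenProxy

    proxy = CodeGenProxy()                   # now targets triton:8001 by default
    local = CodeGenProxy(host="localhost")   # the old default, now explicit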
@@ -21,6 +21,9 @@ class CodeGenProxy:
         # Max number of tokens the model can handle
         self.MAX_MODEL_LEN = 2048
 
+    class TokensExceedsMaximum(Exception):
+        pass
+
     @staticmethod
     def prepare_tensor(name: str, tensor_input):
         t = client_util.InferInput(
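Nesting the exception inside CodeGenProxy lets callers reference it through the class or through an instance, which is how app.py catches it (except codegen.TokensExceedsMaximum). A minimal sketch:

    from utils.codegen import CodeGenProxy

    try:
        raise CodeGenProxy.TokensExceedsMaximum("requested 4096 tokens, limit is 2048")
    except CodeGenProxy.TokensExceedsMaximum as err:
        print(err)  # the message string, surfaced verbatim in the API error body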
@@ -70,20 +73,31 @@ class CodeGenProxy:
         return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
 
     def generate(self, data):
-        model_name = "fastertransformer"
         prompt = data['prompt']
         n = data.get('n', 1)
+        model_name = data["model"]
+        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
+        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
+        np_type = np.int32 if model_name.startswith("py-") else np.uint32
 
         input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
-        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
+        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
         prompt_len = input_start_ids.shape[1]
-        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         max_tokens = data.get('max_tokens', 16)
-        if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
-            raise ValueError("Max tokens + prompt length exceeds maximum model length")
-        output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
+        prompt_tokens: int = input_len[0][0]
+        requested_tokens = max_tokens + prompt_tokens
+        if requested_tokens > self.MAX_MODEL_LEN:
+            print(1)
+            raise self.TokensExceedsMaximum(
+                f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
+                f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
+                f"Please reduce your prompt; or completion length."
+            )
+        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
         num_logprobs = data.get('logprobs', -1)
         if num_logprobs is None:
-            num_logprobs = 1
+            num_logprobs = -1
         want_logprobs = num_logprobs > 0
 
         temperature = data.get('temperature', 0.2)
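The np_type switch in isolation: per the comment in the diff, the HuggingFace-backed "py-" models take int32 token ids while fastertransformer expects uint32. A standalone sketch:

    import numpy as np

    for model_name in ("py-model", "fastertransformer"):
        np_type = np.int32 if model_name.startswith("py-") else np.uint32
        ids = np.asarray([[17, 42, 99]]).astype(np_type)
        print(model_name, ids.dtype)  # py-model int32, fastertransformer uint32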
@@ -95,7 +109,7 @@ class CodeGenProxy:
         top_p = data.get('top_p', 1.0)
         frequency_penalty = data.get('frequency_penalty', 1.0)
-        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
|
@ -103,9 +117,9 @@ class CodeGenProxy:
|
|||
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
||||
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
||||
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
|
||||
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
|
||||
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
|
||||
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
|
||||
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
|
||||
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
||||
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
||||
|
||||
stop_words = data.get('stop', [])
|
||||
if stop_words is None:
|
||||
|
@@ -220,8 +234,8 @@ class CodeGenProxy:
         for c in choices:
             completion['id'] = self.random_completion_id()
             completion['choices'] = [c]
-            yield f'data: {json.dumps(completion)}\n\n'
-        yield 'data: [DONE]\n\n'
+            yield f'{json.dumps(completion)}'
+        yield '[DONE]'
 
     def non_streamed_response(self, completion, choices) -> str:
         completion['id'] = self.random_completion_id()
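The dropped "data: " prefix and trailing blank line are not lost: sse_starlette's EventSourceResponse adds that SSE framing around each item the generator yields, so keeping them would have double-framed every event. A sketch with a stand-in generator (fake_stream and its payloads are hypothetical):

    from sse_starlette.sse import EventSourceResponse

    def fake_stream():
        yield '{"id": "cmpl-1", "choices": []}'  # sent on the wire as: data: {"id": ...}
        yield '[DONE]'                           # sent on the wire as: data: [DONE]

    response = EventSourceResponse(content=fake_stream(), media_type="text/event-stream")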
@@ -230,7 +244,19 @@ class CodeGenProxy:
 
     def __call__(self, data: dict):
         st = time.time()
-        completion, choices = self.generate(data)
+        try:
+            completion, choices = self.generate(data)
+        except InferenceServerException as exc:
+            # status: unavailable -- this happens if the `model` string is invalid
+            print(exc)
+            if exc.status() == 'StatusCode.UNAVAILABLE':
+                print(
+                    f"WARNING: Model '{data['model']}' is not available. Please ensure that "
+                    "`model` is set to either 'fastertransformer' or 'py-model' depending on "
+                    "your installation"
+                )
+            completion = {}
+            choices = []
         ed = time.time()
         print(f"Returned completion in {(ed - st) * 1000} ms")
         if data.get('stream', False):
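The InferenceServerException guard means an unknown model name now degrades to an empty result instead of crashing the request. A sketch of the observable behavior, assuming a reachable Triton server and the container's tokenizer path:

    from utils.codegen import CodeGenProxy

    proxy = CodeGenProxy(host="triton")
    result = proxy({"model": "no-such-model", "prompt": "def add(", "stream": False})
    # Triton reports StatusCode.UNAVAILABLE for the unknown model; __call__ prints
    # the WARNING above and builds the response from completion={} and choices=[].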
copilot_proxy/utils/errors.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from typing import *
+
+
+class FauxPilotException(Exception):
+    def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
+        super().__init__(message)
+        self.message = message
+        self.type = type
+        self.param = param
+        self.code = code
+
+    def json(self):
+        return {
+            'error': {
+                'message': self.message,
+                'type': self.type,
+                'param': self.param,
+                'code': self.code
+            }
+        }
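The json() shape is exactly what the app.py exception handler returns as the 400 body. A quick check:

    from utils.errors import FauxPilotException

    exc = FauxPilotException("too many tokens", type="invalid_request_error")
    print(exc.json())
    # {'error': {'message': 'too many tokens', 'type': 'invalid_request_error',
    #            'param': None, 'code': None}}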