Merge branch 'main' into python_backend

Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>

Commit f0a12b5e8e: 10 changed files with 145 additions and 13 deletions
.editorconfig (new file, 49 lines)
@@ -0,0 +1,49 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+tab_width = 4
+# end_of_line = crlf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = false
+
+# Markdown
+[*.{md}]
+indent_style = space
+indent_size = 2
+insert_final_newline = true
+
+# Serialized data
+[*.{yml,yaml,json,pbtxt}]
+indent_style = space
+indent_size = 2
+insert_final_newline = true
+
+# Shell script
+[*.{sh,bash,bashrc,zsh,fish,ksh,csh}]
+indent_style = space
+indent_size = 4
+insert_final_newline = true
+
+# Python
+[*.py]
+indent_style = space
+indent_size = 4
+insert_final_newline = true
+
+# Environment
+[*.env]
+insert_final_newline = false
+
+# Python requirements
+[requirements.txt]
+insert_final_newline = true
+
+# Dockerfile
+[Dockerfile]
+insert_final_newline = true
@@ -1,7 +1,11 @@
 # FauxPilot
 
 This is an attempt to build a locally hosted version of [GitHub Copilot](https://copilot.github.com/). It uses the [SalesForce CodeGen](https://github.com/salesforce/CodeGen) models inside of NVIDIA's [Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server) with the [FasterTransformer backend](https://github.com/triton-inference-server/fastertransformer_backend/).
 
+<p align="right">
+ <img width="50%" align="right" src="https://user-images.githubusercontent.com/34380/199604776-e08ce261-48d1-4d09-9163-f5ab2f904e55.png">
+</p>
+
 ## Prerequisites
 
@@ -1,11 +1,17 @@
+import logging
 import os
 
 import uvicorn
-from fastapi import FastAPI, Response
+from fastapi import FastAPI, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette.sse import EventSourceResponse
 
+from config.log_config import uvicorn_logger
 from models import OpenAIinput
 from utils.codegen import CodeGenProxy
+from utils.errors import FauxPilotException
+
+logging.config.dictConfig(uvicorn_logger)
 
 codegen = CodeGenProxy(
     host=os.environ.get("TRITON_HOST", "triton"),
@@ -21,22 +27,37 @@ app = FastAPI(
     swagger_ui_parameters={"defaultModelsExpandDepth": -1}
 )
 
 
+@app.exception_handler(FauxPilotException)
+async def fauxpilot_handler(request: Request, exc: FauxPilotException):
+    return JSONResponse(
+        status_code=400,
+        content=exc.json()
+    )
+
+
-@app.post("/v1/engines/codegen/completions", status_code=200)
-@app.post("/v1/completions", status_code=200)
+@app.post("/v1/engines/codegen/completions")
+@app.post("/v1/completions")
 async def completions(data: OpenAIinput):
     data = data.dict()
-    print(data)
+    try:
+        content = codegen(data=data)
+    except codegen.TokensExceedsMaximum as E:
+        raise FauxPilotException(
+            message=str(E),
+            type="invalid_request_error",
+            param=None,
+            code=None,
+        )
+
     if data.get("stream") is not None:
         return EventSourceResponse(
-            content=codegen(data=data),
+            content=content,
             status_code=200,
             media_type="text/event-stream"
         )
     else:
         return Response(
             status_code=200,
-            content=codegen(data=data),
+            content=content,
             media_type="application/json"
         )
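The net effect of this hunk is that an oversized request now comes back as a structured HTTP 400 instead of an unhandled error. A minimal client-side sketch of what that looks like, assuming the proxy listens on localhost:5000 (the API_PORT from the env file removed later in this diff) and that the requests package is available:

import requests  # assumed available; any HTTP client would do

# Deliberately ask for more completion tokens than the 2048-token budget allows,
# so the new FauxPilotException handler fires and returns a 400 with exc.json().
resp = requests.post(
    "http://localhost:5000/v1/completions",
    json={"prompt": "def fib(n):", "max_tokens": 4096},
)
if resp.status_code == 400:
    err = resp.json()["error"]
    print(err["type"], err["message"])  # invalid_request_error plus the explanation string
else:
    print(resp.text)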
copilot_proxy/config/__init__.py (new empty file)

copilot_proxy/config/log_config.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# The uvicorn_logger is used to add timestamps
+
+uvicorn_logger = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "access": {
+            "()": "uvicorn.logging.AccessFormatter",
+            "fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
+            "use_colors": True
+        },
+    },
+    "handlers": {
+        "access": {
+            "formatter": "access",
+            "class": "logging.StreamHandler",
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "uvicorn.access": {
+            "handlers": ["access"],
+            # "level": "INFO",
+            "propagate": False
+        },
+    },
+}
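This dict is a standard logging dictConfig schema; the FastAPI app in this commit applies it with logging.config.dictConfig(uvicorn_logger). A minimal sketch of doing the same when launching uvicorn by hand, assuming it is run from the copilot_proxy directory so config.log_config and app:app resolve:

import logging.config

import uvicorn

from config.log_config import uvicorn_logger

# Install the timestamped access-log format, then start the proxy app.
logging.config.dictConfig(uvicorn_logger)
uvicorn.run("app:app", host="0.0.0.0", port=5000)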
@@ -21,6 +21,9 @@ class CodeGenProxy:
         # Max number of tokens the model can handle
         self.MAX_MODEL_LEN = 2048
 
+    class TokensExceedsMaximum(Exception):
+        pass
+
     @staticmethod
     def prepare_tensor(name: str, tensor_input):
         t = client_util.InferInput(
@@ -82,8 +85,15 @@ class CodeGenProxy:
         prompt_len = input_start_ids.shape[1]
         input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         max_tokens = data.get('max_tokens', 16)
-        if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
-            raise ValueError("Max tokens + prompt length exceeds maximum model length")
+        prompt_tokens: int = input_len[0][0]
+        requested_tokens = max_tokens + prompt_tokens
+        if requested_tokens > self.MAX_MODEL_LEN:
+            print(1)
+            raise self.TokensExceedsMaximum(
+                f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
+                f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
+                f"Please reduce your prompt; or completion length."
+            )
         output_len = np.ones_like(input_len).astype(np_type) * max_tokens
         num_logprobs = data.get('logprobs', -1)
         if num_logprobs is None:
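As a worked example of the new check: with MAX_MODEL_LEN = 2048, a 2000-token prompt plus max_tokens = 100 asks for 2100 tokens in total, so TokensExceedsMaximum is raised, while 1900 + 100 still fits. A self-contained sketch of the same arithmetic (names mirror the diff, but this stub does not depend on Triton or the real CodeGenProxy):

MAX_MODEL_LEN = 2048  # same budget as CodeGenProxy in this commit

def check_budget(prompt_tokens: int, max_tokens: int) -> int:
    # Mirrors the added check: prompt tokens plus requested completion tokens
    # must not exceed the model's context length.
    requested_tokens = prompt_tokens + max_tokens
    if requested_tokens > MAX_MODEL_LEN:
        raise ValueError(
            f"This model's maximum context length is {MAX_MODEL_LEN}, however you requested "
            f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion)."
        )
    return requested_tokens

print(check_budget(1900, 100))   # 2000, within budget
# check_budget(2000, 100)        # 2100 > 2048, raises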
copilot_proxy/utils/errors.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from typing import *
+
+class FauxPilotException(Exception):
+    def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
+        super().__init__(message)
+        self.message = message
+        self.type = type
+        self.param = param
+        self.code = code
+
+    def json(self):
+        return {
+            'error': {
+                'message': self.message,
+                'type': self.type,
+                'param': self.param,
+                'code': self.code
+            }
+        }
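For reference, a quick sketch of the payload json() produces, which is exactly what the exception handler in the proxy app sends back as the 400 body (the message text here is illustrative):

exc = FauxPilotException(
    "requested 2100 tokens but the maximum context length is 2048",
    type="invalid_request_error",
)
print(exc.json())
# {'error': {'message': 'requested 2100 tokens but the maximum context length is 2048',
#            'type': 'invalid_request_error', 'param': None, 'code': None}}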
@@ -64,3 +64,9 @@ And you should be able to use Copilot with your own locally hosted suggestions!
 Another issue with using the Copilot plugin is that its tokenizer (the component that turns text into a sequence of integers for the model) is slightly different from the one used by CodeGen, so the plugin will sometimes send a request that is longer than CodeGen can handle. You can work around this by replacing the `vocab.bpe` and `tokenizer.json` found in the Copilot extension (something like `.vscode/extensions/github.copilot-[version]/dist/`) with the ones found [here](https://github.com/moyix/fauxpilot/tree/main/copilot_proxy/cgtok/openai_format).
 
 Have fun!
+
+## GitLab - VS Code extension
+
+Another option is to use the [GitLab VS Code extension](https://marketplace.visualstudio.com/items?itemName=GitLab.gitlab-workflow) which has support for FauxPilot.
+
+Contributions are encouraged :smile: https://gitlab.com/gitlab-org/gitlab-vscode-extension
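Regarding the tokenizer workaround quoted above, a hypothetical helper that copies the CodeGen-compatible files over the ones shipped with the Copilot extension; the extension directory varies by version, so EXT_DIR below is a placeholder you must fill in yourself, and SRC_DIR assumes a local fauxpilot checkout:

import pathlib
import shutil

# Placeholder paths: adjust for your machine and Copilot extension version.
SRC_DIR = pathlib.Path("copilot_proxy/cgtok/openai_format")
EXT_DIR = pathlib.Path.home() / ".vscode/extensions/github.copilot-<version>/dist"

for name in ("vocab.bpe", "tokenizer.json"):
    shutil.copy2(SRC_DIR / name, EXT_DIR / name)  # overwrite the extension's tokenizer files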
@@ -1,4 +0,0 @@
-TRITON_HOST=triton
-TRITON_PORT=8001
-API_HOST=0.0.0.0
-API_PORT=5000
@@ -160,5 +160,5 @@ def test_python_backend(n_gpus: int):
 
     # killing docker-compose process doesn't bring down the containers.
     # explicitly stop the containers:
-    subprocess.run(["docker-compose", "-f", compose_file, "down"], cwd=curdir, check=True)
+    subprocess.run(["docker-compose", "-f", compose_file, "down"], cwd=curdir, check=True, env=load_test_env())