From 78f52859c4eec625fd654ac3dd019eb5cc169245 Mon Sep 17 00:00:00 2001 From: bsu3338 Date: Sat, 2 Mar 2024 11:42:02 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9A=20docs:=20Separate=20LiteLLM=20and?= =?UTF-8?q?=20Ollama=20Documentation=20(#1948)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Separate LiteLLM and Ollama Documentation * Clarify Ollama Setup * Fix litellm config --- docker-compose.override.yml.example | 66 ++++++ docs/install/configuration/ai_endpoints.md | 52 ++++- docs/install/configuration/litellm.md | 231 +++------------------ docs/install/configuration/ollama.md | 29 +++ 4 files changed, 179 insertions(+), 199 deletions(-) create mode 100644 docs/install/configuration/ollama.md diff --git a/docker-compose.override.yml.example b/docker-compose.override.yml.example index 200f30085..ef1547ba3 100644 --- a/docker-compose.override.yml.example +++ b/docker-compose.override.yml.example @@ -92,3 +92,69 @@ version: '3.4' # meilisearch: # ports: # - 7700:7700 + +# # ADD OLLAMA +# ollama: +# image: ollama/ollama:latest +# deploy: +# resources: +# reservations: +# devices: +# - driver: nvidia +# capabilities: [compute, utility] +# ports: +# - "11434:11434" +# volumes: +# - ./ollama:/root/.ollama + +# # ADD LITELLM BASIC - NEED TO CONFIGURE litellm-config.yaml, ONLY NEED ENV TO ENABLE REDIS FOR CACHING OR LANGFUSE FOR MONITORING +# litellm: +# image: ghcr.io/berriai/litellm:main-latest +# volumes: +# - ./litellm/litellm-config.yaml:/app/config.yaml +# command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] +# environment: +# REDIS_HOST: redis +# REDIS_PORT: 6379 +# REDIS_PASSWORD: RedisChangeMe +# LANGFUSE_PUBLIC_KEY: pk-lf-RandomStringFromLangfuseWebInterface +# LANGFUSE_SECRET_KEY: sk-lf-RandomStringFromLangfuseWebInterface +# LANGFUSE_HOST: http://langfuse-server:3000 + +# # ADD LITELLM CACHING +# redis: +# image: redis:7-alpine +# command: +# - sh +# - -c # this is to evaluate the $REDIS_PASSWORD from the env +# - redis-server --appendonly yes --requirepass $$REDIS_PASSWORD ## $$ because of docker-compose +# environment: +# REDIS_PASSWORD: RedisChangeMe +# volumes: +# - ./redis:/data + +# # ADD LITELLM MONITORING +# langfuse-server: +# image: ghcr.io/langfuse/langfuse:latest +# depends_on: +# - db +# ports: +# - "3000:3000" +# environment: +# - NODE_ENV=production +# - DATABASE_URL=postgresql://postgres:PostgresChangeMe@db:5432/postgres +# - NEXTAUTH_SECRET=ChangeMe +# - SALT=ChangeMe +# - NEXTAUTH_URL=http://localhost:3000 +# - TELEMETRY_ENABLED=${TELEMETRY_ENABLED:-true} +# - NEXT_PUBLIC_SIGN_UP_DISABLED=${NEXT_PUBLIC_SIGN_UP_DISABLED:-false} +# - LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=${LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES:-false} +# db: +# image: postgres +# restart: always +# environment: +# - POSTGRES_USER=postgres +# - POSTGRES_PASSWORD=PostgresChangeMe +# - POSTGRES_DB=postgres +# volumes: +# - ./postgres:/var/lib/postgresql/data diff --git a/docs/install/configuration/ai_endpoints.md b/docs/install/configuration/ai_endpoints.md index f0caad7ab..59ba1bff6 100644 --- a/docs/install/configuration/ai_endpoints.md +++ b/docs/install/configuration/ai_endpoints.md @@ -259,5 +259,55 @@ Some of the endpoints are marked as **Known,** which means they might have speci forcePrompt: false modelDisplayLabel: "together.ai" ``` +## LiteLLM +> LiteLLM API key: master_key value [LiteLLM](./litellm.md) -![image](https://github.com/danny-avila/LibreChat/assets/32828263/fe3eae7a-d157-4f21-bb98-00688f261967) +**Notes:** 
+
+- Reference [LiteLLM](./litellm.md) for configuration.
+
+```yaml
+    - name: "LiteLLM"
+      apiKey: "sk-from-config-file"
+      baseURL: "http://localhost:8000/v1"
+      models:
+        default: ["gpt-3.5-turbo"]
+        fetch: true
+      titleConvo: true
+      titleModel: "gpt-3.5-turbo"
+      summarize: false
+      summaryModel: "gpt-3.5-turbo"
+      forcePrompt: false
+      modelDisplayLabel: "LiteLLM"
+```
+
+## Ollama
+> Ollama API key: Required but ignored - [Ollama OpenAI Compatibility](https://github.com/ollama/ollama/blob/main/docs/openai.md)
+
+**Notes:**
+
+- **Known:** icon provided.
+- **Known issue:** fetching the list of models is not supported. See [Pull Request 2728](https://github.com/ollama/ollama/pull/2728).
+- Download models with the `ollama run` command. See the [Ollama Library](https://ollama.com/library).
+- The example below lists the five most popular models from the Ollama Library as of March 1, 2024.
+
+```yaml
+    - name: "Ollama"
+      apiKey: "ollama"
+      baseURL: "http://localhost:11434/v1/"
+      models:
+        default: [
+          "llama2",
+          "mistral",
+          "codellama",
+          "dolphin-mixtral",
+          "mistral-openorca"
+          ]
+        fetch: false # fetching list of models is not supported
+      titleConvo: true
+      titleModel: "llama2"
+      summarize: false
+      summaryModel: "llama2"
+      forcePrompt: false
+      modelDisplayLabel: "Ollama"
+```
diff --git a/docs/install/configuration/litellm.md b/docs/install/configuration/litellm.md
index 87a2f67ac..04fba6c5e 100644
--- a/docs/install/configuration/litellm.md
+++ b/docs/install/configuration/litellm.md
@@ -1,5 +1,5 @@
 ---
-title: 🚅 LiteLLM and Ollama
+title: 🚅 LiteLLM
 description: Using LibreChat with LiteLLM Proxy
 weight: -7
 ---
@@ -12,12 +12,18 @@ Use **[LiteLLM Proxy](https://docs.litellm.ai/docs/simple_proxy)** for:
 * Authentication & Spend Tracking Virtual Keys
 
 ## Start LiteLLM Proxy Server
-### Pip install litellm
-```shell
-pip install litellm
-```
+### 1. Uncomment the desired sections in docker-compose.override.yml
+The override file contains sections for the LiteLLM features below.
 
-### Create a config.yaml for litellm proxy
+#### Caching with Redis
+LiteLLM supports in-memory, Redis, and S3 caching. Note: caching currently only works with exact matching.
+
+#### Performance Monitoring with Langfuse
+LiteLLM supports various logging and observability options. The settings below enable Langfuse, which provides a `cache_hit` tag showing which conversations used the cache.
+
+### 2. Create a config.yaml for the LiteLLM proxy
+LiteLLM requires a configuration file in addition to the override file. The file below enables proxying to various providers,
+load balancing, Redis caching, and Langfuse monitoring. Save it as `./litellm/litellm-config.yaml` (the path mounted in the override file) and review the LiteLLM documentation for other configuration options.
 More information on LiteLLM configurations here: **[docs.litellm.ai/docs/simple_proxy](https://docs.litellm.ai/docs/simple_proxy)**
 
 ```yaml
@@ -40,42 +46,29 @@ model_list:
       api_base: https://openai-france-1234.openai.azure.com/
       api_key: 
       rpm: 1440
+  - model_name: mixtral
+    litellm_params:
+      model: ollama/mixtral:8x7b-instruct-v0.1-q5_K_M
+      api_base: http://ollama:11434
+      stream: True
+  - model_name: mistral
+    litellm_params:
+      model: ollama/mistral
+      api_base: http://ollama:11434
+      stream: True
+litellm_settings:
+  success_callback: ["langfuse"]
+  cache: True
+  cache_params:
+    type: "redis"
+    supported_call_types: ["acompletion", "completion", "embedding", "aembedding"]
+general_settings:
+  master_key: sk_live_SetToRandomValue
 ```
 
-### Start the proxy
-```shell
-litellm --config /path/to/config.yaml
+### 3. 
Configure LibreChat -#INFO: Proxy running on http://0.0.0.0:8000 -``` - -## Use LiteLLM Proxy Server with LibreChat - - -#### 1. Clone the repo -```shell -git clone https://github.com/danny-avila/LibreChat.git -``` - - -#### 2. Modify Librechat's `docker-compose.yml` -```yaml -OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions -``` - -**Important**: As of v0.6.6, it's recommend you use the `librechat.yaml` [Configuration file (guide here)](./custom_config.md) to add Reverse Proxies as separate endpoints. - -#### 3. Save fake OpenAI key in Librechat's `.env` - -Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key). -```env -OPENAI_API_KEY=sk-1234 -``` - -#### 4. Run LibreChat: -```shell -docker compose up -``` +Use `librechat.yaml` [Configuration file (guide here)](./ai_endpoints.md) to add Reverse Proxies as separate endpoints. --- @@ -102,162 +95,4 @@ Key components and features include: - **Deployment and Performance**: Information on deploying LiteLLM Proxy and its performance metrics. - **Proxy CLI Arguments**: A wide range of command-line arguments for customization. -Overall, LiteLLM Server offers a comprehensive suite of tools for managing, deploying, and interacting with a variety of LLMs, making it a versatile choice for large-scale AI applications. - -## Ollama -Use [Ollama](https://ollama.ai/) for - -* Run large language models on local hardware -* Host multiple models -* Dynamically load the model upon request - -### GPU Acceleration - -- **Linux**: Requires a Linux distrubution support by official Nvidia drivers. [Nvidia CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux) -- **Windows**: Requires Windows Subsytem for Linux. Follow Nvidia instructions at [Nvidia WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html) -- **macOS**: [macOS Ollama Download](https://ollama.ai/download/mac) - -### docker-compose.override.yml with GPU -```yaml -version: "3.8" -services: - litellm: - image: ghcr.io/berriai/litellm:main-latest - volumes: - - ./litellm/litellm-config.yaml:/app/config.yaml - command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] - ollama: - image: ollama/ollama - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: [compute, utility] - ports: - - "11434:11434" - volumes: - - ./ollama:/root/.ollama - -``` - -### Loading Models in Ollama -1. Browse the available models at [Ollama Library](https://ollama.ai/library) -2. Run ```docker exec -it ollama /bin/bash``` -3. Copy the text from the Tags tab from the library website. It should begin with 'ollama run' -4. Check model size. Models that can run in GPU memory perform the best. -5. Use /bye to exit the terminal - -### Litellm Ollama Configuration -Add the below lines to the config to access the Ollama models -```yaml - - model_name: mixtral - litellm_params: - model: ollama/mixtral:8x7b-instruct-v0.1-q5_K_M - api_base: http://ollama:11434 - stream: True - - model_name: mistral - litellm_params: - model: ollama/mistral - api_base: http://ollama:11434 - stream: True -``` - -## Caching with Redis -Litellm supports in-memory, redis, and s3 caching. Note: Caching currently only works with exact matching. 
- -### Update docker-compose.override.yml to enable Redis -Add the below service to your docker-compose.override.yml -```yaml - redis: - image: redis:7-alpine - command: - - sh - - -c # this is to evaluate the $REDIS_PASSWORD from the env - - redis-server --appendonly yes --requirepass $$REDIS_PASSWORD ## $$ because of docker-compose - environment: - REDIS_PASSWORD: RedisChangeMe - volumes: - - ./redis:/data -``` - -Add the following to the environment variables in the litellm service inside the docker-compose.override.yml -```yaml - litellm: - image: ghcr.io/berriai/litellm:main-latest - volumes: - - ./litellm/litellm-config.yaml:/app/config.yaml - command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] - environment: - REDIS_HOST: redis - REDIS_PORT: 6379 - REDIS_PASSWORD: RedisChangeMe -``` - -### Update Litellm Config File -Add the below options to the litellm config file -```yaml -litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py - cache: True # set cache responses to True, litellm defaults to using a redis cache - cache_params: # cache_params are optional - type: "redis" # The type of cache to initialize. Can be "local" or "redis". Defaults to "local". - - # Optional configurations - supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types -``` - - -## Performance Monitoring with Langfuse -Litellm supports various logging and observability options. The settings below will enable Langfuse which will provide a cache_hit tag showing which conversations used cache. - -### Update docker-compose.override.yml to enable Langfuse -Langfuse requires a postgres database, so add both postgres and langfuse services to the docker-compose.override.yml -```yaml - langfuse-server: - image: ghcr.io/langfuse/langfuse:latest - depends_on: - - db - ports: - - "3000:3000" - environment: - - NODE_ENV=production - - DATABASE_URL=postgresql://postgres:PostgresChangeMe@db:5432/postgres - - NEXTAUTH_SECRET=ChangeMe - - SALT=ChangeMe - - NEXTAUTH_URL=http://localhost:3000 - - TELEMETRY_ENABLED=${TELEMETRY_ENABLED:-true} - - NEXT_PUBLIC_SIGN_UP_DISABLED=${NEXT_PUBLIC_SIGN_UP_DISABLED:-false} - - LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=${LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES:-false} - - db: - image: postgres - restart: always - environment: - - POSTGRES_USER=postgres - - POSTGRES_PASSWORD=PostgresChangeMe - - POSTGRES_DB=postgres - volumes: - - ./postgres:/var/lib/postgresql/data -``` - -Once Langfuse is running, create an account by accessing the web interface on port 3000. 
Create a new project to obtain the needed public and private key used by the litellm config
-Add environement variable within the litellm service within docker-compose.override.yml
-```yaml
-  litellm:
-    image: ghcr.io/berriai/litellm:main-latest
-    ports:
-      - "8000:8000"
-    volumes:
-      - /srv/litellm/config/litellm-config.yaml:/app/config.yaml
-    command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
-    environment:
-      LANGFUSE_PUBLIC_KEY: pk-lf-RandomStringFromLangfuseWebInterface
-      LANGFUSE_SECRET_KEY: sk-lf-RandomStringFromLangfuseWebInterface
-      LANGFUSE_HOST: http://langfuse-server:3000
-```
-
-### Update litellm config file
-```yaml
-litellm_settings:
-  success_callback: ["langfuse"]
-```
+Overall, LiteLLM Server offers a comprehensive suite of tools for managing, deploying, and interacting with a variety of LLMs, making it a versatile choice for large-scale AI applications.
\ No newline at end of file
diff --git a/docs/install/configuration/ollama.md b/docs/install/configuration/ollama.md
new file mode 100644
index 000000000..a18f1fde5
--- /dev/null
+++ b/docs/install/configuration/ollama.md
@@ -0,0 +1,29 @@
+---
+title: 🚅 Ollama
+description: Using LibreChat with Ollama
+weight: -6
+---
+## Ollama
+Use [Ollama](https://ollama.ai/) for:
+
+* Running large language models on local hardware
+* Hosting multiple models
+* Dynamically loading models upon request
+
+### 1. Install Ollama
+#### Mac, Linux, Windows Install
+Ollama supports GPU acceleration on Nvidia, AMD, and Apple Metal. Follow the instructions at [Ollama Download](https://ollama.com/download).
+
+#### Docker Install
+Reference `docker-compose.override.yml.example` for configuring Ollama in a Docker environment.
+
+Run ```docker exec -it ollama /bin/bash``` to access the `ollama` command inside the container.
+
+### 2. Load Models in Ollama
+1. Browse the available models at the [Ollama Library](https://ollama.ai/library)
+2. Copy the command shown on the model's Tags tab and paste it into the terminal. It should begin with `ollama run`
+3. Check the model size; models that run entirely in GPU memory perform best.
+4. Use `/bye` to exit the terminal
+
+### 3. Configure LibreChat
+Use the `librechat.yaml` [Configuration file (guide here)](./ai_endpoints.md) to add Ollama as a separate endpoint.
\ No newline at end of file
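
Both endpoints documented above speak the OpenAI-compatible API, so the setup can be sanity-checked with plain `curl` before pointing LibreChat at it. A minimal sketch, assuming the default ports from the override file and the example `master_key` from the LiteLLM config (replace keys and model names with your own values):

```shell
# List the models served by the LiteLLM proxy (port 8000);
# the bearer token is the master_key value from general_settings.
curl http://localhost:8000/v1/models \
  -H "Authorization: Bearer sk_live_SetToRandomValue"

# Send a chat completion through Ollama's OpenAI-compatible endpoint (port 11434);
# the model must already be downloaded, e.g. with `ollama run llama2`.
curl http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama2", "messages": [{"role": "user", "content": "Hello"}]}'
```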