💾 feat: Anthropic Prompt Caching (#3670)

* wip: initial cache control implementation, add typing for transactions handling * feat: first pass of Anthropic Prompt Caching * feat: standardize stream usage as pass in when calculating token counts * feat: Add getCacheMultiplier function to calculate cache multiplier for different valueKeys and cacheTypes * chore: imports order * refactor: token usage recording in AnthropicClient, no need to "correct" as we have the correct amount * feat: more accurate token counting using stream usage data * feat: Improve token counting accuracy with stream usage data * refactor: ensure more accurate than not token estimations if custom instructions or files are not being resent with every request * refactor: cleanup updateUserMessageTokenCount to allow transactions to be as accurate as possible even if we shouldn't update user message token counts * ci: fix tests
2025-09-22 06:00:56 +02:00 · 2024-08-17 03:24:09 -04:00 · 2024-08-17 03:24:09 -04:00 · a45b384bbc
commit a45b384bbc
parent 9f4c516615
17 changed files with 973 additions and 34 deletions
--- a/api/app/clients/AnthropicClient.js
+++ b/api/app/clients/AnthropicClient.js
@ -12,12 +12,13 @@ const { encodeAndFormat } = require('~/server/services/Files/images/encode');
 const {
  truncateText,
  formatMessage,
+  addCacheControl,
  titleFunctionPrompt,
  parseParamFromPrompt,
  createContextHandlers,
 } = require('./prompts');
-const spendTokens = require('~/models/spendTokens');
-const { getModelMaxTokens } = require('~/utils');
+const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
+const { getModelMaxTokens, matchModelName } = require('~/utils');
 const { sleep } = require('~/server/utils');
 const BaseClient = require('./BaseClient');
 const { logger } = require('~/config');
@ -32,6 +33,7 @@ function delayBeforeRetry(attempts, baseDelay = 1000) {
  return new Promise((resolve) => setTimeout(resolve, baseDelay * attempts));
 }

+const tokenEventTypes = new Set(['message_start', 'message_delta']);
 const { legacy } = anthropicSettings;

 class AnthropicClient extends BaseClient {
@ -44,6 +46,24 @@ class AnthropicClient extends BaseClient {
      ? options.contextStrategy.toLowerCase()
      : 'discard';
    this.setOptions(options);
+    /** @type {string | undefined} */
+    this.systemMessage;
+    /** @type {AnthropicMessageStartEvent| undefined} */
+    this.message_start;
+    /** @type {AnthropicMessageDeltaEvent| undefined} */
+    this.message_delta;
+    /** Whether the model is part of the Claude 3 Family
+     * @type {boolean} */
+    this.isClaude3;
+    /** Whether to use Messages API or Completions API
+     * @type {boolean} */
+    this.useMessages;
+    /** Whether or not the model is limited to the legacy amount of output tokens
+     * @type {boolean} */
+    this.isLegacyOutput;
+    /** Whether or not the model supports Prompt Caching
+     * @type {boolean} */
+    this.supportsCacheControl;
  }

  setOptions(options) {
@ -69,8 +89,10 @@ class AnthropicClient extends BaseClient {
      model: modelOptions.model || anthropicSettings.model.default,
    };

-    this.isClaude3 = this.modelOptions.model.includes('claude-3');
-    this.isLegacyOutput = !this.modelOptions.model.includes('claude-3-5-sonnet');
+    const modelMatch = matchModelName(this.modelOptions.model, EModelEndpoint.anthropic);
+    this.isClaude3 = modelMatch.startsWith('claude-3');
+    this.isLegacyOutput = !modelMatch.startsWith('claude-3-5-sonnet');
+    this.supportsCacheControl = this.checkPromptCacheSupport(modelMatch);

    if (
      this.isLegacyOutput &&
@ -147,19 +169,74 @@ class AnthropicClient extends BaseClient {
      options.baseURL = this.options.reverseProxyUrl;
    }

-    if (requestOptions?.model && requestOptions.model.includes('claude-3-5-sonnet')) {
+    if (
+      this.supportsCacheControl &&
+      requestOptions?.model &&
+      requestOptions.model.includes('claude-3-5-sonnet')
+    ) {
      options.defaultHeaders = {
-        'anthropic-beta': 'max-tokens-3-5-sonnet-2024-07-15',
+        'anthropic-beta': 'max-tokens-3-5-sonnet-2024-07-15,prompt-caching-2024-07-31',
+      };
+    } else if (this.supportsCacheControl) {
+      options.defaultHeaders = {
+        'anthropic-beta': 'prompt-caching-2024-07-31',
      };
    }

    return new Anthropic(options);
  }

-  getTokenCountForResponse(response) {
+  /**
+   * Merge the usage figures captured from this client's stream events into one object.
+   * Keys from the `message_delta` usage override same-named keys from the
+   * `message_start` usage; an event that was never captured contributes nothing.
+   * @returns {AnthropicStreamUsage} A new object combining both usage payloads.
+   */
+  getStreamUsage() {
+    const startUsage = this.message_start?.message?.usage ?? {};
+    const deltaUsage = this.message_delta?.usage ?? {};
+    return { ...startUsage, ...deltaUsage };
+  }
+
+  /**
+   * Calculates the correct token count for the current message based on the token count map and API usage.
+   * The API-reported input total (`input_tokens` plus cache creation and cache read tokens) minus the
+   * counts of every other message in the map is attributed to the current message.
+   * Edge case: If usage is missing, `usage.input_tokens` is not a number, or the calculation does not
+   * yield a positive value, it returns the original estimate from the map (or 0 when there is none).
+   * Non-numeric entries in the map are ignored. The map passed in is not modified.
+   * If revisiting a conversation with a chat history entirely composed of token estimates,
+   * the cumulative token count going forward should become more accurate as the conversation progresses.
+   * @param {Object} params - The parameters for the calculation.
+   * @param {Record<string, number>} params.tokenCountMap - A map of message IDs to their token counts.
+   * @param {string} params.currentMessageId - The ID of the current message to calculate.
+   * @param {AnthropicStreamUsage} params.usage - The usage object returned by the API.
+   * @returns {number} The correct token count for the current message.
+   */
+  calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage }) {
+    const originalEstimate = tokenCountMap[currentMessageId] || 0;
+
+    if (!usage || typeof usage.input_tokens !== 'number') {
+      return originalEstimate;
+    }
+
+    // Sum every entry except the current message's own, without writing to the caller's map.
+    const currentKey = String(currentMessageId);
+    let totalTokensFromMap = 0;
+    for (const [messageId, count] of Object.entries(tokenCountMap)) {
+      if (messageId === currentKey) {
+        continue;
+      }
+      const numCount = Number(count);
+      if (!Number.isNaN(numCount)) {
+        totalTokensFromMap += numCount;
+      }
+    }
+
+    // `input_tokens` is known to be a number here; the cache fields may be absent.
+    const totalInputTokens =
+      usage.input_tokens +
+      (usage.cache_creation_input_tokens ?? 0) +
+      (usage.cache_read_input_tokens ?? 0);
+
+    const currentMessageTokens = totalInputTokens - totalTokensFromMap;
+    return currentMessageTokens > 0 ? currentMessageTokens : originalEstimate;
+  }
+
+  /**
+   * Get Token Count for LibreChat Message
+   * Counts the response's `text` as the content of an `assistant`-role message
+   * by delegating to `getTokenCountForMessage`.
+   * @param {TMessage} responseMessage - The response message whose `text` is counted.
+   * @returns {number} The value returned by `getTokenCountForMessage`.
+   */
+  getTokenCountForResponse(responseMessage) {
    return this.getTokenCountForMessage({
      role: 'assistant',
-      content: response.text,
+      content: responseMessage.text,
    });
  }

@ -212,7 +289,38 @@ class AnthropicClient extends BaseClient {
    return files;
  }

-  async recordTokenUsage({ promptTokens, completionTokens, model, context = 'message' }) {
+  /**
+   * @param {object} params
+   * @param {number} params.promptTokens
+   * @param {number} params.completionTokens
+   * @param {AnthropicStreamUsage} [params.usage]
+   * @param {string} [params.model]
+   * @param {string} [params.context='message']
+   * @returns {Promise<void>}
+   */
+  async recordTokenUsage({ promptTokens, completionTokens, usage, model, context = 'message' }) {
+    if (usage != null && usage?.input_tokens != null) {
+      const input = usage.input_tokens ?? 0;
+      const write = usage.cache_creation_input_tokens ?? 0;
+      const read = usage.cache_read_input_tokens ?? 0;
+
+      await spendStructuredTokens(
+        {
+          context,
+          user: this.user,
+          conversationId: this.conversationId,
+          model: model ?? this.modelOptions.model,
+          endpointTokenConfig: this.options.endpointTokenConfig,
+        },
+        {
+          promptTokens: { input, write, read },
+          completionTokens,
+        },
+      );
+
+      return;
+    }
+
    await spendTokens(
      {
        context,
@ -560,6 +668,18 @@ class AnthropicClient extends BaseClient {
      : await client.completions.create(options);
  }

+  /**
+   * Reports whether prompt caching is available for the given model.
+   * The name is resolved through `matchModelName` for the Anthropic endpoint and is
+   * supported only when it resolves to `claude-3-5-sonnet` or `claude-3-haiku`.
+   * @param {string} modelName - The model name to check.
+   * @returns {boolean} `true` when the resolved model supports prompt caching.
+   */
+  checkPromptCacheSupport(modelName) {
+    const resolvedModel = matchModelName(modelName, EModelEndpoint.anthropic);
+    return resolvedModel === 'claude-3-5-sonnet' || resolvedModel === 'claude-3-haiku';
+  }
+
  async sendCompletion(payload, { onProgress, abortController }) {
    if (!abortController) {
      abortController = new AbortController();
@ -606,10 +726,22 @@ class AnthropicClient extends BaseClient {
      requestOptions.max_tokens_to_sample = maxOutputTokens || 1500;
    }

-    if (this.systemMessage) {
+    if (this.systemMessage && this.supportsCacheControl === true) {
+      requestOptions.system = [
+        {
+          type: 'text',
+          text: this.systemMessage,
+          cache_control: { type: 'ephemeral' },
+        },
+      ];
+    } else if (this.systemMessage) {
      requestOptions.system = this.systemMessage;
    }

+    if (this.supportsCacheControl === true && this.useMessages) {
+      requestOptions.messages = addCacheControl(requestOptions.messages);
+    }
+
    logger.debug('[AnthropicClient]', { ...requestOptions });

    const handleChunk = (currentChunk) => {
@ -639,6 +771,11 @@ class AnthropicClient extends BaseClient {

          for await (const completion of response) {
            // Handle each completion as before
+            const type = completion?.type ?? '';
+            if (tokenEventTypes.has(type)) {
+              logger.debug(`[AnthropicClient] ${type}`, completion);
+              this[type] = completion;
+            }
            if (completion?.delta?.text) {
              handleChunk(completion.delta.text);
            } else if (completion.completion) {
@ -727,6 +864,8 @@ class AnthropicClient extends BaseClient {
   */
  async titleConvo({ text, responseText = '' }) {
    let title = 'New Chat';
+    this.message_delta = undefined;
+    this.message_start = undefined;
    const convo = `<initial_message>
  ${truncateText(text)}
  </initial_message>
--- a/api/app/clients/BaseClient.js
+++ b/api/app/clients/BaseClient.js
@ -54,10 +54,22 @@ class BaseClient {
    throw new Error('Subclasses attempted to call summarizeMessages without implementing it');
  }

-  async getTokenCountForResponse(response) {
-    logger.debug('`[BaseClient] recordTokenUsage` not implemented.', response);
+  /**
+   * Abstract method to get the token count for a message. Subclasses must implement this method.
+   * This base implementation only writes a debug log naming this method and returns nothing.
+   * @param {TMessage} responseMessage
+   * @returns {number}
+   */
+  getTokenCountForResponse(responseMessage) {
+    logger.debug('`[BaseClient] getTokenCountForResponse` not implemented.', responseMessage);
  }

+  /**
+   * Abstract method to record token usage. Subclasses must implement this method.
+   * If a correction to the token usage is needed, the method should return an object with the corrected token counts.
+   * @param {number} promptTokens
+   * @param {number} completionTokens
+   * @returns {Promise<void>}
+   */
  async recordTokenUsage({ promptTokens, completionTokens }) {
    logger.debug('`[BaseClient] recordTokenUsage` not implemented.', {
      promptTokens,
@ -536,13 +548,31 @@ class BaseClient {
      this.getTokenCountForResponse &&
      this.getTokenCount
    ) {
-      responseMessage.tokenCount = this.getTokenCountForResponse(responseMessage);
-      const completionTokens = this.getTokenCount(completion);
-      await this.recordTokenUsage({ promptTokens, completionTokens });
+      let completionTokens;
+
+      /**
+       * Metadata about input/output costs for the current message. The client
+       * should provide a function to get the current stream usage metadata; if not,
+       * use the legacy token estimations.
+       * @type {StreamUsage | null} */
+      const usage = this.getStreamUsage != null ? this.getStreamUsage() : null;
+
+      if (usage != null && Number(usage.output_tokens) > 0) {
+        responseMessage.tokenCount = usage.output_tokens;
+        completionTokens = responseMessage.tokenCount;
+        await this.updateUserMessageTokenCount({ usage, tokenCountMap, userMessage, opts });
+      } else {
+        responseMessage.tokenCount = this.getTokenCountForResponse(responseMessage);
+        completionTokens = this.getTokenCount(completion);
+      }
+
+      await this.recordTokenUsage({ promptTokens, completionTokens, usage });
    }
+
    if (this.userMessagePromise) {
      await this.userMessagePromise;
    }
+
    this.responsePromise = this.saveMessageToDatabase(responseMessage, saveOptions, user);
    const messageCache = getLogStores(CacheKeys.MESSAGES);
    messageCache.set(
@ -557,6 +587,66 @@ class BaseClient {
    return responseMessage;
  }

+  /**
+   * Recalculates the user message's token count from stream usage and stores the new value.
+   *
+   * The recalculation only happens when all of the following hold:
+   * - the client provides a `calculateCurrentTokenCount` function,
+   * - the stream usage reports input tokens greater than 0,
+   * - files are being resent with every message (`resendFiles`), or there are no attachments,
+   * - the `promptPrefix` (custom instructions) is not set.
+   *
+   * Otherwise the existing count is kept, as the legacy token estimation would be more accurate.
+   * Nothing is written either when the recalculated count equals the message's current count.
+   *
+   * TODO: include system messages in the `orderedMessages` accounting, potentially as a
+   * separate message in the UI. ChatGPT does this through "hidden" system messages.
+   * @param {object} params
+   * @param {StreamUsage} params.usage
+   * @param {Record<string, number>} params.tokenCountMap
+   * @param {TMessage} params.userMessage
+   * @param {object} params.opts
+   * @returns {Promise<void>}
+   */
+  async updateUserMessageTokenCount({ usage, tokenCountMap, userMessage, opts }) {
+    if (this.calculateCurrentTokenCount == null) {
+      return;
+    }
+
+    if (!(Number(usage.input_tokens) > 0)) {
+      return;
+    }
+
+    const clientOptions = this.options;
+    const filesAccountedFor = clientOptions.resendFiles || !clientOptions.attachments?.length;
+    if (!filesAccountedFor || clientOptions.promptPrefix) {
+      return;
+    }
+
+    const recalculatedCount = this.calculateCurrentTokenCount({
+      currentMessageId: userMessage.messageId,
+      tokenCountMap,
+      usage,
+    });
+
+    if (recalculatedCount === userMessage.tokenCount) {
+      return;
+    }
+
+    userMessage.tokenCount = recalculatedCount;
+
+    // `AskController` saves the user message, so hand it the updated `userMessage` reference.
+    if (typeof opts?.getReqData === 'function') {
+      opts.getReqData({ userMessage });
+    }
+
+    // `EditController` does not save the user message, so the calculated count is also written
+    // to the database here, once `userMessagePromise` has settled.
+    await this.userMessagePromise;
+    await this.updateMessageInDatabase({
+      messageId: userMessage.messageId,
+      tokenCount: recalculatedCount,
+    });
+  }
+
  async loadHistory(conversationId, parentMessageId = null) {
    logger.debug('[BaseClient] Loading history:', { conversationId, parentMessageId });

@ -644,6 +734,10 @@ class BaseClient {
    return { message: savedMessage, conversation };
  }

+  /**
+   * Update a message in the database.
+   * Delegates to `updateMessage`, passing the request object stored at `this.options.req`.
+   * @param {Partial<TMessage>} message - The message fields handed to `updateMessage`.
+   * @returns {Promise<void>}
+   */
  async updateMessageInDatabase(message) {
    await updateMessage(this.options.req, message);
  }
--- a/api/app/clients/OpenAIClient.js
+++ b/api/app/clients/OpenAIClient.js
@ -27,9 +27,9 @@ const {
  createContextHandlers,
 } = require('./prompts');
 const { encodeAndFormat } = require('~/server/services/Files/images/encode');
+const { spendTokens } = require('~/models/spendTokens');
 const { isEnabled, sleep } = require('~/server/utils');
 const { handleOpenAIErrors } = require('./tools/util');
-const spendTokens = require('~/models/spendTokens');
 const { createLLM, RunManager } = require('./llm');
 const ChatGPTClient = require('./ChatGPTClient');
 const { summaryBuffer } = require('./memory');
--- a/api/app/clients/llm/RunManager.js
+++ b/api/app/clients/llm/RunManager.js
@ -1,5 +1,5 @@
 const { createStartHandler } = require('~/app/clients/callbacks');
-const spendTokens = require('~/models/spendTokens');
+const { spendTokens } = require('~/models/spendTokens');
 const { logger } = require('~/config');

 class RunManager {
--- a/api/app/clients/prompts/addCacheControl.js
+++ b/api/app/clients/prompts/addCacheControl.js
@ -0,0 +1,43 @@
+/**
+ * Anthropic API: marks the two most recent user messages in the payload as cacheable.
+ * String content is wrapped into a single text block carrying an ephemeral `cache_control`;
+ * array content gets the ephemeral `cache_control` added to every one of its items.
+ * Input that is not an array, or has fewer than two entries, is returned unchanged.
+ * Otherwise a shallow copy is returned and the original message objects are not altered.
+ * @param {Array<AnthropicMessage>} messages - The array of message objects.
+ * @returns {Array<AnthropicMessage>} - The updated array of message objects with cache control added.
+ */
+function addCacheControl(messages) {
+  if (!Array.isArray(messages) || messages.length < 2) {
+    return messages;
+  }
+
+  /** Builds a fresh marker so no two content items share the same object. */
+  const ephemeral = () => ({ type: 'ephemeral' });
+
+  const result = [...messages];
+  let remaining = 2;
+
+  // Walk backwards so the most recent user messages are the ones that get marked.
+  for (let index = result.length - 1; index >= 0 && remaining > 0; index--) {
+    const message = result[index];
+    if (message.role !== 'user') {
+      continue;
+    }
+
+    const { content } = message;
+    if (typeof content === 'string') {
+      result[index] = {
+        ...message,
+        content: [{ type: 'text', text: content, cache_control: ephemeral() }],
+      };
+    } else if (Array.isArray(content)) {
+      result[index] = {
+        ...message,
+        content: content.map((part) => ({ ...part, cache_control: ephemeral() })),
+      };
+    }
+
+    // A user message counts toward the limit even when its content was left as-is.
+    remaining--;
+  }
+
+  return result;
+}
+
+module.exports = addCacheControl;
--- a/api/app/clients/prompts/addCacheControl.spec.js
+++ b/api/app/clients/prompts/addCacheControl.spec.js
@ -0,0 +1,164 @@
+const addCacheControl = require('./addCacheControl');
+
+describe('addCacheControl', () => {
+  test('should add cache control to the last two user messages with array content', () => {
+    const messages = [
+      { role: 'user', content: [{ type: 'text', text: 'Hello' }] },
+      { role: 'assistant', content: [{ type: 'text', text: 'Hi there' }] },
+      { role: 'user', content: [{ type: 'text', text: 'How are you?' }] },
+      { role: 'assistant', content: [{ type: 'text', text: 'I\'m doing well, thanks!' }] },
+      { role: 'user', content: [{ type: 'text', text: 'Great!' }] },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result[0].content[0]).not.toHaveProperty('cache_control');
+    expect(result[2].content[0].cache_control).toEqual({ type: 'ephemeral' });
+    expect(result[4].content[0].cache_control).toEqual({ type: 'ephemeral' });
+  });
+
+  test('should add cache control to the last two user messages with string content', () => {
+    const messages = [
+      { role: 'user', content: 'Hello' },
+      { role: 'assistant', content: 'Hi there' },
+      { role: 'user', content: 'How are you?' },
+      { role: 'assistant', content: 'I\'m doing well, thanks!' },
+      { role: 'user', content: 'Great!' },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result[0].content).toBe('Hello');
+    expect(result[2].content[0]).toEqual({
+      type: 'text',
+      text: 'How are you?',
+      cache_control: { type: 'ephemeral' },
+    });
+    expect(result[4].content[0]).toEqual({
+      type: 'text',
+      text: 'Great!',
+      cache_control: { type: 'ephemeral' },
+    });
+  });
+
+  test('should handle mixed string and array content', () => {
+    const messages = [
+      { role: 'user', content: 'Hello' },
+      { role: 'assistant', content: 'Hi there' },
+      { role: 'user', content: [{ type: 'text', text: 'How are you?' }] },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result[0].content[0]).toEqual({
+      type: 'text',
+      text: 'Hello',
+      cache_control: { type: 'ephemeral' },
+    });
+    expect(result[2].content[0].cache_control).toEqual({ type: 'ephemeral' });
+  });
+
+  test('should handle less than two user messages', () => {
+    const messages = [
+      { role: 'user', content: 'Hello' },
+      { role: 'assistant', content: 'Hi there' },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result[0].content[0]).toEqual({
+      type: 'text',
+      text: 'Hello',
+      cache_control: { type: 'ephemeral' },
+    });
+    expect(result[1].content).toBe('Hi there');
+  });
+
+  test('should return original array if no user messages', () => {
+    const messages = [
+      { role: 'assistant', content: 'Hi there' },
+      { role: 'assistant', content: 'How can I help?' },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result).toEqual(messages);
+  });
+
+  test('should handle empty array', () => {
+    const messages = [];
+    const result = addCacheControl(messages);
+    expect(result).toEqual([]);
+  });
+
+  test('should handle non-array input', () => {
+    const messages = 'not an array';
+    const result = addCacheControl(messages);
+    expect(result).toBe('not an array');
+  });
+
+  test('should not modify assistant messages', () => {
+    const messages = [
+      { role: 'user', content: 'Hello' },
+      { role: 'assistant', content: 'Hi there' },
+      { role: 'user', content: 'How are you?' },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result[1].content).toBe('Hi there');
+  });
+
+  test('should handle multiple content items in user messages', () => {
+    const messages = [
+      {
+        role: 'user',
+        content: [
+          { type: 'text', text: 'Hello' },
+          { type: 'image', url: 'http://example.com/image.jpg' },
+        ],
+      },
+      { role: 'assistant', content: 'Hi there' },
+      { role: 'user', content: 'How are you?' },
+    ];
+
+    const result = addCacheControl(messages);
+
+    expect(result[0].content[0].cache_control).toEqual({ type: 'ephemeral' });
+    expect(result[0].content[1].cache_control).toEqual({ type: 'ephemeral' });
+    expect(result[2].content[0]).toEqual({
+      type: 'text',
+      text: 'How are you?',
+      cache_control: { type: 'ephemeral' },
+    });
+  });
+
+  test('should handle an array with mixed content types', () => {
+    const messages = [
+      { role: 'user', content: 'Hello' },
+      { role: 'assistant', content: 'Hi there' },
+      { role: 'user', content: [{ type: 'text', text: 'How are you?' }] },
+      { role: 'assistant', content: 'I\'m doing well, thanks!' },
+      { role: 'user', content: 'Great!' },
+    ];
+
+    // Only the last two user messages (indices 2 and 4) are expected to be marked.
+    const result = addCacheControl(messages);
+
+    expect(result[0].content).toEqual('Hello');
+    expect(result[2].content[0]).toEqual({
+      type: 'text',
+      text: 'How are you?',
+      cache_control: { type: 'ephemeral' },
+    });
+    expect(result[4].content).toEqual([
+      {
+        type: 'text',
+        text: 'Great!',
+        cache_control: { type: 'ephemeral' },
+      },
+    ]);
+    expect(result[1].content).toBe('Hi there');
+    expect(result[3].content).toBe('I\'m doing well, thanks!');
+  });
+});
--- a/api/app/clients/prompts/index.js
+++ b/api/app/clients/prompts/index.js
@ -1,3 +1,4 @@
+const addCacheControl = require('./addCacheControl');
 const formatMessages = require('./formatMessages');
 const summaryPrompts = require('./summaryPrompts');
 const handleInputs = require('./handleInputs');
@ -8,6 +9,7 @@ const createVisionPrompt = require('./createVisionPrompt');
 const createContextHandlers = require('./createContextHandlers');

 module.exports = {
+  addCacheControl,
  ...formatMessages,
  ...summaryPrompts,
  ...handleInputs,
--- a/api/app/clients/specs/AnthropicClient.test.js
+++ b/api/app/clients/specs/AnthropicClient.test.js
@ -211,7 +211,21 @@ describe('AnthropicClient', () => {
      expect(anthropicClient._options.defaultHeaders).toBeDefined();
      expect(anthropicClient._options.defaultHeaders).toHaveProperty('anthropic-beta');
      expect(anthropicClient._options.defaultHeaders['anthropic-beta']).toBe(
-        'max-tokens-3-5-sonnet-2024-07-15',
+        'max-tokens-3-5-sonnet-2024-07-15,prompt-caching-2024-07-31',
+      );
+    });
+
+    it('should add beta header for claude-3-haiku model', () => {
+      const client = new AnthropicClient('test-api-key');
+      const modelOptions = {
+        model: 'claude-3-haiku-2028',
+      };
+      client.setOptions({ modelOptions });
+      const anthropicClient = client.getClient(modelOptions);
+      expect(anthropicClient._options.defaultHeaders).toBeDefined();
+      expect(anthropicClient._options.defaultHeaders).toHaveProperty('anthropic-beta');
+      expect(anthropicClient._options.defaultHeaders['anthropic-beta']).toBe(
+        'prompt-caching-2024-07-31',
      );
    });

@ -226,4 +240,145 @@ describe('AnthropicClient', () => {
      expect(anthropicClient.defaultHeaders).not.toHaveProperty('anthropic-beta');
    });
  });
+
+  describe('calculateCurrentTokenCount', () => {
+    let client;
+
+    beforeEach(() => {
+      client = new AnthropicClient('test-api-key');
+    });
+
+    it('should calculate correct token count when usage is provided', () => {
+      const tokenCountMap = {
+        msg1: 10,
+        msg2: 20,
+        currentMsg: 30,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        input_tokens: 70,
+        output_tokens: 50,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(40); // 70 - (10 + 20) = 40
+    });
+
+    it('should return original estimate if calculation results in negative value', () => {
+      const tokenCountMap = {
+        msg1: 40,
+        msg2: 50,
+        currentMsg: 30,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        input_tokens: 80,
+        output_tokens: 50,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(30); // Original estimate
+    });
+
+    it('should handle cache creation and read input tokens', () => {
+      const tokenCountMap = {
+        msg1: 10,
+        msg2: 20,
+        currentMsg: 30,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        input_tokens: 50,
+        cache_creation_input_tokens: 10,
+        cache_read_input_tokens: 20,
+        output_tokens: 40,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(50); // (50 + 10 + 20) - (10 + 20) = 50
+    });
+
+    it('should handle missing usage properties', () => {
+      const tokenCountMap = {
+        msg1: 10,
+        msg2: 20,
+        currentMsg: 30,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        output_tokens: 40,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(30); // Original estimate
+    });
+
+    it('should handle empty tokenCountMap', () => {
+      const tokenCountMap = {};
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        input_tokens: 50,
+        output_tokens: 40,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(50);
+      expect(Number.isNaN(result)).toBe(false);
+    });
+
+    it('should handle zero values in usage', () => {
+      const tokenCountMap = {
+        msg1: 10,
+        currentMsg: 20,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        input_tokens: 0,
+        cache_creation_input_tokens: 0,
+        cache_read_input_tokens: 0,
+        output_tokens: 0,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(20); // Should return original estimate
+      expect(Number.isNaN(result)).toBe(false);
+    });
+
+    it('should handle undefined usage', () => {
+      const tokenCountMap = {
+        msg1: 10,
+        currentMsg: 20,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = undefined;
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(20); // Should return original estimate
+      expect(Number.isNaN(result)).toBe(false);
+    });
+
+    it('should handle non-numeric values in tokenCountMap', () => {
+      const tokenCountMap = {
+        msg1: 'ten',
+        currentMsg: 20,
+      };
+      const currentMessageId = 'currentMsg';
+      const usage = {
+        input_tokens: 30,
+        output_tokens: 10,
+      };
+
+      const result = client.calculateCurrentTokenCount({ tokenCountMap, currentMessageId, usage });
+
+      expect(result).toBe(30); // Should return 30 (input_tokens) - 0 (ignored 'ten') = 30
+      expect(Number.isNaN(result)).toBe(false);
+    });
+  });
 });
--- a/api/models/Message.js
+++ b/api/models/Message.js
@ -212,8 +212,8 @@ async function updateMessageText(req, { messageId, text }) {
 *
 * @async
 * @function updateMessage
- * @param {Object} message - The message object containing update data.
 * @param {Object} req - The request object.
+ * @param {Object} message - The message object containing update data.
 * @param {string} message.messageId - The unique identifier for the message.
 * @param {string} [message.text] - The new text content of the message.
 * @param {Object[]} [message.files] - The files associated with the message.
--- a/api/models/Transaction.js
+++ b/api/models/Transaction.js
@ -1,12 +1,12 @@
 const mongoose = require('mongoose');
 const { isEnabled } = require('../server/utils/handleText');
 const transactionSchema = require('./schema/transaction');
-const { getMultiplier } = require('./tx');
+const { getMultiplier, getCacheMultiplier } = require('./tx');
 const { logger } = require('~/config');
 const Balance = require('./Balance');
 const cancelRate = 1.15;

-// Method to calculate and set the tokenValue for a transaction
+/** Method to calculate and set the tokenValue for a transaction */
 transactionSchema.methods.calculateTokenValue = function () {
  if (!this.valueKey || !this.tokenType) {
    this.tokenValue = this.rawAmount;
@ -21,15 +21,17 @@ transactionSchema.methods.calculateTokenValue = function () {
  }
 };

-// Static method to create a transaction and update the balance
-transactionSchema.statics.create = async function (transactionData) {
+/**
+ * Static method to create a transaction and update the balance
+ * @param {txData} txData - Transaction data.
+ */
+transactionSchema.statics.create = async function (txData) {
  const Transaction = this;

-  const transaction = new Transaction(transactionData);
-  transaction.endpointTokenConfig = transactionData.endpointTokenConfig;
+  const transaction = new Transaction(txData);
+  transaction.endpointTokenConfig = txData.endpointTokenConfig;
  transaction.calculateTokenValue();

-  // Save the transaction
  await transaction.save();

  if (!isEnabled(process.env.CHECK_BALANCE)) {
@ -57,6 +59,104 @@ transactionSchema.statics.create = async function (transactionData) {
  };
 };

+/**
+ * Static method to create a structured (input / cache-write / cache-read) transaction
+ * and update the user's balance.
+ *
+ * The transaction is always saved. When `CHECK_BALANCE` is not enabled, the saved
+ * transaction document is returned as-is; otherwise the balance is incremented by the
+ * transaction's `tokenValue` and a summary object is returned instead.
+ *
+ * @param {txData} txData - Transaction data.
+ * @returns {Promise<Object>} The saved transaction document (balance checking disabled), or
+ *   `{ rate, user, balance, [tokenType]: appliedAmount }` (balance checking enabled).
+ */
+transactionSchema.statics.createStructured = async function (txData) {
+  const Transaction = this;
+
+  const transaction = new Transaction(txData);
+  // Attach the token config as a plain property after construction, exactly as `create` does,
+  // so `calculateStructuredTokenValue` can read it from `this` even when it is not a schema
+  // path (values passed only through the constructor may be dropped in that case).
+  transaction.endpointTokenConfig = txData.endpointTokenConfig;
+
+  transaction.calculateStructuredTokenValue();
+
+  await transaction.save();
+
+  if (!isEnabled(process.env.CHECK_BALANCE)) {
+    return transaction;
+  }
+
+  const currentBalance = await Balance.findOne({ user: transaction.user }).lean();
+  let incrementValue = transaction.tokenValue;
+
+  // Cap the change so that the stored balance is never pushed below zero.
+  if (currentBalance && currentBalance.tokenCredits + incrementValue < 0) {
+    incrementValue = -currentBalance.tokenCredits;
+  }
+
+  const updatedBalance = await Balance.findOneAndUpdate(
+    { user: transaction.user },
+    { $inc: { tokenCredits: incrementValue } },
+    { upsert: true, new: true },
+  ).lean();
+
+  return {
+    rate: transaction.rate,
+    user: transaction.user.toString(),
+    balance: updatedBalance.tokenCredits,
+    [transaction.tokenType]: incrementValue,
+  };
+};
+
+/**
+ * Method to calculate token value for structured tokens.
+ *
+ * Sets `this.rate` and `this.tokenValue` (and `this.rateDetail` for prompt transactions).
+ * - No `tokenType`: `tokenValue` is simply `rawAmount`.
+ * - `prompt`: input, cache-write and cache-read tokens are each priced with their own
+ *   multiplier; cache multipliers fall back to the input multiplier when none is found.
+ *   `rate` is the token-weighted average of the three multipliers.
+ * - Any other `tokenType`: `tokenValue` is `rawAmount * multiplier`.
+ *
+ * A spend is recorded as a negative `tokenValue`, so that incrementing the balance by it
+ * debits the user.
+ */
+transactionSchema.methods.calculateStructuredTokenValue = function () {
+  if (!this.tokenType) {
+    this.tokenValue = this.rawAmount;
+    return;
+  }
+
+  const { model, endpointTokenConfig } = this;
+
+  if (this.tokenType === 'prompt') {
+    const inputMultiplier = getMultiplier({ tokenType: 'prompt', model, endpointTokenConfig });
+    const writeMultiplier =
+      getCacheMultiplier({ cacheType: 'write', model, endpointTokenConfig }) ?? inputMultiplier;
+    const readMultiplier =
+      getCacheMultiplier({ cacheType: 'read', model, endpointTokenConfig }) ?? inputMultiplier;
+
+    this.rateDetail = {
+      input: inputMultiplier,
+      write: writeMultiplier,
+      read: readMultiplier,
+    };
+
+    // Counts may be unset (treated as 0) or already negated by a caller; price by magnitude.
+    const inputTokens = Math.abs(this.inputTokens || 0);
+    const writeTokens = Math.abs(this.writeTokens || 0);
+    const readTokens = Math.abs(this.readTokens || 0);
+    const totalTokens = inputTokens + writeTokens + readTokens;
+
+    const weightedCost =
+      inputMultiplier * inputTokens + writeMultiplier * writeTokens + readMultiplier * readTokens;
+
+    // Default to the input rate if there are no tokens to average over.
+    this.rate = totalTokens > 0 ? weightedCost / totalTokens : inputMultiplier;
+
+    // Prompt usage is a spend: store it as a negative value, matching the sign produced by
+    // `rawAmount * multiplier` below (callers pass a negative `rawAmount` for spends).
+    // A positive value here would credit the balance instead of debiting it.
+    this.tokenValue = weightedCost === 0 ? 0 : -weightedCost;
+  } else {
+    const multiplier = Math.abs(
+      getMultiplier({ tokenType: this.tokenType, model, endpointTokenConfig }),
+    );
+    this.rate = multiplier;
+    this.tokenValue = this.rawAmount * multiplier;
+  }
+
+  // Incomplete (cancelled) completions are charged at a premium.
+  if (this.tokenType === 'completion' && this.context === 'incomplete') {
+    this.tokenValue = Math.ceil(this.tokenValue * cancelRate);
+    this.rate *= cancelRate;
+    if (this.rateDetail) {
+      this.rateDetail = Object.fromEntries(
+        Object.entries(this.rateDetail).map(([k, v]) => [k, v * cancelRate]),
+      );
+    }
+  }
+};
+
 const Transaction = mongoose.model('Transaction', transactionSchema);

 /**
--- a/api/models/schema/transaction.js
+++ b/api/models/schema/transaction.js
@ -30,6 +30,9 @@ const transactionSchema = mongoose.Schema(
    rate: Number,
    rawAmount: Number,
    tokenValue: Number,
+    inputTokens: { type: Number },
+    writeTokens: { type: Number },
+    readTokens: { type: Number },
  },
  {
    timestamps: true,
--- a/api/models/spendTokens.js
+++ b/api/models/spendTokens.js
@ -11,7 +11,7 @@ const { logger } = require('~/config');
 * @param {String} txData.conversationId - The ID of the conversation.
 * @param {String} txData.model - The model name.
 * @param {String} txData.context - The context in which the transaction is made.
- * @param {String} [txData.endpointTokenConfig] - The current endpoint token config.
+ * @param {EndpointTokenConfig} [txData.endpointTokenConfig] - The current endpoint token config.
 * @param {String} [txData.valueKey] - The value key (optional).
 * @param {Object} tokenUsage - The number of tokens used.
 * @param {Number} tokenUsage.promptTokens - The number of prompt tokens used.
@ -66,4 +66,74 @@ const spendTokens = async (txData, tokenUsage) => {
  }
 };

-module.exports = spendTokens;
+/**
+ * Creates transactions to record the spending of structured tokens.
+ *
+ * Prompt usage is recorded through `Transaction.createStructured`, with the input,
+ * cache-write and cache-read counts kept separate; completion usage is recorded through
+ * `Transaction.create`. Raw amounts are negated because they represent a spend.
+ *
+ * @function
+ * @async
+ * @param {Object} txData - Transaction data.
+ * @param {mongoose.Schema.Types.ObjectId} txData.user - The user ID.
+ * @param {String} txData.conversationId - The ID of the conversation.
+ * @param {String} txData.model - The model name.
+ * @param {String} txData.context - The context in which the transaction is made.
+ * @param {EndpointTokenConfig} [txData.endpointTokenConfig] - The current endpoint token config.
+ * @param {String} [txData.valueKey] - The value key (optional).
+ * @param {Object} tokenUsage - The number of tokens used.
+ * @param {Object} tokenUsage.promptTokens - The number of prompt tokens used.
+ * @param {Number} tokenUsage.promptTokens.input - The number of input tokens.
+ * @param {Number} tokenUsage.promptTokens.write - The number of write tokens.
+ * @param {Number} tokenUsage.promptTokens.read - The number of read tokens.
+ * @param {Number} tokenUsage.completionTokens - The number of completion tokens used.
+ * @returns {Promise<void>} - Returns nothing. Errors raised while creating the transactions
+ *   are caught and logged here; they are not rethrown to the caller.
+ */
+const spendStructuredTokens = async (txData, tokenUsage) => {
+  const { promptTokens, completionTokens } = tokenUsage;
+  logger.debug(
+    `[spendStructuredTokens] conversationId: ${txData.conversationId}${
+      txData?.context ? ` | Context: ${txData?.context}` : ''
+    } | Token usage: `,
+    {
+      promptTokens,
+      completionTokens,
+    },
+  );
+  let prompt, completion;
+  try {
+    if (promptTokens) {
+      const { input = 0, write = 0, read = 0 } = promptTokens;
+      const promptAmount = input + write + read;
+      prompt = await Transaction.createStructured({
+        ...txData,
+        tokenType: 'prompt',
+        rawAmount: -promptAmount,
+        inputTokens: input,
+        writeTokens: write,
+        readTokens: read,
+      });
+    }
+
+    if (completionTokens) {
+      completion = await Transaction.create({
+        ...txData,
+        tokenType: 'completion',
+        rawAmount: -completionTokens,
+      });
+    }
+
+    if (prompt && completion) {
+      // `createStructured` resolves to either the saved transaction (exposes `tokenValue`) or,
+      // when balance checking is enabled, a summary keyed by token type
+      // (`{ rate, user, balance, prompt }`). Read whichever is present so the log is never
+      // `undefined`. NOTE(review): `create` is assumed to return the same two shapes - confirm.
+      logger.debug('[spendStructuredTokens] Transaction data record against balance:', {
+        user: txData.user,
+        prompt: prompt.prompt ?? prompt.tokenValue,
+        promptRate: prompt.rate,
+        completion: completion.completion ?? completion.tokenValue,
+        completionRate: completion.rate,
+        balance: completion.balance,
+      });
+    }
+  } catch (err) {
+    logger.error('[spendStructuredTokens]', err);
+  }
+};
+
+module.exports = { spendTokens, spendStructuredTokens };
--- a/api/models/tx.js
+++ b/api/models/tx.js
@ -70,6 +70,17 @@ const tokenValues = Object.assign(
  bedrockValues,
 );

+/**
+ * Mapping of model value keys to their prompt-caching multipliers: `write` is the rate for
+ * cache-creation (write) input tokens and `read` is the rate for cache-read input tokens.
+ * See Anthropic's documentation on this: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#pricing
+ * The rates are expressed in USD per 1M tokens.
+ * @type {Object.<string, {write: number, read: number }>}
+ */
+const cacheTokenValues = {
+  'claude-3-5-sonnet': { write: 3.75, read: 0.3 },
+  'claude-3-haiku': { write: 0.3, read: 0.03 },
+};
+
 /**
 * Retrieves the key associated with a given model name.
 *
@ -122,7 +133,7 @@ const getValueKey = (model, endpoint) => {
 *
 * @param {Object} params - The parameters for the function.
 * @param {string} [params.valueKey] - The key corresponding to the model name.
- * @param {string} [params.tokenType] - The type of token (e.g., 'prompt' or 'completion').
+ * @param {'prompt' | 'completion'} [params.tokenType] - The type of token (e.g., 'prompt' or 'completion').
 * @param {string} [params.model] - The model name to derive the value key from if not provided.
 * @param {string} [params.endpoint] - The endpoint name to derive the value key from if not provided.
 * @param {EndpointTokenConfig} [params.endpointTokenConfig] - The token configuration for the endpoint.
@ -147,7 +158,41 @@ const getMultiplier = ({ valueKey, tokenType, model, endpoint, endpointTokenConf
  }

  // If we got this far, and values[tokenType] is undefined somehow, return a rough average of default multipliers
-  return tokenValues[valueKey][tokenType] ?? defaultRate;
+  return tokenValues[valueKey]?.[tokenType] ?? defaultRate;
 };

-module.exports = { tokenValues, getValueKey, getMultiplier, defaultRate };
+/**
+ * Looks up the prompt-caching multiplier for a cache operation.
+ *
+ * Resolution order:
+ * 1. If an `endpointTokenConfig` is supplied, only it is consulted (keyed by `model`).
+ * 2. Otherwise the built-in cache rates are used, keyed by `valueKey`; when no `valueKey`
+ *    is given it is derived from `model` (and `endpoint`).
+ *
+ * Unlike `getMultiplier`, there is no default rate: `null` is returned whenever no cache
+ * rate can be resolved.
+ *
+ * @param {Object} params - The parameters for the function.
+ * @param {string} [params.valueKey] - The key corresponding to the model name.
+ * @param {'write' | 'read'} [params.cacheType] - The type of cache operation.
+ * @param {string} [params.model] - The model name to derive the value key from if not provided.
+ * @param {string} [params.endpoint] - The endpoint name to derive the value key from if not provided.
+ * @param {EndpointTokenConfig} [params.endpointTokenConfig] - The token configuration for the endpoint.
+ * @returns {number | null} The multiplier for the given parameters, or `null` if not found.
+ */
+const getCacheMultiplier = ({ valueKey, cacheType, model, endpoint, endpointTokenConfig }) => {
+  if (endpointTokenConfig) {
+    const customRates = endpointTokenConfig[model];
+    return customRates?.[cacheType] ?? null;
+  }
+
+  // Without a cache type there is nothing to look up, whichever key is available.
+  if (!cacheType) {
+    return null;
+  }
+
+  // Prefer an explicit value key; otherwise derive one from the model name, if there is one.
+  const resolvedKey = valueKey || (model ? getValueKey(model, endpoint) : undefined);
+  if (!resolvedKey) {
+    return null;
+  }
+
+  return cacheTokenValues[resolvedKey]?.[cacheType] ?? null;
+};
+
+module.exports = { tokenValues, getValueKey, getMultiplier, getCacheMultiplier, defaultRate };
--- a/api/models/tx.spec.js
+++ b/api/models/tx.spec.js
@ -1,4 +1,10 @@
-const { getValueKey, getMultiplier, defaultRate, tokenValues } = require('./tx');
+const {
+  defaultRate,
+  tokenValues,
+  getValueKey,
+  getMultiplier,
+  getCacheMultiplier,
+} = require('./tx');

 describe('getValueKey', () => {
  it('should return "16k" for model name containing "gpt-3.5-turbo-16k"', () => {
@ -243,3 +249,76 @@ describe('AWS Bedrock Model Tests', () => {
    expect(results.every(Boolean)).toBe(true);
  });
 });
+
+/**
+ * Tests for `getCacheMultiplier`: direct lookups by value key, derivation of the value key
+ * from a model name, `null` results when no rate can be resolved, and lookups through a
+ * custom `endpointTokenConfig`.
+ */
+describe('getCacheMultiplier', () => {
+  it('should return the correct cache multiplier for a given valueKey and cacheType', () => {
+    expect(getCacheMultiplier({ valueKey: 'claude-3-5-sonnet', cacheType: 'write' })).toBe(3.75);
+    expect(getCacheMultiplier({ valueKey: 'claude-3-5-sonnet', cacheType: 'read' })).toBe(0.3);
+    expect(getCacheMultiplier({ valueKey: 'claude-3-haiku', cacheType: 'write' })).toBe(0.3);
+    expect(getCacheMultiplier({ valueKey: 'claude-3-haiku', cacheType: 'read' })).toBe(0.03);
+  });
+
+  it('should return null if cacheType is provided but not found in cacheTokenValues', () => {
+    expect(
+      getCacheMultiplier({ valueKey: 'claude-3-5-sonnet', cacheType: 'unknownType' }),
+    ).toBeNull();
+  });
+
+  it('should derive the valueKey from the model if not provided', () => {
+    expect(getCacheMultiplier({ cacheType: 'write', model: 'claude-3-5-sonnet-20240620' })).toBe(
+      3.75,
+    );
+    expect(getCacheMultiplier({ cacheType: 'read', model: 'claude-3-haiku-20240307' })).toBe(0.03);
+  });
+
+  it('should return null if only model or cacheType is missing', () => {
+    expect(getCacheMultiplier({ cacheType: 'write' })).toBeNull();
+    expect(getCacheMultiplier({ model: 'claude-3-5-sonnet' })).toBeNull();
+  });
+
+  it('should return null if derived valueKey does not match any known patterns', () => {
+    expect(getCacheMultiplier({ cacheType: 'write', model: 'gpt-4-some-other-info' })).toBeNull();
+  });
+
+  it('should handle endpointTokenConfig if provided', () => {
+    // The config is keyed by model name, then by cache type.
+    const endpointTokenConfig = {
+      'custom-model': {
+        write: 5,
+        read: 1,
+      },
+    };
+    expect(
+      getCacheMultiplier({ model: 'custom-model', cacheType: 'write', endpointTokenConfig }),
+    ).toBe(5);
+    expect(
+      getCacheMultiplier({ model: 'custom-model', cacheType: 'read', endpointTokenConfig }),
+    ).toBe(1);
+  });
+
+  it('should return null if model is not found in endpointTokenConfig', () => {
+    const endpointTokenConfig = {
+      'custom-model': {
+        write: 5,
+        read: 1,
+      },
+    };
+    expect(
+      getCacheMultiplier({ model: 'unknown-model', cacheType: 'write', endpointTokenConfig }),
+    ).toBeNull();
+  });
+
+  it('should handle models with "bedrock/" prefix', () => {
+    expect(
+      getCacheMultiplier({
+        model: 'bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0',
+        cacheType: 'write',
+      }),
+    ).toBe(3.75);
+    expect(
+      getCacheMultiplier({
+        model: 'bedrock/anthropic.claude-3-haiku-20240307-v1:0',
+        cacheType: 'read',
+      }),
+    ).toBe(0.03);
+  });
+});
--- a/api/server/middleware/abortMiddleware.js
+++ b/api/server/middleware/abortMiddleware.js
@ -2,9 +2,9 @@ const { isAssistantsEndpoint } = require('librechat-data-provider');
 const { sendMessage, sendError, countTokens, isEnabled } = require('~/server/utils');
 const { truncateText, smartTruncateText } = require('~/app/clients/prompts');
 const clearPendingReq = require('~/cache/clearPendingReq');
+const { spendTokens } = require('~/models/spendTokens');
 const abortControllers = require('./abortControllers');
 const { saveMessage, getConvo } = require('~/models');
-const spendTokens = require('~/models/spendTokens');
 const { abortRun } = require('./abortRun');
 const { logger } = require('~/config');

--- a/api/server/services/Threads/manage.js
+++ b/api/server/services/Threads/manage.js
@ -8,8 +8,8 @@ const {
 } = require('librechat-data-provider');
 const { retrieveAndProcessFile } = require('~/server/services/Files/process');
 const { recordMessage, getMessages } = require('~/models/Message');
+const { spendTokens } = require('~/models/spendTokens');
 const { saveConvo } = require('~/models/Conversation');
-const spendTokens = require('~/models/spendTokens');
 const { countTokens } = require('~/server/utils');

 /**
--- a/api/typedefs.js
+++ b/api/typedefs.js
@ -26,6 +26,24 @@
 * @memberof typedefs
 */

+/**
+ * @exports AnthropicMessage
+ * @typedef {import('@anthropic-ai/sdk').default.MessageParam} AnthropicMessage
+ * @memberof typedefs
+ */
+
+/**
+ * @exports AnthropicMessageStartEvent
+ * @typedef {import('@anthropic-ai/sdk').default.MessageStartEvent} AnthropicMessageStartEvent
+ * @memberof typedefs
+ */
+
+/**
+ * @exports AnthropicMessageDeltaEvent
+ * @typedef {import('@anthropic-ai/sdk').default.MessageDeltaEvent} AnthropicMessageDeltaEvent
+ * @memberof typedefs
+ */
+
 /**
 * @exports GenerativeModel
 * @typedef {import('@google/generative-ai').GenerativeModel} GenerativeModel
@ -1311,6 +1329,33 @@
 * @method messageCompleted Handles the completion of a message processing.
 */

+/* TX Types */
+
+/**
+ * @typedef {object} txData - Transaction data.
+ * @property {mongoose.Schema.Types.ObjectId} user - The user ID.
+ * @property {String} conversationId - The ID of the conversation.
+ * @property {String} model - The model name.
+ * @property {String} context - The context in which the transaction is made.
+ * @property {EndpointTokenConfig} [endpointTokenConfig] - The current endpoint token config.
+ * @property {object} [cacheUsage] - Cache usage, if any.
+ * @property {String} [valueKey] - The value key (optional).
+ * @memberof typedefs
+ */
+
+/**
+ * https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#pricing
+ * @typedef {object} AnthropicStreamUsage - Stream usage for Anthropic
+ * @property {number} [input_tokens] - The number of input tokens used.
+ * @property {number} [cache_creation_input_tokens] - The number of cache creation input tokens used (write).
+ * @property {number} [cache_read_input_tokens] - The number of cache input tokens used (read).
+ * @property {number} [output_tokens] - The number of output tokens used.
+ */
+
+/**
+ * @typedef {AnthropicStreamUsage} StreamUsage - Stream usage for all providers (currently only Anthropic)
+ */
+
 /* Native app/client methods */

 /**