📂 refactor: File Read Operations (#9747)

* fix: axios response logging for text parsing, remove console logging, remove jsdoc * refactor: error logging in logAxiosError function to handle various error types with type guards * refactor: enhance text parsing with improved error handling and async file reading * refactor: replace synchronous file reading with asynchronous methods for improved performance and memory management * ci: update tests
2026-03-02 06:10:18 +01:00 · 2025-09-20 10:17:24 -04:00 · 2025-09-20 10:17:24 -04:00 · 2489670f54
commit 2489670f54
parent 0352067da2
10 changed files with 692 additions and 83 deletions
--- a/packages/api/src/files/mistral/crud.spec.ts
+++ b/packages/api/src/files/mistral/crud.spec.ts
@ -45,6 +45,10 @@ jest.mock('~/utils/axios', () => ({
  logAxiosError: jest.fn(({ message }) => message || 'Error'),
 }));

+jest.mock('~/utils/files', () => ({
+  readFileAsBuffer: jest.fn(),
+}));
+
 import * as fs from 'fs';
 import axios from 'axios';
 import { HttpsProxyAgent } from 'https-proxy-agent';
@ -56,6 +60,7 @@ import type {
  OCRResult,
 } from '~/types';
 import { logger as mockLogger } from '@librechat/data-schemas';
+import { readFileAsBuffer } from '~/utils/files';
 import {
  uploadDocumentToMistral,
  uploadAzureMistralOCR,
@ -1978,9 +1983,10 @@ describe('MistralOCR Service', () => {

    describe('Azure Mistral OCR with proxy', () => {
      beforeEach(() => {
-        (jest.mocked(fs).readFileSync as jest.Mock).mockReturnValue(
-          Buffer.from('mock-file-content'),
-        );
+        (readFileAsBuffer as jest.Mock).mockResolvedValue({
+          content: Buffer.from('mock-file-content'),
+          bytes: Buffer.from('mock-file-content').length,
+        });
      });

      it('should use proxy for Azure Mistral OCR requests', async () => {
@ -2098,7 +2104,10 @@ describe('MistralOCR Service', () => {

  describe('uploadAzureMistralOCR', () => {
    beforeEach(() => {
-      (jest.mocked(fs).readFileSync as jest.Mock).mockReturnValue(Buffer.from('mock-file-content'));
+      (readFileAsBuffer as jest.Mock).mockResolvedValue({
+        content: Buffer.from('mock-file-content'),
+        bytes: Buffer.from('mock-file-content').length,
+      });
      // Reset the HttpsProxyAgent mock to its default implementation for Azure tests
      (HttpsProxyAgent as unknown as jest.Mock).mockImplementation((url) => ({ proxyUrl: url }));
      // Clean up any PROXY env var from previous tests
@ -2172,7 +2181,9 @@ describe('MistralOCR Service', () => {
        loadAuthValues: mockLoadAuthValues,
      });

-      expect(jest.mocked(fs).readFileSync).toHaveBeenCalledWith('/tmp/upload/azure-file.pdf');
+      expect(readFileAsBuffer).toHaveBeenCalledWith('/tmp/upload/azure-file.pdf', {
+        fileSize: undefined,
+      });

      // Verify OCR was called with base64 data URL
      expect(mockAxios.post).toHaveBeenCalledWith(
--- a/packages/api/src/files/mistral/crud.ts
+++ b/packages/api/src/files/mistral/crud.ts
@ -22,6 +22,7 @@ import type {
  OCRImage,
 } from '~/types';
 import { logAxiosError, createAxiosInstance } from '~/utils/axios';
+import { readFileAsBuffer } from '~/utils/files';
 import { loadServiceKey } from '~/utils/key';

 const axios = createAxiosInstance();
@ -464,7 +465,9 @@ export const uploadAzureMistralOCR = async (
    const { apiKey, baseURL } = await loadAuthConfig(context);
    const model = getModelConfig(context.req.config?.ocr);

-    const buffer = fs.readFileSync(context.file.path);
+    const { content: buffer } = await readFileAsBuffer(context.file.path, {
+      fileSize: context.file.size,
+    });
    const base64 = buffer.toString('base64');
    /** Uses actual mimetype of the file, 'image/jpeg' as fallback since it seems to be accepted regardless of mismatch */
    const base64Prefix = `data:${context.file.mimetype || 'image/jpeg'};base64,`;
@ -691,7 +694,9 @@ export const uploadGoogleVertexMistralOCR = async (
    const { serviceAccount, accessToken } = await loadGoogleAuthConfig();
    const model = getModelConfig(context.req.config?.ocr);

-    const buffer = fs.readFileSync(context.file.path);
+    const { content: buffer } = await readFileAsBuffer(context.file.path, {
+      fileSize: context.file.size,
+    });
    const base64 = buffer.toString('base64');
    const base64Prefix = `data:${context.file.mimetype || 'application/pdf'};base64,`;

--- a/packages/api/src/files/text.spec.ts
+++ b/packages/api/src/files/text.spec.ts
@ -9,8 +9,6 @@ jest.mock('@librechat/data-schemas', () => ({
  },
 }));

-import { parseTextNative, parseText } from './text';
-
 jest.mock('fs', () => ({
  readFileSync: jest.fn(),
  createReadStream: jest.fn(),
@ -36,10 +34,24 @@ jest.mock('form-data', () => {
  }));
 });

+// Mock the utils module to avoid AWS SDK issues
+jest.mock('../utils', () => ({
+  logAxiosError: jest.fn((args) => {
+    if (typeof args === 'object' && args.message) {
+      return args.message;
+    }
+    return 'Error';
+  }),
+  readFileAsString: jest.fn(),
+}));
+
+// Now import everything after mocks are in place
+import { parseTextNative, parseText } from './text';
 import fs, { ReadStream } from 'fs';
 import axios from 'axios';
 import FormData from 'form-data';
 import { generateShortLivedToken } from '../crypto/jwt';
+import { readFileAsString } from '../utils';

 const mockedFs = fs as jest.Mocked<typeof fs>;
 const mockedAxios = axios as jest.Mocked<typeof axios>;
@ -47,6 +59,7 @@ const mockedFormData = FormData as jest.MockedClass<typeof FormData>;
 const mockedGenerateShortLivedToken = generateShortLivedToken as jest.MockedFunction<
  typeof generateShortLivedToken
 >;
+const mockedReadFileAsString = readFileAsString as jest.MockedFunction<typeof readFileAsString>;

 describe('text', () => {
  const mockFile: Express.Multer.File = {
@ -74,29 +87,32 @@ describe('text', () => {
  });

  describe('parseTextNative', () => {
-    it('should successfully parse a text file', () => {
+    it('should successfully parse a text file', async () => {
      const mockText = 'Hello, world!';
-      mockedFs.readFileSync.mockReturnValue(mockText);
+      const mockBytes = Buffer.byteLength(mockText, 'utf8');

-      const result = parseTextNative(mockFile);
+      mockedReadFileAsString.mockResolvedValue({
+        content: mockText,
+        bytes: mockBytes,
+      });

-      expect(mockedFs.readFileSync).toHaveBeenCalledWith('/tmp/test.txt', 'utf8');
+      const result = await parseTextNative(mockFile);
+
+      expect(mockedReadFileAsString).toHaveBeenCalledWith('/tmp/test.txt', {
+        fileSize: 100,
+      });
      expect(result).toEqual({
        text: mockText,
-        bytes: Buffer.byteLength(mockText, 'utf8'),
+        bytes: mockBytes,
        source: FileSources.text,
      });
    });

-    it('should throw an error when file cannot be read', () => {
+    it('should handle file read errors', async () => {
      const mockError = new Error('File not found');
-      mockedFs.readFileSync.mockImplementation(() => {
-        throw mockError;
-      });
+      mockedReadFileAsString.mockRejectedValue(mockError);

-      expect(() => parseTextNative(mockFile)).toThrow(
-        'Failed to read file as text: Error: File not found',
-      );
+      await expect(parseTextNative(mockFile)).rejects.toThrow('File not found');
    });
  });

@ -115,7 +131,12 @@ describe('text', () => {

    it('should fall back to native parsing when RAG_API_URL is not defined', async () => {
      const mockText = 'Native parsing result';
-      mockedFs.readFileSync.mockReturnValue(mockText);
+      const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+      mockedReadFileAsString.mockResolvedValue({
+        content: mockText,
+        bytes: mockBytes,
+      });

      const result = await parseText({
        req: mockReq,
@ -125,7 +146,7 @@ describe('text', () => {

      expect(result).toEqual({
        text: mockText,
-        bytes: Buffer.byteLength(mockText, 'utf8'),
+        bytes: mockBytes,
        source: FileSources.text,
      });
      expect(mockedAxios.get).not.toHaveBeenCalled();
@ -134,7 +155,12 @@ describe('text', () => {
    it('should fall back to native parsing when health check fails', async () => {
      process.env.RAG_API_URL = 'http://rag-api.test';
      const mockText = 'Native parsing result';
-      mockedFs.readFileSync.mockReturnValue(mockText);
+      const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+      mockedReadFileAsString.mockResolvedValue({
+        content: mockText,
+        bytes: mockBytes,
+      });

      mockedAxios.get.mockRejectedValue(new Error('Health check failed'));

@ -145,11 +171,11 @@ describe('text', () => {
      });

      expect(mockedAxios.get).toHaveBeenCalledWith('http://rag-api.test/health', {
-        timeout: 5000,
+        timeout: 10000,
      });
      expect(result).toEqual({
        text: mockText,
-        bytes: Buffer.byteLength(mockText, 'utf8'),
+        bytes: mockBytes,
        source: FileSources.text,
      });
    });
@ -157,7 +183,12 @@ describe('text', () => {
    it('should fall back to native parsing when health check returns non-OK status', async () => {
      process.env.RAG_API_URL = 'http://rag-api.test';
      const mockText = 'Native parsing result';
-      mockedFs.readFileSync.mockReturnValue(mockText);
+      const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+      mockedReadFileAsString.mockResolvedValue({
+        content: mockText,
+        bytes: mockBytes,
+      });

      mockedAxios.get.mockResolvedValue({
        status: 500,
@ -172,7 +203,7 @@ describe('text', () => {

      expect(result).toEqual({
        text: mockText,
-        bytes: Buffer.byteLength(mockText, 'utf8'),
+        bytes: mockBytes,
        source: FileSources.text,
      });
    });
@ -207,7 +238,12 @@ describe('text', () => {
    it('should fall back to native parsing when RAG API response lacks text property', async () => {
      process.env.RAG_API_URL = 'http://rag-api.test';
      const mockText = 'Native parsing result';
-      mockedFs.readFileSync.mockReturnValue(mockText);
+      const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+      mockedReadFileAsString.mockResolvedValue({
+        content: mockText,
+        bytes: mockBytes,
+      });

      mockedAxios.get.mockResolvedValue({
        status: 200,
@ -226,7 +262,7 @@ describe('text', () => {

      expect(result).toEqual({
        text: mockText,
-        bytes: Buffer.byteLength(mockText, 'utf8'),
+        bytes: mockBytes,
        source: FileSources.text,
      });
    });
@ -234,7 +270,12 @@ describe('text', () => {
    it('should fall back to native parsing when user is undefined', async () => {
      process.env.RAG_API_URL = 'http://rag-api.test';
      const mockText = 'Native parsing result';
-      mockedFs.readFileSync.mockReturnValue(mockText);
+      const mockBytes = Buffer.byteLength(mockText, 'utf8');
+
+      mockedReadFileAsString.mockResolvedValue({
+        content: mockText,
+        bytes: mockBytes,
+      });

      const result = await parseText({
        req: { user: undefined },
@ -247,7 +288,7 @@ describe('text', () => {
      expect(mockedAxios.post).not.toHaveBeenCalled();
      expect(result).toEqual({
        text: mockText,
-        bytes: Buffer.byteLength(mockText, 'utf8'),
+        bytes: mockBytes,
        source: FileSources.text,
      });
    });
--- a/packages/api/src/files/text.ts
+++ b/packages/api/src/files/text.ts
@ -1,18 +1,19 @@
-import fs from 'fs';
 import axios from 'axios';
 import FormData from 'form-data';
+import { createReadStream } from 'fs';
 import { logger } from '@librechat/data-schemas';
 import { FileSources } from 'librechat-data-provider';
 import type { Request as ServerRequest } from 'express';
+import { logAxiosError, readFileAsString } from '~/utils';
 import { generateShortLivedToken } from '~/crypto/jwt';

 /**
 * Attempts to parse text using RAG API, falls back to native text parsing
- * @param {Object} params - The parameters object
- * @param {Express.Request} params.req - The Express request object
- * @param {Express.Multer.File} params.file - The uploaded file
- * @param {string} params.file_id - The file ID
- * @returns {Promise<{text: string, bytes: number, source: string}>}
+ * @param params - The parameters object
+ * @param params.req - The Express request object
+ * @param params.file - The uploaded file
+ * @param params.file_id - The file ID
+ * @returns
 */
 export async function parseText({
  req,
@ -30,32 +31,33 @@ export async function parseText({
    return parseTextNative(file);
  }

-  if (!req.user?.id) {
+  const userId = req.user?.id;
+  if (!userId) {
    logger.debug('[parseText] No user ID provided, falling back to native text parsing');
    return parseTextNative(file);
  }

  try {
    const healthResponse = await axios.get(`${process.env.RAG_API_URL}/health`, {
-      timeout: 5000,
+      timeout: 10000,
    });
    if (healthResponse?.statusText !== 'OK' && healthResponse?.status !== 200) {
      logger.debug('[parseText] RAG API health check failed, falling back to native parsing');
      return parseTextNative(file);
    }
  } catch (healthError) {
-    logger.debug(
-      '[parseText] RAG API health check failed, falling back to native parsing',
-      healthError,
-    );
+    logAxiosError({
+      message: '[parseText] RAG API health check failed, falling back to native parsing:',
+      error: healthError,
+    });
    return parseTextNative(file);
  }

  try {
-    const jwtToken = generateShortLivedToken(req.user.id);
+    const jwtToken = generateShortLivedToken(userId);
    const formData = new FormData();
    formData.append('file_id', file_id);
-    formData.append('file', fs.createReadStream(file.path));
+    formData.append('file', createReadStream(file.path));

    const formHeaders = formData.getHeaders();

@ -69,7 +71,7 @@ export async function parseText({
    });

    const responseData = response.data;
-    logger.debug('[parseText] Response from RAG API', responseData);
+    logger.debug(`[parseText] RAG API completed successfully (${response.status})`);

    if (!('text' in responseData)) {
      throw new Error('RAG API did not return parsed text');
@ -81,7 +83,10 @@ export async function parseText({
      source: FileSources.text,
    };
  } catch (error) {
-    logger.warn('[parseText] RAG API text parsing failed, falling back to native parsing', error);
+    logAxiosError({
+      message: '[parseText] RAG API text parsing failed, falling back to native parsing',
+      error,
+    });
    return parseTextNative(file);
  }
 }
@ -89,25 +94,21 @@ export async function parseText({
 /**
 * Native JavaScript text parsing fallback
 * Simple text file reading - complex formats handled by RAG API
- * @param {Express.Multer.File} file - The uploaded file
- * @returns {{text: string, bytes: number, source: string}}
+ * @param file - The uploaded file
+ * @returns
 */
-export function parseTextNative(file: Express.Multer.File): {
+export async function parseTextNative(file: Express.Multer.File): Promise<{
  text: string;
  bytes: number;
  source: string;
-} {
-  try {
-    const text = fs.readFileSync(file.path, 'utf8');
-    const bytes = Buffer.byteLength(text, 'utf8');
+}> {
+  const { content: text, bytes } = await readFileAsString(file.path, {
+    fileSize: file.size,
+  });

-    return {
-      text,
-      bytes,
-      source: FileSources.text,
-    };
-  } catch (error) {
-    console.error('[parseTextNative] Failed to parse file:', error);
-    throw new Error(`Failed to read file as text: ${error}`);
-  }
+  return {
+    text,
+    bytes,
+    source: FileSources.text,
+  };
 }