📂 refactor: File Read Operations (#9747)

* fix: axios response logging for text parsing, remove console logging, remove jsdoc

* refactor: error logging in logAxiosError function to handle various error types with type guards

* refactor: enhance text parsing with improved error handling and async file reading

* refactor: replace synchronous file reading with asynchronous methods for improved performance and memory management

* ci: update tests
This commit is contained in:
Danny Avila 2025-09-20 10:17:24 -04:00 committed by GitHub
parent 0352067da2
commit 2489670f54
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 692 additions and 83 deletions

View file

@ -45,6 +45,10 @@ jest.mock('~/utils/axios', () => ({
logAxiosError: jest.fn(({ message }) => message || 'Error'),
}));
jest.mock('~/utils/files', () => ({
readFileAsBuffer: jest.fn(),
}));
import * as fs from 'fs';
import axios from 'axios';
import { HttpsProxyAgent } from 'https-proxy-agent';
@ -56,6 +60,7 @@ import type {
OCRResult,
} from '~/types';
import { logger as mockLogger } from '@librechat/data-schemas';
import { readFileAsBuffer } from '~/utils/files';
import {
uploadDocumentToMistral,
uploadAzureMistralOCR,
@ -1978,9 +1983,10 @@ describe('MistralOCR Service', () => {
describe('Azure Mistral OCR with proxy', () => {
beforeEach(() => {
(jest.mocked(fs).readFileSync as jest.Mock).mockReturnValue(
Buffer.from('mock-file-content'),
);
(readFileAsBuffer as jest.Mock).mockResolvedValue({
content: Buffer.from('mock-file-content'),
bytes: Buffer.from('mock-file-content').length,
});
});
it('should use proxy for Azure Mistral OCR requests', async () => {
@ -2098,7 +2104,10 @@ describe('MistralOCR Service', () => {
describe('uploadAzureMistralOCR', () => {
beforeEach(() => {
(jest.mocked(fs).readFileSync as jest.Mock).mockReturnValue(Buffer.from('mock-file-content'));
(readFileAsBuffer as jest.Mock).mockResolvedValue({
content: Buffer.from('mock-file-content'),
bytes: Buffer.from('mock-file-content').length,
});
// Reset the HttpsProxyAgent mock to its default implementation for Azure tests
(HttpsProxyAgent as unknown as jest.Mock).mockImplementation((url) => ({ proxyUrl: url }));
// Clean up any PROXY env var from previous tests
@ -2172,7 +2181,9 @@ describe('MistralOCR Service', () => {
loadAuthValues: mockLoadAuthValues,
});
expect(jest.mocked(fs).readFileSync).toHaveBeenCalledWith('/tmp/upload/azure-file.pdf');
expect(readFileAsBuffer).toHaveBeenCalledWith('/tmp/upload/azure-file.pdf', {
fileSize: undefined,
});
// Verify OCR was called with base64 data URL
expect(mockAxios.post).toHaveBeenCalledWith(

View file

@ -22,6 +22,7 @@ import type {
OCRImage,
} from '~/types';
import { logAxiosError, createAxiosInstance } from '~/utils/axios';
import { readFileAsBuffer } from '~/utils/files';
import { loadServiceKey } from '~/utils/key';
const axios = createAxiosInstance();
@ -464,7 +465,9 @@ export const uploadAzureMistralOCR = async (
const { apiKey, baseURL } = await loadAuthConfig(context);
const model = getModelConfig(context.req.config?.ocr);
const buffer = fs.readFileSync(context.file.path);
const { content: buffer } = await readFileAsBuffer(context.file.path, {
fileSize: context.file.size,
});
const base64 = buffer.toString('base64');
/** Uses actual mimetype of the file, 'image/jpeg' as fallback since it seems to be accepted regardless of mismatch */
const base64Prefix = `data:${context.file.mimetype || 'image/jpeg'};base64,`;
@ -691,7 +694,9 @@ export const uploadGoogleVertexMistralOCR = async (
const { serviceAccount, accessToken } = await loadGoogleAuthConfig();
const model = getModelConfig(context.req.config?.ocr);
const buffer = fs.readFileSync(context.file.path);
const { content: buffer } = await readFileAsBuffer(context.file.path, {
fileSize: context.file.size,
});
const base64 = buffer.toString('base64');
const base64Prefix = `data:${context.file.mimetype || 'application/pdf'};base64,`;

View file

@ -9,8 +9,6 @@ jest.mock('@librechat/data-schemas', () => ({
},
}));
import { parseTextNative, parseText } from './text';
jest.mock('fs', () => ({
readFileSync: jest.fn(),
createReadStream: jest.fn(),
@ -36,10 +34,24 @@ jest.mock('form-data', () => {
}));
});
// Mock the utils module to avoid AWS SDK issues
jest.mock('../utils', () => ({
logAxiosError: jest.fn((args) => {
if (typeof args === 'object' && args.message) {
return args.message;
}
return 'Error';
}),
readFileAsString: jest.fn(),
}));
// Now import everything after mocks are in place
import { parseTextNative, parseText } from './text';
import fs, { ReadStream } from 'fs';
import axios from 'axios';
import FormData from 'form-data';
import { generateShortLivedToken } from '../crypto/jwt';
import { readFileAsString } from '../utils';
const mockedFs = fs as jest.Mocked<typeof fs>;
const mockedAxios = axios as jest.Mocked<typeof axios>;
@ -47,6 +59,7 @@ const mockedFormData = FormData as jest.MockedClass<typeof FormData>;
const mockedGenerateShortLivedToken = generateShortLivedToken as jest.MockedFunction<
typeof generateShortLivedToken
>;
const mockedReadFileAsString = readFileAsString as jest.MockedFunction<typeof readFileAsString>;
describe('text', () => {
const mockFile: Express.Multer.File = {
@ -74,29 +87,32 @@ describe('text', () => {
});
describe('parseTextNative', () => {
it('should successfully parse a text file', () => {
it('should successfully parse a text file', async () => {
const mockText = 'Hello, world!';
mockedFs.readFileSync.mockReturnValue(mockText);
const mockBytes = Buffer.byteLength(mockText, 'utf8');
const result = parseTextNative(mockFile);
mockedReadFileAsString.mockResolvedValue({
content: mockText,
bytes: mockBytes,
});
expect(mockedFs.readFileSync).toHaveBeenCalledWith('/tmp/test.txt', 'utf8');
const result = await parseTextNative(mockFile);
expect(mockedReadFileAsString).toHaveBeenCalledWith('/tmp/test.txt', {
fileSize: 100,
});
expect(result).toEqual({
text: mockText,
bytes: Buffer.byteLength(mockText, 'utf8'),
bytes: mockBytes,
source: FileSources.text,
});
});
it('should throw an error when file cannot be read', () => {
it('should handle file read errors', async () => {
const mockError = new Error('File not found');
mockedFs.readFileSync.mockImplementation(() => {
throw mockError;
});
mockedReadFileAsString.mockRejectedValue(mockError);
expect(() => parseTextNative(mockFile)).toThrow(
'Failed to read file as text: Error: File not found',
);
await expect(parseTextNative(mockFile)).rejects.toThrow('File not found');
});
});
@ -115,7 +131,12 @@ describe('text', () => {
it('should fall back to native parsing when RAG_API_URL is not defined', async () => {
const mockText = 'Native parsing result';
mockedFs.readFileSync.mockReturnValue(mockText);
const mockBytes = Buffer.byteLength(mockText, 'utf8');
mockedReadFileAsString.mockResolvedValue({
content: mockText,
bytes: mockBytes,
});
const result = await parseText({
req: mockReq,
@ -125,7 +146,7 @@ describe('text', () => {
expect(result).toEqual({
text: mockText,
bytes: Buffer.byteLength(mockText, 'utf8'),
bytes: mockBytes,
source: FileSources.text,
});
expect(mockedAxios.get).not.toHaveBeenCalled();
@ -134,7 +155,12 @@ describe('text', () => {
it('should fall back to native parsing when health check fails', async () => {
process.env.RAG_API_URL = 'http://rag-api.test';
const mockText = 'Native parsing result';
mockedFs.readFileSync.mockReturnValue(mockText);
const mockBytes = Buffer.byteLength(mockText, 'utf8');
mockedReadFileAsString.mockResolvedValue({
content: mockText,
bytes: mockBytes,
});
mockedAxios.get.mockRejectedValue(new Error('Health check failed'));
@ -145,11 +171,11 @@ describe('text', () => {
});
expect(mockedAxios.get).toHaveBeenCalledWith('http://rag-api.test/health', {
timeout: 5000,
timeout: 10000,
});
expect(result).toEqual({
text: mockText,
bytes: Buffer.byteLength(mockText, 'utf8'),
bytes: mockBytes,
source: FileSources.text,
});
});
@ -157,7 +183,12 @@ describe('text', () => {
it('should fall back to native parsing when health check returns non-OK status', async () => {
process.env.RAG_API_URL = 'http://rag-api.test';
const mockText = 'Native parsing result';
mockedFs.readFileSync.mockReturnValue(mockText);
const mockBytes = Buffer.byteLength(mockText, 'utf8');
mockedReadFileAsString.mockResolvedValue({
content: mockText,
bytes: mockBytes,
});
mockedAxios.get.mockResolvedValue({
status: 500,
@ -172,7 +203,7 @@ describe('text', () => {
expect(result).toEqual({
text: mockText,
bytes: Buffer.byteLength(mockText, 'utf8'),
bytes: mockBytes,
source: FileSources.text,
});
});
@ -207,7 +238,12 @@ describe('text', () => {
it('should fall back to native parsing when RAG API response lacks text property', async () => {
process.env.RAG_API_URL = 'http://rag-api.test';
const mockText = 'Native parsing result';
mockedFs.readFileSync.mockReturnValue(mockText);
const mockBytes = Buffer.byteLength(mockText, 'utf8');
mockedReadFileAsString.mockResolvedValue({
content: mockText,
bytes: mockBytes,
});
mockedAxios.get.mockResolvedValue({
status: 200,
@ -226,7 +262,7 @@ describe('text', () => {
expect(result).toEqual({
text: mockText,
bytes: Buffer.byteLength(mockText, 'utf8'),
bytes: mockBytes,
source: FileSources.text,
});
});
@ -234,7 +270,12 @@ describe('text', () => {
it('should fall back to native parsing when user is undefined', async () => {
process.env.RAG_API_URL = 'http://rag-api.test';
const mockText = 'Native parsing result';
mockedFs.readFileSync.mockReturnValue(mockText);
const mockBytes = Buffer.byteLength(mockText, 'utf8');
mockedReadFileAsString.mockResolvedValue({
content: mockText,
bytes: mockBytes,
});
const result = await parseText({
req: { user: undefined },
@ -247,7 +288,7 @@ describe('text', () => {
expect(mockedAxios.post).not.toHaveBeenCalled();
expect(result).toEqual({
text: mockText,
bytes: Buffer.byteLength(mockText, 'utf8'),
bytes: mockBytes,
source: FileSources.text,
});
});

View file

@ -1,18 +1,19 @@
import fs from 'fs';
import axios from 'axios';
import FormData from 'form-data';
import { createReadStream } from 'fs';
import { logger } from '@librechat/data-schemas';
import { FileSources } from 'librechat-data-provider';
import type { Request as ServerRequest } from 'express';
import { logAxiosError, readFileAsString } from '~/utils';
import { generateShortLivedToken } from '~/crypto/jwt';
/**
* Attempts to parse text using RAG API, falls back to native text parsing
* @param {Object} params - The parameters object
* @param {Express.Request} params.req - The Express request object
* @param {Express.Multer.File} params.file - The uploaded file
* @param {string} params.file_id - The file ID
* @returns {Promise<{text: string, bytes: number, source: string}>}
* @param params - The parameters object
* @param params.req - The Express request object
* @param params.file - The uploaded file
* @param params.file_id - The file ID
* @returns
*/
export async function parseText({
req,
@ -30,32 +31,33 @@ export async function parseText({
return parseTextNative(file);
}
if (!req.user?.id) {
const userId = req.user?.id;
if (!userId) {
logger.debug('[parseText] No user ID provided, falling back to native text parsing');
return parseTextNative(file);
}
try {
const healthResponse = await axios.get(`${process.env.RAG_API_URL}/health`, {
timeout: 5000,
timeout: 10000,
});
if (healthResponse?.statusText !== 'OK' && healthResponse?.status !== 200) {
logger.debug('[parseText] RAG API health check failed, falling back to native parsing');
return parseTextNative(file);
}
} catch (healthError) {
logger.debug(
'[parseText] RAG API health check failed, falling back to native parsing',
healthError,
);
logAxiosError({
message: '[parseText] RAG API health check failed, falling back to native parsing:',
error: healthError,
});
return parseTextNative(file);
}
try {
const jwtToken = generateShortLivedToken(req.user.id);
const jwtToken = generateShortLivedToken(userId);
const formData = new FormData();
formData.append('file_id', file_id);
formData.append('file', fs.createReadStream(file.path));
formData.append('file', createReadStream(file.path));
const formHeaders = formData.getHeaders();
@ -69,7 +71,7 @@ export async function parseText({
});
const responseData = response.data;
logger.debug('[parseText] Response from RAG API', responseData);
logger.debug(`[parseText] RAG API completed successfully (${response.status})`);
if (!('text' in responseData)) {
throw new Error('RAG API did not return parsed text');
@ -81,7 +83,10 @@ export async function parseText({
source: FileSources.text,
};
} catch (error) {
logger.warn('[parseText] RAG API text parsing failed, falling back to native parsing', error);
logAxiosError({
message: '[parseText] RAG API text parsing failed, falling back to native parsing',
error,
});
return parseTextNative(file);
}
}
@ -89,25 +94,21 @@ export async function parseText({
/**
* Native JavaScript text parsing fallback
* Simple text file reading - complex formats handled by RAG API
* @param {Express.Multer.File} file - The uploaded file
* @returns {{text: string, bytes: number, source: string}}
* @param file - The uploaded file
* @returns
*/
export function parseTextNative(file: Express.Multer.File): {
export async function parseTextNative(file: Express.Multer.File): Promise<{
text: string;
bytes: number;
source: string;
} {
try {
const text = fs.readFileSync(file.path, 'utf8');
const bytes = Buffer.byteLength(text, 'utf8');
}> {
const { content: text, bytes } = await readFileAsString(file.path, {
fileSize: file.size,
});
return {
text,
bytes,
source: FileSources.text,
};
} catch (error) {
console.error('[parseTextNative] Failed to parse file:', error);
throw new Error(`Failed to read file as text: ${error}`);
}
return {
text,
bytes,
source: FileSources.text,
};
}