🔗 fix: File Citation Processing to Use Tool Artifacts

2025-12-17 00:40:14 +01:00 · 2025-07-30 19:24:01 -04:00 · 2025-07-30 19:24:01 -04:00 · fc8fd489d6
commit fc8fd489d6
parent 81b32e400a
8 changed files with 524 additions and 538 deletions
--- a/api/test/services/Files/processAgentResponse.test.js
+++ b/api/test/services/Files/processAgentResponse.test.js
@@ -1,237 +0,0 @@
-const { processAgentResponse } = require('../../../app/clients/agents/processAgentResponse');
-const { Files } = require('../../../models');
-const { getCustomConfig } = require('../../../server/services/Config/getCustomConfig');
-
-// Mock dependencies
-jest.mock('../../../models', () => ({
-  Files: {
-    find: jest.fn(),
-  },
-}));
-
-jest.mock('../../../server/services/Config/getCustomConfig', () => ({
-  getCustomConfig: jest.fn(),
-}));
-
-jest.mock('../../../config', () => ({
-  logger: {
-    warn: jest.fn(),
-    error: jest.fn(),
-    debug: jest.fn(),
-  },
-}));
-
-describe('processAgentResponse', () => {
-  beforeEach(() => {
-    jest.clearAllMocks();
-  });
-
-  it('should return response unchanged when no messageId', async () => {
-    const response = { messageId: null };
-    const result = await processAgentResponse(response, 'user123', 'conv123');
-    expect(result).toBe(response);
-  });
-
-  it('should return response unchanged when no file search results', async () => {
-    getCustomConfig.mockResolvedValue({ endpoints: { agents: { maxCitations: 10 } } });
-
-    const response = { messageId: 'msg123' };
-    const contentParts = [{ type: 'text', content: 'some text' }];
-
-    const result = await processAgentResponse(response, 'user123', 'conv123', contentParts);
-    expect(result).toBe(response);
-  });
-
-  it('should process file search results and create attachments', async () => {
-    getCustomConfig.mockResolvedValue({
-      endpoints: { agents: { maxCitations: 10 } },
-      fileStrategy: 's3',
-    });
-
-    Files.find.mockResolvedValue([
-      {
-        file_id: 'file123',
-        source: 's3',
-        filename: 'test.pdf',
-      },
-    ]);
-
-    const response = { messageId: 'msg123' };
-    const contentParts = [
-      {
-        type: 'tool_call',
-        tool_call: {
-          name: 'file_search',
-          output: `File: test.pdf
-File_ID: file123
-Relevance: 0.8
-Page: 1
-Storage_Type: s3
-S3_Bucket: test-bucket
-S3_Key: uploads/user123/file123__test.pdf
-Content: Test content`,
-        },
-      },
-    ];
-
-    const result = await processAgentResponse(response, 'user123', 'conv123', contentParts);
-
-    expect(result.attachments).toBeDefined();
-    expect(result.attachments).toHaveLength(1);
-    expect(result.attachments[0].type).toBe('file_search');
-    expect(result.attachments[0].file_search.sources).toBeDefined();
-    expect(result.attachments[0].file_search.sources).toHaveLength(1);
-
-    const source = result.attachments[0].file_search.sources[0];
-    expect(source.fileId).toBe('file123');
-    expect(source.fileName).toBe('test.pdf');
-    expect(source.metadata.storageType).toBe('s3');
-  });
-
-  it('should use configured fileStrategy when file metadata is missing', async () => {
-    getCustomConfig.mockResolvedValue({
-      endpoints: { agents: { maxCitations: 10 } },
-      fileStrategy: 's3',
-    });
-
-    Files.find.mockResolvedValue([
-      {
-        file_id: 'file123',
-        // source is undefined, should fallback to fileStrategy
-      },
-    ]);
-
-    const response = { messageId: 'msg123' };
-    const contentParts = [
-      {
-        type: 'tool_call',
-        tool_call: {
-          name: 'file_search',
-          output: `File: test.pdf
-File_ID: file123
-Relevance: 0.8
-Content: Test content`,
-        },
-      },
-    ];
-
-    const result = await processAgentResponse(response, 'user123', 'conv123', contentParts);
-
-    const source = result.attachments[0].file_search.sources[0];
-    expect(source.metadata.storageType).toBe('s3'); // Should use fileStrategy
-  });
-
-  it('should handle file diversity and allow multiple pages per file', async () => {
-    getCustomConfig.mockResolvedValue({
-      endpoints: { agents: { maxCitations: 5, maxCitationsPerFile: 3 } },
-      fileStrategy: 's3',
-    });
-
-    Files.find.mockResolvedValue([
-      { file_id: 'file1', source: 'local', filename: 'test1.pdf' },
-      { file_id: 'file2', source: 'local', filename: 'test2.pdf' },
-    ]);
-
-    const response = { messageId: 'msg123' };
-    const contentParts = [
-      {
-        type: 'tool_call',
-        tool_call: {
-          name: 'file_search',
-          output: `File: test1.pdf
-File_ID: file1
-Relevance: 0.9
-Page: 1
-Content: High relevance content
-
---
-
-File: test1.pdf  
-File_ID: file1
-Relevance: 0.7
-Page: 2
-Content: Lower relevance content
-
---
-
-File: test2.pdf
-File_ID: file2
-Relevance: 0.8
-Page: 1
-Content: Different file content`,
-        },
-      },
-    ];
-
-    const result = await processAgentResponse(response, 'user123', 'conv123', contentParts);
-
-    const sources = result.attachments[0].file_search.sources;
-    expect(sources.length).toBeGreaterThanOrEqual(2); // Can include multiple pages per file now
-
-    // Should have both files represented
-    const fileIds = sources.map((s) => s.fileId);
-    expect(fileIds).toContain('file1');
-    expect(fileIds).toContain('file2');
-
-    // Should include multiple pages from file1 due to high relevance
-    const file1Sources = sources.filter((s) => s.fileId === 'file1');
-    expect(file1Sources.length).toBeGreaterThanOrEqual(1);
-  });
-
-  it('should respect maxCitationsPerFile configuration', async () => {
-    getCustomConfig.mockResolvedValue({
-      endpoints: { agents: { maxCitations: 10, maxCitationsPerFile: 2 } },
-      fileStrategy: 'local',
-    });
-
-    Files.find.mockResolvedValue([{ file_id: 'file1', source: 'local', filename: 'test1.pdf' }]);
-
-    const response = { messageId: 'msg123' };
-    const contentParts = [
-      {
-        type: 'tool_call',
-        tool_call: {
-          name: 'file_search',
-          output: `File: test1.pdf
-File_ID: file1
-Relevance: 0.9
-Page: 1
-Content: Page 1 content
-
---
-
-File: test1.pdf
-File_ID: file1
-Relevance: 0.8
-Page: 2
-Content: Page 2 content
-
---
-
-File: test1.pdf
-File_ID: file1
-Relevance: 0.7
-Page: 3
-Content: Page 3 content
-
---
-
-File: test1.pdf
-File_ID: file1
-Relevance: 0.6
-Page: 4
-Content: Page 4 content`,
-        },
-      },
-    ];
-
-    const result = await processAgentResponse(response, 'user123', 'conv123', contentParts);
-
-    const sources = result.attachments[0].file_search.sources;
-    expect(sources).toHaveLength(2); // Should be limited to maxCitationsPerFile (2)
-
-    // Should include the 2 highest relevance pages (0.9 and 0.8)
-    expect(sources[0].relevance).toBe(0.9);
-    expect(sources[1].relevance).toBe(0.8);
-  });
-});
--- a/api/test/services/Files/processFileCitations.test.js
+++ b/api/test/services/Files/processFileCitations.test.js
@@ -0,0 +1,337 @@
+const { Tools } = require('librechat-data-provider');
+const {
+  processFileCitations,
+  applyCitationLimits,
+  enhanceSourcesWithMetadata,
+} = require('~/server/services/Files/Citations');
+
+// Mock dependencies: models, role lookup, access check, custom config and logger are all replaced with jest stubs
+jest.mock('~/models', () => ({
+  Files: {
+    find: jest.fn().mockResolvedValue([]), // default: lookup resolves with no file records
+  },
+}));
+
+jest.mock('~/models/Role', () => ({
+  getRoleByName: jest.fn(), // bare stub, no return value configured
+}));
+
+jest.mock('@librechat/api', () => ({
+  checkAccess: jest.fn().mockResolvedValue(true), // access check resolves to true by default
+}));
+
+jest.mock('~/server/services/Config/getCustomConfig', () => ({
+  getCustomConfig: jest.fn().mockResolvedValue({
+    endpoints: {
+      agents: {
+        maxCitations: 30,
+        maxCitationsPerFile: 5,
+        minRelevanceScore: 0.45, // presumably the cutoff behind the 0.2-relevance "filter" test — confirm against implementation
+      },
+    },
+    fileStrategy: 'local',
+  }),
+}));
+
+jest.mock('~/config', () => ({
+  logger: {
+    debug: jest.fn(),
+    error: jest.fn(),
+    warn: jest.fn(),
+  },
+}));
+
+describe('processFileCitations', () => { // covers the three exports required from ~/server/services/Files/Citations
+  const mockReq = { // minimal request stub; only mockReq.user is handed to processFileCitations below
+    user: {
+      id: 'user123',
+    },
+  };
+
+  const mockMetadata = { // passed unchanged as the metadata argument of every processFileCitations call
+    run_id: 'run123',
+    thread_id: 'conv123',
+  };
+
+  describe('file search artifact processing', () => { // drives processFileCitations with hand-built tool artifacts
+    it('should process file search artifacts correctly', async () => {
+      const toolArtifact = {
+        [Tools.file_search]: { // artifact is keyed by Tools.file_search from librechat-data-provider
+          sources: [
+            {
+              fileId: 'file_123',
+              fileName: 'example.pdf',
+              pages: [5],
+              relevance: 0.85,
+              type: 'file',
+              pageRelevance: { 5: 0.85 },
+              content: 'This is the content',
+            },
+            {
+              fileId: 'file_456',
+              fileName: 'document.txt',
+              pages: [],
+              relevance: 0.72,
+              type: 'file',
+              pageRelevance: {},
+              content: 'Another document',
+            },
+          ],
+        },
+      };
+
+      const result = await processFileCitations({
+        toolArtifact,
+        toolCallId: 'call_123',
+        metadata: mockMetadata,
+        user: mockReq.user,
+      });
+
+      expect(result).toBeTruthy();
+      expect(result.type).toBe('file_search');
+      expect(result.file_search.sources).toHaveLength(2); // both sources (0.85 and 0.72) are expected to be kept
+      expect(result.file_search.sources[0].fileId).toBe('file_123');
+      expect(result.file_search.sources[0].relevance).toBe(0.85);
+    });
+
+    it('should return null for non-file_search tools', async () => {
+      const result = await processFileCitations({
+        toolArtifact: { other_tool: {} }, // artifact keyed by a literal 'other_tool' instead of Tools.file_search
+        toolCallId: 'call_123',
+        metadata: mockMetadata,
+        user: mockReq.user,
+      });
+
+      expect(result).toBeNull();
+    });
+
+    it('should filter results below relevance threshold', async () => {
+      const toolArtifact = {
+        [Tools.file_search]: {
+          sources: [
+            {
+              fileId: 'file_789',
+              fileName: 'low_relevance.pdf',
+              pages: [],
+              relevance: 0.2, // lower than the mocked minRelevanceScore of 0.45
+              type: 'file',
+              pageRelevance: {},
+              content: 'Low relevance content',
+            },
+          ],
+        },
+      };
+
+      const result = await processFileCitations({
+        toolArtifact,
+        toolCallId: 'call_123',
+        metadata: mockMetadata,
+        user: mockReq.user,
+      });
+
+      expect(result).toBeNull(); // the only source is filtered out, so the whole result is expected to be null
+    });
+
+    it('should return null when artifact is missing file_search data', async () => {
+      const result = await processFileCitations({
+        toolArtifact: {}, // empty artifact object
+        toolCallId: 'call_123',
+        metadata: mockMetadata,
+        user: mockReq.user,
+      });
+
+      expect(result).toBeNull();
+    });
+  });
+
+  describe('applyCitationLimits', () => { // called synchronously here (no await)
+    it('should limit citations per file and total', () => {
+      const sources = [
+        { fileId: 'file1', relevance: 0.9 },
+        { fileId: 'file1', relevance: 0.8 },
+        { fileId: 'file1', relevance: 0.7 },
+        { fileId: 'file2', relevance: 0.85 },
+        { fileId: 'file2', relevance: 0.75 },
+      ];
+
+      const result = applyCitationLimits(sources, 3, 2); // NOTE(review): the top three by relevance already hold only two file1 entries, so the last argument (2) never shapes this result
+
+      expect(result).toHaveLength(3);
+      expect(result[0].relevance).toBe(0.9); // results are expected in descending relevance order across files
+      expect(result[1].relevance).toBe(0.85);
+      expect(result[2].relevance).toBe(0.8);
+    });
+  });
+
+  describe('enhanceSourcesWithMetadata', () => {
+    const { Files } = require('~/models'); // resolves to the jest.mock('~/models') stub, so Files.find is a jest.fn
+    const mockCustomConfig = { // passed as the second argument of enhanceSourcesWithMetadata in every test below
+      fileStrategy: 'local',
+    };
+
+    beforeEach(() => {
+      jest.clearAllMocks(); // drop recorded calls so each toHaveBeenCalledWith check only sees its own test's lookup
+    });
+
+    it('should enhance sources with file metadata from database', async () => {
+      const sources = [
+        {
+          fileId: 'file_123',
+          fileName: 'example.pdf',
+          relevance: 0.85,
+          type: 'file',
+        },
+        {
+          fileId: 'file_456',
+          fileName: 'document.txt',
+          relevance: 0.72,
+          type: 'file',
+        },
+      ];
+
+      Files.find.mockResolvedValue([
+        {
+          file_id: 'file_123',
+          filename: 'example_from_db.pdf',
+          source: 's3',
+        },
+        {
+          file_id: 'file_456',
+          filename: 'document_from_db.txt',
+          source: 'local',
+        },
+      ]);
+
+      const result = await enhanceSourcesWithMetadata(sources, mockCustomConfig);
+
+      expect(Files.find).toHaveBeenCalledWith({ file_id: { $in: ['file_123', 'file_456'] } }); // lookup is keyed by the sources' fileId values
+      expect(result).toHaveLength(2);
+
+      expect(result[0]).toEqual({
+        fileId: 'file_123',
+        fileName: 'example_from_db.pdf', // the record's filename is expected to replace the source's fileName
+        relevance: 0.85,
+        type: 'file',
+        metadata: {
+          storageType: 's3', // matches the record's source field
+        },
+      });
+
+      expect(result[1]).toEqual({
+        fileId: 'file_456',
+        fileName: 'document_from_db.txt',
+        relevance: 0.72,
+        type: 'file',
+        metadata: {
+          storageType: 'local',
+        },
+      });
+    });
+
+    it('should preserve existing metadata and source data', async () => {
+      const sources = [
+        {
+          fileId: 'file_123',
+          fileName: 'example.pdf',
+          relevance: 0.85,
+          type: 'file',
+          pages: [1, 2, 3],
+          content: 'Some content',
+          metadata: {
+            existingField: 'value',
+          },
+        },
+      ];
+
+      Files.find.mockResolvedValue([
+        {
+          file_id: 'file_123',
+          filename: 'example_from_db.pdf',
+          source: 'gcs',
+        },
+      ]);
+
+      const result = await enhanceSourcesWithMetadata(sources, mockCustomConfig);
+
+      expect(result[0]).toEqual({
+        fileId: 'file_123',
+        fileName: 'example_from_db.pdf',
+        relevance: 0.85,
+        type: 'file',
+        pages: [1, 2, 3], // pages and content are expected to pass through untouched
+        content: 'Some content',
+        metadata: {
+          existingField: 'value', // pre-existing metadata key is expected to survive alongside storageType
+          storageType: 'gcs',
+        },
+      });
+    });
+
+    it('should handle missing file metadata gracefully', async () => {
+      const sources = [
+        {
+          fileId: 'file_789',
+          fileName: 'missing.pdf',
+          relevance: 0.9,
+          type: 'file',
+        },
+      ];
+
+      Files.find.mockResolvedValue([]); // no record matches file_789
+
+      const result = await enhanceSourcesWithMetadata(sources, mockCustomConfig);
+
+      expect(result[0]).toEqual({
+        fileId: 'file_789',
+        fileName: 'missing.pdf', // original fileName is expected to be kept when no record exists
+        relevance: 0.9,
+        type: 'file',
+        metadata: {
+          storageType: 'local', // Falls back to customConfig.fileStrategy
+        },
+      });
+    });
+
+    it('should handle database errors gracefully', async () => {
+      const sources = [
+        {
+          fileId: 'file_123',
+          fileName: 'example.pdf',
+          relevance: 0.85,
+          type: 'file',
+        },
+      ];
+
+      Files.find.mockRejectedValue(new Error('Database error')); // lookup rejects; the call below is still expected to resolve
+
+      const result = await enhanceSourcesWithMetadata(sources, mockCustomConfig);
+
+      expect(result[0]).toEqual({
+        fileId: 'file_123',
+        fileName: 'example.pdf',
+        relevance: 0.85,
+        type: 'file',
+        metadata: {
+          storageType: 'local', // equals mockCustomConfig.fileStrategy, as in the missing-record case
+        },
+      });
+    });
+
+    it('should deduplicate file IDs when querying database', async () => {
+      const sources = [
+        { fileId: 'file_123', fileName: 'doc1.pdf', relevance: 0.9, type: 'file' },
+        { fileId: 'file_123', fileName: 'doc1.pdf', relevance: 0.8, type: 'file' }, // same fileId as the entry above
+        { fileId: 'file_456', fileName: 'doc2.pdf', relevance: 0.7, type: 'file' },
+      ];
+
+      Files.find.mockResolvedValue([
+        { file_id: 'file_123', filename: 'document1.pdf', source: 's3' },
+        { file_id: 'file_456', filename: 'document2.pdf', source: 'local' },
+      ]);
+
+      await enhanceSourcesWithMetadata(sources, mockCustomConfig);
+
+      expect(Files.find).toHaveBeenCalledWith({ file_id: { $in: ['file_123', 'file_456'] } }); // 'file_123' is expected once in $in although two sources share it
+    });
+  });
+});