📑 refactor: File Search Citations Dual-Format Unicode Handling (#10888)

* 🔖 refactor: citation handling with support for both literal and Unicode formats

* refactor: file search messages for edge cases in documents

* 🔧 refactor: Enhance citation handling with detailed regex patterns for literal and Unicode formats

* 🔧 refactor: Simplify file search query handling by removing unnecessary parameters and improving result formatting

*  test: Add comprehensive integration tests for citation processing flow with support for literal and Unicode formats

* 🔧 refactor: Improve regex match handling and add performance tests for citation processing
This commit is contained in:
Danny Avila 2025-12-10 13:25:56 -05:00 committed by GitHub
parent af8394b05c
commit 03c9d5f79f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 638 additions and 18 deletions

View file

@ -86,7 +86,6 @@ const createFileSearchTool = async ({ userId, files, entity_id, fileCitations =
}
/**
*
* @param {import('librechat-data-provider').TFile} file
* @returns {{ file_id: string, query: string, k: number, entity_id?: string }}
*/
@ -135,11 +134,16 @@ const createFileSearchTool = async ({ userId, files, entity_id, fileCitations =
page: docInfo.metadata.page || null,
})),
)
// TODO: results should be sorted by relevance, not distance
.sort((a, b) => a.distance - b.distance)
// TODO: make this configurable
.slice(0, 10);
if (formattedResults.length === 0) {
return [
'No content found in the files. The files may not have been processed correctly or you may need to refine your query.',
undefined,
];
}
const formattedString = formattedResults
.map(
(result, index) =>
@ -169,11 +173,12 @@ const createFileSearchTool = async ({ userId, files, entity_id, fileCitations =
? `
**CITE FILE SEARCH RESULTS:**
Use anchor markers immediately after statements derived from file content. Reference the filename in your text:
Use the EXACT anchor markers shown below (copy them verbatim) immediately after statements derived from file content. Reference the filename in your text:
- File citation: "The document.pdf states that... \\ue202turn0file0"
- Page reference: "According to report.docx... \\ue202turn0file1"
- Multi-file: "Multiple sources confirm... \\ue200\\ue202turn0file0\\ue202turn0file1\\ue201"
**CRITICAL:** Output these escape sequences EXACTLY as shown (e.g., \\ue202turn0file0). Do NOT substitute with other characters like or similar symbols.
**ALWAYS mention the filename in your text before the citation marker. NEVER use markdown links or footnotes.**`
: ''
}`,