📑 refactor: File Search Citations Dual-Format Unicode Handling (#10888)

* 🔖 refactor: citation handling with support for both literal and Unicode formats

* refactor: file search messages for edge cases in documents

* 🔧 refactor: Enhance citation handling with detailed regex patterns for literal and Unicode formats

* 🔧 refactor: Simplify file search query handling by removing unnecessary parameters and improving result formatting

* test: Add comprehensive integration tests for citation processing flow with support for literal and Unicode formats

* 🔧 refactor: Improve regex match handling and add performance tests for citation processing
This commit is contained in:
Danny Avila 2025-12-10 13:25:56 -05:00 committed by GitHub
parent af8394b05c
commit 03c9d5f79f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 638 additions and 18 deletions

View file

@ -4,13 +4,29 @@ import type { Citation, CitationNode } from './types';
import { SPAN_REGEX, STANDALONE_PATTERN, CLEANUP_REGEX, COMPOSITE_REGEX } from '~/utils/citations';
/**
 * Checks if a standalone marker is truly standalone (not inside a composite block).
 * A marker is inside a composite if there's an opening \ue200 without a closing \ue201 after it.
 *
 * Handles both the literal text format (the six characters `\ue200`) and the actual
 * Unicode code point (U+E200) by checking for both and using the rightmost occurrence.
 * This correctly handles:
 * - Pure literal format: "\ue200...\ue201" (escape sequences written out as text)
 * - Pure Unicode format: "<U+E200>...<U+E201>" (single private-use characters)
 * - Mixed formats: "\ue200...<U+E201>" (different formats for open/close)
 *
 * @param text - Full text being scanned for citation markers.
 * @param position - Index in `text` where the candidate marker starts; only the
 *   text before this index is inspected.
 * @returns `true` when no composite opening precedes `position`, or when the last
 *   composite closing before `position` comes after the last opening.
 */
function isStandaloneMarker(text: string, position: number): boolean {
  const beforeText = text.substring(0, position);

  // Find rightmost composite block start (either format).
  // lastIndexOf returns -1 when absent, so Math.max yields -1 only if neither format occurs.
  const lastUe200Literal = beforeText.lastIndexOf('\\ue200');
  const lastUe200Char = beforeText.lastIndexOf('\ue200');
  const lastUe200 = Math.max(lastUe200Literal, lastUe200Char);

  // Find rightmost composite block end (either format)
  const lastUe201Literal = beforeText.lastIndexOf('\\ue201');
  const lastUe201Char = beforeText.lastIndexOf('\ue201');
  const lastUe201 = Math.max(lastUe201Literal, lastUe201Char);

  // Standalone if: no opening marker OR closing marker appears after opening
  return lastUe200 === -1 || (lastUe201 !== -1 && lastUe201 > lastUe200);
}