mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-12-16 08:20:14 +01:00
📑 refactor: File Search Citations Dual-Format Unicode Handling (#10888)
* 🔖 refactor: citation handling with support for both literal and Unicode formats * refactor: file search messages for edge cases in documents * 🔧 refactor: Enhance citation handling with detailed regex patterns for literal and Unicode formats * 🔧 refactor: Simplify file search query handling by removing unnecessary parameters and improving result formatting * ✨ test: Add comprehensive integration tests for citation processing flow with support for literal and Unicode formats * 🔧 refactor: Improve regex match handling and add performance tests for citation processing
This commit is contained in:
parent
af8394b05c
commit
03c9d5f79f
6 changed files with 638 additions and 18 deletions
|
|
@ -86,7 +86,6 @@ const createFileSearchTool = async ({ userId, files, entity_id, fileCitations =
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param {import('librechat-data-provider').TFile} file
|
* @param {import('librechat-data-provider').TFile} file
|
||||||
* @returns {{ file_id: string, query: string, k: number, entity_id?: string }}
|
* @returns {{ file_id: string, query: string, k: number, entity_id?: string }}
|
||||||
*/
|
*/
|
||||||
|
|
@ -135,11 +134,16 @@ const createFileSearchTool = async ({ userId, files, entity_id, fileCitations =
|
||||||
page: docInfo.metadata.page || null,
|
page: docInfo.metadata.page || null,
|
||||||
})),
|
})),
|
||||||
)
|
)
|
||||||
// TODO: results should be sorted by relevance, not distance
|
|
||||||
.sort((a, b) => a.distance - b.distance)
|
.sort((a, b) => a.distance - b.distance)
|
||||||
// TODO: make this configurable
|
|
||||||
.slice(0, 10);
|
.slice(0, 10);
|
||||||
|
|
||||||
|
if (formattedResults.length === 0) {
|
||||||
|
return [
|
||||||
|
'No content found in the files. The files may not have been processed correctly or you may need to refine your query.',
|
||||||
|
undefined,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
const formattedString = formattedResults
|
const formattedString = formattedResults
|
||||||
.map(
|
.map(
|
||||||
(result, index) =>
|
(result, index) =>
|
||||||
|
|
@ -169,11 +173,12 @@ const createFileSearchTool = async ({ userId, files, entity_id, fileCitations =
|
||||||
? `
|
? `
|
||||||
|
|
||||||
**CITE FILE SEARCH RESULTS:**
|
**CITE FILE SEARCH RESULTS:**
|
||||||
Use anchor markers immediately after statements derived from file content. Reference the filename in your text:
|
Use the EXACT anchor markers shown below (copy them verbatim) immediately after statements derived from file content. Reference the filename in your text:
|
||||||
- File citation: "The document.pdf states that... \\ue202turn0file0"
|
- File citation: "The document.pdf states that... \\ue202turn0file0"
|
||||||
- Page reference: "According to report.docx... \\ue202turn0file1"
|
- Page reference: "According to report.docx... \\ue202turn0file1"
|
||||||
- Multi-file: "Multiple sources confirm... \\ue200\\ue202turn0file0\\ue202turn0file1\\ue201"
|
- Multi-file: "Multiple sources confirm... \\ue200\\ue202turn0file0\\ue202turn0file1\\ue201"
|
||||||
|
|
||||||
|
**CRITICAL:** Output these escape sequences EXACTLY as shown (e.g., \\ue202turn0file0). Do NOT substitute with other characters like † or similar symbols.
|
||||||
**ALWAYS mention the filename in your text before the citation marker. NEVER use markdown links or footnotes.**`
|
**ALWAYS mention the filename in your text before the citation marker. NEVER use markdown links or footnotes.**`
|
||||||
: ''
|
: ''
|
||||||
}`,
|
}`,
|
||||||
|
|
|
||||||
|
|
@ -320,19 +320,19 @@ Current Date & Time: ${replaceSpecialVars({ text: '{{iso_datetime}}' })}
|
||||||
|
|
||||||
**Execute immediately without preface.** After search, provide a brief summary addressing the query directly, then structure your response with clear Markdown formatting (## headers, lists, tables). Cite sources properly, tailor tone to query type, and provide comprehensive details.
|
**Execute immediately without preface.** After search, provide a brief summary addressing the query directly, then structure your response with clear Markdown formatting (## headers, lists, tables). Cite sources properly, tailor tone to query type, and provide comprehensive details.
|
||||||
|
|
||||||
**CITATION FORMAT - INVISIBLE UNICODE ANCHORS ONLY:**
|
**CITATION FORMAT - UNICODE ESCAPE SEQUENCES ONLY:**
|
||||||
Use these Unicode characters: \\ue202 (before each anchor), \\ue200 (group start), \\ue201 (group end), \\ue203 (highlight start), \\ue204 (highlight end)
|
Use these EXACT escape sequences (copy verbatim): \\ue202 (before each anchor), \\ue200 (group start), \\ue201 (group end), \\ue203 (highlight start), \\ue204 (highlight end)
|
||||||
|
|
||||||
Anchor pattern: turn{N}{type}{index} where N=turn number, type=search|news|image|ref, index=0,1,2...
|
Anchor pattern: \\ue202turn{N}{type}{index} where N=turn number, type=search|news|image|ref, index=0,1,2...
|
||||||
|
|
||||||
**Examples:**
|
**Examples (copy these exactly):**
|
||||||
- Single: "Statement.\\ue202turn0search0"
|
- Single: "Statement.\\ue202turn0search0"
|
||||||
- Multiple: "Statement.\\ue202turn0search0\\ue202turn0news1"
|
- Multiple: "Statement.\\ue202turn0search0\\ue202turn0news1"
|
||||||
- Group: "Statement. \\ue200\\ue202turn0search0\\ue202turn0news1\\ue201"
|
- Group: "Statement. \\ue200\\ue202turn0search0\\ue202turn0news1\\ue201"
|
||||||
- Highlight: "\\ue203Cited text.\\ue204\\ue202turn0search0"
|
- Highlight: "\\ue203Cited text.\\ue204\\ue202turn0search0"
|
||||||
- Image: "See photo\\ue202turn0image0."
|
- Image: "See photo\\ue202turn0image0."
|
||||||
|
|
||||||
**CRITICAL:** Place anchors AFTER punctuation. Cite every non-obvious fact/quote. NEVER use markdown links, [1], footnotes, or HTML tags.`.trim();
|
**CRITICAL:** Output escape sequences EXACTLY as shown. Do NOT substitute with † or other symbols. Place anchors AFTER punctuation. Cite every non-obvious fact/quote. NEVER use markdown links, [1], footnotes, or HTML tags.`.trim();
|
||||||
return createSearchTool({
|
return createSearchTool({
|
||||||
...result.authResult,
|
...result.authResult,
|
||||||
onSearchResults,
|
onSearchResults,
|
||||||
|
|
|
||||||
|
|
@ -4,13 +4,29 @@ import type { Citation, CitationNode } from './types';
|
||||||
import { SPAN_REGEX, STANDALONE_PATTERN, CLEANUP_REGEX, COMPOSITE_REGEX } from '~/utils/citations';
|
import { SPAN_REGEX, STANDALONE_PATTERN, CLEANUP_REGEX, COMPOSITE_REGEX } from '~/utils/citations';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if a standalone marker is truly standalone (not inside a composite block)
|
* Checks if a standalone marker is truly standalone (not inside a composite block).
|
||||||
|
* A marker is inside a composite if there's an opening \ue200 without a closing \ue201 after it.
|
||||||
|
*
|
||||||
|
* Handles both literal text format ("\ue200") and actual Unicode (U+E200) by checking
|
||||||
|
* for both and using the rightmost occurrence. This correctly handles:
|
||||||
|
* - Pure literal format: "\ue200...\ue201"
|
||||||
|
* - Pure Unicode format: "..."
|
||||||
|
* - Mixed formats: "\ue200..." (different formats for open/close)
|
||||||
*/
|
*/
|
||||||
function isStandaloneMarker(text: string, position: number): boolean {
|
function isStandaloneMarker(text: string, position: number): boolean {
|
||||||
const beforeText = text.substring(0, position);
|
const beforeText = text.substring(0, position);
|
||||||
const lastUe200 = beforeText.lastIndexOf('\\ue200');
|
|
||||||
const lastUe201 = beforeText.lastIndexOf('\\ue201');
|
|
||||||
|
|
||||||
|
// Find rightmost composite block start (either format)
|
||||||
|
const lastUe200Literal = beforeText.lastIndexOf('\\ue200');
|
||||||
|
const lastUe200Char = beforeText.lastIndexOf('\ue200');
|
||||||
|
const lastUe200 = Math.max(lastUe200Literal, lastUe200Char);
|
||||||
|
|
||||||
|
// Find rightmost composite block end (either format)
|
||||||
|
const lastUe201Literal = beforeText.lastIndexOf('\\ue201');
|
||||||
|
const lastUe201Char = beforeText.lastIndexOf('\ue201');
|
||||||
|
const lastUe201 = Math.max(lastUe201Literal, lastUe201Char);
|
||||||
|
|
||||||
|
// Standalone if: no opening marker OR closing marker appears after opening
|
||||||
return lastUe200 === -1 || (lastUe201 !== -1 && lastUe201 > lastUe200);
|
return lastUe200 === -1 || (lastUe201 !== -1 && lastUe201 > lastUe200);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -129,7 +129,7 @@ function processCitations(text: string, searchResults: { [key: string]: SearchRe
|
||||||
|
|
||||||
// Step 1: Process highlighted text first (simplify by just making it bold in markdown)
|
// Step 1: Process highlighted text first (simplify by just making it bold in markdown)
|
||||||
formattedText = formattedText.replace(SPAN_REGEX, (match) => {
|
formattedText = formattedText.replace(SPAN_REGEX, (match) => {
|
||||||
const text = match.replace(/\\ue203|\\ue204/g, '');
|
const text = match.replace(/\\ue203|\\ue204|\ue203|\ue204/g, '');
|
||||||
return `**${text}**`;
|
return `**${text}**`;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
||||||
558
client/src/utils/__tests__/citations.test.ts
Normal file
558
client/src/utils/__tests__/citations.test.ts
Normal file
|
|
@ -0,0 +1,558 @@
|
||||||
|
import {
|
||||||
|
SPAN_REGEX,
|
||||||
|
COMPOSITE_REGEX,
|
||||||
|
STANDALONE_PATTERN,
|
||||||
|
CLEANUP_REGEX,
|
||||||
|
INVALID_CITATION_REGEX,
|
||||||
|
} from '../citations';
|
||||||
|
|
||||||
|
describe('Citation Regex Patterns', () => {
|
||||||
|
beforeEach(() => {
|
||||||
|
// Reset regex lastIndex for global patterns
|
||||||
|
SPAN_REGEX.lastIndex = 0;
|
||||||
|
COMPOSITE_REGEX.lastIndex = 0;
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
CLEANUP_REGEX.lastIndex = 0;
|
||||||
|
INVALID_CITATION_REGEX.lastIndex = 0;
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('STANDALONE_PATTERN', () => {
|
||||||
|
describe('literal text format (\\ue202)', () => {
|
||||||
|
it('should match literal text search citation', () => {
|
||||||
|
const text = 'Some fact \\ue202turn0search0 here';
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[1]).toBe('0'); // turn number
|
||||||
|
expect(match?.[2]).toBe('search'); // type
|
||||||
|
expect(match?.[3]).toBe('0'); // index
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match literal text file citation', () => {
|
||||||
|
const text = 'Document says \\ue202turn0file0 (doc.pdf)';
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[1]).toBe('0');
|
||||||
|
expect(match?.[2]).toBe('file');
|
||||||
|
expect(match?.[3]).toBe('0');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match literal text news citation', () => {
|
||||||
|
const text = 'Breaking news \\ue202turn0news1';
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[1]).toBe('0');
|
||||||
|
expect(match?.[2]).toBe('news');
|
||||||
|
expect(match?.[3]).toBe('1');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match multiple literal text citations', () => {
|
||||||
|
const text = 'Fact one \\ue202turn0search0 and fact two \\ue202turn0file1';
|
||||||
|
const matches: RegExpExecArray[] = [];
|
||||||
|
let match: RegExpExecArray | null;
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while ((match = STANDALONE_PATTERN.exec(text)) !== null) {
|
||||||
|
matches.push(match);
|
||||||
|
}
|
||||||
|
expect(matches).toHaveLength(2);
|
||||||
|
expect(matches[0][2]).toBe('search');
|
||||||
|
expect(matches[1][2]).toBe('file');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match all supported types in literal text format', () => {
|
||||||
|
const types = ['search', 'image', 'news', 'video', 'ref', 'file'];
|
||||||
|
for (const type of types) {
|
||||||
|
const text = `Test \\ue202turn0${type}0`;
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[2]).toBe(type);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('actual Unicode character format (U+E202)', () => {
|
||||||
|
it('should match actual Unicode search citation', () => {
|
||||||
|
const text = 'Some fact \ue202turn0search0 here';
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[1]).toBe('0');
|
||||||
|
expect(match?.[2]).toBe('search');
|
||||||
|
expect(match?.[3]).toBe('0');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match actual Unicode file citation', () => {
|
||||||
|
const text = 'Document says \ue202turn0file0 (doc.pdf)';
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[1]).toBe('0');
|
||||||
|
expect(match?.[2]).toBe('file');
|
||||||
|
expect(match?.[3]).toBe('0');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match all supported types in actual Unicode format', () => {
|
||||||
|
const types = ['search', 'image', 'news', 'video', 'ref', 'file'];
|
||||||
|
for (const type of types) {
|
||||||
|
const text = `Test \ue202turn0${type}0`;
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
const match = STANDALONE_PATTERN.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[2]).toBe(type);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('mixed format handling', () => {
|
||||||
|
it('should match both formats in the same text', () => {
|
||||||
|
const text = 'Literal \\ue202turn0search0 and Unicode \ue202turn0file1';
|
||||||
|
const matches: RegExpExecArray[] = [];
|
||||||
|
let match: RegExpExecArray | null;
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while ((match = STANDALONE_PATTERN.exec(text)) !== null) {
|
||||||
|
matches.push(match);
|
||||||
|
}
|
||||||
|
expect(matches).toHaveLength(2);
|
||||||
|
expect(matches[0][2]).toBe('search');
|
||||||
|
expect(matches[1][2]).toBe('file');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('SPAN_REGEX', () => {
|
||||||
|
it('should match literal text span markers', () => {
|
||||||
|
const text = 'Before \\ue203highlighted text\\ue204 after';
|
||||||
|
SPAN_REGEX.lastIndex = 0;
|
||||||
|
const match = SPAN_REGEX.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[0]).toContain('highlighted text');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match actual Unicode span markers', () => {
|
||||||
|
const text = 'Before \ue203highlighted text\ue204 after';
|
||||||
|
SPAN_REGEX.lastIndex = 0;
|
||||||
|
const match = SPAN_REGEX.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
expect(match?.[0]).toContain('highlighted text');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('COMPOSITE_REGEX', () => {
|
||||||
|
it('should match literal text composite markers', () => {
|
||||||
|
const text = 'Statement \\ue200\\ue202turn0search0\\ue202turn0news0\\ue201';
|
||||||
|
COMPOSITE_REGEX.lastIndex = 0;
|
||||||
|
const match = COMPOSITE_REGEX.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match actual Unicode composite markers', () => {
|
||||||
|
const text = 'Statement \ue200\ue202turn0search0\ue202turn0news0\ue201';
|
||||||
|
COMPOSITE_REGEX.lastIndex = 0;
|
||||||
|
const match = COMPOSITE_REGEX.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('CLEANUP_REGEX', () => {
|
||||||
|
it('should clean up literal text markers', () => {
|
||||||
|
const text = '\\ue200\\ue201\\ue202\\ue203\\ue204\\ue206';
|
||||||
|
const cleaned = text.replace(CLEANUP_REGEX, '');
|
||||||
|
expect(cleaned).toBe('');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should clean up actual Unicode markers', () => {
|
||||||
|
const text = '\ue200\ue201\ue202\ue203\ue204\ue206';
|
||||||
|
const cleaned = text.replace(CLEANUP_REGEX, '');
|
||||||
|
expect(cleaned).toBe('');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should preserve normal text while cleaning markers', () => {
|
||||||
|
const text = 'Hello \\ue202turn0search0 world';
|
||||||
|
const cleaned = text.replace(CLEANUP_REGEX, '');
|
||||||
|
expect(cleaned).toBe('Hello turn0search0 world');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('INVALID_CITATION_REGEX', () => {
|
||||||
|
it('should match invalid literal text citations with leading whitespace', () => {
|
||||||
|
const text = 'Text \\ue202turn0search5';
|
||||||
|
INVALID_CITATION_REGEX.lastIndex = 0;
|
||||||
|
const match = INVALID_CITATION_REGEX.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should match invalid actual Unicode citations with leading whitespace', () => {
|
||||||
|
const text = 'Text \ue202turn0search5';
|
||||||
|
INVALID_CITATION_REGEX.lastIndex = 0;
|
||||||
|
const match = INVALID_CITATION_REGEX.exec(text);
|
||||||
|
expect(match).not.toBeNull();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Integration: Full Citation Processing Flow', () => {
|
||||||
|
/**
|
||||||
|
* Simulates the citation processing flow used in the markdown plugin and copy-to-clipboard
|
||||||
|
*/
|
||||||
|
const processFullCitationFlow = (text: string) => {
|
||||||
|
// Step 1: Extract highlighted spans
|
||||||
|
const spans: Array<{ content: string; position: number }> = [];
|
||||||
|
let spanMatch;
|
||||||
|
const spanRegex = new RegExp(SPAN_REGEX.source, 'g');
|
||||||
|
while ((spanMatch = spanRegex.exec(text)) !== null) {
|
||||||
|
const content = spanMatch[0].replace(/\\ue203|\\ue204|\ue203|\ue204/g, '');
|
||||||
|
spans.push({ content, position: spanMatch.index });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Extract composite blocks
|
||||||
|
const composites: Array<{ citations: string[]; position: number }> = [];
|
||||||
|
let compMatch;
|
||||||
|
const compRegex = new RegExp(COMPOSITE_REGEX.source, 'g');
|
||||||
|
while ((compMatch = compRegex.exec(text)) !== null) {
|
||||||
|
const block = compMatch[0];
|
||||||
|
const citations: string[] = [];
|
||||||
|
let citMatch;
|
||||||
|
const citRegex = new RegExp(STANDALONE_PATTERN.source, 'g');
|
||||||
|
while ((citMatch = citRegex.exec(block)) !== null) {
|
||||||
|
citations.push(`turn${citMatch[1]}${citMatch[2]}${citMatch[3]}`);
|
||||||
|
}
|
||||||
|
composites.push({ citations, position: compMatch.index });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: Extract standalone citations (not in composites)
|
||||||
|
const standalones: Array<{ citation: string; position: number }> = [];
|
||||||
|
let standMatch;
|
||||||
|
const standRegex = new RegExp(STANDALONE_PATTERN.source, 'g');
|
||||||
|
while ((standMatch = standRegex.exec(text)) !== null) {
|
||||||
|
// Check if this position is inside a composite
|
||||||
|
const isInComposite = composites.some(
|
||||||
|
(c) => standMatch && standMatch.index >= c.position && standMatch.index < c.position + 50,
|
||||||
|
);
|
||||||
|
if (!isInComposite) {
|
||||||
|
standalones.push({
|
||||||
|
citation: `turn${standMatch[1]}${standMatch[2]}${standMatch[3]}`,
|
||||||
|
position: standMatch.index,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 4: Clean up text
|
||||||
|
const cleanedText = text.replace(INVALID_CITATION_REGEX, '').replace(CLEANUP_REGEX, '');
|
||||||
|
|
||||||
|
return { spans, composites, standalones, cleanedText };
|
||||||
|
};
|
||||||
|
|
||||||
|
describe('literal text format integration', () => {
|
||||||
|
it('should process complex LLM response with multiple citation types', () => {
|
||||||
|
const llmResponse = `Here's what I found about the topic.
|
||||||
|
|
||||||
|
\\ue203This is an important quote from the source.\\ue204 \\ue202turn0search0
|
||||||
|
|
||||||
|
The data shows several key findings \\ue202turn0search1 including:
|
||||||
|
- First finding \\ue202turn0news0
|
||||||
|
- Second finding \\ue200\\ue202turn0search2\\ue202turn0file0\\ue201
|
||||||
|
|
||||||
|
For more details, see the attached document \\ue202turn0file1.`;
|
||||||
|
|
||||||
|
const result = processFullCitationFlow(llmResponse);
|
||||||
|
|
||||||
|
expect(result.spans).toHaveLength(1);
|
||||||
|
expect(result.spans[0].content).toBe('This is an important quote from the source.');
|
||||||
|
|
||||||
|
expect(result.composites).toHaveLength(1);
|
||||||
|
expect(result.composites[0].citations).toEqual(['turn0search2', 'turn0file0']);
|
||||||
|
|
||||||
|
expect(result.standalones.length).toBeGreaterThanOrEqual(3);
|
||||||
|
|
||||||
|
expect(result.cleanedText).not.toContain('\\ue202');
|
||||||
|
expect(result.cleanedText).not.toContain('\\ue200');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle file citations from document search', () => {
|
||||||
|
const fileSearchResponse = `Based on the document medical-anthem-blue-cross.pdf:
|
||||||
|
|
||||||
|
- **Annual deductible:** $3,300 per person \\ue202turn0file0
|
||||||
|
- **Out-of-pocket maximum:** $4,000 per person \\ue202turn0file0
|
||||||
|
- **Network:** Prudent Buyer PPO \\ue202turn0file1
|
||||||
|
|
||||||
|
Multiple sources confirm these details. \\ue200\\ue202turn0file0\\ue202turn0file1\\ue202turn0file2\\ue201`;
|
||||||
|
|
||||||
|
const result = processFullCitationFlow(fileSearchResponse);
|
||||||
|
|
||||||
|
expect(result.composites).toHaveLength(1);
|
||||||
|
expect(result.composites[0].citations).toHaveLength(3);
|
||||||
|
|
||||||
|
// Should find standalone file citations
|
||||||
|
const fileCitations = result.standalones.filter((s) => s.citation.includes('file'));
|
||||||
|
expect(fileCitations.length).toBeGreaterThanOrEqual(2);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('actual Unicode format integration', () => {
|
||||||
|
it('should process response with actual Unicode characters', () => {
|
||||||
|
const llmResponse = `Research findings indicate the following:
|
||||||
|
|
||||||
|
\ue203Key insight from the study.\ue204 \ue202turn0search0
|
||||||
|
|
||||||
|
Additional context \ue202turn0news0 supports this conclusion \ue200\ue202turn0search1\ue202turn0ref0\ue201.`;
|
||||||
|
|
||||||
|
const result = processFullCitationFlow(llmResponse);
|
||||||
|
|
||||||
|
expect(result.spans).toHaveLength(1);
|
||||||
|
expect(result.composites).toHaveLength(1);
|
||||||
|
expect(result.standalones.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(result.cleanedText).not.toContain('\ue202');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('mixed format integration', () => {
|
||||||
|
it('should handle mixed literal and Unicode formats in same response', () => {
|
||||||
|
const mixedResponse = `First citation uses literal \\ue202turn0search0 format.
|
||||||
|
Second citation uses Unicode \ue202turn0search1 format.
|
||||||
|
Composite with mixed: \\ue200\\ue202turn0file0\ue202turn0file1\\ue201`;
|
||||||
|
|
||||||
|
const result = processFullCitationFlow(mixedResponse);
|
||||||
|
|
||||||
|
// Should find citations from both formats
|
||||||
|
expect(result.standalones.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(result.composites).toHaveLength(1);
|
||||||
|
expect(result.composites[0].citations).toHaveLength(2);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('Performance: Regex Benchmarks', () => {
|
||||||
|
/**
|
||||||
|
* Generates a realistic citation-heavy text with specified number of citations
|
||||||
|
*/
|
||||||
|
const generateCitationHeavyText = (citationCount: number, format: 'literal' | 'unicode') => {
|
||||||
|
const marker = format === 'literal' ? '\\ue202' : '\ue202';
|
||||||
|
const spanStart = format === 'literal' ? '\\ue203' : '\ue203';
|
||||||
|
const spanEnd = format === 'literal' ? '\\ue204' : '\ue204';
|
||||||
|
const compStart = format === 'literal' ? '\\ue200' : '\ue200';
|
||||||
|
const compEnd = format === 'literal' ? '\\ue201' : '\ue201';
|
||||||
|
|
||||||
|
const types = ['search', 'news', 'file', 'ref', 'image', 'video'];
|
||||||
|
let text = '';
|
||||||
|
|
||||||
|
for (let i = 0; i < citationCount; i++) {
|
||||||
|
const type = types[i % types.length];
|
||||||
|
const turn = Math.floor(i / 10);
|
||||||
|
const index = i % 10;
|
||||||
|
|
||||||
|
if (i % 5 === 0) {
|
||||||
|
// Add highlighted text every 5th citation
|
||||||
|
text += `${spanStart}Important fact number ${i}.${spanEnd} ${marker}turn${turn}${type}${index} `;
|
||||||
|
} else if (i % 7 === 0) {
|
||||||
|
// Add composite every 7th citation
|
||||||
|
text += `Multiple sources ${compStart}${marker}turn${turn}${type}${index}${marker}turn${turn}${types[(i + 1) % types.length]}${(index + 1) % 10}${compEnd} confirm this. `;
|
||||||
|
} else {
|
||||||
|
text += `This is fact ${i} ${marker}turn${turn}${type}${index} from the research. `;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
|
};
|
||||||
|
|
||||||
|
it('should process 100 literal citations in reasonable time (<100ms)', () => {
|
||||||
|
const text = generateCitationHeavyText(100, 'literal');
|
||||||
|
|
||||||
|
const start = performance.now();
|
||||||
|
|
||||||
|
// Run all regex operations
|
||||||
|
const results = { spans: 0, composites: 0, standalones: 0, cleaned: '' };
|
||||||
|
|
||||||
|
SPAN_REGEX.lastIndex = 0;
|
||||||
|
while (SPAN_REGEX.exec(text) !== null) {
|
||||||
|
results.spans++;
|
||||||
|
}
|
||||||
|
|
||||||
|
COMPOSITE_REGEX.lastIndex = 0;
|
||||||
|
while (COMPOSITE_REGEX.exec(text) !== null) {
|
||||||
|
results.composites++;
|
||||||
|
}
|
||||||
|
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while (STANDALONE_PATTERN.exec(text) !== null) {
|
||||||
|
results.standalones++;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.cleaned = text.replace(CLEANUP_REGEX, '');
|
||||||
|
|
||||||
|
const duration = performance.now() - start;
|
||||||
|
|
||||||
|
expect(duration).toBeLessThan(100);
|
||||||
|
expect(results.standalones).toBeGreaterThan(80); // Most should be standalone
|
||||||
|
expect(results.spans).toBeGreaterThan(10); // Some highlighted
|
||||||
|
expect(results.composites).toBeGreaterThan(5); // Some composites
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should process 100 Unicode citations in reasonable time (<100ms)', () => {
|
||||||
|
const text = generateCitationHeavyText(100, 'unicode');
|
||||||
|
|
||||||
|
const start = performance.now();
|
||||||
|
|
||||||
|
const results = { spans: 0, composites: 0, standalones: 0, cleaned: '' };
|
||||||
|
|
||||||
|
SPAN_REGEX.lastIndex = 0;
|
||||||
|
while (SPAN_REGEX.exec(text) !== null) {
|
||||||
|
results.spans++;
|
||||||
|
}
|
||||||
|
|
||||||
|
COMPOSITE_REGEX.lastIndex = 0;
|
||||||
|
while (COMPOSITE_REGEX.exec(text) !== null) {
|
||||||
|
results.composites++;
|
||||||
|
}
|
||||||
|
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while (STANDALONE_PATTERN.exec(text) !== null) {
|
||||||
|
results.standalones++;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.cleaned = text.replace(CLEANUP_REGEX, '');
|
||||||
|
|
||||||
|
const duration = performance.now() - start;
|
||||||
|
|
||||||
|
expect(duration).toBeLessThan(100);
|
||||||
|
expect(results.standalones).toBeGreaterThan(80);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should process 500 citations without timeout (<500ms)', () => {
|
||||||
|
const text = generateCitationHeavyText(500, 'literal');
|
||||||
|
|
||||||
|
const start = performance.now();
|
||||||
|
|
||||||
|
let count = 0;
|
||||||
|
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while (STANDALONE_PATTERN.exec(text) !== null) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
const cleaned = text.replace(CLEANUP_REGEX, '');
|
||||||
|
|
||||||
|
const duration = performance.now() - start;
|
||||||
|
|
||||||
|
expect(duration).toBeLessThan(500);
|
||||||
|
expect(count).toBeGreaterThan(400);
|
||||||
|
expect(cleaned.length).toBeLessThan(text.length);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle mixed formats efficiently (<100ms for 100 citations)', () => {
|
||||||
|
// Generate text with alternating formats
|
||||||
|
const literalText = generateCitationHeavyText(50, 'literal');
|
||||||
|
const unicodeText = generateCitationHeavyText(50, 'unicode');
|
||||||
|
const mixedText = literalText + '\n\n' + unicodeText;
|
||||||
|
|
||||||
|
const start = performance.now();
|
||||||
|
|
||||||
|
let count = 0;
|
||||||
|
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while (STANDALONE_PATTERN.exec(mixedText) !== null) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
const duration = performance.now() - start;
|
||||||
|
|
||||||
|
expect(duration).toBeLessThan(100);
|
||||||
|
expect(count).toBeGreaterThan(80); // Should find citations from both halves
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle repeated execution during streaming simulation (<1000ms cumulative)', () => {
|
||||||
|
/**
|
||||||
|
* Simulates the markdown plugin running repeatedly during LLM streaming.
|
||||||
|
* Each "token" adds ~10 characters, plugin runs on every update.
|
||||||
|
*/
|
||||||
|
const fullText = generateCitationHeavyText(50, 'literal');
|
||||||
|
const tokens: string[] = [];
|
||||||
|
|
||||||
|
// Simulate streaming: break text into ~100 incremental chunks
|
||||||
|
const chunkSize = Math.ceil(fullText.length / 100);
|
||||||
|
for (let i = 0; i < fullText.length; i += chunkSize) {
|
||||||
|
tokens.push(fullText.slice(0, i + chunkSize));
|
||||||
|
}
|
||||||
|
|
||||||
|
const start = performance.now();
|
||||||
|
let totalMatches = 0;
|
||||||
|
let spanCount = 0;
|
||||||
|
let compositeCount = 0;
|
||||||
|
|
||||||
|
// Simulate plugin running on each streaming update
|
||||||
|
for (const partialText of tokens) {
|
||||||
|
// Run all regex operations (simulating unicodeCitation plugin)
|
||||||
|
SPAN_REGEX.lastIndex = 0;
|
||||||
|
while (SPAN_REGEX.exec(partialText) !== null) {
|
||||||
|
spanCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
COMPOSITE_REGEX.lastIndex = 0;
|
||||||
|
while (COMPOSITE_REGEX.exec(partialText) !== null) {
|
||||||
|
compositeCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
STANDALONE_PATTERN.lastIndex = 0;
|
||||||
|
while (STANDALONE_PATTERN.exec(partialText) !== null) {
|
||||||
|
totalMatches++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup would also run
|
||||||
|
void partialText.replace(CLEANUP_REGEX, '');
|
||||||
|
}
|
||||||
|
|
||||||
|
const duration = performance.now() - start;
|
||||||
|
|
||||||
|
// 100 streaming updates processing up to 50 citations each
|
||||||
|
// Should complete in under 1 second cumulative
|
||||||
|
expect(duration).toBeLessThan(1000);
|
||||||
|
expect(totalMatches).toBeGreaterThan(1000); // Many matches across all iterations
|
||||||
|
expect(spanCount).toBeGreaterThan(0);
|
||||||
|
expect(compositeCount).toBeGreaterThan(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle rapid repeated execution (300 renders with 20 citations)', () => {
  /**
   * Realistic streaming scenario: 300 token updates, where the final text
   * contains ~20 literal-format citations. Each "render" re-runs every
   * citation regex over a progressively longer slice of the full text,
   * mirroring what the unicodeCitation plugin does on each stream update.
   */
  const fullText = generateCitationHeavyText(20, 'literal');
  const renderCount = 300;

  const start = performance.now();
  let totalOps = 0;

  // Simulate 300 renders, each processing progressively more text.
  for (let i = 0; i < renderCount; i++) {
    const progress = Math.min(1, (i + 1) / renderCount);
    const partialText = fullText.slice(0, Math.floor(fullText.length * progress));

    // The shared /g regexes are stateful: reset lastIndex before each
    // exec loop so a previous iteration's position doesn't skip matches.
    SPAN_REGEX.lastIndex = 0;
    while (SPAN_REGEX.exec(partialText) !== null) {
      totalOps++;
    }

    COMPOSITE_REGEX.lastIndex = 0;
    while (COMPOSITE_REGEX.exec(partialText) !== null) {
      totalOps++;
    }

    STANDALONE_PATTERN.lastIndex = 0;
    while (STANDALONE_PATTERN.exec(partialText) !== null) {
      totalOps++;
    }

    // Cleanup pass runs on every update too; result intentionally discarded.
    void partialText.replace(CLEANUP_REGEX, '');
  }

  const duration = performance.now() - start;
  const avgPerRender = duration / renderCount;

  // Should complete all 300 renders in under 500ms total;
  // average per render should be under 2ms.
  expect(duration).toBeLessThan(500);
  expect(avgPerRender).toBeLessThan(2);
  expect(totalOps).toBeGreaterThan(0);
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
/**
 * Citation Regex Patterns
 *
 * These patterns handle two formats that LLMs may output:
 * 1. Literal escape sequences: "\ue202turn0search0" (backslash + "ue202" = 6 chars)
 * 2. Actual Unicode characters: "\u{E202}turn0search0" (U+E202 = 1 char, private use area)
 *
 * The system instructs LLMs to output literal escape sequences, but some models
 * may convert them to actual Unicode characters during text generation. These
 * dual-format patterns ensure robust citation handling regardless of output format.
 *
 * Citation Format:
 * - \ue202 / U+E202: Standalone citation marker (before each anchor)
 * - \ue200 / U+E200: Composite group start
 * - \ue201 / U+E201: Composite group end
 * - \ue203 / U+E203: Highlight span start
 * - \ue204 / U+E204: Highlight span end
 *
 * Anchor Pattern: turn{N}{type}{index}
 * - N: Turn number (0-based)
 * - type: search|image|news|video|ref|file
 * - index: Result index within that type (0-based)
 *
 * Examples:
 * - Standalone: "Statement.\ue202turn0search0"
 * - Composite: "\ue200\ue202turn0search0\ue202turn0news1\ue201"
 * - Highlighted: "\ue203Cited text.\ue204\ue202turn0search0"
 */

/** Matches highlighted text spans in both literal and Unicode formats */
export const SPAN_REGEX = /((?:\\ue203|\ue203).*?(?:\\ue204|\ue204))/g;

/** Matches composite citation blocks (multiple citations grouped together) */
export const COMPOSITE_REGEX = /((?:\\ue200|\ue200).*?(?:\\ue201|\ue201))/g;

/** Matches standalone citation anchors with turn, type, and index capture groups */
export const STANDALONE_PATTERN =
  /(?:\\ue202|\ue202)turn(\d+)(search|image|news|video|ref|file)(\d+)/g;

/** Removes all citation marker characters from text for clean display */
export const CLEANUP_REGEX =
  /\\ue200|\\ue201|\\ue202|\\ue203|\\ue204|\\ue206|\ue200|\ue201|\ue202|\ue203|\ue204|\ue206/g;

/** Matches invalid/orphaned citations (with leading whitespace) for removal */
export const INVALID_CITATION_REGEX =
  /\s*(?:\\ue202|\ue202)turn\d+(search|news|image|video|ref|file)\d+/g;
|
||||||
Loading…
Add table
Add a link
Reference in a new issue