LibreChat/client/src/components/Web/plugin.ts
Danny Avila 03c9d5f79f
📑 refactor: File Search Citations Dual-Format Unicode Handling (#10888)
* 🔖 refactor: citation handling with support for both literal and Unicode formats

* refactor: file search messages for edge cases in documents

* 🔧 refactor: Enhance citation handling with detailed regex patterns for literal and Unicode formats

* 🔧 refactor: Simplify file search query handling by removing unnecessary parameters and improving result formatting

*  test: Add comprehensive integration tests for citation processing flow with support for literal and Unicode formats

* 🔧 refactor: Improve regex match handling and add performance tests for citation processing
2025-12-10 13:25:56 -05:00

262 lines
8.7 KiB
TypeScript

import { visit } from 'unist-util-visit';
import type { Node } from 'unist';
import type { Citation, CitationNode } from './types';
import { SPAN_REGEX, STANDALONE_PATTERN, CLEANUP_REGEX, COMPOSITE_REGEX } from '~/utils/citations';
/**
 * Decides whether the marker found at `position` sits outside every composite block.
 *
 * Only the text *before* `position` is inspected. The marker counts as nested when the
 * most recent composite opener (U+E200) has not been followed by a composite closer
 * (U+E201) before the marker is reached.
 *
 * Each delimiter is searched for in two spellings — the six-character literal text
 * (backslash + "ue200") and the real private-use character — and whichever occurs
 * furthest to the right wins. Opener and closer may therefore use different spellings.
 *
 * @param text - Full text being scanned.
 * @param position - Index in `text` where the candidate marker starts.
 * @returns `true` when no opener precedes the marker, or when the last closer comes
 *   after the last opener; `false` otherwise.
 */
function isStandaloneMarker(text: string, position: number): boolean {
  const preceding = text.substring(0, position);

  // Rightmost occurrence of a delimiter, whichever of its two spellings is used.
  const rightmost = (literalForm: string, charForm: string): number =>
    Math.max(preceding.lastIndexOf(literalForm), preceding.lastIndexOf(charForm));

  const openerAt = rightmost('\\ue200', '\ue200');
  if (openerAt === -1) {
    // Nothing was ever opened, so the marker cannot be nested.
    return true;
  }

  // An opener exists (index >= 0), so a missing closer (-1) compares as "not after".
  const closerAt = rightmost('\\ue201', '\ue201');
  return closerAt > openerAt;
}
/**
 * Locates the earliest citation-related pattern at or after `position`.
 *
 * Three patterns compete: span, composite and standalone. Standalone candidates that
 * `isStandaloneMarker` reports as lying inside a composite block are skipped. When two
 * patterns start at the same index the earlier one in that order wins.
 *
 * Note: the shared regex objects are stateful; this function overwrites their
 * `lastIndex` on every call.
 *
 * @param text - Text to search.
 * @param position - Index to start searching from.
 * @returns The winning pattern's `type` and `match` (`index` is currently always `0`),
 *   or `null` when none of the patterns match.
 */
function findNextMatch(
  text: string,
  position: number,
): { type: string; match: RegExpExecArray | null; index: number } | null {
  // Point each shared regex at the requested offset right before using it.
  SPAN_REGEX.lastIndex = position;
  const spanHit = SPAN_REGEX.exec(text);

  COMPOSITE_REGEX.lastIndex = position;
  const compositeHit = COMPOSITE_REGEX.exec(text);

  // Walk standalone candidates until one is found outside any composite block.
  STANDALONE_PATTERN.lastIndex = position;
  let standaloneHit: RegExpExecArray | null = null;
  let candidate: RegExpExecArray | null;
  while ((candidate = STANDALONE_PATTERN.exec(text)) !== null) {
    if (isStandaloneMarker(text, candidate.index)) {
      standaloneHit = candidate;
      break;
    }
  }

  // Listed in priority order: a later entry only wins with a strictly smaller index.
  const contenders: Array<[string, RegExpExecArray | null]> = [
    ['span', spanHit],
    ['composite', compositeHit],
    ['standalone', standaloneHit],
  ];

  let closest: { type: string; match: RegExpExecArray } | null = null;
  for (const [type, hit] of contenders) {
    if (hit === null) {
      continue;
    }
    if (closest === null || hit.index < closest.match.index) {
      closest = { type, match: hit };
    }
  }

  if (closest === null) {
    return null;
  }
  return { type: closest.type, match: closest.match, index: 0 };
}
/**
 * Rewrites every `text` node of `tree` that contains citation markup.
 *
 * Each text value is scanned left to right with `findNextMatch` and split into:
 * - plain `text` nodes (marker residue removed via `CLEANUP_REGEX`),
 * - `highlighted-text` nodes for spans,
 * - `composite-citation` nodes for composite blocks that contain at least one reference,
 * - `citation` nodes for standalone markers.
 *
 * When at least one segment is produced the original node is replaced in its parent and
 * the visitor returns the index just past the inserted nodes, so they are not visited
 * again. Otherwise the node's value is only cleaned of leftover markers.
 *
 * @param tree - Root of the syntax tree; it is mutated in place.
 */
function processTree(tree: Node) {
  // Span delimiters in both spellings: literal text (backslash + "ue203"/"ue204") and
  // the real private-use characters. Without the latter, Unicode-delimited spans would
  // keep their delimiters inside the highlighted text.
  // NOTE(review): assumes SPAN_REGEX can match the Unicode spelling — confirm in ~/utils/citations.
  const spanMarkerRegex = /\\ue203|\\ue204|\ue203|\ue204/g;

  visit(tree, 'text', (node, index, parent) => {
    const textNode = node as CitationNode;
    const parentNode = parent as CitationNode;

    if (typeof textNode.value !== 'string') return;

    const originalValue = textNode.value;
    const segments: Array<CitationNode> = [];
    // Per-type running counters; together with the match offset they form citation IDs.
    const typeCounts = { span: 0, composite: 0, standalone: 0 };

    // Single pass through the string
    let currentPosition = 0;
    while (currentPosition < originalValue.length) {
      const nextMatchInfo = findNextMatch(originalValue, currentPosition);
      const match = nextMatchInfo?.match;

      if (!nextMatchInfo || !match) {
        // No more matches: keep whatever text is left, minus marker residue.
        const remainingText = originalValue.substring(currentPosition).replace(CLEANUP_REGEX, '');
        if (remainingText) {
          segments.push({ type: 'text', value: remainingText });
        }
        break;
      }

      const { type } = nextMatchInfo;
      const matchIndex = match.index;
      const matchText = match[0];

      // Emit the cleaned text that precedes this match.
      if (matchIndex > currentPosition) {
        const textBeforeMatch = originalValue
          .substring(currentPosition, matchIndex)
          .replace(CLEANUP_REGEX, '');
        if (textBeforeMatch) {
          segments.push({ type: 'text', value: textBeforeMatch });
        }
      }

      // ID layout: <type>-<count of that type so far>-<offset of the match in the text>.
      const citationId = `${type}-${typeCounts[type as keyof typeof typeCounts]}-${matchIndex}`;

      switch (type) {
        case 'span': {
          const cleanText = matchText.replace(spanMarkerRegex, '');
          const endOfSpan = matchIndex + matchText.length;

          // A citation starting fewer than 5 characters after the span is treated as
          // belonging to it. Its ID is predicted here; the counters for citation types
          // are not advanced by span handling, so the prediction matches the ID that
          // will be generated when that citation is processed.
          let associatedCitationId: string | null = null;
          const nextCitation = findNextMatch(originalValue, endOfSpan);
          const nextCitationMatch = nextCitation?.match;
          if (
            nextCitation &&
            nextCitationMatch &&
            (nextCitation.type === 'standalone' || nextCitation.type === 'composite') &&
            nextCitationMatch.index - endOfSpan < 5
          ) {
            const nextType = nextCitation.type;
            associatedCitationId = `${nextType}-${typeCounts[nextType as keyof typeof typeCounts]}-${nextCitationMatch.index}`;
          }

          segments.push({
            type: 'highlighted-text',
            data: {
              hName: 'highlighted-text',
              hProperties: { citationId: associatedCitationId },
            },
            children: [{ type: 'text', value: cleanText }],
          });

          typeCounts.span++;
          break;
        }

        case 'composite': {
          // Fresh regex instance so the shared STANDALONE_PATTERN state is left alone.
          const compositeRefRegex = new RegExp(STANDALONE_PATTERN.source, 'g');
          const citations: Array<Citation> = [];

          let refMatch: RegExpExecArray | null;
          while ((refMatch = compositeRefRegex.exec(matchText)) !== null) {
            citations.push({
              turn: Number(refMatch[1]),
              refType: refMatch[2],
              index: Number(refMatch[3]),
            });
          }

          // A composite block without any reference produces no node at all.
          if (citations.length > 0) {
            segments.push({
              type: 'composite-citation',
              data: {
                hName: 'composite-citation',
                hProperties: {
                  citations,
                  citationId: citationId,
                },
              },
            });
          }

          typeCounts.composite++;
          break;
        }

        case 'standalone': {
          // Capture groups of STANDALONE_PATTERN: 1 = turn, 2 = reference type, 3 = index.
          const turn = Number(match[1]);
          const refType = match[2];
          const refIndex = Number(match[3]);

          segments.push({
            type: 'citation',
            data: {
              hName: 'citation',
              hProperties: {
                citation: {
                  turn,
                  refType,
                  index: refIndex,
                },
                citationType: 'standalone',
                citationId: citationId,
              },
            },
          });

          typeCounts.standalone++;
          break;
        }
      }

      // Continue right after the consumed match.
      currentPosition = matchIndex + matchText.length;
    }

    // Swap the original node for the generated segments and skip past them.
    if (segments.length > 0 && index !== undefined) {
      parentNode.children?.splice(index, 1, ...segments);
      return index + segments.length;
    }

    // Nothing to splice in: only strip leftover markers, if any.
    const cleanedValue = textNode.value.replace(CLEANUP_REGEX, '');
    if (cleanedValue !== textNode.value) {
      textNode.value = cleanedValue;
    }
  });
}
/**
 * Plugin factory: produces a tree transformer that hands the tree to `processTree`,
 * which rewrites citation markup in place.
 *
 * @returns A transformer taking the syntax tree root and returning nothing.
 */
export function unicodeCitation() {
  return function transformer(tree: Node): void {
    processTree(tree);
  };
}