LibreChat/client/src/utils/latex.ts

153 lines
4.7 KiB
TypeScript
Raw Normal View History

// Pre-compile all regular expressions for better performance
// NOTE: every pattern below carries the `g` flag, so each RegExp object keeps a
// mutable `lastIndex`. Code that drives them with `exec()` in a loop must reset
// `lastIndex` to 0 before starting a new scan.

// Matches the opening of a single-dollar mhchem chemical equation: `$\ce{`.
const MHCHEM_CE_REGEX = /\$\\ce\{/g;
// Matches the opening of a single-dollar mhchem physical-unit expression: `$\pu{`.
const MHCHEM_PU_REGEX = /\$\\pu\{/g;
// Matches a complete `$\\ce{...}$` span (backslash already doubled). The body is
// `[^}]*`, so the match ends at the first `}`; an expression with nested braces
// only matches if that first `}` is immediately followed by `$`.
const MHCHEM_CE_ESCAPED_REGEX = /\$\\\\ce\{[^}]*\}\$/g;
// Same as above for `$\\pu{...}$`.
const MHCHEM_PU_ESCAPED_REGEX = /\$\\\\pu\{[^}]*\}\$/g;
// Matches a `$` that looks like a currency sign. Only the `$` itself is consumed;
// everything else is asserted with lookarounds:
//   (?<![\\$])  not preceded by a backslash or another `$`
//   (?!\$)      not followed by another `$`
//   (?=...)     followed by digits, optional `,ddd` groups, an optional decimal
//               part of any length, an optional K/M/B suffix (either case), and
//               then whitespace, end of input, or a non-alphanumeric character
const CURRENCY_REGEX =
/(?<![\\$])\$(?!\$)(?=\d+(?:,\d{3})*(?:\.\d+)?(?:[KMBkmb])?(?:\s|$|[^a-zA-Z\d]))/g;
// 🧮 fix: Properly Escape Currency and Prevent Code Block LaTeX Bugs (#9248)
//
// * fix(latex): prevent LaTeX conversion when closing $ is preceded by backtick
//
//   When text contained patterns like "$lookup namespace" followed by "`$lookup`",
//   the regex would match from the first $ to the backtick's $, treating the entire
//   span as a LaTeX expression. This caused programming constructs to be incorrectly
//   converted to double dollars.
//
//   - Added negative lookbehind (?<!`) to single dollar regex
//   - Prevents matching when closing $ immediately follows a backtick
//   - Fixes issues with inline code blocks containing $ symbols
//
// * fix(latex): detect currency amounts with 4+ digits without commas
//
//   The currency regex pattern \d{1,3} only matched amounts with 1-3 initial digits,
//   causing amounts like $1157.90 to be interpreted as LaTeX instead of currency.
//   This resulted in text like "$1157.90 (text) + $500 (text) = $1657.90" being
//   incorrectly converted to a single LaTeX expression.
//
//   - Changed pattern from \d{1,3} to \d+ to match any number of initial digits
//   - Now properly escapes $1000, $10000, $123456, etc. without requiring commas
//   - Maintains support for comma-formatted amounts like $1,234.56
//
// * fix(latex): support currency with unlimited decimal places
//
//   The currency regex limited decimal places to 1-2 digits (\.\d{1,2}), which
//   failed to properly escape amounts with more precision like cryptocurrency
//   values ($0.00001234), gas prices ($3.999), or exchange rates ($1.23456).
//
//   - Changed decimal pattern from \.\d{1,2} to \.\d+
//   - Now supports any number of decimal places
//   - Handles edge cases like scientific calculations and high-precision values
//
// 2025-08-25 02:44:13 -04:00
// Matches an inline `$...$` math span on a single line:
//   (?<!\\)\$(?!\$)          opening `$`, not backslash-escaped and not part of `$$`
//   ((?:[^$\n]|\\[$])+?)     group 1: the body, matched lazily; at least one character,
//                            no newline, and no `$` other than a backslash-escaped one
//   (?<!\\)(?<!`)\$(?!\$)    closing `$`, not preceded by a backslash or a backtick,
//                            and not followed by another `$`
// Carries the `g` flag, so `lastIndex` must be reset before each `exec()` scan.
const SINGLE_DOLLAR_REGEX = /(?<!\\)\$(?!\$)((?:[^$\n]|\\[$])+?)(?<!\\)(?<!`)\$(?!\$)/g;
/**
 * Rewrites single-dollar mhchem expressions (`\ce{...}` and `\pu{...}`) so they use
 * double-dollar delimiters and a doubled backslash.
 *
 * The work happens in two stages:
 *  1. Every `$\ce{` / `$\pu{` opener has its backslash doubled, whether or not the
 *     expression is ever closed.
 *  2. Every complete `$\\ce{...}$` / `$\\pu{...}$` span found by the module-level
 *     "escaped" patterns gets one extra `$` on each side.
 *
 * @param text - The input text containing potential mhchem notation
 * @returns The text with mhchem expressions escaped and re-delimited
 */
function escapeMhchem(text: string): string {
  // Adds one `$` on each side of an already-delimited span, yielding `$$...$$`.
  const wrapInExtraDollars = (span: string): string => '$' + span + '$';

  return text
    .replace(MHCHEM_CE_REGEX, '$\\\\ce{')
    .replace(MHCHEM_PU_REGEX, '$\\\\pu{')
    .replace(MHCHEM_CE_ESCAPED_REGEX, wrapInExtraDollars)
    .replace(MHCHEM_PU_ESCAPED_REGEX, wrapInExtraDollars);
}
/**
 * Scans the content once and records every code region delimited by backticks.
 *
 * Two kinds of region are recognised:
 *  - fenced blocks delimited by three consecutive backticks; the region runs from
 *    the first backtick of the opening fence to the last backtick of the closing fence
 *  - inline spans delimited by single backticks; the region runs from the opening
 *    backtick to the closing backtick
 *
 * Single backticks inside an open fence are ignored. A fence that is never closed
 * (for example while a response is still streaming) is treated as running to the
 * end of the content, matching how Markdown renders an unterminated fence. An
 * inline backtick that is never closed produces no region.
 *
 * The returned regions are sorted by start index and never overlap, which is what
 * a binary search over them requires. If an inline span turns out to enclose
 * fenced regions recorded earlier, those are folded into the single enclosing
 * region.
 *
 * @param content The content to analyze
 * @returns Array of code block regions as inclusive `[start, end]` index pairs
 */
function findCodeBlockRegions(content: string): Array<[number, number]> {
  const regions: Array<[number, number]> = [];
  let inlineStart = -1;
  let fenceStart = -1;

  for (let i = 0; i < content.length; i++) {
    if (content[i] !== '`') {
      continue;
    }

    // Triple backtick: toggles a fenced block.
    if (content.startsWith('```', i)) {
      if (fenceStart === -1) {
        fenceStart = i;
      } else {
        regions.push([fenceStart, i + 2]);
        fenceStart = -1;
      }
      i += 2; // Skip the remaining two backticks of the fence
      continue;
    }

    // Inside an open fence a lone backtick is just content.
    if (fenceStart !== -1) {
      continue;
    }

    if (inlineStart === -1) {
      inlineStart = i;
      continue;
    }

    // Closing an inline span. Any regions recorded after the span was opened lie
    // inside it; drop them so the list stays sorted and non-overlapping.
    let last = regions[regions.length - 1];
    while (last !== undefined && last[0] > inlineStart) {
      regions.pop();
      last = regions[regions.length - 1];
    }
    regions.push([inlineStart, i]);
    inlineStart = -1;
  }

  // An unterminated fence covers everything up to the end of the content.
  if (fenceStart !== -1) {
    regions.push([fenceStart, content.length - 1]);
  }

  return regions;
}
/**
 * Reports whether an index falls within one of the given code regions.
 *
 * Uses a binary search, so the regions are expected to be ordered by start index
 * and non-overlapping. Both ends of a region are inclusive.
 *
 * @param position The position to check
 * @param codeRegions Array of code block regions as `[start, end]` pairs
 * @returns True if position is inside a code block
 */
function isInCodeBlock(position: number, codeRegions: Array<[number, number]>): boolean {
  let lo = 0;
  let hi = codeRegions.length - 1;

  while (lo <= hi) {
    const probe = lo + Math.floor((hi - lo) / 2);
    const region = codeRegions[probe];
    const regionStart = region[0];
    const regionEnd = region[1];

    if (position < regionStart) {
      // Target lies before this region: discard the upper half.
      hi = probe - 1;
      continue;
    }
    if (position > regionEnd) {
      // Target lies after this region: discard the lower half.
      lo = probe + 1;
      continue;
    }
    return true;
  }

  return false;
}
/**
 * Preprocesses LaTeX content by escaping currency indicators and converting single dollar
 * math delimiters to double dollars.
 *
 * Processing order:
 *  1. Return the input untouched when it contains no `$`.
 *  2. If `\ce{` or `\pu{` is present, run the mhchem escaping step.
 *  3. Locate code regions (inline and fenced) so their contents are left alone.
 *  4. Escape every currency-style `$` outside code as `\$`.
 *  5. Rewrite every remaining `$...$` span that starts outside code as `$$...$$`.
 *
 * Step 4 inserts one character per escaped `$`, which shifts every later index.
 * The code regions are therefore recomputed before step 5; reusing the offsets
 * from step 3 would make step 5 test positions against stale ranges and could
 * rewrite `$...$` inside a code span.
 *
 * @param content The input string containing LaTeX expressions.
 * @returns The processed string with escaped currency indicators and converted math delimiters.
 */
export function preprocessLaTeX(content: string): string {
  // Early return for most common case
  if (!content.includes('$')) return content;

  // Process mhchem first (usually rare, so check if needed)
  let processed = content;
  if (content.includes('\\ce{') || content.includes('\\pu{')) {
    processed = escapeMhchem(content);
  }

  // Code regions are expressed as indices into the current value of `processed`.
  let codeRegions = findCodeBlockRegions(processed);

  // First pass: escape currency dollar signs
  const parts: string[] = [];
  let lastIndex = 0;
  // The regex is a shared global-flag object; reset its cursor before scanning.
  CURRENCY_REGEX.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = CURRENCY_REGEX.exec(processed)) !== null) {
    if (!isInCodeBlock(match.index, codeRegions)) {
      parts.push(processed.substring(lastIndex, match.index));
      parts.push('\\$');
      // The pattern consumes only the `$` itself, so resume right after it.
      lastIndex = match.index + 1;
    }
  }

  if (parts.length > 0) {
    parts.push(processed.substring(lastIndex));
    processed = parts.join('');
    // The inserted backslashes moved everything after them; refresh the regions
    // so the second pass checks positions against the string it actually scans.
    codeRegions = findCodeBlockRegions(processed);
  }

  // Second pass: convert single dollar delimiters to double dollars
  const result: string[] = [];
  lastIndex = 0;
  SINGLE_DOLLAR_REGEX.lastIndex = 0;
  while ((match = SINGLE_DOLLAR_REGEX.exec(processed)) !== null) {
    // Only the position of the opening `$` decides whether the span is skipped.
    if (!isInCodeBlock(match.index, codeRegions)) {
      result.push(processed.substring(lastIndex, match.index));
      result.push(`$$${match[1]}$$`);
      lastIndex = match.index + match[0].length;
    }
  }
  result.push(processed.substring(lastIndex));
  return result.join('');
}