LibreChat/packages/data-schemas/src/methods/file.ts
Danny Avila 9054ca9c15
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
🆔 fix: Atomic File Dedupe, Bedrock Tokens Fix, and Allowed MIME Types (#11675)
* feat: Add support for Apache Parquet MIME types

- Introduced 'application/x-parquet' to the full MIME types list and code interpreter MIME types list.
- Updated application MIME types regex to include 'x-parquet' and 'vnd.apache.parquet'.
- Added mapping for '.parquet' files to 'application/x-parquet' in code type mapping, enhancing file format support.

* feat: Implement atomic file claiming for code execution outputs

- Added a new `claimCodeFile` function to atomically claim a file_id for code execution outputs, preventing duplicates by using a compound key of filename and conversationId.
- Updated `processCodeOutput` to utilize the new claiming mechanism, ensuring that concurrent calls for the same filename converge on a single record.
- Refactored related tests to validate the new atomic claiming behavior and its impact on file usage tracking and versioning.

* fix: Update image file handling to use cache-busting filepath

- Modified the `processCodeOutput` function to generate a cache-busting filepath for updated image files, improving browser caching behavior.
- Adjusted related tests to reflect the change from versioned filenames to cache-busted filepaths, ensuring accurate validation of image updates.

* fix: Update step handler to prevent undefined content for non-tool call types

- Modified the condition in useStepHandler to ensure that undefined content is only assigned for specific content types, enhancing the robustness of content handling.

* fix: Update bedrockOutputParser to handle maxTokens for adaptive models

- Modified the bedrockOutputParser logic to ensure that maxTokens is not set for adaptive models when neither maxTokens nor maxOutputTokens are provided, improving the handling of adaptive thinking configurations.
- Updated related tests to reflect these changes, ensuring accurate validation of the output for adaptive models.

* chore: Update @librechat/agents to version 3.1.38 in package.json and package-lock.json

* fix: Enhance file claiming and error handling in code processing

- Updated the `processCodeOutput` function to use a consistent file ID for claiming files, preventing duplicates and improving concurrency handling.
- Refactored the `createFileMethods` to include error handling for failed file claims, ensuring robust behavior when claiming files for conversations.
- These changes enhance the reliability of file management in the application.

* fix: Update adaptive thinking test for Opus 4.6 model

- Modified the test for configuring adaptive thinking to reflect that no default maxTokens should be set for the Opus 4.6 model.
- Updated assertions to ensure that maxTokens is undefined, aligning with the expected behavior for adaptive models.
2026-02-07 13:26:18 -05:00

388 lines
13 KiB
TypeScript

import logger from '../config/winston';
import { EToolResources, FileContext } from 'librechat-data-provider';
import type { FilterQuery, SortOrder, Model } from 'mongoose';
import type { IMongoFile } from '~/types/file';
/** Factory function that takes mongoose instance and returns the file methods */
export function createFileMethods(mongoose: typeof import('mongoose')) {
/**
 * Looks up a single file document by its `file_id`.
 *
 * The entries of `options` are spread into the query object after `file_id`,
 * so they act as additional match conditions (an `options.file_id` entry would
 * replace the positional argument).
 * @param file_id - Value matched against the document's `file_id` field
 * @param options - Extra match conditions merged into the query; defaults to none
 * @returns A promise resolving to the matching document as a lean object, or null
 */
async function findFileById(
  file_id: string,
  options: Record<string, unknown> = {},
): Promise<IMongoFile | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const conditions = { file_id, ...options };
  return FileModel.findOne(conditions).lean();
}
/** Projection map for queries: 0 excludes a field, 1 includes it */
type SelectProjection = Record<string, 0 | 1>;
/**
 * Finds every file document matching `filter`.
 *
 * Sorting: the default key `updatedAt: -1` is listed first and the entries of
 * `_sortOptions` are merged in after it, so a caller-supplied `updatedAt`
 * replaces the default while any other key is appended behind it.
 *
 * Projection: when `selectFields` is null or undefined the `text` field is
 * excluded; otherwise `selectFields` is passed to `select` unchanged.
 * @param filter - Match conditions handed to `File.find`
 * @param _sortOptions - Optional sort keys merged over the default ordering
 * @param selectFields - Optional projection (object or string)
 * @returns A promise resolving to the matching documents as lean objects
 */
async function getFiles(
  filter: FilterQuery<IMongoFile>,
  _sortOptions?: Record<string, SortOrder> | null,
  selectFields?: SelectProjection | string | null,
): Promise<IMongoFile[] | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const ordering = { updatedAt: -1 as SortOrder, ..._sortOptions };
  const projection = selectFields ?? { text: 0 };
  return await FileModel.find(filter).select(projection).sort(ordering).lean();
}
/**
 * Fetches the subset of `fileIds` that is usable by the requested tool resources.
 *
 * A document qualifies when its `file_id` is in `fileIds`, its context is not
 * `execute_code`, and it satisfies at least one requested resource rule:
 * - `EToolResources.context`: `text` exists and is not null, and context is `agents`
 * - `EToolResources.file_search`: `embedded` is true
 *
 * Returns an empty array without querying when `fileIds` is empty or missing,
 * when `toolResourceSet` is empty or missing, or when it contains neither of
 * the two resources above.
 * @param fileIds - Array of file_id strings to search for
 * @param toolResourceSet - Tool resources that select which rules apply
 * @returns Matching file documents with the `text` field excluded
 * @throws Error with message 'Error retrieving tool files' after logging the underlying failure
 */
async function getToolFilesByIds(
  fileIds: string[],
  toolResourceSet?: Set<EToolResources>,
): Promise<IMongoFile[]> {
  if (!fileIds || fileIds.length === 0) {
    return [];
  }
  if (!toolResourceSet || toolResourceSet.size === 0) {
    return [];
  }
  try {
    // Each entry pairs a tool resource with the match rule it contributes.
    const resourceRules: Array<[EToolResources, FilterQuery<IMongoFile>]> = [
      [
        EToolResources.context,
        { text: { $exists: true, $ne: null }, context: FileContext.agents },
      ],
      [EToolResources.file_search, { embedded: true }],
    ];
    const requestedRules: FilterQuery<IMongoFile>[] = [];
    for (const [resource, rule] of resourceRules) {
      if (toolResourceSet.has(resource)) {
        requestedRules.push(rule);
      }
    }
    // Nothing requested that this function knows how to match.
    if (requestedRules.length === 0) {
      return [];
    }
    const query: FilterQuery<IMongoFile> = {
      file_id: { $in: fileIds },
      context: { $ne: FileContext.execute_code },
      $or: requestedRules,
    };
    const projection: SelectProjection = { text: 0 };
    const found = await getFiles(query, { updatedAt: -1 as SortOrder }, projection);
    return found ?? [];
  } catch (error) {
    logger.error('[getToolFilesByIds] Error retrieving tool files:', error);
    throw new Error('Error retrieving tool files');
  }
}
/**
 * Fetches the code-execution output files of a conversation, restricted to a
 * set of messages.
 *
 * A document matches when it has the given `conversationId`, its context is
 * `execute_code`, its `messageId` is one of `messageIds`, and
 * `metadata.fileIdentifier` exists. The `text` field is excluded and ascending
 * `createdAt` ordering is requested from `getFiles`.
 *
 * An empty array is returned, without querying, when `conversationId` is falsy
 * or when `messageIds` is missing or empty. The latter is intentional: results
 * must be scoped to the messages of the current thread so that files from
 * other branches of the conversation are not included.
 *
 * Query failures are logged and reported as an empty array rather than thrown.
 * @param conversationId - The conversation ID to search for
 * @param messageIds - messageIds of the current thread; required in practice
 * @returns Matching file documents, or an empty array
 */
async function getCodeGeneratedFiles(
  conversationId: string,
  messageIds?: string[],
): Promise<IMongoFile[]> {
  // Both a conversation and a non-empty thread are required to scope the lookup.
  if (!conversationId || !messageIds || messageIds.length === 0) {
    return [];
  }
  try {
    const query: FilterQuery<IMongoFile> = {
      conversationId,
      context: FileContext.execute_code,
      messageId: { $exists: true, $in: messageIds },
      'metadata.fileIdentifier': { $exists: true },
    };
    const projection: SelectProjection = { text: 0 };
    const found = await getFiles(query, { createdAt: 1 as SortOrder }, projection);
    return found ?? [];
  } catch (error) {
    logger.error('[getCodeGeneratedFiles] Error retrieving code generated files:', error);
    return [];
  }
}
/**
 * Fetches files by ID that carry `metadata.fileIdentifier` but whose context
 * is NOT `execute_code` (i.e. not produced by code execution itself).
 *
 * The `text` field is excluded and ascending `createdAt` ordering is requested
 * from `getFiles`. Returns an empty array without querying when `fileIds` is
 * missing or empty. Query failures are logged and reported as an empty array.
 * @param fileIds - File IDs to fetch (e.g. collected from message.files in the thread)
 * @returns Matching file documents, or an empty array
 */
async function getUserCodeFiles(fileIds?: string[]): Promise<IMongoFile[]> {
  const nothingRequested = !fileIds || fileIds.length === 0;
  if (nothingRequested) {
    return [];
  }
  try {
    const query: FilterQuery<IMongoFile> = {
      file_id: { $in: fileIds },
      context: { $ne: FileContext.execute_code },
      'metadata.fileIdentifier': { $exists: true },
    };
    const projection: SelectProjection = { text: 0 };
    const found = await getFiles(query, { createdAt: 1 as SortOrder }, projection);
    return found ?? [];
  } catch (error) {
    logger.error('[getUserCodeFiles] Error retrieving user code files:', error);
    return [];
  }
}
/**
 * Claims a file_id for a code-execution output identified by
 * (filename, conversationId, execute_code context).
 *
 * Issues a single upsert: when a matching document already exists it is
 * returned untouched, because the only update operator is `$setOnInsert`;
 * otherwise a document is inserted carrying the supplied `file_id` and `user`.
 * Callers should therefore use the `file_id` of the returned document, which
 * may differ from `data.file_id`.
 *
 * NOTE(review): convergence of truly concurrent upserts presumably relies on a
 * unique index over the matched fields — confirm against the File schema.
 * @param data - filename and conversationId to match, plus the file_id and user to set on insert
 * @returns The existing or newly inserted document as a lean object
 * @throws Error when the upsert yields no document
 */
async function claimCodeFile(data: {
  filename: string;
  conversationId: string;
  file_id: string;
  user: string;
}): Promise<IMongoFile> {
  const { filename, conversationId, file_id, user } = data;
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const claimKey: FilterQuery<IMongoFile> = {
    filename,
    conversationId,
    context: FileContext.execute_code,
  };
  const claimed = await FileModel.findOneAndUpdate(
    claimKey,
    { $setOnInsert: { file_id, user } },
    { upsert: true, new: true },
  ).lean();
  if (claimed) {
    return claimed as IMongoFile;
  }
  throw new Error(
    `[claimCodeFile] Failed to claim file "${filename}" for conversation ${conversationId}`,
  );
}
/**
 * Upserts a file document keyed by `data.file_id`.
 *
 * Unless `disableTTL` is truthy, `expiresAt` is set to one hour from now,
 * overriding any `expiresAt` present in `data`. When `disableTTL` is truthy
 * the `expiresAt` field is left out of the update altogether, including a
 * value supplied in `data`.
 *
 * NOTE(review): `expiresAt` is presumably consumed by a TTL index on the File
 * collection — confirm against the schema.
 * @param data - The file fields to write; must contain file_id
 * @param disableTTL - When truthy, no expiry is written
 * @returns A promise resolving to the stored document as a lean object
 */
async function createFile(
  data: Partial<IMongoFile>,
  disableTTL?: boolean,
): Promise<IMongoFile | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const ONE_HOUR_MS = 3600 * 1000;
  // Work on a copy so the caller's object is never mutated.
  const payload: Partial<IMongoFile> = { ...data };
  if (disableTTL) {
    delete payload.expiresAt;
  } else {
    payload.expiresAt = new Date(Date.now() + ONE_HOUR_MS);
  }
  return FileModel.findOneAndUpdate({ file_id: data.file_id }, payload, {
    new: true,
    upsert: true,
  }).lean();
}
/**
 * Applies new field values to the file identified by `file_id` and clears its
 * `expiresAt` field in the same operation.
 *
 * Every property of `data` other than `file_id` is written with `$set`.
 * @param data - The fields to update; must contain file_id
 * @returns A promise resolving to the updated document as a lean object, or null
 */
async function updateFile(
  data: Partial<IMongoFile> & { file_id: string },
): Promise<IMongoFile | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const { file_id, ...changes } = data;
  const modification = {
    $set: changes,
    // Clearing expiresAt removes the expiry stamp from the document.
    $unset: { expiresAt: '' },
  };
  const pending = FileModel.findOneAndUpdate({ file_id }, modification, {
    new: true,
  });
  return pending.lean();
}
/**
 * Adds `inc` (default 1) to the `usage` counter of the file identified by
 * `file_id`, and clears its `expiresAt` and `temp_file_id` fields in the same
 * operation.
 * @param data - Must contain file_id; `inc` optionally overrides the increment
 * @returns A promise resolving to the updated document as a lean object, or null
 */
async function updateFileUsage(data: {
  file_id: string;
  inc?: number;
}): Promise<IMongoFile | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const { file_id, inc: amount = 1 } = data;
  const modification = {
    $inc: { usage: amount },
    $unset: { expiresAt: '', temp_file_id: '' },
  };
  const pending = FileModel.findOneAndUpdate({ file_id }, modification, {
    new: true,
  });
  return pending.lean();
}
/**
 * Removes the single file document whose `file_id` matches.
 * @param file_id - The unique identifier of the file to delete
 * @returns A promise resolving to the removed document as a lean object, or null
 */
async function deleteFile(file_id: string): Promise<IMongoFile | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const removal = FileModel.findOneAndDelete({ file_id });
  return removal.lean();
}
/**
 * Removes the first file document that matches an arbitrary filter.
 * @param filter - Match conditions handed to `findOneAndDelete` unchanged
 * @returns A promise resolving to the removed document as a lean object, or null
 */
async function deleteFileByFilter(filter: FilterQuery<IMongoFile>): Promise<IMongoFile | null> {
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const removal = FileModel.findOneAndDelete(filter);
  return removal.lean();
}
/**
 * Deletes multiple files, selected by file_id and/or by owning user.
 *
 * - `file_ids` only: deletes every file whose `file_id` is in the array.
 * - `file_ids` and `user`: deletes only the listed files that belong to `user`.
 *   (Previously the `file_ids` constraint was discarded as soon as `user` was
 *   supplied, so the call removed every file owned by that user.)
 * - `user` only (`file_ids` is null/undefined): deletes all files of that user.
 * - neither: nothing is deleted and `{ deletedCount: 0 }` is returned, so an
 *   unconstrained delete can never be issued.
 *
 * An empty `file_ids` array is still a constraint: it matches no documents.
 * @param file_ids - The unique identifiers of the files to delete
 * @param user - Optional user ID to restrict the deletion to
 * @returns A promise that resolves to the result of the deletion operation
 */
async function deleteFiles(
  file_ids: string[],
  user?: string,
): Promise<{ deletedCount?: number }> {
  const File = mongoose.models.File as Model<IMongoFile>;
  // Untyped callers may pass null/undefined to mean "no file_id constraint".
  const hasIds = Array.isArray(file_ids);
  if (!hasIds && !user) {
    return { deletedCount: 0 };
  }
  const deleteQuery: FilterQuery<IMongoFile> = {};
  if (hasIds) {
    deleteQuery.file_id = { $in: file_ids };
  }
  if (user) {
    deleteQuery.user = user;
  }
  return File.deleteMany(deleteQuery);
}
/**
 * Writes a new `filepath` onto each listed file in one bulk operation.
 *
 * Every entry becomes an `updateOne` that matches on `file_id` and sets
 * `filepath`. Does nothing when `updates` is missing or empty. After the write,
 * the number of modified documents is logged at info level.
 * @param updates - Array of updates in the format { file_id, filepath }
 */
async function batchUpdateFiles(
  updates: Array<{ file_id: string; filepath: string }>,
): Promise<void> {
  if (!updates?.length) {
    return;
  }
  const FileModel = mongoose.models.File as Model<IMongoFile>;
  const operations = Array.from(updates, ({ file_id, filepath }) => ({
    updateOne: {
      filter: { file_id },
      update: { $set: { filepath } },
    },
  }));
  const { modifiedCount } = await FileModel.bulkWrite(operations);
  logger.info(`Updated ${modifiedCount} files with new S3 URLs`);
}
/**
 * Bumps the usage counter of several files at once.
 *
 * IDs are taken first from `files` and then, when provided, from `fileIds`.
 * Each distinct `file_id` is passed to `updateFileUsage` exactly once (with its
 * default increment), in first-seen order, and all updates are awaited together.
 * @param files - Array of file objects to process
 * @param fileIds - Optional array of additional file IDs to process
 * @returns The updated file documents, with null results filtered out
 */
async function updateFilesUsage(
  files: Array<{ file_id: string }>,
  fileIds?: string[],
): Promise<IMongoFile[]> {
  const pending: Promise<IMongoFile | null>[] = [];
  const visited = new Set<string>();
  /** Starts the usage update for an ID unless that ID was already handled. */
  const enqueue = (file_id: string): void => {
    if (visited.has(file_id)) {
      return;
    }
    visited.add(file_id);
    pending.push(updateFileUsage({ file_id }));
  };
  for (const { file_id } of files) {
    enqueue(file_id);
  }
  if (fileIds) {
    for (const file_id of fileIds) {
      enqueue(file_id);
    }
  }
  const settled = await Promise.all(pending);
  return settled.filter((doc): doc is IMongoFile => doc != null);
}
return {
findFileById,
getFiles,
getToolFilesByIds,
getCodeGeneratedFiles,
getUserCodeFiles,
claimCodeFile,
createFile,
updateFile,
updateFileUsage,
deleteFile,
deleteFiles,
deleteFileByFilter,
batchUpdateFiles,
updateFilesUsage,
};
}
export type FileMethods = ReturnType<typeof createFileMethods>;