👓 feat: Vision Support for Assistants (#2195)

* refactor(assistants/chat): use promises to speed up initialization, initialize shared variables, include `attachedFileIds` to streamRunManager

* chore: additional typedefs

* fix(OpenAIClient): handle edge case where attachments promise is resolved

* feat: createVisionPrompt

* feat: Vision Support for Assistants
This commit is contained in:
Danny Avila 2024-03-24 23:43:00 -04:00 committed by GitHub
parent 1f0fb497f8
commit 798e8763d0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 376 additions and 100 deletions

View file

@ -92,7 +92,11 @@ class OpenAIClient extends BaseClient {
}
this.defaultVisionModel = this.options.visionModel ?? 'gpt-4-vision-preview';
this.options.attachments?.then((attachments) => this.checkVisionRequest(attachments));
if (typeof this.options.attachments?.then === 'function') {
this.options.attachments.then((attachments) => this.checkVisionRequest(attachments));
} else {
this.checkVisionRequest(this.options.attachments);
}
const { OPENROUTER_API_KEY, OPENAI_FORCE_PROMPT } = process.env ?? {};
if (OPENROUTER_API_KEY && !this.azure) {

View file

@ -0,0 +1,34 @@
/**
* Generates a prompt instructing the user to describe an image in detail, tailored to different types of visual content.
* @param {boolean} pluralized - Whether to pluralize the prompt for multiple images.
* @returns {string} - The generated vision prompt.
*/
const createVisionPrompt = (pluralized = false) => {
return `Please describe the image${
pluralized ? 's' : ''
} in detail, covering relevant aspects such as:
For photographs, illustrations, or artwork:
- The main subject(s) and their appearance, positioning, and actions
- The setting, background, and any notable objects or elements
- Colors, lighting, and overall mood or atmosphere
- Any interesting details, textures, or patterns
- The style, technique, or medium used (if discernible)
For screenshots or images containing text:
- The content and purpose of the text
- The layout, formatting, and organization of the information
- Any notable visual elements, such as logos, icons, or graphics
- The overall context or message conveyed by the screenshot
For graphs, charts, or data visualizations:
- The type of graph or chart (e.g., bar graph, line chart, pie chart)
- The variables being compared or analyzed
- Any trends, patterns, or outliers in the data
- The axis labels, scales, and units of measurement
- The title, legend, and any additional context provided
Be as specific and descriptive as possible while maintaining clarity and concision.`;
};
module.exports = createVisionPrompt;

View file

@ -4,6 +4,7 @@ const handleInputs = require('./handleInputs');
const instructions = require('./instructions');
const titlePrompts = require('./titlePrompts');
const truncateText = require('./truncateText');
const createVisionPrompt = require('./createVisionPrompt');
const createContextHandlers = require('./createContextHandlers');
module.exports = {
@ -13,5 +14,6 @@ module.exports = {
...instructions,
...titlePrompts,
truncateText,
createVisionPrompt,
createContextHandlers,
};