👓 feat: Vision Support for Assistants (#2195)

* refactor(assistants/chat): use promises to speed up initialization, initialize shared variables, include `attachedFileIds` to streamRunManager * chore: additional typedefs * fix(OpenAIClient): handle edge case where attachments promise is resolved * feat: createVisionPrompt * feat: Vision Support for Assistants
2026-02-23 19:04:10 +01:00 · 2024-03-24 23:43:00 -04:00 · 2024-03-24 23:43:00 -04:00 · 798e8763d0
commit 798e8763d0
parent 1f0fb497f8
16 changed files with 376 additions and 100 deletions
--- a/api/app/clients/OpenAIClient.js
+++ b/api/app/clients/OpenAIClient.js
@ -92,7 +92,11 @@ class OpenAIClient extends BaseClient {
    }

    this.defaultVisionModel = this.options.visionModel ?? 'gpt-4-vision-preview';
-    this.options.attachments?.then((attachments) => this.checkVisionRequest(attachments));
+    if (typeof this.options.attachments?.then === 'function') {
+      this.options.attachments.then((attachments) => this.checkVisionRequest(attachments));
+    } else {
+      this.checkVisionRequest(this.options.attachments);
+    }

    const { OPENROUTER_API_KEY, OPENAI_FORCE_PROMPT } = process.env ?? {};
    if (OPENROUTER_API_KEY && !this.azure) {
--- a/api/app/clients/prompts/createVisionPrompt.js
+++ b/api/app/clients/prompts/createVisionPrompt.js
@ -0,0 +1,34 @@
+/**
+ * Generates a prompt instructing the user to describe an image in detail, tailored to different types of visual content.
+ * @param {boolean} pluralized - Whether to pluralize the prompt for multiple images.
+ * @returns {string} - The generated vision prompt.
+ */
+const createVisionPrompt = (pluralized = false) => {
+  return `Please describe the image${
+    pluralized ? 's' : ''
+  } in detail, covering relevant aspects such as:
+
+  For photographs, illustrations, or artwork:
+  - The main subject(s) and their appearance, positioning, and actions
+  - The setting, background, and any notable objects or elements
+  - Colors, lighting, and overall mood or atmosphere
+  - Any interesting details, textures, or patterns
+  - The style, technique, or medium used (if discernible)
+  
+  For screenshots or images containing text:
+  - The content and purpose of the text
+  - The layout, formatting, and organization of the information
+  - Any notable visual elements, such as logos, icons, or graphics
+  - The overall context or message conveyed by the screenshot
+  
+  For graphs, charts, or data visualizations:
+  - The type of graph or chart (e.g., bar graph, line chart, pie chart)
+  - The variables being compared or analyzed
+  - Any trends, patterns, or outliers in the data
+  - The axis labels, scales, and units of measurement
+  - The title, legend, and any additional context provided
+  
+  Be as specific and descriptive as possible while maintaining clarity and concision.`;
+};
+
+module.exports = createVisionPrompt;
--- a/api/app/clients/prompts/index.js
+++ b/api/app/clients/prompts/index.js
@ -4,6 +4,7 @@ const handleInputs = require('./handleInputs');
 const instructions = require('./instructions');
 const titlePrompts = require('./titlePrompts');
 const truncateText = require('./truncateText');
+const createVisionPrompt = require('./createVisionPrompt');
 const createContextHandlers = require('./createContextHandlers');

 module.exports = {
@ -13,5 +14,6 @@ module.exports = {
  ...instructions,
  ...titlePrompts,
  truncateText,
+  createVisionPrompt,
  createContextHandlers,
 };