feat: Update Azure Document Intelligence upload function to return Markdown result

This commit is contained in:
Ruben Talstra 2025-05-24 21:20:47 +02:00
parent 5dfad6b77b
commit 8711ca8b3a
No known key found for this signature in database
GPG key ID: 2A5A7174A60F3BEA
2 changed files with 82 additions and 55 deletions

View file

@ -3,7 +3,7 @@ const fs = require('fs');
const { logger } = require('~/config');
/**
* Uploads a document to Azure Document Intelligence API and processes the result.
* Uploads a document to Azure Document Intelligence API and returns the Markdown result.
*
* @param {Object} params - The parameters for the Azure Document Intelligence request.
* @param {string} params.filePath - The path to the file on disk.
@ -13,15 +13,18 @@ const { logger } = require('~/config');
* @returns {Promise<string>} - The Markdown content of the Document Intelligence analysis result.
*/
async function uploadAzureDocumentIntelligence({ filePath, apiKey, endpoint, modelId }) {
// Read and encode file
const fileBuffer = fs.readFileSync(filePath);
const base64Source = fileBuffer.toString('base64');
// Build URL (ensure no trailing slash on endpoint)
const url = `${endpoint.replace(/\/+$/, '')}/documentModels/${modelId}:analyze?outputContentFormat=markdown`;
try {
// Kick off the analysis
const response = await axios.post(
`${endpoint}/documentModels/${modelId}/analyze?outputContentFormat=markdown`,
{
base64Source,
},
url,
{ base64Source },
{
headers: {
'Ocp-Apim-Subscription-Key': apiKey,
@ -29,32 +32,37 @@ async function uploadAzureDocumentIntelligence({ filePath, apiKey, endpoint, mod
},
},
);
const operationLocation = response.headers['Operation-Location'];
// Polling for the result
let result;
while (true) {
const pollResponse = await axios.get(operationLocation, {
headers: {
'Ocp-Apim-Subscription-Key': apiKey,
},
});
if (pollResponse.data.status === 'succeeded') {
const resultUrl = pollResponse.data.resultUrl; // URL to fetch the analysis result
const resultResponse = await axios.get(resultUrl, {
headers: {
'Ocp-Apim-Subscription-Key': apiKey,
},
});
result = resultResponse.data.analyzeResult.content; // Final analysis result
break;
} else if (pollResponse.data.status === 'failed') {
throw new Error('Azure Document Intelligence processing failed.');
}
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before polling again
// Axios lower-cases header keys, but allow either form
const headers = response.headers || {};
const operationLocation = headers['operation-location'] || headers['Operation-Location'];
if (!operationLocation) {
throw new Error('Missing Operation-Location header in Azure response.');
}
return result;
// Poll until done
let resultContent;
while (true) {
const pollResponse = await axios.get(operationLocation, {
headers: { 'Ocp-Apim-Subscription-Key': apiKey },
});
const { status, resultUrl } = pollResponse.data;
if (status === 'succeeded') {
const final = await axios.get(resultUrl, {
headers: { 'Ocp-Apim-Subscription-Key': apiKey },
});
resultContent = final.data.analyzeResult.content;
break;
}
if (status === 'failed') {
throw new Error('Azure Document Intelligence processing failed.');
}
// Wait 2s before retry
await new Promise((r) => setTimeout(r, 2000));
}
return resultContent;
} catch (error) {
logger.error('Error performing Azure Document Intelligence:', error.message);
throw error;

View file

@ -6,9 +6,7 @@ const mockAxios = {
response: { use: jest.fn(), eject: jest.fn() },
},
create: jest.fn().mockReturnValue({
defaults: {
proxy: null,
},
defaults: { proxy: null },
get: jest.fn().mockResolvedValue({ data: {} }),
post: jest.fn().mockResolvedValue({ data: {} }),
put: jest.fn().mockResolvedValue({ data: {} }),
@ -30,46 +28,52 @@ const mockAxios = {
jest.mock('axios', () => mockAxios);
jest.mock('fs');
jest.mock('~/config', () => ({
logger: {
error: jest.fn(),
},
createAxiosInstance: () => mockAxios,
}));
jest.mock('~/server/services/Tools/credentials', () => ({
loadAuthValues: jest.fn(),
logger: { error: jest.fn() },
}));
const { uploadAzureDocumentIntelligence } = require('./crud');
describe('AzureDocumentIntelligence Service', () => {
it('should upload a document and process the result using Azure Document Intelligence API', async () => {
const mockFileBuffer = Buffer.from('test file content');
const mockBase64Source = mockFileBuffer.toString('base64');
const mockOperationLocation = 'https://azure-ocr-endpoint.com/operation';
const mockResultUrl = 'https://azure-ocr-endpoint.com/result';
const mockFinalResult = { analyzeResult: { content: 'Final analysis result' } };
beforeEach(() => {
mockAxios.reset();
fs.readFileSync.mockReset();
});
it('should upload and poll until it gets the Markdown result', async () => {
const mockFileBuffer = Buffer.from('test file content');
const mockBase64 = mockFileBuffer.toString('base64');
const mockOpLocation = 'https://azure-ocr-endpoint.com/operations/123';
const mockResultUrl = 'https://azure-ocr-endpoint.com/results/123';
const mockFinal = { analyzeResult: { content: 'Final analysis result' } };
// fs.readFileSync returns our buffer
fs.readFileSync.mockReturnValue(mockFileBuffer);
mockAxios.post
.mockResolvedValueOnce({ headers: { 'Operation-Location': mockOperationLocation } }) // Initial upload
.mockResolvedValueOnce({ data: { status: 'succeeded', resultUrl: mockResultUrl } }); // Polling success
// First axios.post => returns Operation-Location header
mockAxios.post.mockResolvedValueOnce({
headers: { 'Operation-Location': mockOpLocation },
});
// First axios.get => poll success, returns status + resultUrl
// Second axios.get => fetch final result
mockAxios.get
.mockResolvedValueOnce({ data: { status: 'succeeded', resultUrl: mockResultUrl } }) // Polling
.mockResolvedValueOnce({ data: mockFinalResult }); // Final result fetch
.mockResolvedValueOnce({ data: { status: 'succeeded', resultUrl: mockResultUrl } })
.mockResolvedValueOnce({ data: mockFinal });
const result = await uploadAzureDocumentIntelligence({
filePath: '/path/to/test.pdf',
apiKey: 'azure-api-key',
endpoint: 'https://azure-ocr-endpoint.com',
endpoint: 'https://azure-ocr-endpoint.com/',
modelId: 'prebuilt-layout',
});
// Validate read
expect(fs.readFileSync).toHaveBeenCalledWith('/path/to/test.pdf');
// Validate initial POST
expect(mockAxios.post).toHaveBeenCalledWith(
'https://azure-ocr-endpoint.com/documentModels/prebuilt-invoice:analyze',
{ base64Source: mockBase64Source },
'https://azure-ocr-endpoint.com/documentModels/prebuilt-layout:analyze?outputContentFormat=markdown',
{ base64Source: mockBase64 },
expect.objectContaining({
headers: expect.objectContaining({
'Ocp-Apim-Subscription-Key': 'azure-api-key',
@ -77,8 +81,23 @@ describe('AzureDocumentIntelligence Service', () => {
}),
}),
);
expect(mockAxios.get).toHaveBeenCalledWith(mockOperationLocation, expect.any(Object));
expect(mockAxios.get).toHaveBeenCalledWith(mockResultUrl, expect.any(Object));
expect(result).toEqual(mockFinalResult.analyzeResult.content);
// Validate polling GET
expect(mockAxios.get).toHaveBeenCalledWith(
mockOpLocation,
expect.objectContaining({
headers: expect.objectContaining({ 'Ocp-Apim-Subscription-Key': 'azure-api-key' }),
}),
);
// Validate final fetch GET
expect(mockAxios.get).toHaveBeenCalledWith(
mockResultUrl,
expect.objectContaining({
headers: expect.objectContaining({ 'Ocp-Apim-Subscription-Key': 'azure-api-key' }),
}),
);
expect(result).toEqual('Final analysis result');
});
});