feat: implement PDF attachment extraction functionality with web worker support
- Added a new worker script to handle extraction of embedded attachments from PDF files. - Created TypeScript definitions for the message structure and response types. - Updated the main extraction logic to utilize the worker for improved performance and responsiveness. - Integrated the extraction feature into the UI, allowing users to extract attachments as a ZIP file. - Enhanced error handling and user feedback during the extraction process.
This commit is contained in:
106
public/workers/extract-attachments.worker.js
Normal file
106
public/workers/extract-attachments.worker.js
Normal file
@@ -0,0 +1,106 @@
|
||||
self.importScripts('/coherentpdf.browser.min.js');
|
||||
|
||||
function extractAttachmentsFromPDFsInWorker(fileBuffers, fileNames) {
|
||||
try {
|
||||
const allAttachments = [];
|
||||
const totalFiles = fileBuffers.length;
|
||||
|
||||
for (let i = 0; i < totalFiles; i++) {
|
||||
const buffer = fileBuffers[i];
|
||||
const fileName = fileNames[i];
|
||||
const uint8Array = new Uint8Array(buffer);
|
||||
|
||||
let pdf;
|
||||
try {
|
||||
pdf = coherentpdf.fromMemory(uint8Array, '');
|
||||
} catch (error) {
|
||||
console.warn(`Failed to load PDF: ${fileName}`, error);
|
||||
continue;
|
||||
}
|
||||
|
||||
coherentpdf.startGetAttachments(pdf);
|
||||
const attachmentCount = coherentpdf.numberGetAttachments();
|
||||
|
||||
if (attachmentCount === 0) {
|
||||
console.warn(`No attachments found in ${fileName}`);
|
||||
coherentpdf.deletePdf(pdf);
|
||||
continue;
|
||||
}
|
||||
|
||||
const baseName = fileName.replace(/\.pdf$/i, '');
|
||||
for (let j = 0; j < attachmentCount; j++) {
|
||||
try {
|
||||
const attachmentName = coherentpdf.getAttachmentName(j);
|
||||
const attachmentPage = coherentpdf.getAttachmentPage(j);
|
||||
const attachmentData = coherentpdf.getAttachmentData(j);
|
||||
|
||||
let uniqueName = attachmentName;
|
||||
let counter = 1;
|
||||
while (allAttachments.some(att => att.name === uniqueName)) {
|
||||
const nameParts = attachmentName.split('.');
|
||||
if (nameParts.length > 1) {
|
||||
const extension = nameParts.pop();
|
||||
uniqueName = `${nameParts.join('.')}_${counter}.${extension}`;
|
||||
} else {
|
||||
uniqueName = `${attachmentName}_${counter}`;
|
||||
}
|
||||
counter++;
|
||||
}
|
||||
|
||||
if (attachmentPage > 0) {
|
||||
uniqueName = `${baseName}_page${attachmentPage}_${uniqueName}`;
|
||||
} else {
|
||||
uniqueName = `${baseName}_${uniqueName}`;
|
||||
}
|
||||
|
||||
allAttachments.push({
|
||||
name: uniqueName,
|
||||
data: attachmentData.buffer.slice(0)
|
||||
});
|
||||
} catch (error) {
|
||||
console.warn(`Failed to extract attachment ${j} from ${fileName}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
coherentpdf.endGetAttachments();
|
||||
coherentpdf.deletePdf(pdf);
|
||||
}
|
||||
|
||||
if (allAttachments.length === 0) {
|
||||
self.postMessage({
|
||||
status: 'error',
|
||||
message: 'No attachments were found in the selected PDF(s).'
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const response = {
|
||||
status: 'success',
|
||||
attachments: []
|
||||
};
|
||||
|
||||
const transferBuffers = [];
|
||||
for (const attachment of allAttachments) {
|
||||
response.attachments.push({
|
||||
name: attachment.name,
|
||||
data: attachment.data
|
||||
});
|
||||
transferBuffers.push(attachment.data);
|
||||
}
|
||||
|
||||
self.postMessage(response, transferBuffers);
|
||||
} catch (error) {
|
||||
self.postMessage({
|
||||
status: 'error',
|
||||
message: error instanceof Error
|
||||
? error.message
|
||||
: 'Unknown error occurred during attachment extraction.'
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
self.onmessage = (e) => {
|
||||
if (e.data.command === 'extract-attachments') {
|
||||
extractAttachmentsFromPDFsInWorker(e.data.fileBuffers, e.data.fileNames);
|
||||
}
|
||||
};
|
||||
Reference in New Issue
Block a user