feat: implement PDF attachment extraction functionality with web worker support
- Added a new worker script to handle extraction of embedded attachments from PDF files.
- Created TypeScript definitions for the message structure and response types.
- Updated the main extraction logic to utilize the worker for improved performance and responsiveness.
- Integrated the extraction feature into the UI, allowing users to extract attachments as a ZIP file.
- Enhanced error handling and user feedback during the extraction process.
This commit is contained in:
19
public/workers/extract-attachments.worker.d.ts
vendored
Normal file
19
public/workers/extract-attachments.worker.d.ts
vendored
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
/**
 * Ambient declaration of the `coherentpdf` global used by the worker script.
 * Its type is taken from the project's own coherentpdf typings.
 */
declare const coherentpdf: typeof import('../../src/types/coherentpdf.global').coherentpdf;

/** Request posted to the worker to start an extraction run. */
interface ExtractAttachmentsMessage {
  /** Discriminator identifying the request. */
  command: 'extract-attachments';
  /** Raw bytes of each selected PDF. */
  fileBuffers: Array<ArrayBuffer>;
  /** File names, index-aligned with `fileBuffers`. */
  fileNames: Array<string>;
}

/** Reply posted by the worker when at least one attachment was extracted. */
interface ExtractAttachmentSuccessResponse {
  status: 'success';
  /** Extracted files: the entry name to use and the file contents. */
  attachments: { name: string; data: ArrayBuffer }[];
}

/** Reply posted by the worker when extraction failed or found nothing. */
interface ExtractAttachmentErrorResponse {
  status: 'error';
  /** Human-readable description of the failure. */
  message: string;
}

/** Every reply the worker can post, discriminated by `status`. */
type ExtractAttachmentResponse =
  | ExtractAttachmentSuccessResponse
  | ExtractAttachmentErrorResponse;
|
||||||
106
public/workers/extract-attachments.worker.js
Normal file
106
public/workers/extract-attachments.worker.js
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
self.importScripts('/coherentpdf.browser.min.js');
|
||||||
|
|
||||||
|
/**
 * Extracts every embedded attachment from the given PDFs and posts the outcome
 * back to the main thread with `self.postMessage`.
 *
 * Posts `{ status: 'success', attachments }` (attachment buffers are passed as
 * the transfer list) when at least one attachment was extracted, otherwise
 * `{ status: 'error', message }`. PDFs that fail to load and attachments that
 * fail to extract are logged with `console.warn` and skipped.
 *
 * @param {ArrayBuffer[]} fileBuffers - Raw bytes of each PDF.
 * @param {string[]} fileNames - File names, index-aligned with `fileBuffers`.
 * @returns {void}
 */
function extractAttachmentsFromPDFsInWorker(fileBuffers, fileNames) {
  try {
    const allAttachments = [];
    // Final (prefixed) names already handed out, so lookups are O(1).
    const usedNames = new Set();

    for (let i = 0; i < fileBuffers.length; i++) {
      const fileName = fileNames[i];
      const uint8Array = new Uint8Array(fileBuffers[i]);

      let pdf;
      try {
        pdf = coherentpdf.fromMemory(uint8Array, '');
      } catch (error) {
        console.warn(`Failed to load PDF: ${fileName}`, error);
        continue;
      }

      try {
        collectPdfAttachments(pdf, fileName, allAttachments, usedNames);
      } finally {
        // Release the document even when extraction throws part-way through.
        coherentpdf.deletePdf(pdf);
      }
    }

    if (allAttachments.length === 0) {
      self.postMessage({
        status: 'error',
        message: 'No attachments were found in the selected PDF(s).'
      });
      return;
    }

    self.postMessage(
      { status: 'success', attachments: allAttachments },
      allAttachments.map((attachment) => attachment.data)
    );
  } catch (error) {
    self.postMessage({
      status: 'error',
      message: error instanceof Error
        ? error.message
        : 'Unknown error occurred during attachment extraction.'
    });
  }
}

/**
 * Appends every attachment of one loaded PDF to `allAttachments`.
 * Every `startGetAttachments` call is paired with `endGetAttachments`,
 * including the zero-attachment and error paths.
 */
function collectPdfAttachments(pdf, fileName, allAttachments, usedNames) {
  coherentpdf.startGetAttachments(pdf);
  try {
    const attachmentCount = coherentpdf.numberGetAttachments();
    if (attachmentCount === 0) {
      console.warn(`No attachments found in ${fileName}`);
      return;
    }

    const baseName = fileName.replace(/\.pdf$/i, '');
    for (let j = 0; j < attachmentCount; j++) {
      try {
        const attachmentName = coherentpdf.getAttachmentName(j);
        const attachmentPage = coherentpdf.getAttachmentPage(j);
        const attachmentData = coherentpdf.getAttachmentData(j);

        const prefix = attachmentPage > 0
          ? `${baseName}_page${attachmentPage}_`
          : `${baseName}_`;

        // Copy only the bytes covered by the view; `buffer.slice(0)` would copy
        // the whole backing buffer if the view is a window into a larger one.
        const start = attachmentData.byteOffset;
        const data = attachmentData.buffer.slice(start, start + attachmentData.byteLength);

        allAttachments.push({
          name: buildUniqueAttachmentName(prefix, attachmentName, usedNames),
          data
        });
      } catch (error) {
        console.warn(`Failed to extract attachment ${j} from ${fileName}:`, error);
      }
    }
  } finally {
    coherentpdf.endGetAttachments();
  }
}

/**
 * Returns `prefix + attachmentName`, inserting `_<n>` before the extension when
 * that name is already taken, and records the result in `usedNames`.
 * Uniqueness is checked on the final prefixed name, i.e. the name that is
 * actually stored and later written into the archive.
 */
function buildUniqueAttachmentName(prefix, attachmentName, usedNames) {
  const dotIndex = attachmentName.lastIndexOf('.');
  const stem = dotIndex === -1 ? attachmentName : attachmentName.slice(0, dotIndex);
  const extension = dotIndex === -1 ? '' : attachmentName.slice(dotIndex);

  let candidate = `${prefix}${attachmentName}`;
  for (let counter = 1; usedNames.has(candidate); counter++) {
    candidate = `${prefix}${stem}_${counter}${extension}`;
  }
  usedNames.add(candidate);
  return candidate;
}
|
||||||
|
|
||||||
|
// Entry point for messages from the main thread. Only the
// 'extract-attachments' command is acted on; anything else is ignored.
self.onmessage = (event) => {
  const request = event.data;
  if (request.command !== 'extract-attachments') {
    return;
  }
  extractAttachmentsFromPDFsInWorker(request.fileBuffers, request.fileNames);
};
|
||||||
@@ -318,13 +318,12 @@ export const categories = [
|
|||||||
icon: 'paperclip',
|
icon: 'paperclip',
|
||||||
subtitle: 'Embed one or more files into your PDF.',
|
subtitle: 'Embed one or more files into your PDF.',
|
||||||
},
|
},
|
||||||
// TODO@ALAM - MAKE THIS LATER, ONCE INTEGRATED WITH CPDF
|
{
|
||||||
// {
|
id: 'extract-attachments',
|
||||||
// id: 'extract-attachments',
|
name: 'Extract Attachments',
|
||||||
// name: 'Extract Attachments',
|
icon: 'download',
|
||||||
// icon: 'download',
|
subtitle: 'Extract all embedded files from PDF(s) as a ZIP.',
|
||||||
// subtitle: 'Extract all embedded files from PDF(s) as a ZIP.',
|
},
|
||||||
// },
|
|
||||||
// {
|
// {
|
||||||
// id: 'edit-attachments',
|
// id: 'edit-attachments',
|
||||||
// name: 'Edit Attachments',
|
// name: 'Edit Attachments',
|
||||||
|
|||||||
@@ -1,88 +1,130 @@
|
|||||||
// TODO@ALAM - USE CPDF HERE
|
import { downloadFile, formatBytes } from '../utils/helpers.js';
|
||||||
|
import { state } from '../state.js';
|
||||||
|
import JSZip from 'jszip';
|
||||||
|
|
||||||
// import { showLoader, hideLoader, showAlert } from '../ui.js';
|
const worker = new Worker('/workers/extract-attachments.worker.js');
|
||||||
// import { downloadFile, readFileAsArrayBuffer } from '../utils/helpers.js';
|
|
||||||
// import { state } from '../state.js';
|
|
||||||
// import { PDFDocument as PDFLibDocument } from 'pdf-lib';
|
|
||||||
// import JSZip from 'jszip';
|
|
||||||
|
|
||||||
// export async function extractAttachments() {
|
interface ExtractAttachmentSuccessResponse {
|
||||||
// if (state.files.length === 0) {
|
status: 'success';
|
||||||
// showAlert('No Files', 'Please select at least one PDF file.');
|
attachments: Array<{ name: string; data: ArrayBuffer }>;
|
||||||
// return;
|
}
|
||||||
// }
|
|
||||||
|
|
||||||
// showLoader('Extracting attachments...');
|
interface ExtractAttachmentErrorResponse {
|
||||||
// try {
|
status: 'error';
|
||||||
// const zip = new JSZip();
|
message: string;
|
||||||
// let totalAttachments = 0;
|
}
|
||||||
|
|
||||||
// for (const file of state.files) {
|
type ExtractAttachmentResponse = ExtractAttachmentSuccessResponse | ExtractAttachmentErrorResponse;
|
||||||
// const pdfBytes = await readFileAsArrayBuffer(file);
|
|
||||||
// const pdfDoc = await PDFLibDocument.load(pdfBytes as ArrayBuffer, {
|
|
||||||
// ignoreEncryption: true,
|
|
||||||
// });
|
|
||||||
|
|
||||||
// const embeddedFiles = pdfDoc.context.enumerateIndirectObjects()
|
/**
 * Starts attachment extraction for the files currently in `state.files`.
 *
 * Shows a 'No Files' error status and returns when no file is selected.
 * Otherwise it disables `#process-btn`, reads each file into an ArrayBuffer
 * one after another, and posts an 'extract-attachments' message to the worker,
 * passing the buffers as the transfer list. It does not wait for the worker's
 * reply; the result is handled by `worker.onmessage`. If reading the files
 * fails, the error is logged and shown, and the button is re-enabled.
 */
export async function extractAttachments() {
|
||||||
// .filter(([ref, obj]: any) => {
|
if (state.files.length === 0) {
|
||||||
// // obj must be a PDFDict
|
showStatus('No Files', 'error');
|
||||||
// if (obj && typeof obj.get === 'function') {
|
return;
|
||||||
// const type = obj.get('Type');
|
}
|
||||||
// return type && type.toString() === '/Filespec';
|
|
||||||
// }
|
|
||||||
// return false;
|
|
||||||
// });
|
|
||||||
|
|
||||||
// if (embeddedFiles.length === 0) {
|
document.getElementById('process-btn')?.classList.add('opacity-50', 'cursor-not-allowed');
|
||||||
// console.warn(`No attachments found in ${file.name}`);
|
document.getElementById('process-btn')?.setAttribute('disabled', 'true');
|
||||||
// continue;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // Extract attachments
|
showStatus('Reading files (Main Thread)...', 'info');
|
||||||
// const baseName = file.name.replace(/\.pdf$/i, '');
|
|
||||||
// for (let i = 0; i < embeddedFiles.length; i++) {
|
|
||||||
// try {
|
|
||||||
// const [ref, fileSpec] = embeddedFiles[i];
|
|
||||||
// const fileSpecDict = fileSpec as any;
|
|
||||||
|
|
||||||
// // Get attachment name
|
try {
|
||||||
// const fileName = fileSpecDict.get('UF')?.decodeText() ||
|
const fileBuffers: ArrayBuffer[] = [];
|
||||||
// fileSpecDict.get('F')?.decodeText() ||
|
const fileNames: string[] = [];
|
||||||
// `attachment-${i + 1}`;
|
|
||||||
|
|
||||||
// // Get embedded file stream
|
for (const file of state.files) {
|
||||||
// const ef = fileSpecDict.get('EF');
|
const buffer = await file.arrayBuffer();
|
||||||
// if (ef) {
|
fileBuffers.push(buffer);
|
||||||
// const fRef = ef.get('F') || ef.get('UF');
|
fileNames.push(file.name);
|
||||||
// if (fRef) {
|
}
|
||||||
// const fileStream = pdfDoc.context.lookup(fRef);
|
|
||||||
// if (fileStream) {
|
|
||||||
// const fileData = (fileStream as any).getContents();
|
|
||||||
// zip.file(`${baseName}_${fileName}`, fileData);
|
|
||||||
// totalAttachments++;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// } catch (e) {
|
|
||||||
// console.warn(`Failed to extract attachment ${i} from ${file.name}:`, e);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if (totalAttachments === 0) {
|
showStatus(`Extracting attachments from ${state.files.length} file(s)...`, 'info');
|
||||||
// showAlert('No Attachments', 'No attachments were found in the selected PDF(s).');
|
|
||||||
// hideLoader();
|
|
||||||
// return;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// const zipBlob = await zip.generateAsync({ type: 'blob' });
|
const message: ExtractAttachmentsMessage = {
|
||||||
// downloadFile(zipBlob, 'extracted-attachments.zip');
|
command: 'extract-attachments',
|
||||||
// showAlert('Success', `Extracted ${totalAttachments} attachment(s) successfully!`);
|
fileBuffers,
|
||||||
// } catch (e) {
|
fileNames,
|
||||||
// console.error(e);
|
};
|
||||||
// showAlert('Error', 'Failed to extract attachments. The PDF may not contain attachments or may be corrupted.');
|
|
||||||
// } finally {
|
|
||||||
// hideLoader();
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
const transferables = fileBuffers.map(buf => buf);
|
||||||
|
worker.postMessage(message, transferables);
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error reading files:', error);
|
||||||
|
showStatus(
|
||||||
|
`Error reading files: ${error instanceof Error ? error.message : 'Unknown error occurred'}`,
|
||||||
|
'error'
|
||||||
|
);
|
||||||
|
document.getElementById('process-btn')?.classList.remove('opacity-50', 'cursor-not-allowed');
|
||||||
|
document.getElementById('process-btn')?.removeAttribute('disabled');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Handles the worker's reply to an 'extract-attachments' request.
 *
 * The process button is re-enabled as soon as any reply arrives. On success
 * the attachments are packed into a ZIP, the download is started, the file
 * selection is reset and the button is disabled again. On error, or when
 * building the ZIP fails, the problem is logged and shown in the status area.
 */
worker.onmessage = async (e: MessageEvent<ExtractAttachmentResponse>) => {
  const setProcessButtonEnabled = (enabled: boolean) => {
    const processBtn = document.getElementById('process-btn');
    if (enabled) {
      processBtn?.classList.remove('opacity-50', 'cursor-not-allowed');
      processBtn?.removeAttribute('disabled');
    } else {
      processBtn?.classList.add('opacity-50', 'cursor-not-allowed');
      processBtn?.setAttribute('disabled', 'true');
    }
  };

  setProcessButtonEnabled(true);

  if (e.data.status === 'success') {
    const attachments = e.data.attachments;

    // Both adding entries and generating the archive can fail; without this
    // try/catch the failure would be an unhandled rejection with no feedback.
    try {
      const zip = new JSZip();
      let totalSize = 0;

      for (const attachment of attachments) {
        zip.file(attachment.name, new Uint8Array(attachment.data));
        totalSize += attachment.data.byteLength;
      }

      const zipBlob = await zip.generateAsync({ type: 'blob' });
      downloadFile(zipBlob, 'extracted-attachments.zip');
      showStatus(
        `Extraction completed! ${attachments.length} attachment(s) in zip file (${formatBytes(totalSize)}). Download started.`,
        'success'
      );

      // Reset the selection so the tool is ready for a fresh set of files.
      state.files = [];
      const fileDisplayArea = document.getElementById('file-display-area');
      if (fileDisplayArea) {
        fileDisplayArea.innerHTML = '';
        fileDisplayArea.classList.add('hidden');
      }
      const fileInput = document.getElementById('file-input') as HTMLInputElement;
      if (fileInput) {
        fileInput.value = '';
      }
      // Nothing is selected any more, so processing stays disabled.
      setProcessButtonEnabled(false);
    } catch (error) {
      console.error('Error creating ZIP file:', error);
      showStatus(
        `Error creating ZIP file: ${error instanceof Error ? error.message : 'Unknown error occurred'}`,
        'error'
      );
    }
  } else if (e.data.status === 'error') {
    const errorMessage = e.data.message || 'Unknown error occurred in worker.';
    console.error('Worker Error:', errorMessage);
    showStatus(`Error: ${errorMessage}`, 'error');
  }
};
|
||||||
|
|
||||||
|
// Errors raised by the worker itself (as opposed to an 'error' reply message):
// log the event, tell the user, and make the process button usable again.
worker.onerror = (error) => {
  console.error('Worker error:', error);
  showStatus('Worker error occurred. Check console for details.', 'error');

  const unlockProcessButton = () => {
    document.getElementById('process-btn')?.classList.remove('opacity-50', 'cursor-not-allowed');
    document.getElementById('process-btn')?.removeAttribute('disabled');
  };
  unlockProcessButton();
};
|
||||||
|
|
||||||
|
/**
 * Displays a message in the `#status-message` element.
 *
 * Does nothing when the element is missing. Otherwise it sets the text,
 * replaces the element's classes with the base styling plus a colour pair
 * chosen by `type`, and removes the `hidden` class.
 *
 * @param message - Text to display.
 * @param type - 'success' (green), 'error' (red) or 'info' (blue, default).
 */
function showStatus(message: string, type: 'success' | 'error' | 'info' = 'info') {
  const statusEl = document.getElementById('status-message') as HTMLElement;
  if (!statusEl) return;

  let palette: string;
  switch (type) {
    case 'success':
      palette = 'bg-green-900 text-green-200';
      break;
    case 'error':
      palette = 'bg-red-900 text-red-200';
      break;
    default:
      palette = 'bg-blue-900 text-blue-200';
  }

  statusEl.textContent = message;
  statusEl.className = `mt-4 p-3 rounded-lg text-sm ${palette}`;
  statusEl.classList.remove('hidden');
}
|
||||||
|
|
||||||
|
/** Request posted to the extraction worker by `extractAttachments`. */
interface ExtractAttachmentsMessage {
  /** Discriminator identifying the request. */
  command: 'extract-attachments';
  /** Raw bytes of each selected file. */
  fileBuffers: Array<ArrayBuffer>;
  /** File names, index-aligned with `fileBuffers`. */
  fileNames: Array<string>;
}
|
||||||
@@ -63,7 +63,7 @@ import {
|
|||||||
import { alternateMerge, setupAlternateMergeTool } from './alternate-merge.js';
|
import { alternateMerge, setupAlternateMergeTool } from './alternate-merge.js';
|
||||||
import { linearizePdf } from './linearize.js';
|
import { linearizePdf } from './linearize.js';
|
||||||
import { addAttachments, setupAddAttachmentsTool } from './add-attachments.js';
|
import { addAttachments, setupAddAttachmentsTool } from './add-attachments.js';
|
||||||
// import { extractAttachments } from './extract-attachments.js';
|
import { extractAttachments } from './extract-attachments.js';
|
||||||
// import { editAttachments, setupEditAttachmentsTool } from './edit-attachments.js';
|
// import { editAttachments, setupEditAttachmentsTool } from './edit-attachments.js';
|
||||||
import { sanitizePdf } from './sanitize-pdf.js';
|
import { sanitizePdf } from './sanitize-pdf.js';
|
||||||
import { removeRestrictions } from './remove-restrictions.js';
|
import { removeRestrictions } from './remove-restrictions.js';
|
||||||
@@ -140,7 +140,7 @@ export const toolLogic = {
|
|||||||
process: addAttachments,
|
process: addAttachments,
|
||||||
setup: setupAddAttachmentsTool,
|
setup: setupAddAttachmentsTool,
|
||||||
},
|
},
|
||||||
// 'extract-attachments': extractAttachments,
|
'extract-attachments': extractAttachments,
|
||||||
// 'edit-attachments': {
|
// 'edit-attachments': {
|
||||||
// process: editAttachments,
|
// process: editAttachments,
|
||||||
// setup: setupEditAttachmentsTool,
|
// setup: setupEditAttachmentsTool,
|
||||||
|
|||||||
Reference in New Issue
Block a user