Add visual workflow builder, fix critical bugs, and add Arabic i18n support
This commit is contained in:
590
src/js/utils/sanitize.ts
Normal file
590
src/js/utils/sanitize.ts
Normal file
@@ -0,0 +1,590 @@
|
||||
import { PDFDocument, PDFName } from 'pdf-lib';
|
||||
|
||||
/**
 * Switches that select which sanitization passes `sanitizePdf` runs.
 * Every flag controls one independent pass; a `false` flag skips that pass.
 */
export interface SanitizeOptions {
  /** Flatten interactive form fields; if flattening throws, the catalog `AcroForm` entry is removed instead. */
  flattenForms: boolean;
  /** Clear the Info dictionary, the catalog `Metadata` and `PieceInfo` entries, and the trailer `ID`. */
  removeMetadata: boolean;
  /** Delete the `Annots` entry from every page. */
  removeAnnotations: boolean;
  /** Remove JavaScript name trees, open/additional actions and JavaScript actions on annotations and fields. */
  removeJavascript: boolean;
  /** Remove embedded-file name trees, `FileAttachment` annotations and the catalog `Collection` entry. */
  removeEmbeddedFiles: boolean;
  /** Remove `OCProperties` entries and the `Properties` entry of page resources. */
  removeLayers: boolean;
  /** Remove link annotations, annotations carrying navigation actions or destinations, and named destinations. */
  removeLinks: boolean;
  /** Remove the catalog `StructTreeRoot` entry and page `StructParents` entries. */
  removeStructureTree: boolean;
  /** Remove the catalog `MarkInfo` entry. */
  removeMarkInfo: boolean;
  /** Remove embedded font file entries from font descriptors. */
  removeFonts: boolean;
}
|
||||
|
||||
/**
 * Default pass selection: every pass is enabled except font removal.
 */
export const defaultSanitizeOptions: SanitizeOptions = {
  // Interactive content and document-level information.
  flattenForms: true,
  removeMetadata: true,
  removeAnnotations: true,
  removeJavascript: true,
  removeEmbeddedFiles: true,

  // Structural and navigational extras.
  removeLayers: true,
  removeLinks: true,
  removeStructureTree: true,
  removeMarkInfo: true,

  // The only pass that is off by default.
  removeFonts: false,
};
|
||||
|
||||
/**
 * Clears document-level metadata from `pdfDoc` in place.
 *
 * Deletes every key of the Info dictionary, then assigns empty values to the
 * title, author, subject, keywords, creator and producer through the pdf-lib
 * setters. Afterwards it makes three best-effort removals, each logged with
 * `console.warn` on failure: the catalog `Metadata` entry, the trailer `ID`,
 * and the catalog `PieceInfo` entry.
 *
 * Failures while emptying the Info dictionary or calling the setters are not
 * caught here and propagate to the caller.
 */
function removeMetadataFromDoc(pdfDoc: PDFDocument) {
  // `getInfoDict` is not part of pdf-lib's public typings, hence the cast.
  const info = (pdfDoc as any).getInfoDict();
  for (const entryName of info.keys()) {
    info.delete(entryName);
  }

  pdfDoc.setTitle('');
  pdfDoc.setAuthor('');
  pdfDoc.setSubject('');
  pdfDoc.setKeywords([]);
  pdfDoc.setCreator('');
  pdfDoc.setProducer('');

  // Deletes one catalog entry if present; problems are only logged.
  const dropCatalogEntry = (entryName: string, failureLabel: string) => {
    try {
      const catalog = (pdfDoc.catalog as any).dict;
      const key = PDFName.of(entryName);
      if (catalog.has(key)) {
        catalog.delete(key);
      }
    } catch (e: any) {
      console.warn(failureLabel, e.message);
    }
  };

  dropCatalogEntry('Metadata', 'Could not remove XMP metadata:');

  try {
    const trailerInfo = (pdfDoc.context as any).trailerInfo;
    if (trailerInfo) {
      delete trailerInfo.ID;
    }
  } catch (e: any) {
    console.warn('Could not remove document IDs:', e.message);
  }

  dropCatalogEntry('PieceInfo', 'Could not remove PieceInfo:');
}
|
||||
|
||||
/**
 * Deletes the `Annots` entry from every page of `pdfDoc`.
 * A failure on one page is logged with `console.warn` and does not stop the
 * remaining pages from being processed.
 */
function removeAnnotationsFromDoc(pdfDoc: PDFDocument) {
  pdfDoc.getPages().forEach((page) => {
    try {
      page.node.delete(PDFName.of('Annots'));
    } catch (e: any) {
      console.warn('Could not remove annotations from page:', e.message);
    }
  });
}
|
||||
|
||||
/**
 * Flattens the document's form via pdf-lib's `getForm().flatten()`.
 * Errors are not caught here; the caller decides how to recover.
 */
function flattenFormsInDoc(pdfDoc: PDFDocument) {
  pdfDoc.getForm().flatten();
}
|
||||
|
||||
/**
 * Strips JavaScript and automatically triggered actions from `pdfDoc` in place.
 *
 * Cleared, in order:
 *  1. the `javaScripts` list held on the pdf-lib document object,
 *  2. the `JavaScript` entry of the catalog `Names` dictionary,
 *  3. the catalog `OpenAction` and `AA` entries,
 *  4. for every page: the page `AA` entry, and for every annotation its `AA`
 *     entry plus its `A` entry when that action is of type `JavaScript`,
 *  5. the `A` and `AA` entries of every AcroForm field.
 *
 * Two gaps of a flat scan are covered:
 *  - Form fields are walked recursively through `Kids`, so actions attached to
 *    nested fields are removed as well (a visited set guards against cycles).
 *  - When an annotation's action is not JavaScript itself but its `Next` chain
 *    contains a JavaScript action, the `Next` entry is dropped while the
 *    primary action is kept.
 *
 * Per-object failures are logged with `console.warn` and skipped. Errors while
 * reading the catalog itself propagate to the caller.
 */
function removeJavascriptFromDoc(pdfDoc: PDFDocument) {
  const context = pdfDoc.context;
  const actionKey = PDFName.of('A');
  const additionalActionsKey = PDFName.of('AA');
  const nextKey = PDFName.of('Next');
  const kidsKey = PDFName.of('Kids');

  // Action type without the leading slash of the name, e.g. "JavaScript".
  const actionTypeOf = (action: any): string | undefined =>
    action.get(PDFName.of('S'))?.toString().substring(1);

  // True when `value` (an action dictionary, or an array of them, possibly
  // behind a reference) is or chains to a JavaScript action.
  const chainContainsJavascript = (
    value: any,
    seen: Set<unknown>
  ): boolean => {
    const node = context.lookup(value) as any;
    if (!node || seen.has(node)) return false;
    seen.add(node);

    if (typeof node.asArray === 'function') {
      return node
        .asArray()
        .some((entry: any) => chainContainsJavascript(entry, seen));
    }
    if (typeof node.get !== 'function') return false;
    if (actionTypeOf(node) === 'JavaScript') return true;

    const next = node.get(nextKey);
    return next ? chainContainsJavascript(next, seen) : false;
  };

  // Removes `A`/`AA` from one field and then from all of its descendants.
  const scrubField = (fieldRef: any, seen: Set<unknown>) => {
    try {
      const field = context.lookup(fieldRef) as any;
      if (seen.has(field)) return; // already handled (or a cyclic Kids link)
      seen.add(field);

      if (field.has(actionKey)) {
        field.delete(actionKey);
      }
      if (field.has(additionalActionsKey)) {
        field.delete(additionalActionsKey);
      }

      const kidsRef = field.get(kidsKey);
      if (kidsRef) {
        const kids = context.lookup(kidsRef) as any;
        for (const kidRef of kids.asArray()) {
          scrubField(kidRef, seen);
        }
      }
    } catch (e: any) {
      console.warn('Could not process field for JS:', e.message);
    }
  };

  if ((pdfDoc as any).javaScripts && (pdfDoc as any).javaScripts.length > 0) {
    (pdfDoc as any).javaScripts = [];
  }

  const catalogDict = (pdfDoc.catalog as any).dict;

  const namesRef = catalogDict.get(PDFName.of('Names'));
  if (namesRef) {
    try {
      const namesDict = context.lookup(namesRef) as any;
      if (namesDict.has(PDFName.of('JavaScript'))) {
        namesDict.delete(PDFName.of('JavaScript'));
      }
    } catch (e: any) {
      console.warn('Could not access Names/JavaScript:', e.message);
    }
  }

  if (catalogDict.has(PDFName.of('OpenAction'))) {
    catalogDict.delete(PDFName.of('OpenAction'));
  }

  if (catalogDict.has(additionalActionsKey)) {
    catalogDict.delete(additionalActionsKey);
  }

  for (const page of pdfDoc.getPages()) {
    try {
      const pageDict = page.node;

      if (pageDict.has(additionalActionsKey)) {
        pageDict.delete(additionalActionsKey);
      }

      const annotRefs = pageDict.Annots()?.asArray() || [];
      for (const annotRef of annotRefs) {
        try {
          const annot = context.lookup(annotRef) as any;

          if (annot.has(actionKey)) {
            try {
              const actionDict = context.lookup(annot.get(actionKey)) as any;

              if (actionTypeOf(actionDict) === 'JavaScript') {
                annot.delete(actionKey);
              } else {
                // Keep the harmless primary action, but do not let it chain
                // into a script.
                const next = actionDict.get(nextKey);
                if (next && chainContainsJavascript(next, new Set())) {
                  actionDict.delete(nextKey);
                }
              }
            } catch (e: any) {
              console.warn('Could not read action:', e.message);
            }
          }

          if (annot.has(additionalActionsKey)) {
            annot.delete(additionalActionsKey);
          }
        } catch (e: any) {
          console.warn('Could not process annotation for JS:', e.message);
        }
      }
    } catch (e: any) {
      console.warn('Could not remove page actions:', e.message);
    }
  }

  try {
    const acroFormRef = catalogDict.get(PDFName.of('AcroForm'));
    if (acroFormRef) {
      const acroFormDict = context.lookup(acroFormRef) as any;
      const fieldsRef = acroFormDict.get(PDFName.of('Fields'));

      if (fieldsRef) {
        const fieldsArray = context.lookup(fieldsRef) as any;
        const visited = new Set<unknown>();
        for (const fieldRef of fieldsArray.asArray()) {
          scrubField(fieldRef, visited);
        }
      }
    }
  } catch (e: any) {
    console.warn('Could not process form fields for JS:', e.message);
  }
}
|
||||
|
||||
/**
 * Removes embedded files and file attachments from `pdfDoc` in place.
 *
 * Steps: delete the `EmbeddedFiles` entry of the catalog `Names` dictionary,
 * delete a catalog-level `EmbeddedFiles` entry, drop every page annotation
 * whose subtype is `FileAttachment`, empty the `embeddedFiles` list held on
 * the pdf-lib document object, and finally delete the catalog `Collection`
 * entry.
 *
 * An annotation that cannot be inspected is kept. Page-level failures are
 * logged with `console.warn`.
 */
function removeEmbeddedFilesFromDoc(pdfDoc: PDFDocument) {
  const catalog = (pdfDoc.catalog as any).dict;

  const nameTreesRef = catalog.get(PDFName.of('Names'));
  if (nameTreesRef) {
    try {
      const nameTrees = pdfDoc.context.lookup(nameTreesRef) as any;
      if (nameTrees.has(PDFName.of('EmbeddedFiles'))) {
        nameTrees.delete(PDFName.of('EmbeddedFiles'));
      }
    } catch (e: any) {
      console.warn('Could not access Names/EmbeddedFiles:', e.message);
    }
  }

  if (catalog.has(PDFName.of('EmbeddedFiles'))) {
    catalog.delete(PDFName.of('EmbeddedFiles'));
  }

  for (const page of pdfDoc.getPages()) {
    try {
      const existing = page.node.Annots()?.asArray() || [];

      const survivors = existing.filter((ref) => {
        try {
          const annot = pdfDoc.context.lookup(ref) as any;
          const subtype = annot
            .get(PDFName.of('Subtype'))
            ?.toString()
            .substring(1);
          return subtype !== 'FileAttachment';
        } catch {
          // Unreadable annotation: err on the side of keeping it.
          return true;
        }
      });

      // Nothing was filtered out, so the page is left untouched.
      if (survivors.length === existing.length) continue;

      if (survivors.length > 0) {
        page.node.set(PDFName.of('Annots'), pdfDoc.context.obj(survivors));
      } else {
        page.node.delete(PDFName.of('Annots'));
      }
    } catch (pageError: any) {
      console.warn(
        `Could not process page for attachments: ${pageError.message}`
      );
    }
  }

  const pendingFiles = (pdfDoc as any).embeddedFiles;
  if (pendingFiles && pendingFiles.length > 0) {
    (pdfDoc as any).embeddedFiles = [];
  }

  if (catalog.has(PDFName.of('Collection'))) {
    catalog.delete(PDFName.of('Collection'));
  }
}
|
||||
|
||||
/**
 * Removes optional-content (layer) data from `pdfDoc` in place.
 *
 * Deletes the catalog `OCProperties` entry, then for each page deletes a
 * page-level `OCProperties` entry and the `Properties` entry of the
 * `Resources` dictionary set directly on that page. Per-page failures are
 * logged with `console.warn`.
 */
function removeLayersFromDoc(pdfDoc: PDFDocument) {
  const catalog = (pdfDoc.catalog as any).dict;
  if (catalog.has(PDFName.of('OCProperties'))) {
    catalog.delete(PDFName.of('OCProperties'));
  }

  pdfDoc.getPages().forEach((page) => {
    try {
      const pageDict = page.node;

      if (pageDict.has(PDFName.of('OCProperties'))) {
        pageDict.delete(PDFName.of('OCProperties'));
      }

      const resourcesRef = pageDict.get(PDFName.of('Resources'));
      if (!resourcesRef) return;

      try {
        const resources = pdfDoc.context.lookup(resourcesRef) as any;
        if (resources.has(PDFName.of('Properties'))) {
          resources.delete(PDFName.of('Properties'));
        }
      } catch (e: any) {
        console.warn('Could not access Resources:', e.message);
      }
    } catch (e: any) {
      console.warn('Could not remove page layers:', e.message);
    }
  });
}
|
||||
|
||||
/**
 * Removes links and named destinations from `pdfDoc` in place.
 *
 * On every page an annotation is dropped when its subtype is `Link`, when its
 * `A` action is of type `URI`, `Launch`, `GoTo` or `GoToR`, or when it has a
 * `Dest` entry. A page's `Annots` entry is rewritten only if at least one
 * annotation was dropped, and deleted when none remain. Annotations that
 * cannot be inspected are kept.
 *
 * Afterwards the `Dests` entry is deleted from the catalog `Names` dictionary
 * and from the catalog itself. Failures are logged with `console.warn`.
 */
function removeLinksFromDoc(pdfDoc: PDFDocument) {
  const navigationActions = ['URI', 'Launch', 'GoTo', 'GoToR'];

  // Classifies one annotation dictionary; throws if it cannot be read.
  const isLinkLike = (annot: any): boolean => {
    const subtype = annot.get(PDFName.of('Subtype'))?.toString().substring(1);
    if (subtype === 'Link') return true;

    let navigates = false;
    const actionRef = annot.get(PDFName.of('A'));
    if (actionRef) {
      try {
        const action = pdfDoc.context.lookup(actionRef) as any;
        const actionType = action
          .get(PDFName.of('S'))
          ?.toString()
          .substring(1);
        navigates = navigationActions.includes(actionType);
      } catch (e: any) {
        console.warn('Could not read action:', e.message);
      }
    }

    const dest = annot.get(PDFName.of('Dest'));
    return navigates || Boolean(dest);
  };

  pdfDoc.getPages().forEach((page, pageIndex) => {
    try {
      const pageDict = page.node;

      const annotsEntry = pageDict.get(PDFName.of('Annots'));
      if (!annotsEntry) return;

      const annotRefs = (pdfDoc.context.lookup(annotsEntry) as any).asArray();
      if (annotRefs.length === 0) return;

      const kept: any[] = [];
      let removedCount = 0;

      for (const ref of annotRefs) {
        try {
          if (isLinkLike(pdfDoc.context.lookup(ref))) {
            removedCount += 1;
          } else {
            kept.push(ref);
          }
        } catch (e: any) {
          console.warn('Could not process annotation:', e.message);
          kept.push(ref);
        }
      }

      if (removedCount === 0) return;

      if (kept.length > 0) {
        pageDict.set(PDFName.of('Annots'), pdfDoc.context.obj(kept));
      } else {
        pageDict.delete(PDFName.of('Annots'));
      }
    } catch (pageError: any) {
      console.warn(
        `Could not process page ${pageIndex + 1} for links: ${pageError.message}`
      );
    }
  });

  try {
    const catalog = (pdfDoc.catalog as any).dict;

    const nameTreesRef = catalog.get(PDFName.of('Names'));
    if (nameTreesRef) {
      try {
        const nameTrees = pdfDoc.context.lookup(nameTreesRef) as any;
        if (nameTrees.has(PDFName.of('Dests'))) {
          nameTrees.delete(PDFName.of('Dests'));
        }
      } catch (e: any) {
        console.warn('Could not access Names/Dests:', e.message);
      }
    }

    if (catalog.has(PDFName.of('Dests'))) {
      catalog.delete(PDFName.of('Dests'));
    }
  } catch (e: any) {
    console.warn('Could not remove named destinations:', e.message);
  }
}
|
||||
|
||||
/**
 * Removes logical-structure entries from `pdfDoc` in place: the catalog
 * `StructTreeRoot` entry, each page's `StructParents` entry, and a
 * catalog-level `ParentTree` entry if one exists. Per-page failures are
 * logged with `console.warn`.
 */
function removeStructureTreeFromDoc(pdfDoc: PDFDocument) {
  const catalog = (pdfDoc.catalog as any).dict;

  // Deletes a catalog entry only when it is present.
  const dropFromCatalog = (entryName: string) => {
    const key = PDFName.of(entryName);
    if (catalog.has(key)) {
      catalog.delete(key);
    }
  };

  dropFromCatalog('StructTreeRoot');

  pdfDoc.getPages().forEach((page) => {
    try {
      const structParentsKey = PDFName.of('StructParents');
      if (page.node.has(structParentsKey)) {
        page.node.delete(structParentsKey);
      }
    } catch (e: any) {
      console.warn('Could not remove page StructParents:', e.message);
    }
  });

  dropFromCatalog('ParentTree');
}
|
||||
|
||||
/**
 * Deletes the `MarkInfo` entry, and a `Marked` entry if one exists, from the
 * document catalog.
 */
function removeMarkInfoFromDoc(pdfDoc: PDFDocument) {
  const catalog = (pdfDoc.catalog as any).dict;

  for (const entryName of ['MarkInfo', 'Marked']) {
    const key = PDFName.of(entryName);
    if (catalog.has(key)) {
      catalog.delete(key);
    }
  }
}
|
||||
|
||||
/**
 * Removes embedded font files from the fonts referenced by each page.
 *
 * For every entry of the `Font` dictionary in the `Resources` set directly on
 * a page, the `FontFile`, `FontFile2` and `FontFile3` entries are deleted from
 * the font's `FontDescriptor`. Composite fonts keep their descriptor on the
 * entries of `DescendantFonts` rather than on the top-level font dictionary,
 * so those descendants are processed as well; without that step the embedded
 * font files of composite fonts would survive.
 *
 * Finally the `fonts` list held on the pdf-lib document object is emptied.
 * Failures are logged with `console.warn` at the narrowest level (font, font
 * dictionary, resources, page) and processing continues.
 */
function removeFontsFromDoc(pdfDoc: PDFDocument) {
  const context = pdfDoc.context;
  const fontFileKeys = ['FontFile', 'FontFile2', 'FontFile3'];

  // Deletes the embedded font file entries from one font's descriptor, if any.
  const stripFontFiles = (font: any) => {
    const descriptorKey = PDFName.of('FontDescriptor');
    if (!font.has(descriptorKey)) return;

    const descriptor = context.lookup(font.get(descriptorKey)) as any;
    for (const key of fontFileKeys) {
      if (descriptor.has(PDFName.of(key))) {
        descriptor.delete(PDFName.of(key));
      }
    }
  };

  pdfDoc.getPages().forEach((page, pageIndex) => {
    try {
      const resourcesRef = page.node.get(PDFName.of('Resources'));
      if (!resourcesRef) return;

      try {
        const resourcesDict = context.lookup(resourcesRef) as any;
        if (!resourcesDict.has(PDFName.of('Font'))) return;

        const fontRef = resourcesDict.get(PDFName.of('Font'));
        try {
          const fontDict = context.lookup(fontRef) as any;

          for (const fontKey of fontDict.keys()) {
            try {
              const font = context.lookup(fontDict.get(fontKey)) as any;
              stripFontFiles(font);

              // Composite fonts: the descriptor sits on each descendant.
              const descendantsRef = font.get(PDFName.of('DescendantFonts'));
              if (descendantsRef) {
                const descendants = context.lookup(descendantsRef) as any;
                for (const descendantRef of descendants.asArray()) {
                  stripFontFiles(context.lookup(descendantRef));
                }
              }
            } catch (e: any) {
              console.warn(`Could not process font ${fontKey}:`, e.message);
            }
          }
        } catch (e: any) {
          console.warn('Could not access font dictionary:', e.message);
        }
      } catch (e: any) {
        console.warn('Could not access Resources for fonts:', e.message);
      }
    } catch (e: any) {
      console.warn(
        `Could not remove fonts from page ${pageIndex + 1}:`,
        e.message
      );
    }
  });

  if ((pdfDoc as any).fonts && (pdfDoc as any).fonts.length > 0) {
    (pdfDoc as any).fonts = [];
  }
}
|
||||
|
||||
/**
 * Loads a PDF, applies the sanitization passes enabled in `options`, and
 * saves the result.
 *
 * Pass order: form flattening, metadata, annotations, JavaScript, embedded
 * files, layers, links, structure tree, mark info, fonts. If form flattening
 * throws, the catalog `AcroForm` entry is removed as a fallback. The metadata
 * and annotation passes run unguarded, so an exception from them rejects the
 * returned promise; every other pass logs its failure with `console.warn` and
 * processing continues.
 *
 * @param pdfBytes - Raw bytes of the PDF to sanitize.
 * @param options - Flags selecting which passes run.
 * @returns The modified document together with a copy of its saved bytes.
 */
export async function sanitizePdf(
  pdfBytes: Uint8Array,
  options: SanitizeOptions
): Promise<{ pdfDoc: PDFDocument; bytes: Uint8Array }> {
  const pdfDoc = await PDFDocument.load(pdfBytes);

  if (options.flattenForms) {
    try {
      flattenFormsInDoc(pdfDoc);
    } catch (e: any) {
      console.warn(`Could not flatten forms: ${e.message}`);
      // Fallback: drop the whole form definition instead.
      try {
        const catalog = (pdfDoc.catalog as any).dict;
        const acroFormKey = PDFName.of('AcroForm');
        if (catalog.has(acroFormKey)) {
          catalog.delete(acroFormKey);
        }
      } catch (removeError: any) {
        console.warn('Could not remove AcroForm:', removeError.message);
      }
    }
  }

  if (options.removeMetadata) {
    removeMetadataFromDoc(pdfDoc);
  }

  if (options.removeAnnotations) {
    removeAnnotationsFromDoc(pdfDoc);
  }

  // The remaining passes share one shape: run if enabled, warn on failure.
  const guardedPasses: Array<
    [enabled: boolean, failurePrefix: string, pass: (doc: PDFDocument) => void]
  > = [
    [options.removeJavascript, 'Could not remove JavaScript', removeJavascriptFromDoc],
    [options.removeEmbeddedFiles, 'Could not remove embedded files', removeEmbeddedFilesFromDoc],
    [options.removeLayers, 'Could not remove layers', removeLayersFromDoc],
    [options.removeLinks, 'Could not remove links', removeLinksFromDoc],
    [options.removeStructureTree, 'Could not remove structure tree', removeStructureTreeFromDoc],
    [options.removeMarkInfo, 'Could not remove MarkInfo', removeMarkInfoFromDoc],
    [options.removeFonts, 'Could not remove fonts', removeFontsFromDoc],
  ];

  for (const [enabled, failurePrefix, pass] of guardedPasses) {
    if (!enabled) continue;
    try {
      pass(pdfDoc);
    } catch (e: any) {
      console.warn(`${failurePrefix}: ${e.message}`);
    }
  }

  const savedBytes = await pdfDoc.save();
  return { pdfDoc, bytes: new Uint8Array(savedBytes) };
}
|
||||
Reference in New Issue
Block a user