Add visual workflow builder, fix critical bugs, and add Arabic i18n support

2026-02-08 17:05:40 +05:30
parent 36ebb3b429
commit 5d8b83e105
118 changed files with 14151 additions and 2357 deletions
--- a/src/js/utils/sanitize.ts
+++ b/src/js/utils/sanitize.ts
@@ -0,0 +1,590 @@
+import { PDFDocument, PDFName } from 'pdf-lib';
+
+export interface SanitizeOptions { // Per-pass switches read by sanitizePdf; each flag gates one removal step.
+  flattenForms: boolean; // Flatten the form; if that throws, sanitizePdf deletes the catalog's AcroForm entry instead.
+  removeMetadata: boolean; // Clear the Info dictionary, the catalog's Metadata and PieceInfo entries, and the trailer ID.
+  removeAnnotations: boolean; // Delete the Annots entry from every page.
+  removeJavascript: boolean; // Remove JavaScript name entries, OpenAction/AA entries, and actions on annotations and form fields.
+  removeEmbeddedFiles: boolean; // Remove EmbeddedFiles entries, FileAttachment annotations, and the catalog's Collection entry.
+  removeLayers: boolean; // Remove OCProperties and each page's Resources/Properties entry.
+  removeLinks: boolean; // Remove Link annotations, annotations with URI/Launch/GoTo/GoToR actions or a Dest, and Dests entries.
+  removeStructureTree: boolean; // Remove the catalog's StructTreeRoot and each page's StructParents.
+  removeMarkInfo: boolean; // Remove the catalog's MarkInfo (and Marked) entries.
+  removeFonts: boolean; // Delete FontFile/FontFile2/FontFile3 entries from font descriptors.
+}
+
+export const defaultSanitizeOptions: SanitizeOptions = { // Default configuration: every pass enabled except font removal.
+  flattenForms: true,
+  removeMetadata: true,
+  removeAnnotations: true,
+  removeJavascript: true,
+  removeEmbeddedFiles: true,
+  removeLayers: true,
+  removeLinks: true,
+  removeStructureTree: true,
+  removeMarkInfo: true,
+  removeFonts: false, // The only pass that is off by default.
+};
+
+/**
+ * Blanks the document-level metadata of `pdfDoc` in place.
+ *
+ * Every key of the Info dictionary is deleted and the standard fields are
+ * then reset to empty values. After that the catalog's `Metadata` entry, the
+ * trailer `ID` and the catalog's `PieceInfo` entry are removed on a
+ * best-effort basis: a failure in any of those three steps is reported with
+ * `console.warn` and the following steps still run.
+ */
+function removeMetadataFromDoc(pdfDoc: PDFDocument) {
+  // getInfoDict is reached through an `any` cast (presumably because it is
+  // not exposed in pdf-lib's public typings).
+  const info = (pdfDoc as any).getInfoDict();
+  for (const entryName of info.keys()) {
+    info.delete(entryName);
+  }
+
+  pdfDoc.setTitle('');
+  pdfDoc.setAuthor('');
+  pdfDoc.setSubject('');
+  pdfDoc.setKeywords([]);
+  pdfDoc.setCreator('');
+  pdfDoc.setProducer('');
+
+  // Deletes one entry from the catalog dictionary, logging instead of throwing.
+  const dropCatalogEntry = (entry: string, warning: string) => {
+    try {
+      const catalog = (pdfDoc.catalog as any).dict;
+      const key = PDFName.of(entry);
+      if (catalog.has(key)) {
+        catalog.delete(key);
+      }
+    } catch (err: any) {
+      console.warn(warning, err.message);
+    }
+  };
+
+  dropCatalogEntry('Metadata', 'Could not remove XMP metadata:');
+
+  try {
+    const trailer = (pdfDoc.context as any).trailerInfo;
+    if (trailer) {
+      delete trailer.ID;
+    }
+  } catch (err: any) {
+    console.warn('Could not remove document IDs:', err.message);
+  }
+
+  dropCatalogEntry('PieceInfo', 'Could not remove PieceInfo:');
+}
+
+/**
+ * Deletes the `Annots` entry from every page of `pdfDoc`.
+ * A failure on one page is logged with `console.warn` and the remaining
+ * pages are still processed.
+ */
+function removeAnnotationsFromDoc(pdfDoc: PDFDocument) {
+  pdfDoc.getPages().forEach((page) => {
+    try {
+      page.node.delete(PDFName.of('Annots'));
+    } catch (err: any) {
+      console.warn('Could not remove annotations from page:', err.message);
+    }
+  });
+}
+
+/**
+ * Flattens the form of `pdfDoc` in place.
+ * Any error raised by pdf-lib propagates to the caller.
+ */
+function flattenFormsInDoc(pdfDoc: PDFDocument) {
+  pdfDoc.getForm().flatten();
+}
+
+/**
+ * Deletes the `A` and `AA` entries of one form field and of every field
+ * reachable from it through `Kids`.
+ *
+ * `seen` records the references already visited so that a malformed, cyclic
+ * `Kids` chain cannot recurse forever. A failure on one field is logged with
+ * `console.warn`; fields processed before the failure keep their changes.
+ */
+function stripFieldActions(
+  pdfDoc: PDFDocument,
+  fieldRef: any,
+  seen: Set<any>
+) {
+  if (seen.has(fieldRef)) return;
+  seen.add(fieldRef);
+
+  try {
+    const field = pdfDoc.context.lookup(fieldRef) as any;
+
+    if (field.has(PDFName.of('A'))) {
+      field.delete(PDFName.of('A'));
+    }
+
+    if (field.has(PDFName.of('AA'))) {
+      field.delete(PDFName.of('AA'));
+    }
+
+    // Fields form a tree: children hang off `Kids` and can carry their own
+    // actions, so they need the same treatment as the top-level entries.
+    const kidsRef = field.get(PDFName.of('Kids'));
+    if (kidsRef) {
+      const kids = pdfDoc.context.lookup(kidsRef) as any;
+      for (const kidRef of kids.asArray()) {
+        stripFieldActions(pdfDoc, kidRef, seen);
+      }
+    }
+  } catch (e: any) {
+    console.warn('Could not process field for JS:', e.message);
+  }
+}
+
+/**
+ * Removes JavaScript and script-triggering actions from `pdfDoc` in place.
+ *
+ * Steps, in order:
+ *  1. empty the `javaScripts` list held on the document object, if any;
+ *  2. delete `JavaScript` from the catalog's `Names` dictionary;
+ *  3. delete the catalog's `OpenAction` and `AA` entries;
+ *  4. per page: delete the page's `AA`, delete an annotation's `A` when the
+ *     action's `S` entry is `JavaScript`, and delete every annotation's `AA`;
+ *  5. delete `A` and `AA` from every AcroForm field, descending through
+ *     `Kids` so nested fields are covered as well as top-level ones.
+ *
+ * Failures inside steps 2, 4 and 5 are logged with `console.warn` and do not
+ * stop the remaining work; errors in steps 1 and 3 propagate to the caller.
+ */
+function removeJavascriptFromDoc(pdfDoc: PDFDocument) {
+  // Step 1: property is read through an `any` cast.
+  if ((pdfDoc as any).javaScripts && (pdfDoc as any).javaScripts.length > 0) {
+    (pdfDoc as any).javaScripts = [];
+  }
+
+  const catalogDict = (pdfDoc.catalog as any).dict;
+
+  // Step 2: document-level scripts registered under Names/JavaScript.
+  const namesRef = catalogDict.get(PDFName.of('Names'));
+  if (namesRef) {
+    try {
+      const namesDict = pdfDoc.context.lookup(namesRef) as any;
+      if (namesDict.has(PDFName.of('JavaScript'))) {
+        namesDict.delete(PDFName.of('JavaScript'));
+      }
+    } catch (e: any) {
+      console.warn('Could not access Names/JavaScript:', e.message);
+    }
+  }
+
+  // Step 3: catalog-level actions are removed regardless of their type.
+  if (catalogDict.has(PDFName.of('OpenAction'))) {
+    catalogDict.delete(PDFName.of('OpenAction'));
+  }
+
+  if (catalogDict.has(PDFName.of('AA'))) {
+    catalogDict.delete(PDFName.of('AA'));
+  }
+
+  // Step 4: page-level and annotation-level actions.
+  const pages = pdfDoc.getPages();
+  for (const page of pages) {
+    try {
+      const pageDict = page.node;
+
+      if (pageDict.has(PDFName.of('AA'))) {
+        pageDict.delete(PDFName.of('AA'));
+      }
+
+      const annotRefs = pageDict.Annots()?.asArray() || [];
+      for (const annotRef of annotRefs) {
+        try {
+          const annot = pdfDoc.context.lookup(annotRef) as any;
+
+          if (annot.has(PDFName.of('A'))) {
+            const actionRef = annot.get(PDFName.of('A'));
+            try {
+              const actionDict = pdfDoc.context.lookup(actionRef) as any;
+              // substring(1) drops the first character of the name's string
+              // form (presumably the leading '/').
+              const actionType = actionDict
+                .get(PDFName.of('S'))
+                ?.toString()
+                .substring(1);
+
+              // Other action types on annotations are left in place here.
+              if (actionType === 'JavaScript') {
+                annot.delete(PDFName.of('A'));
+              }
+            } catch (e: any) {
+              console.warn('Could not read action:', e.message);
+            }
+          }
+
+          if (annot.has(PDFName.of('AA'))) {
+            annot.delete(PDFName.of('AA'));
+          }
+        } catch (e: any) {
+          console.warn('Could not process annotation for JS:', e.message);
+        }
+      }
+    } catch (e: any) {
+      console.warn('Could not remove page actions:', e.message);
+    }
+  }
+
+  // Step 5: walk the whole AcroForm field tree, not just its first level.
+  try {
+    const acroFormRef = catalogDict.get(PDFName.of('AcroForm'));
+    if (acroFormRef) {
+      const acroFormDict = pdfDoc.context.lookup(acroFormRef) as any;
+      const fieldsRef = acroFormDict.get(PDFName.of('Fields'));
+
+      if (fieldsRef) {
+        const fieldsArray = pdfDoc.context.lookup(fieldsRef) as any;
+        const seen = new Set<any>();
+
+        for (const fieldRef of fieldsArray.asArray()) {
+          stripFieldActions(pdfDoc, fieldRef, seen);
+        }
+      }
+    }
+  } catch (e: any) {
+    console.warn('Could not process form fields for JS:', e.message);
+  }
+}
+
+/**
+ * Removes file attachments from `pdfDoc` in place.
+ *
+ * In order: deletes `EmbeddedFiles` from the catalog's `Names` dictionary and
+ * from the catalog itself, drops every `FileAttachment` annotation from each
+ * page (rewriting or deleting the page's `Annots` only when something was
+ * dropped), empties the `embeddedFiles` list held on the document object,
+ * and finally deletes the catalog's `Collection` entry.
+ *
+ * Problems reading the `Names` dictionary or a page are logged with
+ * `console.warn`; an annotation that cannot be inspected is kept.
+ */
+function removeEmbeddedFilesFromDoc(pdfDoc: PDFDocument) {
+  const catalog = (pdfDoc.catalog as any).dict;
+  const doc = pdfDoc as any;
+
+  const names = catalog.get(PDFName.of('Names'));
+  if (names) {
+    try {
+      const namesDict = pdfDoc.context.lookup(names) as any;
+      if (namesDict.has(PDFName.of('EmbeddedFiles'))) {
+        namesDict.delete(PDFName.of('EmbeddedFiles'));
+      }
+    } catch (err: any) {
+      console.warn('Could not access Names/EmbeddedFiles:', err.message);
+    }
+  }
+
+  if (catalog.has(PDFName.of('EmbeddedFiles'))) {
+    catalog.delete(PDFName.of('EmbeddedFiles'));
+  }
+
+  // True when the annotation must stay on the page; unreadable ones stay.
+  const shouldKeep = (ref: any): boolean => {
+    try {
+      const annot = pdfDoc.context.lookup(ref) as any;
+      const subtype = annot
+        .get(PDFName.of('Subtype'))
+        ?.toString()
+        .substring(1);
+      return subtype !== 'FileAttachment';
+    } catch (_ignored) {
+      return true;
+    }
+  };
+
+  pdfDoc.getPages().forEach((page) => {
+    try {
+      const annotRefs = page.node.Annots()?.asArray() || [];
+      const kept: any[] = annotRefs.filter(shouldKeep);
+
+      // Nothing was filtered out: leave the page untouched.
+      if (kept.length === annotRefs.length) return;
+
+      if (kept.length > 0) {
+        page.node.set(PDFName.of('Annots'), pdfDoc.context.obj(kept));
+      } else {
+        page.node.delete(PDFName.of('Annots'));
+      }
+    } catch (pageError: any) {
+      console.warn(
+        `Could not process page for attachments: ${pageError.message}`
+      );
+    }
+  });
+
+  if (doc.embeddedFiles && doc.embeddedFiles.length > 0) {
+    doc.embeddedFiles = [];
+  }
+
+  if (catalog.has(PDFName.of('Collection'))) {
+    catalog.delete(PDFName.of('Collection'));
+  }
+}
+
+/**
+ * Removes layer (optional content) bookkeeping from `pdfDoc` in place.
+ *
+ * Deletes `OCProperties` from the catalog and from each page dictionary, and
+ * deletes `Properties` from the `Resources` entry found directly on each
+ * page. Per-page and per-resources failures are logged with `console.warn`
+ * and do not stop the loop. Page content is not rewritten.
+ */
+function removeLayersFromDoc(pdfDoc: PDFDocument) {
+  const catalog = (pdfDoc.catalog as any).dict;
+  if (catalog.has(PDFName.of('OCProperties'))) {
+    catalog.delete(PDFName.of('OCProperties'));
+  }
+
+  pdfDoc.getPages().forEach((page) => {
+    try {
+      const node = page.node;
+
+      if (node.has(PDFName.of('OCProperties'))) {
+        node.delete(PDFName.of('OCProperties'));
+      }
+
+      const resources = node.get(PDFName.of('Resources'));
+      if (!resources) return;
+
+      try {
+        const resourcesDict = pdfDoc.context.lookup(resources) as any;
+        if (resourcesDict.has(PDFName.of('Properties'))) {
+          resourcesDict.delete(PDFName.of('Properties'));
+        }
+      } catch (err: any) {
+        console.warn('Could not access Resources:', err.message);
+      }
+    } catch (err: any) {
+      console.warn('Could not remove page layers:', err.message);
+    }
+  });
+}
+
+/**
+ * Removes link-like annotations and named destinations from `pdfDoc`.
+ *
+ * An annotation is treated as a link when its `Subtype` is `Link`, when its
+ * `A` action has an `S` of `URI`, `Launch`, `GoTo` or `GoToR`, or when it has
+ * a `Dest` entry. A page's `Annots` is rewritten (or deleted when nothing is
+ * left) only if at least one annotation on that page was classified as a
+ * link. Annotations that cannot be inspected are kept.
+ *
+ * Afterwards `Dests` is deleted from the catalog's `Names` dictionary and
+ * from the catalog itself. All failures are logged with `console.warn`.
+ */
+function removeLinksFromDoc(pdfDoc: PDFDocument) {
+  const linkActionTypes = ['URI', 'Launch', 'GoTo', 'GoToR'];
+
+  pdfDoc.getPages().forEach((page, pageIndex) => {
+    try {
+      const pageDict = page.node;
+
+      const annotsEntry = pageDict.get(PDFName.of('Annots'));
+      if (!annotsEntry) return;
+
+      const annotRefs = (pdfDoc.context.lookup(annotsEntry) as any).asArray();
+      if (annotRefs.length === 0) return;
+
+      const survivors: any[] = [];
+      // Set as soon as any annotation on this page is classified as a link.
+      let removedAny = false;
+
+      for (const ref of annotRefs) {
+        try {
+          const annot = pdfDoc.context.lookup(ref) as any;
+          const subtype = annot
+            .get(PDFName.of('Subtype'))
+            ?.toString()
+            .substring(1);
+
+          let isLink = false;
+
+          if (subtype === 'Link') {
+            isLink = true;
+            removedAny = true;
+          } else {
+            const action = annot.get(PDFName.of('A'));
+            if (action) {
+              try {
+                const actionDict = pdfDoc.context.lookup(action) as any;
+                const actionType = actionDict
+                  .get(PDFName.of('S'))
+                  ?.toString()
+                  .substring(1);
+
+                if (linkActionTypes.includes(actionType)) {
+                  isLink = true;
+                  removedAny = true;
+                }
+              } catch (err: any) {
+                console.warn('Could not read action:', err.message);
+              }
+            }
+
+            // A bare destination also counts as a link.
+            const dest = annot.get(PDFName.of('Dest'));
+            if (dest && !isLink) {
+              isLink = true;
+              removedAny = true;
+            }
+          }
+
+          if (!isLink) {
+            survivors.push(ref);
+          }
+        } catch (err: any) {
+          console.warn('Could not process annotation:', err.message);
+          survivors.push(ref);
+        }
+      }
+
+      if (!removedAny) return;
+
+      if (survivors.length > 0) {
+        pageDict.set(PDFName.of('Annots'), pdfDoc.context.obj(survivors));
+      } else {
+        pageDict.delete(PDFName.of('Annots'));
+      }
+    } catch (pageError: any) {
+      console.warn(
+        `Could not process page ${pageIndex + 1} for links: ${pageError.message}`
+      );
+    }
+  });
+
+  try {
+    const catalog = (pdfDoc.catalog as any).dict;
+
+    const names = catalog.get(PDFName.of('Names'));
+    if (names) {
+      try {
+        const namesDict = pdfDoc.context.lookup(names) as any;
+        if (namesDict.has(PDFName.of('Dests'))) {
+          namesDict.delete(PDFName.of('Dests'));
+        }
+      } catch (err: any) {
+        console.warn('Could not access Names/Dests:', err.message);
+      }
+    }
+
+    if (catalog.has(PDFName.of('Dests'))) {
+      catalog.delete(PDFName.of('Dests'));
+    }
+  } catch (err: any) {
+    console.warn('Could not remove named destinations:', err.message);
+  }
+}
+
+/**
+ * Removes logical-structure entries from `pdfDoc` in place: the catalog's
+ * `StructTreeRoot`, every page's `StructParents`, and a `ParentTree` entry
+ * on the catalog if one is present. Per-page failures are logged with
+ * `console.warn`.
+ */
+function removeStructureTreeFromDoc(pdfDoc: PDFDocument) {
+  const catalog = (pdfDoc.catalog as any).dict;
+
+  // Deletes a catalog entry only when it exists.
+  const dropFromCatalog = (entry: string) => {
+    if (catalog.has(PDFName.of(entry))) {
+      catalog.delete(PDFName.of(entry));
+    }
+  };
+
+  dropFromCatalog('StructTreeRoot');
+
+  pdfDoc.getPages().forEach((page) => {
+    try {
+      const node = page.node;
+      if (node.has(PDFName.of('StructParents'))) {
+        node.delete(PDFName.of('StructParents'));
+      }
+    } catch (err: any) {
+      console.warn('Could not remove page StructParents:', err.message);
+    }
+  });
+
+  dropFromCatalog('ParentTree');
+}
+
+/**
+ * Deletes the `MarkInfo` and `Marked` entries from the catalog of `pdfDoc`,
+ * in that order, skipping whichever is absent.
+ */
+function removeMarkInfoFromDoc(pdfDoc: PDFDocument) {
+  const catalog = (pdfDoc.catalog as any).dict;
+
+  for (const entry of ['MarkInfo', 'Marked']) {
+    if (catalog.has(PDFName.of(entry))) {
+      catalog.delete(PDFName.of(entry));
+    }
+  }
+}
+
+/**
+ * Deletes the embedded font program entries (`FontFile`, `FontFile2`,
+ * `FontFile3`) from the `FontDescriptor` of one font dictionary.
+ * Does nothing when the font has no `FontDescriptor`; lookup errors
+ * propagate to the caller.
+ */
+function stripEmbeddedFontFiles(pdfDoc: PDFDocument, font: any) {
+  if (!font.has(PDFName.of('FontDescriptor'))) return;
+
+  const descriptor = pdfDoc.context.lookup(
+    font.get(PDFName.of('FontDescriptor'))
+  ) as any;
+
+  for (const key of ['FontFile', 'FontFile2', 'FontFile3']) {
+    if (descriptor.has(PDFName.of(key))) {
+      descriptor.delete(PDFName.of(key));
+    }
+  }
+}
+
+/**
+ * Removes embedded font programs from the fonts listed in each page's
+ * `Resources`/`Font` dictionary, then empties the `fonts` list held on the
+ * document object.
+ *
+ * Both simple fonts (descriptor directly on the font dictionary) and
+ * composite fonts (descriptor on the entries of `DescendantFonts`) are
+ * handled. Only the `Resources` entry found directly on a page is examined.
+ * Every failure is logged with `console.warn` at the level where it occurs
+ * (page, resources, font dictionary, individual font) and processing
+ * continues with the next item.
+ */
+function removeFontsFromDoc(pdfDoc: PDFDocument) {
+  const pages = pdfDoc.getPages();
+
+  for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
+    try {
+      const pageDict = pages[pageIndex].node;
+      const resourcesRef = pageDict.get(PDFName.of('Resources'));
+      if (!resourcesRef) continue;
+
+      try {
+        const resourcesDict = pdfDoc.context.lookup(resourcesRef) as any;
+        if (!resourcesDict.has(PDFName.of('Font'))) continue;
+
+        const fontRef = resourcesDict.get(PDFName.of('Font'));
+
+        try {
+          const fontDict = pdfDoc.context.lookup(fontRef) as any;
+
+          for (const fontKey of fontDict.keys()) {
+            try {
+              const specificFont = pdfDoc.context.lookup(
+                fontDict.get(fontKey)
+              ) as any;
+
+              stripEmbeddedFontFiles(pdfDoc, specificFont);
+
+              // Composite (Type0) fonts carry no descriptor themselves; the
+              // embedded program hangs off each descendant font instead.
+              const descendantsRef = specificFont.get(
+                PDFName.of('DescendantFonts')
+              );
+              if (descendantsRef) {
+                const descendants = pdfDoc.context.lookup(
+                  descendantsRef
+                ) as any;
+                for (const descendantRef of descendants.asArray()) {
+                  stripEmbeddedFontFiles(
+                    pdfDoc,
+                    pdfDoc.context.lookup(descendantRef) as any
+                  );
+                }
+              }
+            } catch (e: any) {
+              console.warn(`Could not process font ${fontKey}:`, e.message);
+            }
+          }
+        } catch (e: any) {
+          console.warn('Could not access font dictionary:', e.message);
+        }
+      } catch (e: any) {
+        console.warn('Could not access Resources for fonts:', e.message);
+      }
+    } catch (e: any) {
+      console.warn(
+        `Could not remove fonts from page ${pageIndex + 1}:`,
+        e.message
+      );
+    }
+  }
+
+  // Property is read through an `any` cast.
+  if ((pdfDoc as any).fonts && (pdfDoc as any).fonts.length > 0) {
+    (pdfDoc as any).fonts = [];
+  }
+}
+
+/**
+ * Loads a PDF from `pdfBytes`, applies the passes enabled in `options`, and
+ * saves the result.
+ *
+ * Pass order: flatten forms, metadata, annotations, JavaScript, embedded
+ * files, layers, links, structure tree, MarkInfo, fonts.
+ *
+ * Error handling differs per pass:
+ *  - if flattening throws, a warning is logged and the catalog's `AcroForm`
+ *    entry is deleted instead (that fallback is itself best-effort);
+ *  - the metadata and annotation passes are not wrapped, so an error thrown
+ *    by either one rejects the returned promise;
+ *  - every later pass is wrapped: its error is logged with `console.warn`
+ *    and the next pass still runs.
+ *
+ * @param pdfBytes - Bytes of the PDF to load.
+ * @param options - Flags selecting which passes run.
+ * @returns The modified document and a `Uint8Array` copy of its saved bytes.
+ * @throws Whatever `PDFDocument.load` or `save` throws, plus errors from the
+ *   two unwrapped passes.
+ */
+export async function sanitizePdf(
+  pdfBytes: Uint8Array,
+  options: SanitizeOptions
+): Promise<{ pdfDoc: PDFDocument; bytes: Uint8Array }> {
+  const pdfDoc = await PDFDocument.load(pdfBytes);
+
+  if (options.flattenForms) {
+    try {
+      flattenFormsInDoc(pdfDoc);
+    } catch (flattenError: any) {
+      console.warn(`Could not flatten forms: ${flattenError.message}`);
+      // Fallback: drop the form definition altogether.
+      try {
+        const catalog = (pdfDoc.catalog as any).dict;
+        if (catalog.has(PDFName.of('AcroForm'))) {
+          catalog.delete(PDFName.of('AcroForm'));
+        }
+      } catch (removeError: any) {
+        console.warn('Could not remove AcroForm:', removeError.message);
+      }
+    }
+  }
+
+  // These two passes are intentionally left unguarded, as in the pass order
+  // described above: their errors reach the caller.
+  if (options.removeMetadata) {
+    removeMetadataFromDoc(pdfDoc);
+  }
+  if (options.removeAnnotations) {
+    removeAnnotationsFromDoc(pdfDoc);
+  }
+
+  // [enabled, pass, warning prefix] for every pass whose failure is only logged.
+  const guardedPasses: Array<[boolean, (doc: PDFDocument) => void, string]> = [
+    [options.removeJavascript, removeJavascriptFromDoc, 'Could not remove JavaScript'],
+    [options.removeEmbeddedFiles, removeEmbeddedFilesFromDoc, 'Could not remove embedded files'],
+    [options.removeLayers, removeLayersFromDoc, 'Could not remove layers'],
+    [options.removeLinks, removeLinksFromDoc, 'Could not remove links'],
+    [options.removeStructureTree, removeStructureTreeFromDoc, 'Could not remove structure tree'],
+    [options.removeMarkInfo, removeMarkInfoFromDoc, 'Could not remove MarkInfo'],
+    [options.removeFonts, removeFontsFromDoc, 'Could not remove fonts'],
+  ];
+
+  for (const [enabled, runPass, warningPrefix] of guardedPasses) {
+    if (!enabled) continue;
+    try {
+      runPass(pdfDoc);
+    } catch (passError: any) {
+      console.warn(`${warningPrefix}: ${passError.message}`);
+    }
+  }
+
+  const savedBytes = await pdfDoc.save();
+  return { pdfDoc, bytes: new Uint8Array(savedBytes) };
+}