fix(sanitize-pdf): improve link removal and add named destinations cleanup

- Refactor link annotation removal logic: remove every Link annotation, and remove other annotations that carry a Dest entry or a URI, Launch, GoTo, or (newly handled) GoToR action
- Add cleanup of named destinations (the Dests entry) in both the catalog and the Names dictionary
- Improve error handling and logging throughout the sanitization process
This commit is contained in:
abdullahalam123
2025-10-22 15:03:24 +05:30
parent b7de4bf1ed
commit de3c2fecc2

View File

@@ -113,7 +113,7 @@ export async function sanitizePdf() {
for (const page of pages) { for (const page of pages) {
try { try {
const pageDict = page.node; const pageDict = page.node;
if (pageDict.has(PDFName.of('AA'))) { if (pageDict.has(PDFName.of('AA'))) {
pageDict.delete(PDFName.of('AA')); pageDict.delete(PDFName.of('AA'));
changesMade = true; changesMade = true;
@@ -123,13 +123,16 @@ export async function sanitizePdf() {
for (const annotRef of annotRefs) { for (const annotRef of annotRefs) {
try { try {
const annot = pdfDoc.context.lookup(annotRef); const annot = pdfDoc.context.lookup(annotRef);
if (annot.has(PDFName.of('A'))) { if (annot.has(PDFName.of('A'))) {
const actionRef = annot.get(PDFName.of('A')); const actionRef = annot.get(PDFName.of('A'));
try { try {
const actionDict = pdfDoc.context.lookup(actionRef); const actionDict = pdfDoc.context.lookup(actionRef);
const actionType = actionDict.get(PDFName.of('S'))?.toString().substring(1); const actionType = actionDict
.get(PDFName.of('S'))
?.toString()
.substring(1);
if (actionType === 'JavaScript') { if (actionType === 'JavaScript') {
annot.delete(PDFName.of('A')); annot.delete(PDFName.of('A'));
changesMade = true; changesMade = true;
@@ -157,20 +160,20 @@ export async function sanitizePdf() {
if (acroFormRef) { if (acroFormRef) {
const acroFormDict = pdfDoc.context.lookup(acroFormRef); const acroFormDict = pdfDoc.context.lookup(acroFormRef);
const fieldsRef = acroFormDict.get(PDFName.of('Fields')); const fieldsRef = acroFormDict.get(PDFName.of('Fields'));
if (fieldsRef) { if (fieldsRef) {
const fieldsArray = pdfDoc.context.lookup(fieldsRef); const fieldsArray = pdfDoc.context.lookup(fieldsRef);
const fields = fieldsArray.asArray(); const fields = fieldsArray.asArray();
for (const fieldRef of fields) { for (const fieldRef of fields) {
try { try {
const field = pdfDoc.context.lookup(fieldRef); const field = pdfDoc.context.lookup(fieldRef);
if (field.has(PDFName.of('A'))) { if (field.has(PDFName.of('A'))) {
field.delete(PDFName.of('A')); field.delete(PDFName.of('A'));
changesMade = true; changesMade = true;
} }
if (field.has(PDFName.of('AA'))) { if (field.has(PDFName.of('AA'))) {
field.delete(PDFName.of('AA')); field.delete(PDFName.of('AA'));
changesMade = true; changesMade = true;
@@ -304,6 +307,9 @@ export async function sanitizePdf() {
} }
} }
// TODO(@ALAM): Links that are embedded in a stream cannot currently be removed.
// Find a way to remove them from the stream.
if (shouldRemoveLinks) { if (shouldRemoveLinks) {
try { try {
const pages = pdfDoc.getPages(); const pages = pdfDoc.getPages();
@@ -311,8 +317,14 @@ export async function sanitizePdf() {
for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) { for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
try { try {
const page = pages[pageIndex]; const page = pages[pageIndex];
const annotRefs = page.node.Annots()?.asArray() || []; const pageDict = page.node;
const annotsRef = pageDict.get(PDFName.of('Annots'));
if (!annotsRef) continue;
const annotsArray = pdfDoc.context.lookup(annotsRef);
const annotRefs = annotsArray.asArray();
if (annotRefs.length === 0) continue; if (annotRefs.length === 0) continue;
const annotsToKeep = []; const annotsToKeep = [];
@@ -321,33 +333,48 @@ export async function sanitizePdf() {
for (const ref of annotRefs) { for (const ref of annotRefs) {
try { try {
const annot = pdfDoc.context.lookup(ref); const annot = pdfDoc.context.lookup(ref);
const subtype = annot.get(PDFName.of('Subtype'))?.toString().substring(1); const subtype = annot
.get(PDFName.of('Subtype'))
?.toString()
.substring(1);
let shouldRemove = false; let isLink = false;
if (subtype === 'Link') { if (subtype === 'Link') {
isLink = true;
linksRemoved++;
} else {
const actionRef = annot.get(PDFName.of('A')); const actionRef = annot.get(PDFName.of('A'));
if (actionRef) { if (actionRef) {
try { try {
const actionDict = pdfDoc.context.lookup(actionRef); const actionDict = pdfDoc.context.lookup(actionRef);
const actionType = actionDict.get(PDFName.of('S'))?.toString().substring(1); const actionType = actionDict
.get(PDFName.of('S'))
?.toString()
.substring(1);
if (actionType === 'URI' || actionType === 'Launch' || actionType === 'GoTo') { if (
shouldRemove = true; actionType === 'URI' ||
actionType === 'Launch' ||
actionType === 'GoTo' ||
actionType === 'GoToR'
) {
isLink = true;
linksRemoved++; linksRemoved++;
} }
} catch (e) { } catch (e) {
console.warn('Could not read link action:', e.message); console.warn('Could not read action:', e.message);
} }
} }
const dest = annot.get(PDFName.of('Dest')); const dest = annot.get(PDFName.of('Dest'));
if (dest && !shouldRemove) { if (dest && !isLink) {
// TODO:@ALAM - Check if this is an internal link isLink = true;
linksRemoved++;
} }
} }
if (!shouldRemove) { if (!isLink) {
annotsToKeep.push(ref); annotsToKeep.push(ref);
} }
} catch (e) { } catch (e) {
@@ -359,17 +386,44 @@ export async function sanitizePdf() {
if (linksRemoved > 0) { if (linksRemoved > 0) {
if (annotsToKeep.length > 0) { if (annotsToKeep.length > 0) {
const newAnnotsArray = pdfDoc.context.obj(annotsToKeep); const newAnnotsArray = pdfDoc.context.obj(annotsToKeep);
page.node.set(PDFName.of('Annots'), newAnnotsArray); pageDict.set(PDFName.of('Annots'), newAnnotsArray);
} else { } else {
page.node.delete(PDFName.of('Annots')); pageDict.delete(PDFName.of('Annots'));
} }
changesMade = true; changesMade = true;
console.log(`Page ${pageIndex + 1}: Removed ${linksRemoved} link(s)`); console.log(
`Page ${pageIndex + 1}: Removed ${linksRemoved} link(s)`
);
} }
} catch (pageError) { } catch (pageError) {
console.warn(`Could not process page ${pageIndex + 1} for links: ${pageError.message}`); console.warn(
`Could not process page ${pageIndex + 1} for links: ${pageError.message}`
);
} }
} }
try {
const catalogDict = pdfDoc.catalog.dict;
const namesRef = catalogDict.get(PDFName.of('Names'));
if (namesRef) {
try {
const namesDict = pdfDoc.context.lookup(namesRef);
if (namesDict.has(PDFName.of('Dests'))) {
namesDict.delete(PDFName.of('Dests'));
changesMade = true;
}
} catch (e) {
console.warn('Could not access Names/Dests:', e.message);
}
}
if (catalogDict.has(PDFName.of('Dests'))) {
catalogDict.delete(PDFName.of('Dests'));
changesMade = true;
}
} catch (e) {
console.warn('Could not remove named destinations:', e.message);
}
} catch (e) { } catch (e) {
console.warn(`Could not remove links: ${e.message}`); console.warn(`Could not remove links: ${e.message}`);
} }
@@ -440,21 +494,29 @@ export async function sanitizePdf() {
if (resourcesDict.has(PDFName.of('Font'))) { if (resourcesDict.has(PDFName.of('Font'))) {
const fontRef = resourcesDict.get(PDFName.of('Font')); const fontRef = resourcesDict.get(PDFName.of('Font'));
try { try {
const fontDict = pdfDoc.context.lookup(fontRef); const fontDict = pdfDoc.context.lookup(fontRef);
const fontKeys = fontDict.keys(); const fontKeys = fontDict.keys();
for (const fontKey of fontKeys) { for (const fontKey of fontKeys) {
try { try {
const specificFontRef = fontDict.get(fontKey); const specificFontRef = fontDict.get(fontKey);
const specificFont = pdfDoc.context.lookup(specificFontRef); const specificFont =
pdfDoc.context.lookup(specificFontRef);
if (specificFont.has(PDFName.of('FontDescriptor'))) { if (specificFont.has(PDFName.of('FontDescriptor'))) {
const descriptorRef = specificFont.get(PDFName.of('FontDescriptor')); const descriptorRef = specificFont.get(
const descriptor = pdfDoc.context.lookup(descriptorRef); PDFName.of('FontDescriptor')
);
const fontFileKeys = ['FontFile', 'FontFile2', 'FontFile3']; const descriptor =
pdfDoc.context.lookup(descriptorRef);
const fontFileKeys = [
'FontFile',
'FontFile2',
'FontFile3',
];
for (const key of fontFileKeys) { for (const key of fontFileKeys) {
if (descriptor.has(PDFName.of(key))) { if (descriptor.has(PDFName.of(key))) {
descriptor.delete(PDFName.of(key)); descriptor.delete(PDFName.of(key));
@@ -462,24 +524,36 @@ export async function sanitizePdf() {
} }
} }
} }
// Users/Developers: Uncomment this if you can delete the entire font entry // Users/Developers: Uncomment this if you can delete the entire font entry -- might break the rendering though
// fontDict.delete(fontKey); // fontDict.delete(fontKey);
// changesMade = true; // changesMade = true;
} catch (e) { } catch (e) {
console.warn(`Could not process font ${fontKey}:`, e.message); console.warn(
`Could not process font ${fontKey}:`,
e.message
);
} }
} }
} catch (e) { } catch (e) {
console.warn('Could not access font dictionary:', e.message); console.warn(
'Could not access font dictionary:',
e.message
);
} }
} }
} catch (e) { } catch (e) {
console.warn('Could not access Resources for fonts:', e.message); console.warn(
'Could not access Resources for fonts:',
e.message
);
} }
} }
} catch (e) { } catch (e) {
console.warn(`Could not remove fonts from page ${pageIndex + 1}:`, e.message); console.warn(
`Could not remove fonts from page ${pageIndex + 1}:`,
e.message
);
} }
} }
@@ -513,4 +587,4 @@ export async function sanitizePdf() {
} finally { } finally {
hideLoader(); hideLoader();
} }
} }