fix(sanitize-pdf): improve link removal and add named destinations cleanup

- Refactor link annotation removal: always remove Link annotations, and also remove other annotations that carry a URI, Launch, GoTo, or (newly) GoToR action, or a Dest entry
- Add cleanup of named destinations in catalog and names dictionary
- Improve error handling and logging throughout the sanitization process
This commit is contained in:
abdullahalam123
2025-10-22 15:03:24 +05:30
parent b7de4bf1ed
commit de3c2fecc2

View File

@@ -128,7 +128,10 @@ export async function sanitizePdf() {
const actionRef = annot.get(PDFName.of('A')); const actionRef = annot.get(PDFName.of('A'));
try { try {
const actionDict = pdfDoc.context.lookup(actionRef); const actionDict = pdfDoc.context.lookup(actionRef);
const actionType = actionDict.get(PDFName.of('S'))?.toString().substring(1); const actionType = actionDict
.get(PDFName.of('S'))
?.toString()
.substring(1);
if (actionType === 'JavaScript') { if (actionType === 'JavaScript') {
annot.delete(PDFName.of('A')); annot.delete(PDFName.of('A'));
@@ -304,6 +307,9 @@ export async function sanitizePdf() {
} }
} }
// TODO(@ALAM): Links that are embedded in a stream currently cannot be removed.
// Find a way to remove them from the stream as well.
if (shouldRemoveLinks) { if (shouldRemoveLinks) {
try { try {
const pages = pdfDoc.getPages(); const pages = pdfDoc.getPages();
@@ -311,7 +317,13 @@ export async function sanitizePdf() {
for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) { for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
try { try {
const page = pages[pageIndex]; const page = pages[pageIndex];
const annotRefs = page.node.Annots()?.asArray() || []; const pageDict = page.node;
const annotsRef = pageDict.get(PDFName.of('Annots'));
if (!annotsRef) continue;
const annotsArray = pdfDoc.context.lookup(annotsRef);
const annotRefs = annotsArray.asArray();
if (annotRefs.length === 0) continue; if (annotRefs.length === 0) continue;
@@ -321,33 +333,48 @@ export async function sanitizePdf() {
for (const ref of annotRefs) { for (const ref of annotRefs) {
try { try {
const annot = pdfDoc.context.lookup(ref); const annot = pdfDoc.context.lookup(ref);
const subtype = annot.get(PDFName.of('Subtype'))?.toString().substring(1); const subtype = annot
.get(PDFName.of('Subtype'))
?.toString()
.substring(1);
let shouldRemove = false; let isLink = false;
if (subtype === 'Link') { if (subtype === 'Link') {
isLink = true;
linksRemoved++;
} else {
const actionRef = annot.get(PDFName.of('A')); const actionRef = annot.get(PDFName.of('A'));
if (actionRef) { if (actionRef) {
try { try {
const actionDict = pdfDoc.context.lookup(actionRef); const actionDict = pdfDoc.context.lookup(actionRef);
const actionType = actionDict.get(PDFName.of('S'))?.toString().substring(1); const actionType = actionDict
.get(PDFName.of('S'))
?.toString()
.substring(1);
if (actionType === 'URI' || actionType === 'Launch' || actionType === 'GoTo') { if (
shouldRemove = true; actionType === 'URI' ||
actionType === 'Launch' ||
actionType === 'GoTo' ||
actionType === 'GoToR'
) {
isLink = true;
linksRemoved++; linksRemoved++;
} }
} catch (e) { } catch (e) {
console.warn('Could not read link action:', e.message); console.warn('Could not read action:', e.message);
} }
} }
const dest = annot.get(PDFName.of('Dest')); const dest = annot.get(PDFName.of('Dest'));
if (dest && !shouldRemove) { if (dest && !isLink) {
// TODO:@ALAM - Check if this is an internal link isLink = true;
linksRemoved++;
} }
} }
if (!shouldRemove) { if (!isLink) {
annotsToKeep.push(ref); annotsToKeep.push(ref);
} }
} catch (e) { } catch (e) {
@@ -359,17 +386,44 @@ export async function sanitizePdf() {
if (linksRemoved > 0) { if (linksRemoved > 0) {
if (annotsToKeep.length > 0) { if (annotsToKeep.length > 0) {
const newAnnotsArray = pdfDoc.context.obj(annotsToKeep); const newAnnotsArray = pdfDoc.context.obj(annotsToKeep);
page.node.set(PDFName.of('Annots'), newAnnotsArray); pageDict.set(PDFName.of('Annots'), newAnnotsArray);
} else { } else {
page.node.delete(PDFName.of('Annots')); pageDict.delete(PDFName.of('Annots'));
} }
changesMade = true; changesMade = true;
console.log(`Page ${pageIndex + 1}: Removed ${linksRemoved} link(s)`); console.log(
`Page ${pageIndex + 1}: Removed ${linksRemoved} link(s)`
);
} }
} catch (pageError) { } catch (pageError) {
console.warn(`Could not process page ${pageIndex + 1} for links: ${pageError.message}`); console.warn(
`Could not process page ${pageIndex + 1} for links: ${pageError.message}`
);
} }
} }
try {
const catalogDict = pdfDoc.catalog.dict;
const namesRef = catalogDict.get(PDFName.of('Names'));
if (namesRef) {
try {
const namesDict = pdfDoc.context.lookup(namesRef);
if (namesDict.has(PDFName.of('Dests'))) {
namesDict.delete(PDFName.of('Dests'));
changesMade = true;
}
} catch (e) {
console.warn('Could not access Names/Dests:', e.message);
}
}
if (catalogDict.has(PDFName.of('Dests'))) {
catalogDict.delete(PDFName.of('Dests'));
changesMade = true;
}
} catch (e) {
console.warn('Could not remove named destinations:', e.message);
}
} catch (e) { } catch (e) {
console.warn(`Could not remove links: ${e.message}`); console.warn(`Could not remove links: ${e.message}`);
} }
@@ -448,13 +502,21 @@ export async function sanitizePdf() {
for (const fontKey of fontKeys) { for (const fontKey of fontKeys) {
try { try {
const specificFontRef = fontDict.get(fontKey); const specificFontRef = fontDict.get(fontKey);
const specificFont = pdfDoc.context.lookup(specificFontRef); const specificFont =
pdfDoc.context.lookup(specificFontRef);
if (specificFont.has(PDFName.of('FontDescriptor'))) { if (specificFont.has(PDFName.of('FontDescriptor'))) {
const descriptorRef = specificFont.get(PDFName.of('FontDescriptor')); const descriptorRef = specificFont.get(
const descriptor = pdfDoc.context.lookup(descriptorRef); PDFName.of('FontDescriptor')
);
const descriptor =
pdfDoc.context.lookup(descriptorRef);
const fontFileKeys = ['FontFile', 'FontFile2', 'FontFile3']; const fontFileKeys = [
'FontFile',
'FontFile2',
'FontFile3',
];
for (const key of fontFileKeys) { for (const key of fontFileKeys) {
if (descriptor.has(PDFName.of(key))) { if (descriptor.has(PDFName.of(key))) {
descriptor.delete(PDFName.of(key)); descriptor.delete(PDFName.of(key));
@@ -463,23 +525,35 @@ export async function sanitizePdf() {
} }
} }
// Users/Developers: Uncomment this if you can delete the entire font entry // Users/Developers: Uncomment this if you can delete the entire font entry -- might break the rendering though
// fontDict.delete(fontKey); // fontDict.delete(fontKey);
// changesMade = true; // changesMade = true;
} catch (e) { } catch (e) {
console.warn(`Could not process font ${fontKey}:`, e.message); console.warn(
`Could not process font ${fontKey}:`,
e.message
);
} }
} }
} catch (e) { } catch (e) {
console.warn('Could not access font dictionary:', e.message); console.warn(
'Could not access font dictionary:',
e.message
);
} }
} }
} catch (e) { } catch (e) {
console.warn('Could not access Resources for fonts:', e.message); console.warn(
'Could not access Resources for fonts:',
e.message
);
} }
} }
} catch (e) { } catch (e) {
console.warn(`Could not remove fonts from page ${pageIndex + 1}:`, e.message); console.warn(
`Could not remove fonts from page ${pageIndex + 1}:`,
e.message
);
} }
} }