fix(sanitize-pdf): improve link removal and add named destinations cleanup
- Refactor link annotation removal logic to handle more action types (URI, Launch, GoTo, GoToR)
- Add cleanup of named destinations in catalog and names dictionary
- Improve error handling and logging throughout the sanitization process
This commit is contained in:
@@ -128,7 +128,10 @@ export async function sanitizePdf() {
|
|||||||
const actionRef = annot.get(PDFName.of('A'));
|
const actionRef = annot.get(PDFName.of('A'));
|
||||||
try {
|
try {
|
||||||
const actionDict = pdfDoc.context.lookup(actionRef);
|
const actionDict = pdfDoc.context.lookup(actionRef);
|
||||||
const actionType = actionDict.get(PDFName.of('S'))?.toString().substring(1);
|
const actionType = actionDict
|
||||||
|
.get(PDFName.of('S'))
|
||||||
|
?.toString()
|
||||||
|
.substring(1);
|
||||||
|
|
||||||
if (actionType === 'JavaScript') {
|
if (actionType === 'JavaScript') {
|
||||||
annot.delete(PDFName.of('A'));
|
annot.delete(PDFName.of('A'));
|
||||||
@@ -304,6 +307,9 @@ export async function sanitizePdf() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO:@ALAM
|
||||||
|
// Currently if the links are embedded in a stream they can't be removed
|
||||||
|
// Find a way to remove them from the stream
|
||||||
if (shouldRemoveLinks) {
|
if (shouldRemoveLinks) {
|
||||||
try {
|
try {
|
||||||
const pages = pdfDoc.getPages();
|
const pages = pdfDoc.getPages();
|
||||||
@@ -311,7 +317,13 @@ export async function sanitizePdf() {
|
|||||||
for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
|
for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
|
||||||
try {
|
try {
|
||||||
const page = pages[pageIndex];
|
const page = pages[pageIndex];
|
||||||
const annotRefs = page.node.Annots()?.asArray() || [];
|
const pageDict = page.node;
|
||||||
|
|
||||||
|
const annotsRef = pageDict.get(PDFName.of('Annots'));
|
||||||
|
if (!annotsRef) continue;
|
||||||
|
|
||||||
|
const annotsArray = pdfDoc.context.lookup(annotsRef);
|
||||||
|
const annotRefs = annotsArray.asArray();
|
||||||
|
|
||||||
if (annotRefs.length === 0) continue;
|
if (annotRefs.length === 0) continue;
|
||||||
|
|
||||||
@@ -321,33 +333,48 @@ export async function sanitizePdf() {
|
|||||||
for (const ref of annotRefs) {
|
for (const ref of annotRefs) {
|
||||||
try {
|
try {
|
||||||
const annot = pdfDoc.context.lookup(ref);
|
const annot = pdfDoc.context.lookup(ref);
|
||||||
const subtype = annot.get(PDFName.of('Subtype'))?.toString().substring(1);
|
const subtype = annot
|
||||||
|
.get(PDFName.of('Subtype'))
|
||||||
|
?.toString()
|
||||||
|
.substring(1);
|
||||||
|
|
||||||
let shouldRemove = false;
|
let isLink = false;
|
||||||
|
|
||||||
if (subtype === 'Link') {
|
if (subtype === 'Link') {
|
||||||
|
isLink = true;
|
||||||
|
linksRemoved++;
|
||||||
|
} else {
|
||||||
const actionRef = annot.get(PDFName.of('A'));
|
const actionRef = annot.get(PDFName.of('A'));
|
||||||
if (actionRef) {
|
if (actionRef) {
|
||||||
try {
|
try {
|
||||||
const actionDict = pdfDoc.context.lookup(actionRef);
|
const actionDict = pdfDoc.context.lookup(actionRef);
|
||||||
const actionType = actionDict.get(PDFName.of('S'))?.toString().substring(1);
|
const actionType = actionDict
|
||||||
|
.get(PDFName.of('S'))
|
||||||
|
?.toString()
|
||||||
|
.substring(1);
|
||||||
|
|
||||||
if (actionType === 'URI' || actionType === 'Launch' || actionType === 'GoTo') {
|
if (
|
||||||
shouldRemove = true;
|
actionType === 'URI' ||
|
||||||
|
actionType === 'Launch' ||
|
||||||
|
actionType === 'GoTo' ||
|
||||||
|
actionType === 'GoToR'
|
||||||
|
) {
|
||||||
|
isLink = true;
|
||||||
linksRemoved++;
|
linksRemoved++;
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Could not read link action:', e.message);
|
console.warn('Could not read action:', e.message);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const dest = annot.get(PDFName.of('Dest'));
|
const dest = annot.get(PDFName.of('Dest'));
|
||||||
if (dest && !shouldRemove) {
|
if (dest && !isLink) {
|
||||||
// TODO:@ALAM - Check if this is an internal link
|
isLink = true;
|
||||||
|
linksRemoved++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!shouldRemove) {
|
if (!isLink) {
|
||||||
annotsToKeep.push(ref);
|
annotsToKeep.push(ref);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -359,17 +386,44 @@ export async function sanitizePdf() {
|
|||||||
if (linksRemoved > 0) {
|
if (linksRemoved > 0) {
|
||||||
if (annotsToKeep.length > 0) {
|
if (annotsToKeep.length > 0) {
|
||||||
const newAnnotsArray = pdfDoc.context.obj(annotsToKeep);
|
const newAnnotsArray = pdfDoc.context.obj(annotsToKeep);
|
||||||
page.node.set(PDFName.of('Annots'), newAnnotsArray);
|
pageDict.set(PDFName.of('Annots'), newAnnotsArray);
|
||||||
} else {
|
} else {
|
||||||
page.node.delete(PDFName.of('Annots'));
|
pageDict.delete(PDFName.of('Annots'));
|
||||||
}
|
}
|
||||||
changesMade = true;
|
changesMade = true;
|
||||||
console.log(`Page ${pageIndex + 1}: Removed ${linksRemoved} link(s)`);
|
console.log(
|
||||||
|
`Page ${pageIndex + 1}: Removed ${linksRemoved} link(s)`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
} catch (pageError) {
|
} catch (pageError) {
|
||||||
console.warn(`Could not process page ${pageIndex + 1} for links: ${pageError.message}`);
|
console.warn(
|
||||||
|
`Could not process page ${pageIndex + 1} for links: ${pageError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const catalogDict = pdfDoc.catalog.dict;
|
||||||
|
const namesRef = catalogDict.get(PDFName.of('Names'));
|
||||||
|
if (namesRef) {
|
||||||
|
try {
|
||||||
|
const namesDict = pdfDoc.context.lookup(namesRef);
|
||||||
|
if (namesDict.has(PDFName.of('Dests'))) {
|
||||||
|
namesDict.delete(PDFName.of('Dests'));
|
||||||
|
changesMade = true;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('Could not access Names/Dests:', e.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (catalogDict.has(PDFName.of('Dests'))) {
|
||||||
|
catalogDict.delete(PDFName.of('Dests'));
|
||||||
|
changesMade = true;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('Could not remove named destinations:', e.message);
|
||||||
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Could not remove links: ${e.message}`);
|
console.warn(`Could not remove links: ${e.message}`);
|
||||||
}
|
}
|
||||||
@@ -448,13 +502,21 @@ export async function sanitizePdf() {
|
|||||||
for (const fontKey of fontKeys) {
|
for (const fontKey of fontKeys) {
|
||||||
try {
|
try {
|
||||||
const specificFontRef = fontDict.get(fontKey);
|
const specificFontRef = fontDict.get(fontKey);
|
||||||
const specificFont = pdfDoc.context.lookup(specificFontRef);
|
const specificFont =
|
||||||
|
pdfDoc.context.lookup(specificFontRef);
|
||||||
|
|
||||||
if (specificFont.has(PDFName.of('FontDescriptor'))) {
|
if (specificFont.has(PDFName.of('FontDescriptor'))) {
|
||||||
const descriptorRef = specificFont.get(PDFName.of('FontDescriptor'));
|
const descriptorRef = specificFont.get(
|
||||||
const descriptor = pdfDoc.context.lookup(descriptorRef);
|
PDFName.of('FontDescriptor')
|
||||||
|
);
|
||||||
|
const descriptor =
|
||||||
|
pdfDoc.context.lookup(descriptorRef);
|
||||||
|
|
||||||
const fontFileKeys = ['FontFile', 'FontFile2', 'FontFile3'];
|
const fontFileKeys = [
|
||||||
|
'FontFile',
|
||||||
|
'FontFile2',
|
||||||
|
'FontFile3',
|
||||||
|
];
|
||||||
for (const key of fontFileKeys) {
|
for (const key of fontFileKeys) {
|
||||||
if (descriptor.has(PDFName.of(key))) {
|
if (descriptor.has(PDFName.of(key))) {
|
||||||
descriptor.delete(PDFName.of(key));
|
descriptor.delete(PDFName.of(key));
|
||||||
@@ -463,23 +525,35 @@ export async function sanitizePdf() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Users/Developers: Uncomment this if you can delete the entire font entry
|
// Users/Developers: Uncomment this if you can delete the entire font entry -- might break the rendering though
|
||||||
// fontDict.delete(fontKey);
|
// fontDict.delete(fontKey);
|
||||||
// changesMade = true;
|
// changesMade = true;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Could not process font ${fontKey}:`, e.message);
|
console.warn(
|
||||||
|
`Could not process font ${fontKey}:`,
|
||||||
|
e.message
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Could not access font dictionary:', e.message);
|
console.warn(
|
||||||
|
'Could not access font dictionary:',
|
||||||
|
e.message
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Could not access Resources for fonts:', e.message);
|
console.warn(
|
||||||
|
'Could not access Resources for fonts:',
|
||||||
|
e.message
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Could not remove fonts from page ${pageIndex + 1}:`, e.message);
|
console.warn(
|
||||||
|
`Could not remove fonts from page ${pageIndex + 1}:`,
|
||||||
|
e.message
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user