Add visual workflow builder, fix critical bugs, and add Arabic i18n support
This commit is contained in:
156
src/js/utils/compress.ts
Normal file
156
src/js/utils/compress.ts
Normal file
@@ -0,0 +1,156 @@
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import { getPDFDocument } from './helpers.js';
|
||||
import { loadPyMuPDF } from './pymupdf-loader.js';
|
||||
|
||||
/**
 * Named option bundles for the "condense" compressor.
 *
 * `performCondenseCompression` looks a preset up by level name and forwards
 * its image, scrub and font-subsetting values to `compressPdf`, unless a
 * custom setting overrides them. Only `aggressive` and `extreme` define
 * `scrub.xmlMetadata`; consumers must treat it as optional.
 */
export const CONDENSE_PRESETS = {
  // Mildest preset: highest image quality, metadata is kept.
  light: {
    images: { quality: 90, dpiTarget: 150, dpiThreshold: 200 },
    scrub: { metadata: false, thumbnails: true },
    subsetFonts: true,
  },

  // Default preset (used when an unknown level name is requested).
  balanced: {
    images: { quality: 75, dpiTarget: 96, dpiThreshold: 150 },
    scrub: { metadata: true, thumbnails: true },
    subsetFonts: true,
  },

  // Also requests XML metadata scrubbing.
  aggressive: {
    images: { quality: 50, dpiTarget: 72, dpiThreshold: 100 },
    scrub: { metadata: true, thumbnails: true, xmlMetadata: true },
    subsetFonts: true,
  },

  // Lowest image quality and DPI values of the four presets.
  extreme: {
    images: { quality: 30, dpiTarget: 60, dpiThreshold: 96 },
    scrub: { metadata: true, thumbnails: true, xmlMetadata: true },
    subsetFonts: true,
  },
};
|
||||
|
||||
/**
 * Rasterisation presets for the "photon" compressor.
 *
 * `scale` is handed to the page viewport as its render scale and `quality`
 * is handed to `canvas.toBlob` as the JPEG quality, so lower values in
 * either field trade fidelity for size.
 */
export const PHOTON_PRESETS = {
  light: {
    scale: 2.0,
    quality: 0.85,
  },
  balanced: {
    scale: 1.5,
    quality: 0.65,
  },
  aggressive: {
    scale: 1.2,
    quality: 0.45,
  },
  extreme: {
    scale: 1.0,
    quality: 0.25,
  },
};
|
||||
|
||||
/**
 * Per-field overrides for a condense preset.
 *
 * Every field is optional; a field left `undefined` falls back to the value
 * of the selected preset (or to `false` for `convertToGrayscale`, which no
 * preset defines).
 */
export interface CondenseCustomSettings {
  /** Replaces the preset's `images.quality`. */
  imageQuality?: number;

  /** Replaces the preset's `images.dpiTarget`. */
  dpiTarget?: number;

  /** Replaces the preset's `images.dpiThreshold` (before the minimum of `dpiTarget + 10` is applied). */
  dpiThreshold?: number;

  /** Replaces the preset's `scrub.metadata`. */
  removeMetadata?: boolean;

  /** Replaces the preset's `subsetFonts`. */
  subsetFonts?: boolean;

  /** Sent to the compressor as `images.convertToGray`. */
  convertToGrayscale?: boolean;

  /** Replaces the preset's `scrub.thumbnails`. */
  removeThumbnails?: boolean;
}
|
||||
|
||||
/**
 * Compresses a PDF through the PyMuPDF loader's `compressPdf`.
 *
 * The options start from the preset named by `level` (an unknown name falls
 * back to `balanced`); any field present in `customSettings` overrides the
 * matching preset value. The effective DPI threshold is never lower than
 * `dpiTarget + 10`.
 *
 * When the first attempt fails with an error whose message mentions
 * "PatternType" or "pattern", the call is retried once with image processing
 * disabled and the result is tagged with `usedFallback: true`. An error
 * raised by that retry propagates unchanged.
 *
 * @param fileBlob - The PDF to compress.
 * @param level - Preset name (a key of `CONDENSE_PRESETS`).
 * @param customSettings - Optional per-field overrides of the preset.
 * @returns The value resolved by `compressPdf`, plus `usedFallback: true`
 *   when the retry path produced it.
 * @throws Error `PDF compression failed: …` when the first attempt fails for
 *   any reason other than the pattern case above.
 */
export async function performCondenseCompression(
  fileBlob: Blob,
  level: string,
  customSettings?: CondenseCustomSettings
) {
  // Extracts a usable message from anything that can be thrown. Only a
  // non-empty string `message` is trusted, so the `.includes` checks below can
  // never be called on a non-string value.
  const describeError = (thrown: unknown): string => {
    const candidate = (thrown as { message?: unknown } | null | undefined)
      ?.message;
    return typeof candidate === 'string' && candidate
      ? candidate
      : String(thrown);
  };

  const pymupdf = await loadPyMuPDF();

  const preset =
    CONDENSE_PRESETS[level as keyof typeof CONDENSE_PRESETS] ??
    CONDENSE_PRESETS.balanced;

  const dpiTarget = customSettings?.dpiTarget ?? preset.images.dpiTarget;
  const userThreshold =
    customSettings?.dpiThreshold ?? preset.images.dpiThreshold;
  // Keep the threshold strictly above the target by a margin of 10.
  const dpiThreshold = Math.max(userThreshold, dpiTarget + 10);

  const options = {
    images: {
      enabled: true,
      quality: customSettings?.imageQuality ?? preset.images.quality,
      dpiTarget,
      dpiThreshold,
      convertToGray: customSettings?.convertToGrayscale ?? false,
    },
    scrub: {
      metadata: customSettings?.removeMetadata ?? preset.scrub.metadata,
      thumbnails: customSettings?.removeThumbnails ?? preset.scrub.thumbnails,
      // Only some presets define xmlMetadata; treat a missing key as false.
      xmlMetadata:
        'xmlMetadata' in preset.scrub ? preset.scrub.xmlMetadata : false,
    },
    subsetFonts: customSettings?.subsetFonts ?? preset.subsetFonts,
    save: {
      garbage: 4 as const,
      deflate: true,
      clean: true,
      useObjstms: true,
    },
  };

  try {
    return await pymupdf.compressPdf(fileBlob, options);
  } catch (error: unknown) {
    const errorMessage = describeError(error);
    const isPatternFailure =
      errorMessage.includes('PatternType') || errorMessage.includes('pattern');

    if (!isPatternFailure) {
      throw new Error(`PDF compression failed: ${errorMessage}`);
    }

    // Retry with image processing switched off; everything else is unchanged.
    const fallbackOptions = {
      ...options,
      images: {
        ...options.images,
        enabled: false,
      },
    };

    const result = await pymupdf.compressPdf(fileBlob, fallbackOptions);
    return { ...result, usedFallback: true };
  }
}
|
||||
|
||||
/**
 * Compresses a PDF by rasterising every page to a JPEG and assembling the
 * images into a new document.
 *
 * Each page is rendered to a canvas at the preset's `scale`, encoded as JPEG
 * at the preset's `quality`, and drawn onto a new page sized to the rendered
 * viewport. An unknown `level` falls back to the `balanced` preset.
 *
 * Resources are released on every path: the canvas backing store is freed
 * whether or not rendering succeeds, and the source document is destroyed
 * once processing ends.
 *
 * @param arrayBuffer - Bytes of the source PDF.
 * @param level - Preset name (a key of `PHOTON_PRESETS`).
 * @returns The bytes of the rebuilt, image-only PDF.
 * @throws Error when a 2D canvas context or a JPEG blob cannot be created;
 *   errors from loading, rendering or saving propagate unchanged.
 */
export async function performPhotonCompression(
  arrayBuffer: ArrayBuffer,
  level: string
): Promise<Uint8Array> {
  const pdfJsDoc = await getPDFDocument({ data: arrayBuffer }).promise;

  try {
    const newPdfDoc = await PDFDocument.create();
    const settings =
      PHOTON_PRESETS[level as keyof typeof PHOTON_PRESETS] ??
      PHOTON_PRESETS.balanced;

    // Renders one page and returns its JPEG bytes plus the rendered size.
    const rasterizePage = async (pageNumber: number) => {
      const page = await pdfJsDoc.getPage(pageNumber);
      const viewport = page.getViewport({ scale: settings.scale });
      const canvas = document.createElement('canvas');

      try {
        const context = canvas.getContext('2d');
        if (!context) throw new Error('Failed to create canvas context');
        canvas.height = viewport.height;
        canvas.width = viewport.width;

        await page.render({ canvasContext: context, viewport, canvas: canvas })
          .promise;

        const jpegBlob = await new Promise<Blob>((resolve, reject) =>
          canvas.toBlob(
            (blob) => {
              if (blob) resolve(blob);
              else reject(new Error('Failed to create JPEG blob'));
            },
            'image/jpeg',
            settings.quality
          )
        );

        return {
          jpegBytes: await jpegBlob.arrayBuffer(),
          width: viewport.width,
          height: viewport.height,
        };
      } finally {
        // Release canvas memory, including when rendering or encoding failed.
        canvas.width = 0;
        canvas.height = 0;
      }
    };

    for (let i = 1; i <= pdfJsDoc.numPages; i++) {
      const { jpegBytes, width, height } = await rasterizePage(i);
      const jpegImage = await newPdfDoc.embedJpg(jpegBytes);
      const newPage = newPdfDoc.addPage([width, height]);
      newPage.drawImage(jpegImage, { x: 0, y: 0, width, height });
    }

    return await newPdfDoc.save();
  } finally {
    // The source document is not needed once every page has been rendered.
    // NOTE(review): assumes getPDFDocument yields a pdf.js document proxy,
    // which exposes destroy() — confirm against the helper.
    await pdfJsDoc.destroy();
  }
}
|
||||
312
src/js/utils/image-effects.ts
Normal file
312
src/js/utils/image-effects.ts
Normal file
@@ -0,0 +1,312 @@
|
||||
import type { ScanSettings } from '../types/scanner-effect-type.js';
|
||||
import type { AdjustColorsSettings } from '../types/adjust-colors-type.js';
|
||||
|
||||
/**
 * Converts an image to greyscale in place.
 *
 * Each pixel's red, green and blue bytes are replaced by the rounded
 * weighted sum 0.299·R + 0.587·G + 0.114·B; alpha is left untouched.
 *
 * @param imageData - RGBA pixel buffer to modify.
 */
export function applyGreyscale(imageData: ImageData): void {
  const pixels = imageData.data;
  let offset = 0;

  while (offset < pixels.length) {
    const luma = Math.round(
      0.299 * pixels[offset] +
        0.587 * pixels[offset + 1] +
        0.114 * pixels[offset + 2]
    );
    // Write the same value to the three colour bytes, skipping alpha.
    pixels.fill(luma, offset, offset + 3);
    offset += 4;
  }
}
|
||||
|
||||
/**
 * Inverts an image's colours in place.
 *
 * Every red, green and blue byte `v` becomes `255 - v`; alpha is preserved.
 *
 * @param imageData - RGBA pixel buffer to modify.
 */
export function applyInvertColors(imageData: ImageData): void {
  const channels = imageData.data;
  const total = channels.length;

  for (let pixel = 0; pixel < total; pixel += 4) {
    // Only the first three bytes of each 4-byte pixel are colour.
    for (let c = 0; c < 3; c += 1) {
      channels[pixel + c] = 255 - channels[pixel + c];
    }
  }
}
|
||||
|
||||
/**
 * Converts an RGB colour to HSL.
 *
 * @param r - Red component on the 0–255 scale.
 * @param g - Green component on the 0–255 scale.
 * @param b - Blue component on the 0–255 scale.
 * @returns `[hue, saturation, lightness]`, each as a fraction (hue is a
 *   fraction of a full turn). Greys yield hue 0 and saturation 0.
 */
export function rgbToHsl(
  r: number,
  g: number,
  b: number
): [number, number, number] {
  const red = r / 255;
  const green = g / 255;
  const blue = b / 255;

  const hi = Math.max(red, green, blue);
  const lo = Math.min(red, green, blue);
  const lightness = (hi + lo) / 2;

  // All channels equal: achromatic, so hue and saturation stay at zero.
  if (hi === lo) {
    return [0, 0, lightness];
  }

  const chroma = hi - lo;
  const saturation =
    lightness > 0.5 ? chroma / (2 - hi - lo) : chroma / (hi + lo);

  // Position on the six-sector colour wheel, chosen by the dominant channel.
  let sector: number;
  if (hi === red) {
    sector = (green - blue) / chroma + (green < blue ? 6 : 0);
  } else if (hi === green) {
    sector = (blue - red) / chroma + 2;
  } else {
    sector = (red - green) / chroma + 4;
  }

  return [sector / 6, saturation, lightness];
}
|
||||
|
||||
/**
 * Converts an HSL colour to RGB.
 *
 * @param h - Hue as a fraction of a full turn.
 * @param s - Saturation as a fraction.
 * @param l - Lightness as a fraction.
 * @returns `[red, green, blue]`, each rounded to an integer on the 0–255
 *   scale. Zero saturation yields the same grey value in all three slots.
 */
export function hslToRgb(
  h: number,
  s: number,
  l: number
): [number, number, number] {
  if (s === 0) {
    const grey = Math.round(l * 255);
    return [grey, grey, grey];
  }

  const upper = l < 0.5 ? l * (1 + s) : l + s - l * s;
  const lower = 2 * l - upper;

  // Evaluates one channel from the hue shifted by the given offset.
  const channel = (shift: number): number => {
    let t = h + shift;
    if (t < 0) t += 1;
    if (t > 1) t -= 1;

    let level = lower;
    if (t < 1 / 6) {
      level = lower + (upper - lower) * 6 * t;
    } else if (t < 1 / 2) {
      level = upper;
    } else if (t < 2 / 3) {
      level = lower + (upper - lower) * (2 / 3 - t) * 6;
    }
    return Math.round(level * 255);
  };

  return [channel(1 / 3), channel(0), channel(-1 / 3)];
}
|
||||
|
||||
/**
 * Renders a "scanned document" look of `sourceData` onto `canvas`.
 *
 * Stages, in order: optional blur, per-pixel adjustments (greyscale,
 * brightness, contrast, yellow tint, random noise), optional darkened edge
 * gradients, then either a straight copy or a rotated copy on a white
 * background. The target canvas is resized to the output; the source pixel
 * data is not modified.
 *
 * @param sourceData - Pixels to process.
 * @param canvas - Destination canvas; its size and content are overwritten.
 * @param settings - Effect parameters (blur, noise, contrast, brightness,
 *   yellowish, grayscale, border).
 * @param rotationAngle - Rotation in degrees; 0 means no rotation.
 * @param scale - Multiplier applied to the blur and noise settings.
 */
export function applyScannerEffect(
  sourceData: ImageData,
  canvas: HTMLCanvasElement,
  settings: ScanSettings,
  rotationAngle: number,
  scale: number = 1
): void {
  const outputCtx = canvas.getContext('2d')!;
  const { width: w, height: h } = sourceData;

  const blurPx = settings.blur * scale;
  const noiseAmplitude = settings.noise * scale;
  const blurEnabled = blurPx > 0;

  // Off-screen working surface that all effects are applied to.
  const stage = document.createElement('canvas');
  stage.width = w;
  stage.height = h;
  const stageCtx = stage.getContext('2d')!;

  if (blurEnabled) {
    stageCtx.filter = `blur(${blurPx}px)`;
  }
  stageCtx.putImageData(sourceData, 0, 0);

  if (blurEnabled) {
    // Blur by drawing the stage through a filtered scratch canvas, then copy
    // the blurred result back with the filter switched off.
    const scratch = document.createElement('canvas');
    scratch.width = w;
    scratch.height = h;
    const scratchCtx = scratch.getContext('2d')!;
    scratchCtx.filter = `blur(${blurPx}px)`;
    scratchCtx.drawImage(stage, 0, 0);

    stageCtx.filter = 'none';
    stageCtx.clearRect(0, 0, w, h);
    stageCtx.drawImage(scratch, 0, 0);
  }

  const frame = stageCtx.getImageData(0, 0, w, h);
  const px = frame.data;

  const contrastFactor =
    settings.contrast !== 0
      ? (259 * (settings.contrast + 255)) / (255 * (259 - settings.contrast))
      : 1;

  const clampByte = (value: number): number =>
    Math.max(0, Math.min(255, value));
  // Scales a channel's distance from mid-grey (128) by the contrast factor.
  const stretch = (value: number): number =>
    contrastFactor * (value - 128) + 128;

  for (let p = 0; p < px.length; p += 4) {
    let red = px[p];
    let green = px[p + 1];
    let blue = px[p + 2];

    if (settings.grayscale) {
      const luma = Math.round(0.299 * red + 0.587 * green + 0.114 * blue);
      red = luma;
      green = luma;
      blue = luma;
    }

    if (settings.brightness !== 0) {
      red += settings.brightness;
      green += settings.brightness;
      blue += settings.brightness;
    }

    if (settings.contrast !== 0) {
      red = stretch(red);
      green = stretch(green);
      blue = stretch(blue);
    }

    if (settings.yellowish > 0) {
      const strength = settings.yellowish / 50;
      red += 20 * strength;
      green += 12 * strength;
      blue -= 15 * strength;
    }

    if (noiseAmplitude > 0) {
      // One random offset per pixel, shared by all three channels.
      const jitter = (Math.random() - 0.5) * noiseAmplitude;
      red += jitter;
      green += jitter;
      blue += jitter;
    }

    px[p] = clampByte(red);
    px[p + 1] = clampByte(green);
    px[p + 2] = clampByte(blue);
  }

  stageCtx.putImageData(frame, 0, 0);

  if (settings.border) {
    const borderSize = Math.max(w, h) * 0.02;

    // Each edge: gradient line (x0, y0, x1, y1) running from the edge inwards,
    // and the strip (x, y, width, height) it is painted over.
    const edges: Array<{
      line: [number, number, number, number];
      strip: [number, number, number, number];
    }> = [
      { line: [0, 0, borderSize, 0], strip: [0, 0, borderSize, h] },
      {
        line: [w, 0, w - borderSize, 0],
        strip: [w - borderSize, 0, borderSize, h],
      },
      { line: [0, 0, 0, borderSize], strip: [0, 0, w, borderSize] },
      {
        line: [0, h, 0, h - borderSize],
        strip: [0, h - borderSize, w, borderSize],
      },
    ];

    for (const { line, strip } of edges) {
      const shade = stageCtx.createLinearGradient(...line);
      shade.addColorStop(0, 'rgba(0,0,0,0.3)');
      shade.addColorStop(1, 'rgba(0,0,0,0)');
      stageCtx.fillStyle = shade;
      stageCtx.fillRect(...strip);
    }
  }

  if (rotationAngle === 0) {
    canvas.width = w;
    canvas.height = h;
    outputCtx.drawImage(stage, 0, 0);
    return;
  }

  // Size the output to the bounding box of the rotated image.
  const rad = (rotationAngle * Math.PI) / 180;
  const absCos = Math.abs(Math.cos(rad));
  const absSin = Math.abs(Math.sin(rad));
  const rotatedW = Math.ceil(w * absCos + h * absSin);
  const rotatedH = Math.ceil(w * absSin + h * absCos);

  canvas.width = rotatedW;
  canvas.height = rotatedH;
  outputCtx.fillStyle = '#ffffff';
  outputCtx.fillRect(0, 0, rotatedW, rotatedH);
  outputCtx.translate(rotatedW / 2, rotatedH / 2);
  outputCtx.rotate(rad);
  outputCtx.drawImage(stage, -w / 2, -h / 2);
  outputCtx.setTransform(1, 0, 0, 1, 0, 0);
}
|
||||
|
||||
/**
 * Applies colour adjustments to a copy of `sourceData` and paints the result
 * onto `canvas`, which is resized to match the source.
 *
 * Per pixel, in order: brightness, contrast, hue shift / saturation (via
 * HSL), temperature, tint, gamma, sepia. Alpha is copied unchanged and the
 * source pixel data is not modified.
 *
 * @param sourceData - Pixels to adjust.
 * @param canvas - Destination canvas; its size and content are overwritten.
 * @param settings - Adjustment amounts; a value of 0 (or 1.0 for gamma)
 *   disables the corresponding step.
 */
export function applyColorAdjustments(
  sourceData: ImageData,
  canvas: HTMLCanvasElement,
  settings: AdjustColorsSettings
): void {
  const outputCtx = canvas.getContext('2d')!;
  const { width: w, height: h } = sourceData;

  canvas.width = w;
  canvas.height = h;

  // Work on a private copy so the caller's pixel data stays intact.
  const frame = new ImageData(new Uint8ClampedArray(sourceData.data), w, h);
  const px = frame.data;

  const contrastFactor =
    settings.contrast !== 0
      ? (259 * (settings.contrast + 255)) / (255 * (259 - settings.contrast))
      : 1;
  const gammaExponent = settings.gamma !== 1.0 ? 1 / settings.gamma : 1;
  const sepiaMix = settings.sepia / 100;

  const clampByte = (value: number): number =>
    Math.max(0, Math.min(255, value));
  const stretch = (value: number): number =>
    contrastFactor * (value - 128) + 128;
  const gammaCurve = (value: number): number =>
    Math.pow(clampByte(value) / 255, gammaExponent) * 255;

  for (let p = 0; p < px.length; p += 4) {
    let red = px[p];
    let green = px[p + 1];
    let blue = px[p + 2];

    if (settings.brightness !== 0) {
      const offset = settings.brightness * 2.55;
      red += offset;
      green += offset;
      blue += offset;
    }

    if (settings.contrast !== 0) {
      red = stretch(red);
      green = stretch(green);
      blue = stretch(blue);
    }

    if (settings.saturation !== 0 || settings.hueShift !== 0) {
      const hsl = rgbToHsl(clampByte(red), clampByte(green), clampByte(blue));
      let hue = hsl[0];
      let sat = hsl[1];
      const lig = hsl[2];

      if (settings.hueShift !== 0) {
        // Hue is a fraction of a turn; wrap it back into [0, 1).
        hue = (hue + settings.hueShift / 360) % 1;
        if (hue < 0) hue += 1;
      }

      if (settings.saturation !== 0) {
        const amount = settings.saturation / 100;
        // Positive amounts move towards full saturation, negative ones scale down.
        sat = amount > 0 ? sat + (1 - sat) * amount : sat * (1 + amount);
        sat = Math.max(0, Math.min(1, sat));
      }

      [red, green, blue] = hslToRgb(hue, sat, lig);
    }

    if (settings.temperature !== 0) {
      const warmth = settings.temperature / 50;
      red += 30 * warmth;
      blue -= 30 * warmth;
    }

    if (settings.tint !== 0) {
      const shift = settings.tint / 50;
      green += 30 * shift;
    }

    if (settings.gamma !== 1.0) {
      red = gammaCurve(red);
      green = gammaCurve(green);
      blue = gammaCurve(blue);
    }

    if (settings.sepia > 0) {
      // Compute all three sepia targets from the pre-blend values.
      const sepiaR = 0.393 * red + 0.769 * green + 0.189 * blue;
      const sepiaG = 0.349 * red + 0.686 * green + 0.168 * blue;
      const sepiaB = 0.272 * red + 0.534 * green + 0.131 * blue;
      red = red + (sepiaR - red) * sepiaMix;
      green = green + (sepiaG - green) * sepiaMix;
      blue = blue + (sepiaB - blue) * sepiaMix;
    }

    px[p] = clampByte(red);
    px[p + 1] = clampByte(green);
    px[p + 2] = clampByte(blue);
  }

  outputCtx.putImageData(frame, 0, 0);
}
|
||||
304
src/js/utils/ocr.ts
Normal file
304
src/js/utils/ocr.ts
Normal file
@@ -0,0 +1,304 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
|
||||
import fontkit from '@pdf-lib/fontkit';
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import { getFontForLanguage } from './font-loader.js';
|
||||
import { OcrPage, OcrLine } from '@/types';
|
||||
import {
|
||||
parseHocrDocument,
|
||||
calculateWordTransform,
|
||||
calculateSpaceTransform,
|
||||
} from './hocr-transform.js';
|
||||
import { getPDFDocument } from './helpers.js';
|
||||
|
||||
/** Settings accepted by `performOcr`. */
export interface OcrOptions {
  /**
   * Tesseract language code(s). Several codes may be joined with `+`
   * (for example `eng+hin`); the string is also split on `+` to choose fonts.
   */
  language: string;

  /** Render scale passed to the page viewport before recognition. */
  resolution: number;

  /** When true, each rendered page is thresholded to black/white before recognition. */
  binarize: boolean;

  /** Characters to restrict recognition to; an empty string disables the restriction. */
  whitelist: string;

  /** Optional progress callback, invoked with a status label and a progress value. */
  onProgress?: (status: string, progress: number) => void;
}
|
||||
|
||||
/** Output of `performOcr`. */
export interface OcrResult {
  /** Serialised bytes of the generated PDF. */
  pdfBytes: Uint8Array;

  /** The in-memory document that `pdfBytes` was saved from. */
  pdfDoc: PDFDocument;

  /** Recognised text of all pages, each page followed by a blank line. */
  fullText: string;
}
|
||||
|
||||
/**
 * Thresholds the whole canvas to pure black and white, in place.
 *
 * A pixel whose weighted brightness (0.299·R + 0.587·G + 0.114·B) exceeds
 * 128 becomes white, otherwise black. Alpha is left as it was.
 *
 * @param ctx - 2D context of the canvas to process.
 */
function binarizeCanvas(ctx: CanvasRenderingContext2D) {
  const { width, height } = ctx.canvas;
  const frame = ctx.getImageData(0, 0, width, height);
  const px = frame.data;

  for (let p = 0; p < px.length; p += 4) {
    const luma = 0.299 * px[p] + 0.587 * px[p + 1] + 0.114 * px[p + 2];
    const tone = luma > 128 ? 255 : 0;
    px[p] = tone;
    px[p + 1] = tone;
    px[p + 2] = tone;
  }

  ctx.putImageData(frame, 0, 0);
}
|
||||
|
||||
/**
 * Draws the recognised words of one OCR page onto a PDF page as fully
 * transparent text (opacity 0).
 *
 * For each word, control and bidirectional-marker characters are stripped, a
 * font is chosen (the primary font when the text contains any non-ASCII
 * character, the Latin font otherwise) and the word is positioned with
 * `calculateWordTransform`. When the line requests injected word breaks, a
 * transparent space is drawn between consecutive words. Drawing failures are
 * logged and skipped, never thrown.
 *
 * @param page - Target PDF page.
 * @param ocrPage - Parsed OCR result for that page.
 * @param pageHeight - Page height forwarded to the transform helpers.
 * @param primaryFont - Font used for text containing non-ASCII characters.
 * @param latinFont - Font used for ASCII-only text.
 */
function drawOcrTextLayer(
  page: ReturnType<typeof PDFDocument.prototype.addPage>,
  ocrPage: OcrPage,
  pageHeight: number,
  primaryFont: PDFFont,
  latinFont: PDFFont
): void {
  const strippedChars =
    /[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g;
  const beyondAscii = /[^\u0000-\u007F]/;
  const lines: OcrLine[] = ocrPage.lines;

  for (const line of lines) {
    const { words } = line;

    for (const [index, word] of words.entries()) {
      const text = word.text.replace(strippedChars, '');
      if (text.trim().length === 0) continue;

      const font = beyondAscii.test(text) ? primaryFont : latinFont;
      if (!font) {
        console.warn(`Font not available for text: "${text}"`);
        continue;
      }

      // Width lookup that reports 0 when the font cannot measure the text.
      const measure = (sample: string, size: number): number => {
        try {
          return font.widthOfTextAtSize(sample, size);
        } catch {
          return 0;
        }
      };

      const placement = calculateWordTransform(
        word,
        line,
        pageHeight,
        measure
      );
      if (placement.fontSize <= 0) continue;

      try {
        page.drawText(text, {
          x: placement.x,
          y: placement.y,
          font,
          size: placement.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch (error) {
        console.warn(`Could not draw text "${text}":`, error);
      }

      const isLastWord = index >= words.length - 1;
      if (!line.injectWordBreaks || isLastWord) continue;

      const gap = calculateSpaceTransform(
        word,
        words[index + 1],
        line,
        pageHeight,
        (size: number) => measure(' ', size)
      );
      // Skip missing transforms and gaps too narrow to hold a space.
      if (!gap || !(gap.horizontalScale > 0.1)) continue;

      try {
        page.drawText(' ', {
          x: gap.x,
          y: gap.y,
          font,
          size: gap.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch {
        console.warn(`Could not draw space between words`);
      }
    }
  }
}
|
||||
|
||||
/**
 * Runs OCR over every page of a PDF and builds a new PDF in which each page
 * is the rendered page image with an invisible text layer on top.
 *
 * Steps: create a Tesseract worker for `language`, configure it (hOCR
 * output, automatic page segmentation, optional character whitelist), load
 * the source PDF, embed fonts chosen from the selected languages (falling
 * back to Helvetica if font loading fails), then for each page render,
 * optionally binarise, recognise, embed the page image as PNG and draw the
 * hOCR text layer.
 *
 * The worker is terminated on every path once it has been created, including
 * when configuration, PDF loading or font embedding fails.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param options - Language, render scale, binarisation, whitelist and an
 *   optional progress callback.
 * @returns The saved PDF bytes, the in-memory document and the concatenated
 *   recognised text (each page followed by a blank line).
 * @throws Error when a canvas context or page image cannot be created;
 *   errors from Tesseract, PDF loading and saving propagate unchanged.
 */
export async function performOcr(
  pdfBytes: Uint8Array | ArrayBuffer,
  options: OcrOptions
): Promise<OcrResult> {
  const { language, resolution, binarize, whitelist, onProgress } = options;
  const progress = onProgress || (() => {});

  // Created before the worker so nothing needs cleaning up if this fails.
  const newPdfDoc = await PDFDocument.create();
  newPdfDoc.registerFontkit(fontkit);

  const worker = await Tesseract.createWorker(language, 1, {
    logger: function (m: { status: string; progress: number }) {
      progress(m.status, m.progress || 0);
    },
  });

  let fullText = '';

  // Everything that can fail after the worker exists lives inside this
  // try block so the worker is always terminated.
  try {
    await worker.setParameters({
      tessjs_create_hocr: '1',
      tessedit_pageseg_mode: Tesseract.PSM.AUTO,
    });

    if (whitelist) {
      await worker.setParameters({
        tessedit_char_whitelist: whitelist,
      });
    }

    const pdf = await getPDFDocument({ data: pdfBytes }).promise;

    progress('Loading fonts...', 0);

    const selectedLangs = language.split('+');
    const cjkLangs = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
    const indicLangs = [
      'hin',
      'ben',
      'guj',
      'kan',
      'mal',
      'ori',
      'pan',
      'tam',
      'tel',
      'sin',
    ];
    const priorityLangs = [...cjkLangs, ...indicLangs, 'ara', 'rus', 'ukr'];

    // The first selected language that has a dedicated script font wins.
    const primaryLang =
      selectedLangs.find((l) => priorityLangs.includes(l)) ||
      selectedLangs[0] ||
      'eng';

    const hasCJK = selectedLangs.some((l) => cjkLangs.includes(l));
    const hasIndic = selectedLangs.some((l) => indicLangs.includes(l));
    const hasLatin =
      selectedLangs.some((l) => !priorityLangs.includes(l)) ||
      selectedLangs.includes('eng');
    const isIndicPlusLatin = hasIndic && hasLatin && !hasCJK;

    let primaryFont: PDFFont;
    let latinFont: PDFFont;

    try {
      if (isIndicPlusLatin) {
        // Mixed Indic + Latin input gets a separate font for each script.
        const [scriptFontBytes, latinFontBytes] = await Promise.all([
          getFontForLanguage(primaryLang),
          getFontForLanguage('eng'),
        ]);
        primaryFont = await newPdfDoc.embedFont(scriptFontBytes, {
          subset: false,
        });
        latinFont = await newPdfDoc.embedFont(latinFontBytes, {
          subset: false,
        });
      } else {
        const fontBytes = await getFontForLanguage(primaryLang);
        primaryFont = await newPdfDoc.embedFont(fontBytes, { subset: false });
        latinFont = primaryFont;
      }
    } catch (e) {
      console.error('Font loading failed, falling back to Helvetica', e);
      primaryFont = await newPdfDoc.embedFont(StandardFonts.Helvetica);
      latinFont = primaryFont;
    }

    for (let i = 1; i <= pdf.numPages; i++) {
      progress(
        `Processing page ${i} of ${pdf.numPages}`,
        (i - 1) / pdf.numPages
      );

      const page = await pdf.getPage(i);
      const viewport = page.getViewport({ scale: resolution });

      const canvas = document.createElement('canvas');
      canvas.width = viewport.width;
      canvas.height = viewport.height;
      const context = canvas.getContext('2d');
      if (!context) throw new Error('Failed to create canvas context');

      await page.render({ canvasContext: context, viewport, canvas }).promise;

      if (binarize) {
        binarizeCanvas(context);
      }

      const result = await worker.recognize(
        canvas,
        {},
        { text: true, hocr: true }
      );
      const data = result.data;

      const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);

      const pngImageBytes = await new Promise<Uint8Array>(function (
        resolve,
        reject
      ) {
        canvas.toBlob(function (blob) {
          if (!blob) {
            reject(new Error('Failed to create image blob'));
            return;
          }
          const reader = new FileReader();
          reader.onload = function () {
            resolve(new Uint8Array(reader.result as ArrayBuffer));
          };
          reader.onerror = function () {
            reject(new Error('Failed to read image data'));
          };
          reader.readAsArrayBuffer(blob);
        }, 'image/png');
      });

      // Release canvas memory
      canvas.width = 0;
      canvas.height = 0;

      const pngImage = await newPdfDoc.embedPng(pngImageBytes);
      newPage.drawImage(pngImage, {
        x: 0,
        y: 0,
        width: viewport.width,
        height: viewport.height,
      });

      if (data.hocr) {
        const ocrPage = parseHocrDocument(data.hocr);
        drawOcrTextLayer(
          newPage,
          ocrPage,
          viewport.height,
          primaryFont,
          latinFont
        );
      }

      fullText += data.text + '\n\n';
    }
  } finally {
    await worker.terminate();
  }

  const savedBytes = await newPdfDoc.save();

  return {
    pdfBytes: new Uint8Array(savedBytes),
    pdfDoc: newPdfDoc,
    fullText,
  };
}
|
||||
382
src/js/utils/pdf-operations.ts
Normal file
382
src/js/utils/pdf-operations.ts
Normal file
@@ -0,0 +1,382 @@
|
||||
import { PDFDocument, degrees, rgb, StandardFonts } from 'pdf-lib';
|
||||
|
||||
/**
 * Concatenates several PDFs into one.
 *
 * Documents are appended in list order and, within each document, pages keep
 * their original order.
 *
 * @param pdfBytesList - Bytes of the PDFs to merge.
 * @returns Bytes of the merged PDF.
 */
export async function mergePdfs(
  pdfBytesList: Uint8Array[]
): Promise<Uint8Array> {
  const output = await PDFDocument.create();

  for (const bytes of pdfBytesList) {
    const source = await PDFDocument.load(bytes);
    const everyIndex = source.getPageIndices();
    const pages = await output.copyPages(source, everyIndex);
    for (const page of pages) {
      output.addPage(page);
    }
  }

  const saved = await output.save();
  return new Uint8Array(saved);
}
|
||||
|
||||
/**
 * Builds a new PDF containing only the selected pages of the source.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param pageIndices - Zero-based indices of the pages to keep, in output order.
 * @returns Bytes of the new PDF.
 */
export async function splitPdf(
  pdfBytes: Uint8Array,
  pageIndices: number[]
): Promise<Uint8Array> {
  const source = await PDFDocument.load(pdfBytes);
  const output = await PDFDocument.create();

  const selected = await output.copyPages(source, pageIndices);
  for (const page of selected) {
    output.addPage(page);
  }

  const saved = await output.save();
  return new Uint8Array(saved);
}
|
||||
|
||||
/**
 * Rotates every page of a PDF by the same angle.
 *
 * The angle is added to each page's existing rotation. When the sum is a
 * multiple of 90 the page is copied and its rotation set directly; otherwise
 * the page is embedded and drawn rotated, centred on a new page sized to the
 * rotated bounding box.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param angle - Rotation to add to every page, in degrees.
 * @returns Bytes of the rotated PDF.
 */
export async function rotatePdfUniform(
  pdfBytes: Uint8Array,
  angle: number
): Promise<Uint8Array> {
  const source = await PDFDocument.load(pdfBytes);
  const output = await PDFDocument.create();
  const total = source.getPageCount();

  for (let index = 0; index < total; index += 1) {
    const sourcePage = source.getPage(index);
    const combined = sourcePage.getRotation().angle + angle;

    if (combined % 90 !== 0) {
      // Arbitrary angle: draw the embedded page rotated inside its bounding box.
      const embedded = await output.embedPage(sourcePage);
      const { width, height } = embedded.scale(1);
      const theta = (combined * Math.PI) / 180;
      const cosT = Math.cos(theta);
      const sinT = Math.sin(theta);
      const absCos = Math.abs(cosT);
      const absSin = Math.abs(sinT);

      const boxWidth = width * absCos + height * absSin;
      const boxHeight = width * absSin + height * absCos;
      const target = output.addPage([boxWidth, boxHeight]);

      // Offset the draw origin so the rotated page's centre lands on the
      // centre of the new page.
      target.drawPage(embedded, {
        x: boxWidth / 2 - ((width / 2) * cosT - (height / 2) * sinT),
        y: boxHeight / 2 - ((width / 2) * sinT + (height / 2) * cosT),
        width,
        height,
        rotate: degrees(combined),
      });
      continue;
    }

    // Right-angle rotation: copy the page and set its rotation.
    const [copy] = await output.copyPages(source, [index]);
    copy.setRotation(degrees(combined));
    output.addPage(copy);
  }

  const saved = await output.save();
  return new Uint8Array(saved);
}
|
||||
|
||||
/**
 * Rotates each page of a PDF by its own angle.
 *
 * `rotations[i]` is added to page `i`'s existing rotation; a missing or zero
 * entry adds nothing. When the sum is a multiple of 90 the page is copied and
 * its rotation set directly; otherwise the page is embedded and drawn
 * rotated, centred on a new page sized to the rotated bounding box.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param rotations - Per-page rotation in degrees, indexed by page.
 * @returns Bytes of the rotated PDF.
 */
export async function rotatePdfPages(
  pdfBytes: Uint8Array,
  rotations: number[]
): Promise<Uint8Array> {
  const source = await PDFDocument.load(pdfBytes);
  const output = await PDFDocument.create();
  const total = source.getPageCount();

  for (let index = 0; index < total; index += 1) {
    const requested = rotations[index] || 0;
    const sourcePage = source.getPage(index);
    const combined = sourcePage.getRotation().angle + requested;

    if (combined % 90 !== 0) {
      // Arbitrary angle: draw the embedded page rotated inside its bounding box.
      const embedded = await output.embedPage(sourcePage);
      const { width, height } = embedded.scale(1);
      const theta = (combined * Math.PI) / 180;
      const cosT = Math.cos(theta);
      const sinT = Math.sin(theta);
      const absCos = Math.abs(cosT);
      const absSin = Math.abs(sinT);

      const boxWidth = width * absCos + height * absSin;
      const boxHeight = width * absSin + height * absCos;
      const target = output.addPage([boxWidth, boxHeight]);

      // Offset the draw origin so the rotated page's centre lands on the
      // centre of the new page.
      target.drawPage(embedded, {
        x: boxWidth / 2 - ((width / 2) * cosT - (height / 2) * sinT),
        y: boxHeight / 2 - ((width / 2) * sinT + (height / 2) * cosT),
        width,
        height,
        rotate: degrees(combined),
      });
      continue;
    }

    // Right-angle rotation: copy the page and set its rotation.
    const [copy] = await output.copyPages(source, [index]);
    copy.setRotation(degrees(combined));
    output.addPage(copy);
  }

  const saved = await output.save();
  return new Uint8Array(saved);
}
|
||||
|
||||
/**
 * Produces a copy of a PDF without the given pages.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param pagesToDelete - One-based page numbers to remove.
 * @returns Bytes of the PDF containing the remaining pages in original order.
 * @throws Error `Cannot delete all pages` when no page would remain.
 */
export async function deletePdfPages(
  pdfBytes: Uint8Array,
  pagesToDelete: Set<number>
): Promise<Uint8Array> {
  const source = await PDFDocument.load(pdfBytes);
  const pageCount = source.getPageCount();

  // Zero-based indices of pages whose one-based number is not marked.
  const survivors = Array.from(
    { length: pageCount },
    (_, index) => index
  ).filter((index) => !pagesToDelete.has(index + 1));

  if (survivors.length === 0) {
    throw new Error('Cannot delete all pages');
  }

  const output = await PDFDocument.create();
  const kept = await output.copyPages(source, survivors);
  for (const page of kept) {
    output.addPage(page);
  }

  const saved = await output.save();
  return new Uint8Array(saved);
}
|
||||
|
||||
/**
 * Parses a page selection such as `"1-3, 5, 8-"` into zero-based page
 * indices.
 *
 * Rules:
 * - Parts are comma separated; surrounding whitespace is ignored.
 * - A single number selects that page if it lies in `1..totalPages`.
 * - `a-b` selects pages `a` through `b`, clamped to `1..totalPages`.
 * - An omitted bound is open-ended: `-b` starts at page 1, `a-` ends at the
 *   last page.
 * - A range with a bound that is present but not a number (for example
 *   `"a-b"` or `"2-x"`) is ignored instead of being widened to an open-ended
 *   range, and an explicit end of `0` selects nothing.
 *
 * @param rangeStr - The selection text with one-based page numbers.
 * @param totalPages - Number of pages in the document.
 * @returns Unique zero-based indices in ascending order.
 */
export function parsePageRange(rangeStr: string, totalPages: number): number[] {
  // Reads one side of "a-b": empty means open-ended, non-numeric means invalid.
  const readBound = (raw: string, openValue: number): number | null => {
    if (raw.trim() === '') return openValue;
    const parsed = parseInt(raw, 10);
    return Number.isNaN(parsed) ? null : parsed;
  };

  const indices = new Set<number>();

  for (const rawPart of rangeStr.split(',')) {
    const part = rawPart.trim();

    if (!part.includes('-')) {
      const page = parseInt(part, 10);
      if (page >= 1 && page <= totalPages) {
        indices.add(page - 1);
      }
      continue;
    }

    const [startStr = '', endStr = ''] = part.split('-');
    const first = readBound(startStr, 1);
    const last = readBound(endStr, totalPages);
    if (first === null || last === null) continue;

    const start = Math.max(1, first);
    const end = Math.min(totalPages, last);
    for (let page = start; page <= end; page++) {
      indices.add(page - 1);
    }
  }

  return Array.from(indices).sort((a, b) => a - b);
}
|
||||
|
||||
/**
 * Parses a page selection such as `"1-3, 5, 8-"` into the set of one-based
 * page numbers to delete.
 *
 * Rules:
 * - Parts are comma separated; surrounding whitespace is ignored.
 * - A single number selects that page if it lies in `1..totalPages`.
 * - `a-b` selects pages `a` through `b`, clamped to `1..totalPages`.
 * - An omitted bound is open-ended: `-b` starts at page 1, `a-` ends at the
 *   last page.
 * - A range with a bound that is present but not a number (for example
 *   `"a-b"` or `"2-x"`) is ignored, so a typo can no longer mark a whole run
 *   of pages for deletion; an explicit end of `0` selects nothing.
 *
 * @param str - The selection text with one-based page numbers.
 * @param totalPages - Number of pages in the document.
 * @returns One-based page numbers, in the order they were first selected.
 */
export function parseDeletePages(str: string, totalPages: number): Set<number> {
  // Reads one side of "a-b": empty means open-ended, non-numeric means invalid.
  const readBound = (raw: string, openValue: number): number | null => {
    if (raw.trim() === '') return openValue;
    const parsed = parseInt(raw, 10);
    return Number.isNaN(parsed) ? null : parsed;
  };

  const pages = new Set<number>();

  for (const rawPart of str.split(',')) {
    const part = rawPart.trim();

    if (!part.includes('-')) {
      const page = parseInt(part, 10);
      if (page >= 1 && page <= totalPages) pages.add(page);
      continue;
    }

    const [startStr = '', endStr = ''] = part.split('-');
    const first = readBound(startStr, 1);
    const last = readBound(endStr, totalPages);
    if (first === null || last === null) continue;

    const start = Math.max(1, first);
    const end = Math.min(totalPages, last);
    for (let page = start; page <= end; page++) pages.add(page);
  }

  return pages;
}
|
||||
|
||||
/** Settings for `addTextWatermark`. */
export interface TextWatermarkOptions {
  /** The watermark text drawn on every page. */
  text: string;

  /** Font size the text is measured and drawn at. */
  fontSize: number;

  /**
   * Text colour; the components are passed straight to pdf-lib's `rgb()`.
   * NOTE(review): presumably each component is expected in the 0–1 range — confirm with callers.
   */
  color: { r: number; g: number; b: number };

  /** Opacity passed to the draw call. */
  opacity: number;

  /** Rotation of the text in degrees. */
  angle: number;
}
|
||||
|
||||
/**
 * Stamps a text watermark, set in Helvetica, on every page of a PDF.
 *
 * The text is centred on the page at any rotation angle: the draw origin is
 * computed so that the midpoint of the text baseline sits at the page centre
 * after rotation. For an angle of 0 this is the same position as a plain
 * horizontal centring, `((width - textWidth) / 2, height / 2)`.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param options - Text, size, colour, opacity and rotation of the watermark.
 * @returns Bytes of the watermarked PDF.
 */
export async function addTextWatermark(
  pdfBytes: Uint8Array,
  options: TextWatermarkOptions
): Promise<Uint8Array> {
  const pdfDoc = await PDFDocument.load(pdfBytes);
  const font = await pdfDoc.embedFont(StandardFonts.Helvetica);

  // None of these depend on the page, so compute them once.
  const textWidth = font.widthOfTextAtSize(options.text, options.fontSize);
  const halfWidth = textWidth / 2;
  const theta = (options.angle * Math.PI) / 180;
  // The text is rotated about its start point, so the baseline midpoint ends
  // up this far from the draw origin.
  const midOffsetX = halfWidth * Math.cos(theta);
  const midOffsetY = halfWidth * Math.sin(theta);

  for (const page of pdfDoc.getPages()) {
    const { width, height } = page.getSize();

    page.drawText(options.text, {
      x: width / 2 - midOffsetX,
      y: height / 2 - midOffsetY,
      font,
      size: options.fontSize,
      color: rgb(options.color.r, options.color.g, options.color.b),
      opacity: options.opacity,
      rotate: degrees(options.angle),
    });
  }

  return new Uint8Array(await pdfDoc.save());
}
|
||||
|
||||
/** Settings consumed by `addImageWatermark`. */
export interface ImageWatermarkOptions {
  /** Raw bytes of the watermark image, embedded into the document. */
  imageBytes: Uint8Array;
  /** Selects the embedder: `'png'` uses `embedPng`, anything else `embedJpg`. */
  imageType: 'png' | 'jpg';
  /** Opacity forwarded to `drawImage`. */
  opacity: number;
  /** Rotation, wrapped with pdf-lib's `degrees()` before drawing. */
  angle: number;
  /** Multiplier applied to the embedded image's width and height. */
  scale: number;
}
|
||||
|
||||
/**
 * Stamps an image watermark on every page of a PDF.
 *
 * The image is scaled by `options.scale` and its centre is placed at the
 * centre of the page for any rotation angle. pdf-lib rotates an image about
 * its bottom-left corner (x, y), so that corner is offset by the rotated
 * half-diagonal. Offsetting by the unrotated half-size would only centre the
 * image when `angle === 0`. At angle 0 the placement is
 * `x = (width - imgWidth) / 2`, `y = (height - imgHeight) / 2`.
 *
 * @param pdfBytes - The source PDF.
 * @param options - Image data/type plus opacity, rotation and scale.
 * @returns The bytes of the watermarked PDF.
 */
export async function addImageWatermark(
  pdfBytes: Uint8Array,
  options: ImageWatermarkOptions
): Promise<Uint8Array> {
  const pdfDoc = await PDFDocument.load(pdfBytes);
  const image =
    options.imageType === 'png'
      ? await pdfDoc.embedPng(options.imageBytes)
      : await pdfDoc.embedJpg(options.imageBytes);

  // Loop-invariant geometry: scaled size and the rotated diagonal vector.
  const imgWidth = image.width * options.scale;
  const imgHeight = image.height * options.scale;
  const theta = (options.angle * Math.PI) / 180;
  const cosTheta = Math.cos(theta);
  const sinTheta = Math.sin(theta);
  // Vector from the bottom-left corner to the opposite corner after rotation.
  const diagonalX = imgWidth * cosTheta - imgHeight * sinTheta;
  const diagonalY = imgWidth * sinTheta + imgHeight * cosTheta;
  const rotate = degrees(options.angle);

  for (const page of pdfDoc.getPages()) {
    const { width, height } = page.getSize();

    page.drawImage(image, {
      // Corner = page centre minus half of the rotated diagonal.
      x: (width - diagonalX) / 2,
      y: (height - diagonalY) / 2,
      width: imgWidth,
      height: imgHeight,
      opacity: options.opacity,
      rotate,
    });
  }

  return new Uint8Array(await pdfDoc.save());
}
|
||||
|
||||
/** Where on the page the number is anchored: vertical edge, then horizontal alignment. */
export type PageNumberPosition =
  | 'bottom-center'
  | 'bottom-left'
  | 'bottom-right'
  | 'top-center'
  | 'top-left'
  | 'top-right';

/**
 * Label style: `'page_x_of_y'` renders `"<page> / <total>"`; `'simple'` renders
 * the page number alone.
 */
export type PageNumberFormat = 'simple' | 'page_x_of_y';

/** Settings consumed by `addPageNumbers`. */
export interface PageNumberOptions {
  /** Anchor corner or edge for the label. */
  position: PageNumberPosition;
  /** Font size used for measuring and for drawing the label. */
  fontSize: number;
  /** Label style. */
  format: PageNumberFormat;
  /**
   * Colour channels forwarded to pdf-lib's `rgb()`.
   * NOTE(review): pdf-lib expects each channel in the 0–1 range — confirm callers normalise.
   */
  color: {
    r: number;
    g: number;
    b: number;
  };
}
|
||||
|
||||
/**
 * Draws a page-number label on every page of a PDF using Helvetica.
 *
 * Placement is computed inside the page's crop box (falling back to the media
 * box). Each margin is 4% of the page extent clamped to 8–40 units, then
 * widened so the label cannot touch the edge. The final coordinates are
 * clamped to stay at least 3 units inside the box.
 *
 * @param pdfBytes - The source PDF.
 * @param options - Position, font size, label format and colour.
 * @returns The bytes of the numbered PDF.
 */
export async function addPageNumbers(
  pdfBytes: Uint8Array,
  options: PageNumberOptions
): Promise<Uint8Array> {
  const pdfDoc = await PDFDocument.load(pdfBytes);
  const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica);
  const pages = pdfDoc.getPages();
  const totalPages = pages.length;

  const minMargin = 8;
  const maxMargin = 40;
  const marginPercentage = 0.04;

  // Proportional margin for one page dimension, clamped to [minMargin, maxMargin].
  const marginFor = (extent: number): number =>
    Math.max(minMargin, Math.min(maxMargin, extent * marginPercentage));

  pages.forEach((page, index) => {
    const mediaBox = page.getMediaBox();
    const cropBox = page.getCropBox();
    const bounds = cropBox || mediaBox;
    const { width, height } = bounds;
    const xOffset = bounds.x || 0;
    const yOffset = bounds.y || 0;

    const ordinal = index + 1;
    const pageNumText =
      options.format === 'page_x_of_y'
        ? `${ordinal} / ${totalPages}`
        : `${ordinal}`;

    const textWidth = helveticaFont.widthOfTextAtSize(
      pageNumText,
      options.fontSize
    );
    const textHeight = options.fontSize;

    const safeHorizontalMargin = Math.max(marginFor(width), textWidth / 2 + 3);
    const safeVerticalMargin = Math.max(marginFor(height), textHeight + 3);

    // Candidate coordinates for each horizontal / vertical alignment.
    const leftX = safeHorizontalMargin + xOffset;
    const centerX =
      Math.max(
        safeHorizontalMargin,
        Math.min(
          width - safeHorizontalMargin - textWidth,
          (width - textWidth) / 2
        )
      ) + xOffset;
    const rightX =
      Math.max(
        safeHorizontalMargin,
        width - safeHorizontalMargin - textWidth
      ) + xOffset;
    const bottomY = safeVerticalMargin + yOffset;
    const topY = height - safeVerticalMargin - textHeight + yOffset;

    const anchors = new Map<string, readonly [number, number]>([
      ['bottom-center', [centerX, bottomY]],
      ['bottom-left', [leftX, bottomY]],
      ['bottom-right', [rightX, bottomY]],
      ['top-center', [centerX, topY]],
      ['top-left', [leftX, topY]],
      ['top-right', [rightX, topY]],
    ]);
    // An unrecognised position falls back to the origin, then gets clamped below.
    const [anchorX, anchorY] = anchors.get(options.position) ?? [0, 0];

    const x = Math.max(
      xOffset + 3,
      Math.min(xOffset + width - textWidth - 3, anchorX)
    );
    const y = Math.max(
      yOffset + 3,
      Math.min(yOffset + height - textHeight - 3, anchorY)
    );

    page.drawText(pageNumText, {
      x,
      y,
      font: helveticaFont,
      size: options.fontSize,
      color: rgb(options.color.r, options.color.g, options.color.b),
    });
  });

  return new Uint8Array(await pdfDoc.save());
}
|
||||
590
src/js/utils/sanitize.ts
Normal file
590
src/js/utils/sanitize.ts
Normal file
@@ -0,0 +1,590 @@
|
||||
import { PDFDocument, PDFName } from 'pdf-lib';
|
||||
|
||||
/** Switches selecting which clean-up passes `sanitizePdf` runs. */
export interface SanitizeOptions {
  /** Flatten the AcroForm; `sanitizePdf` drops the AcroForm entry if flattening fails. */
  flattenForms: boolean;
  /** Clear the Info dictionary and remove the catalog's Metadata and PieceInfo entries. */
  removeMetadata: boolean;
  /** Delete the `Annots` entry from every page. */
  removeAnnotations: boolean;
  /** Strip JavaScript name trees, open/additional actions and JavaScript annotation actions. */
  removeJavascript: boolean;
  /** Remove embedded-file name trees and FileAttachment annotations. */
  removeEmbeddedFiles: boolean;
  /** Remove optional-content (layer) properties. */
  removeLayers: boolean;
  /** Remove link annotations and named destinations. */
  removeLinks: boolean;
  /** Remove the structure tree root and per-page `StructParents`. */
  removeStructureTree: boolean;
  /** Remove the catalog's `MarkInfo` entry. */
  removeMarkInfo: boolean;
  /** Delete embedded font program streams from font descriptors. */
  removeFonts: boolean;
}

/** Default profile: every pass enabled except font removal. */
export const defaultSanitizeOptions: SanitizeOptions = {
  flattenForms: true,
  removeMetadata: true,
  removeAnnotations: true,
  removeJavascript: true,
  removeEmbeddedFiles: true,
  removeLayers: true,
  removeLinks: true,
  removeStructureTree: true,
  removeMarkInfo: true,
  removeFonts: false,
};
|
||||
|
||||
/**
 * Strips document-level metadata in place.
 *
 * Empties the Info dictionary, resets the standard fields to empty values, then
 * makes three independent best-effort removals: the catalog's `Metadata`
 * entry, the trailer `ID`, and the catalog's `PieceInfo` entry. A failure in
 * any of the three is logged with `console.warn` and does not stop the others.
 */
function removeMetadataFromDoc(pdfDoc: PDFDocument) {
  const infoDict = (pdfDoc as any).getInfoDict();
  for (const key of infoDict.keys()) {
    infoDict.delete(key);
  }

  pdfDoc.setTitle('');
  pdfDoc.setAuthor('');
  pdfDoc.setSubject('');
  pdfDoc.setKeywords([]);
  pdfDoc.setCreator('');
  pdfDoc.setProducer('');

  // Removes one catalog entry if present; logs instead of throwing.
  const dropCatalogEntry = (entry: string, failureMessage: string): void => {
    try {
      const catalogDict = (pdfDoc.catalog as any).dict;
      const entryName = PDFName.of(entry);
      if (catalogDict.has(entryName)) {
        catalogDict.delete(entryName);
      }
    } catch (e: any) {
      console.warn(failureMessage, e.message);
    }
  };

  dropCatalogEntry('Metadata', 'Could not remove XMP metadata:');

  try {
    const trailerInfo = (pdfDoc.context as any).trailerInfo;
    if (trailerInfo) {
      delete trailerInfo.ID;
    }
  } catch (e: any) {
    console.warn('Could not remove document IDs:', e.message);
  }

  dropCatalogEntry('PieceInfo', 'Could not remove PieceInfo:');
}
|
||||
|
||||
/**
 * Deletes the `Annots` entry from every page. A failure on one page is logged
 * with `console.warn` and the remaining pages are still processed.
 */
function removeAnnotationsFromDoc(pdfDoc: PDFDocument) {
  const annotsKey = PDFName.of('Annots');

  pdfDoc.getPages().forEach((page) => {
    try {
      page.node.delete(annotsKey);
    } catch (e: any) {
      console.warn('Could not remove annotations from page:', e.message);
    }
  });
}
|
||||
|
||||
/** Flattens the document's form via pdf-lib; any error propagates to the caller. */
function flattenFormsInDoc(pdfDoc: PDFDocument) {
  pdfDoc.getForm().flatten();
}
|
||||
|
||||
/**
 * Removes JavaScript hooks from the document in place.
 *
 * Passes, in order:
 * 1. Clears pdf-lib's pending `javaScripts` list.
 * 2. Drops `Names/JavaScript`, `OpenAction` and `AA` from the catalog.
 * 3. For every page, drops the page's `AA`. For each annotation, drops its `A`
 *    action only when that action's type is `JavaScript`, and always drops `AA`.
 * 4. For every top-level AcroForm field, drops both `A` and `AA`.
 *
 * Per-object failures are logged with `console.warn` and skipped.
 */
function removeJavascriptFromDoc(pdfDoc: PDFDocument) {
  const docInternals = pdfDoc as any;
  if (docInternals.javaScripts && docInternals.javaScripts.length > 0) {
    docInternals.javaScripts = [];
  }

  const actionKey = PDFName.of('A');
  const additionalActionsKey = PDFName.of('AA');

  // Deletes `key` from `dict` when the entry exists.
  const dropEntry = (dict: any, key: PDFName): void => {
    if (dict.has(key)) {
      dict.delete(key);
    }
  };

  const catalogDict = (pdfDoc.catalog as any).dict;

  const namesRef = catalogDict.get(PDFName.of('Names'));
  if (namesRef) {
    try {
      const namesDict = pdfDoc.context.lookup(namesRef) as any;
      dropEntry(namesDict, PDFName.of('JavaScript'));
    } catch (e: any) {
      console.warn('Could not access Names/JavaScript:', e.message);
    }
  }

  dropEntry(catalogDict, PDFName.of('OpenAction'));
  dropEntry(catalogDict, additionalActionsKey);

  for (const page of pdfDoc.getPages()) {
    try {
      const pageDict = page.node;
      dropEntry(pageDict, additionalActionsKey);

      const annotRefs = pageDict.Annots()?.asArray() ?? [];
      for (const annotRef of annotRefs) {
        try {
          const annot = pdfDoc.context.lookup(annotRef) as any;

          if (annot.has(actionKey)) {
            const actionRef = annot.get(actionKey);
            try {
              const actionDict = pdfDoc.context.lookup(actionRef) as any;
              // PDF names stringify with a leading slash; strip it to compare.
              const actionType = actionDict
                .get(PDFName.of('S'))
                ?.toString()
                .substring(1);
              if (actionType === 'JavaScript') {
                annot.delete(actionKey);
              }
            } catch (e: any) {
              console.warn('Could not read action:', e.message);
            }
          }

          dropEntry(annot, additionalActionsKey);
        } catch (e: any) {
          console.warn('Could not process annotation for JS:', e.message);
        }
      }
    } catch (e: any) {
      console.warn('Could not remove page actions:', e.message);
    }
  }

  try {
    const acroFormRef = catalogDict.get(PDFName.of('AcroForm'));
    if (acroFormRef) {
      const acroFormDict = pdfDoc.context.lookup(acroFormRef) as any;
      const fieldsRef = acroFormDict.get(PDFName.of('Fields'));

      if (fieldsRef) {
        const fieldsArray = pdfDoc.context.lookup(fieldsRef) as any;

        for (const fieldRef of fieldsArray.asArray()) {
          try {
            const field = pdfDoc.context.lookup(fieldRef) as any;
            dropEntry(field, actionKey);
            dropEntry(field, additionalActionsKey);
          } catch (e: any) {
            console.warn('Could not process field for JS:', e.message);
          }
        }
      }
    }
  } catch (e: any) {
    console.warn('Could not process form fields for JS:', e.message);
  }
}
|
||||
|
||||
/**
 * Removes embedded files from the document in place.
 *
 * Drops `Names/EmbeddedFiles` and the catalog's `EmbeddedFiles` entry, filters
 * `FileAttachment` annotations out of every page, clears pdf-lib's pending
 * `embeddedFiles` list, and finally drops the catalog's `Collection` entry.
 * Annotations that cannot be inspected are kept.
 */
function removeEmbeddedFilesFromDoc(pdfDoc: PDFDocument) {
  const catalogDict = (pdfDoc.catalog as any).dict;
  const embeddedFilesKey = PDFName.of('EmbeddedFiles');
  const annotsKey = PDFName.of('Annots');

  const namesRef = catalogDict.get(PDFName.of('Names'));
  if (namesRef) {
    try {
      const namesDict = pdfDoc.context.lookup(namesRef) as any;
      if (namesDict.has(embeddedFilesKey)) {
        namesDict.delete(embeddedFilesKey);
      }
    } catch (e: any) {
      console.warn('Could not access Names/EmbeddedFiles:', e.message);
    }
  }

  if (catalogDict.has(embeddedFilesKey)) {
    catalogDict.delete(embeddedFilesKey);
  }

  // True when the annotation should survive (anything but a FileAttachment).
  const shouldKeep = (ref: any): boolean => {
    try {
      const annot = pdfDoc.context.lookup(ref) as any;
      const subtype = annot
        .get(PDFName.of('Subtype'))
        ?.toString()
        .substring(1);
      return subtype !== 'FileAttachment';
    } catch (e) {
      // Unreadable annotation: err on the side of keeping it.
      return true;
    }
  };

  for (const page of pdfDoc.getPages()) {
    try {
      const annotRefs = page.node.Annots()?.asArray() ?? [];
      const annotsToKeep = annotRefs.filter(shouldKeep);

      if (annotsToKeep.length === annotRefs.length) continue;

      if (annotsToKeep.length > 0) {
        page.node.set(annotsKey, pdfDoc.context.obj(annotsToKeep));
      } else {
        page.node.delete(annotsKey);
      }
    } catch (pageError: any) {
      console.warn(
        `Could not process page for attachments: ${pageError.message}`
      );
    }
  }

  const docInternals = pdfDoc as any;
  if (docInternals.embeddedFiles && docInternals.embeddedFiles.length > 0) {
    docInternals.embeddedFiles = [];
  }

  const collectionKey = PDFName.of('Collection');
  if (catalogDict.has(collectionKey)) {
    catalogDict.delete(collectionKey);
  }
}
|
||||
|
||||
/**
 * Removes optional-content (layer) data in place: the catalog's
 * `OCProperties`, each page's `OCProperties`, and the `Properties` entry of
 * each page's `Resources`. Per-page failures are logged and skipped.
 */
function removeLayersFromDoc(pdfDoc: PDFDocument) {
  const ocPropertiesKey = PDFName.of('OCProperties');
  const catalogDict = (pdfDoc.catalog as any).dict;

  if (catalogDict.has(ocPropertiesKey)) {
    catalogDict.delete(ocPropertiesKey);
  }

  pdfDoc.getPages().forEach((page) => {
    try {
      const pageDict = page.node;

      if (pageDict.has(ocPropertiesKey)) {
        pageDict.delete(ocPropertiesKey);
      }

      const resourcesRef = pageDict.get(PDFName.of('Resources'));
      if (!resourcesRef) return;

      try {
        const resourcesDict = pdfDoc.context.lookup(resourcesRef) as any;
        const propertiesKey = PDFName.of('Properties');
        if (resourcesDict.has(propertiesKey)) {
          resourcesDict.delete(propertiesKey);
        }
      } catch (e: any) {
        console.warn('Could not access Resources:', e.message);
      }
    } catch (e: any) {
      console.warn('Could not remove page layers:', e.message);
    }
  });
}
|
||||
|
||||
/**
 * Removes link-like annotations and named destinations in place.
 *
 * An annotation counts as a link when its subtype is `Link`, when its `A`
 * action is of type `URI`, `Launch`, `GoTo` or `GoToR`, or when it carries a
 * `Dest` entry. Annotations that cannot be inspected are kept. Afterwards the
 * `Dests` entries under `Names` and on the catalog are dropped.
 */
function removeLinksFromDoc(pdfDoc: PDFDocument) {
  const annotsKey = PDFName.of('Annots');
  const destsKey = PDFName.of('Dests');
  const linkActionTypes = new Set(['URI', 'Launch', 'GoTo', 'GoToR']);

  // Reads a name-valued entry and strips the leading slash of its string form.
  const nameOf = (dict: any, key: string): string | undefined =>
    dict.get(PDFName.of(key))?.toString().substring(1);

  // Classifies a resolved annotation dictionary as link / not link.
  const isLinkAnnotation = (annot: any): boolean => {
    if (nameOf(annot, 'Subtype') === 'Link') return true;

    let hasLinkAction = false;
    const actionRef = annot.get(PDFName.of('A'));
    if (actionRef) {
      try {
        const actionDict = pdfDoc.context.lookup(actionRef) as any;
        hasLinkAction = linkActionTypes.has(nameOf(actionDict, 'S') as string);
      } catch (e: any) {
        console.warn('Could not read action:', e.message);
      }
    }

    const dest = annot.get(PDFName.of('Dest'));
    return hasLinkAction || Boolean(dest);
  };

  const pages = pdfDoc.getPages();
  pages.forEach((page, pageIndex) => {
    try {
      const pageDict = page.node;

      const annotsRef = pageDict.get(annotsKey);
      if (!annotsRef) return;

      const annotsArray = pdfDoc.context.lookup(annotsRef) as any;
      const annotRefs = annotsArray.asArray();
      if (annotRefs.length === 0) return;

      const annotsToKeep = [];
      for (const ref of annotRefs) {
        try {
          const annot = pdfDoc.context.lookup(ref) as any;
          if (!isLinkAnnotation(annot)) {
            annotsToKeep.push(ref);
          }
        } catch (e: any) {
          console.warn('Could not process annotation:', e.message);
          annotsToKeep.push(ref);
        }
      }

      // Nothing was filtered out: leave the page's Annots untouched.
      if (annotsToKeep.length === annotRefs.length) return;

      if (annotsToKeep.length > 0) {
        pageDict.set(annotsKey, pdfDoc.context.obj(annotsToKeep));
      } else {
        pageDict.delete(annotsKey);
      }
    } catch (pageError: any) {
      console.warn(
        `Could not process page ${pageIndex + 1} for links: ${pageError.message}`
      );
    }
  });

  try {
    const catalogDict = (pdfDoc.catalog as any).dict;

    const namesRef = catalogDict.get(PDFName.of('Names'));
    if (namesRef) {
      try {
        const namesDict = pdfDoc.context.lookup(namesRef) as any;
        if (namesDict.has(destsKey)) {
          namesDict.delete(destsKey);
        }
      } catch (e: any) {
        console.warn('Could not access Names/Dests:', e.message);
      }
    }

    if (catalogDict.has(destsKey)) {
      catalogDict.delete(destsKey);
    }
  } catch (e: any) {
    console.warn('Could not remove named destinations:', e.message);
  }
}
|
||||
|
||||
/**
 * Removes structure data in place: the catalog's `StructTreeRoot`, each page's
 * `StructParents`, and the catalog's `ParentTree` entry. Per-page failures are
 * logged and skipped.
 */
function removeStructureTreeFromDoc(pdfDoc: PDFDocument) {
  const catalogDict = (pdfDoc.catalog as any).dict;

  const structTreeRootKey = PDFName.of('StructTreeRoot');
  if (catalogDict.has(structTreeRootKey)) {
    catalogDict.delete(structTreeRootKey);
  }

  const structParentsKey = PDFName.of('StructParents');
  pdfDoc.getPages().forEach((page) => {
    try {
      const pageDict = page.node;
      if (pageDict.has(structParentsKey)) {
        pageDict.delete(structParentsKey);
      }
    } catch (e: any) {
      console.warn('Could not remove page StructParents:', e.message);
    }
  });

  const parentTreeKey = PDFName.of('ParentTree');
  if (catalogDict.has(parentTreeKey)) {
    catalogDict.delete(parentTreeKey);
  }
}
|
||||
|
||||
/** Drops the `MarkInfo` and `Marked` entries from the catalog when present. */
function removeMarkInfoFromDoc(pdfDoc: PDFDocument) {
  const catalogDict = (pdfDoc.catalog as any).dict;

  for (const entry of ['MarkInfo', 'Marked']) {
    const entryName = PDFName.of(entry);
    if (catalogDict.has(entryName)) {
      catalogDict.delete(entryName);
    }
  }
}
|
||||
|
||||
/**
 * Deletes embedded font programs in place.
 *
 * For every font listed in a page's `Resources/Font` dictionary, the
 * `FontFile`, `FontFile2` and `FontFile3` entries of its `FontDescriptor` are
 * removed. The font dictionaries themselves stay. pdf-lib's pending `fonts`
 * list is cleared at the end. Failures are logged at the level where they
 * occur and processing continues.
 */
function removeFontsFromDoc(pdfDoc: PDFDocument) {
  const fontFileKeys = ['FontFile', 'FontFile2', 'FontFile3'];

  // Drops the embedded program streams from one font's descriptor.
  const stripFontProgram = (specificFontRef: any): void => {
    const specificFont = pdfDoc.context.lookup(specificFontRef) as any;
    const descriptorKey = PDFName.of('FontDescriptor');
    if (!specificFont.has(descriptorKey)) return;

    const descriptor = pdfDoc.context.lookup(
      specificFont.get(descriptorKey)
    ) as any;
    for (const key of fontFileKeys) {
      if (descriptor.has(PDFName.of(key))) {
        descriptor.delete(PDFName.of(key));
      }
    }
  };

  // Walks one Font resource dictionary, isolating failures per font.
  const stripFontDictionary = (fontRef: any): void => {
    try {
      const fontDict = pdfDoc.context.lookup(fontRef) as any;
      for (const fontKey of fontDict.keys()) {
        try {
          stripFontProgram(fontDict.get(fontKey));
        } catch (e: any) {
          console.warn(`Could not process font ${fontKey}:`, e.message);
        }
      }
    } catch (e: any) {
      console.warn('Could not access font dictionary:', e.message);
    }
  };

  pdfDoc.getPages().forEach((page, pageIndex) => {
    try {
      const resourcesRef = page.node.get(PDFName.of('Resources'));
      if (!resourcesRef) return;

      try {
        const resourcesDict = pdfDoc.context.lookup(resourcesRef) as any;
        if (resourcesDict.has(PDFName.of('Font'))) {
          stripFontDictionary(resourcesDict.get(PDFName.of('Font')));
        }
      } catch (e: any) {
        console.warn('Could not access Resources for fonts:', e.message);
      }
    } catch (e: any) {
      console.warn(
        `Could not remove fonts from page ${pageIndex + 1}:`,
        e.message
      );
    }
  });

  const docInternals = pdfDoc as any;
  if (docInternals.fonts && docInternals.fonts.length > 0) {
    docInternals.fonts = [];
  }
}
|
||||
|
||||
/**
 * Loads a PDF, applies the clean-up passes enabled in `options`, and saves it.
 *
 * Pass order: flatten forms, metadata, annotations, JavaScript, embedded files,
 * layers, links, structure tree, MarkInfo, fonts. If flattening throws, the
 * catalog's `AcroForm` entry is dropped instead. Metadata and annotation
 * removal run unguarded, so an error there rejects the returned promise. Every
 * later pass is best-effort: its error is logged with `console.warn` and the
 * remaining passes still run.
 *
 * @param pdfBytes - The source PDF.
 * @param options - Which passes to run.
 * @returns The mutated document and its saved bytes.
 */
export async function sanitizePdf(
  pdfBytes: Uint8Array,
  options: SanitizeOptions
): Promise<{ pdfDoc: PDFDocument; bytes: Uint8Array }> {
  const pdfDoc = await PDFDocument.load(pdfBytes);

  if (options.flattenForms) {
    try {
      flattenFormsInDoc(pdfDoc);
    } catch (e: any) {
      console.warn(`Could not flatten forms: ${e.message}`);
      // Fallback: if the form cannot be flattened, remove it outright.
      try {
        const catalogDict = (pdfDoc.catalog as any).dict;
        const acroFormKey = PDFName.of('AcroForm');
        if (catalogDict.has(acroFormKey)) {
          catalogDict.delete(acroFormKey);
        }
      } catch (removeError: any) {
        console.warn('Could not remove AcroForm:', removeError.message);
      }
    }
  }

  if (options.removeMetadata) {
    removeMetadataFromDoc(pdfDoc);
  }

  if (options.removeAnnotations) {
    removeAnnotationsFromDoc(pdfDoc);
  }

  // Best-effort passes, executed in this order: [enabled, pass, warning prefix].
  const guardedPasses: Array<[boolean, (doc: PDFDocument) => void, string]> = [
    [
      options.removeJavascript,
      removeJavascriptFromDoc,
      'Could not remove JavaScript',
    ],
    [
      options.removeEmbeddedFiles,
      removeEmbeddedFilesFromDoc,
      'Could not remove embedded files',
    ],
    [options.removeLayers, removeLayersFromDoc, 'Could not remove layers'],
    [options.removeLinks, removeLinksFromDoc, 'Could not remove links'],
    [
      options.removeStructureTree,
      removeStructureTreeFromDoc,
      'Could not remove structure tree',
    ],
    [options.removeMarkInfo, removeMarkInfoFromDoc, 'Could not remove MarkInfo'],
    [options.removeFonts, removeFontsFromDoc, 'Could not remove fonts'],
  ];

  for (const [enabled, runPass, warningPrefix] of guardedPasses) {
    if (!enabled) continue;
    try {
      runPass(pdfDoc);
    } catch (e: any) {
      console.warn(`${warningPrefix}: ${e.message}`);
    }
  }

  const savedBytes = await pdfDoc.save();
  return { pdfDoc, bytes: new Uint8Array(savedBytes) };
}
|
||||
Reference in New Issue
Block a user