refactor: copy original pages in OCR instead of embedding rasterized PNGs

This commit is contained in:
alam00000
2026-04-04 12:51:55 +05:30
parent ffddca2a7d
commit 12677f1647
2 changed files with 23 additions and 31 deletions

View File

@@ -9,6 +9,7 @@ import {
calculateSpaceTransform,
} from './hocr-transform.js';
import { getPDFDocument } from './helpers.js';
import { loadPdfDocument } from './load-pdf-document.js';
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
export interface OcrOptions {
@@ -161,6 +162,7 @@ export async function performOcr(
});
}
const sourcePdfDoc = await loadPdfDocument(pdfBytes);
const pdf = await getPDFDocument({ data: pdfBytes }).promise;
const newPdfDoc = await PDFDocument.create();
@@ -255,44 +257,16 @@ export async function performOcr(
);
const data = result.data;
const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);
const pngImageBytes = await new Promise<Uint8Array>(function (
resolve,
reject
) {
canvas.toBlob(function (blob) {
if (!blob) {
reject(new Error('Failed to create image blob'));
return;
}
const reader = new FileReader();
reader.onload = function () {
resolve(new Uint8Array(reader.result as ArrayBuffer));
};
reader.onerror = function () {
reject(new Error('Failed to read image data'));
};
reader.readAsArrayBuffer(blob);
}, 'image/png');
});
// Release canvas memory
canvas.width = 0;
canvas.height = 0;
const pngImage = await newPdfDoc.embedPng(pngImageBytes);
newPage.drawImage(pngImage, {
x: 0,
y: 0,
width: viewport.width,
height: viewport.height,
});
const [copiedPage] = await newPdfDoc.copyPages(sourcePdfDoc, [i - 1]);
newPdfDoc.addPage(copiedPage);
if (data.hocr) {
const ocrPage = parseHocrDocument(data.hocr);
drawOcrTextLayer(
newPage,
copiedPage,
ocrPage,
viewport.height,
primaryFont,

View File

@@ -3,11 +3,13 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
// Mock functions shared between the vi.mock factories in this file and the
// tests themselves. They are created inside vi.hoisted so that they already
// exist when the vi.mock factories that reference them are evaluated.
const {
createConfiguredTesseractWorker,
getPDFDocument,
loadPdfDocument,
getFontForLanguage,
parseHocrDocument,
} = vi.hoisted(() => ({
createConfiguredTesseractWorker: vi.fn(),
getPDFDocument: vi.fn(),
loadPdfDocument: vi.fn(),
getFontForLanguage: vi.fn(),
parseHocrDocument: vi.fn(),
}));
@@ -25,13 +27,17 @@ const mockPdfPage = {
// Stand-in for a page of the output PDF. Every method is a bare vi.fn() spy
// with no implementation, so tests can only inspect how it was called.
// This same object is what mockPdfDoc.addPage and mockPdfDoc.copyPages return.
const mockPdfOutputPage = {
drawImage: vi.fn(),
drawPage: vi.fn(),
drawText: vi.fn(),
setRotation: vi.fn(),
};
// Stand-in for the output PDF document used by the code under test.
// NOTE(review): presumably substituted for pdf-lib's PDFDocument via a mock
// that is not visible in this hunk — confirm against the full test file.
const mockPdfDoc = {
registerFontkit: vi.fn(),
// The embedded font reports a fixed width of 12 for any measured text.
embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
// Both addPage and copyPages hand back the shared mockPdfOutputPage, so every
// page operation lands on the same set of spies.
addPage: vi.fn(() => mockPdfOutputPage),
copyPages: vi.fn(async () => [mockPdfOutputPage]),
embedPage: vi.fn(async () => ({ width: 200, height: 100 })),
embedPng: vi.fn(async () => ({ id: 'png' })),
// save resolves to a small fixed byte array.
save: vi.fn(async () => new Uint8Array([1, 2, 3])),
};
@@ -44,6 +50,10 @@ vi.mock('../js/utils/helpers.js', () => ({
getPDFDocument,
}));
// Replace the load-pdf-document module with the hoisted loadPdfDocument mock
// so each test controls what the source document loader resolves to.
vi.mock('../js/utils/load-pdf-document.js', () => ({
loadPdfDocument,
}));
vi.mock('../js/utils/font-loader.js', () => ({
getFontForLanguage,
}));
@@ -77,6 +87,7 @@ describe('performOcr', () => {
beforeEach(() => {
createConfiguredTesseractWorker.mockReset();
getPDFDocument.mockReset();
loadPdfDocument.mockReset();
getFontForLanguage.mockReset();
parseHocrDocument.mockReset();
@@ -90,10 +101,17 @@ describe('performOcr', () => {
mockPdfDoc.registerFontkit.mockClear();
mockPdfDoc.embedFont.mockClear();
mockPdfDoc.addPage.mockClear();
mockPdfDoc.embedPage.mockClear();
mockPdfDoc.embedPng.mockClear();
mockPdfDoc.save.mockClear();
createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
loadPdfDocument.mockResolvedValue({
getPage: vi.fn(() => ({
width: 200,
height: 100,
})),
});
getPDFDocument.mockReturnValue({
promise: Promise.resolve({
numPages: 1,