diff --git a/src/js/utils/ocr.ts b/src/js/utils/ocr.ts index f0f4bd3..40b071c 100644 --- a/src/js/utils/ocr.ts +++ b/src/js/utils/ocr.ts @@ -9,6 +9,7 @@ import { calculateSpaceTransform, } from './hocr-transform.js'; import { getPDFDocument } from './helpers.js'; +import { loadPdfDocument } from './load-pdf-document.js'; import { createConfiguredTesseractWorker } from './tesseract-runtime.js'; export interface OcrOptions { @@ -161,6 +162,7 @@ export async function performOcr( }); } + const sourcePdfDoc = await loadPdfDocument(pdfBytes); const pdf = await getPDFDocument({ data: pdfBytes }).promise; const newPdfDoc = await PDFDocument.create(); @@ -255,44 +257,16 @@ export async function performOcr( ); const data = result.data; - const newPage = newPdfDoc.addPage([viewport.width, viewport.height]); - - const pngImageBytes = await new Promise(function ( - resolve, - reject - ) { - canvas.toBlob(function (blob) { - if (!blob) { - reject(new Error('Failed to create image blob')); - return; - } - const reader = new FileReader(); - reader.onload = function () { - resolve(new Uint8Array(reader.result as ArrayBuffer)); - }; - reader.onerror = function () { - reject(new Error('Failed to read image data')); - }; - reader.readAsArrayBuffer(blob); - }, 'image/png'); - }); - - // Release canvas memory canvas.width = 0; canvas.height = 0; - const pngImage = await newPdfDoc.embedPng(pngImageBytes); - newPage.drawImage(pngImage, { - x: 0, - y: 0, - width: viewport.width, - height: viewport.height, - }); + const [copiedPage] = await newPdfDoc.copyPages(sourcePdfDoc, [i - 1]); + newPdfDoc.addPage(copiedPage); if (data.hocr) { const ocrPage = parseHocrDocument(data.hocr); drawOcrTextLayer( - newPage, + copiedPage, ocrPage, viewport.height, primaryFont, diff --git a/src/tests/ocr.test.ts b/src/tests/ocr.test.ts index 97e175b..11b98b5 100644 --- a/src/tests/ocr.test.ts +++ b/src/tests/ocr.test.ts @@ -3,11 +3,13 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; const { createConfiguredTesseractWorker, getPDFDocument, + loadPdfDocument, getFontForLanguage, parseHocrDocument, } = vi.hoisted(() => ({ createConfiguredTesseractWorker: vi.fn(), getPDFDocument: vi.fn(), + loadPdfDocument: vi.fn(), getFontForLanguage: vi.fn(), parseHocrDocument: vi.fn(), })); @@ -25,13 +27,17 @@ const mockPdfPage = { const mockPdfOutputPage = { drawImage: vi.fn(), + drawPage: vi.fn(), drawText: vi.fn(), + setRotation: vi.fn(), }; const mockPdfDoc = { registerFontkit: vi.fn(), embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })), addPage: vi.fn(() => mockPdfOutputPage), + copyPages: vi.fn(async () => [mockPdfOutputPage]), + embedPage: vi.fn(async () => ({ width: 200, height: 100 })), embedPng: vi.fn(async () => ({ id: 'png' })), save: vi.fn(async () => new Uint8Array([1, 2, 3])), }; @@ -44,6 +50,10 @@ vi.mock('../js/utils/helpers.js', () => ({ getPDFDocument, })); +vi.mock('../js/utils/load-pdf-document.js', () => ({ + loadPdfDocument, +})); + vi.mock('../js/utils/font-loader.js', () => ({ getFontForLanguage, })); @@ -77,6 +87,7 @@ describe('performOcr', () => { beforeEach(() => { createConfiguredTesseractWorker.mockReset(); getPDFDocument.mockReset(); + loadPdfDocument.mockReset(); getFontForLanguage.mockReset(); parseHocrDocument.mockReset(); @@ -90,10 +101,17 @@ describe('performOcr', () => { mockPdfDoc.registerFontkit.mockClear(); mockPdfDoc.embedFont.mockClear(); mockPdfDoc.addPage.mockClear(); + mockPdfDoc.embedPage.mockClear(); mockPdfDoc.embedPng.mockClear(); mockPdfDoc.save.mockClear(); createConfiguredTesseractWorker.mockResolvedValue(mockWorker); + loadPdfDocument.mockResolvedValue({ + getPage: vi.fn(() => ({ + width: 200, + height: 100, + })), + }); getPDFDocument.mockReturnValue({ promise: Promise.resolve({ numPages: 1,