refactor: copy original pages in OCR instead of embedding rasterized PNGs
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
||||
calculateSpaceTransform,
|
||||
} from './hocr-transform.js';
|
||||
import { getPDFDocument } from './helpers.js';
|
||||
import { loadPdfDocument } from './load-pdf-document.js';
|
||||
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
|
||||
|
||||
export interface OcrOptions {
|
||||
@@ -161,6 +162,7 @@ export async function performOcr(
|
||||
});
|
||||
}
|
||||
|
||||
const sourcePdfDoc = await loadPdfDocument(pdfBytes);
|
||||
const pdf = await getPDFDocument({ data: pdfBytes }).promise;
|
||||
const newPdfDoc = await PDFDocument.create();
|
||||
|
||||
@@ -255,44 +257,16 @@ export async function performOcr(
|
||||
);
|
||||
const data = result.data;
|
||||
|
||||
const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);
|
||||
|
||||
const pngImageBytes = await new Promise<Uint8Array>(function (
|
||||
resolve,
|
||||
reject
|
||||
) {
|
||||
canvas.toBlob(function (blob) {
|
||||
if (!blob) {
|
||||
reject(new Error('Failed to create image blob'));
|
||||
return;
|
||||
}
|
||||
const reader = new FileReader();
|
||||
reader.onload = function () {
|
||||
resolve(new Uint8Array(reader.result as ArrayBuffer));
|
||||
};
|
||||
reader.onerror = function () {
|
||||
reject(new Error('Failed to read image data'));
|
||||
};
|
||||
reader.readAsArrayBuffer(blob);
|
||||
}, 'image/png');
|
||||
});
|
||||
|
||||
// Release canvas memory
|
||||
canvas.width = 0;
|
||||
canvas.height = 0;
|
||||
|
||||
const pngImage = await newPdfDoc.embedPng(pngImageBytes);
|
||||
newPage.drawImage(pngImage, {
|
||||
x: 0,
|
||||
y: 0,
|
||||
width: viewport.width,
|
||||
height: viewport.height,
|
||||
});
|
||||
const [copiedPage] = await newPdfDoc.copyPages(sourcePdfDoc, [i - 1]);
|
||||
newPdfDoc.addPage(copiedPage);
|
||||
|
||||
if (data.hocr) {
|
||||
const ocrPage = parseHocrDocument(data.hocr);
|
||||
drawOcrTextLayer(
|
||||
newPage,
|
||||
copiedPage,
|
||||
ocrPage,
|
||||
viewport.height,
|
||||
primaryFont,
|
||||
|
||||
@@ -3,11 +3,13 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
const {
|
||||
createConfiguredTesseractWorker,
|
||||
getPDFDocument,
|
||||
loadPdfDocument,
|
||||
getFontForLanguage,
|
||||
parseHocrDocument,
|
||||
} = vi.hoisted(() => ({
|
||||
createConfiguredTesseractWorker: vi.fn(),
|
||||
getPDFDocument: vi.fn(),
|
||||
loadPdfDocument: vi.fn(),
|
||||
getFontForLanguage: vi.fn(),
|
||||
parseHocrDocument: vi.fn(),
|
||||
}));
|
||||
@@ -25,13 +27,17 @@ const mockPdfPage = {
|
||||
|
||||
const mockPdfOutputPage = {
|
||||
drawImage: vi.fn(),
|
||||
drawPage: vi.fn(),
|
||||
drawText: vi.fn(),
|
||||
setRotation: vi.fn(),
|
||||
};
|
||||
|
||||
const mockPdfDoc = {
|
||||
registerFontkit: vi.fn(),
|
||||
embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
|
||||
addPage: vi.fn(() => mockPdfOutputPage),
|
||||
copyPages: vi.fn(async () => [mockPdfOutputPage]),
|
||||
embedPage: vi.fn(async () => ({ width: 200, height: 100 })),
|
||||
embedPng: vi.fn(async () => ({ id: 'png' })),
|
||||
save: vi.fn(async () => new Uint8Array([1, 2, 3])),
|
||||
};
|
||||
@@ -44,6 +50,10 @@ vi.mock('../js/utils/helpers.js', () => ({
|
||||
getPDFDocument,
|
||||
}));
|
||||
|
||||
vi.mock('../js/utils/load-pdf-document.js', () => ({
|
||||
loadPdfDocument,
|
||||
}));
|
||||
|
||||
vi.mock('../js/utils/font-loader.js', () => ({
|
||||
getFontForLanguage,
|
||||
}));
|
||||
@@ -77,6 +87,7 @@ describe('performOcr', () => {
|
||||
beforeEach(() => {
|
||||
createConfiguredTesseractWorker.mockReset();
|
||||
getPDFDocument.mockReset();
|
||||
loadPdfDocument.mockReset();
|
||||
getFontForLanguage.mockReset();
|
||||
parseHocrDocument.mockReset();
|
||||
|
||||
@@ -90,10 +101,17 @@ describe('performOcr', () => {
|
||||
mockPdfDoc.registerFontkit.mockClear();
|
||||
mockPdfDoc.embedFont.mockClear();
|
||||
mockPdfDoc.addPage.mockClear();
|
||||
mockPdfDoc.embedPage.mockClear();
|
||||
mockPdfDoc.embedPng.mockClear();
|
||||
mockPdfDoc.save.mockClear();
|
||||
|
||||
createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
|
||||
loadPdfDocument.mockResolvedValue({
|
||||
getPage: vi.fn(() => ({
|
||||
width: 200,
|
||||
height: 100,
|
||||
})),
|
||||
});
|
||||
getPDFDocument.mockReturnValue({
|
||||
promise: Promise.resolve({
|
||||
numPages: 1,
|
||||
|
||||
Reference in New Issue
Block a user