refactor: copy original pages in OCR instead of embedding rasterized PNGs
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
|||||||
calculateSpaceTransform,
|
calculateSpaceTransform,
|
||||||
} from './hocr-transform.js';
|
} from './hocr-transform.js';
|
||||||
import { getPDFDocument } from './helpers.js';
|
import { getPDFDocument } from './helpers.js';
|
||||||
|
import { loadPdfDocument } from './load-pdf-document.js';
|
||||||
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
|
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
|
||||||
|
|
||||||
export interface OcrOptions {
|
export interface OcrOptions {
|
||||||
@@ -161,6 +162,7 @@ export async function performOcr(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const sourcePdfDoc = await loadPdfDocument(pdfBytes);
|
||||||
const pdf = await getPDFDocument({ data: pdfBytes }).promise;
|
const pdf = await getPDFDocument({ data: pdfBytes }).promise;
|
||||||
const newPdfDoc = await PDFDocument.create();
|
const newPdfDoc = await PDFDocument.create();
|
||||||
|
|
||||||
@@ -255,44 +257,16 @@ export async function performOcr(
|
|||||||
);
|
);
|
||||||
const data = result.data;
|
const data = result.data;
|
||||||
|
|
||||||
const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);
|
|
||||||
|
|
||||||
const pngImageBytes = await new Promise<Uint8Array>(function (
|
|
||||||
resolve,
|
|
||||||
reject
|
|
||||||
) {
|
|
||||||
canvas.toBlob(function (blob) {
|
|
||||||
if (!blob) {
|
|
||||||
reject(new Error('Failed to create image blob'));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const reader = new FileReader();
|
|
||||||
reader.onload = function () {
|
|
||||||
resolve(new Uint8Array(reader.result as ArrayBuffer));
|
|
||||||
};
|
|
||||||
reader.onerror = function () {
|
|
||||||
reject(new Error('Failed to read image data'));
|
|
||||||
};
|
|
||||||
reader.readAsArrayBuffer(blob);
|
|
||||||
}, 'image/png');
|
|
||||||
});
|
|
||||||
|
|
||||||
// Release canvas memory
|
|
||||||
canvas.width = 0;
|
canvas.width = 0;
|
||||||
canvas.height = 0;
|
canvas.height = 0;
|
||||||
|
|
||||||
const pngImage = await newPdfDoc.embedPng(pngImageBytes);
|
const [copiedPage] = await newPdfDoc.copyPages(sourcePdfDoc, [i - 1]);
|
||||||
newPage.drawImage(pngImage, {
|
newPdfDoc.addPage(copiedPage);
|
||||||
x: 0,
|
|
||||||
y: 0,
|
|
||||||
width: viewport.width,
|
|
||||||
height: viewport.height,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (data.hocr) {
|
if (data.hocr) {
|
||||||
const ocrPage = parseHocrDocument(data.hocr);
|
const ocrPage = parseHocrDocument(data.hocr);
|
||||||
drawOcrTextLayer(
|
drawOcrTextLayer(
|
||||||
newPage,
|
copiedPage,
|
||||||
ocrPage,
|
ocrPage,
|
||||||
viewport.height,
|
viewport.height,
|
||||||
primaryFont,
|
primaryFont,
|
||||||
|
|||||||
@@ -3,11 +3,13 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
|
|||||||
const {
|
const {
|
||||||
createConfiguredTesseractWorker,
|
createConfiguredTesseractWorker,
|
||||||
getPDFDocument,
|
getPDFDocument,
|
||||||
|
loadPdfDocument,
|
||||||
getFontForLanguage,
|
getFontForLanguage,
|
||||||
parseHocrDocument,
|
parseHocrDocument,
|
||||||
} = vi.hoisted(() => ({
|
} = vi.hoisted(() => ({
|
||||||
createConfiguredTesseractWorker: vi.fn(),
|
createConfiguredTesseractWorker: vi.fn(),
|
||||||
getPDFDocument: vi.fn(),
|
getPDFDocument: vi.fn(),
|
||||||
|
loadPdfDocument: vi.fn(),
|
||||||
getFontForLanguage: vi.fn(),
|
getFontForLanguage: vi.fn(),
|
||||||
parseHocrDocument: vi.fn(),
|
parseHocrDocument: vi.fn(),
|
||||||
}));
|
}));
|
||||||
@@ -25,13 +27,17 @@ const mockPdfPage = {
|
|||||||
|
|
||||||
const mockPdfOutputPage = {
|
const mockPdfOutputPage = {
|
||||||
drawImage: vi.fn(),
|
drawImage: vi.fn(),
|
||||||
|
drawPage: vi.fn(),
|
||||||
drawText: vi.fn(),
|
drawText: vi.fn(),
|
||||||
|
setRotation: vi.fn(),
|
||||||
};
|
};
|
||||||
|
|
||||||
const mockPdfDoc = {
|
const mockPdfDoc = {
|
||||||
registerFontkit: vi.fn(),
|
registerFontkit: vi.fn(),
|
||||||
embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
|
embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
|
||||||
addPage: vi.fn(() => mockPdfOutputPage),
|
addPage: vi.fn(() => mockPdfOutputPage),
|
||||||
|
copyPages: vi.fn(async () => [mockPdfOutputPage]),
|
||||||
|
embedPage: vi.fn(async () => ({ width: 200, height: 100 })),
|
||||||
embedPng: vi.fn(async () => ({ id: 'png' })),
|
embedPng: vi.fn(async () => ({ id: 'png' })),
|
||||||
save: vi.fn(async () => new Uint8Array([1, 2, 3])),
|
save: vi.fn(async () => new Uint8Array([1, 2, 3])),
|
||||||
};
|
};
|
||||||
@@ -44,6 +50,10 @@ vi.mock('../js/utils/helpers.js', () => ({
|
|||||||
getPDFDocument,
|
getPDFDocument,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock('../js/utils/load-pdf-document.js', () => ({
|
||||||
|
loadPdfDocument,
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock('../js/utils/font-loader.js', () => ({
|
vi.mock('../js/utils/font-loader.js', () => ({
|
||||||
getFontForLanguage,
|
getFontForLanguage,
|
||||||
}));
|
}));
|
||||||
@@ -77,6 +87,7 @@ describe('performOcr', () => {
|
|||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
createConfiguredTesseractWorker.mockReset();
|
createConfiguredTesseractWorker.mockReset();
|
||||||
getPDFDocument.mockReset();
|
getPDFDocument.mockReset();
|
||||||
|
loadPdfDocument.mockReset();
|
||||||
getFontForLanguage.mockReset();
|
getFontForLanguage.mockReset();
|
||||||
parseHocrDocument.mockReset();
|
parseHocrDocument.mockReset();
|
||||||
|
|
||||||
@@ -90,10 +101,17 @@ describe('performOcr', () => {
|
|||||||
mockPdfDoc.registerFontkit.mockClear();
|
mockPdfDoc.registerFontkit.mockClear();
|
||||||
mockPdfDoc.embedFont.mockClear();
|
mockPdfDoc.embedFont.mockClear();
|
||||||
mockPdfDoc.addPage.mockClear();
|
mockPdfDoc.addPage.mockClear();
|
||||||
|
mockPdfDoc.embedPage.mockClear();
|
||||||
mockPdfDoc.embedPng.mockClear();
|
mockPdfDoc.embedPng.mockClear();
|
||||||
mockPdfDoc.save.mockClear();
|
mockPdfDoc.save.mockClear();
|
||||||
|
|
||||||
createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
|
createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
|
||||||
|
loadPdfDocument.mockResolvedValue({
|
||||||
|
getPage: vi.fn(() => ({
|
||||||
|
width: 200,
|
||||||
|
height: 100,
|
||||||
|
})),
|
||||||
|
});
|
||||||
getPDFDocument.mockReturnValue({
|
getPDFDocument.mockReturnValue({
|
||||||
promise: Promise.resolve({
|
promise: Promise.resolve({
|
||||||
numPages: 1,
|
numPages: 1,
|
||||||
|
|||||||
Reference in New Issue
Block a user