Files
bentopdf/src/js/compare/engine/ocr-page.ts
alam00000 1d68691331 refactor: update PDF comparison types and enhance UI for better usability
- Refactored CompareState to import from a centralized type definition.
- Enhanced the compare-pdfs.html layout with improved styles for overlay and side-by-side modes.
- Added new CSS styles for various UI components including panels, buttons, and highlights.
- Implemented a new sidebar for displaying change summaries and filters.
- Created unit tests for text comparison logic, including diffing text runs and page pairing.
- Added tests for text normalization functions to ensure proper handling of punctuation and character normalization.
2026-03-08 23:55:33 +05:30

77 lines
2.0 KiB
TypeScript

import Tesseract from 'tesseract.js';
import type { ComparePageModel, CompareTextItem } from '../types.ts';
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
/**
 * Corner coordinates of a recognized word's bounding box.
 * Within this module (x0, y0) is used as the rectangle origin and
 * (x1, y1) as the opposite corner when deriving width and height.
 * NOTE(review): presumably expressed in pixels of the recognized canvas — confirm.
 */
type OcrBoundingBox = {
  x0: number;
  y0: number;
  x1: number;
  y1: number;
};

/**
 * Minimal shape of one OCR word as consumed by this module:
 * the recognized text plus the box that locates it.
 */
type OcrWord = {
  text: string;
  bbox: OcrBoundingBox;
};
/**
 * Runs Tesseract OCR on a canvas and converts the recognized words into a
 * page model whose `source` is `'ocr'`.
 *
 * Each word whose normalized text is non-empty becomes one text item carrying
 * a single word token; the items are then sorted and merged into lines by the
 * helpers imported from `extract-page-model`.
 *
 * @param canvas - Canvas to recognize; its `width`/`height` become the page size.
 * @param language - Language argument forwarded unchanged to `Tesseract.recognize`.
 * @param onProgress - Optional callback fed the status and progress of every
 *   Tesseract logger message (a falsy progress is reported as 0).
 * @returns The page model built from the OCR output. `pageNumber` is always 0
 *   here — presumably the caller assigns the real page number (verify at call site).
 */
export async function recognizePageCanvas(
  canvas: HTMLCanvasElement,
  language: string,
  onProgress?: (status: string, progress: number) => void
): Promise<ComparePageModel> {
  const { data } = await Tesseract.recognize(canvas, language, {
    logger(message) {
      onProgress?.(message.status, message.progress || 0);
    },
  });

  // The result data is read through a local structural view exposing only `words`.
  const recognizedWords =
    (data as unknown as { words?: OcrWord[] }).words || [];

  const wordItems: CompareTextItem[] = [];
  recognizedWords.forEach((word, index) => {
    const normalizedText = normalizeCompareText(word.text || '');
    // Words that normalize to nothing contribute no item.
    if (!normalizedText) {
      return;
    }

    const { x0, y0, x1, y1 } = word.bbox;
    // Width and height are clamped to at least 1 so no rect collapses to zero size.
    const width = Math.max(x1 - x0, 1);
    const height = Math.max(y1 - y0, 1);
    // A fresh object per call: the item and its token must not share one rect instance.
    const buildRect = () => ({ x: x0, y: y0, width, height });

    wordItems.push({
      // The index is the word's position in the raw OCR list, including skipped words.
      id: `ocr-${index}-${normalizedText}`,
      text: word.text,
      normalizedText,
      rect: buildRect(),
      wordTokens: [
        {
          word: normalizedText,
          compareWord: normalizedText.toLowerCase(),
          rect: buildRect(),
        },
      ],
    });
  });

  const lineItems = mergeIntoLines(sortCompareTextItems(wordItems));

  return {
    pageNumber: 0,
    width: canvas.width,
    height: canvas.height,
    textItems: lineItems,
    plainText: joinCompareTextItems(lineItems),
    hasText: lineItems.length > 0,
    source: 'ocr',
  };
}