fix(ocr): improve text layer alignment with width-based font sizing

- Create new hocr-transform.ts utility for parsing hOCR output
- Add line-aware text processing with baseline and rotation support
- Implement width-based font size calculation to match word bounding boxes
- Fix issue where text selection did not cover full characters
- Add proper type definitions for OcrLine, OcrPage, WordTransform
- Support RTL languages and CJK word break handling
This commit is contained in:
abdullahalam123
2026-01-10 13:09:52 +05:30
parent 1f7238d0b5
commit c5799954dc
3 changed files with 887 additions and 461 deletions

View File

@@ -1,10 +1,46 @@
/**
 * A single word produced by OCR.
 *
 * Each member is declared exactly once; repeating a member name inside one
 * interface body is a duplicate-identifier error.
 */
export interface OcrWord {
  /** Text content of the word. */
  text: string;
  /**
   * Bounding box of the word, given as two corner points.
   * Structurally identical to `BBox` (x0/y0 and x1/y1).
   */
  bbox: { x0: number; y0: number; x1: number; y1: number };
  /** Recognition confidence. NOTE(review): value range is not visible here — confirm with the producer. */
  confidence: number;
}
/**
 * State held for an OCR run.
 *
 * Each member is declared exactly once; repeating a member name inside one
 * interface body is a duplicate-identifier error.
 */
export interface OcrState {
  /** The input file, or `null` when none is set. */
  file: File | null;
  /**
   * Raw bytes of the searchable PDF, or `null` when not available.
   * NOTE(review): presumably the OCR output document — confirm against the code that sets it.
   */
  searchablePdfBytes: Uint8Array | null;
}
/**
 * Axis-aligned bounding box expressed in the hOCR coordinate system,
 * whose origin is at the top-left.
 */
export interface BBox {
  /** Left edge. */
  x0: number;
  /** Top edge (hOCR coordinates, origin at top-left). */
  y0: number;
  /** Right edge. */
  x1: number;
  /** Bottom edge. */
  y1: number;
}
/**
 * Text baseline described by a slope and an intercept.
 * NOTE(review): units and the reference point of the intercept are not
 * visible here — confirm against the hOCR parser that fills this in.
 */
export interface Baseline {
  /** Slope of the baseline. */
  slope: number;
  /** Intercept of the baseline. */
  intercept: number;
}
/**
 * One line of OCR output: its geometry, its words, and layout hints.
 */
export interface OcrLine {
  /** Bounding box of the line. */
  bbox: BBox;
  /** Baseline of the line. */
  baseline: Baseline;
  /** Text angle of the line. NOTE(review): unit and sign convention are not visible here — confirm. */
  textangle: number;
  /** Words belonging to this line. */
  words: OcrWord[];
  /** Writing direction: left-to-right or right-to-left. */
  direction: 'ltr' | 'rtl';
  /**
   * Whether word breaks should be injected for this line.
   * NOTE(review): presumably tied to CJK word-break handling — confirm with the consumer.
   */
  injectWordBreaks: boolean;
}
/**
 * One page of OCR output: page dimensions, resolution, and its text lines.
 */
export interface OcrPage {
  /** Page width. NOTE(review): unit (presumably pixels) is not visible here — confirm. */
  width: number;
  /** Page height, in the same unit as `width`. */
  height: number;
  /** Resolution of the page. NOTE(review): presumably dots per inch of the source image — confirm. */
  dpi: number;
  /** Text lines found on the page. */
  lines: OcrLine[];
}
/**
 * Placement parameters for a single word: position, font size,
 * horizontal scale, and rotation.
 * NOTE(review): units and coordinate space are not visible here — confirm
 * against the code that computes and consumes these values.
 */
export interface WordTransform {
  /** Horizontal position. */
  x: number;
  /** Vertical position. */
  y: number;
  /** Font size used for the word. */
  fontSize: number;
  /** Horizontal scale applied to the word. */
  horizontalScale: number;
  /** Rotation applied to the word. */
  rotation: number;
}