- Create new hocr-transform.ts utility for parsing hOCR output - Add line-aware text processing with baseline and rotation support - Implement width-based font size calculation to match word bounding boxes - Fix text selection not covering full characters issue - Add proper type definitions for OcrLine, OcrPage, WordTransform - Support RTL languages and CJK word break handling
267 lines
6.9 KiB
TypeScript
267 lines
6.9 KiB
TypeScript
import {
|
|
BBox,
|
|
OcrLine,
|
|
OcrPage,
|
|
OcrWord,
|
|
WordTransform,
|
|
Baseline,
|
|
} from '@/types';
|
|
|
|
const BBOX_PATTERN = /bbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/;
|
|
const BASELINE_PATTERN = /baseline\s+([-+]?\d*\.?\d*)\s+([-+]?\d+)/;
|
|
const TEXTANGLE_PATTERN = /textangle\s+([-+]?\d*\.?\d*)/;
|
|
|
|
export function parseBBox(title: string): BBox | null {
|
|
const match = title.match(BBOX_PATTERN);
|
|
if (!match) return null;
|
|
|
|
return {
|
|
x0: parseInt(match[1], 10),
|
|
y0: parseInt(match[2], 10),
|
|
x1: parseInt(match[3], 10),
|
|
y1: parseInt(match[4], 10),
|
|
};
|
|
}
|
|
|
|
export function parseBaseline(title: string): Baseline {
|
|
const match = title.match(BASELINE_PATTERN);
|
|
if (!match) {
|
|
return { slope: 0, intercept: 0 };
|
|
}
|
|
|
|
return {
|
|
slope: parseFloat(match[1]) || 0,
|
|
intercept: parseInt(match[2], 10) || 0,
|
|
};
|
|
}
|
|
|
|
export function parseTextangle(title: string): number {
|
|
const match = title.match(TEXTANGLE_PATTERN);
|
|
if (!match) return 0;
|
|
return parseFloat(match[1]) || 0;
|
|
}
|
|
|
|
export function getTextDirection(element: Element): 'ltr' | 'rtl' {
|
|
const dir = element.getAttribute('dir');
|
|
return dir === 'rtl' ? 'rtl' : 'ltr';
|
|
}
|
|
|
|
export function shouldInjectWordBreaks(element: Element): boolean {
|
|
const lang = element.getAttribute('lang') || '';
|
|
const cjkLangs = ['chi_sim', 'chi_tra', 'jpn', 'kor', 'zh', 'ja', 'ko'];
|
|
return !cjkLangs.includes(lang);
|
|
}
|
|
|
|
export function normalizeText(text: string): string {
|
|
return text.normalize('NFKC');
|
|
}
|
|
|
|
export function parseHocrDocument(hocrText: string): OcrPage {
|
|
const parser = new DOMParser();
|
|
const doc = parser.parseFromString(hocrText, 'text/html');
|
|
|
|
let width = 0;
|
|
let height = 0;
|
|
const pageDiv = doc.querySelector('.ocr_page');
|
|
if (pageDiv) {
|
|
const title = pageDiv.getAttribute('title') || '';
|
|
const bbox = parseBBox(title);
|
|
if (bbox) {
|
|
width = bbox.x1 - bbox.x0;
|
|
height = bbox.y1 - bbox.y0;
|
|
}
|
|
}
|
|
|
|
const lines: OcrLine[] = [];
|
|
|
|
const lineClasses = [
|
|
'ocr_line',
|
|
'ocr_textfloat',
|
|
'ocr_header',
|
|
'ocr_caption',
|
|
];
|
|
const lineSelectors = lineClasses.map((c) => `.${c}`).join(', ');
|
|
const lineElements = doc.querySelectorAll(lineSelectors);
|
|
|
|
if (lineElements.length > 0) {
|
|
lineElements.forEach((lineEl) => {
|
|
const line = parseHocrLine(lineEl);
|
|
if (line && line.words.length > 0) {
|
|
lines.push(line);
|
|
}
|
|
});
|
|
} else {
|
|
const wordElements = doc.querySelectorAll('.ocrx_word');
|
|
if (wordElements.length > 0) {
|
|
const words = parseWordsFromElements(wordElements);
|
|
if (words.length > 0) {
|
|
const allBBox = calculateBoundingBox(words.map((w) => w.bbox));
|
|
lines.push({
|
|
bbox: allBBox,
|
|
baseline: { slope: 0, intercept: 0 },
|
|
textangle: 0,
|
|
words,
|
|
direction: 'ltr',
|
|
injectWordBreaks: true,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return { width, height, dpi: 72, lines };
|
|
}
|
|
|
|
function parseHocrLine(lineElement: Element): OcrLine | null {
|
|
const title = lineElement.getAttribute('title') || '';
|
|
const bbox = parseBBox(title);
|
|
|
|
if (!bbox) return null;
|
|
|
|
const baseline = parseBaseline(title);
|
|
const textangle = parseTextangle(title);
|
|
|
|
const parent = lineElement.closest('.ocr_par') || lineElement.parentElement;
|
|
const direction = parent ? getTextDirection(parent) : 'ltr';
|
|
const injectWordBreaks = parent ? shouldInjectWordBreaks(parent) : true;
|
|
const wordElements = lineElement.querySelectorAll('.ocrx_word');
|
|
const words = parseWordsFromElements(wordElements);
|
|
|
|
return {
|
|
bbox,
|
|
baseline,
|
|
textangle,
|
|
words,
|
|
direction,
|
|
injectWordBreaks,
|
|
};
|
|
}
|
|
|
|
function parseWordsFromElements(wordElements: NodeListOf<Element>): OcrWord[] {
|
|
const words: OcrWord[] = [];
|
|
|
|
wordElements.forEach((wordEl) => {
|
|
const title = wordEl.getAttribute('title') || '';
|
|
const text = normalizeText((wordEl.textContent || '').trim());
|
|
|
|
if (!text) return;
|
|
|
|
const bbox = parseBBox(title);
|
|
if (!bbox) return;
|
|
|
|
const confMatch = title.match(/x_wconf\s+(\d+)/);
|
|
const confidence = confMatch ? parseInt(confMatch[1], 10) : 0;
|
|
|
|
words.push({
|
|
text,
|
|
bbox,
|
|
confidence,
|
|
});
|
|
});
|
|
|
|
return words;
|
|
}
|
|
|
|
function calculateBoundingBox(bboxes: BBox[]): BBox {
|
|
if (bboxes.length === 0) {
|
|
return { x0: 0, y0: 0, x1: 0, y1: 0 };
|
|
}
|
|
|
|
return {
|
|
x0: Math.min(...bboxes.map((b) => b.x0)),
|
|
y0: Math.min(...bboxes.map((b) => b.y0)),
|
|
x1: Math.max(...bboxes.map((b) => b.x1)),
|
|
y1: Math.max(...bboxes.map((b) => b.y1)),
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Calculate the transformation parameters for drawing a word
|
|
*
|
|
* pdf-lib doesn't support horizontal text scaling (Tz operator),
|
|
* we calculate a font size that makes the text width exactly match the word bbox width.
|
|
*
|
|
* @param word - The word to position
|
|
* @param line - The line containing this word
|
|
* @param pageHeight - Height of the page in pixels (for coordinate flip)
|
|
* @param fontWidthFn - Function to calculate text width at a given font size
|
|
* @returns Transform parameters for pdf-lib
|
|
*/
|
|
export function calculateWordTransform(
|
|
word: OcrWord,
|
|
line: OcrLine,
|
|
pageHeight: number,
|
|
fontWidthFn: (text: string, fontSize: number) => number
|
|
): WordTransform {
|
|
const wordBBox = word.bbox;
|
|
const wordWidth = wordBBox.x1 - wordBBox.x0;
|
|
const wordHeight = wordBBox.y1 - wordBBox.y0;
|
|
|
|
let fontSize = wordHeight;
|
|
const maxIterations = 10;
|
|
|
|
for (let i = 0; i < maxIterations; i++) {
|
|
const currentWidth = fontWidthFn(word.text, fontSize);
|
|
if (currentWidth <= 0) break;
|
|
|
|
const ratio = wordWidth / currentWidth;
|
|
const newFontSize = fontSize * ratio;
|
|
|
|
if (Math.abs(newFontSize - fontSize) / fontSize < 0.01) {
|
|
fontSize = newFontSize;
|
|
break;
|
|
}
|
|
fontSize = newFontSize;
|
|
}
|
|
|
|
fontSize = Math.max(1, Math.min(fontSize, wordHeight * 2));
|
|
|
|
const fontWidth = fontWidthFn(word.text, fontSize);
|
|
const horizontalScale = fontWidth > 0 ? wordWidth / fontWidth : 1;
|
|
|
|
const slopeAngle = Math.atan(line.baseline.slope) * (180 / Math.PI);
|
|
const rotation = -line.textangle + slopeAngle;
|
|
|
|
const x = wordBBox.x0;
|
|
|
|
// pdf-lib draws text from baseline, so we position at word bottom
|
|
const y = pageHeight - wordBBox.y1;
|
|
|
|
return {
|
|
x,
|
|
y,
|
|
fontSize,
|
|
horizontalScale,
|
|
rotation,
|
|
};
|
|
}
|
|
|
|
export function calculateSpaceTransform(
|
|
prevWord: OcrWord,
|
|
nextWord: OcrWord,
|
|
line: OcrLine,
|
|
pageHeight: number,
|
|
spaceWidthFn: (fontSize: number) => number
|
|
): { x: number; y: number; horizontalScale: number; fontSize: number } | null {
|
|
const lineHeight = line.bbox.y1 - line.bbox.y0;
|
|
const fontSize = Math.max(lineHeight + line.baseline.intercept, 1);
|
|
|
|
const gapStart = prevWord.bbox.x1;
|
|
const gapEnd = nextWord.bbox.x0;
|
|
const gapWidth = gapEnd - gapStart;
|
|
|
|
if (gapWidth <= 0) return null;
|
|
|
|
const spaceWidth = spaceWidthFn(fontSize);
|
|
if (spaceWidth <= 0) return null;
|
|
|
|
const horizontalScale = gapWidth / spaceWidth;
|
|
const baselineY = pageHeight - line.bbox.y1 - line.baseline.intercept;
|
|
|
|
return {
|
|
x: gapStart,
|
|
y: baselineY,
|
|
horizontalScale,
|
|
fontSize,
|
|
};
|
|
}
|