feat: integrate Tesseract.js with improved language availability and font handling

- Refactored OCR page recognition to utilize a configured Tesseract worker.
- Added functions to manage font URLs and asset filenames based on language.
- Implemented language availability checks and error handling for unsupported languages.
- Enhanced PDF workflow to display available OCR languages and handle user selections.
- Introduced utility functions for resolving Tesseract asset configurations.
- Added tests for OCR functionality, font loading, and Tesseract runtime behavior.
- Updated global types to include environment variables for Tesseract and font configurations.
This commit is contained in:
alam00000
2026-03-14 15:50:30 +05:30
parent 58c78b09d2
commit 77da6d7a7d
23 changed files with 1906 additions and 564 deletions

View File

@@ -1,7 +1,6 @@
-import Tesseract from 'tesseract.js';
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
import fontkit from '@pdf-lib/fontkit';
import * as pdfjsLib from 'pdfjs-dist';
import { getFontForLanguage } from './font-loader.js';
import { OcrPage, OcrLine } from '@/types';
import {
@@ -10,6 +9,7 @@ import {
calculateSpaceTransform,
} from './hocr-transform.js';
import { getPDFDocument } from './helpers.js';
+import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
export interface OcrOptions {
language: string;
@@ -134,11 +134,13 @@ export async function performOcr(
const { language, resolution, binarize, whitelist, onProgress } = options;
const progress = onProgress || (() => {});
-const worker = await Tesseract.createWorker(language, 1, {
-logger: function (m: { status: string; progress: number }) {
+const worker = await createConfiguredTesseractWorker(
+language,
+1,
+function (m: { status: string; progress: number }) {
 progress(m.status, m.progress || 0);
-},
-});
+}
+);
await worker.setParameters({
tessjs_create_hocr: '1',