feat: integrate Tesseract.js with improved language availability and font handling

- Refactored OCR page recognition to utilize a configured Tesseract worker.
- Added functions to manage font URLs and asset filenames based on language.
- Implemented language availability checks and error handling for unsupported languages.
- Enhanced PDF workflow to display available OCR languages and handle user selections.
- Introduced utility functions for resolving Tesseract asset configurations.
- Added tests for OCR functionality, font loading, and Tesseract runtime behavior.
- Updated global types to include environment variables for Tesseract and font configurations.
This commit is contained in:
alam00000
2026-03-14 15:50:30 +05:30
parent 58c78b09d2
commit 77da6d7a7d
23 changed files with 1906 additions and 564 deletions

View File

@@ -4,6 +4,11 @@ import { downloadFile, formatBytes } from '../utils/helpers.js';
import { icons, createIcons } from 'lucide';
import { OcrState } from '@/types';
import { performOcr } from '../utils/ocr.js';
import {
getAvailableTesseractLanguageEntries,
resolveConfiguredTesseractAvailableLanguages,
UnsupportedOcrLanguageError,
} from '../utils/tesseract-language-availability.js';
const pageState: OcrState = {
file: null,
@@ -80,6 +85,30 @@ function resetState() {
if (processBtn) processBtn.disabled = true;
}
/**
 * Refreshes the `#lang-availability-note` element so it reflects which OCR
 * languages this deployment offers.
 *
 * - Does nothing when the element is not present in the document.
 * - When `resolveConfiguredTesseractAvailableLanguages()` returns a falsy
 *   value (presumably: no language restriction is configured — confirm
 *   against that helper), the note is hidden and its text is cleared.
 * - Otherwise the note is made visible and shows either a warning that no
 *   valid OCR languages are exposed, or the comma-separated display names
 *   returned by `getAvailableTesseractLanguageEntries()`.
 */
function updateLanguageAvailabilityNotice() {
  const noteEl = document.getElementById('lang-availability-note');
  if (!noteEl) return;

  // Without a configured language list there is nothing to announce.
  if (!resolveConfiguredTesseractAvailableLanguages()) {
    noteEl.classList.add('hidden');
    noteEl.textContent = '';
    return;
  }

  // Each entry is a [code, name] pair; only the display name is shown.
  const languageNames = getAvailableTesseractLanguageEntries().map(
    (entry) => entry[1]
  );

  const message =
    languageNames.length > 0
      ? `This deployment bundles OCR for: ${languageNames.join(', ')}.`
      : 'This deployment does not expose any valid OCR languages. Rebuild it with VITE_TESSERACT_AVAILABLE_LANGUAGES set to valid Tesseract codes.';

  noteEl.classList.remove('hidden');
  noteEl.textContent = message;
}
async function runOCR() {
const selectedLangs = Array.from(
document.querySelectorAll('.lang-checkbox:checked')
@@ -142,10 +171,14 @@ async function runOCR() {
if (textOutput) textOutput.value = result.fullText.trim();
} catch (e) {
console.error(e);
showAlert(
'OCR Error',
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
);
if (e instanceof UnsupportedOcrLanguageError) {
showAlert('OCR Language Not Available', e.message);
} else {
showAlert(
'OCR Error',
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
);
}
if (toolOptions) toolOptions.classList.remove('hidden');
if (ocrProgress) ocrProgress.classList.add('hidden');
}
@@ -213,10 +246,21 @@ function populateLanguageList() {
langList.innerHTML = '';
Object.entries(tesseractLanguages).forEach(function ([code, name]) {
const availableEntries = getAvailableTesseractLanguageEntries();
if (availableEntries.length === 0) {
const emptyState = document.createElement('p');
emptyState.className = 'text-sm text-yellow-300 p-2';
emptyState.textContent =
'No OCR languages are available in this deployment.';
langList.appendChild(emptyState);
return;
}
availableEntries.forEach(function ([code, name]) {
const label = document.createElement('label');
label.className =
'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer';
label.dataset.search = `${name} ${code}`.toLowerCase();
const checkbox = document.createElement('input');
checkbox.type = 'checkbox';
@@ -253,6 +297,7 @@ document.addEventListener('DOMContentLoaded', function () {
const downloadPdfBtn = document.getElementById('download-searchable-pdf');
populateLanguageList();
updateLanguageAvailabilityNotice();
if (backBtn) {
backBtn.addEventListener('click', function () {
@@ -304,9 +349,9 @@ document.addEventListener('DOMContentLoaded', function () {
langSearch.addEventListener('input', function () {
const searchTerm = langSearch.value.toLowerCase();
langList.querySelectorAll('label').forEach(function (label) {
(label as HTMLElement).style.display = label.textContent
?.toLowerCase()
.includes(searchTerm)
(label as HTMLElement).style.display = (
label as HTMLElement
).dataset.search?.includes(searchTerm)
? ''
: 'none';
});

View File

@@ -1,7 +1,7 @@
import { showAlert } from '../ui.js';
import { tesseractLanguages } from '../config/tesseract-languages.js';
import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor';
import { executeWorkflow } from '../workflow/engine';
import { getAvailableTesseractLanguageEntries } from '../utils/tesseract-language-availability.js';
import {
nodeRegistry,
getNodesByCategory,
@@ -1194,7 +1194,7 @@ function showNodeSettings(node: BaseWorkflowNode) {
{ label: 'High (288 DPI)', value: '3.0' },
{ label: 'Ultra (384 DPI)', value: '4.0' },
],
language: Object.entries(tesseractLanguages).map(([code, name]) => ({
language: getAvailableTesseractLanguageEntries().map(([code, name]) => ({
label: name,
value: code,
})),