Add visual workflow builder, fix critical bugs, and add Arabic i18n support

This commit is contained in:
alam00000
2026-02-08 17:05:40 +05:30
parent 36ebb3b429
commit 5d8b83e105
118 changed files with 14151 additions and 2357 deletions

View File

@@ -1,28 +1,9 @@
import { tesseractLanguages } from '../config/tesseract-languages.js';
import { showAlert } from '../ui.js';
import { downloadFile, formatBytes, getPDFDocument } from '../utils/helpers.js';
import Tesseract from 'tesseract.js';
import {
PDFDocument as PDFLibDocument,
StandardFonts,
rgb,
PDFFont,
} from 'pdf-lib';
import fontkit from '@pdf-lib/fontkit';
import { downloadFile, formatBytes } from '../utils/helpers.js';
import { icons, createIcons } from 'lucide';
import * as pdfjsLib from 'pdfjs-dist';
import { getFontForLanguage } from '../utils/font-loader.js';
import { OcrState, OcrLine, OcrPage } from '@/types';
import {
parseHocrDocument,
calculateWordTransform,
calculateSpaceTransform,
} from '../utils/hocr-transform.js';
pdfjsLib.GlobalWorkerOptions.workerSrc = new URL(
'pdfjs-dist/build/pdf.worker.min.mjs',
import.meta.url
).toString();
import { OcrState } from '@/types';
import { performOcr } from '../utils/ocr.js';
const pageState: OcrState = {
file: null,
@@ -40,108 +21,6 @@ const whitelistPresets: Record<string, string> = {
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,()-_/@#:',
};
function drawOcrTextLayer(
page: ReturnType<typeof PDFLibDocument.prototype.addPage>,
ocrPage: OcrPage,
pageHeight: number,
primaryFont: PDFFont,
latinFont: PDFFont
): void {
ocrPage.lines.forEach(function (line: OcrLine) {
const words = line.words;
for (let i = 0; i < words.length; i++) {
const word = words[i];
const text = word.text.replace(
/[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g,
''
);
if (!text.trim()) continue;
const hasNonLatin = /[^\u0000-\u007F]/.test(text);
const font = hasNonLatin ? primaryFont : latinFont;
if (!font) {
console.warn('Font not available for text: "' + text + '"');
continue;
}
const transform = calculateWordTransform(
word,
line,
pageHeight,
(txt: string, size: number) => {
try {
return font.widthOfTextAtSize(txt, size);
} catch {
return 0;
}
}
);
if (transform.fontSize <= 0) continue;
try {
page.drawText(text, {
x: transform.x,
y: transform.y,
font,
size: transform.fontSize,
color: rgb(0, 0, 0),
opacity: 0,
});
} catch (error) {
console.warn(`Could not draw text "${text}":`, error);
}
if (line.injectWordBreaks && i < words.length - 1) {
const nextWord = words[i + 1];
const spaceTransform = calculateSpaceTransform(
word,
nextWord,
line,
pageHeight,
(size: number) => {
try {
return font.widthOfTextAtSize(' ', size);
} catch {
return 0;
}
}
);
if (spaceTransform && spaceTransform.horizontalScale > 0.1) {
try {
page.drawText(' ', {
x: spaceTransform.x,
y: spaceTransform.y,
font,
size: spaceTransform.fontSize,
color: rgb(0, 0, 0),
opacity: 0,
});
} catch {
console.warn(`Could not draw space between words`);
}
}
}
}
});
}
function binarizeCanvas(ctx: CanvasRenderingContext2D) {
const imageData = ctx.getImageData(0, 0, ctx.canvas.width, ctx.canvas.height);
const data = imageData.data;
for (let i = 0; i < data.length; i += 4) {
const brightness =
0.299 * data[i] + 0.587 * data[i + 1] + 0.114 * data[i + 2];
const color = brightness > 128 ? 255 : 0;
data[i] = data[i + 1] = data[i + 2] = color;
}
ctx.putImageData(imageData, 0, 0);
}
function updateProgress(status: string, progress: number) {
const progressBar = document.getElementById('progress-bar');
const progressStatus = document.getElementById('progress-status');
@@ -239,161 +118,17 @@ async function runOCR() {
if (ocrProgress) ocrProgress.classList.remove('hidden');
try {
const worker = await Tesseract.createWorker(langString, 1, {
logger: function (m: { status: string; progress: number }) {
updateProgress(m.status, m.progress || 0);
},
});
await worker.setParameters({
tessjs_create_hocr: '1',
tessedit_pageseg_mode: Tesseract.PSM.AUTO,
});
if (whitelist) {
await worker.setParameters({
tessedit_char_whitelist: whitelist,
});
}
const arrayBuffer = await pageState.file.arrayBuffer();
const pdf = await getPDFDocument({ data: arrayBuffer }).promise;
const newPdfDoc = await PDFLibDocument.create();
newPdfDoc.registerFontkit(fontkit);
updateProgress('Loading fonts...', 0);
const cjkLangs = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
const indicLangs = [
'hin',
'ben',
'guj',
'kan',
'mal',
'ori',
'pan',
'tam',
'tel',
'sin',
];
const priorityLangs = [...cjkLangs, ...indicLangs, 'ara', 'rus', 'ukr'];
const primaryLang =
selectedLangs.find(function (l) {
return priorityLangs.includes(l);
}) ||
selectedLangs[0] ||
'eng';
const hasCJK = selectedLangs.some(function (l) {
return cjkLangs.includes(l);
const result = await performOcr(new Uint8Array(arrayBuffer), {
language: langString,
resolution: scale,
binarize,
whitelist,
onProgress: updateProgress,
});
const hasIndic = selectedLangs.some(function (l) {
return indicLangs.includes(l);
});
const hasLatin =
selectedLangs.some(function (l) {
return !priorityLangs.includes(l);
}) || selectedLangs.includes('eng');
const isIndicPlusLatin = hasIndic && hasLatin && !hasCJK;
let primaryFont;
let latinFont;
try {
if (isIndicPlusLatin) {
const [scriptFontBytes, latinFontBytes] = await Promise.all([
getFontForLanguage(primaryLang),
getFontForLanguage('eng'),
]);
primaryFont = await newPdfDoc.embedFont(scriptFontBytes, {
subset: false,
});
latinFont = await newPdfDoc.embedFont(latinFontBytes, {
subset: false,
});
} else {
const fontBytes = await getFontForLanguage(primaryLang);
primaryFont = await newPdfDoc.embedFont(fontBytes, { subset: false });
latinFont = primaryFont;
}
} catch (e) {
console.error('Font loading failed, falling back to Helvetica', e);
primaryFont = await newPdfDoc.embedFont(StandardFonts.Helvetica);
latinFont = primaryFont;
showAlert(
'Font Warning',
'Could not load the specific font for this language. Some characters may not appear correctly.'
);
}
let fullText = '';
for (let i = 1; i <= pdf.numPages; i++) {
updateProgress(
`Processing page ${i} of ${pdf.numPages}`,
(i - 1) / pdf.numPages
);
const page = await pdf.getPage(i);
const viewport = page.getViewport({ scale });
const canvas = document.createElement('canvas');
canvas.width = viewport.width;
canvas.height = viewport.height;
const context = canvas.getContext('2d')!;
await page.render({ canvasContext: context, viewport, canvas }).promise;
if (binarize) {
binarizeCanvas(context);
}
const result = await worker.recognize(
canvas,
{},
{ text: true, hocr: true }
);
const data = result.data;
const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);
const pngImageBytes = await new Promise<Uint8Array>(function (resolve) {
canvas.toBlob(function (blob) {
const reader = new FileReader();
reader.onload = function () {
resolve(new Uint8Array(reader.result as ArrayBuffer));
};
reader.readAsArrayBuffer(blob!);
}, 'image/png');
});
const pngImage = await newPdfDoc.embedPng(pngImageBytes);
newPage.drawImage(pngImage, {
x: 0,
y: 0,
width: viewport.width,
height: viewport.height,
});
if (data.hocr) {
const ocrPage = parseHocrDocument(data.hocr);
drawOcrTextLayer(
newPage,
ocrPage,
viewport.height,
primaryFont,
latinFont
);
}
fullText += data.text + '\n\n';
}
await worker.terminate();
pageState.searchablePdfBytes = await newPdfDoc.save();
pageState.searchablePdfBytes = result.pdfBytes;
const ocrResults = document.getElementById('ocr-results');
if (ocrProgress) ocrProgress.classList.add('hidden');
@@ -404,7 +139,7 @@ async function runOCR() {
const textOutput = document.getElementById(
'ocr-text-output'
) as HTMLTextAreaElement;
if (textOutput) textOutput.value = fullText.trim();
if (textOutput) textOutput.value = result.fullText.trim();
} catch (e) {
console.error(e);
showAlert(