Add visual workflow builder, fix critical bugs, and add Arabic i18n support
This commit is contained in:
@@ -1,28 +1,9 @@
|
||||
import { tesseractLanguages } from '../config/tesseract-languages.js';
|
||||
import { showAlert } from '../ui.js';
|
||||
import { downloadFile, formatBytes, getPDFDocument } from '../utils/helpers.js';
|
||||
import Tesseract from 'tesseract.js';
|
||||
import {
|
||||
PDFDocument as PDFLibDocument,
|
||||
StandardFonts,
|
||||
rgb,
|
||||
PDFFont,
|
||||
} from 'pdf-lib';
|
||||
import fontkit from '@pdf-lib/fontkit';
|
||||
import { downloadFile, formatBytes } from '../utils/helpers.js';
|
||||
import { icons, createIcons } from 'lucide';
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import { getFontForLanguage } from '../utils/font-loader.js';
|
||||
import { OcrState, OcrLine, OcrPage } from '@/types';
|
||||
import {
|
||||
parseHocrDocument,
|
||||
calculateWordTransform,
|
||||
calculateSpaceTransform,
|
||||
} from '../utils/hocr-transform.js';
|
||||
|
||||
pdfjsLib.GlobalWorkerOptions.workerSrc = new URL(
|
||||
'pdfjs-dist/build/pdf.worker.min.mjs',
|
||||
import.meta.url
|
||||
).toString();
|
||||
import { OcrState } from '@/types';
|
||||
import { performOcr } from '../utils/ocr.js';
|
||||
|
||||
const pageState: OcrState = {
|
||||
file: null,
|
||||
@@ -40,108 +21,6 @@ const whitelistPresets: Record<string, string> = {
|
||||
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,()-_/@#:',
|
||||
};
|
||||
|
||||
function drawOcrTextLayer(
|
||||
page: ReturnType<typeof PDFLibDocument.prototype.addPage>,
|
||||
ocrPage: OcrPage,
|
||||
pageHeight: number,
|
||||
primaryFont: PDFFont,
|
||||
latinFont: PDFFont
|
||||
): void {
|
||||
ocrPage.lines.forEach(function (line: OcrLine) {
|
||||
const words = line.words;
|
||||
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
const word = words[i];
|
||||
const text = word.text.replace(
|
||||
/[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g,
|
||||
''
|
||||
);
|
||||
|
||||
if (!text.trim()) continue;
|
||||
|
||||
const hasNonLatin = /[^\u0000-\u007F]/.test(text);
|
||||
const font = hasNonLatin ? primaryFont : latinFont;
|
||||
|
||||
if (!font) {
|
||||
console.warn('Font not available for text: "' + text + '"');
|
||||
continue;
|
||||
}
|
||||
|
||||
const transform = calculateWordTransform(
|
||||
word,
|
||||
line,
|
||||
pageHeight,
|
||||
(txt: string, size: number) => {
|
||||
try {
|
||||
return font.widthOfTextAtSize(txt, size);
|
||||
} catch {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
if (transform.fontSize <= 0) continue;
|
||||
|
||||
try {
|
||||
page.drawText(text, {
|
||||
x: transform.x,
|
||||
y: transform.y,
|
||||
font,
|
||||
size: transform.fontSize,
|
||||
color: rgb(0, 0, 0),
|
||||
opacity: 0,
|
||||
});
|
||||
} catch (error) {
|
||||
console.warn(`Could not draw text "${text}":`, error);
|
||||
}
|
||||
|
||||
if (line.injectWordBreaks && i < words.length - 1) {
|
||||
const nextWord = words[i + 1];
|
||||
const spaceTransform = calculateSpaceTransform(
|
||||
word,
|
||||
nextWord,
|
||||
line,
|
||||
pageHeight,
|
||||
(size: number) => {
|
||||
try {
|
||||
return font.widthOfTextAtSize(' ', size);
|
||||
} catch {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
if (spaceTransform && spaceTransform.horizontalScale > 0.1) {
|
||||
try {
|
||||
page.drawText(' ', {
|
||||
x: spaceTransform.x,
|
||||
y: spaceTransform.y,
|
||||
font,
|
||||
size: spaceTransform.fontSize,
|
||||
color: rgb(0, 0, 0),
|
||||
opacity: 0,
|
||||
});
|
||||
} catch {
|
||||
console.warn(`Could not draw space between words`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function binarizeCanvas(ctx: CanvasRenderingContext2D) {
|
||||
const imageData = ctx.getImageData(0, 0, ctx.canvas.width, ctx.canvas.height);
|
||||
const data = imageData.data;
|
||||
for (let i = 0; i < data.length; i += 4) {
|
||||
const brightness =
|
||||
0.299 * data[i] + 0.587 * data[i + 1] + 0.114 * data[i + 2];
|
||||
const color = brightness > 128 ? 255 : 0;
|
||||
data[i] = data[i + 1] = data[i + 2] = color;
|
||||
}
|
||||
ctx.putImageData(imageData, 0, 0);
|
||||
}
|
||||
|
||||
function updateProgress(status: string, progress: number) {
|
||||
const progressBar = document.getElementById('progress-bar');
|
||||
const progressStatus = document.getElementById('progress-status');
|
||||
@@ -239,161 +118,17 @@ async function runOCR() {
|
||||
if (ocrProgress) ocrProgress.classList.remove('hidden');
|
||||
|
||||
try {
|
||||
const worker = await Tesseract.createWorker(langString, 1, {
|
||||
logger: function (m: { status: string; progress: number }) {
|
||||
updateProgress(m.status, m.progress || 0);
|
||||
},
|
||||
});
|
||||
|
||||
await worker.setParameters({
|
||||
tessjs_create_hocr: '1',
|
||||
tessedit_pageseg_mode: Tesseract.PSM.AUTO,
|
||||
});
|
||||
|
||||
if (whitelist) {
|
||||
await worker.setParameters({
|
||||
tessedit_char_whitelist: whitelist,
|
||||
});
|
||||
}
|
||||
|
||||
const arrayBuffer = await pageState.file.arrayBuffer();
|
||||
const pdf = await getPDFDocument({ data: arrayBuffer }).promise;
|
||||
const newPdfDoc = await PDFLibDocument.create();
|
||||
|
||||
newPdfDoc.registerFontkit(fontkit);
|
||||
|
||||
updateProgress('Loading fonts...', 0);
|
||||
|
||||
const cjkLangs = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
|
||||
const indicLangs = [
|
||||
'hin',
|
||||
'ben',
|
||||
'guj',
|
||||
'kan',
|
||||
'mal',
|
||||
'ori',
|
||||
'pan',
|
||||
'tam',
|
||||
'tel',
|
||||
'sin',
|
||||
];
|
||||
const priorityLangs = [...cjkLangs, ...indicLangs, 'ara', 'rus', 'ukr'];
|
||||
|
||||
const primaryLang =
|
||||
selectedLangs.find(function (l) {
|
||||
return priorityLangs.includes(l);
|
||||
}) ||
|
||||
selectedLangs[0] ||
|
||||
'eng';
|
||||
|
||||
const hasCJK = selectedLangs.some(function (l) {
|
||||
return cjkLangs.includes(l);
|
||||
const result = await performOcr(new Uint8Array(arrayBuffer), {
|
||||
language: langString,
|
||||
resolution: scale,
|
||||
binarize,
|
||||
whitelist,
|
||||
onProgress: updateProgress,
|
||||
});
|
||||
const hasIndic = selectedLangs.some(function (l) {
|
||||
return indicLangs.includes(l);
|
||||
});
|
||||
const hasLatin =
|
||||
selectedLangs.some(function (l) {
|
||||
return !priorityLangs.includes(l);
|
||||
}) || selectedLangs.includes('eng');
|
||||
const isIndicPlusLatin = hasIndic && hasLatin && !hasCJK;
|
||||
|
||||
let primaryFont;
|
||||
let latinFont;
|
||||
|
||||
try {
|
||||
if (isIndicPlusLatin) {
|
||||
const [scriptFontBytes, latinFontBytes] = await Promise.all([
|
||||
getFontForLanguage(primaryLang),
|
||||
getFontForLanguage('eng'),
|
||||
]);
|
||||
primaryFont = await newPdfDoc.embedFont(scriptFontBytes, {
|
||||
subset: false,
|
||||
});
|
||||
latinFont = await newPdfDoc.embedFont(latinFontBytes, {
|
||||
subset: false,
|
||||
});
|
||||
} else {
|
||||
const fontBytes = await getFontForLanguage(primaryLang);
|
||||
primaryFont = await newPdfDoc.embedFont(fontBytes, { subset: false });
|
||||
latinFont = primaryFont;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('Font loading failed, falling back to Helvetica', e);
|
||||
primaryFont = await newPdfDoc.embedFont(StandardFonts.Helvetica);
|
||||
latinFont = primaryFont;
|
||||
showAlert(
|
||||
'Font Warning',
|
||||
'Could not load the specific font for this language. Some characters may not appear correctly.'
|
||||
);
|
||||
}
|
||||
|
||||
let fullText = '';
|
||||
|
||||
for (let i = 1; i <= pdf.numPages; i++) {
|
||||
updateProgress(
|
||||
`Processing page ${i} of ${pdf.numPages}`,
|
||||
(i - 1) / pdf.numPages
|
||||
);
|
||||
|
||||
const page = await pdf.getPage(i);
|
||||
const viewport = page.getViewport({ scale });
|
||||
|
||||
const canvas = document.createElement('canvas');
|
||||
canvas.width = viewport.width;
|
||||
canvas.height = viewport.height;
|
||||
const context = canvas.getContext('2d')!;
|
||||
|
||||
await page.render({ canvasContext: context, viewport, canvas }).promise;
|
||||
|
||||
if (binarize) {
|
||||
binarizeCanvas(context);
|
||||
}
|
||||
|
||||
const result = await worker.recognize(
|
||||
canvas,
|
||||
{},
|
||||
{ text: true, hocr: true }
|
||||
);
|
||||
const data = result.data;
|
||||
|
||||
const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);
|
||||
|
||||
const pngImageBytes = await new Promise<Uint8Array>(function (resolve) {
|
||||
canvas.toBlob(function (blob) {
|
||||
const reader = new FileReader();
|
||||
reader.onload = function () {
|
||||
resolve(new Uint8Array(reader.result as ArrayBuffer));
|
||||
};
|
||||
reader.readAsArrayBuffer(blob!);
|
||||
}, 'image/png');
|
||||
});
|
||||
|
||||
const pngImage = await newPdfDoc.embedPng(pngImageBytes);
|
||||
newPage.drawImage(pngImage, {
|
||||
x: 0,
|
||||
y: 0,
|
||||
width: viewport.width,
|
||||
height: viewport.height,
|
||||
});
|
||||
|
||||
if (data.hocr) {
|
||||
const ocrPage = parseHocrDocument(data.hocr);
|
||||
drawOcrTextLayer(
|
||||
newPage,
|
||||
ocrPage,
|
||||
viewport.height,
|
||||
primaryFont,
|
||||
latinFont
|
||||
);
|
||||
}
|
||||
|
||||
fullText += data.text + '\n\n';
|
||||
}
|
||||
|
||||
await worker.terminate();
|
||||
|
||||
pageState.searchablePdfBytes = await newPdfDoc.save();
|
||||
pageState.searchablePdfBytes = result.pdfBytes;
|
||||
|
||||
const ocrResults = document.getElementById('ocr-results');
|
||||
if (ocrProgress) ocrProgress.classList.add('hidden');
|
||||
@@ -404,7 +139,7 @@ async function runOCR() {
|
||||
const textOutput = document.getElementById(
|
||||
'ocr-text-output'
|
||||
) as HTMLTextAreaElement;
|
||||
if (textOutput) textOutput.value = fullText.trim();
|
||||
if (textOutput) textOutput.value = result.fullText.trim();
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
showAlert(
|
||||
|
||||
Reference in New Issue
Block a user