Add visual workflow builder, fix critical bugs, and add Arabic i18n support
This commit is contained in:
304
src/js/utils/ocr.ts
Normal file
304
src/js/utils/ocr.ts
Normal file
@@ -0,0 +1,304 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
|
||||
import fontkit from '@pdf-lib/fontkit';
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import { getFontForLanguage } from './font-loader.js';
|
||||
import { OcrPage, OcrLine } from '@/types';
|
||||
import {
|
||||
parseHocrDocument,
|
||||
calculateWordTransform,
|
||||
calculateSpaceTransform,
|
||||
} from './hocr-transform.js';
|
||||
import { getPDFDocument } from './helpers.js';
|
||||
|
||||
/**
 * Settings accepted by `performOcr`.
 */
export interface OcrOptions {
  /**
   * Tesseract language code(s). Several codes may be combined with `+`
   * (the string is split on `+` when choosing fonts).
   */
  language: string;

  /** Scale factor given to the pdf.js viewport when each page is rendered. */
  resolution: number;

  /** When true, each rendered page is thresholded to black/white before recognition. */
  binarize: boolean;

  /**
   * Value applied to Tesseract's `tessedit_char_whitelist` parameter.
   * An empty string leaves the parameter untouched.
   */
  whitelist: string;

  /** Optional callback that receives a status message and a progress value. */
  onProgress?: (status: string, progress: number) => void;
}
|
||||
|
||||
/**
 * Value resolved by `performOcr`.
 */
export interface OcrResult {
  /** Serialized bytes of the newly built PDF. */
  pdfBytes: Uint8Array;

  /** The pdf-lib document that `pdfBytes` was saved from. */
  pdfDoc: PDFDocument;

  /** Recognised text of all pages, each page followed by a blank line. */
  fullText: string;
}
|
||||
|
||||
/**
 * Thresholds the entire canvas behind `ctx` to pure black and white, in place.
 *
 * A weighted brightness (0.299 R + 0.587 G + 0.114 B) is computed per pixel;
 * pixels strictly brighter than 128 become white (255), all others black (0).
 * The alpha channel is not modified.
 *
 * @param ctx 2D context whose canvas is read and then overwritten.
 */
function binarizeCanvas(ctx: CanvasRenderingContext2D): void {
  const { width, height } = ctx.canvas;
  const frame = ctx.getImageData(0, 0, width, height);
  const pixels = frame.data;

  // Pixels are stored as consecutive RGBA quadruplets.
  let offset = 0;
  while (offset < pixels.length) {
    const luma =
      0.299 * pixels[offset] +
      0.587 * pixels[offset + 1] +
      0.114 * pixels[offset + 2];
    const level = luma > 128 ? 255 : 0;

    pixels[offset] = level;
    pixels[offset + 1] = level;
    pixels[offset + 2] = level;

    offset += 4;
  }

  ctx.putImageData(frame, 0, 0);
}
|
||||
|
||||
/**
 * Draws the recognised words of one OCR page onto a PDF page as fully
 * transparent text (opacity 0).
 *
 * For each word, control characters, bidi marks and the BOM are removed first;
 * words that end up blank are skipped. Words containing any non-ASCII character
 * use `primaryFont`, all others use `latinFont`. Positions and sizes come from
 * `calculateWordTransform` / `calculateSpaceTransform`. Drawing failures are
 * logged with `console.warn` and do not abort the remaining words.
 *
 * @param page        pdf-lib page that receives the text.
 * @param ocrPage     Parsed OCR page whose lines and words are drawn.
 * @param pageHeight  Page height forwarded to the transform helpers.
 * @param primaryFont Font for words that contain non-ASCII characters.
 * @param latinFont   Font for pure-ASCII words.
 */
function drawOcrTextLayer(
  page: ReturnType<typeof PDFDocument.prototype.addPage>,
  ocrPage: OcrPage,
  pageHeight: number,
  primaryFont: PDFFont,
  latinFont: PDFFont
): void {
  const renderLine = (line: OcrLine): void => {
    const lineWords = line.words;

    for (const [index, word] of lineWords.entries()) {
      const cleaned = word.text.replace(
        /[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g,
        ''
      );
      if (cleaned.trim() === '') continue;

      const font = /[^\u0000-\u007F]/.test(cleaned) ? primaryFont : latinFont;
      if (!font) {
        console.warn('Font not available for text: "' + cleaned + '"');
        continue;
      }

      // Width measurement that reports 0 instead of throwing (e.g. when the
      // font cannot encode a glyph).
      const safeWidth = (txt: string, size: number): number => {
        try {
          return font.widthOfTextAtSize(txt, size);
        } catch {
          return 0;
        }
      };

      const placement = calculateWordTransform(
        word,
        line,
        pageHeight,
        safeWidth
      );
      if (placement.fontSize <= 0) continue;

      try {
        page.drawText(cleaned, {
          x: placement.x,
          y: placement.y,
          font,
          size: placement.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch (error) {
        console.warn(`Could not draw text "${cleaned}":`, error);
      }

      // A separating space is only emitted between two words of a line that
      // requests injected word breaks.
      const isLastWord = index >= lineWords.length - 1;
      if (!line.injectWordBreaks || isLastWord) continue;

      const gap = calculateSpaceTransform(
        word,
        lineWords[index + 1],
        line,
        pageHeight,
        (size: number) => safeWidth(' ', size)
      );
      if (!gap || !(gap.horizontalScale > 0.1)) continue;

      try {
        page.drawText(' ', {
          x: gap.x,
          y: gap.y,
          font,
          size: gap.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch {
        console.warn(`Could not draw space between words`);
      }
    }
  };

  ocrPage.lines.forEach((line) => renderLine(line));
}
|
||||
|
||||
/**
 * Chooses and embeds the fonts used for the invisible OCR text layer.
 *
 * The primary language is the first selected language found in the priority
 * list (CJK, Indic, Arabic, Russian, Ukrainian), else the first selected
 * language, else `eng`. When an Indic language is combined with a Latin one
 * (and no CJK language is selected) a separate Latin font is embedded;
 * otherwise one font serves both roles. If loading or embedding fails,
 * Helvetica is embedded as the fallback for both.
 */
async function loadOcrFonts(
  pdfDoc: PDFDocument,
  language: string
): Promise<{ primaryFont: PDFFont; latinFont: PDFFont }> {
  const selectedLangs = language.split('+');
  const cjkLangs = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
  const indicLangs = [
    'hin',
    'ben',
    'guj',
    'kan',
    'mal',
    'ori',
    'pan',
    'tam',
    'tel',
    'sin',
  ];
  const priorityLangs = [...cjkLangs, ...indicLangs, 'ara', 'rus', 'ukr'];

  const primaryLang =
    selectedLangs.find((l) => priorityLangs.includes(l)) ||
    selectedLangs[0] ||
    'eng';

  const hasCJK = selectedLangs.some((l) => cjkLangs.includes(l));
  const hasIndic = selectedLangs.some((l) => indicLangs.includes(l));
  const hasLatin =
    selectedLangs.some((l) => !priorityLangs.includes(l)) ||
    selectedLangs.includes('eng');
  const isIndicPlusLatin = hasIndic && hasLatin && !hasCJK;

  try {
    if (isIndicPlusLatin) {
      const [scriptFontBytes, latinFontBytes] = await Promise.all([
        getFontForLanguage(primaryLang),
        getFontForLanguage('eng'),
      ]);
      const primaryFont = await pdfDoc.embedFont(scriptFontBytes, {
        subset: false,
      });
      const latinFont = await pdfDoc.embedFont(latinFontBytes, {
        subset: false,
      });
      return { primaryFont, latinFont };
    }

    const fontBytes = await getFontForLanguage(primaryLang);
    const font = await pdfDoc.embedFont(fontBytes, { subset: false });
    return { primaryFont: font, latinFont: font };
  } catch (e) {
    console.error('Font loading failed, falling back to Helvetica', e);
    const fallback = await pdfDoc.embedFont(StandardFonts.Helvetica);
    return { primaryFont: fallback, latinFont: fallback };
  }
}

/**
 * Encodes the canvas as PNG and resolves with the raw bytes.
 * Rejects when the browser cannot produce a blob or the blob cannot be read.
 */
function canvasToPngBytes(canvas: HTMLCanvasElement): Promise<Uint8Array> {
  return new Promise<Uint8Array>((resolve, reject) => {
    canvas.toBlob((blob) => {
      if (!blob) {
        reject(new Error('Failed to create image blob'));
        return;
      }
      const reader = new FileReader();
      reader.onload = () => {
        resolve(new Uint8Array(reader.result as ArrayBuffer));
      };
      reader.onerror = () => {
        reject(new Error('Failed to read image data'));
      };
      reader.readAsArrayBuffer(blob);
    }, 'image/png');
  });
}

/**
 * Runs OCR over every page of a PDF and builds a new, searchable PDF.
 *
 * Each page is rendered to a canvas at `options.resolution`, optionally
 * binarized, recognised with a Tesseract worker, embedded as a PNG in a new
 * pdf-lib document, and overlaid with a transparent text layer derived from
 * the hOCR output.
 *
 * The Tesseract worker is terminated on every exit path once it has been
 * created, including failures while configuring it, loading the input PDF or
 * embedding fonts.
 *
 * @param pdfBytes Bytes of the input PDF.
 * @param options  OCR settings; see `OcrOptions`.
 * @returns The saved PDF bytes, the pdf-lib document and the recognised text.
 * @throws Error when a 2D canvas context cannot be created or a page cannot be
 *   encoded as PNG; errors from Tesseract, pdf.js or pdf-lib propagate as-is.
 */
export async function performOcr(
  pdfBytes: Uint8Array | ArrayBuffer,
  options: OcrOptions
): Promise<OcrResult> {
  const { language, resolution, binarize, whitelist, onProgress } = options;
  const progress = onProgress || (() => {});

  // Created before the worker so that a failure here cannot leak a worker.
  const newPdfDoc = await PDFDocument.create();
  newPdfDoc.registerFontkit(fontkit);

  let fullText = '';

  const worker = await Tesseract.createWorker(language, 1, {
    logger: function (m: { status: string; progress: number }) {
      progress(m.status, m.progress || 0);
    },
  });

  // Everything that happens while the worker is alive sits inside this
  // try/finally so the worker is always terminated.
  try {
    await worker.setParameters({
      tessjs_create_hocr: '1',
      tessedit_pageseg_mode: Tesseract.PSM.AUTO,
    });

    if (whitelist) {
      await worker.setParameters({
        tessedit_char_whitelist: whitelist,
      });
    }

    const pdf = await getPDFDocument({ data: pdfBytes }).promise;

    progress('Loading fonts...', 0);
    const { primaryFont, latinFont } = await loadOcrFonts(newPdfDoc, language);

    for (let i = 1; i <= pdf.numPages; i++) {
      progress(
        `Processing page ${i} of ${pdf.numPages}`,
        (i - 1) / pdf.numPages
      );

      const page = await pdf.getPage(i);
      const viewport = page.getViewport({ scale: resolution });

      const canvas = document.createElement('canvas');
      canvas.width = viewport.width;
      canvas.height = viewport.height;
      const context = canvas.getContext('2d');
      if (!context) throw new Error('Failed to create canvas context');

      await page.render({ canvasContext: context, viewport, canvas }).promise;

      if (binarize) {
        binarizeCanvas(context);
      }

      const result = await worker.recognize(
        canvas,
        {},
        { text: true, hocr: true }
      );
      const data = result.data;

      const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);

      const pngImageBytes = await canvasToPngBytes(canvas);

      // Release canvas memory
      canvas.width = 0;
      canvas.height = 0;

      const pngImage = await newPdfDoc.embedPng(pngImageBytes);
      newPage.drawImage(pngImage, {
        x: 0,
        y: 0,
        width: viewport.width,
        height: viewport.height,
      });

      if (data.hocr) {
        const ocrPage = parseHocrDocument(data.hocr);
        drawOcrTextLayer(
          newPage,
          ocrPage,
          viewport.height,
          primaryFont,
          latinFont
        );
      }

      fullText += data.text + '\n\n';
    }
  } finally {
    await worker.terminate();
  }

  const savedBytes = await newPdfDoc.save();

  return {
    pdfBytes: new Uint8Array(savedBytes),
    pdfDoc: newPdfDoc,
    fullText,
  };
}
|
||||
Reference in New Issue
Block a user