305 lines
7.9 KiB
TypeScript
305 lines
7.9 KiB
TypeScript
import Tesseract from 'tesseract.js';
|
|
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
|
|
import fontkit from '@pdf-lib/fontkit';
|
|
import * as pdfjsLib from 'pdfjs-dist';
|
|
import { getFontForLanguage } from './font-loader.js';
|
|
import { OcrPage, OcrLine } from '@/types';
|
|
import {
|
|
parseHocrDocument,
|
|
calculateWordTransform,
|
|
calculateSpaceTransform,
|
|
} from './hocr-transform.js';
|
|
import { getPDFDocument } from './helpers.js';
|
|
|
|
/**
 * Settings accepted by {@link performOcr}.
 */
export interface OcrOptions {
  /**
   * Tesseract language code(s). Several languages are joined with `+`
   * (the string is split on `+` to choose which font to embed).
   */
  language: string;

  /** Scale factor handed to the PDF page viewport when rasterising a page. */
  resolution: number;

  /** When true, each rendered page is reduced to black/white before recognition. */
  binarize: boolean;

  /**
   * Characters Tesseract is restricted to (`tessedit_char_whitelist`).
   * An empty string leaves the parameter unset.
   */
  whitelist: string;

  /** Optional callback receiving a status message and a progress value. */
  onProgress?: (status: string, progress: number) => void;
}
|
|
|
|
/**
 * Value resolved by {@link performOcr}.
 */
export interface OcrResult {
  /** Serialised bytes of the generated PDF. */
  pdfBytes: Uint8Array;

  /** The pdf-lib document the bytes were saved from. */
  pdfDoc: PDFDocument;

  /** Recognised text of every page, each page followed by a blank line. */
  fullText: string;
}
|
|
|
|
/**
 * Converts the canvas behind `ctx` to pure black and white, in place.
 *
 * Each pixel's brightness is computed as
 * `0.299 * R + 0.587 * G + 0.114 * B`; pixels strictly brighter than
 * `threshold` become white (255), all others black (0). The alpha channel is
 * left untouched.
 *
 * @param ctx - 2D context whose whole canvas is read and written back.
 * @param threshold - Brightness cutoff. Defaults to 128, the value that was
 *   previously hard-coded.
 */
function binarizeCanvas(ctx: CanvasRenderingContext2D, threshold = 128): void {
  const { width, height } = ctx.canvas;

  // getImageData throws IndexSizeError for a zero-sized rectangle; an empty
  // canvas has nothing to binarize, so bail out instead of crashing.
  if (width === 0 || height === 0) return;

  const imageData = ctx.getImageData(0, 0, width, height);
  const pixels = imageData.data;

  // Pixels are stored as consecutive RGBA quadruplets.
  for (let offset = 0; offset < pixels.length; offset += 4) {
    const brightness =
      0.299 * pixels[offset] +
      0.587 * pixels[offset + 1] +
      0.114 * pixels[offset + 2];
    const value = brightness > threshold ? 255 : 0;
    pixels[offset] = value;
    pixels[offset + 1] = value;
    pixels[offset + 2] = value;
  }

  ctx.putImageData(imageData, 0, 0);
}
|
|
|
|
/**
 * Writes the recognised words of one OCR page onto `page` as fully
 * transparent text (opacity 0).
 *
 * For every word: control and bidi-formatting characters are stripped, words
 * that end up blank are skipped, and the font is chosen per word —
 * `primaryFont` if the text contains any character outside U+0000–U+007F,
 * otherwise `latinFont`. Position and size come from `calculateWordTransform`.
 * When the line has `injectWordBreaks` set, a single transparent space is also
 * drawn after each drawn word that has a successor, placed by
 * `calculateSpaceTransform`.
 *
 * Drawing failures are logged with `console.warn` and never thrown.
 *
 * @param page - Target PDF page.
 * @param ocrPage - Parsed OCR result whose `lines` are drawn.
 * @param pageHeight - Page height forwarded to the transform helpers.
 * @param primaryFont - Font used for words containing non-ASCII characters.
 * @param latinFont - Font used for pure-ASCII words.
 */
function drawOcrTextLayer(
  page: ReturnType<typeof PDFDocument.prototype.addPage>,
  ocrPage: OcrPage,
  pageHeight: number,
  primaryFont: PDFFont,
  latinFont: PDFFont
): void {
  // C0/C1 control characters, LRM/RLM, bidi embedding controls and the BOM.
  const strippedChars =
    /[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g;
  const nonAscii = /[^\u0000-\u007F]/;

  for (const line of ocrPage.lines) {
    const lineWords = line.words;
    const lastIndex = lineWords.length - 1;

    for (let index = 0; index < lineWords.length; index++) {
      const current = lineWords[index];
      const cleaned = current.text.replace(strippedChars, '');

      if (!cleaned.trim()) continue;

      const font = nonAscii.test(cleaned) ? primaryFont : latinFont;
      if (!font) {
        console.warn(`Font not available for text: "${cleaned}"`);
        continue;
      }

      // Width measurement that reports 0 when the font cannot encode the text.
      const measure = (sample: string, size: number): number => {
        try {
          return font.widthOfTextAtSize(sample, size);
        } catch {
          return 0;
        }
      };

      const placement = calculateWordTransform(
        current,
        line,
        pageHeight,
        (txt: string, size: number) => measure(txt, size)
      );

      if (placement.fontSize <= 0) continue;

      try {
        page.drawText(cleaned, {
          x: placement.x,
          y: placement.y,
          font,
          size: placement.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch (error) {
        console.warn(`Could not draw text "${cleaned}":`, error);
      }

      // Only words with a successor on a break-injecting line get a space.
      if (!line.injectWordBreaks || index >= lastIndex) continue;

      const gap = calculateSpaceTransform(
        current,
        lineWords[index + 1],
        line,
        pageHeight,
        (size: number) => measure(' ', size)
      );

      if (!gap || !(gap.horizontalScale > 0.1)) continue;

      try {
        page.drawText(' ', {
          x: gap.x,
          y: gap.y,
          font,
          size: gap.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch {
        console.warn(`Could not draw space between words`);
      }
    }
  }
}
|
|
|
|
/**
 * Reads the canvas as a PNG and resolves with the encoded bytes.
 * Rejects if the browser cannot produce a blob or the blob cannot be read.
 */
function canvasToPngBytes(canvas: HTMLCanvasElement): Promise<Uint8Array> {
  return new Promise<Uint8Array>((resolve, reject) => {
    canvas.toBlob((blob) => {
      if (!blob) {
        reject(new Error('Failed to create image blob'));
        return;
      }
      const reader = new FileReader();
      reader.onload = () => {
        resolve(new Uint8Array(reader.result as ArrayBuffer));
      };
      reader.onerror = () => {
        reject(new Error('Failed to read image data'));
      };
      reader.readAsArrayBuffer(blob);
    }, 'image/png');
  });
}

/**
 * Embeds the font(s) used for the invisible text layer into `pdfDoc`.
 *
 * The primary language is the first selected language found in the
 * CJK / Indic / Arabic / Cyrillic list, otherwise the first selected language,
 * otherwise `eng`. A separate Latin font is embedded only for an
 * Indic + Latin selection without CJK; in every other case the same font
 * serves both roles. If anything fails, Helvetica is used for both.
 */
async function embedOcrFonts(
  pdfDoc: PDFDocument,
  language: string
): Promise<{ primaryFont: PDFFont; latinFont: PDFFont }> {
  const selectedLangs = language.split('+');
  const cjkLangs = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
  const indicLangs = [
    'hin',
    'ben',
    'guj',
    'kan',
    'mal',
    'ori',
    'pan',
    'tam',
    'tel',
    'sin',
  ];
  const priorityLangs = [...cjkLangs, ...indicLangs, 'ara', 'rus', 'ukr'];

  const primaryLang =
    selectedLangs.find((l) => priorityLangs.includes(l)) ||
    selectedLangs[0] ||
    'eng';

  const hasCJK = selectedLangs.some((l) => cjkLangs.includes(l));
  const hasIndic = selectedLangs.some((l) => indicLangs.includes(l));
  const hasLatin =
    selectedLangs.some((l) => !priorityLangs.includes(l)) ||
    selectedLangs.includes('eng');
  const isIndicPlusLatin = hasIndic && hasLatin && !hasCJK;

  try {
    if (isIndicPlusLatin) {
      const [scriptFontBytes, latinFontBytes] = await Promise.all([
        getFontForLanguage(primaryLang),
        getFontForLanguage('eng'),
      ]);
      const primaryFont = await pdfDoc.embedFont(scriptFontBytes, {
        subset: false,
      });
      const latinFont = await pdfDoc.embedFont(latinFontBytes, {
        subset: false,
      });
      return { primaryFont, latinFont };
    }

    const fontBytes = await getFontForLanguage(primaryLang);
    const font = await pdfDoc.embedFont(fontBytes, { subset: false });
    return { primaryFont: font, latinFont: font };
  } catch (e) {
    console.error('Font loading failed, falling back to Helvetica', e);
    const fallback = await pdfDoc.embedFont(StandardFonts.Helvetica);
    return { primaryFont: fallback, latinFont: fallback };
  }
}

/**
 * Runs OCR over every page of a PDF and builds a new, searchable PDF.
 *
 * Each page is rendered to a canvas at `options.resolution`, optionally
 * binarized, recognised with Tesseract, and added to the output document as a
 * PNG image with a transparent text layer on top. Note that the PNG is
 * captured after binarization, so with `binarize` enabled the output pages
 * are black and white.
 *
 * Requires a browser environment (`document.createElement('canvas')`).
 *
 * The Tesseract worker and the source PDF handle are released on every exit
 * path, including failures during setup (bad PDF data, parameter errors).
 *
 * @param pdfBytes - Source PDF data.
 * @param options - Language, rendering and progress settings.
 * @returns The generated PDF (bytes and pdf-lib document) and the
 *   concatenated page text.
 * @throws Error if a canvas context or PNG blob cannot be created; errors
 *   from Tesseract, the PDF loader and pdf-lib propagate unchanged.
 */
export async function performOcr(
  pdfBytes: Uint8Array | ArrayBuffer,
  options: OcrOptions
): Promise<OcrResult> {
  const { language, resolution, binarize, whitelist, onProgress } = options;
  const progress = onProgress ?? (() => {});

  // Created before the worker so that nothing fallible sits between worker
  // creation and the try/finally that guarantees its termination.
  const newPdfDoc = await PDFDocument.create();
  newPdfDoc.registerFontkit(fontkit);

  let fullText = '';

  const worker = await Tesseract.createWorker(language, 1, {
    logger: (m: { status: string; progress: number }) => {
      progress(m.status, m.progress || 0);
    },
  });

  try {
    await worker.setParameters({
      tessjs_create_hocr: '1',
      tessedit_pageseg_mode: Tesseract.PSM.AUTO,
    });

    if (whitelist) {
      await worker.setParameters({
        tessedit_char_whitelist: whitelist,
      });
    }

    const pdf = await getPDFDocument({ data: pdfBytes }).promise;

    try {
      progress('Loading fonts...', 0);
      const { primaryFont, latinFont } = await embedOcrFonts(
        newPdfDoc,
        language
      );

      for (let i = 1; i <= pdf.numPages; i++) {
        progress(
          `Processing page ${i} of ${pdf.numPages}`,
          (i - 1) / pdf.numPages
        );

        const page = await pdf.getPage(i);
        const viewport = page.getViewport({ scale: resolution });

        const canvas = document.createElement('canvas');
        canvas.width = viewport.width;
        canvas.height = viewport.height;

        let data;
        let pngImageBytes: Uint8Array;
        try {
          const context = canvas.getContext('2d');
          if (!context) throw new Error('Failed to create canvas context');

          await page.render({ canvasContext: context, viewport, canvas })
            .promise;

          if (binarize) {
            binarizeCanvas(context);
          }

          const result = await worker.recognize(
            canvas,
            {},
            { text: true, hocr: true }
          );
          data = result.data;

          pngImageBytes = await canvasToPngBytes(canvas);
        } finally {
          // Release canvas memory, also when rendering or recognition fails.
          canvas.width = 0;
          canvas.height = 0;
        }

        const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);

        const pngImage = await newPdfDoc.embedPng(pngImageBytes);
        newPage.drawImage(pngImage, {
          x: 0,
          y: 0,
          width: viewport.width,
          height: viewport.height,
        });

        if (data.hocr) {
          const ocrPage = parseHocrDocument(data.hocr);
          drawOcrTextLayer(
            newPage,
            ocrPage,
            viewport.height,
            primaryFont,
            latinFont
          );
        }

        fullText += data.text + '\n\n';
      }
    } finally {
      // NOTE(review): assumes getPDFDocument resolves to a pdf.js
      // PDFDocumentProxy (it exposes numPages/getPage) — confirm in helpers.
      await pdf.destroy();
    }
  } finally {
    await worker.terminate();
  }

  const savedBytes = await newPdfDoc.save();

  return {
    pdfBytes: new Uint8Array(savedBytes),
    pdfDoc: newPdfDoc,
    fullText,
  };
}
|