From 37b5956bd55396cb922139fbac182d95a007b062 Mon Sep 17 00:00:00 2001 From: alam00000 Date: Sat, 21 Mar 2026 16:39:51 +0530 Subject: [PATCH] feat: add option to opt out of embedding full fonts in OCR settings --- src/js/logic/ocr-pdf-page.ts | 4 ++++ src/js/utils/ocr.ts | 18 ++++++++++++++---- src/pages/ocr-pdf.html | 10 ++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/js/logic/ocr-pdf-page.ts b/src/js/logic/ocr-pdf-page.ts index 1f8318c..85af932 100644 --- a/src/js/logic/ocr-pdf-page.ts +++ b/src/js/logic/ocr-pdf-page.ts @@ -121,6 +121,9 @@ async function runOCR() { ); const binarize = (document.getElementById('ocr-binarize') as HTMLInputElement) .checked; + const embedFullFonts = ( + document.getElementById('ocr-embed-full-fonts') as HTMLInputElement + ).checked; const whitelist = ( document.getElementById('ocr-whitelist') as HTMLInputElement ).value; @@ -154,6 +157,7 @@ async function runOCR() { resolution: scale, binarize, whitelist, + embedFullFonts, onProgress: updateProgress, }); diff --git a/src/js/utils/ocr.ts b/src/js/utils/ocr.ts index 931d3c1..f0f4bd3 100644 --- a/src/js/utils/ocr.ts +++ b/src/js/utils/ocr.ts @@ -16,6 +16,7 @@ export interface OcrOptions { resolution: number; binarize: boolean; whitelist: string; + embedFullFonts?: boolean; onProgress?: (status: string, progress: number) => void; } @@ -131,7 +132,14 @@ export async function performOcr( pdfBytes: Uint8Array | ArrayBuffer, options: OcrOptions ): Promise { - const { language, resolution, binarize, whitelist, onProgress } = options; + const { + language, + resolution, + binarize, + whitelist, + embedFullFonts, + onProgress, + } = options; const progress = onProgress || (() => {}); const worker = await createConfiguredTesseractWorker( @@ -198,14 +206,16 @@ export async function performOcr( getFontForLanguage('eng'), ]); primaryFont = await newPdfDoc.embedFont(scriptFontBytes, { - subset: false, + subset: !embedFullFonts, }); latinFont = await newPdfDoc.embedFont(latinFontBytes, { - subset: false, + subset: !embedFullFonts, }); } else { const fontBytes = await getFontForLanguage(primaryLang); - primaryFont = await newPdfDoc.embedFont(fontBytes, { subset: false }); + primaryFont = await newPdfDoc.embedFont(fontBytes, { + subset: !embedFullFonts, + }); latinFont = primaryFont; } } catch (e) { diff --git a/src/pages/ocr-pdf.html b/src/pages/ocr-pdf.html index 0baa3a1..db2e0eb 100644 --- a/src/pages/ocr-pdf.html +++ b/src/pages/ocr-pdf.html @@ -259,6 +259,16 @@ /> Binarize Image (Enhance Contrast for Clean Scans) +