feat: add OCR setting to opt in to embedding full fonts (fonts are subset by default)

This commit is contained in:
alam00000
2026-03-21 16:39:51 +05:30
parent b4a2c98497
commit 37b5956bd5
3 changed files with 28 additions and 4 deletions

View File

@@ -121,6 +121,9 @@ async function runOCR() {
); );
const binarize = (document.getElementById('ocr-binarize') as HTMLInputElement) const binarize = (document.getElementById('ocr-binarize') as HTMLInputElement)
.checked; .checked;
const embedFullFonts = (
document.getElementById('ocr-embed-full-fonts') as HTMLInputElement
).checked;
const whitelist = ( const whitelist = (
document.getElementById('ocr-whitelist') as HTMLInputElement document.getElementById('ocr-whitelist') as HTMLInputElement
).value; ).value;
@@ -154,6 +157,7 @@ async function runOCR() {
resolution: scale, resolution: scale,
binarize, binarize,
whitelist, whitelist,
embedFullFonts,
onProgress: updateProgress, onProgress: updateProgress,
}); });

View File

@@ -16,6 +16,7 @@ export interface OcrOptions {
resolution: number; resolution: number;
binarize: boolean; binarize: boolean;
whitelist: string; whitelist: string;
embedFullFonts?: boolean;
onProgress?: (status: string, progress: number) => void; onProgress?: (status: string, progress: number) => void;
} }
@@ -131,7 +132,14 @@ export async function performOcr(
pdfBytes: Uint8Array | ArrayBuffer, pdfBytes: Uint8Array | ArrayBuffer,
options: OcrOptions options: OcrOptions
): Promise<OcrResult> { ): Promise<OcrResult> {
const { language, resolution, binarize, whitelist, onProgress } = options; const {
language,
resolution,
binarize,
whitelist,
embedFullFonts,
onProgress,
} = options;
const progress = onProgress || (() => {}); const progress = onProgress || (() => {});
const worker = await createConfiguredTesseractWorker( const worker = await createConfiguredTesseractWorker(
@@ -198,14 +206,16 @@ export async function performOcr(
getFontForLanguage('eng'), getFontForLanguage('eng'),
]); ]);
primaryFont = await newPdfDoc.embedFont(scriptFontBytes, { primaryFont = await newPdfDoc.embedFont(scriptFontBytes, {
subset: false, subset: !embedFullFonts,
}); });
latinFont = await newPdfDoc.embedFont(latinFontBytes, { latinFont = await newPdfDoc.embedFont(latinFontBytes, {
subset: false, subset: !embedFullFonts,
}); });
} else { } else {
const fontBytes = await getFontForLanguage(primaryLang); const fontBytes = await getFontForLanguage(primaryLang);
primaryFont = await newPdfDoc.embedFont(fontBytes, { subset: false }); primaryFont = await newPdfDoc.embedFont(fontBytes, {
subset: !embedFullFonts,
});
latinFont = primaryFont; latinFont = primaryFont;
} }
} catch (e) { } catch (e) {

View File

@@ -259,6 +259,16 @@
/> />
Binarize Image (Enhance Contrast for Clean Scans) Binarize Image (Enhance Contrast for Clean Scans)
</label> </label>
<label
class="flex items-center gap-2 text-sm text-gray-300 cursor-pointer"
>
<input
type="checkbox"
id="ocr-embed-full-fonts"
class="w-4 h-4 rounded text-indigo-600 bg-gray-700 border-gray-600"
/>
Embed Full Fonts (Larger file, better compatibility)
</label>
<!-- Whitelist Presets --> <!-- Whitelist Presets -->
<div> <div>
<label <label