feat: integrate Tesseract.js with improved language availability and font handling
- Refactored OCR page recognition to use a configured Tesseract worker.
- Added functions to manage font URLs and asset filenames based on language.
- Implemented language availability checks and error handling for unsupported languages.
- Enhanced the PDF workflow to display available OCR languages and handle user selections.
- Introduced utility functions for resolving Tesseract asset configurations.
- Added tests for OCR functionality, font loading, and Tesseract runtime behavior.
- Updated global types to include environment variables for Tesseract and font configurations.
This commit is contained in:
@@ -1,37 +1,39 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
|
||||
import type Tesseract from 'tesseract.js';
|
||||
import type { ComparePageModel, CompareTextItem } from '../types.ts';
|
||||
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
|
||||
import {
|
||||
joinCompareTextItems,
|
||||
normalizeCompareText,
|
||||
} from './text-normalization.ts';
|
||||
import { createConfiguredTesseractWorker } from '../../utils/tesseract-runtime.js';
|
||||
|
||||
type OcrWord = {
|
||||
text: string;
|
||||
bbox: {
|
||||
x0: number;
|
||||
y0: number;
|
||||
x1: number;
|
||||
y1: number;
|
||||
};
|
||||
};
|
||||
type OcrWord = Tesseract.Word;
|
||||
type OcrRecognizeResult = Tesseract.RecognizeResult;
|
||||
type OcrPageWithWords = Tesseract.Page & { words: OcrWord[] };
|
||||
|
||||
export async function recognizePageCanvas(
|
||||
canvas: HTMLCanvasElement,
|
||||
language: string,
|
||||
onProgress?: (status: string, progress: number) => void
|
||||
): Promise<ComparePageModel> {
|
||||
const result = await Tesseract.recognize(canvas, language, {
|
||||
logger(message) {
|
||||
const worker = await createConfiguredTesseractWorker(
|
||||
language,
|
||||
1,
|
||||
(message) => {
|
||||
onProgress?.(message.status, message.progress || 0);
|
||||
},
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
const ocrData = result.data as unknown as { words?: OcrWord[] };
|
||||
const words = ((ocrData.words || []) as OcrWord[])
|
||||
let result: OcrRecognizeResult;
|
||||
try {
|
||||
result = await worker.recognize(canvas);
|
||||
} finally {
|
||||
await worker.terminate();
|
||||
}
|
||||
|
||||
const words = (result.data as OcrPageWithWords).words
|
||||
.map((word, index) => {
|
||||
const normalizedText = normalizeCompareText(word.text || '');
|
||||
const normalizedText = normalizeCompareText(word.text);
|
||||
if (!normalizedText) return null;
|
||||
|
||||
const item: CompareTextItem = {
|
||||
|
||||
@@ -1,189 +1,233 @@
|
||||
/**
|
||||
* Font mappings for OCR text layer rendering
|
||||
* Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs
|
||||
*/
|
||||
|
||||
/**
 * Lookup from Tesseract language code to the Noto font family used when
 * rendering that language in the OCR text layer.
 *
 * The table is written as one `[family, codes]` group per script and then
 * flattened, so each family name is spelled exactly once.
 */
export const languageToFontFamily: Record<string, string> = (() => {
  const scriptGroups: ReadonlyArray<readonly [string, readonly string[]]> = [
    // CJK languages
    ['Noto Sans JP', ['jpn']],
    ['Noto Sans SC', ['chi_sim']],
    ['Noto Sans TC', ['chi_tra']],
    ['Noto Sans KR', ['kor']],

    // Arabic script. `uig` (Uyghur) is Arabic-script text, which the base
    // Noto Sans face cannot render, so it belongs here rather than with Latin.
    ['Noto Sans Arabic', ['ara', 'fas', 'urd', 'pus', 'kur', 'uig']],

    // Indic scripts
    ['Noto Sans Devanagari', ['hin', 'mar', 'san', 'nep']],
    ['Noto Sans Bengali', ['ben', 'asm']],
    ['Noto Sans Tamil', ['tam']],
    ['Noto Sans Telugu', ['tel']],
    ['Noto Sans Kannada', ['kan']],
    ['Noto Sans Malayalam', ['mal']],
    ['Noto Sans Gujarati', ['guj']],
    ['Noto Sans Gurmukhi', ['pan']], // Punjabi
    ['Noto Sans Oriya', ['ori']],
    ['Noto Sans Sinhala', ['sin']],

    // South-East Asian scripts
    ['Noto Sans Thai', ['tha']],
    ['Noto Sans Lao', ['lao']],
    ['Noto Sans Khmer', ['khm']],
    ['Noto Sans Myanmar', ['mya']],

    // Tibetan script. `dzo` (Dzongkha) is written in Tibetan script, so it
    // shares the Tibetan face instead of falling back to Latin Noto Sans.
    ['Noto Serif Tibetan', ['bod', 'dzo']],

    // Other scripts with a dedicated face
    ['Noto Sans Georgian', ['kat', 'kat_old']],
    ['Noto Sans Armenian', ['hye']],
    ['Noto Sans Hebrew', ['heb', 'yid']],
    ['Noto Sans Ethiopic', ['amh', 'tir']],
    ['Noto Sans Cherokee', ['chr']],
    ['Noto Sans Syriac', ['syr']],

    // Cyrillic script (base Noto Sans includes Cyrillic)
    [
      'Noto Sans',
      [
        'bel', 'bul', 'mkd', 'rus', 'srp', 'srp_latn', 'ukr',
        'kaz', 'kir', 'tgk', 'uzb', 'uzb_cyrl', 'aze_cyrl',
      ],
    ],

    // Latin and Greek script (covered by base Noto Sans).
    // NOTE(review): `iku` (Inuktitut) is presumably syllabics text; no
    // matching face exists in the URL table, so it stays on the base font.
    [
      'Noto Sans',
      [
        'afr', 'aze', 'bos', 'cat', 'ceb', 'ces', 'cym', 'dan', 'deu', 'ell',
        'eng', 'enm', 'epo', 'est', 'eus', 'fin', 'fra', 'frk', 'frm', 'gle',
        'glg', 'grc', 'hat', 'hrv', 'hun', 'iku', 'ind', 'isl', 'ita',
        'ita_old', 'jav', 'lat', 'lav', 'lit', 'mlt', 'msa', 'nld', 'nor',
        'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'spa_old', 'sqi', 'swa',
        'swe', 'tgl', 'tur', 'vie',
      ],
    ],
  ];

  const table: Record<string, string> = {};
  for (const [family, codes] of scriptGroups) {
    for (const code of codes) {
      table[code] = family;
    }
  }
  return table;
})();
|
||||
|
||||
/**
 * Download URL of the regular-weight font file for each supported font family.
 *
 * All files are served from raw.githack.com mirrors of the googlefonts
 * repositories; the two builders below only assemble the shared URL layout.
 *
 * NOTE(review): every URL points at the `main` branch, so the files can move
 * or change upstream without notice — consider pinning to a commit.
 */
export const fontFamilyToUrl: Record<string, string> = (() => {
  const cjkRoot = 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF';
  const notoRoot = 'https://raw.githack.com/googlefonts/noto-fonts/main';

  // e.g. cjk('Japanese', 'jp') -> .../Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf
  const cjk = (region: string, tag: string): string =>
    `${cjkRoot}/${region}/NotoSansCJK${tag}-Regular.otf`;

  // e.g. noto('NotoSansThai') -> .../hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf
  const noto = (
    face: string,
    hinting: 'hinted' | 'unhinted' = 'hinted'
  ): string => `${notoRoot}/${hinting}/ttf/${face}/${face}-Regular.ttf`;

  return {
    'Noto Sans JP': cjk('Japanese', 'jp'),
    'Noto Sans SC': cjk('SimplifiedChinese', 'sc'),
    'Noto Sans TC': cjk('TraditionalChinese', 'tc'),
    'Noto Sans KR': cjk('Korean', 'kr'),
    'Noto Sans Arabic': noto('NotoSansArabic'),
    // Devanagari is the only family taken from the unhinted tree.
    'Noto Sans Devanagari': noto('NotoSansDevanagari', 'unhinted'),
    'Noto Sans Bengali': noto('NotoSansBengali'),
    'Noto Sans Gujarati': noto('NotoSansGujarati'),
    'Noto Sans Kannada': noto('NotoSansKannada'),
    'Noto Sans Malayalam': noto('NotoSansMalayalam'),
    'Noto Sans Oriya': noto('NotoSansOriya'),
    'Noto Sans Gurmukhi': noto('NotoSansGurmukhi'),
    'Noto Sans Tamil': noto('NotoSansTamil'),
    'Noto Sans Telugu': noto('NotoSansTelugu'),
    'Noto Sans Sinhala': noto('NotoSansSinhala'),
    'Noto Sans Thai': noto('NotoSansThai'),
    'Noto Sans Khmer': noto('NotoSansKhmer'),
    'Noto Sans Lao': noto('NotoSansLao'),
    'Noto Sans Myanmar': noto('NotoSansMyanmar'),
    'Noto Sans Hebrew': noto('NotoSansHebrew'),
    'Noto Sans Georgian': noto('NotoSansGeorgian'),
    'Noto Sans Ethiopic': noto('NotoSansEthiopic'),
    'Noto Serif Tibetan': noto('NotoSerifTibetan'),
    'Noto Sans Cherokee': noto('NotoSansCherokee'),
    'Noto Sans Armenian': noto('NotoSansArmenian'),
    'Noto Sans Syriac': noto('NotoSansSyriac'),
    'Noto Sans': noto('NotoSans'),
  };
})();
|
||||
/**
|
||||
* Font mappings for OCR text layer rendering
|
||||
* Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs
|
||||
*/
|
||||
|
||||
/**
 * Lookup from Tesseract language code to the Noto font family used when
 * rendering that language in the OCR text layer.
 *
 * The table is written as one `[family, codes]` group per script and then
 * flattened, so each family name is spelled exactly once.
 */
export const languageToFontFamily: Record<string, string> = (() => {
  const scriptGroups: ReadonlyArray<readonly [string, readonly string[]]> = [
    // CJK languages
    ['Noto Sans JP', ['jpn']],
    ['Noto Sans SC', ['chi_sim']],
    ['Noto Sans TC', ['chi_tra']],
    ['Noto Sans KR', ['kor']],

    // Arabic script. `uig` (Uyghur) is Arabic-script text, which the base
    // Noto Sans face cannot render, so it belongs here rather than with Latin.
    ['Noto Sans Arabic', ['ara', 'fas', 'urd', 'pus', 'kur', 'uig']],

    // Indic scripts
    ['Noto Sans Devanagari', ['hin', 'mar', 'san', 'nep']],
    ['Noto Sans Bengali', ['ben', 'asm']],
    ['Noto Sans Tamil', ['tam']],
    ['Noto Sans Telugu', ['tel']],
    ['Noto Sans Kannada', ['kan']],
    ['Noto Sans Malayalam', ['mal']],
    ['Noto Sans Gujarati', ['guj']],
    ['Noto Sans Gurmukhi', ['pan']], // Punjabi
    ['Noto Sans Oriya', ['ori']],
    ['Noto Sans Sinhala', ['sin']],

    // South-East Asian scripts
    ['Noto Sans Thai', ['tha']],
    ['Noto Sans Lao', ['lao']],
    ['Noto Sans Khmer', ['khm']],
    ['Noto Sans Myanmar', ['mya']],

    // Tibetan script. `dzo` (Dzongkha) is written in Tibetan script, so it
    // shares the Tibetan face instead of falling back to Latin Noto Sans.
    ['Noto Serif Tibetan', ['bod', 'dzo']],

    // Other scripts with a dedicated face
    ['Noto Sans Georgian', ['kat', 'kat_old']],
    ['Noto Sans Armenian', ['hye']],
    ['Noto Sans Hebrew', ['heb', 'yid']],
    ['Noto Sans Ethiopic', ['amh', 'tir']],
    ['Noto Sans Cherokee', ['chr']],
    ['Noto Sans Syriac', ['syr']],

    // Cyrillic script (base Noto Sans includes Cyrillic)
    [
      'Noto Sans',
      [
        'bel', 'bul', 'mkd', 'rus', 'srp', 'srp_latn', 'ukr',
        'kaz', 'kir', 'tgk', 'uzb', 'uzb_cyrl', 'aze_cyrl',
      ],
    ],

    // Latin and Greek script (covered by base Noto Sans).
    // NOTE(review): `iku` (Inuktitut) is presumably syllabics text; no
    // matching face exists in the URL table, so it stays on the base font.
    [
      'Noto Sans',
      [
        'afr', 'aze', 'bos', 'cat', 'ceb', 'ces', 'cym', 'dan', 'deu', 'ell',
        'eng', 'enm', 'epo', 'est', 'eus', 'fin', 'fra', 'frk', 'frm', 'gle',
        'glg', 'grc', 'hat', 'hrv', 'hun', 'iku', 'ind', 'isl', 'ita',
        'ita_old', 'jav', 'lat', 'lav', 'lit', 'mlt', 'msa', 'nld', 'nor',
        'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'spa_old', 'sqi', 'swa',
        'swe', 'tgl', 'tur', 'vie',
      ],
    ],
  ];

  const table: Record<string, string> = {};
  for (const [family, codes] of scriptGroups) {
    for (const code of codes) {
      table[code] = family;
    }
  }
  return table;
})();
|
||||
|
||||
/**
 * Download URL of the regular-weight font file for each supported font family.
 *
 * All files are served from raw.githack.com mirrors of the googlefonts
 * repositories; the two builders below only assemble the shared URL layout.
 *
 * NOTE(review): every URL points at the `main` branch, so the files can move
 * or change upstream without notice — consider pinning to a commit.
 */
export const fontFamilyToUrl: Record<string, string> = (() => {
  const cjkRoot = 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF';
  const notoRoot = 'https://raw.githack.com/googlefonts/noto-fonts/main';

  // e.g. cjk('Japanese', 'jp') -> .../Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf
  const cjk = (region: string, tag: string): string =>
    `${cjkRoot}/${region}/NotoSansCJK${tag}-Regular.otf`;

  // e.g. noto('NotoSansThai') -> .../hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf
  const noto = (
    face: string,
    hinting: 'hinted' | 'unhinted' = 'hinted'
  ): string => `${notoRoot}/${hinting}/ttf/${face}/${face}-Regular.ttf`;

  return {
    'Noto Sans JP': cjk('Japanese', 'jp'),
    'Noto Sans SC': cjk('SimplifiedChinese', 'sc'),
    'Noto Sans TC': cjk('TraditionalChinese', 'tc'),
    'Noto Sans KR': cjk('Korean', 'kr'),
    'Noto Sans Arabic': noto('NotoSansArabic'),
    // Devanagari is the only family taken from the unhinted tree.
    'Noto Sans Devanagari': noto('NotoSansDevanagari', 'unhinted'),
    'Noto Sans Bengali': noto('NotoSansBengali'),
    'Noto Sans Gujarati': noto('NotoSansGujarati'),
    'Noto Sans Kannada': noto('NotoSansKannada'),
    'Noto Sans Malayalam': noto('NotoSansMalayalam'),
    'Noto Sans Oriya': noto('NotoSansOriya'),
    'Noto Sans Gurmukhi': noto('NotoSansGurmukhi'),
    'Noto Sans Tamil': noto('NotoSansTamil'),
    'Noto Sans Telugu': noto('NotoSansTelugu'),
    'Noto Sans Sinhala': noto('NotoSansSinhala'),
    'Noto Sans Thai': noto('NotoSansThai'),
    'Noto Sans Khmer': noto('NotoSansKhmer'),
    'Noto Sans Lao': noto('NotoSansLao'),
    'Noto Sans Myanmar': noto('NotoSansMyanmar'),
    'Noto Sans Hebrew': noto('NotoSansHebrew'),
    'Noto Sans Georgian': noto('NotoSansGeorgian'),
    'Noto Sans Ethiopic': noto('NotoSansEthiopic'),
    'Noto Serif Tibetan': noto('NotoSerifTibetan'),
    'Noto Sans Cherokee': noto('NotoSansCherokee'),
    'Noto Sans Armenian': noto('NotoSansArmenian'),
    'Noto Sans Syriac': noto('NotoSansSyriac'),
    'Noto Sans': noto('NotoSans'),
  };
})();
|
||||
|
||||
/**
 * Returns the download URL registered for a font family.
 *
 * @param fontFamily - Family name as used for the keys of `fontFamilyToUrl`.
 * @returns The family's URL, or the base "Noto Sans" URL when the family has
 *   no (non-empty) entry of its own.
 */
export function getFontUrlForFamily(fontFamily: string): string {
  // Only consult own keys: a bare index lookup would also match inherited
  // members such as "constructor" and hand back a non-string value.
  const isRegistered = Object.prototype.hasOwnProperty.call(
    fontFamilyToUrl,
    fontFamily
  );
  const registeredUrl = isRegistered ? fontFamilyToUrl[fontFamily] : undefined;

  return registeredUrl || fontFamilyToUrl['Noto Sans'];
}
|
||||
|
||||
/**
 * Derives the asset file name (last path segment) of the font registered for
 * a family, e.g. "NotoSans-Regular.ttf".
 *
 * @param fontFamily - Family name; resolved through `getFontUrlForFamily`.
 * @returns The file name portion of the family's default URL.
 * @throws Error when the URL has no file name (for example it ends in "/").
 */
export function getFontAssetFileName(fontFamily: string): string {
  const defaultUrl = getFontUrlForFamily(fontFamily);

  // Ignore any "?query" or "#fragment" so it cannot leak into the file name.
  const suffixStart = defaultUrl.search(/[?#]/);
  const urlPath =
    suffixStart === -1 ? defaultUrl : defaultUrl.slice(0, suffixStart);

  // lastIndexOf returns -1 when there is no "/", which makes the slice keep
  // the whole string — the same result as taking the last "/"-separated part.
  const fileName = urlPath.slice(urlPath.lastIndexOf('/') + 1);

  if (!fileName) {
    throw new Error(
      `Could not resolve a font asset filename for ${fontFamily}`
    );
  }

  return fileName;
}
|
||||
|
||||
@@ -4,6 +4,11 @@ import { downloadFile, formatBytes } from '../utils/helpers.js';
|
||||
import { icons, createIcons } from 'lucide';
|
||||
import { OcrState } from '@/types';
|
||||
import { performOcr } from '../utils/ocr.js';
|
||||
import {
|
||||
getAvailableTesseractLanguageEntries,
|
||||
resolveConfiguredTesseractAvailableLanguages,
|
||||
UnsupportedOcrLanguageError,
|
||||
} from '../utils/tesseract-language-availability.js';
|
||||
|
||||
const pageState: OcrState = {
|
||||
file: null,
|
||||
@@ -80,6 +85,30 @@ function resetState() {
|
||||
if (processBtn) processBtn.disabled = true;
|
||||
}
|
||||
|
||||
/**
 * Refreshes the `#lang-availability-note` element that tells the user which
 * OCR languages this deployment offers.
 *
 * - No element on the page: does nothing.
 * - `resolveConfiguredTesseractAvailableLanguages()` returns a falsy value:
 *   the notice is hidden and cleared.
 * - Otherwise the notice is shown, listing the available language names or a
 *   warning when the list of available entries is empty.
 */
function updateLanguageAvailabilityNotice(): void {
  const notice = document.getElementById('lang-availability-note');
  if (!notice) return;

  if (!resolveConfiguredTesseractAvailableLanguages()) {
    notice.classList.add('hidden');
    notice.textContent = '';
    return;
  }

  const availableEntries = getAvailableTesseractLanguageEntries();
  const message =
    availableEntries.length === 0
      ? 'This deployment does not expose any valid OCR languages. Rebuild it with VITE_TESSERACT_AVAILABLE_LANGUAGES set to valid Tesseract codes.'
      : `This deployment bundles OCR for: ${availableEntries
          .map(([, name]) => name)
          .join(', ')}.`;

  notice.classList.remove('hidden');
  notice.textContent = message;
}
|
||||
|
||||
async function runOCR() {
|
||||
const selectedLangs = Array.from(
|
||||
document.querySelectorAll('.lang-checkbox:checked')
|
||||
@@ -142,10 +171,14 @@ async function runOCR() {
|
||||
if (textOutput) textOutput.value = result.fullText.trim();
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
showAlert(
|
||||
'OCR Error',
|
||||
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
|
||||
);
|
||||
if (e instanceof UnsupportedOcrLanguageError) {
|
||||
showAlert('OCR Language Not Available', e.message);
|
||||
} else {
|
||||
showAlert(
|
||||
'OCR Error',
|
||||
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
|
||||
);
|
||||
}
|
||||
if (toolOptions) toolOptions.classList.remove('hidden');
|
||||
if (ocrProgress) ocrProgress.classList.add('hidden');
|
||||
}
|
||||
@@ -213,10 +246,21 @@ function populateLanguageList() {
|
||||
|
||||
langList.innerHTML = '';
|
||||
|
||||
Object.entries(tesseractLanguages).forEach(function ([code, name]) {
|
||||
const availableEntries = getAvailableTesseractLanguageEntries();
|
||||
if (availableEntries.length === 0) {
|
||||
const emptyState = document.createElement('p');
|
||||
emptyState.className = 'text-sm text-yellow-300 p-2';
|
||||
emptyState.textContent =
|
||||
'No OCR languages are available in this deployment.';
|
||||
langList.appendChild(emptyState);
|
||||
return;
|
||||
}
|
||||
|
||||
availableEntries.forEach(function ([code, name]) {
|
||||
const label = document.createElement('label');
|
||||
label.className =
|
||||
'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer';
|
||||
label.dataset.search = `${name} ${code}`.toLowerCase();
|
||||
|
||||
const checkbox = document.createElement('input');
|
||||
checkbox.type = 'checkbox';
|
||||
@@ -253,6 +297,7 @@ document.addEventListener('DOMContentLoaded', function () {
|
||||
const downloadPdfBtn = document.getElementById('download-searchable-pdf');
|
||||
|
||||
populateLanguageList();
|
||||
updateLanguageAvailabilityNotice();
|
||||
|
||||
if (backBtn) {
|
||||
backBtn.addEventListener('click', function () {
|
||||
@@ -304,9 +349,9 @@ document.addEventListener('DOMContentLoaded', function () {
|
||||
langSearch.addEventListener('input', function () {
|
||||
const searchTerm = langSearch.value.toLowerCase();
|
||||
langList.querySelectorAll('label').forEach(function (label) {
|
||||
(label as HTMLElement).style.display = label.textContent
|
||||
?.toLowerCase()
|
||||
.includes(searchTerm)
|
||||
(label as HTMLElement).style.display = (
|
||||
label as HTMLElement
|
||||
).dataset.search?.includes(searchTerm)
|
||||
? ''
|
||||
: 'none';
|
||||
});
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import { showAlert } from '../ui.js';
|
||||
import { tesseractLanguages } from '../config/tesseract-languages.js';
|
||||
import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor';
|
||||
import { executeWorkflow } from '../workflow/engine';
|
||||
import { getAvailableTesseractLanguageEntries } from '../utils/tesseract-language-availability.js';
|
||||
import {
|
||||
nodeRegistry,
|
||||
getNodesByCategory,
|
||||
@@ -1194,7 +1194,7 @@ function showNodeSettings(node: BaseWorkflowNode) {
|
||||
{ label: 'High (288 DPI)', value: '3.0' },
|
||||
{ label: 'Ultra (384 DPI)', value: '4.0' },
|
||||
],
|
||||
language: Object.entries(tesseractLanguages).map(([code, name]) => ({
|
||||
language: getAvailableTesseractLanguageEntries().map(([code, name]) => ({
|
||||
label: name,
|
||||
value: code,
|
||||
})),
|
||||
|
||||
@@ -1,281 +1,330 @@
|
||||
import { languageToFontFamily, fontFamilyToUrl } from '../config/font-mappings.js';
|
||||
|
||||
const fontCache: Map<string, ArrayBuffer> = new Map();
|
||||
|
||||
const DB_NAME = 'bentopdf-fonts';
|
||||
const DB_VERSION = 1;
|
||||
const STORE_NAME = 'fonts';
|
||||
|
||||
/**
 * Opens the IndexedDB database used as the persistent font cache, creating
 * the object store the first time the database is created or upgraded.
 *
 * @returns The open database connection; the caller is responsible for it.
 * @throws Rejects with the request's error when the database cannot be opened.
 */
async function openFontDB(): Promise<IDBDatabase> {
  return new Promise<IDBDatabase>((resolve, reject) => {
    const openRequest = indexedDB.open(DB_NAME, DB_VERSION);

    // Runs before `onsuccess` when the database is new or its version grew.
    openRequest.onupgradeneeded = () => {
      const db = openRequest.result;
      if (!db.objectStoreNames.contains(STORE_NAME)) {
        db.createObjectStore(STORE_NAME);
      }
    };

    openRequest.onsuccess = () => resolve(openRequest.result);
    openRequest.onerror = () => reject(openRequest.error);
  });
}
|
||||
|
||||
/**
 * Reads a previously stored font from the IndexedDB cache.
 *
 * This is best-effort: any failure (opening the database or reading the
 * record) is logged and reported as a cache miss.
 *
 * @param fontFamily - Cache key; the font family name.
 * @returns The stored font bytes, or `null` when absent or on any error.
 */
async function getCachedFontFromDB(
  fontFamily: string
): Promise<ArrayBuffer | null> {
  try {
    const db = await openFontDB();
    try {
      // `await` is required here: returning the bare promise would let a
      // rejected read escape the surrounding try/catch.
      return await new Promise<ArrayBuffer | null>((resolve, reject) => {
        const request = db
          .transaction(STORE_NAME, 'readonly')
          .objectStore(STORE_NAME)
          .get(fontFamily);

        request.onsuccess = () => resolve(request.result ?? null);
        request.onerror = () => reject(request.error);
      });
    } finally {
      // Each call opens its own connection, so release it when done.
      db.close();
    }
  } catch (error) {
    console.warn('IndexedDB read failed:', error);
    return null;
  }
}
|
||||
|
||||
/**
 * Stores font bytes in the IndexedDB cache under the family name.
 *
 * This is best-effort: failures are logged and never thrown to the caller.
 *
 * @param fontFamily - Cache key; the font family name.
 * @param fontBuffer - Raw font file contents to persist.
 */
async function saveFontToDB(
  fontFamily: string,
  fontBuffer: ArrayBuffer
): Promise<void> {
  try {
    const db = await openFontDB();
    try {
      // Awaited so that a failed write is handled by the catch below instead
      // of surfacing as a rejection in the caller.
      await new Promise<void>((resolve, reject) => {
        const transaction = db.transaction(STORE_NAME, 'readwrite');

        // Settle on the transaction, not the request: the write only counts
        // once the transaction has completed.
        transaction.oncomplete = () => resolve();
        transaction.onerror = () => reject(transaction.error);
        transaction.onabort = () => reject(transaction.error);

        transaction.objectStore(STORE_NAME).put(fontBuffer, fontFamily);
      });
    } finally {
      // Each call opens its own connection, so release it when done.
      db.close();
    }
  } catch (error) {
    console.warn('IndexedDB write failed:', error);
  }
}
|
||||
|
||||
/**
 * Loads the font file used to render text in the given language.
 *
 * Lookup order: in-memory cache, IndexedDB cache, then network download.
 * Downloaded fonts are written back to both caches. Cache problems never
 * prevent a download and never discard a font that was already downloaded.
 *
 * @param lang - Tesseract language code; unknown codes use "Noto Sans".
 * @returns The font file contents.
 * @throws The download error when even the base "Noto Sans" font cannot be
 *   fetched. A failure for any other family falls back to the `eng` font.
 */
export async function getFontForLanguage(lang: string): Promise<ArrayBuffer> {
  const fontFamily = languageToFontFamily[lang] || 'Noto Sans';

  const inMemory = fontCache.get(fontFamily);
  if (inMemory) {
    return inMemory;
  }

  let persisted: ArrayBuffer | null = null;
  try {
    persisted = await getCachedFontFromDB(fontFamily);
  } catch (error) {
    // A broken persistent cache is just a cache miss; go to the network.
    console.warn(`Font cache lookup failed for ${fontFamily}.`, error);
  }
  if (persisted) {
    fontCache.set(fontFamily, persisted);
    return persisted;
  }

  let fontBuffer: ArrayBuffer;
  try {
    const fontUrl = fontFamilyToUrl[fontFamily] || fontFamilyToUrl['Noto Sans'];
    const fontResponse = await fetch(fontUrl);

    if (!fontResponse.ok) {
      throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`);
    }

    fontBuffer = await fontResponse.arrayBuffer();
  } catch (error) {
    console.warn(
      `Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`,
      error
    );

    // 'eng' maps to the base "Noto Sans" family, so this recurses at most once.
    if (fontFamily !== 'Noto Sans') {
      return await getFontForLanguage('eng');
    }

    throw error;
  }

  fontCache.set(fontFamily, fontBuffer);
  try {
    await saveFontToDB(fontFamily, fontBuffer);
  } catch (error) {
    // Persisting is an optimisation only; the downloaded font is still valid.
    console.warn(`Could not persist font ${fontFamily}.`, error);
  }

  return fontBuffer;
}
|
||||
|
||||
/**
 * Ordered list of `[language code, character class]` pairs used by
 * `detectScripts`. Each pattern matches any single character from the Unicode
 * block(s) of one script; the code is the representative Tesseract language
 * for that script. The order here is the order of the returned codes.
 */
const SCRIPT_DETECTORS: ReadonlyArray<readonly [string, RegExp]> = [
  ['jpn', /[\u3040-\u309F\u30A0-\u30FF]/], // Hiragana, Katakana
  ['kor', /[\uAC00-\uD7A3\u1100-\u11FF]/], // Hangul Syllables, Hangul Jamo
  ['chi_sim', /[\u4E00-\u9FFF\u3400-\u4DBF]/], // CJK Unified Ideographs, Ext A
  ['ara', /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/], // Arabic
  ['hin', /[\u0900-\u097F]/], // Devanagari (Hindi, Marathi, etc.)
  ['ben', /[\u0980-\u09FF]/], // Bengali
  ['tam', /[\u0B80-\u0BFF]/], // Tamil
  ['tel', /[\u0C00-\u0C7F]/], // Telugu
  ['kan', /[\u0C80-\u0CFF]/], // Kannada
  ['mal', /[\u0D00-\u0D7F]/], // Malayalam
  ['guj', /[\u0A80-\u0AFF]/], // Gujarati
  ['pan', /[\u0A00-\u0A7F]/], // Punjabi (Gurmukhi)
  ['ori', /[\u0B00-\u0B7F]/], // Oriya
  ['sin', /[\u0D80-\u0DFF]/], // Sinhala
  ['tha', /[\u0E00-\u0E7F]/], // Thai
  ['lao', /[\u0E80-\u0EFF]/], // Lao
  ['khm', /[\u1780-\u17FF]/], // Khmer
  ['mya', /[\u1000-\u109F]/], // Myanmar
  ['bod', /[\u0F00-\u0FFF]/], // Tibetan
  ['kat', /[\u10A0-\u10FF]/], // Georgian
  ['hye', /[\u0530-\u058F]/], // Armenian
  ['heb', /[\u0590-\u05FF]/], // Hebrew
  ['amh', /[\u1200-\u137F]/], // Ethiopic
  ['chr', /[\u13A0-\u13FF]/], // Cherokee
  ['syr', /[\u0700-\u074F]/], // Syriac
];

/**
 * Lists the scripts present in a piece of text as Tesseract language codes.
 *
 * @param text - Text to inspect; may be empty.
 * @returns One code per detected script, in table order. `'eng'` is appended
 *   when the text contains an ASCII letter or when nothing else matched, so
 *   the result is never empty.
 */
export function detectScripts(text: string): string[] {
  const found = SCRIPT_DETECTORS.filter(([, pattern]) =>
    pattern.test(text)
  ).map(([lang]) => lang);

  if (found.length === 0 || /[a-zA-Z]/.test(text)) {
    found.push('eng');
  }

  return found;
}
|
||||
|
||||
/**
 * Inclusive code point ranges and the Tesseract language code that represents
 * the script of each range. Checked top to bottom; the first match wins.
 */
const CHAR_LANGUAGE_RANGES: ReadonlyArray<readonly [number, number, string]> = [
  [0x0000, 0x024f, 'eng'], // Latin (Basic + Supplement + Extended)
  [0x3040, 0x309f, 'jpn'], // Hiragana
  [0x30a0, 0x30ff, 'jpn'], // Katakana
  [0xac00, 0xd7a3, 'kor'], // Hangul Syllables
  [0x1100, 0x11ff, 'kor'], // Hangul Jamo
  [0x4e00, 0x9fff, 'chi_sim'], // CJK Unified Ideographs
  [0x3400, 0x4dbf, 'chi_sim'], // CJK Extension A
  [0x20000, 0x2ffff, 'chi_sim'], // Supplementary Ideographic Plane (Ext B+)
  [0x0600, 0x06ff, 'ara'], // Arabic
  [0x0750, 0x077f, 'ara'], // Arabic Supplement
  [0x08a0, 0x08ff, 'ara'], // Arabic Extended-A
  [0x0900, 0x097f, 'hin'], // Devanagari
  [0x0980, 0x09ff, 'ben'], // Bengali
  [0x0b80, 0x0bff, 'tam'], // Tamil
  [0x0c00, 0x0c7f, 'tel'], // Telugu
  [0x0c80, 0x0cff, 'kan'], // Kannada
  [0x0d00, 0x0d7f, 'mal'], // Malayalam
  [0x0a80, 0x0aff, 'guj'], // Gujarati
  [0x0a00, 0x0a7f, 'pan'], // Punjabi (Gurmukhi)
  [0x0b00, 0x0b7f, 'ori'], // Oriya
  [0x0d80, 0x0dff, 'sin'], // Sinhala
  [0x0e00, 0x0e7f, 'tha'], // Thai
  [0x0e80, 0x0eff, 'lao'], // Lao
  [0x1780, 0x17ff, 'khm'], // Khmer
  [0x1000, 0x109f, 'mya'], // Myanmar
  [0x0f00, 0x0fff, 'bod'], // Tibetan
  [0x10a0, 0x10ff, 'kat'], // Georgian
  [0x0530, 0x058f, 'hye'], // Armenian
  [0x0590, 0x05ff, 'heb'], // Hebrew
  [0x1200, 0x137f, 'amh'], // Ethiopic
  [0x13a0, 0x13ff, 'chr'], // Cherokee
  [0x0700, 0x074f, 'syr'], // Syriac
];

/**
 * Maps a single character to the Tesseract language code that represents its
 * script.
 *
 * @param char - The character to classify. Only its first code point is
 *   examined; an empty string is treated as Latin.
 * @returns The language code for the character's script, or `'eng'` when the
 *   character is Latin or belongs to no listed range.
 */
export function getLanguageForChar(char: string): string {
  // codePointAt (unlike charCodeAt) decodes a surrogate pair, so ideographs
  // outside the Basic Multilingual Plane are seen as a single code point.
  const code = char.codePointAt(0);
  if (code === undefined) {
    return 'eng';
  }

  for (const [first, last, lang] of CHAR_LANGUAGE_RANGES) {
    if (code >= first && code <= last) {
      return lang;
    }
  }

  // Default to English (Latin)
  return 'eng';
}
|
||||
import {
|
||||
getFontAssetFileName,
|
||||
getFontUrlForFamily,
|
||||
languageToFontFamily,
|
||||
} from '../config/font-mappings.js';
|
||||
|
||||
/** In-memory cache of downloaded font binaries, keyed by font family name. */
const fontCache = new Map<string, ArrayBuffer>();

/** Name of the IndexedDB database that persists fonts between sessions. */
const DB_NAME = 'bentopdf-fonts';

/** Schema version passed to `indexedDB.open`. */
const DB_VERSION = 1;

/** Object store inside the font database that holds the font buffers. */
const STORE_NAME = 'fonts';

/** The subset of build-time environment variables the font loader reads. */
type OcrFontEnv = Partial<Pick<ImportMetaEnv, 'VITE_OCR_FONT_BASE_URL'>>;
|
||||
|
||||
/**
 * Supplies the default environment for font URL resolution.
 *
 * @returns `import.meta.env`, viewed through the narrower `OcrFontEnv` type.
 */
function getDefaultFontEnv(): OcrFontEnv {
  const buildEnv: OcrFontEnv = import.meta.env;
  return buildEnv;
}
|
||||
|
||||
/**
 * Cleans up a configured font base URL.
 *
 * @param url - Raw configuration value; may be missing.
 * @returns The value with surrounding whitespace and any trailing slashes
 *   removed, or `undefined` when the value is missing or blank.
 */
function normalizeFontBaseUrl(url?: string): string | undefined {
  const candidate = url?.trim();

  // Covers undefined as well as empty / whitespace-only input.
  return candidate ? candidate.replace(/\/+$/, '') : undefined;
}
|
||||
|
||||
/**
 * Determines the URL a font family should be downloaded from.
 *
 * When `VITE_OCR_FONT_BASE_URL` is set to a non-blank value, the font's asset
 * file name is appended to that base URL. Otherwise the URL comes from
 * `getFontUrlForFamily`.
 *
 * @param fontFamily - Font family name to resolve.
 * @param env - Environment to read the override from; defaults to the build
 *   environment.
 * @returns The URL to fetch the font from.
 */
export function resolveFontUrl(
  fontFamily: string,
  env: OcrFontEnv = getDefaultFontEnv()
): string {
  const overrideBase = normalizeFontBaseUrl(env.VITE_OCR_FONT_BASE_URL);

  return overrideBase
    ? `${overrideBase}/${getFontAssetFileName(fontFamily)}`
    : getFontUrlForFamily(fontFamily);
}
|
||||
|
||||
/**
 * Opens the font database, creating the font object store on first use or
 * on a version upgrade.
 *
 * @returns A promise that resolves with the open database and rejects with
 *   the open request's error.
 */
async function openFontDB(): Promise<IDBDatabase> {
  return new Promise<IDBDatabase>((resolve, reject) => {
    const openRequest = indexedDB.open(DB_NAME, DB_VERSION);

    openRequest.onupgradeneeded = (event) => {
      const database = (event.target as IDBOpenDBRequest).result;
      const storeExists = database.objectStoreNames.contains(STORE_NAME);

      if (storeExists) {
        return;
      }

      database.createObjectStore(STORE_NAME);
    };

    openRequest.onsuccess = () => {
      resolve(openRequest.result);
    };

    openRequest.onerror = () => {
      reject(openRequest.error);
    };
  });
}
|
||||
|
||||
/**
 * Looks up a previously stored font in the IndexedDB font store.
 *
 * This is a best-effort cache read: any failure (the database cannot be
 * opened, the transaction cannot be created, the request errors) is logged
 * with `console.warn` and reported as a cache miss.
 *
 * @param fontFamily - Key the font was stored under.
 * @returns The stored value, or `null` when there is no entry or the read
 *   failed.
 */
async function getCachedFontFromDB(
  fontFamily: string
): Promise<ArrayBuffer | null> {
  let db: IDBDatabase | undefined;

  try {
    db = await openFontDB();
    const connection = db;

    // `await` is required: returning the bare promise would let a rejection
    // skip the catch below and surface in the caller instead of yielding null.
    return await new Promise<ArrayBuffer | null>((resolve, reject) => {
      const transaction = connection.transaction(STORE_NAME, 'readonly');
      const store = transaction.objectStore(STORE_NAME);
      const request = store.get(fontFamily);

      request.onsuccess = () => resolve(request.result || null);
      request.onerror = () => reject(request.error);
    });
  } catch (error) {
    console.warn('IndexedDB read failed:', error);
    return null;
  } finally {
    // Each call opens its own connection, so release it here. close() waits
    // for transactions still running on this connection before closing.
    db?.close();
  }
}
|
||||
|
||||
/**
 * Stores a font buffer in the IndexedDB font store under `fontFamily`.
 *
 * This is a best-effort cache write: failures are logged with `console.warn`
 * and never propagate, so a broken or full cache cannot invalidate a font
 * the caller already holds.
 *
 * @param fontFamily - Key to store the font under.
 * @param fontBuffer - Font binary to persist.
 */
async function saveFontToDB(
  fontFamily: string,
  fontBuffer: ArrayBuffer
): Promise<void> {
  let db: IDBDatabase | undefined;

  try {
    db = await openFontDB();
    const connection = db;

    // `await` is required: returning the bare promise would let a rejection
    // skip the catch below and surface in the caller.
    await new Promise<void>((resolve, reject) => {
      const transaction = connection.transaction(STORE_NAME, 'readwrite');
      const store = transaction.objectStore(STORE_NAME);
      const request = store.put(fontBuffer, fontFamily);

      request.onsuccess = () => resolve();
      request.onerror = () => reject(request.error);
      // An abort that is not tied to a request error would otherwise leave
      // this promise pending forever.
      transaction.onabort = () => reject(transaction.error);
    });
  } catch (error) {
    console.warn('IndexedDB write failed:', error);
  } finally {
    // Each call opens its own connection, so release it here. close() waits
    // for transactions still running on this connection before closing.
    db?.close();
  }
}
|
||||
|
||||
/**
 * Loads the font binary used for a language.
 *
 * Lookup order: in-memory cache, then the IndexedDB cache, then a network
 * fetch of the URL from `resolveFontUrl`. A fetched font is put into both
 * caches before it is returned.
 *
 * When the fetch step fails for any family other than 'Noto Sans', a warning
 * is logged and the font for `'eng'` is loaded instead.
 *
 * @param lang - Language code; unmapped codes use the 'Noto Sans' family.
 * @returns The font file contents.
 * @throws The original error when the fetch step fails for 'Noto Sans'.
 */
export async function getFontForLanguage(lang: string): Promise<ArrayBuffer> {
  const family = languageToFontFamily[lang] || 'Noto Sans';

  const memoryHit = fontCache.get(family);
  if (memoryHit !== undefined) {
    return memoryHit;
  }

  const persisted = await getCachedFontFromDB(family);
  if (persisted) {
    // Promote to the in-memory cache so later calls skip IndexedDB.
    fontCache.set(family, persisted);
    return persisted;
  }

  try {
    const response = await fetch(resolveFontUrl(family));

    if (!response.ok) {
      throw new Error(`Failed to fetch font file: ${response.statusText}`);
    }

    const downloaded = await response.arrayBuffer();

    fontCache.set(family, downloaded);
    await saveFontToDB(family, downloaded);

    return downloaded;
  } catch (error) {
    console.warn(
      `Failed to fetch font for ${lang} (${family}), falling back to default.`,
      error
    );

    // The default family has nothing to fall back to.
    if (family === 'Noto Sans') {
      throw error;
    }

    return await getFontForLanguage('eng');
  }
}
|
||||
|
||||
/**
 * Lists the Tesseract language codes whose scripts occur in `text`.
 *
 * Codes are returned in the order the detectors are checked. `'eng'` is
 * appended when the text contains an ASCII letter, and is the sole result
 * when no other script matches.
 *
 * @param text - Text to inspect.
 * @returns Detected language codes without duplicates; never empty.
 */
export function detectScripts(text: string): string[] {
  // Checked top to bottom; the position here defines the output order.
  const detectors: ReadonlyArray<readonly [RegExp, string]> = [
    [/[\u3040-\u309F\u30A0-\u30FF]/, 'jpn'], // Hiragana & Katakana
    [/[\uAC00-\uD7A3\u1100-\u11FF]/, 'kor'], // Hangul Syllables & Jamo
    [/[\u4E00-\u9FFF\u3400-\u4DBF]/, 'chi_sim'], // CJK Unified & Ext A
    [/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/, 'ara'], // Arabic
    [/[\u0900-\u097F]/, 'hin'], // Devanagari (Hindi, Marathi, etc.)
    [/[\u0980-\u09FF]/, 'ben'], // Bengali
    [/[\u0B80-\u0BFF]/, 'tam'], // Tamil
    [/[\u0C00-\u0C7F]/, 'tel'], // Telugu
    [/[\u0C80-\u0CFF]/, 'kan'], // Kannada
    [/[\u0D00-\u0D7F]/, 'mal'], // Malayalam
    [/[\u0A80-\u0AFF]/, 'guj'], // Gujarati
    [/[\u0A00-\u0A7F]/, 'pan'], // Punjabi (Gurmukhi)
    [/[\u0B00-\u0B7F]/, 'ori'], // Oriya
    [/[\u0D80-\u0DFF]/, 'sin'], // Sinhala
    [/[\u0E00-\u0E7F]/, 'tha'], // Thai
    [/[\u0E80-\u0EFF]/, 'lao'], // Lao
    [/[\u1780-\u17FF]/, 'khm'], // Khmer
    [/[\u1000-\u109F]/, 'mya'], // Myanmar
    [/[\u0F00-\u0FFF]/, 'bod'], // Tibetan
    [/[\u10A0-\u10FF]/, 'kat'], // Georgian
    [/[\u0530-\u058F]/, 'hye'], // Armenian
    [/[\u0590-\u05FF]/, 'heb'], // Hebrew
    [/[\u1200-\u137F]/, 'amh'], // Ethiopic
    [/[\u13A0-\u13FF]/, 'chr'], // Cherokee
    [/[\u0700-\u074F]/, 'syr'], // Syriac
  ];

  const found = new Set<string>();

  for (const [pattern, language] of detectors) {
    if (pattern.test(text)) {
      found.add(language);
    }
  }

  // English is both the Latin-letter result and the fallback.
  const hasAsciiLetter = /[a-zA-Z]/.test(text);
  if (found.size === 0 || hasAsciiLetter) {
    found.add('eng');
  }

  return [...found];
}
|
||||
|
||||
/**
 * Code-unit ranges and the Tesseract language each one maps to, in the order
 * `getLanguageForChar` checks them.
 */
const CHAR_SCRIPT_RANGES: ReadonlyArray<{
  from: number;
  to: number;
  lang: string;
}> = [
  { from: 0x0000, to: 0x024f, lang: 'eng' }, // Latin (Basic + Supplement + Extended)
  { from: 0x3040, to: 0x309f, lang: 'jpn' }, // Hiragana
  { from: 0x30a0, to: 0x30ff, lang: 'jpn' }, // Katakana
  { from: 0xac00, to: 0xd7a3, lang: 'kor' }, // Hangul Syllables
  { from: 0x1100, to: 0x11ff, lang: 'kor' }, // Hangul Jamo
  { from: 0x4e00, to: 0x9fff, lang: 'chi_sim' }, // CJK Unified
  { from: 0x3400, to: 0x4dbf, lang: 'chi_sim' }, // CJK Ext A
  { from: 0x0600, to: 0x06ff, lang: 'ara' }, // Arabic
  { from: 0x0750, to: 0x077f, lang: 'ara' }, // Arabic Supplement
  { from: 0x08a0, to: 0x08ff, lang: 'ara' }, // Arabic Extended-A
  { from: 0x0900, to: 0x097f, lang: 'hin' }, // Devanagari
  { from: 0x0980, to: 0x09ff, lang: 'ben' }, // Bengali
  { from: 0x0b80, to: 0x0bff, lang: 'tam' }, // Tamil
  { from: 0x0c00, to: 0x0c7f, lang: 'tel' }, // Telugu
  { from: 0x0c80, to: 0x0cff, lang: 'kan' }, // Kannada
  { from: 0x0d00, to: 0x0d7f, lang: 'mal' }, // Malayalam
  { from: 0x0a80, to: 0x0aff, lang: 'guj' }, // Gujarati
  { from: 0x0a00, to: 0x0a7f, lang: 'pan' }, // Punjabi (Gurmukhi)
  { from: 0x0b00, to: 0x0b7f, lang: 'ori' }, // Oriya
  { from: 0x0d80, to: 0x0dff, lang: 'sin' }, // Sinhala
  { from: 0x0e00, to: 0x0e7f, lang: 'tha' }, // Thai
  { from: 0x0e80, to: 0x0eff, lang: 'lao' }, // Lao
  { from: 0x1780, to: 0x17ff, lang: 'khm' }, // Khmer
  { from: 0x1000, to: 0x109f, lang: 'mya' }, // Myanmar
  { from: 0x0f00, to: 0x0fff, lang: 'bod' }, // Tibetan
  { from: 0x10a0, to: 0x10ff, lang: 'kat' }, // Georgian
  { from: 0x0530, to: 0x058f, lang: 'hye' }, // Armenian
  { from: 0x0590, to: 0x05ff, lang: 'heb' }, // Hebrew
  { from: 0x1200, to: 0x137f, lang: 'amh' }, // Ethiopic
  { from: 0x13a0, to: 0x13ff, lang: 'chr' }, // Cherokee
  { from: 0x0700, to: 0x074f, lang: 'syr' }, // Syriac
];

/**
 * Picks the Tesseract language code for the script a character belongs to.
 *
 * Classification uses the first UTF-16 code unit of `char` only. A code unit
 * outside every known range, or an empty string, yields `'eng'`.
 *
 * @param char - The character to classify.
 * @returns The matching language code, defaulting to `'eng'`.
 */
export function getLanguageForChar(char: string): string {
  const code = char.charCodeAt(0);
  const hit = CHAR_SCRIPT_RANGES.find(
    (range) => code >= range.from && code <= range.to
  );

  // Default to English (Latin)
  return hit ? hit.lang : 'eng';
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
|
||||
import fontkit from '@pdf-lib/fontkit';
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import { getFontForLanguage } from './font-loader.js';
|
||||
import { OcrPage, OcrLine } from '@/types';
|
||||
import {
|
||||
@@ -10,6 +9,7 @@ import {
|
||||
calculateSpaceTransform,
|
||||
} from './hocr-transform.js';
|
||||
import { getPDFDocument } from './helpers.js';
|
||||
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
|
||||
|
||||
export interface OcrOptions {
|
||||
language: string;
|
||||
@@ -134,11 +134,13 @@ export async function performOcr(
|
||||
const { language, resolution, binarize, whitelist, onProgress } = options;
|
||||
const progress = onProgress || (() => {});
|
||||
|
||||
const worker = await Tesseract.createWorker(language, 1, {
|
||||
logger: function (m: { status: string; progress: number }) {
|
||||
const worker = await createConfiguredTesseractWorker(
|
||||
language,
|
||||
1,
|
||||
function (m: { status: string; progress: number }) {
|
||||
progress(m.status, m.progress || 0);
|
||||
},
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
await worker.setParameters({
|
||||
tessjs_create_hocr: '1',
|
||||
|
||||
132
src/js/utils/tesseract-language-availability.ts
Normal file
132
src/js/utils/tesseract-language-availability.ts
Normal file
@@ -0,0 +1,132 @@
|
||||
import { tesseractLanguages } from '../config/tesseract-languages.js';
|
||||
|
||||
/** Name of the environment variable that lists the OCR languages available in this build. */
export const TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY =
  'VITE_TESSERACT_AVAILABLE_LANGUAGES' as const;

/** The subset of build-time environment variables this module reads. */
type TesseractAvailabilityEnv = Partial<
  Pick<ImportMetaEnv, typeof TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY>
>;

/** Any language code that is a key of the `tesseractLanguages` table. */
export type TesseractLanguageCode = keyof typeof tesseractLanguages;
|
||||
|
||||
/**
 * Supplies the default environment for language-availability checks.
 *
 * @returns `import.meta.env`, viewed through `TesseractAvailabilityEnv`.
 */
function getDefaultEnv(): TesseractAvailabilityEnv {
  const buildEnv: TesseractAvailabilityEnv = import.meta.env;
  return buildEnv;
}
|
||||
|
||||
/**
 * Turns a language specification into a clean list of codes.
 *
 * A string is split on `+` and `,`; an array is used as given. Each entry is
 * trimmed, blank entries are dropped, and duplicates are removed while the
 * order of first occurrence is kept.
 *
 * @param value - e.g. `'eng+deu'`, `'eng, deu'` or `['eng', 'deu']`.
 * @returns The unique, trimmed, non-empty codes.
 */
function normalizeLanguageCodes(value: string | string[]): string[] {
  const parts = Array.isArray(value) ? value : value.split(/[+,]/);

  // A Set iterates in insertion order, which preserves first-occurrence order.
  const unique = new Set<string>();
  for (const part of parts) {
    const code = part.trim();
    if (code !== '') {
      unique.add(code);
    }
  }

  return Array.from(unique);
}
|
||||
|
||||
/**
 * Builds the display text for a language code.
 *
 * @param code - Tesseract language code.
 * @returns `"<label> (<code>)"` when `tesseractLanguages` has a non-empty
 *   label for the code, otherwise the bare code.
 */
function formatLanguageLabel(code: string): string {
  const label = tesseractLanguages[code as TesseractLanguageCode];

  if (!label) {
    return code;
  }

  return `${label} (${code})`;
}
|
||||
|
||||
/**
 * Reads the configured list of available OCR languages.
 *
 * @param env - Environment to read `VITE_TESSERACT_AVAILABLE_LANGUAGES` from;
 *   defaults to the build environment.
 * @returns `null` when the variable is missing or blank, otherwise the
 *   normalized list of codes it contains.
 */
export function resolveConfiguredTesseractAvailableLanguages(
  env: TesseractAvailabilityEnv = getDefaultEnv()
): string[] | null {
  const rawValue = env.VITE_TESSERACT_AVAILABLE_LANGUAGES?.trim();

  return rawValue ? normalizeLanguageCodes(rawValue) : null;
}
|
||||
|
||||
/**
 * Lists the `[code, label]` pairs of the OCR languages that may be offered.
 *
 * @param env - Environment holding the optional availability list; defaults
 *   to the build environment.
 * @returns Every entry of `tesseractLanguages` when no availability list is
 *   configured, otherwise only the entries whose code is on that list, in
 *   `tesseractLanguages` order.
 */
export function getAvailableTesseractLanguageEntries(
  env: TesseractAvailabilityEnv = getDefaultEnv()
): Array<[TesseractLanguageCode, string]> {
  const allowedCodes = resolveConfiguredTesseractAvailableLanguages(env);
  const everyEntry = Object.entries(tesseractLanguages) as Array<
    [TesseractLanguageCode, string]
  >;

  // null means "no restriction configured".
  if (allowedCodes === null) {
    return everyEntry;
  }

  const allowed = new Set(allowedCodes);
  return everyEntry.filter((entry) => allowed.has(entry[0]));
}
|
||||
|
||||
/**
 * Finds the requested OCR languages that are not on the configured
 * availability list.
 *
 * @param requestedLanguages - Codes as an array or as a `+`/`,` separated
 *   string.
 * @param env - Environment holding the optional availability list; defaults
 *   to the build environment.
 * @returns The requested codes missing from the list; always empty when no
 *   availability list is configured.
 */
export function getUnavailableTesseractLanguages(
  requestedLanguages: string | string[],
  env: TesseractAvailabilityEnv = getDefaultEnv()
): string[] {
  const allowedCodes = resolveConfiguredTesseractAvailableLanguages(env);

  // No list configured: every language counts as available.
  if (allowedCodes === null) {
    return [];
  }

  const allowed = new Set(allowedCodes);
  const requested = normalizeLanguageCodes(requestedLanguages);

  return requested.filter((code) => !allowed.has(code));
}
|
||||
|
||||
/**
 * Formats language codes as a comma-separated list for display.
 *
 * @param codes - Tesseract language codes.
 * @returns The display text of each code, joined by `", "`.
 */
export function formatTesseractLanguageList(codes: string[]): string {
  const labels = codes.map((code) => formatLanguageLabel(code));
  return labels.join(', ');
}
|
||||
|
||||
/**
 * Composes the message shown when OCR is requested for languages this build
 * does not provide.
 *
 * @param unavailableLanguages - Requested codes that are not available.
 * @param availableLanguages - Codes that are available.
 * @returns Three sentences separated by single spaces: what is bundled, what
 *   is missing, and how to resolve it.
 */
function buildUnsupportedLanguageMessage(
  unavailableLanguages: string[],
  availableLanguages: string[]
): string {
  const missing = formatTesseractLanguageList(unavailableLanguages);
  const bundled = formatTesseractLanguageList(availableLanguages);

  const bundledSentence = `This BentoPDF build only bundles OCR data for ${bundled}.`;
  const missingSentence = `The requested OCR language is not available: ${missing}.`;
  const remedySentence =
    'Choose one of the bundled languages or rebuild the air-gapped bundle with the missing language added to --ocr-languages.';

  return `${bundledSentence} ${missingSentence} ${remedySentence}`;
}
|
||||
|
||||
/**
 * Error raised when OCR is requested for languages that are not on the
 * configured availability list.
 *
 * Both language lists are kept on the instance so callers can build their
 * own UI from them instead of parsing `message`.
 */
export class UnsupportedOcrLanguageError extends Error {
  /** Requested language codes that are not available. */
  readonly unavailableLanguages: string[];

  /** Language codes that are available. */
  readonly availableLanguages: string[];

  /**
   * @param unavailableLanguages - Requested codes that are not available.
   * @param availableLanguages - Codes that are available.
   */
  constructor(unavailableLanguages: string[], availableLanguages: string[]) {
    const message = buildUnsupportedLanguageMessage(
      unavailableLanguages,
      availableLanguages
    );
    super(message);

    this.name = 'UnsupportedOcrLanguageError';
    this.unavailableLanguages = unavailableLanguages;
    this.availableLanguages = availableLanguages;
  }
}
|
||||
|
||||
/**
 * Verifies that every requested OCR language is on the configured
 * availability list.
 *
 * Does nothing when no availability list is configured.
 *
 * @param requestedLanguages - Codes as an array or as a `+`/`,` separated
 *   string.
 * @param env - Environment holding the optional availability list; defaults
 *   to the build environment.
 * @throws {UnsupportedOcrLanguageError} When at least one requested language
 *   is not on the list.
 */
export function assertTesseractLanguagesAvailable(
  requestedLanguages: string | string[],
  env: TesseractAvailabilityEnv = getDefaultEnv()
): void {
  const bundled = resolveConfiguredTesseractAvailableLanguages(env);
  if (bundled === null) {
    return;
  }

  const missing = getUnavailableTesseractLanguages(requestedLanguages, env);
  if (missing.length === 0) {
    return;
  }

  throw new UnsupportedOcrLanguageError(missing, bundled);
}
|
||||
130
src/js/utils/tesseract-runtime.ts
Normal file
130
src/js/utils/tesseract-runtime.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
import Tesseract from 'tesseract.js';
|
||||
import {
|
||||
assertTesseractLanguagesAvailable,
|
||||
TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
|
||||
} from './tesseract-language-availability.js';
|
||||
|
||||
/** Environment variables that point at self-hosted Tesseract assets. */
const TESSERACT_ENV_KEYS = [
  'VITE_TESSERACT_WORKER_URL',
  'VITE_TESSERACT_CORE_URL',
  'VITE_TESSERACT_LANG_URL',
] as const;

/** The asset variables plus the language-availability variable. */
const TESSERACT_RUNTIME_ENV_KEYS = [
  ...TESSERACT_ENV_KEYS,
  TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
] as const;

/** Union of the variable names listed in `TESSERACT_RUNTIME_ENV_KEYS`. */
type TesseractRuntimeEnvKey = (typeof TESSERACT_RUNTIME_ENV_KEYS)[number];

/** The subset of build-time environment variables this module reads. */
export type TesseractAssetEnv = Partial<
  Pick<ImportMetaEnv, TesseractRuntimeEnvKey>
>;

/** Normalized asset locations, one field per asset environment variable. */
export interface TesseractAssetConfig {
  /** From `VITE_TESSERACT_WORKER_URL`. */
  workerPath?: string;
  /** From `VITE_TESSERACT_CORE_URL`. */
  corePath?: string;
  /** From `VITE_TESSERACT_LANG_URL`. */
  langPath?: string;
}

/** Re-exported Tesseract.js types so callers need not import the library. */
export type TesseractLoggerMessage = Tesseract.LoggerMessage;
export type TesseractWorkerOptions = Partial<Tesseract.WorkerOptions>;
export type TesseractWorker = Tesseract.Worker;
|
||||
|
||||
/**
 * Supplies the default environment for Tesseract asset resolution.
 *
 * @returns `import.meta.env`, viewed through `TesseractAssetEnv`.
 */
function getDefaultTesseractAssetEnv(): TesseractAssetEnv {
  const buildEnv: TesseractAssetEnv = import.meta.env;
  return buildEnv;
}
|
||||
|
||||
/**
 * Cleans up a configured directory URL.
 *
 * @param url - Raw configuration value; may be missing.
 * @returns The value with surrounding whitespace and any trailing slashes
 *   removed, or `undefined` when the value is missing or blank.
 */
function normalizeDirectoryUrl(url?: string): string | undefined {
  const candidate = url?.trim();

  if (candidate) {
    return candidate.replace(/\/+$/, '');
  }

  return undefined;
}
|
||||
|
||||
/**
 * Cleans up a configured file URL.
 *
 * @param url - Raw configuration value; may be missing.
 * @returns The value with surrounding whitespace and any trailing slashes
 *   removed, or `undefined` when the value is missing or blank.
 */
function normalizeFileUrl(url?: string): string | undefined {
  const candidate = url?.trim();

  return candidate ? candidate.replace(/\/+$/, '') : undefined;
}
|
||||
|
||||
/**
 * Reads the self-hosted Tesseract asset locations from the environment.
 *
 * @param env - Environment to read from; defaults to the build environment.
 * @returns The normalized worker, core and language paths. A field is
 *   `undefined` when its variable is missing or blank.
 */
export function resolveTesseractAssetConfig(
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): TesseractAssetConfig {
  const workerPath = normalizeFileUrl(env.VITE_TESSERACT_WORKER_URL);
  const corePath = normalizeDirectoryUrl(env.VITE_TESSERACT_CORE_URL);
  const langPath = normalizeDirectoryUrl(env.VITE_TESSERACT_LANG_URL);

  return { workerPath, corePath, langPath };
}
|
||||
|
||||
/**
 * Reports whether at least one self-hosted asset path is set.
 *
 * @param config - Asset configuration; defaults to the one resolved from the
 *   build environment.
 * @returns `true` when any of the worker, core or language paths is non-empty.
 */
export function hasConfiguredTesseractOverrides(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): boolean {
  const { workerPath, corePath, langPath } = config;
  return [workerPath, corePath, langPath].some(Boolean);
}
|
||||
|
||||
/**
 * Reports whether all three self-hosted asset paths are set.
 *
 * @param config - Asset configuration; defaults to the one resolved from the
 *   build environment.
 * @returns `true` only when the worker, core and language paths are all
 *   non-empty.
 */
export function hasCompleteTesseractOverrides(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): boolean {
  const { workerPath, corePath, langPath } = config;
  return [workerPath, corePath, langPath].every(Boolean);
}
|
||||
|
||||
/**
 * Names the asset environment variables that are still unset in a partially
 * configured setup.
 *
 * @param config - Asset configuration; defaults to the one resolved from the
 *   build environment.
 * @returns An empty array when no asset path is set at all; otherwise the
 *   variable names whose path is empty, in `TESSERACT_ENV_KEYS` order.
 */
export function getIncompleteTesseractOverrideKeys(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): Array<(typeof TESSERACT_ENV_KEYS)[number]> {
  // Nothing configured means "use the defaults", not "incomplete".
  if (!hasConfiguredTesseractOverrides(config)) {
    return [];
  }

  const isMissing: Record<(typeof TESSERACT_ENV_KEYS)[number], boolean> = {
    VITE_TESSERACT_WORKER_URL: !config.workerPath,
    VITE_TESSERACT_CORE_URL: !config.corePath,
    VITE_TESSERACT_LANG_URL: !config.langPath,
  };

  return TESSERACT_ENV_KEYS.filter((key) => isMissing[key]);
}
|
||||
|
||||
/**
 * Builds the options object passed to `Tesseract.createWorker`.
 *
 * With no asset overrides in the environment, only the logger (if any) is
 * returned. With all three overrides set, the worker, core and language
 * paths are added together with `gzip: true`.
 *
 * @param logger - Optional progress logger to forward to the worker.
 * @param env - Environment to read asset overrides from; defaults to the
 *   build environment.
 * @returns The worker options.
 * @throws {Error} When only some of the asset overrides are set; the message
 *   names the missing variables.
 */
export function buildTesseractWorkerOptions(
  logger?: TesseractWorkerOptions['logger'],
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): TesseractWorkerOptions {
  const assets = resolveTesseractAssetConfig(env);
  const loggerOption: TesseractWorkerOptions = logger ? { logger } : {};

  if (!hasConfiguredTesseractOverrides(assets)) {
    return loggerOption;
  }

  if (hasCompleteTesseractOverrides(assets)) {
    return {
      ...loggerOption,
      workerPath: assets.workerPath,
      corePath: assets.corePath,
      langPath: assets.langPath,
      gzip: true,
    };
  }

  // Some, but not all, asset URLs are set: refuse to guess the rest.
  const missing = getIncompleteTesseractOverrideKeys(assets).join(', ');
  throw new Error(
    `Self-hosted OCR assets are partially configured. Set ${missing} together with the other Tesseract asset URLs.`
  );
}
|
||||
|
||||
/**
 * Creates a Tesseract worker after checking language availability and
 * applying any asset overrides from the environment.
 *
 * Because the function is `async`, a failed availability check or an invalid
 * asset configuration surfaces as a rejected promise.
 *
 * @param language - Language specification forwarded to Tesseract.
 * @param oem - OCR engine mode forwarded to Tesseract.
 * @param logger - Optional progress logger.
 * @param env - Environment to read configuration from; defaults to the build
 *   environment.
 * @returns The worker created by `Tesseract.createWorker`.
 */
export async function createConfiguredTesseractWorker(
  language: string,
  oem: Tesseract.OEM,
  logger?: TesseractWorkerOptions['logger'],
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): Promise<TesseractWorker> {
  // Fail before any worker is started.
  assertTesseractLanguagesAvailable(language, env);

  const workerOptions = buildTesseractWorkerOptions(logger, env);
  return Tesseract.createWorker(language, oem, workerOptions);
}
|
||||
Reference in New Issue
Block a user