From 77da6d7a7d4272be265f5d2983010c88a80a4f56 Mon Sep 17 00:00:00 2001
From: alam00000
Date: Sat, 14 Mar 2026 15:50:30 +0530
Subject: [PATCH] feat: integrate Tesseract.js with improved language
availability and font handling
- Refactored OCR page recognition to utilize a configured Tesseract worker.
- Added functions to manage font URLs and asset filenames based on language.
- Implemented language availability checks and error handling for unsupported languages.
- Enhanced PDF workflow to display available OCR languages and handle user selections.
- Introduced utility functions for resolving Tesseract asset configurations.
- Added tests for OCR functionality, font loading, and Tesseract runtime behavior.
- Updated global types to include environment variables for Tesseract and font configurations.
---
.env.example | 9 +
Dockerfile | 12 +
Dockerfile.nonroot | 11 +
README.md | 86 ++-
docs/getting-started.md | 3 +
docs/index.md | 8 +-
docs/self-hosting/docker.md | 63 +-
docs/self-hosting/index.md | 86 ++-
scripts/prepare-airgap.sh | 342 +++++++++-
src/js/compare/engine/ocr-page.ts | 38 +-
src/js/config/font-mappings.ts | 422 ++++++------
src/js/logic/ocr-pdf-page.ts | 61 +-
src/js/logic/pdf-workflow-page.ts | 4 +-
src/js/utils/font-loader.ts | 611 ++++++++++--------
src/js/utils/ocr.ts | 12 +-
.../utils/tesseract-language-availability.ts | 132 ++++
src/js/utils/tesseract-runtime.ts | 130 ++++
src/pages/ocr-pdf.html | 4 +
src/tests/compare/ocr-page.test.ts | 81 +++
src/tests/font-loader.test.ts | 28 +
src/tests/ocr.test.ts | 185 ++++++
src/tests/tesseract-runtime.test.ts | 128 ++++
src/types/globals.d.ts | 14 +
23 files changed, 1906 insertions(+), 564 deletions(-)
create mode 100644 src/js/utils/tesseract-language-availability.ts
create mode 100644 src/js/utils/tesseract-runtime.ts
create mode 100644 src/tests/compare/ocr-page.test.ts
create mode 100644 src/tests/font-loader.test.ts
create mode 100644 src/tests/ocr.test.ts
create mode 100644 src/tests/tesseract-runtime.test.ts
diff --git a/.env.example b/.env.example
index 3b1655d..fee0b32 100644
--- a/.env.example
+++ b/.env.example
@@ -12,6 +12,15 @@ VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.1
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
+# OCR assets (optional)
+# Set the worker, core, and lang URLs together for self-hosted or air-gapped OCR.
+# Leave empty to use Tesseract.js runtime defaults.
+VITE_TESSERACT_WORKER_URL=
+VITE_TESSERACT_CORE_URL=
+VITE_TESSERACT_LANG_URL=
+VITE_TESSERACT_AVAILABLE_LANGUAGES=
+VITE_OCR_FONT_BASE_URL=
+
# Default UI language (build-time)
# Supported: en, ar, be, fr, de, es, zh, zh-TW, vi, tr, id, it, pt, nl, da
VITE_DEFAULT_LANGUAGE=
diff --git a/Dockerfile b/Dockerfile
index 12520e5..1e962ad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,6 +35,18 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
+# OCR asset URLs (optional, used for self-hosted or air-gapped OCR)
+ARG VITE_TESSERACT_WORKER_URL
+ARG VITE_TESSERACT_CORE_URL
+ARG VITE_TESSERACT_LANG_URL
+ARG VITE_TESSERACT_AVAILABLE_LANGUAGES
+ARG VITE_OCR_FONT_BASE_URL
+ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL
+ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL
+ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL
+ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES
+ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL
+
# Default UI language (e.g. en, fr, de, es, zh, ar)
ARG VITE_DEFAULT_LANGUAGE
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE
diff --git a/Dockerfile.nonroot b/Dockerfile.nonroot
index dc1e1a8..0599daf 100644
--- a/Dockerfile.nonroot
+++ b/Dockerfile.nonroot
@@ -32,6 +32,17 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
+ARG VITE_TESSERACT_WORKER_URL
+ARG VITE_TESSERACT_CORE_URL
+ARG VITE_TESSERACT_LANG_URL
+ARG VITE_TESSERACT_AVAILABLE_LANGUAGES
+ARG VITE_OCR_FONT_BASE_URL
+ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL
+ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL
+ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL
+ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES
+ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL
+
# Default UI language (e.g. en, fr, de, es, zh, ar)
ARG VITE_DEFAULT_LANGUAGE
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE
diff --git a/README.md b/README.md
index 0928f93..d7711b4 100644
--- a/README.md
+++ b/README.md
@@ -465,6 +465,11 @@ The default URLs are set in `.env.production`:
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
+VITE_TESSERACT_WORKER_URL=
+VITE_TESSERACT_CORE_URL=
+VITE_TESSERACT_LANG_URL=
+VITE_TESSERACT_AVAILABLE_LANGUAGES=
+VITE_OCR_FONT_BASE_URL=
```
To override via Docker build args:
@@ -474,11 +479,18 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
+ --build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \
+ --build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \
+ --build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \
+ --build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
+ --build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \
-t bentopdf .
```
To disable a module (require manual user config via Advanced Settings), set its variable to an empty string.
+For OCR, either leave all `VITE_TESSERACT_*` variables empty and use the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If your self-hosted bundle only includes a subset such as `eng,deu`, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu` so the UI only shows bundled languages and OCR fails with a descriptive message for unsupported ones. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR text-layer fonts.
+
Users can also override these defaults per-browser via **Advanced Settings** in the UI — user overrides take priority over the environment defaults.
> [!IMPORTANT]
@@ -496,6 +508,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa
git clone https://github.com/alam00000/bentopdf.git
cd bentopdf
+# Show supported OCR language codes (for --ocr-languages)
+bash scripts/prepare-airgap.sh --list-ocr-languages
+
+# Search OCR language codes by name or abbreviation
+bash scripts/prepare-airgap.sh --search-ocr-language german
+
# Interactive mode — prompts for all options
bash scripts/prepare-airgap.sh
@@ -508,7 +526,9 @@ This produces a bundle directory containing:
```
bentopdf-airgap-bundle/
bentopdf.tar # Docker image
- *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF)
+ *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract)
+ tesseract-langdata/ # OCR traineddata files
+ ocr-fonts/ # OCR text-layer font files
setup.sh # Setup script for the air-gapped side
README.md # Instructions
```
@@ -525,23 +545,28 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
Script options
-| Flag | Description | Default |
-| ----------------------- | ------------------------------------------------ | --------------------------------- |
-| `--wasm-base-url ` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
-| `--image-name ` | Docker image tag | `bentopdf` |
-| `--output-dir ` | Output bundle directory | `./bentopdf-airgap-bundle` |
-| `--simple-mode` | Enable Simple Mode | off |
-| `--base-url ` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
-| `--language ` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
-| `--brand-name ` | Custom brand name | _(none)_ |
-| `--brand-logo ` | Logo path relative to `public/` | _(none)_ |
-| `--footer-text ` | Custom footer text | _(none)_ |
-| `--dockerfile ` | Dockerfile to use | `Dockerfile` |
-| `--skip-docker` | Skip Docker build and export | off |
-| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
+| Flag | Description | Default |
+| ------------------------------ | ------------------------------------------------ | --------------------------------- |
+| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
+| `--image-name <name>` | Docker image tag | `bentopdf` |
+| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
+| `--simple-mode` | Enable Simple Mode | off |
+| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
+| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
+| `--brand-name <name>` | Custom brand name | _(none)_ |
+| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
+| `--footer-text <text>` | Custom footer text | _(none)_ |
+| `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` |
+| `--list-ocr-languages` | Print supported OCR codes and names, then exit | off |
+| `--search-ocr-language <term>` | Search OCR codes by name or abbreviation | off |
+| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
+| `--skip-docker` | Skip Docker build and export | off |
+| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
+The interactive prompt also accepts `list` to print the full list of supported Tesseract language codes and `search <term>` to find matches, for example `search german` or `search chi`.
+
> [!IMPORTANT]
> WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
@@ -550,12 +575,18 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
If you prefer to do it manually without the script
-**Step 1: Download the WASM packages** (on a machine with internet)
+**Step 1: Download the WASM and OCR packages** (on a machine with internet)
```bash
npm pack @bentopdf/pymupdf-wasm@0.11.16
npm pack @bentopdf/gs-wasm
npm pack coherentpdf
+npm pack tesseract.js@7.0.0
+npm pack tesseract.js-core@7.0.0
+mkdir -p tesseract-langdata
+curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
+mkdir -p ocr-fonts
+curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
```
**Step 2: Build the Docker image with internal URLs**
@@ -568,6 +599,10 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
+ --build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
+ --build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
+ --build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
+ --build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
-t bentopdf .
```
@@ -585,6 +620,10 @@ Copy these files via USB drive, internal artifact repository, or approved transf
- `bentopdf-pymupdf-wasm-0.11.14.tgz` — PyMuPDF WASM package
- `bentopdf-gs-wasm-*.tgz` — Ghostscript WASM package
- `coherentpdf-*.tgz` — CoherentPDF WASM package
+- `tesseract.js-7.0.0.tgz` — Tesseract worker package
+- `tesseract.js-core-7.0.0.tgz` — Tesseract core runtime package
+- `tesseract-langdata/` — OCR traineddata files
+- `ocr-fonts/` — OCR text-layer font files
**Step 5: Set up inside the air-gapped network**
@@ -593,16 +632,23 @@ Copy these files via USB drive, internal artifact repository, or approved transf
docker load -i bentopdf.tar
# Extract the WASM packages
-mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf
+mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
+TEMP_TESS=$(mktemp -d)
+tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
+cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js
+rm -rf "$TEMP_TESS"
+tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1
+cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
+cp ./ocr-fonts/* ./wasm/ocr/fonts/
# Run BentoPDF
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
```
-Make sure the WASM files are accessible at the URLs you configured in Step 2.
+Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
@@ -613,6 +659,10 @@ Make sure the WASM files are accessible at the URLs you configured in Step 2.
> VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
> VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
> VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
+> VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js
+> VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core
+> VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data
+> VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts
> ```
**Subdirectory Hosting:**
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 1ac33fc..7d3a257 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -34,6 +34,9 @@ docker compose up -d
Then open `http://localhost:3000` in your browser.
+> [!NOTE]
+> If you are preparing an air-gapped OCR deployment, you must host the OCR text-layer fonts internally in addition to the Tesseract worker, core runtime, and traineddata files. The full setup is documented in [Self-Hosting](/self-hosting/), including `VITE_OCR_FONT_BASE_URL` and the bundled `ocr-fonts/` directory.
+
### Option 3: Build from Source
```bash
diff --git a/docs/index.md b/docs/index.md
index 5ba49b4..c3630e1 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -32,5 +32,11 @@ features:
details: Convert, edit, merge, split, compress, sign, OCR, and more. Everything you need in one place.
- icon: 🌐
title: Self-Hostable
- details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or any static hosting.
+ details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or fully air-gapped environments with self-hosted OCR workers, language data, and text-layer fonts.
---
+
+## Offline OCR
+
+If you self-host BentoPDF in an air-gapped or offline environment, OCR needs more than the Tesseract worker and traineddata files. Searchable PDF output also needs the OCR text-layer fonts to be served internally.
+
+See [Self-Hosting](/self-hosting/) for the full setup, including `VITE_OCR_FONT_BASE_URL`, the bundled `ocr-fonts/` directory, and the updated air-gap workflow.
diff --git a/docs/self-hosting/docker.md b/docs/self-hosting/docker.md
index 97123d7..e1c58d4 100644
--- a/docs/self-hosting/docker.md
+++ b/docs/self-hosting/docker.md
@@ -90,20 +90,27 @@ docker run -d -p 3000:8080 bentopdf:custom
## Environment Variables
-| Variable | Description | Default |
-| ----------------------- | ------------------------------- | -------------------------------------------------------------- |
-| `SIMPLE_MODE` | Build without LibreOffice tools | `false` |
-| `BASE_URL` | Deploy to subdirectory | `/` |
-| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` |
-| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` |
-| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` |
-| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` |
-| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` |
-| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` |
-| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` |
+| Variable | Description | Default |
+| ------------------------------------ | ------------------------------------------- | -------------------------------------------------------------- |
+| `SIMPLE_MODE` | Build without LibreOffice tools | `false` |
+| `BASE_URL` | Deploy to subdirectory | `/` |
+| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` |
+| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` |
+| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` |
+| `VITE_TESSERACT_WORKER_URL` | OCR worker script URL | _(empty; use Tesseract.js default CDN)_ |
+| `VITE_TESSERACT_CORE_URL` | OCR core runtime directory | _(empty; use Tesseract.js default CDN)_ |
+| `VITE_TESSERACT_LANG_URL` | OCR traineddata directory | _(empty; use Tesseract.js default CDN)_ |
+| `VITE_TESSERACT_AVAILABLE_LANGUAGES` | Comma-separated OCR languages exposed in UI | _(empty; show full catalog)_ |
+| `VITE_OCR_FONT_BASE_URL` | OCR text-layer font directory | _(empty; use remote Noto font URLs)_ |
+| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` |
+| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` |
+| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` |
+| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` |
WASM module URLs are pre-configured with CDN defaults — all advanced features work out of the box. Override these for air-gapped or self-hosted deployments.
+For OCR, leave `VITE_TESSERACT_WORKER_URL`, `VITE_TESSERACT_CORE_URL`, and `VITE_TESSERACT_LANG_URL` empty to use the default online assets, or set all three together for self-hosted/offline OCR. Partial OCR overrides are rejected because the worker, core runtime, and traineddata directory must match. For fully offline searchable PDF output, also set `VITE_OCR_FONT_BASE_URL` so the OCR text-layer fonts are loaded from your internal server instead of the public Noto font URLs.
+
`VITE_DEFAULT_LANGUAGE` sets the UI language for first-time visitors. Supported values: `en`, `ar`, `be`, `fr`, `de`, `es`, `zh`, `zh-TW`, `vi`, `tr`, `id`, `it`, `pt`, `nl`, `da`. Users can still switch languages — this only changes the default.
Example:
@@ -137,35 +144,59 @@ Branding works in both full mode and Simple Mode, and can be combined with all o
```bash
# 1. On a machine WITH internet — download WASM packages
+bash scripts/prepare-airgap.sh --list-ocr-languages
+bash scripts/prepare-airgap.sh --search-ocr-language german
+
+# 2. Download WASM/OCR packages
npm pack @bentopdf/pymupdf-wasm@0.11.14
npm pack @bentopdf/gs-wasm
npm pack coherentpdf
+npm pack tesseract.js@7.0.0
+npm pack tesseract.js-core@7.0.0
+mkdir -p tesseract-langdata
+curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
+mkdir -p ocr-fonts
+curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
-# 2. Build the image with your internal server URLs
+# 3. Build the image with your internal server URLs
docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
+ --build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
+ --build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
+ --build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
+ --build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
+ --build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
-t bentopdf .
-# 3. Export the image
+# 4. Export the image
docker save bentopdf -o bentopdf.tar
-# 4. Transfer bentopdf.tar + the .tgz WASM packages into the air-gapped network
+# 5. Transfer bentopdf.tar + the .tgz packages + tesseract-langdata/ + ocr-fonts/ into the air-gapped network
-# 5. Inside the air-gapped network — load and run
+# 6. Inside the air-gapped network — load and run
docker load -i bentopdf.tar
# Extract WASM packages to your internal web server
-mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf
+mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf /var/www/wasm/ocr/core /var/www/wasm/ocr/lang-data /var/www/wasm/ocr/fonts
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C /var/www/wasm/pymupdf --strip-components=1
tar xzf bentopdf-gs-wasm-*.tgz -C /var/www/wasm/gs --strip-components=1
tar xzf coherentpdf-*.tgz -C /var/www/wasm/cpdf --strip-components=1
+TEMP_TESS=$(mktemp -d)
+tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
+cp "$TEMP_TESS/package/dist/worker.min.js" /var/www/wasm/ocr/worker.min.js
+rm -rf "$TEMP_TESS"
+tar xzf tesseract.js-core-7.0.0.tgz -C /var/www/wasm/ocr/core --strip-components=1
+cp ./tesseract-langdata/*.traineddata.gz /var/www/wasm/ocr/lang-data/
+cp ./ocr-fonts/* /var/www/wasm/ocr/fonts/
# Run BentoPDF
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
```
+For `--ocr-languages`, use the codes printed by `bash scripts/prepare-airgap.sh --list-ocr-languages`, or search by name with `bash scripts/prepare-airgap.sh --search-ocr-language <term>`. When you build with a restricted OCR subset, pass the same codes to `VITE_TESSERACT_AVAILABLE_LANGUAGES` so the app only shows bundled languages. For full offline OCR output, also host the bundled `ocr-fonts/` directory and point `VITE_OCR_FONT_BASE_URL` at it.
+
Set a variable to empty string to disable that module (users must configure manually via Advanced Settings).
## Custom User ID (PUID/PGID)
diff --git a/docs/self-hosting/index.md b/docs/self-hosting/index.md
index 4149905..3a79e1f 100644
--- a/docs/self-hosting/index.md
+++ b/docs/self-hosting/index.md
@@ -175,6 +175,11 @@ These are set in `.env.production` and baked into the build:
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
+VITE_TESSERACT_WORKER_URL=
+VITE_TESSERACT_CORE_URL=
+VITE_TESSERACT_LANG_URL=
+VITE_TESSERACT_AVAILABLE_LANGUAGES=
+VITE_OCR_FONT_BASE_URL=
```
### Overriding WASM URLs
@@ -187,6 +192,11 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
+ --build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \
+ --build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \
+ --build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \
+ --build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
+ --build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \
-t bentopdf .
# Or via .env.production before building from source
@@ -195,6 +205,8 @@ VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ npm run build
To disable a module entirely (require manual user config via Advanced Settings), set its variable to an empty string.
+For OCR, either leave all `VITE_TESSERACT_*` variables empty and keep the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If you bundle only specific OCR languages, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES` to the same comma-separated codes so the UI only offers installed languages and unsupported selections fail with a descriptive error. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR fonts.
+
Users can also override these defaults at any time via **Advanced Settings** in the UI — user overrides stored in the browser take priority over environment defaults.
### Air-Gapped / Offline Deployment
@@ -209,6 +221,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa
git clone https://github.com/alam00000/bentopdf.git
cd bentopdf
+# Show supported OCR language codes (for --ocr-languages)
+bash scripts/prepare-airgap.sh --list-ocr-languages
+
+# Search OCR language codes by name or abbreviation
+bash scripts/prepare-airgap.sh --search-ocr-language german
+
# Interactive mode — prompts for all options
bash scripts/prepare-airgap.sh
@@ -221,7 +239,9 @@ This produces a bundle directory:
```
bentopdf-airgap-bundle/
bentopdf.tar # Docker image
- *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF)
+ *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract)
+ tesseract-langdata/ # OCR traineddata files
+ ocr-fonts/ # OCR text-layer font files
setup.sh # Setup script for the air-gapped side
README.md # Instructions
```
@@ -237,20 +257,25 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
**Script options:**
-| Flag | Description | Default |
-| ----------------------- | ------------------------------------------------ | --------------------------------- |
-| `--wasm-base-url ` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
-| `--image-name ` | Docker image tag | `bentopdf` |
-| `--output-dir ` | Output bundle directory | `./bentopdf-airgap-bundle` |
-| `--simple-mode` | Enable Simple Mode | off |
-| `--base-url ` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
-| `--language ` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
-| `--brand-name ` | Custom brand name | _(none)_ |
-| `--brand-logo ` | Logo path relative to `public/` | _(none)_ |
-| `--footer-text ` | Custom footer text | _(none)_ |
-| `--dockerfile ` | Dockerfile to use | `Dockerfile` |
-| `--skip-docker` | Skip Docker build and export | off |
-| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
+| Flag | Description | Default |
+| ------------------------------ | ------------------------------------------------ | --------------------------------- |
+| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
+| `--image-name <name>` | Docker image tag | `bentopdf` |
+| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
+| `--simple-mode` | Enable Simple Mode | off |
+| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
+| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
+| `--brand-name <name>` | Custom brand name | _(none)_ |
+| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
+| `--footer-text <text>` | Custom footer text | _(none)_ |
+| `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` |
+| `--list-ocr-languages` | Print supported OCR codes and names, then exit | off |
+| `--search-ocr-language <term>` | Search OCR codes by name or abbreviation | off |
+| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
+| `--skip-docker` | Skip Docker build and export | off |
+| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
+
+The interactive prompt also accepts `list` to print the full list of supported Tesseract language codes and `search <term>` to find matches, for example `search german` or `search chi`.
::: warning Same-Origin Requirement
WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
@@ -261,12 +286,18 @@ WASM files must be served from the **same origin** as the BentoPDF app. Web Work
If you prefer to do it manually without the script
-**Step 1: Download the WASM packages** (on a machine with internet)
+**Step 1: Download the WASM and OCR packages** (on a machine with internet)
```bash
npm pack @bentopdf/pymupdf-wasm@0.11.14
npm pack @bentopdf/gs-wasm
npm pack coherentpdf
+npm pack tesseract.js@7.0.0
+npm pack tesseract.js-core@7.0.0
+mkdir -p tesseract-langdata
+curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
+mkdir -p ocr-fonts
+curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
```
**Step 2: Build the Docker image with internal URLs**
@@ -279,6 +310,10 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
+ --build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
+ --build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
+ --build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
+ --build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
-t bentopdf .
```
@@ -293,7 +328,9 @@ docker save bentopdf -o bentopdf.tar
Copy via USB, internal artifact repo, or approved transfer method:
- `bentopdf.tar` — the Docker image
-- The three `.tgz` WASM packages from Step 1
+- The five `.tgz` WASM/OCR packages from Step 1
+- The `tesseract-langdata/` directory from Step 1
+- The `ocr-fonts/` directory from Step 1
**Step 5: Set up inside the air-gapped network**
@@ -302,16 +339,23 @@ Copy via USB, internal artifact repo, or approved transfer method:
docker load -i bentopdf.tar
# Extract WASM packages
-mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf
+mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
+TEMP_TESS=$(mktemp -d)
+tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
+cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js
+rm -rf "$TEMP_TESS"
+tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1
+cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
+cp ./ocr-fonts/* ./wasm/ocr/fonts/
# Run BentoPDF
docker run -d -p 3000:8080 --restart unless-stopped bentopdf
```
-Make sure the WASM files are accessible at the URLs you configured in Step 2.
+Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
@@ -322,6 +366,10 @@ Set the variables in `.env.production` before running `npm run build`:
VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
+VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js
+VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core
+VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data
+VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts
```
:::
diff --git a/scripts/prepare-airgap.sh b/scripts/prepare-airgap.sh
index b87d54c..2bae28e 100755
--- a/scripts/prepare-airgap.sh
+++ b/scripts/prepare-airgap.sh
@@ -13,6 +13,8 @@ set -euo pipefail
# Usage:
# bash scripts/prepare-airgap.sh --wasm-base-url https://internal.example.com/wasm
# bash scripts/prepare-airgap.sh # interactive mode
+# bash scripts/prepare-airgap.sh --ocr-languages eng,deu,fra
+# bash scripts/prepare-airgap.sh --search-ocr-language german
#
# See --help for all options.
# ============================================================
@@ -54,6 +56,110 @@ DOCKERFILE="Dockerfile"
SKIP_DOCKER=false
SKIP_WASM=false
INTERACTIVE=false
+OCR_LANGUAGES="eng"
+TESSDATA_VERSION="4.0.0_best_int"
+LIST_OCR_LANGUAGES=false
+SEARCH_OCR_LANGUAGE_TERM=""
+
+TESSERACT_LANGUAGE_CONFIG="src/js/config/tesseract-languages.ts"
+FONT_MAPPING_CONFIG="src/js/config/font-mappings.ts"
+
+SUPPORTED_OCR_LANGUAGES_RAW=""
+OCR_FONT_MANIFEST_RAW=""
+ # Populate SUPPORTED_OCR_LANGUAGES_RAW (at most once) with "code<TAB>name" lines scraped from TESSERACT_LANGUAGE_CONFIG.
+load_supported_ocr_languages() {
+ if [ -n "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then
+ return
+ fi
+ # The list is parsed out of the config file itself, so a missing file is fatal.
+ if [ ! -f "$TESSERACT_LANGUAGE_CONFIG" ]; then
+ error "Missing OCR language config: ${TESSERACT_LANGUAGE_CONFIG}"
+ exit 1
+ fi
+ # Node regex-scans the file for lines shaped like  key: 'value'  and prints one "key<TAB>value" pair per line.
+ SUPPORTED_OCR_LANGUAGES_RAW=$(node -e "const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const languages = []; const pattern = /^\\s*([a-z0-9_]+):\\s*'([^']+)'/gm; let match; while ((match = pattern.exec(source)) !== null) { languages.push(match[1] + '\\t' + match[2]); } process.stdout.write(languages.join('\\n'));" "$TESSERACT_LANGUAGE_CONFIG")
+ # An empty result means the pattern matched nothing; abort instead of continuing with no languages.
+ if [ -z "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then
+ error "Failed to load supported OCR languages from ${TESSERACT_LANGUAGE_CONFIG}"
+ exit 1
+ fi
+}
+ # Return 0 only when $1 exactly equals a code in the first tab-separated column of the supported list; return 1 otherwise.
+is_supported_ocr_language() {
+ local code="$1"
+ load_supported_ocr_languages
+ printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' -v code="$code" '$1 == code { found = 1 } END { exit found ? 0 : 1 }'
+}
+ # Print every supported OCR language as a two-column "code  name" table, followed by a usage example.
+show_supported_ocr_languages() {
+ load_supported_ocr_languages
+ # Header, then one formatted row per "code<TAB>name" line of the loaded list.
+ echo ""
+ echo -e "${BOLD}Supported OCR languages:${NC}"
+ echo " Use the code in the left column for --ocr-languages."
+ echo ""
+ printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }'
+ echo ""
+ echo " Example: --ocr-languages eng,deu,fra,spa"
+ echo ""
+}
+ # Print supported languages whose code or name contains $1 (case-insensitive substring). Returns 1 when nothing matches; exits 1 on an empty query.
+show_matching_ocr_languages() {
+ local query="$1"
+ load_supported_ocr_languages
+ # An empty query is rejected outright: this exits the whole script, not just the function.
+ if [ -z "$query" ]; then
+ error "OCR language search requires a non-empty query."
+ exit 1
+ fi
+ # awk lowercases the query, the code and the name, and keeps rows where index() finds the query in either column.
+ local matches
+ matches=$(printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' -v query="$query" '
+ BEGIN {
+ normalized = tolower(query)
+ }
+ {
+ code = tolower($1)
+ name = tolower($2)
+ if (index(code, normalized) || index(name, normalized)) {
+ printf "%s\t%s\n", $1, $2
+ }
+ }
+ ')
+ # The heading is printed whether or not anything matched.
+ echo ""
+ echo -e "${BOLD}OCR language search:${NC} ${query}"
+ # No matches: print a hint and return non-zero so callers can react.
+ if [ -z "$matches" ]; then
+ echo " No supported OCR languages matched that query."
+ echo " Tip: run --list-ocr-languages to browse the full list."
+ echo ""
+ return 1
+ fi
+ # Matches are printed in the same "code  name" column layout as the full listing.
+ echo " Matching codes for --ocr-languages:"
+ echo ""
+ printf '%s\n' "$matches" | awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }'
+ echo ""
+}
+ # Populate OCR_FONT_MANIFEST_RAW (at most once) with "family<TAB>url<TAB>filename" lines for the fonts needed by $OCR_LANGUAGES.
+load_required_ocr_fonts() {
+ if [ -n "$OCR_FONT_MANIFEST_RAW" ]; then
+ return
+ fi
+ # Both lookup tables are parsed out of the font mapping source file, so a missing file is fatal.
+ if [ ! -f "$FONT_MAPPING_CONFIG" ]; then
+ error "Missing OCR font mapping config: ${FONT_MAPPING_CONFIG}"
+ exit 1
+ fi
+ # Node splits the file at 'export const fontFamilyToUrl': text before it yields language->family, text after it family->URL. 'Noto Sans' is always included and is the fallback for unmapped languages; rows are sorted by family and the filename is the URL's last path segment.
+ OCR_FONT_MANIFEST_RAW=$(node -e "const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const selected = (process.argv[2] || '').split(',').map((value) => value.trim()).filter(Boolean); const sections = source.split('export const fontFamilyToUrl'); const languageSection = sections[0] || ''; const fontSection = sections[1] || ''; const languageToFamily = {}; const fontFamilyToUrl = {}; let match; const languagePattern = /^\s*([a-z_]+):\s*'([^']+)',/gm; while ((match = languagePattern.exec(languageSection)) !== null) { languageToFamily[match[1]] = match[2]; } const fontPattern = /^\s*'([^']+)':\s*'([^']+)',/gm; while ((match = fontPattern.exec(fontSection)) !== null) { fontFamilyToUrl[match[1]] = match[2]; } const families = new Set(['Noto Sans']); for (const lang of selected) { families.add(languageToFamily[lang] || 'Noto Sans'); } const lines = Array.from(families).sort().map((family) => { const url = fontFamilyToUrl[family] || fontFamilyToUrl['Noto Sans']; const fileName = url.split('/').pop(); return [family, url, fileName].join('\t'); }); process.stdout.write(lines.join('\n'));" "$FONT_MAPPING_CONFIG" "$OCR_LANGUAGES")
+ # An empty manifest means no font rows were produced; abort instead of bundling nothing.
+ if [ -z "$OCR_FONT_MANIFEST_RAW" ]; then
+ error "Failed to resolve OCR font assets from ${FONT_MAPPING_CONFIG}"
+ exit 1
+ fi
+}
# --- Usage ---
usage() {
@@ -80,6 +186,10 @@ OPTIONS:
--brand-name Custom brand name
--brand-logo Logo path relative to public/
--footer-text Custom footer text
+ --ocr-languages Comma-separated OCR languages to bundle
+ (default: eng)
+ --list-ocr-languages Print supported OCR language codes and exit
+ --search-ocr-language Search supported OCR languages by code or name
--skip-docker Skip Docker build and export
--skip-wasm Skip WASM download (reuse existing .tgz files)
--help Show this help message
@@ -91,6 +201,7 @@ EXAMPLES:
# Full automation
bash scripts/prepare-airgap.sh \
--wasm-base-url https://internal.example.com/wasm \
+ --ocr-languages eng,deu,fra \
--brand-name "AcmePDF" \
--language fr
@@ -98,6 +209,12 @@ EXAMPLES:
bash scripts/prepare-airgap.sh \
--wasm-base-url https://internal.example.com/wasm \
--skip-docker
+
+ # Show all supported OCR language codes
+ bash scripts/prepare-airgap.sh --list-ocr-languages
+
+ # Search OCR languages by code or human-readable name
+ bash scripts/prepare-airgap.sh --search-ocr-language german
EOF
exit 0
}
@@ -115,6 +232,9 @@ while [[ $# -gt 0 ]]; do
--brand-name) BRAND_NAME="$2"; shift 2 ;;
--brand-logo) BRAND_LOGO="$2"; shift 2 ;;
--footer-text) FOOTER_TEXT="$2"; shift 2 ;;
+ --ocr-languages) OCR_LANGUAGES="$2"; shift 2 ;;
+ --list-ocr-languages) LIST_OCR_LANGUAGES=true; shift ;;
+ --search-ocr-language) SEARCH_OCR_LANGUAGE_TERM="$2"; shift 2 ;;
--dockerfile) DOCKERFILE="$2"; shift 2 ;;
--skip-docker) SKIP_DOCKER=true; shift ;;
--skip-wasm) SKIP_WASM=true; shift ;;
@@ -132,6 +252,18 @@ if [ ! -f "package.json" ] || [ ! -f "src/js/const/cdn-version.ts" ]; then
exit 1
fi
+if [ "$LIST_OCR_LANGUAGES" = true ]; then
+ show_supported_ocr_languages
+ exit 0
+fi
+
+if [ -n "$SEARCH_OCR_LANGUAGE_TERM" ]; then
+ if show_matching_ocr_languages "$SEARCH_OCR_LANGUAGE_TERM"; then
+ exit 0
+ fi
+ exit 1
+fi
+
# --- Check prerequisites ---
check_prerequisites() {
local missing=false
@@ -141,6 +273,11 @@ check_prerequisites() {
missing=true
fi
+ if [ "$SKIP_WASM" = false ] && ! command -v curl &>/dev/null; then
+ error "curl is required to download OCR language data."
+ missing=true
+ fi
+
if [ "$SKIP_DOCKER" = false ] && ! command -v docker &>/dev/null; then
error "docker is required but not found (use --skip-docker to skip)."
missing=true
@@ -156,9 +293,11 @@ read_versions() {
PYMUPDF_VERSION=$(grep "pymupdf:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
GS_VERSION=$(grep "ghostscript:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
APP_VERSION=$(node -p "require('./package.json').version")
+ TESSERACT_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js'].version")
+ TESSERACT_CORE_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js-core'].version")
- if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ]; then
- error "Failed to read WASM versions from src/js/const/cdn-version.ts"
+ if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ] || [ -z "$TESSERACT_VERSION" ] || [ -z "$TESSERACT_CORE_VERSION" ]; then
+ error "Failed to read external asset versions from the repository metadata"
exit 1
fi
}
@@ -175,6 +314,8 @@ interactive_mode() {
echo " PyMuPDF: ${PYMUPDF_VERSION}"
echo " Ghostscript: ${GS_VERSION}"
echo " CoherentPDF: latest"
+ echo " Tesseract.js: ${TESSERACT_VERSION}"
+ echo " OCR Data: ${TESSDATA_VERSION}"
echo ""
# [1] WASM base URL (REQUIRED)
@@ -256,8 +397,35 @@ interactive_mode() {
DOCKERFILE="${input:-$DOCKERFILE}"
echo ""
- # [8] Output directory (optional)
- echo -e "${BOLD}[8/8] Output Directory ${GREEN}(optional)${NC}"
+ # [8] OCR languages (optional)
+ echo -e "${BOLD}[8/9] OCR Languages ${GREEN}(optional)${NC}"
+ echo " Comma-separated traineddata files to bundle for offline OCR."
+ echo " Enter Tesseract language codes such as: eng,deu,fra,spa"
+ echo " Type 'list' to print the full supported language list."
+ echo " Type 'search ' to find codes by name or abbreviation."
+ while true; do
+ read -r -p " OCR languages [${OCR_LANGUAGES}]: " input
+ if [ -z "${input:-}" ]; then
+ break
+ fi
+ if [ "$input" = "list" ]; then
+ show_supported_ocr_languages
+ continue
+ fi
+ if [[ "$input" == search\ * ]]; then
+ search_query="${input#search }"
+ if ! show_matching_ocr_languages "$search_query"; then
+ warn "No OCR language matched '${search_query}'."
+ fi
+ continue
+ fi
+ OCR_LANGUAGES="$input"
+ break
+ done
+ echo ""
+
+ # [9] Output directory (optional)
+ echo -e "${BOLD}[9/9] Output Directory ${GREEN}(optional)${NC}"
read -r -p " Path [${OUTPUT_DIR}]: " input
OUTPUT_DIR="${input:-$OUTPUT_DIR}"
@@ -274,6 +442,7 @@ interactive_mode() {
[ -n "$BRAND_NAME" ] && echo " Brand Logo: ${BRAND_LOGO:-images/favicon-no-bg.svg (default)}"
[ -n "$BRAND_NAME" ] && echo " Footer Text: ${FOOTER_TEXT:-(default)}"
echo " Base URL: ${BASE_URL:-/ (root)}"
+ echo " OCR Languages: ${OCR_LANGUAGES}"
echo " Output: ${OUTPUT_DIR}"
echo ""
read -r -p " Proceed? (Y/n): " input
@@ -321,6 +490,7 @@ filesize() {
check_prerequisites
read_versions
+load_supported_ocr_languages
# If no WASM base URL provided, go interactive
if [ -z "$WASM_BASE_URL" ]; then
@@ -338,6 +508,34 @@ if [ -n "$LANGUAGE" ]; then
fi
fi
+IFS=',' read -r -a OCR_LANGUAGE_ARRAY <<< "$OCR_LANGUAGES"
+NORMALIZED_OCR_LANGUAGES=()
+for raw_lang in "${OCR_LANGUAGE_ARRAY[@]}"; do
+ lang=$(echo "$raw_lang" | tr -d '[:space:]')
+ if [ -z "$lang" ]; then
+ continue
+ fi
+ if [[ ! "$lang" =~ ^[a-z0-9_]+$ ]]; then
+ error "Invalid OCR language code: ${lang}"
+ error "Use comma-separated Tesseract codes such as eng,deu,fra,chi_sim"
+ exit 1
+ fi
+ if ! is_supported_ocr_language "$lang"; then
+ error "Unsupported OCR language code: ${lang}"
+ error "Run with --list-ocr-languages or --search-ocr-language to find supported Tesseract codes."
+ exit 1
+ fi
+ NORMALIZED_OCR_LANGUAGES+=("$lang")
+done
+
+if [ ${#NORMALIZED_OCR_LANGUAGES[@]} -eq 0 ]; then
+ error "At least one OCR language must be included."
+ exit 1
+fi
+
+OCR_LANGUAGES=$(IFS=','; echo "${NORMALIZED_OCR_LANGUAGES[*]}")
+load_required_ocr_fonts
+
# Validate WASM base URL format
if [[ ! "$WASM_BASE_URL" =~ ^https?:// ]]; then
error "WASM base URL must start with http:// or https://"
@@ -353,11 +551,15 @@ WASM_BASE_URL="${WASM_BASE_URL%/}"
WASM_PYMUPDF_URL="${WASM_BASE_URL}/pymupdf/"
WASM_GS_URL="${WASM_BASE_URL}/gs/"
WASM_CPDF_URL="${WASM_BASE_URL}/cpdf/"
+OCR_TESSERACT_WORKER_URL="${WASM_BASE_URL}/ocr/worker.min.js"
+OCR_TESSERACT_CORE_URL="${WASM_BASE_URL}/ocr/core"
+OCR_TESSERACT_LANG_URL="${WASM_BASE_URL}/ocr/lang-data"
+OCR_FONT_BASE_URL="${WASM_BASE_URL}/ocr/fonts"
echo ""
echo -e "${BOLD}============================================================${NC}"
echo -e "${BOLD} BentoPDF Air-Gapped Bundle Preparation${NC}"
-echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION}${NC}"
+echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION} | OCR: ${TESSERACT_VERSION}${NC}"
echo -e "${BOLD}============================================================${NC}"
# --- Phase 1: Prepare output directory ---
@@ -398,6 +600,27 @@ if [ "$SKIP_WASM" = true ]; then
error "Missing: coherentpdf-*.tgz"
wasm_missing=true
fi
+ if ! ls "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz &>/dev/null; then # [0-9]* requires a version digit, so tesseract.js-core-*.tgz alone cannot satisfy the worker check
+ error "Missing: tesseract.js-*.tgz"
+ wasm_missing=true
+ fi
+ if ! ls "$OUTPUT_DIR"/tesseract.js-core-*.tgz &>/dev/null; then # the core package is checked separately from the worker package
+ error "Missing: tesseract.js-core-*.tgz"
+ wasm_missing=true
+ fi
+ for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do
+ if [ ! -f "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz" ]; then
+ error "Missing: tesseract-langdata/${lang}.traineddata.gz"
+ wasm_missing=true
+ fi
+ done
+ while IFS=$'\t' read -r font_family font_url font_file; do
+ [ -z "$font_file" ] && continue
+ if [ ! -f "$OUTPUT_DIR/ocr-fonts/${font_file}" ]; then
+ error "Missing: ocr-fonts/${font_file} (${font_family})"
+ wasm_missing=true
+ fi
+ done <<< "$OCR_FONT_MANIFEST_RAW"
if [ "$wasm_missing" = true ]; then
error "Run without --skip-wasm first to download the packages."
exit 1
@@ -430,8 +653,42 @@ else
exit 1
fi
+ info "Downloading tesseract.js@${TESSERACT_VERSION}..."
+ if ! (cd "$WASM_TMP" && npm pack "tesseract.js@${TESSERACT_VERSION}" --quiet 2>&1); then
+ error "Failed to download tesseract.js@${TESSERACT_VERSION}"
+ exit 1
+ fi
+
+ info "Downloading tesseract.js-core@${TESSERACT_CORE_VERSION}..."
+ if ! (cd "$WASM_TMP" && npm pack "tesseract.js-core@${TESSERACT_CORE_VERSION}" --quiet 2>&1); then
+ error "Failed to download tesseract.js-core@${TESSERACT_CORE_VERSION}"
+ exit 1
+ fi
+
# Move to output directory
mv "$WASM_TMP"/*.tgz "$OUTPUT_DIR/"
+
+ mkdir -p "$OUTPUT_DIR/tesseract-langdata"
+ for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do
+ info "Downloading OCR language data: ${lang}..."
+ if ! curl -fsSL "https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/${TESSDATA_VERSION}/${lang}.traineddata.gz" -o "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz"; then
+ error "Failed to download OCR language data for ${lang}"
+ error "Check that the language code exists and that the network can reach jsDelivr."
+ exit 1
+ fi
+ done
+
+ mkdir -p "$OUTPUT_DIR/ocr-fonts"
+ while IFS=$'\t' read -r font_family font_url font_file; do
+ [ -z "$font_file" ] && continue
+ info "Downloading OCR font: ${font_family}..."
+ if ! curl -fsSL "$font_url" -o "$OUTPUT_DIR/ocr-fonts/${font_file}"; then
+ error "Failed to download OCR font '${font_family}'"
+ error "Check that the network can reach the font URL: ${font_url}"
+ exit 1
+ fi
+ done <<< "$OCR_FONT_MANIFEST_RAW"
+
rm -rf "$WASM_TMP"
trap - EXIT
@@ -443,6 +700,10 @@ else
info " PyMuPDF: $(filesize "$OUTPUT_DIR"/bentopdf-pymupdf-wasm-*.tgz)"
info " Ghostscript: $(filesize "$OUTPUT_DIR"/bentopdf-gs-wasm-*.tgz)"
info " CoherentPDF: $(filesize "$CPDF_TGZ") (v${CPDF_VERSION})"
+ info " Tesseract.js: $(filesize "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz)" # [0-9]* keeps tesseract.js-core-*.tgz out of this glob
+ info " OCR Core: $(filesize "$OUTPUT_DIR"/tesseract.js-core-*.tgz)"
+ info " OCR Langs: ${OCR_LANGUAGES}"
+ info " OCR Fonts: $(printf '%s\n' "$OCR_FONT_MANIFEST_RAW" | awk -F '\t' 'NF >= 1 { printf "%s%s", sep, $1; sep = ", " }')" # awk joins family names with ", "; paste -d ', ' would alternate ',' and ' ' between items
fi
# Resolve CPDF version if we skipped download
@@ -488,6 +749,11 @@ else
BUILD_ARGS+=(--build-arg "VITE_WASM_PYMUPDF_URL=${WASM_PYMUPDF_URL}")
BUILD_ARGS+=(--build-arg "VITE_WASM_GS_URL=${WASM_GS_URL}")
BUILD_ARGS+=(--build-arg "VITE_WASM_CPDF_URL=${WASM_CPDF_URL}")
+ BUILD_ARGS+=(--build-arg "VITE_TESSERACT_WORKER_URL=${OCR_TESSERACT_WORKER_URL}")
+ BUILD_ARGS+=(--build-arg "VITE_TESSERACT_CORE_URL=${OCR_TESSERACT_CORE_URL}")
+ BUILD_ARGS+=(--build-arg "VITE_TESSERACT_LANG_URL=${OCR_TESSERACT_LANG_URL}")
+ BUILD_ARGS+=(--build-arg "VITE_TESSERACT_AVAILABLE_LANGUAGES=${OCR_LANGUAGES}")
+ BUILD_ARGS+=(--build-arg "VITE_OCR_FONT_BASE_URL=${OCR_FONT_BASE_URL}")
[ -n "$SIMPLE_MODE" ] && BUILD_ARGS+=(--build-arg "SIMPLE_MODE=${SIMPLE_MODE}")
[ -n "$BASE_URL" ] && BUILD_ARGS+=(--build-arg "BASE_URL=${BASE_URL}")
@@ -503,6 +769,12 @@ else
info " PyMuPDF: ${WASM_PYMUPDF_URL}"
info " Ghostscript: ${WASM_GS_URL}"
info " CoherentPDF: ${WASM_CPDF_URL}"
+ info "OCR URLs:"
+ info " Worker: ${OCR_TESSERACT_WORKER_URL}"
+ info " Core: ${OCR_TESSERACT_CORE_URL}"
+ info " Lang Data: ${OCR_TESSERACT_LANG_URL}"
+ info " Font Base: ${OCR_FONT_BASE_URL}"
+ info " Languages: ${OCR_LANGUAGES}"
echo ""
info "Building... this may take a few minutes (npm install + Vite build)."
echo ""
@@ -582,7 +854,7 @@ fi
echo ""
echo "[2/3] Extracting WASM packages to \${WASM_DIR}..."
-mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf"
+mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf" "\${WASM_DIR}/ocr/core" "\${WASM_DIR}/ocr/lang-data" "\${WASM_DIR}/ocr/fonts"
# PyMuPDF: package has dist/ and assets/ at root
echo " Extracting PyMuPDF..."
@@ -610,12 +882,35 @@ else
fi
rm -rf "\${TEMP_CPDF}"
+# Tesseract worker: browser expects a single worker.min.js file
+echo " Extracting Tesseract worker..."
+TEMP_TESS="\$(mktemp -d)"
+tar xzf "\${SCRIPT_DIR}"/tesseract.js-[0-9]*.tgz -C "\${TEMP_TESS}" # [0-9]* matches only the worker tarball; a broader glob also hands tar the core tarball as a member name
+cp "\${TEMP_TESS}/package/dist/worker.min.js" "\${WASM_DIR}/ocr/worker.min.js"
+rm -rf "\${TEMP_TESS}"
+
+# Tesseract core: browser expects the full tesseract.js-core directory
+echo " Extracting Tesseract core..."
+tar xzf "\${SCRIPT_DIR}"/tesseract.js-core-*.tgz -C "\${WASM_DIR}/ocr/core" --strip-components=1
+
+# OCR language data: copy the bundled traineddata files
+echo " Installing OCR language data..."
+cp "\${SCRIPT_DIR}"/tesseract-langdata/*.traineddata.gz "\${WASM_DIR}/ocr/lang-data/"
+
+# OCR fonts: copy the bundled font files for searchable text layer rendering
+echo " Installing OCR fonts..."
+cp "\${SCRIPT_DIR}"/ocr-fonts/* "\${WASM_DIR}/ocr/fonts/"
+
echo " WASM files extracted to: \${WASM_DIR}"
echo ""
echo " IMPORTANT: Ensure these paths are served by your internal web server:"
echo " \${WASM_BASE_URL}/pymupdf/ -> \${WASM_DIR}/pymupdf/"
echo " \${WASM_BASE_URL}/gs/ -> \${WASM_DIR}/gs/"
echo " \${WASM_BASE_URL}/cpdf/ -> \${WASM_DIR}/cpdf/"
+echo " \${WASM_BASE_URL}/ocr/worker.min.js -> \${WASM_DIR}/ocr/worker.min.js"
+echo " \${WASM_BASE_URL}/ocr/core -> \${WASM_DIR}/ocr/core/"
+echo " \${WASM_BASE_URL}/ocr/lang-data -> \${WASM_DIR}/ocr/lang-data/"
+echo " \${WASM_BASE_URL}/ocr/fonts -> \${WASM_DIR}/ocr/fonts/"
# --- Step 3: Start BentoPDF ---
echo ""
@@ -654,6 +949,10 @@ cat > "$OUTPUT_DIR/README.md" <= 3 { printf "- **%s** -> `%s`\n", $1, $3 }')
These URLs are baked into the app at build time. The user's browser fetches
WASM files from these URLs at runtime.
@@ -694,7 +1003,7 @@ docker load -i bentopdf.tar
Extract to your internal web server's document root:
\`\`\`bash
-mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf
+mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
# PyMuPDF
tar xzf bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz -C ./wasm/pymupdf --strip-components=1
@@ -710,6 +1019,21 @@ TEMP_CPDF=\$(mktemp -d)
tar xzf coherentpdf-${CPDF_VERSION}.tgz -C \$TEMP_CPDF
cp -r \$TEMP_CPDF/package/dist/* ./wasm/cpdf/
rm -rf \$TEMP_CPDF
+
+# Tesseract worker
+TEMP_TESS=\$(mktemp -d)
+tar xzf tesseract.js-${TESSERACT_VERSION}.tgz -C \$TEMP_TESS
+cp \$TEMP_TESS/package/dist/worker.min.js ./wasm/ocr/worker.min.js
+rm -rf \$TEMP_TESS
+
+# Tesseract core
+tar xzf tesseract.js-core-${TESSERACT_CORE_VERSION}.tgz -C ./wasm/ocr/core --strip-components=1
+
+# OCR language data
+cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
+
+# OCR fonts
+cp ./ocr-fonts/* ./wasm/ocr/fonts/
\`\`\`
### 3. Configure your web server
@@ -721,6 +1045,10 @@ Ensure these paths are accessible at the configured URLs:
| \`${WASM_PYMUPDF_URL}\` | \`./wasm/pymupdf/\` |
| \`${WASM_GS_URL}\` | \`./wasm/gs/\` |
| \`${WASM_CPDF_URL}\` | \`./wasm/cpdf/\` |
+| \`${OCR_TESSERACT_WORKER_URL}\` | \`./wasm/ocr/worker.min.js\` |
+| \`${OCR_TESSERACT_CORE_URL}\` | \`./wasm/ocr/core/\` |
+| \`${OCR_TESSERACT_LANG_URL}\` | \`./wasm/ocr/lang-data/\` |
+| \`${OCR_FONT_BASE_URL}\` | \`./wasm/ocr/fonts/\` |
### 4. Run BentoPDF
diff --git a/src/js/compare/engine/ocr-page.ts b/src/js/compare/engine/ocr-page.ts
index 5c229c7..40abf00 100644
--- a/src/js/compare/engine/ocr-page.ts
+++ b/src/js/compare/engine/ocr-page.ts
@@ -1,37 +1,39 @@
-import Tesseract from 'tesseract.js';
-
+import type Tesseract from 'tesseract.js';
import type { ComparePageModel, CompareTextItem } from '../types.ts';
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
import {
joinCompareTextItems,
normalizeCompareText,
} from './text-normalization.ts';
+import { createConfiguredTesseractWorker } from '../../utils/tesseract-runtime.js';
-type OcrWord = {
- text: string;
- bbox: {
- x0: number;
- y0: number;
- x1: number;
- y1: number;
- };
-};
+type OcrWord = Tesseract.Word;
+type OcrRecognizeResult = Tesseract.RecognizeResult;
+type OcrPageWithWords = Tesseract.Page & { words: OcrWord[] };
export async function recognizePageCanvas(
canvas: HTMLCanvasElement,
language: string,
onProgress?: (status: string, progress: number) => void
): Promise {
- const result = await Tesseract.recognize(canvas, language, {
- logger(message) {
+ const worker = await createConfiguredTesseractWorker(
+ language,
+ 1,
+ (message) => {
onProgress?.(message.status, message.progress || 0);
- },
- });
+ }
+ );
- const ocrData = result.data as unknown as { words?: OcrWord[] };
- const words = ((ocrData.words || []) as OcrWord[])
+ let result: OcrRecognizeResult;
+ try {
+ result = await worker.recognize(canvas);
+ } finally {
+ await worker.terminate();
+ }
+
+ const words = (result.data as OcrPageWithWords).words
.map((word, index) => {
- const normalizedText = normalizeCompareText(word.text || '');
+ const normalizedText = normalizeCompareText(word.text);
if (!normalizedText) return null;
const item: CompareTextItem = {
diff --git a/src/js/config/font-mappings.ts b/src/js/config/font-mappings.ts
index c6c0c31..6a3df53 100644
--- a/src/js/config/font-mappings.ts
+++ b/src/js/config/font-mappings.ts
@@ -1,189 +1,233 @@
-/**
- * Font mappings for OCR text layer rendering
- * Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs
- */
-
-export const languageToFontFamily: Record = {
- // CJK Languages
- jpn: 'Noto Sans JP',
- chi_sim: 'Noto Sans SC',
- chi_tra: 'Noto Sans TC',
- kor: 'Noto Sans KR',
-
- // Arabic Script
- ara: 'Noto Sans Arabic',
- fas: 'Noto Sans Arabic',
- urd: 'Noto Sans Arabic',
- pus: 'Noto Sans Arabic',
- kur: 'Noto Sans Arabic',
-
- // Devanagari Script
- hin: 'Noto Sans Devanagari',
- mar: 'Noto Sans Devanagari',
- san: 'Noto Sans Devanagari',
- nep: 'Noto Sans Devanagari',
-
- // Bengali Script
- ben: 'Noto Sans Bengali',
- asm: 'Noto Sans Bengali',
-
- // Tamil Script
- tam: 'Noto Sans Tamil',
-
- // Telugu Script
- tel: 'Noto Sans Telugu',
-
- // Kannada Script
- kan: 'Noto Sans Kannada',
-
- // Malayalam Script
- mal: 'Noto Sans Malayalam',
-
- // Gujarati Script
- guj: 'Noto Sans Gujarati',
-
- // Gurmukhi Script (Punjabi)
- pan: 'Noto Sans Gurmukhi',
-
- // Oriya Script
- ori: 'Noto Sans Oriya',
-
- // Sinhala Script
- sin: 'Noto Sans Sinhala',
-
- // Thai Script
- tha: 'Noto Sans Thai',
-
- // Lao Script
- lao: 'Noto Sans Lao',
-
- // Khmer Script
- khm: 'Noto Sans Khmer',
-
- // Myanmar Script
- mya: 'Noto Sans Myanmar',
-
- // Tibetan Script
- bod: 'Noto Serif Tibetan',
-
- // Georgian Script
- kat: 'Noto Sans Georgian',
- kat_old: 'Noto Sans Georgian',
-
- // Armenian Script
- hye: 'Noto Sans Armenian',
-
- // Hebrew Script
- heb: 'Noto Sans Hebrew',
- yid: 'Noto Sans Hebrew',
-
- // Ethiopic Script
- amh: 'Noto Sans Ethiopic',
- tir: 'Noto Sans Ethiopic',
-
- // Cherokee Script
- chr: 'Noto Sans Cherokee',
-
- // Syriac Script
- syr: 'Noto Sans Syriac',
-
- // Cyrillic Script (Noto Sans includes Cyrillic)
- bel: 'Noto Sans',
- bul: 'Noto Sans',
- mkd: 'Noto Sans',
- rus: 'Noto Sans',
- srp: 'Noto Sans',
- srp_latn: 'Noto Sans',
- ukr: 'Noto Sans',
- kaz: 'Noto Sans',
- kir: 'Noto Sans',
- tgk: 'Noto Sans',
- uzb: 'Noto Sans',
- uzb_cyrl: 'Noto Sans',
- aze_cyrl: 'Noto Sans',
-
- // Latin Script (covered by base Noto Sans)
- afr: 'Noto Sans',
- aze: 'Noto Sans',
- bos: 'Noto Sans',
- cat: 'Noto Sans',
- ceb: 'Noto Sans',
- ces: 'Noto Sans',
- cym: 'Noto Sans',
- dan: 'Noto Sans',
- deu: 'Noto Sans',
- ell: 'Noto Sans',
- eng: 'Noto Sans',
- enm: 'Noto Sans',
- epo: 'Noto Sans',
- est: 'Noto Sans',
- eus: 'Noto Sans',
- fin: 'Noto Sans',
- fra: 'Noto Sans',
- frk: 'Noto Sans',
- frm: 'Noto Sans',
- gle: 'Noto Sans',
- glg: 'Noto Sans',
- grc: 'Noto Sans',
- hat: 'Noto Sans',
- hrv: 'Noto Sans',
- hun: 'Noto Sans',
- iku: 'Noto Sans',
- ind: 'Noto Sans',
- isl: 'Noto Sans',
- ita: 'Noto Sans',
- ita_old: 'Noto Sans',
- jav: 'Noto Sans',
- lat: 'Noto Sans',
- lav: 'Noto Sans',
- lit: 'Noto Sans',
- mlt: 'Noto Sans',
- msa: 'Noto Sans',
- nld: 'Noto Sans',
- nor: 'Noto Sans',
- pol: 'Noto Sans',
- por: 'Noto Sans',
- ron: 'Noto Sans',
- slk: 'Noto Sans',
- slv: 'Noto Sans',
- spa: 'Noto Sans',
- spa_old: 'Noto Sans',
- sqi: 'Noto Sans',
- swa: 'Noto Sans',
- swe: 'Noto Sans',
- tgl: 'Noto Sans',
- tur: 'Noto Sans',
- vie: 'Noto Sans',
- dzo: 'Noto Sans',
- uig: 'Noto Sans',
-};
-
-export const fontFamilyToUrl: Record = {
- 'Noto Sans JP': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf',
- 'Noto Sans SC': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf',
- 'Noto Sans TC': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf',
- 'Noto Sans KR': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Korean/NotoSansCJKkr-Regular.otf',
- 'Noto Sans Arabic': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArabic/NotoSansArabic-Regular.ttf',
- 'Noto Sans Devanagari': 'https://raw.githack.com/googlefonts/noto-fonts/main/unhinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf',
- 'Noto Sans Bengali': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf',
- 'Noto Sans Gujarati': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGujarati/NotoSansGujarati-Regular.ttf',
- 'Noto Sans Kannada': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKannada/NotoSansKannada-Regular.ttf',
- 'Noto Sans Malayalam': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMalayalam/NotoSansMalayalam-Regular.ttf',
- 'Noto Sans Oriya': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansOriya/NotoSansOriya-Regular.ttf',
- 'Noto Sans Gurmukhi': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGurmukhi/NotoSansGurmukhi-Regular.ttf',
- 'Noto Sans Tamil': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTamil/NotoSansTamil-Regular.ttf',
- 'Noto Sans Telugu': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTelugu/NotoSansTelugu-Regular.ttf',
- 'Noto Sans Sinhala': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSinhala/NotoSansSinhala-Regular.ttf',
- 'Noto Sans Thai': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf',
- 'Noto Sans Khmer': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKhmer/NotoSansKhmer-Regular.ttf',
- 'Noto Sans Lao': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansLao/NotoSansLao-Regular.ttf',
- 'Noto Sans Myanmar': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMyanmar/NotoSansMyanmar-Regular.ttf',
- 'Noto Sans Hebrew': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansHebrew/NotoSansHebrew-Regular.ttf',
- 'Noto Sans Georgian': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGeorgian/NotoSansGeorgian-Regular.ttf',
- 'Noto Sans Ethiopic': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansEthiopic/NotoSansEthiopic-Regular.ttf',
- 'Noto Serif Tibetan': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf',
- 'Noto Sans Cherokee': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansCherokee/NotoSansCherokee-Regular.ttf',
- 'Noto Sans Armenian': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArmenian/NotoSansArmenian-Regular.ttf',
- 'Noto Sans Syriac': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSyriac/NotoSansSyriac-Regular.ttf',
- 'Noto Sans': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf',
-};
\ No newline at end of file
+/**
+ * Font mappings for OCR text layer rendering
+ * Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs
+ */
+
+export const languageToFontFamily: Record<string, string> = {
+ // CJK Languages
+ jpn: 'Noto Sans JP',
+ chi_sim: 'Noto Sans SC',
+ chi_tra: 'Noto Sans TC',
+ kor: 'Noto Sans KR',
+
+ // Arabic Script
+ ara: 'Noto Sans Arabic',
+ fas: 'Noto Sans Arabic',
+ urd: 'Noto Sans Arabic',
+ pus: 'Noto Sans Arabic',
+ kur: 'Noto Sans Arabic',
+
+ // Devanagari Script
+ hin: 'Noto Sans Devanagari',
+ mar: 'Noto Sans Devanagari',
+ san: 'Noto Sans Devanagari',
+ nep: 'Noto Sans Devanagari',
+
+ // Bengali Script
+ ben: 'Noto Sans Bengali',
+ asm: 'Noto Sans Bengali',
+
+ // Tamil Script
+ tam: 'Noto Sans Tamil',
+
+ // Telugu Script
+ tel: 'Noto Sans Telugu',
+
+ // Kannada Script
+ kan: 'Noto Sans Kannada',
+
+ // Malayalam Script
+ mal: 'Noto Sans Malayalam',
+
+ // Gujarati Script
+ guj: 'Noto Sans Gujarati',
+
+ // Gurmukhi Script (Punjabi)
+ pan: 'Noto Sans Gurmukhi',
+
+ // Oriya Script
+ ori: 'Noto Sans Oriya',
+
+ // Sinhala Script
+ sin: 'Noto Sans Sinhala',
+
+ // Thai Script
+ tha: 'Noto Sans Thai',
+
+ // Lao Script
+ lao: 'Noto Sans Lao',
+
+ // Khmer Script
+ khm: 'Noto Sans Khmer',
+
+ // Myanmar Script
+ mya: 'Noto Sans Myanmar',
+
+ // Tibetan Script
+ bod: 'Noto Serif Tibetan',
+
+ // Georgian Script
+ kat: 'Noto Sans Georgian',
+ kat_old: 'Noto Sans Georgian',
+
+ // Armenian Script
+ hye: 'Noto Sans Armenian',
+
+ // Hebrew Script
+ heb: 'Noto Sans Hebrew',
+ yid: 'Noto Sans Hebrew',
+
+ // Ethiopic Script
+ amh: 'Noto Sans Ethiopic',
+ tir: 'Noto Sans Ethiopic',
+
+ // Cherokee Script
+ chr: 'Noto Sans Cherokee',
+
+ // Syriac Script
+ syr: 'Noto Sans Syriac',
+
+ // Cyrillic Script (Noto Sans includes Cyrillic)
+ bel: 'Noto Sans',
+ bul: 'Noto Sans',
+ mkd: 'Noto Sans',
+ rus: 'Noto Sans',
+ srp: 'Noto Sans',
+ srp_latn: 'Noto Sans',
+ ukr: 'Noto Sans',
+ kaz: 'Noto Sans',
+ kir: 'Noto Sans',
+ tgk: 'Noto Sans',
+ uzb: 'Noto Sans',
+ uzb_cyrl: 'Noto Sans',
+ aze_cyrl: 'Noto Sans',
+
+ // Latin Script (covered by base Noto Sans)
+ afr: 'Noto Sans',
+ aze: 'Noto Sans',
+ bos: 'Noto Sans',
+ cat: 'Noto Sans',
+ ceb: 'Noto Sans',
+ ces: 'Noto Sans',
+ cym: 'Noto Sans',
+ dan: 'Noto Sans',
+ deu: 'Noto Sans',
+ ell: 'Noto Sans',
+ eng: 'Noto Sans',
+ enm: 'Noto Sans',
+ epo: 'Noto Sans',
+ est: 'Noto Sans',
+ eus: 'Noto Sans',
+ fin: 'Noto Sans',
+ fra: 'Noto Sans',
+ frk: 'Noto Sans',
+ frm: 'Noto Sans',
+ gle: 'Noto Sans',
+ glg: 'Noto Sans',
+ grc: 'Noto Sans',
+ hat: 'Noto Sans',
+ hrv: 'Noto Sans',
+ hun: 'Noto Sans',
+ iku: 'Noto Sans',
+ ind: 'Noto Sans',
+ isl: 'Noto Sans',
+ ita: 'Noto Sans',
+ ita_old: 'Noto Sans',
+ jav: 'Noto Sans',
+ lat: 'Noto Sans',
+ lav: 'Noto Sans',
+ lit: 'Noto Sans',
+ mlt: 'Noto Sans',
+ msa: 'Noto Sans',
+ nld: 'Noto Sans',
+ nor: 'Noto Sans',
+ pol: 'Noto Sans',
+ por: 'Noto Sans',
+ ron: 'Noto Sans',
+ slk: 'Noto Sans',
+ slv: 'Noto Sans',
+ spa: 'Noto Sans',
+ spa_old: 'Noto Sans',
+ sqi: 'Noto Sans',
+ swa: 'Noto Sans',
+ swe: 'Noto Sans',
+ tgl: 'Noto Sans',
+ tur: 'Noto Sans',
+ vie: 'Noto Sans',
+ dzo: 'Noto Sans',
+ uig: 'Noto Sans',
+};
+ /** Maps each font family name used for OCR text layers to the CDN URL of its regular-weight font file. */
+export const fontFamilyToUrl: Record<string, string> = {
+ 'Noto Sans JP':
+ 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf',
+ 'Noto Sans SC':
+ 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf',
+ 'Noto Sans TC':
+ 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf',
+ 'Noto Sans KR':
+ 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Korean/NotoSansCJKkr-Regular.otf',
+ 'Noto Sans Arabic':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArabic/NotoSansArabic-Regular.ttf',
+ 'Noto Sans Devanagari':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/unhinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf',
+ 'Noto Sans Bengali':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf',
+ 'Noto Sans Gujarati':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGujarati/NotoSansGujarati-Regular.ttf',
+ 'Noto Sans Kannada':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKannada/NotoSansKannada-Regular.ttf',
+ 'Noto Sans Malayalam':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMalayalam/NotoSansMalayalam-Regular.ttf',
+ 'Noto Sans Oriya':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansOriya/NotoSansOriya-Regular.ttf',
+ 'Noto Sans Gurmukhi':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGurmukhi/NotoSansGurmukhi-Regular.ttf',
+ 'Noto Sans Tamil':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTamil/NotoSansTamil-Regular.ttf',
+ 'Noto Sans Telugu':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTelugu/NotoSansTelugu-Regular.ttf',
+ 'Noto Sans Sinhala':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSinhala/NotoSansSinhala-Regular.ttf',
+ 'Noto Sans Thai':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf',
+ 'Noto Sans Khmer':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKhmer/NotoSansKhmer-Regular.ttf',
+ 'Noto Sans Lao':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansLao/NotoSansLao-Regular.ttf',
+ 'Noto Sans Myanmar':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMyanmar/NotoSansMyanmar-Regular.ttf',
+ 'Noto Sans Hebrew':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansHebrew/NotoSansHebrew-Regular.ttf',
+ 'Noto Sans Georgian':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGeorgian/NotoSansGeorgian-Regular.ttf',
+ 'Noto Sans Ethiopic':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansEthiopic/NotoSansEthiopic-Regular.ttf',
+ 'Noto Serif Tibetan':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf',
+ 'Noto Sans Cherokee':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansCherokee/NotoSansCherokee-Regular.ttf',
+ 'Noto Sans Armenian':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArmenian/NotoSansArmenian-Regular.ttf',
+ 'Noto Sans Syriac':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSyriac/NotoSansSyriac-Regular.ttf',
+ 'Noto Sans':
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf',
+};
+ /** Returns the URL mapped to `fontFamily` in `fontFamilyToUrl`, or the 'Noto Sans' URL when the lookup yields no truthy value. */
+export function getFontUrlForFamily(fontFamily: string): string {
+ return fontFamilyToUrl[fontFamily] || fontFamilyToUrl['Noto Sans'];
+}
+ /** Returns the last path segment of the URL resolved for `fontFamily`; throws an Error when that segment is empty. */
+export function getFontAssetFileName(fontFamily: string): string {
+ const defaultUrl = getFontUrlForFamily(fontFamily);
+ const fileName = defaultUrl.split('/').pop();
+ // An empty or missing last segment (for example a URL ending in '/') is rejected rather than returned.
+ if (!fileName) {
+ throw new Error(
+ `Could not resolve a font asset filename for ${fontFamily}`
+ );
+ }
+ // At this point fileName is a non-empty string.
+ return fileName;
+}
diff --git a/src/js/logic/ocr-pdf-page.ts b/src/js/logic/ocr-pdf-page.ts
index 04341f7..1f8318c 100644
--- a/src/js/logic/ocr-pdf-page.ts
+++ b/src/js/logic/ocr-pdf-page.ts
@@ -4,6 +4,11 @@ import { downloadFile, formatBytes } from '../utils/helpers.js';
import { icons, createIcons } from 'lucide';
import { OcrState } from '@/types';
import { performOcr } from '../utils/ocr.js';
+import {
+ getAvailableTesseractLanguageEntries,
+ resolveConfiguredTesseractAvailableLanguages,
+ UnsupportedOcrLanguageError,
+} from '../utils/tesseract-language-availability.js';
const pageState: OcrState = {
file: null,
@@ -80,6 +85,30 @@ function resetState() {
if (processBtn) processBtn.disabled = true;
}
+/**
+ * Refreshes the `#lang-availability-note` element so it reflects which OCR
+ * languages this deployment offers. Does nothing when the element is absent.
+ */
+function updateLanguageAvailabilityNotice() {
+  const noticeEl = document.getElementById('lang-availability-note');
+  if (noticeEl === null) {
+    return;
+  }
+
+  // No language list configured: keep the note hidden and empty.
+  if (!resolveConfiguredTesseractAvailableLanguages()) {
+    noticeEl.classList.add('hidden');
+    noticeEl.textContent = '';
+    return;
+  }
+
+  const entries = getAvailableTesseractLanguageEntries();
+  const message =
+    entries.length === 0
+      ? 'This deployment does not expose any valid OCR languages. Rebuild it with VITE_TESSERACT_AVAILABLE_LANGUAGES set to valid Tesseract codes.'
+      : `This deployment bundles OCR for: ${entries
+          .map((entry) => entry[1])
+          .join(', ')}.`;
+
+  noticeEl.classList.remove('hidden');
+  noticeEl.textContent = message;
+}
+
+
async function runOCR() {
const selectedLangs = Array.from(
document.querySelectorAll('.lang-checkbox:checked')
@@ -142,10 +171,14 @@ async function runOCR() {
if (textOutput) textOutput.value = result.fullText.trim();
} catch (e) {
console.error(e);
- showAlert(
- 'OCR Error',
- 'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
- );
+ if (e instanceof UnsupportedOcrLanguageError) {
+ showAlert('OCR Language Not Available', e.message);
+ } else {
+ showAlert(
+ 'OCR Error',
+ 'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
+ );
+ }
if (toolOptions) toolOptions.classList.remove('hidden');
if (ocrProgress) ocrProgress.classList.add('hidden');
}
@@ -213,10 +246,21 @@ function populateLanguageList() {
langList.innerHTML = '';
- Object.entries(tesseractLanguages).forEach(function ([code, name]) {
+ const availableEntries = getAvailableTesseractLanguageEntries();
+ if (availableEntries.length === 0) {
+ const emptyState = document.createElement('p');
+ emptyState.className = 'text-sm text-yellow-300 p-2';
+ emptyState.textContent =
+ 'No OCR languages are available in this deployment.';
+ langList.appendChild(emptyState);
+ return;
+ }
+
+ availableEntries.forEach(function ([code, name]) {
const label = document.createElement('label');
label.className =
'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer';
+ label.dataset.search = `${name} ${code}`.toLowerCase();
const checkbox = document.createElement('input');
checkbox.type = 'checkbox';
@@ -253,6 +297,7 @@ document.addEventListener('DOMContentLoaded', function () {
const downloadPdfBtn = document.getElementById('download-searchable-pdf');
populateLanguageList();
+ updateLanguageAvailabilityNotice();
if (backBtn) {
backBtn.addEventListener('click', function () {
@@ -304,9 +349,9 @@ document.addEventListener('DOMContentLoaded', function () {
langSearch.addEventListener('input', function () {
const searchTerm = langSearch.value.toLowerCase();
langList.querySelectorAll('label').forEach(function (label) {
- (label as HTMLElement).style.display = label.textContent
- ?.toLowerCase()
- .includes(searchTerm)
+ (label as HTMLElement).style.display = (
+ label as HTMLElement
+ ).dataset.search?.includes(searchTerm)
? ''
: 'none';
});
diff --git a/src/js/logic/pdf-workflow-page.ts b/src/js/logic/pdf-workflow-page.ts
index c4ab6bb..7bac637 100644
--- a/src/js/logic/pdf-workflow-page.ts
+++ b/src/js/logic/pdf-workflow-page.ts
@@ -1,7 +1,7 @@
import { showAlert } from '../ui.js';
-import { tesseractLanguages } from '../config/tesseract-languages.js';
import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor';
import { executeWorkflow } from '../workflow/engine';
+import { getAvailableTesseractLanguageEntries } from '../utils/tesseract-language-availability.js';
import {
nodeRegistry,
getNodesByCategory,
@@ -1194,7 +1194,7 @@ function showNodeSettings(node: BaseWorkflowNode) {
{ label: 'High (288 DPI)', value: '3.0' },
{ label: 'Ultra (384 DPI)', value: '4.0' },
],
- language: Object.entries(tesseractLanguages).map(([code, name]) => ({
+ language: getAvailableTesseractLanguageEntries().map(([code, name]) => ({
label: name,
value: code,
})),
diff --git a/src/js/utils/font-loader.ts b/src/js/utils/font-loader.ts
index 7d2bc83..b27c27e 100644
--- a/src/js/utils/font-loader.ts
+++ b/src/js/utils/font-loader.ts
@@ -1,281 +1,330 @@
-import { languageToFontFamily, fontFamilyToUrl } from '../config/font-mappings.js';
-
-const fontCache: Map = new Map();
-
-const DB_NAME = 'bentopdf-fonts';
-const DB_VERSION = 1;
-const STORE_NAME = 'fonts';
-
-async function openFontDB(): Promise {
- return new Promise((resolve, reject) => {
- const request = indexedDB.open(DB_NAME, DB_VERSION);
-
- request.onerror = () => reject(request.error);
- request.onsuccess = () => resolve(request.result);
-
- request.onupgradeneeded = (event) => {
- const db = (event.target as IDBOpenDBRequest).result;
- if (!db.objectStoreNames.contains(STORE_NAME)) {
- db.createObjectStore(STORE_NAME);
- }
- };
- });
-}
-
-async function getCachedFontFromDB(fontFamily: string): Promise {
- try {
- const db = await openFontDB();
- return new Promise((resolve, reject) => {
- const transaction = db.transaction(STORE_NAME, 'readonly');
- const store = transaction.objectStore(STORE_NAME);
- const request = store.get(fontFamily);
-
- request.onsuccess = () => resolve(request.result || null);
- request.onerror = () => reject(request.error);
- });
- } catch (error) {
- console.warn('IndexedDB read failed:', error);
- return null;
- }
-}
-
-async function saveFontToDB(fontFamily: string, fontBuffer: ArrayBuffer): Promise {
- try {
- const db = await openFontDB();
- return new Promise((resolve, reject) => {
- const transaction = db.transaction(STORE_NAME, 'readwrite');
- const store = transaction.objectStore(STORE_NAME);
- const request = store.put(fontBuffer, fontFamily);
-
- request.onsuccess = () => resolve();
- request.onerror = () => reject(request.error);
- });
- } catch (error) {
- console.warn('IndexedDB write failed:', error);
- }
-}
-
-export async function getFontForLanguage(lang: string): Promise {
- const fontFamily = languageToFontFamily[lang] || 'Noto Sans';
-
- if (fontCache.has(fontFamily)) {
- return fontCache.get(fontFamily)!;
- }
- const cachedFont = await getCachedFontFromDB(fontFamily);
- if (cachedFont) {
- fontCache.set(fontFamily, cachedFont);
- return cachedFont;
- }
-
- try {
- const fontUrl = fontFamilyToUrl[fontFamily] || fontFamilyToUrl['Noto Sans'];
-
- const fontResponse = await fetch(fontUrl);
-
- if (!fontResponse.ok) {
- throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`);
- }
-
- const fontBuffer = await fontResponse.arrayBuffer();
-
- fontCache.set(fontFamily, fontBuffer);
- await saveFontToDB(fontFamily, fontBuffer);
-
- return fontBuffer;
- } catch (error) {
- console.warn(`Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`, error);
-
- if (fontFamily !== 'Noto Sans') {
- return await getFontForLanguage('eng');
- }
-
- throw error;
- }
-}
-
-export function detectScripts(text: string): string[] {
- const scripts = new Set();
-
- // Japanese: Hiragana (\u3040-\u309F) & Katakana (\u30A0-\u30FF)
- if (/[\u3040-\u309F\u30A0-\u30FF]/.test(text)) {
- scripts.add('jpn');
- }
-
- // Korean: Hangul Syllables (\uAC00-\uD7A3) & Jamo (\u1100-\u11FF)
- if (/[\uAC00-\uD7A3\u1100-\u11FF]/.test(text)) {
- scripts.add('kor');
- }
-
- // Chinese: CJK Unified Ideographs (\u4E00-\u9FFF) & Ext A (\u3400-\u4DBF)
- if (/[\u4E00-\u9FFF\u3400-\u4DBF]/.test(text)) {
- scripts.add('chi_sim');
- }
-
- // Check for Arabic
- if (/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/.test(text)) {
- scripts.add('ara');
- }
-
- // Check for Devanagari (Hindi, Marathi, etc.)
- if (/[\u0900-\u097F]/.test(text)) scripts.add('hin');
-
- // Check for Bengali
- if (/[\u0980-\u09FF]/.test(text)) scripts.add('ben');
-
- // Check for Tamil
- if (/[\u0B80-\u0BFF]/.test(text)) scripts.add('tam');
-
- // Check for Telugu
- if (/[\u0C00-\u0C7F]/.test(text)) scripts.add('tel');
-
- // Check for Kannada
- if (/[\u0C80-\u0CFF]/.test(text)) scripts.add('kan');
-
- // Check for Malayalam
- if (/[\u0D00-\u0D7F]/.test(text)) scripts.add('mal');
-
- // Check for Gujarati
- if (/[\u0A80-\u0AFF]/.test(text)) scripts.add('guj');
-
- // Check for Punjabi (Gurmukhi)
- if (/[\u0A00-\u0A7F]/.test(text)) scripts.add('pan');
-
- // Check for Oriya
- if (/[\u0B00-\u0B7F]/.test(text)) scripts.add('ori');
-
- // Check for Sinhala
- if (/[\u0D80-\u0DFF]/.test(text)) scripts.add('sin');
-
- // Check for Thai
- if (/[\u0E00-\u0E7F]/.test(text)) scripts.add('tha');
-
- // Check for Lao
- if (/[\u0E80-\u0EFF]/.test(text)) scripts.add('lao');
-
- // Check for Khmer
- if (/[\u1780-\u17FF]/.test(text)) scripts.add('khm');
-
- // Check for Myanmar
- if (/[\u1000-\u109F]/.test(text)) scripts.add('mya');
-
- // Check for Tibetan
- if (/[\u0F00-\u0FFF]/.test(text)) scripts.add('bod');
-
- // Check for Georgian
- if (/[\u10A0-\u10FF]/.test(text)) scripts.add('kat');
-
- // Check for Armenian
- if (/[\u0530-\u058F]/.test(text)) scripts.add('hye');
-
- // Check for Hebrew
- if (/[\u0590-\u05FF]/.test(text)) scripts.add('heb');
-
- // Check for Ethiopic
- if (/[\u1200-\u137F]/.test(text)) scripts.add('amh');
-
- // Check for Cherokee
- if (/[\u13A0-\u13FF]/.test(text)) scripts.add('chr');
-
- // Check for Syriac
- if (/[\u0700-\u074F]/.test(text)) scripts.add('syr');
-
- if (scripts.size === 0 || /[a-zA-Z]/.test(text)) {
- scripts.add('eng');
- }
-
- return Array.from(scripts);
-}
-
-export function getLanguageForChar(char: string): string {
- const code = char.charCodeAt(0);
-
- // Latin (Basic + Supplement + Extended)
- if (code <= 0x024F) return 'eng';
-
- // Japanese: Hiragana & Katakana
- if (
- (code >= 0x3040 && code <= 0x309F) || // Hiragana
- (code >= 0x30A0 && code <= 0x30FF) // Katakana
- ) return 'jpn';
-
- // Korean: Hangul Syllables & Jamo
- if (
- (code >= 0xAC00 && code <= 0xD7A3) || // Hangul Syllables
- (code >= 0x1100 && code <= 0x11FF) // Hangul Jamo
- ) return 'kor';
-
- // Chinese: CJK Unified Ideographs (Han)
- if (
- (code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified
- (code >= 0x3400 && code <= 0x4DBF) // CJK Ext A
- ) return 'chi_sim';
-
- // Arabic
- if ((code >= 0x0600 && code <= 0x06FF) || (code >= 0x0750 && code <= 0x077F) || (code >= 0x08A0 && code <= 0x08FF)) return 'ara';
-
- // Devanagari
- if (code >= 0x0900 && code <= 0x097F) return 'hin';
-
- // Bengali
- if (code >= 0x0980 && code <= 0x09FF) return 'ben';
-
- // Tamil
- if (code >= 0x0B80 && code <= 0x0BFF) return 'tam';
-
- // Telugu
- if (code >= 0x0C00 && code <= 0x0C7F) return 'tel';
-
- // Kannada
- if (code >= 0x0C80 && code <= 0x0CFF) return 'kan';
-
- // Malayalam
- if (code >= 0x0D00 && code <= 0x0D7F) return 'mal';
-
- // Gujarati
- if (code >= 0x0A80 && code <= 0x0AFF) return 'guj';
-
- // Punjabi (Gurmukhi)
- if (code >= 0x0A00 && code <= 0x0A7F) return 'pan';
-
- // Oriya
- if (code >= 0x0B00 && code <= 0x0B7F) return 'ori';
-
- // Sinhala
- if (code >= 0x0D80 && code <= 0x0DFF) return 'sin';
-
- // Thai
- if (code >= 0x0E00 && code <= 0x0E7F) return 'tha';
-
- // Lao
- if (code >= 0x0E80 && code <= 0x0EFF) return 'lao';
-
- // Khmer
- if (code >= 0x1780 && code <= 0x17FF) return 'khm';
-
- // Myanmar
- if (code >= 0x1000 && code <= 0x109F) return 'mya';
-
- // Tibetan
- if (code >= 0x0F00 && code <= 0x0FFF) return 'bod';
-
- // Georgian
- if (code >= 0x10A0 && code <= 0x10FF) return 'kat';
-
- // Armenian
- if (code >= 0x0530 && code <= 0x058F) return 'hye';
-
- // Hebrew
- if (code >= 0x0590 && code <= 0x05FF) return 'heb';
-
- // Ethiopic
- if (code >= 0x1200 && code <= 0x137F) return 'amh';
-
- // Cherokee
- if (code >= 0x13A0 && code <= 0x13FF) return 'chr';
-
- // Syriac
- if (code >= 0x0700 && code <= 0x074F) return 'syr';
-
- // Default to English (Latin)
- return 'eng';
-}
+import {
+ getFontAssetFileName,
+ getFontUrlForFamily,
+ languageToFontFamily,
+} from '../config/font-mappings.js';
+
+/** In-memory cache of font binaries, keyed by font family name. */
+const fontCache: Map<string, ArrayBuffer> = new Map();
+
+// Coordinates of the persistent IndexedDB font cache.
+const DB_NAME = 'bentopdf-fonts';
+const DB_VERSION = 1;
+const STORE_NAME = 'fonts';
+
+// NOTE(review): the type arguments were lost from this copy of the patch and
+// have been reconstructed. `ImportMetaEnv` is assumed to declare
+// VITE_OCR_FONT_BASE_URL (presumably in src/types/globals.d.ts) — confirm.
+type OcrFontEnv = Partial<Pick<ImportMetaEnv, 'VITE_OCR_FONT_BASE_URL'>>;
+
+/** Supplies `import.meta.env` as the default source of font settings. */
+function getDefaultFontEnv(): OcrFontEnv {
+  const buildEnv: OcrFontEnv = import.meta.env;
+  return buildEnv;
+}
+
+/**
+ * Trims a configured font base URL and drops its trailing slashes.
+ *
+ * @param url - Raw configured value, possibly missing or blank.
+ * @returns `undefined` when the value is missing or whitespace-only, otherwise
+ *   the trimmed value without trailing `/` characters.
+ */
+function normalizeFontBaseUrl(url?: string): string | undefined {
+  const candidate = url?.trim() ?? '';
+  return candidate.length > 0 ? candidate.replace(/\/+$/, '') : undefined;
+}
+
+/**
+ * Chooses the URL a font family is downloaded from.
+ *
+ * @param fontFamily - Font family to resolve.
+ * @param env - Settings to read `VITE_OCR_FONT_BASE_URL` from; defaults to the
+ *   build environment.
+ * @returns `<base>/<asset file name>` when a base URL is configured, otherwise
+ *   the family's default URL.
+ */
+export function resolveFontUrl(
+  fontFamily: string,
+  env: OcrFontEnv = getDefaultFontEnv()
+): string {
+  const baseUrl = normalizeFontBaseUrl(env.VITE_OCR_FONT_BASE_URL);
+
+  return baseUrl
+    ? `${baseUrl}/${getFontAssetFileName(fontFamily)}`
+    : getFontUrlForFamily(fontFamily);
+}
+
+/**
+ * Opens the font-cache IndexedDB database, creating the object store when the
+ * database is created or upgraded.
+ *
+ * @returns A promise resolving to the open connection; it rejects with the
+ *   request's error when the open request fails.
+ */
+async function openFontDB(): Promise<IDBDatabase> {
+  return new Promise<IDBDatabase>((resolve, reject) => {
+    const request = indexedDB.open(DB_NAME, DB_VERSION);
+
+    request.onerror = () => reject(request.error);
+    request.onsuccess = () => resolve(request.result);
+
+    request.onupgradeneeded = (event) => {
+      const db = (event.target as IDBOpenDBRequest).result;
+      // Guard so an upgrade never tries to create the store twice.
+      if (!db.objectStoreNames.contains(STORE_NAME)) {
+        db.createObjectStore(STORE_NAME);
+      }
+    };
+  });
+}
+
+/**
+ * Reads a font binary from the persistent IndexedDB cache.
+ *
+ * Best-effort: any failure (opening the database, starting the transaction,
+ * or the read request itself) is logged and reported as a cache miss.
+ *
+ * @param fontFamily - Font family name used as the object-store key.
+ * @returns The stored value, or `null` when nothing is stored or the read
+ *   fails.
+ */
+async function getCachedFontFromDB(
+  fontFamily: string
+): Promise<ArrayBuffer | null> {
+  try {
+    const db = await openFontDB();
+    try {
+      // `await` is required: returning the bare promise would let a rejection
+      // skip the surrounding catch and surface in the caller instead.
+      return await new Promise<ArrayBuffer | null>((resolve, reject) => {
+        const transaction = db.transaction(STORE_NAME, 'readonly');
+        const store = transaction.objectStore(STORE_NAME);
+        const request = store.get(fontFamily);
+
+        request.onsuccess = () => resolve(request.result || null);
+        request.onerror = () => reject(request.error);
+      });
+    } finally {
+      // Every call opens its own connection, so release it when done.
+      db.close();
+    }
+  } catch (error) {
+    console.warn('IndexedDB read failed:', error);
+    return null;
+  }
+}
+
+/**
+ * Stores a font binary in the persistent IndexedDB cache.
+ *
+ * Best-effort: any failure is logged and swallowed, so a broken cache never
+ * fails the caller that has already obtained the font.
+ *
+ * @param fontFamily - Font family name used as the object-store key.
+ * @param fontBuffer - Font bytes to store.
+ */
+async function saveFontToDB(
+  fontFamily: string,
+  fontBuffer: ArrayBuffer
+): Promise<void> {
+  try {
+    const db = await openFontDB();
+    try {
+      // `await` is required: returning the bare promise would let a write
+      // failure skip the surrounding catch and reject in the caller.
+      await new Promise<void>((resolve, reject) => {
+        const transaction = db.transaction(STORE_NAME, 'readwrite');
+        const store = transaction.objectStore(STORE_NAME);
+        const request = store.put(fontBuffer, fontFamily);
+
+        request.onsuccess = () => resolve();
+        request.onerror = () => reject(request.error);
+      });
+    } finally {
+      // Every call opens its own connection, so release it when done.
+      db.close();
+    }
+  } catch (error) {
+    console.warn('IndexedDB write failed:', error);
+  }
+}
+
+/**
+ * Returns the font binary to use for an OCR language.
+ *
+ * Lookup order: in-memory cache, IndexedDB cache, then a network fetch from
+ * `resolveFontUrl`. A fetched font is stored in both caches.
+ *
+ * @param lang - Language code looked up in `languageToFontFamily`; unmapped
+ *   codes use `'Noto Sans'`.
+ * @returns The font bytes.
+ * @throws The fetch error when the `'Noto Sans'` family itself cannot be
+ *   loaded. For any other family a failure is logged and the font for `'eng'`
+ *   is returned instead.
+ */
+export async function getFontForLanguage(lang: string): Promise<ArrayBuffer> {
+  const fontFamily = languageToFontFamily[lang] || 'Noto Sans';
+
+  // Single lookup instead of has() + get()! so no non-null assertion is needed.
+  const memoryHit = fontCache.get(fontFamily);
+  if (memoryHit) {
+    return memoryHit;
+  }
+
+  const cachedFont = await getCachedFontFromDB(fontFamily);
+  if (cachedFont) {
+    fontCache.set(fontFamily, cachedFont);
+    return cachedFont;
+  }
+
+  try {
+    const fontUrl = resolveFontUrl(fontFamily);
+
+    const fontResponse = await fetch(fontUrl);
+
+    if (!fontResponse.ok) {
+      throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`);
+    }
+
+    const fontBuffer = await fontResponse.arrayBuffer();
+
+    fontCache.set(fontFamily, fontBuffer);
+    await saveFontToDB(fontFamily, fontBuffer);
+
+    return fontBuffer;
+  } catch (error) {
+    console.warn(
+      `Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`,
+      error
+    );
+
+    // Only non-default families fall back; a failing default has no fallback.
+    if (fontFamily !== 'Noto Sans') {
+      return await getFontForLanguage('eng');
+    }
+
+    throw error;
+  }
+}
+
+/**
+ * Script probes in reporting order. Each pair is a character-class test and
+ * the language code reported when any character of the text matches it.
+ */
+const SCRIPT_DETECTORS: ReadonlyArray<readonly [RegExp, string]> = [
+  [/[\u3040-\u309F\u30A0-\u30FF]/, 'jpn'], // Hiragana & Katakana
+  [/[\uAC00-\uD7A3\u1100-\u11FF]/, 'kor'], // Hangul Syllables & Jamo
+  [/[\u4E00-\u9FFF\u3400-\u4DBF]/, 'chi_sim'], // CJK Unified & Ext A
+  [/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/, 'ara'], // Arabic
+  [/[\u0900-\u097F]/, 'hin'], // Devanagari (Hindi, Marathi, etc.)
+  [/[\u0980-\u09FF]/, 'ben'], // Bengali
+  [/[\u0B80-\u0BFF]/, 'tam'], // Tamil
+  [/[\u0C00-\u0C7F]/, 'tel'], // Telugu
+  [/[\u0C80-\u0CFF]/, 'kan'], // Kannada
+  [/[\u0D00-\u0D7F]/, 'mal'], // Malayalam
+  [/[\u0A80-\u0AFF]/, 'guj'], // Gujarati
+  [/[\u0A00-\u0A7F]/, 'pan'], // Punjabi (Gurmukhi)
+  [/[\u0B00-\u0B7F]/, 'ori'], // Oriya
+  [/[\u0D80-\u0DFF]/, 'sin'], // Sinhala
+  [/[\u0E00-\u0E7F]/, 'tha'], // Thai
+  [/[\u0E80-\u0EFF]/, 'lao'], // Lao
+  [/[\u1780-\u17FF]/, 'khm'], // Khmer
+  [/[\u1000-\u109F]/, 'mya'], // Myanmar
+  [/[\u0F00-\u0FFF]/, 'bod'], // Tibetan
+  [/[\u10A0-\u10FF]/, 'kat'], // Georgian
+  [/[\u0530-\u058F]/, 'hye'], // Armenian
+  [/[\u0590-\u05FF]/, 'heb'], // Hebrew
+  [/[\u1200-\u137F]/, 'amh'], // Ethiopic
+  [/[\u13A0-\u13FF]/, 'chr'], // Cherokee
+  [/[\u0700-\u074F]/, 'syr'], // Syriac
+];
+
+/**
+ * Lists the language codes whose scripts occur in a piece of text.
+ *
+ * @param text - Text to scan.
+ * @returns One code per detected script, in the order of `SCRIPT_DETECTORS`,
+ *   followed by `'eng'` when the text contains an ASCII letter or when no
+ *   other script was detected. The result is never empty.
+ */
+export function detectScripts(text: string): string[] {
+  const scripts = new Set<string>();
+
+  for (const [pattern, languageCode] of SCRIPT_DETECTORS) {
+    if (pattern.test(text)) {
+      scripts.add(languageCode);
+    }
+  }
+
+  if (scripts.size === 0 || /[a-zA-Z]/.test(text)) {
+    scripts.add('eng');
+  }
+
+  return Array.from(scripts);
+}
+
+/**
+ * Inclusive UTF-16 code-unit ranges and the language code each one maps to.
+ * The ranges do not overlap.
+ */
+const CHAR_RANGE_LANGUAGES: ReadonlyArray<readonly [number, number, string]> = [
+  [0x0000, 0x024f, 'eng'], // Latin (Basic + Supplement + Extended)
+  [0x3040, 0x309f, 'jpn'], // Hiragana
+  [0x30a0, 0x30ff, 'jpn'], // Katakana
+  [0xac00, 0xd7a3, 'kor'], // Hangul Syllables
+  [0x1100, 0x11ff, 'kor'], // Hangul Jamo
+  [0x4e00, 0x9fff, 'chi_sim'], // CJK Unified
+  [0x3400, 0x4dbf, 'chi_sim'], // CJK Ext A
+  [0x0600, 0x06ff, 'ara'], // Arabic
+  [0x0750, 0x077f, 'ara'],
+  [0x08a0, 0x08ff, 'ara'],
+  [0x0900, 0x097f, 'hin'], // Devanagari
+  [0x0980, 0x09ff, 'ben'], // Bengali
+  [0x0b80, 0x0bff, 'tam'], // Tamil
+  [0x0c00, 0x0c7f, 'tel'], // Telugu
+  [0x0c80, 0x0cff, 'kan'], // Kannada
+  [0x0d00, 0x0d7f, 'mal'], // Malayalam
+  [0x0a80, 0x0aff, 'guj'], // Gujarati
+  [0x0a00, 0x0a7f, 'pan'], // Punjabi (Gurmukhi)
+  [0x0b00, 0x0b7f, 'ori'], // Oriya
+  [0x0d80, 0x0dff, 'sin'], // Sinhala
+  [0x0e00, 0x0e7f, 'tha'], // Thai
+  [0x0e80, 0x0eff, 'lao'], // Lao
+  [0x1780, 0x17ff, 'khm'], // Khmer
+  [0x1000, 0x109f, 'mya'], // Myanmar
+  [0x0f00, 0x0fff, 'bod'], // Tibetan
+  [0x10a0, 0x10ff, 'kat'], // Georgian
+  [0x0530, 0x058f, 'hye'], // Armenian
+  [0x0590, 0x05ff, 'heb'], // Hebrew
+  [0x1200, 0x137f, 'amh'], // Ethiopic
+  [0x13a0, 0x13ff, 'chr'], // Cherokee
+  [0x0700, 0x074f, 'syr'], // Syriac
+];
+
+/**
+ * Maps a character to the language code of the script it belongs to.
+ *
+ * @param char - String whose first UTF-16 code unit is classified.
+ * @returns The code of the matching range, or `'eng'` when no range matches
+ *   (including for an empty string).
+ */
+export function getLanguageForChar(char: string): string {
+  const codeUnit = char.charCodeAt(0);
+
+  for (const [low, high, languageCode] of CHAR_RANGE_LANGUAGES) {
+    if (codeUnit >= low && codeUnit <= high) {
+      return languageCode;
+    }
+  }
+
+  // Default to English (Latin)
+  return 'eng';
+}
diff --git a/src/js/utils/ocr.ts b/src/js/utils/ocr.ts
index 5a38d39..931d3c1 100644
--- a/src/js/utils/ocr.ts
+++ b/src/js/utils/ocr.ts
@@ -1,7 +1,6 @@
import Tesseract from 'tesseract.js';
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
import fontkit from '@pdf-lib/fontkit';
-import * as pdfjsLib from 'pdfjs-dist';
import { getFontForLanguage } from './font-loader.js';
import { OcrPage, OcrLine } from '@/types';
import {
@@ -10,6 +9,7 @@ import {
calculateSpaceTransform,
} from './hocr-transform.js';
import { getPDFDocument } from './helpers.js';
+import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
export interface OcrOptions {
language: string;
@@ -134,11 +134,13 @@ export async function performOcr(
const { language, resolution, binarize, whitelist, onProgress } = options;
const progress = onProgress || (() => {});
- const worker = await Tesseract.createWorker(language, 1, {
- logger: function (m: { status: string; progress: number }) {
+ const worker = await createConfiguredTesseractWorker(
+ language,
+ 1,
+ function (m: { status: string; progress: number }) {
progress(m.status, m.progress || 0);
- },
- });
+ }
+ );
await worker.setParameters({
tessjs_create_hocr: '1',
diff --git a/src/js/utils/tesseract-language-availability.ts b/src/js/utils/tesseract-language-availability.ts
new file mode 100644
index 0000000..16cff30
--- /dev/null
+++ b/src/js/utils/tesseract-language-availability.ts
@@ -0,0 +1,132 @@
+import { tesseractLanguages } from '../config/tesseract-languages.js';
+
+/** Name of the build-time variable listing the OCR languages on offer. */
+export const TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY =
+  'VITE_TESSERACT_AVAILABLE_LANGUAGES' as const;
+
+// NOTE(review): the type arguments of `Pick` were lost from this copy of the
+// patch and have been reconstructed. `ImportMetaEnv` is assumed to declare
+// VITE_TESSERACT_AVAILABLE_LANGUAGES — confirm.
+type TesseractAvailabilityEnv = Partial<
+  Pick<ImportMetaEnv, typeof TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY>
+>;
+
+/** A language code that is a key of `tesseractLanguages`. */
+export type TesseractLanguageCode = keyof typeof tesseractLanguages;
+
+/** Supplies `import.meta.env` as the default source of availability settings. */
+function getDefaultEnv(): TesseractAvailabilityEnv {
+  const buildEnv: TesseractAvailabilityEnv = import.meta.env;
+  return buildEnv;
+}
+
+/**
+ * Turns a language specification into a clean list of codes.
+ *
+ * @param value - Either an array of codes or a single string in which codes
+ *   are separated by `+` or `,`.
+ * @returns The trimmed, non-empty codes with duplicates removed, in order of
+ *   first appearance.
+ */
+function normalizeLanguageCodes(value: string | string[]): string[] {
+  const pieces = Array.isArray(value) ? value : value.split(/[+,]/);
+  const cleaned = pieces
+    .map((piece) => piece.trim())
+    .filter((piece) => piece !== '');
+
+  // A Set keeps insertion order, so the first occurrence of each code wins.
+  return Array.from(new Set<string>(cleaned));
+}
+
+/**
+ * Formats a language code for display.
+ *
+ * @returns `"<name> (<code>)"` when `tesseractLanguages` has a name for the
+ *   code, otherwise the bare code.
+ */
+function formatLanguageLabel(code: string): string {
+  const displayName = tesseractLanguages[code as TesseractLanguageCode];
+  if (!displayName) {
+    return code;
+  }
+  return `${displayName} (${code})`;
+}
+
+/**
+ * Reads the configured list of available OCR languages.
+ *
+ * @param env - Settings to read `VITE_TESSERACT_AVAILABLE_LANGUAGES` from.
+ * @returns `null` when the variable is unset or blank, otherwise the
+ *   normalized codes (which may be an empty array).
+ */
+export function resolveConfiguredTesseractAvailableLanguages(
+  env: TesseractAvailabilityEnv = getDefaultEnv()
+): string[] | null {
+  const rawValue = env.VITE_TESSERACT_AVAILABLE_LANGUAGES?.trim();
+  return rawValue ? normalizeLanguageCodes(rawValue) : null;
+}
+
+/**
+ * Lists the `[code, name]` pairs of the OCR languages on offer.
+ *
+ * @param env - Settings holding the optional language list.
+ * @returns Every entry of `tesseractLanguages` when no list is configured,
+ *   otherwise only the entries whose code is in the list. Entries keep the
+ *   order of `tesseractLanguages`.
+ */
+export function getAvailableTesseractLanguageEntries(
+  env: TesseractAvailabilityEnv = getDefaultEnv()
+): Array<[TesseractLanguageCode, string]> {
+  const allowList = resolveConfiguredTesseractAvailableLanguages(env);
+  const entries = Object.entries(tesseractLanguages) as Array<
+    [TesseractLanguageCode, string]
+  >;
+
+  if (allowList === null) {
+    return entries;
+  }
+
+  const allowed = new Set(allowList);
+  return entries.filter((entry) => allowed.has(entry[0]));
+}
+
+/**
+ * Finds the requested languages that the configured list does not contain.
+ *
+ * @param requestedLanguages - Codes as an array or a `+`/`,` separated string.
+ * @param env - Settings holding the optional language list.
+ * @returns The normalized requested codes missing from the list; always empty
+ *   when no list is configured.
+ */
+export function getUnavailableTesseractLanguages(
+  requestedLanguages: string | string[],
+  env: TesseractAvailabilityEnv = getDefaultEnv()
+): string[] {
+  const allowList = resolveConfiguredTesseractAvailableLanguages(env);
+  if (allowList === null) {
+    return [];
+  }
+
+  const allowed = new Set(allowList);
+  const requested = normalizeLanguageCodes(requestedLanguages);
+  return requested.filter((code) => !allowed.has(code));
+}
+
+/** Joins the display labels of the given codes with `", "`. */
+export function formatTesseractLanguageList(codes: string[]): string {
+  const labels = codes.map((code) => formatLanguageLabel(code));
+  return labels.join(', ');
+}
+
+/**
+ * Builds the user-facing explanation for a rejected OCR language request.
+ *
+ * @param unavailableLanguages - Requested codes that are not on offer.
+ * @param availableLanguages - Codes that are on offer.
+ * @returns Three sentences separated by single spaces: what is bundled, what
+ *   is missing, and how to resolve it.
+ */
+function buildUnsupportedLanguageMessage(
+  unavailableLanguages: string[],
+  availableLanguages: string[]
+): string {
+  const missing = formatTesseractLanguageList(unavailableLanguages);
+  const bundled = formatTesseractLanguageList(availableLanguages);
+
+  return (
+    `This BentoPDF build only bundles OCR data for ${bundled}.` +
+    ' ' +
+    `The requested OCR language is not available: ${missing}.` +
+    ' ' +
+    'Choose one of the bundled languages or rebuild the air-gapped bundle with the missing language added to --ocr-languages.'
+  );
+}
+
+/**
+ * Error raised when an OCR request names languages outside the configured
+ * list. Its message is produced by `buildUnsupportedLanguageMessage`.
+ */
+export class UnsupportedOcrLanguageError extends Error {
+ // Requested codes that are not on offer.
+ readonly unavailableLanguages: string[];
+ // Codes that are on offer.
+ readonly availableLanguages: string[];
+
+ constructor(unavailableLanguages: string[], availableLanguages: string[]) {
+ super(
+ buildUnsupportedLanguageMessage(unavailableLanguages, availableLanguages)
+ );
+ // Explicit name so the error identifies itself when logged.
+ this.name = 'UnsupportedOcrLanguageError';
+ this.unavailableLanguages = unavailableLanguages;
+ this.availableLanguages = availableLanguages;
+ }
+}
+
+/**
+ * Rejects an OCR request that names languages outside the configured list.
+ *
+ * @param requestedLanguages - Codes as an array or a `+`/`,` separated string.
+ * @param env - Settings holding the optional language list.
+ * @throws UnsupportedOcrLanguageError when a list is configured and at least
+ *   one requested code is missing from it. Nothing is checked when no list is
+ *   configured.
+ */
+export function assertTesseractLanguagesAvailable(
+  requestedLanguages: string | string[],
+  env: TesseractAvailabilityEnv = getDefaultEnv()
+): void {
+  const bundled = resolveConfiguredTesseractAvailableLanguages(env);
+  if (bundled === null) {
+    return;
+  }
+
+  const missing = getUnavailableTesseractLanguages(requestedLanguages, env);
+  if (missing.length === 0) {
+    return;
+  }
+
+  throw new UnsupportedOcrLanguageError(missing, bundled);
+}
diff --git a/src/js/utils/tesseract-runtime.ts b/src/js/utils/tesseract-runtime.ts
new file mode 100644
index 0000000..3af7ff3
--- /dev/null
+++ b/src/js/utils/tesseract-runtime.ts
@@ -0,0 +1,130 @@
+import Tesseract from 'tesseract.js';
+import {
+ assertTesseractLanguagesAvailable,
+ TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
+} from './tesseract-language-availability.js';
+
+/** Build-time variables that point at self-hosted Tesseract assets. */
+const TESSERACT_ENV_KEYS = [
+  'VITE_TESSERACT_WORKER_URL',
+  'VITE_TESSERACT_CORE_URL',
+  'VITE_TESSERACT_LANG_URL',
+] as const;
+
+/** The asset variables plus the language-availability variable. */
+const TESSERACT_RUNTIME_ENV_KEYS = [
+  ...TESSERACT_ENV_KEYS,
+  TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
+] as const;
+
+type TesseractRuntimeEnvKey = (typeof TESSERACT_RUNTIME_ENV_KEYS)[number];
+
+// NOTE(review): the type arguments of `Pick` and `Partial` below were lost
+// from this copy of the patch and have been reconstructed. `ImportMetaEnv` is
+// assumed to declare the four VITE_TESSERACT_* variables, and
+// `Tesseract.WorkerOptions` is assumed to be the intended options type —
+// confirm against the original commit.
+export type TesseractAssetEnv = Partial<
+  Pick<ImportMetaEnv, TesseractRuntimeEnvKey>
+>;
+
+/** Normalized asset locations; a field is unset when not configured. */
+export interface TesseractAssetConfig {
+  workerPath?: string;
+  corePath?: string;
+  langPath?: string;
+}
+
+export type TesseractLoggerMessage = Tesseract.LoggerMessage;
+export type TesseractWorkerOptions = Partial<Tesseract.WorkerOptions>;
+export type TesseractWorker = Tesseract.Worker;
+
+/** Supplies `import.meta.env` as the default source of Tesseract settings. */
+function getDefaultTesseractAssetEnv(): TesseractAssetEnv {
+  const buildEnv: TesseractAssetEnv = import.meta.env;
+  return buildEnv;
+}
+
+/**
+ * Trims a directory URL and drops its trailing slashes.
+ *
+ * @returns `undefined` for a missing or blank value, otherwise the cleaned URL.
+ */
+function normalizeDirectoryUrl(url?: string): string | undefined {
+  const cleaned = url?.trim();
+  return cleaned ? cleaned.replace(/\/+$/, '') : undefined;
+}
+
+/**
+ * Trims a file URL and drops any trailing slashes.
+ *
+ * @returns `undefined` for a missing or blank value, otherwise the cleaned URL.
+ */
+function normalizeFileUrl(url?: string): string | undefined {
+  const cleaned = (url ?? '').trim();
+  if (cleaned === '') {
+    return undefined;
+  }
+  return cleaned.replace(/\/+$/, '');
+}
+
+/**
+ * Reads the three Tesseract asset locations from the environment.
+ *
+ * @param env - Settings to read the `VITE_TESSERACT_*_URL` variables from.
+ * @returns The normalized worker, core and language paths; each is
+ *   `undefined` when its variable is unset or blank.
+ */
+export function resolveTesseractAssetConfig(
+  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
+): TesseractAssetConfig {
+  const workerPath = normalizeFileUrl(env.VITE_TESSERACT_WORKER_URL);
+  const corePath = normalizeDirectoryUrl(env.VITE_TESSERACT_CORE_URL);
+  const langPath = normalizeDirectoryUrl(env.VITE_TESSERACT_LANG_URL);
+
+  return { workerPath, corePath, langPath };
+}
+
+/** Reports whether at least one of the three asset paths is set. */
+export function hasConfiguredTesseractOverrides(
+  config: TesseractAssetConfig = resolveTesseractAssetConfig()
+): boolean {
+  const paths = [config.workerPath, config.corePath, config.langPath];
+  return paths.some(Boolean);
+}
+
+/** Reports whether all three asset paths are set. */
+export function hasCompleteTesseractOverrides(
+  config: TesseractAssetConfig = resolveTesseractAssetConfig()
+): boolean {
+  const paths = [config.workerPath, config.corePath, config.langPath];
+  return paths.every(Boolean);
+}
+
+/**
+ * Names the asset variables that are still missing from a partial setup.
+ *
+ * @param config - Normalized asset configuration.
+ * @returns An empty list when no asset path is set at all; otherwise the
+ *   variable names whose path is unset, in `TESSERACT_ENV_KEYS` order.
+ */
+export function getIncompleteTesseractOverrideKeys(
+  config: TesseractAssetConfig = resolveTesseractAssetConfig()
+): Array<(typeof TESSERACT_ENV_KEYS)[number]> {
+  if (!hasConfiguredTesseractOverrides(config)) {
+    return [];
+  }
+
+  // Pair every variable name with the config value it populates.
+  const valueByEnvKey: Record<
+    (typeof TESSERACT_ENV_KEYS)[number],
+    string | undefined
+  > = {
+    VITE_TESSERACT_WORKER_URL: config.workerPath,
+    VITE_TESSERACT_CORE_URL: config.corePath,
+    VITE_TESSERACT_LANG_URL: config.langPath,
+  };
+
+  return TESSERACT_ENV_KEYS.filter((key) => !valueByEnvKey[key]);
+}
+
+/**
+ * Builds the options object passed to `Tesseract.createWorker`.
+ *
+ * @param logger - Optional progress logger, forwarded when provided.
+ * @param env - Settings holding the asset variables.
+ * @returns Only the logger (if any) when no asset path is configured;
+ *   otherwise the logger plus the three asset paths and `gzip: true`.
+ * @throws Error when some but not all asset paths are configured.
+ */
+export function buildTesseractWorkerOptions(
+  logger?: TesseractWorkerOptions['logger'],
+  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
+): TesseractWorkerOptions {
+  const config = resolveTesseractAssetConfig(env);
+  const loggerOption: TesseractWorkerOptions = logger ? { logger } : {};
+
+  if (!hasConfiguredTesseractOverrides(config)) {
+    return loggerOption;
+  }
+
+  if (hasCompleteTesseractOverrides(config)) {
+    return {
+      ...loggerOption,
+      workerPath: config.workerPath,
+      corePath: config.corePath,
+      langPath: config.langPath,
+      gzip: true,
+    };
+  }
+
+  const missing = getIncompleteTesseractOverrideKeys(config).join(', ');
+  throw new Error(
+    `Self-hosted OCR assets are partially configured. Set ${missing} together with the other Tesseract asset URLs.`
+  );
+}
+
+/**
+ * Creates a Tesseract worker using the configured asset locations.
+ *
+ * The language check runs first, so an unavailable language is rejected
+ * before `Tesseract.createWorker` is called.
+ *
+ * @param language - Language specification passed to Tesseract and to the
+ *   availability check.
+ * @param oem - OCR engine mode passed to Tesseract.
+ * @param logger - Optional progress logger.
+ * @param env - Settings holding the asset and availability variables.
+ * @returns The created worker.
+ * @throws UnsupportedOcrLanguageError when a requested language is not on the
+ *   configured list.
+ * @throws Error when the asset variables are only partially configured.
+ */
+export async function createConfiguredTesseractWorker(
+  language: string,
+  oem: Tesseract.OEM,
+  logger?: TesseractWorkerOptions['logger'],
+  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
+): Promise<TesseractWorker> {
+  assertTesseractLanguagesAvailable(language, env);
+
+  return Tesseract.createWorker(
+    language,
+    oem,
+    buildTesseractWorkerOptions(logger, env)
+  );
+}
diff --git a/src/pages/ocr-pdf.html b/src/pages/ocr-pdf.html
index d7d2368..0baa3a1 100644
--- a/src/pages/ocr-pdf.html
+++ b/src/pages/ocr-pdf.html
@@ -214,6 +214,10 @@
>None
+
diff --git a/src/tests/compare/ocr-page.test.ts b/src/tests/compare/ocr-page.test.ts
new file mode 100644
index 0000000..c98cfe2
--- /dev/null
+++ b/src/tests/compare/ocr-page.test.ts
@@ -0,0 +1,81 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Created through vi.hoisted so the mock function already exists when the
+// vi.mock factory below is evaluated.
+const { createConfiguredTesseractWorker } = vi.hoisted(() => ({
+ createConfiguredTesseractWorker: vi.fn(),
+}));
+
+// Stand-in for the Tesseract worker; only the two methods the tests assert on.
+const mockWorker = {
+ recognize: vi.fn(),
+ terminate: vi.fn(),
+};
+
+// Replace the runtime module so no real Tesseract worker is created.
+vi.mock('../../js/utils/tesseract-runtime', () => ({
+ createConfiguredTesseractWorker,
+}));
+
+import { recognizePageCanvas } from '../../js/compare/engine/ocr-page';
+
+// Tests for recognizePageCanvas using the mocked worker factory above.
+describe('compare OCR page recognition', () => {
+ // Reset all mocks so call counts and queued results do not leak between tests.
+ beforeEach(() => {
+ createConfiguredTesseractWorker.mockReset();
+ mockWorker.recognize.mockReset();
+ mockWorker.terminate.mockReset();
+ createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
+ });
+
+ it('uses the configured Tesseract worker and maps OCR words into compare text items', async () => {
+ const progress = vi.fn();
+ // Minimal canvas stub: only width and height are provided.
+ const canvas = {
+ width: 300,
+ height: 150,
+ } as HTMLCanvasElement;
+
+ // Two words sharing the same vertical extent (y0 = 20, y1 = 40).
+ mockWorker.recognize.mockResolvedValue({
+ data: {
+ words: [
+ {
+ text: 'Hello',
+ bbox: { x0: 10, y0: 20, x1: 60, y1: 40 },
+ },
+ {
+ text: 'world',
+ bbox: { x0: 70, y0: 20, x1: 120, y1: 40 },
+ },
+ ],
+ },
+ });
+
+ const model = await recognizePageCanvas(canvas, 'eng', progress);
+
+ expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
+ 'eng',
+ 1,
+ expect.any(Function)
+ );
+ expect(mockWorker.recognize).toHaveBeenCalledWith(canvas);
+ expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
+ expect(model.source).toBe('ocr');
+ expect(model.hasText).toBe(true);
+ expect(model.plainText).toContain('Hello');
+ // NOTE(review): a single item is expected for two words, presumably because
+ // they are merged into one line — confirm against ocr-page.ts.
+ expect(model.textItems).toHaveLength(1);
+
+ // The third factory argument is the logger; invoking it must reach `progress`.
+ const logger = createConfiguredTesseractWorker.mock
+ .calls[0][2] as (message: { status: string; progress: number }) => void;
+ logger({ status: 'recognizing text', progress: 0.5 });
+ expect(progress).toHaveBeenCalledWith('recognizing text', 0.5);
+ });
+
+ it('terminates the worker when compare OCR fails', async () => {
+ const canvas = {
+ width: 300,
+ height: 150,
+ } as HTMLCanvasElement;
+ mockWorker.recognize.mockRejectedValueOnce(new Error('compare ocr failed'));
+
+ await expect(recognizePageCanvas(canvas, 'eng')).rejects.toThrow(
+ 'compare ocr failed'
+ );
+
+ // The worker must be released even though recognition rejected.
+ expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
+ });
+});
diff --git a/src/tests/font-loader.test.ts b/src/tests/font-loader.test.ts
new file mode 100644
index 0000000..dfadcf0
--- /dev/null
+++ b/src/tests/font-loader.test.ts
@@ -0,0 +1,28 @@
+import { describe, expect, it } from 'vitest';
+
+import { getFontAssetFileName } from '../js/config/font-mappings';
+import { resolveFontUrl } from '../js/utils/font-loader';
+
+describe('font-loader', () => { // Covers font URL resolution and font asset file-name lookup.
+ it('uses the default public font URL when no offline font base URL is configured', () => {
+ expect(resolveFontUrl('Noto Sans', {})).toBe( // Empty env object: no VITE_OCR_FONT_BASE_URL override is present.
+ 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf'
+ );
+ });
+
+ it('builds a self-hosted font URL when an OCR font base URL is configured', () => {
+ expect(
+ resolveFontUrl('Noto Sans Arabic', {
+ VITE_OCR_FONT_BASE_URL: 'https://internal.example.com/wasm/ocr/fonts/', // Base URL is supplied with a trailing slash.
+ })
+ ).toBe( // Expected URL has exactly one slash between the base and the font file name.
+ 'https://internal.example.com/wasm/ocr/fonts/NotoSansArabic-Regular.ttf'
+ );
+ });
+
+ it('derives the bundled font asset file name from the default font URL', () => {
+ expect(getFontAssetFileName('Noto Sans SC')).toBe( // Family name and file name differ here (Noto Sans SC -> NotoSansCJKsc).
+ 'NotoSansCJKsc-Regular.otf'
+ );
+ });
+});
diff --git a/src/tests/ocr.test.ts b/src/tests/ocr.test.ts
new file mode 100644
index 0000000..97e175b
--- /dev/null
+++ b/src/tests/ocr.test.ts
@@ -0,0 +1,185 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { // Mock functions shared between the vi.mock factories below and the tests.
+ createConfiguredTesseractWorker,
+ getPDFDocument,
+ getFontForLanguage,
+ parseHocrDocument,
+} = vi.hoisted(() => ({ // vi.hoisted: created before the hoisted vi.mock factories run, so they can reference these.
+ createConfiguredTesseractWorker: vi.fn(),
+ getPDFDocument: vi.fn(),
+ getFontForLanguage: vi.fn(),
+ parseHocrDocument: vi.fn(),
+}));
+
+const mockWorker = { // Stand-in for the worker that createConfiguredTesseractWorker resolves to in these tests.
+ setParameters: vi.fn(),
+ recognize: vi.fn(),
+ terminate: vi.fn(),
+};
+
+const mockPdfPage = { // Stand-in for the source page returned by the mocked getPage.
+ getViewport: vi.fn(() => ({ width: 200, height: 100 })),
+ render: vi.fn(() => ({ promise: Promise.resolve() })), // Render task stub whose promise is already resolved.
+};
+
+const mockPdfOutputPage = { // Stand-in for the page returned by mockPdfDoc.addPage.
+ drawImage: vi.fn(),
+ drawText: vi.fn(),
+};
+
+const mockPdfDoc = { // Stand-in for the document returned by the mocked PDFDocument.create.
+ registerFontkit: vi.fn(),
+ embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })), // Font stub reporting a fixed text width of 12.
+ addPage: vi.fn(() => mockPdfOutputPage),
+ embedPng: vi.fn(async () => ({ id: 'png' })),
+ save: vi.fn(async () => new Uint8Array([1, 2, 3])), // Saved output is a fixed three-byte array.
+};
+
+vi.mock('../js/utils/tesseract-runtime', () => ({ // Replace the worker factory with the hoisted mock.
+ createConfiguredTesseractWorker,
+}));
+
+vi.mock('../js/utils/helpers.js', () => ({ // Only getPDFDocument is provided by this module mock.
+ getPDFDocument,
+}));
+
+vi.mock('../js/utils/font-loader.js', () => ({ // Only getFontForLanguage is provided by this module mock.
+ getFontForLanguage,
+}));
+
+vi.mock('../js/utils/hocr-transform.js', () => ({ // hOCR parsing and transform helpers are all replaced by mocks.
+ parseHocrDocument,
+ calculateWordTransform: vi.fn(),
+ calculateSpaceTransform: vi.fn(),
+}));
+
+vi.mock('pdf-lib', () => ({ // Minimal pdf-lib surface: PDFDocument.create, StandardFonts.Helvetica and rgb.
+ PDFDocument: {
+ create: vi.fn(async () => mockPdfDoc), // mockPdfDoc is only read when create() is invoked, not when the factory runs.
+ },
+ StandardFonts: {
+ Helvetica: 'Helvetica',
+ },
+ rgb: vi.fn(() => ({ r: 0, g: 0, b: 0 })),
+}));
+
+vi.mock('@pdf-lib/fontkit', () => ({ // fontkit is replaced by an empty default export.
+ default: {},
+}));
+
+import { performOcr } from '../js/utils/ocr';
+
+describe('performOcr', () => { // Exercises performOcr with the Tesseract worker, PDF libraries, canvas and FileReader stubbed.
+ const originalCreateElement = document.createElement.bind(document); // Bound copy: used for delegation and for restoring later.
+ const originalFileReader = globalThis.FileReader; // Saved so afterEach can put the real constructor back.
+
+ beforeEach(() => {
+ createConfiguredTesseractWorker.mockReset(); // Hoisted module mocks: reset, then re-stubbed further down.
+ getPDFDocument.mockReset();
+ getFontForLanguage.mockReset();
+ parseHocrDocument.mockReset();
+
+ mockWorker.setParameters.mockReset();
+ mockWorker.recognize.mockReset();
+ mockWorker.terminate.mockReset();
+ mockPdfPage.getViewport.mockClear(); // mockClear only drops recorded calls; the implementations given at declaration stay.
+ mockPdfPage.render.mockClear();
+ mockPdfOutputPage.drawImage.mockClear();
+ mockPdfOutputPage.drawText.mockClear();
+ mockPdfDoc.registerFontkit.mockClear();
+ mockPdfDoc.embedFont.mockClear();
+ mockPdfDoc.addPage.mockClear();
+ mockPdfDoc.embedPng.mockClear();
+ mockPdfDoc.save.mockClear();
+
+ createConfiguredTesseractWorker.mockResolvedValue(mockWorker); // Default: the factory resolves to the shared mock worker.
+ getPDFDocument.mockReturnValue({ // Loading-task stub: exposes a promise resolving to a one-page document.
+ promise: Promise.resolve({
+ numPages: 1,
+ getPage: vi.fn(async () => mockPdfPage),
+ }),
+ });
+ getFontForLanguage.mockResolvedValue(new Uint8Array([1, 2, 3])); // Font bytes stub.
+ mockWorker.recognize.mockResolvedValue({ // Default OCR result: plain text with an empty hOCR string.
+ data: {
+ text: 'Recognized text',
+ hocr: '',
+ },
+ });
+
+ document.createElement = ((tagName: string) => { // Intercept canvas creation only.
+ if (tagName !== 'canvas') {
+ return originalCreateElement(tagName); // Every other tag goes through the real implementation.
+ }
+
+ return { // Fake canvas exposing just width, height, getContext and toBlob.
+ width: 0,
+ height: 0,
+ getContext: vi.fn(() => ({
+ canvas: { width: 200, height: 100 },
+ getImageData: vi.fn(() => ({ data: new Uint8ClampedArray(4) })),
+ putImageData: vi.fn(),
+ })),
+ toBlob: vi.fn((callback: (blob: Blob) => void) => { // Invokes the callback synchronously with a small PNG-typed blob.
+ callback(
+ new Blob([new Uint8Array([1, 2, 3])], { type: 'image/png' })
+ );
+ }),
+ } as unknown as HTMLCanvasElement;
+ }) as typeof document.createElement;
+
+ globalThis.FileReader = class { // Fake FileReader with a preset result.
+ result: ArrayBuffer = new Uint8Array([1, 2, 3]).buffer;
+ onload: null | (() => void) = null;
+ onerror: null | (() => void) = null;
+
+ readAsArrayBuffer() { // Fires onload synchronously, so onload must already be assigned when this is called.
+ this.onload?.();
+ }
+ } as unknown as typeof FileReader;
+ });
+
+ afterEach(() => { // Undo the global overrides so they do not leak into other tests.
+ document.createElement = originalCreateElement; // NOTE(review): assigns the bound copy back rather than removing the override.
+ globalThis.FileReader = originalFileReader;
+ });
+
+ it('uses the configured Tesseract worker and terminates it after OCR completes', async () => {
+ const result = await performOcr(new Uint8Array([1, 2, 3]), { // Input bytes are arbitrary; PDF loading is mocked.
+ language: 'eng',
+ resolution: 2,
+ binarize: false,
+ whitelist: '',
+ });
+
+ expect(createConfiguredTesseractWorker).toHaveBeenCalledWith( // Expected arguments: language, 1, and a logger callback.
+ 'eng',
+ 1,
+ expect.any(Function)
+ );
+ expect(mockWorker.setParameters).toHaveBeenCalledWith({ // With an empty whitelist exactly these two parameters are expected.
+ tessjs_create_hocr: '1',
+ tessedit_pageseg_mode: '3',
+ });
+ expect(mockWorker.recognize).toHaveBeenCalledTimes(1); // One page in the stubbed document, one recognition call.
+ expect(mockWorker.terminate).toHaveBeenCalledTimes(1); // The worker is released after a successful run.
+ expect(result.fullText).toContain('Recognized text');
+ expect(result.pdfBytes).toBeInstanceOf(Uint8Array);
+ });
+
+ it('terminates the Tesseract worker when OCR fails', async () => {
+ mockWorker.recognize.mockRejectedValueOnce(new Error('ocr failed')); // Make recognition fail for this call only.
+
+ await expect(
+ performOcr(new Uint8Array([1, 2, 3]), {
+ language: 'eng',
+ resolution: 2,
+ binarize: false,
+ whitelist: '',
+ })
+ ).rejects.toThrow('ocr failed'); // The recognition error must reach the caller.
+
+ expect(mockWorker.terminate).toHaveBeenCalledTimes(1); // The worker is still terminated on the failure path.
+ });
+});
diff --git a/src/tests/tesseract-runtime.test.ts b/src/tests/tesseract-runtime.test.ts
new file mode 100644
index 0000000..748aaa7
--- /dev/null
+++ b/src/tests/tesseract-runtime.test.ts
@@ -0,0 +1,128 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { createWorker } = vi.hoisted(() => ({ // Created via vi.hoisted so the hoisted vi.mock factory below can reference it.
+ createWorker: vi.fn(),
+}));
+
+vi.mock('tesseract.js', () => ({ // Replace tesseract.js with a default export exposing only createWorker.
+ default: {
+ createWorker,
+ },
+}));
+
+import {
+ buildTesseractWorkerOptions,
+ createConfiguredTesseractWorker,
+ getIncompleteTesseractOverrideKeys,
+ hasCompleteTesseractOverrides,
+ hasConfiguredTesseractOverrides,
+ resolveTesseractAssetConfig,
+} from '../js/utils/tesseract-runtime';
+import {
+ assertTesseractLanguagesAvailable,
+ getAvailableTesseractLanguageEntries,
+ getUnavailableTesseractLanguages,
+ UnsupportedOcrLanguageError,
+} from '../js/utils/tesseract-language-availability';
+
+describe('tesseract-runtime', () => { // Covers asset-URL configuration, worker creation and language availability checks.
+ beforeEach(() => {
+ createWorker.mockReset(); // Start every test with no recorded calls and no stubbed result.
+ });
+
+ it('normalizes self-hosted OCR asset URLs', () => {
+ const config = resolveTesseractAssetConfig({ // Every input URL carries a trailing slash.
+ VITE_TESSERACT_WORKER_URL:
+ 'https://internal.example.com/ocr/worker.min.js/',
+ VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core/',
+ VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data/',
+ });
+
+ expect(config).toEqual({ // Expected paths are the same URLs without the trailing slash.
+ workerPath: 'https://internal.example.com/ocr/worker.min.js',
+ corePath: 'https://internal.example.com/ocr/core',
+ langPath: 'https://internal.example.com/ocr/lang-data',
+ });
+ expect(hasConfiguredTesseractOverrides(config)).toBe(true);
+ expect(hasCompleteTesseractOverrides(config)).toBe(true); // All three overrides are present.
+ });
+
+ it('returns logger-only options when no self-hosted OCR assets are configured', () => {
+ const logger = vi.fn();
+
+ expect(buildTesseractWorkerOptions(logger, {})).toEqual({ logger }); // Empty env: no path options are added.
+ expect(
+ hasConfiguredTesseractOverrides(resolveTesseractAssetConfig({}))
+ ).toBe(false);
+ });
+
+ it('throws on partial OCR asset configuration', () => {
+ const env = { // Worker and core URLs are set; the language-data URL is deliberately missing.
+ VITE_TESSERACT_WORKER_URL:
+ 'https://internal.example.com/ocr/worker.min.js',
+ VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
+ };
+
+ expect(
+ getIncompleteTesseractOverrideKeys(resolveTesseractAssetConfig(env))
+ ).toEqual(['VITE_TESSERACT_LANG_URL']); // The missing override is reported by its env variable name.
+ expect(() => buildTesseractWorkerOptions(undefined, env)).toThrow( // Building options with a partial config must throw.
+ 'Self-hosted OCR assets are partially configured'
+ );
+ });
+
+ it('passes configured OCR asset URLs to Tesseract.createWorker', async () => {
+ const logger = vi.fn();
+ createWorker.mockResolvedValue({ id: 'worker' }); // Placeholder worker; only the call arguments are asserted.
+
+ await createConfiguredTesseractWorker('eng', 1, logger, { // The fourth argument is the env object to read overrides from.
+ VITE_TESSERACT_WORKER_URL:
+ 'https://internal.example.com/ocr/worker.min.js',
+ VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
+ VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data',
+ });
+
+ expect(createWorker).toHaveBeenCalledWith('eng', 1, { // Language and second argument are forwarded; options carry the paths.
+ logger,
+ workerPath: 'https://internal.example.com/ocr/worker.min.js',
+ corePath: 'https://internal.example.com/ocr/core',
+ langPath: 'https://internal.example.com/ocr/lang-data',
+ gzip: true, // Expected alongside the three paths when overrides are configured.
+ });
+ });
+
+ it('filters OCR language entries when the build restricts bundled languages', () => {
+ expect(
+ getAvailableTesseractLanguageEntries({
+ VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu', // Comma-separated list of language codes.
+ })
+ ).toEqual([ // Entries are [code, display name] pairs.
+ ['eng', 'English'],
+ ['deu', 'German'],
+ ]);
+ });
+
+ it('reports unavailable OCR languages for restricted air-gap builds', () => {
+ expect(
+ getUnavailableTesseractLanguages('eng+fra', { // Requested languages are joined with '+'.
+ VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+ })
+ ).toEqual(['fra']); // Only the code missing from the available list is reported.
+
+ expect(() =>
+ assertTesseractLanguagesAvailable('eng+fra', {
+ VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+ })
+ ).toThrow(UnsupportedOcrLanguageError); // The assertion variant throws the dedicated error type.
+ });
+
+ it('blocks worker creation when OCR requests an unbundled language', async () => {
+ await expect(
+ createConfiguredTesseractWorker('fra', 1, undefined, { // 'fra' is not in the available list below.
+ VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
+ })
+ ).rejects.toThrow('This BentoPDF build only bundles OCR data for');
+
+ expect(createWorker).not.toHaveBeenCalled(); // The rejection happens before Tesseract.createWorker is called.
+ });
+});
diff --git a/src/types/globals.d.ts b/src/types/globals.d.ts
index 48c971c..aee6f2f 100644
--- a/src/types/globals.d.ts
+++ b/src/types/globals.d.ts
@@ -1 +1,15 @@
+/// <reference types="vite/client" />
+
+interface ImportMetaEnv { // Build-time env variables for self-hosted OCR assets; every key is optional.
+ readonly VITE_TESSERACT_WORKER_URL?: string; // URL of the Tesseract worker script.
+ readonly VITE_TESSERACT_CORE_URL?: string; // Base URL of the Tesseract core files.
+ readonly VITE_TESSERACT_LANG_URL?: string; // Base URL of the language data files.
+ readonly VITE_TESSERACT_AVAILABLE_LANGUAGES?: string; // Comma-separated language codes, e.g. 'eng,deu'.
+ readonly VITE_OCR_FONT_BASE_URL?: string; // Base URL under which OCR font files are served.
+}
+
+interface ImportMeta { // Types import.meta.env with the ImportMetaEnv keys declared in this file.
+ readonly env: ImportMetaEnv;
+}
+
declare const __SIMPLE_MODE__: boolean;