feat: integrate Tesseract.js with improved language availability and font handling

- Refactored OCR page recognition to utilize a configured Tesseract worker.
- Added functions to manage font URLs and asset filenames based on language.
- Implemented language availability checks and error handling for unsupported languages.
- Enhanced PDF workflow to display available OCR languages and handle user selections.
- Introduced utility functions for resolving Tesseract asset configurations.
- Added tests for OCR functionality, font loading, and Tesseract runtime behavior.
- Updated global types to include environment variables for Tesseract and font configurations.
This commit is contained in:
alam00000
2026-03-14 15:50:30 +05:30
parent 58c78b09d2
commit 77da6d7a7d
23 changed files with 1906 additions and 564 deletions

View File

@@ -12,6 +12,15 @@ VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.1
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/ VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/ VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
# OCR assets (optional)
# Set VITE_TESSERACT_WORKER_URL, VITE_TESSERACT_CORE_URL, and VITE_TESSERACT_LANG_URL together for self-hosted or air-gapped OCR.
# Leave empty to use Tesseract.js runtime defaults.
VITE_TESSERACT_WORKER_URL=
VITE_TESSERACT_CORE_URL=
VITE_TESSERACT_LANG_URL=
VITE_TESSERACT_AVAILABLE_LANGUAGES=
VITE_OCR_FONT_BASE_URL=
# Default UI language (build-time) # Default UI language (build-time)
# Supported: en, ar, be, fr, de, es, zh, zh-TW, vi, tr, id, it, pt, nl, da # Supported: en, ar, be, fr, de, es, zh, zh-TW, vi, tr, id, it, pt, nl, da
VITE_DEFAULT_LANGUAGE= VITE_DEFAULT_LANGUAGE=

View File

@@ -35,6 +35,18 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
# OCR asset URLs (optional, used for self-hosted or air-gapped OCR)
ARG VITE_TESSERACT_WORKER_URL
ARG VITE_TESSERACT_CORE_URL
ARG VITE_TESSERACT_LANG_URL
ARG VITE_TESSERACT_AVAILABLE_LANGUAGES
ARG VITE_OCR_FONT_BASE_URL
ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL
ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL
ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL
ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES
ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL
# Default UI language (e.g. en, fr, de, es, zh, ar) # Default UI language (e.g. en, fr, de, es, zh, ar)
ARG VITE_DEFAULT_LANGUAGE ARG VITE_DEFAULT_LANGUAGE
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE

View File

@@ -32,6 +32,17 @@ ENV VITE_WASM_PYMUPDF_URL=$VITE_WASM_PYMUPDF_URL
ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL ENV VITE_WASM_GS_URL=$VITE_WASM_GS_URL
ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL ENV VITE_WASM_CPDF_URL=$VITE_WASM_CPDF_URL
ARG VITE_TESSERACT_WORKER_URL
ARG VITE_TESSERACT_CORE_URL
ARG VITE_TESSERACT_LANG_URL
ARG VITE_TESSERACT_AVAILABLE_LANGUAGES
ARG VITE_OCR_FONT_BASE_URL
ENV VITE_TESSERACT_WORKER_URL=$VITE_TESSERACT_WORKER_URL
ENV VITE_TESSERACT_CORE_URL=$VITE_TESSERACT_CORE_URL
ENV VITE_TESSERACT_LANG_URL=$VITE_TESSERACT_LANG_URL
ENV VITE_TESSERACT_AVAILABLE_LANGUAGES=$VITE_TESSERACT_AVAILABLE_LANGUAGES
ENV VITE_OCR_FONT_BASE_URL=$VITE_OCR_FONT_BASE_URL
# Default UI language (e.g. en, fr, de, es, zh, ar) # Default UI language (e.g. en, fr, de, es, zh, ar)
ARG VITE_DEFAULT_LANGUAGE ARG VITE_DEFAULT_LANGUAGE
ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE ENV VITE_DEFAULT_LANGUAGE=$VITE_DEFAULT_LANGUAGE

View File

@@ -465,6 +465,11 @@ The default URLs are set in `.env.production`:
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/ VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/ VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/ VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
VITE_TESSERACT_WORKER_URL=
VITE_TESSERACT_CORE_URL=
VITE_TESSERACT_LANG_URL=
VITE_TESSERACT_AVAILABLE_LANGUAGES=
VITE_OCR_FONT_BASE_URL=
``` ```
To override via Docker build args: To override via Docker build args:
@@ -474,11 +479,18 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \ --build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \ --build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \ --build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
--build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \
--build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \
--build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
--build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \
-t bentopdf . -t bentopdf .
``` ```
To disable a module (require manual user config via Advanced Settings), set its variable to an empty string. To disable a module (require manual user config via Advanced Settings), set its variable to an empty string.
For OCR, either leave all `VITE_TESSERACT_*` variables empty and use the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If your self-hosted bundle only includes a subset such as `eng,deu`, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu` so the UI only shows bundled languages and OCR fails with a descriptive message for unsupported ones. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR text-layer fonts.
Users can also override these defaults per-browser via **Advanced Settings** in the UI — user overrides take priority over the environment defaults. Users can also override these defaults per-browser via **Advanced Settings** in the UI — user overrides take priority over the environment defaults.
> [!IMPORTANT] > [!IMPORTANT]
@@ -496,6 +508,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa
git clone https://github.com/alam00000/bentopdf.git git clone https://github.com/alam00000/bentopdf.git
cd bentopdf cd bentopdf
# Show supported OCR language codes (for --ocr-languages)
bash scripts/prepare-airgap.sh --list-ocr-languages
# Search OCR language codes by name or abbreviation
bash scripts/prepare-airgap.sh --search-ocr-language german
# Interactive mode — prompts for all options # Interactive mode — prompts for all options
bash scripts/prepare-airgap.sh bash scripts/prepare-airgap.sh
@@ -508,7 +526,9 @@ This produces a bundle directory containing:
``` ```
bentopdf-airgap-bundle/ bentopdf-airgap-bundle/
bentopdf.tar # Docker image bentopdf.tar # Docker image
*.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF) *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract)
tesseract-langdata/ # OCR traineddata files
ocr-fonts/ # OCR text-layer font files
setup.sh # Setup script for the air-gapped side setup.sh # Setup script for the air-gapped side
README.md # Instructions README.md # Instructions
``` ```
@@ -525,23 +545,28 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
<details> <details>
<summary><strong>Script options</strong></summary> <summary><strong>Script options</strong></summary>
| Flag | Description | Default | | Flag | Description | Default |
| ----------------------- | ------------------------------------------------ | --------------------------------- | | ------------------------------ | ------------------------------------------------ | --------------------------------- |
| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ | | `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
| `--image-name <name>` | Docker image tag | `bentopdf` | | `--image-name <name>` | Docker image tag | `bentopdf` |
| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` | | `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
| `--simple-mode` | Enable Simple Mode | off | | `--simple-mode` | Enable Simple Mode | off |
| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` | | `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ | | `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
| `--brand-name <name>` | Custom brand name | _(none)_ | | `--brand-name <name>` | Custom brand name | _(none)_ |
| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ | | `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
| `--footer-text <text>` | Custom footer text | _(none)_ | | `--footer-text <text>` | Custom footer text | _(none)_ |
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` | | `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` |
| `--skip-docker` | Skip Docker build and export | off | | `--list-ocr-languages` | Print supported OCR codes and names, then exit | off |
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off | | `--search-ocr-language <term>` | Search OCR codes by name or abbreviation | off |
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
| `--skip-docker` | Skip Docker build and export | off |
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
</details> </details>
The interactive prompt also accepts `list` to print the full supported Tesseract code list and `search <term>` to find matches such as `search german` or `search chi`.
> [!IMPORTANT] > [!IMPORTANT]
> WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`. > WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
@@ -550,12 +575,18 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
<details> <details>
<summary>If you prefer to do it manually without the script</summary> <summary>If you prefer to do it manually without the script</summary>
**Step 1: Download the WASM packages** (on a machine with internet) **Step 1: Download the WASM and OCR packages** (on a machine with internet)
```bash ```bash
npm pack @bentopdf/pymupdf-wasm@0.11.16 npm pack @bentopdf/pymupdf-wasm@0.11.16
npm pack @bentopdf/gs-wasm npm pack @bentopdf/gs-wasm
npm pack coherentpdf npm pack coherentpdf
npm pack tesseract.js@7.0.0
npm pack tesseract.js-core@7.0.0
mkdir -p tesseract-langdata
curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
mkdir -p ocr-fonts
curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
``` ```
**Step 2: Build the Docker image with internal URLs** **Step 2: Build the Docker image with internal URLs**
@@ -568,6 +599,10 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \ --build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \ --build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \ --build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
--build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
--build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
--build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
--build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
-t bentopdf . -t bentopdf .
``` ```
@@ -585,6 +620,10 @@ Copy these files via USB drive, internal artifact repository, or approved transf
- `bentopdf-pymupdf-wasm-0.11.16.tgz` — PyMuPDF WASM package - `bentopdf-pymupdf-wasm-0.11.16.tgz` — PyMuPDF WASM package
- `bentopdf-gs-wasm-*.tgz` — Ghostscript WASM package - `bentopdf-gs-wasm-*.tgz` — Ghostscript WASM package
- `coherentpdf-*.tgz` — CoherentPDF WASM package - `coherentpdf-*.tgz` — CoherentPDF WASM package
- `tesseract.js-7.0.0.tgz` — Tesseract worker package
- `tesseract.js-core-7.0.0.tgz` — Tesseract core runtime package
- `tesseract-langdata/` — OCR traineddata files
- `ocr-fonts/` — OCR text-layer font files
**Step 5: Set up inside the air-gapped network** **Step 5: Set up inside the air-gapped network**
@@ -593,16 +632,23 @@ Copy these files via USB drive, internal artifact repository, or approved transf
docker load -i bentopdf.tar docker load -i bentopdf.tar
# Extract the WASM packages # Extract the WASM packages
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
tar xzf bentopdf-pymupdf-wasm-0.11.16.tgz -C ./wasm/pymupdf --strip-components=1 tar xzf bentopdf-pymupdf-wasm-0.11.16.tgz -C ./wasm/pymupdf --strip-components=1
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1 tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1 tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
TEMP_TESS=$(mktemp -d)
tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js
rm -rf "$TEMP_TESS"
tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1
cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
cp ./ocr-fonts/* ./wasm/ocr/fonts/
# Run BentoPDF # Run BentoPDF
docker run -d -p 3000:8080 --restart unless-stopped bentopdf docker run -d -p 3000:8080 --restart unless-stopped bentopdf
``` ```
Make sure the WASM files are accessible at the URLs you configured in Step 2. Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
</details> </details>
@@ -613,6 +659,10 @@ Make sure the WASM files are accessible at the URLs you configured in Step 2.
> VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ > VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
> VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ > VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
> VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ > VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
> VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js
> VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core
> VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data
> VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts
> ``` > ```
**Subdirectory Hosting:** **Subdirectory Hosting:**

View File

@@ -34,6 +34,9 @@ docker compose up -d
Then open `http://localhost:3000` in your browser. Then open `http://localhost:3000` in your browser.
> [!NOTE]
> If you are preparing an air-gapped OCR deployment, you must host the OCR text-layer fonts internally in addition to the Tesseract worker, core runtime, and traineddata files. The full setup is documented in [Self-Hosting](/self-hosting/), including `VITE_OCR_FONT_BASE_URL` and the bundled `ocr-fonts/` directory.
### Option 3: Build from Source ### Option 3: Build from Source
```bash ```bash

View File

@@ -32,5 +32,11 @@ features:
details: Convert, edit, merge, split, compress, sign, OCR, and more. Everything you need in one place. details: Convert, edit, merge, split, compress, sign, OCR, and more. Everything you need in one place.
- icon: 🌐 - icon: 🌐
title: Self-Hostable title: Self-Hostable
details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or any static hosting. details: Deploy on your own infrastructure. Docker, Vercel, Netlify, AWS, or fully air-gapped environments with self-hosted OCR workers, language data, and text-layer fonts.
## Offline OCR
If you self-host BentoPDF in an air-gapped or offline environment, OCR needs more than the Tesseract worker and traineddata files. Searchable PDF output also needs the OCR text-layer fonts to be served internally.
See [Self-Hosting](/self-hosting/) for the full setup, including `VITE_OCR_FONT_BASE_URL`, the bundled `ocr-fonts/` directory, and the updated air-gap workflow.
--- ---

View File

@@ -90,20 +90,27 @@ docker run -d -p 3000:8080 bentopdf:custom
## Environment Variables ## Environment Variables
| Variable | Description | Default | | Variable | Description | Default |
| ----------------------- | ------------------------------- | -------------------------------------------------------------- | | ------------------------------------ | ------------------------------------------- | -------------------------------------------------------------- |
| `SIMPLE_MODE` | Build without LibreOffice tools | `false` | | `SIMPLE_MODE` | Build without LibreOffice tools | `false` |
| `BASE_URL` | Deploy to subdirectory | `/` | | `BASE_URL` | Deploy to subdirectory | `/` |
| `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` | | `VITE_WASM_PYMUPDF_URL` | PyMuPDF WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/` |
| `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` | | `VITE_WASM_GS_URL` | Ghostscript WASM module URL | `https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/` |
| `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` | | `VITE_WASM_CPDF_URL` | CoherentPDF WASM module URL | `https://cdn.jsdelivr.net/npm/coherentpdf/dist/` |
| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` | | `VITE_TESSERACT_WORKER_URL` | OCR worker script URL | _(empty; use Tesseract.js default CDN)_ |
| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` | | `VITE_TESSERACT_CORE_URL` | OCR core runtime directory | _(empty; use Tesseract.js default CDN)_ |
| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` | | `VITE_TESSERACT_LANG_URL` | OCR traineddata directory | _(empty; use Tesseract.js default CDN)_ |
| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` | | `VITE_TESSERACT_AVAILABLE_LANGUAGES` | Comma-separated OCR languages exposed in UI | _(empty; show full catalog)_ |
| `VITE_OCR_FONT_BASE_URL` | OCR text-layer font directory | _(empty; use remote Noto font URLs)_ |
| `VITE_DEFAULT_LANGUAGE` | Default UI language | `en` |
| `VITE_BRAND_NAME` | Custom brand name | `BentoPDF` |
| `VITE_BRAND_LOGO` | Logo path relative to `public/` | `images/favicon-no-bg.svg` |
| `VITE_FOOTER_TEXT` | Custom footer/copyright text | `© 2026 BentoPDF. All rights reserved.` |
WASM module URLs are pre-configured with CDN defaults — all advanced features work out of the box. Override these for air-gapped or self-hosted deployments. WASM module URLs are pre-configured with CDN defaults — all advanced features work out of the box. Override these for air-gapped or self-hosted deployments.
For OCR, leave `VITE_TESSERACT_WORKER_URL`, `VITE_TESSERACT_CORE_URL`, and `VITE_TESSERACT_LANG_URL` empty to use the default online assets, or set all three together for self-hosted/offline OCR. Partial OCR overrides are rejected because the worker, core runtime, and traineddata directory must match. For fully offline searchable PDF output, also set `VITE_OCR_FONT_BASE_URL` so the OCR text-layer fonts are loaded from your internal server instead of the public Noto font URLs.
`VITE_DEFAULT_LANGUAGE` sets the UI language for first-time visitors. Supported values: `en`, `ar`, `be`, `fr`, `de`, `es`, `zh`, `zh-TW`, `vi`, `tr`, `id`, `it`, `pt`, `nl`, `da`. Users can still switch languages — this only changes the default. `VITE_DEFAULT_LANGUAGE` sets the UI language for first-time visitors. Supported values: `en`, `ar`, `be`, `fr`, `de`, `es`, `zh`, `zh-TW`, `vi`, `tr`, `id`, `it`, `pt`, `nl`, `da`. Users can still switch languages — this only changes the default.
Example: Example:
@@ -137,35 +144,59 @@ Branding works in both full mode and Simple Mode, and can be combined with all o
```bash ```bash
# 1. On a machine WITH internet — download WASM packages # 1. On a machine WITH internet — look up the OCR language codes you want to bundle (optional)
bash scripts/prepare-airgap.sh --list-ocr-languages
bash scripts/prepare-airgap.sh --search-ocr-language german
# 2. Download WASM/OCR packages
npm pack @bentopdf/pymupdf-wasm@0.11.14 npm pack @bentopdf/pymupdf-wasm@0.11.14
npm pack @bentopdf/gs-wasm npm pack @bentopdf/gs-wasm
npm pack coherentpdf npm pack coherentpdf
npm pack tesseract.js@7.0.0
npm pack tesseract.js-core@7.0.0
mkdir -p tesseract-langdata
curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
mkdir -p ocr-fonts
curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
# 2. Build the image with your internal server URLs # 3. Build the image with your internal server URLs
docker build \ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \ --build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \ --build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \ --build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
--build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
--build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
--build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng \
--build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
-t bentopdf . -t bentopdf .
# 3. Export the image # 4. Export the image
docker save bentopdf -o bentopdf.tar docker save bentopdf -o bentopdf.tar
# 4. Transfer bentopdf.tar + the .tgz WASM packages into the air-gapped network # 5. Transfer bentopdf.tar + the .tgz packages + tesseract-langdata/ + ocr-fonts/ into the air-gapped network
# 5. Inside the air-gapped network — load and run # 6. Inside the air-gapped network — load and run
docker load -i bentopdf.tar docker load -i bentopdf.tar
# Extract WASM packages to your internal web server # Extract WASM packages to your internal web server
mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf mkdir -p /var/www/wasm/pymupdf /var/www/wasm/gs /var/www/wasm/cpdf /var/www/wasm/ocr/core /var/www/wasm/ocr/lang-data /var/www/wasm/ocr/fonts
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C /var/www/wasm/pymupdf --strip-components=1 tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C /var/www/wasm/pymupdf --strip-components=1
tar xzf bentopdf-gs-wasm-*.tgz -C /var/www/wasm/gs --strip-components=1 tar xzf bentopdf-gs-wasm-*.tgz -C /var/www/wasm/gs --strip-components=1
tar xzf coherentpdf-*.tgz -C /var/www/wasm/cpdf --strip-components=1 tar xzf coherentpdf-*.tgz -C /var/www/wasm/cpdf --strip-components=1
TEMP_TESS=$(mktemp -d)
tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
cp "$TEMP_TESS/package/dist/worker.min.js" /var/www/wasm/ocr/worker.min.js
rm -rf "$TEMP_TESS"
tar xzf tesseract.js-core-7.0.0.tgz -C /var/www/wasm/ocr/core --strip-components=1
cp ./tesseract-langdata/*.traineddata.gz /var/www/wasm/ocr/lang-data/
cp ./ocr-fonts/* /var/www/wasm/ocr/fonts/
# Run BentoPDF # Run BentoPDF
docker run -d -p 3000:8080 --restart unless-stopped bentopdf docker run -d -p 3000:8080 --restart unless-stopped bentopdf
``` ```
Use the codes printed by `bash scripts/prepare-airgap.sh --list-ocr-languages`, or search by name with `bash scripts/prepare-airgap.sh --search-ocr-language <term>`, for `--ocr-languages`. When you build with a restricted OCR subset, pass the same codes to `VITE_TESSERACT_AVAILABLE_LANGUAGES` so the app only shows bundled languages. For full offline OCR output, also host the bundled `ocr-fonts/` directory and point `VITE_OCR_FONT_BASE_URL` at it.
Set a variable to empty string to disable that module (users must configure manually via Advanced Settings). Set a variable to empty string to disable that module (users must configure manually via Advanced Settings).
## Custom User ID (PUID/PGID) ## Custom User ID (PUID/PGID)

View File

@@ -175,6 +175,11 @@ These are set in `.env.production` and baked into the build:
VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/ VITE_WASM_PYMUPDF_URL=https://cdn.jsdelivr.net/npm/@bentopdf/pymupdf-wasm@0.11.16/
VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/ VITE_WASM_GS_URL=https://cdn.jsdelivr.net/npm/@bentopdf/gs-wasm/assets/
VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/ VITE_WASM_CPDF_URL=https://cdn.jsdelivr.net/npm/coherentpdf/dist/
VITE_TESSERACT_WORKER_URL=
VITE_TESSERACT_CORE_URL=
VITE_TESSERACT_LANG_URL=
VITE_TESSERACT_AVAILABLE_LANGUAGES=
VITE_OCR_FONT_BASE_URL=
``` ```
### Overriding WASM URLs ### Overriding WASM URLs
@@ -187,6 +192,11 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \ --build-arg VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \ --build-arg VITE_WASM_GS_URL=https://your-server.com/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \ --build-arg VITE_WASM_CPDF_URL=https://your-server.com/cpdf/ \
--build-arg VITE_TESSERACT_WORKER_URL=https://your-server.com/ocr/worker.min.js \
--build-arg VITE_TESSERACT_CORE_URL=https://your-server.com/ocr/core \
--build-arg VITE_TESSERACT_LANG_URL=https://your-server.com/ocr/lang-data \
--build-arg VITE_TESSERACT_AVAILABLE_LANGUAGES=eng,deu \
--build-arg VITE_OCR_FONT_BASE_URL=https://your-server.com/ocr/fonts \
-t bentopdf . -t bentopdf .
# Or via .env.production before building from source # Or via .env.production before building from source
@@ -195,6 +205,8 @@ VITE_WASM_PYMUPDF_URL=https://your-server.com/pymupdf/ npm run build
To disable a module entirely (require manual user config via Advanced Settings), set its variable to an empty string. To disable a module entirely (require manual user config via Advanced Settings), set its variable to an empty string.
For OCR, either leave all `VITE_TESSERACT_*` variables empty and keep the default online assets, or set the worker/core/lang URLs together for self-hosted/offline OCR. If you bundle only specific OCR languages, also set `VITE_TESSERACT_AVAILABLE_LANGUAGES` to the same comma-separated codes so the UI only offers installed languages and unsupported selections fail with a descriptive error. For fully offline searchable-PDF output, also set `VITE_OCR_FONT_BASE_URL` to the internal directory that serves the bundled OCR fonts.
Users can also override these defaults at any time via **Advanced Settings** in the UI — user overrides stored in the browser take priority over environment defaults. Users can also override these defaults at any time via **Advanced Settings** in the UI — user overrides stored in the browser take priority over environment defaults.
### Air-Gapped / Offline Deployment ### Air-Gapped / Offline Deployment
@@ -209,6 +221,12 @@ The included `prepare-airgap.sh` script automates the entire process — downloa
git clone https://github.com/alam00000/bentopdf.git git clone https://github.com/alam00000/bentopdf.git
cd bentopdf cd bentopdf
# Show supported OCR language codes (for --ocr-languages)
bash scripts/prepare-airgap.sh --list-ocr-languages
# Search OCR language codes by name or abbreviation
bash scripts/prepare-airgap.sh --search-ocr-language german
# Interactive mode — prompts for all options # Interactive mode — prompts for all options
bash scripts/prepare-airgap.sh bash scripts/prepare-airgap.sh
@@ -221,7 +239,9 @@ This produces a bundle directory:
``` ```
bentopdf-airgap-bundle/ bentopdf-airgap-bundle/
bentopdf.tar # Docker image bentopdf.tar # Docker image
*.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF) *.tgz # WASM packages (PyMuPDF, Ghostscript, CoherentPDF, Tesseract)
tesseract-langdata/ # OCR traineddata files
ocr-fonts/ # OCR text-layer font files
setup.sh # Setup script for the air-gapped side setup.sh # Setup script for the air-gapped side
README.md # Instructions README.md # Instructions
``` ```
@@ -237,20 +257,25 @@ The setup script loads the Docker image, extracts WASM files, and optionally sta
**Script options:** **Script options:**
| Flag | Description | Default | | Flag | Description | Default |
| ----------------------- | ------------------------------------------------ | --------------------------------- | | ------------------------------ | ------------------------------------------------ | --------------------------------- |
| `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ | | `--wasm-base-url <url>` | Where WASMs will be hosted internally | _(required, prompted if missing)_ |
| `--image-name <name>` | Docker image tag | `bentopdf` | | `--image-name <name>` | Docker image tag | `bentopdf` |
| `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` | | `--output-dir <path>` | Output bundle directory | `./bentopdf-airgap-bundle` |
| `--simple-mode` | Enable Simple Mode | off | | `--simple-mode` | Enable Simple Mode | off |
| `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` | | `--base-url <path>` | Subdirectory base URL (e.g. `/pdf/`) | `/` |
| `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ | | `--language <code>` | Default UI language (e.g. `fr`, `de`) | _(none)_ |
| `--brand-name <name>` | Custom brand name | _(none)_ | | `--brand-name <name>` | Custom brand name | _(none)_ |
| `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ | | `--brand-logo <path>` | Logo path relative to `public/` | _(none)_ |
| `--footer-text <text>` | Custom footer text | _(none)_ | | `--footer-text <text>` | Custom footer text | _(none)_ |
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` | | `--ocr-languages <list>` | Comma-separated OCR languages to bundle | `eng` |
| `--skip-docker` | Skip Docker build and export | off | | `--list-ocr-languages` | Print supported OCR codes and names, then exit | off |
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off | | `--search-ocr-language <term>` | Search OCR codes by name or abbreviation | off |
| `--dockerfile <path>` | Dockerfile to use | `Dockerfile` |
| `--skip-docker` | Skip Docker build and export | off |
| `--skip-wasm` | Skip WASM download (reuse existing `.tgz` files) | off |
The interactive prompt also accepts `list` to print the full supported Tesseract code list and `search <term>` to find matches such as `search german` or `search chi`.
::: warning Same-Origin Requirement ::: warning Same-Origin Requirement
WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`. WASM files must be served from the **same origin** as the BentoPDF app. Web Workers use `importScripts()` which cannot load scripts cross-origin. For example, if BentoPDF runs at `https://internal.example.com`, the WASM base URL should also be `https://internal.example.com/wasm`.
@@ -261,12 +286,18 @@ WASM files must be served from the **same origin** as the BentoPDF app. Web Work
<details> <details>
<summary>If you prefer to do it manually without the script</summary> <summary>If you prefer to do it manually without the script</summary>
**Step 1: Download the WASM packages** (on a machine with internet) **Step 1: Download the WASM and OCR packages** (on a machine with internet)
```bash ```bash
npm pack @bentopdf/pymupdf-wasm@0.11.14 npm pack @bentopdf/pymupdf-wasm@0.11.14
npm pack @bentopdf/gs-wasm npm pack @bentopdf/gs-wasm
npm pack coherentpdf npm pack coherentpdf
npm pack tesseract.js@7.0.0
npm pack tesseract.js-core@7.0.0
mkdir -p tesseract-langdata
curl -fsSL https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng/4.0.0_best_int/eng.traineddata.gz -o tesseract-langdata/eng.traineddata.gz
mkdir -p ocr-fonts
curl -fsSL https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf -o ocr-fonts/NotoSans-Regular.ttf
``` ```
**Step 2: Build the Docker image with internal URLs** **Step 2: Build the Docker image with internal URLs**
@@ -279,6 +310,10 @@ docker build \
--build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \ --build-arg VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ \
--build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \ --build-arg VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ \
--build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \ --build-arg VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ \
--build-arg VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js \
--build-arg VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core \
--build-arg VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data \
--build-arg VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts \
-t bentopdf . -t bentopdf .
``` ```
@@ -293,7 +328,9 @@ docker save bentopdf -o bentopdf.tar
Copy via USB, internal artifact repo, or approved transfer method: Copy via USB, internal artifact repo, or approved transfer method:
- `bentopdf.tar` — the Docker image - `bentopdf.tar` — the Docker image
- The three `.tgz` WASM packages from Step 1 - The five `.tgz` WASM/OCR packages from Step 1
- The `tesseract-langdata/` directory from Step 1
- The `ocr-fonts/` directory from Step 1
**Step 5: Set up inside the air-gapped network** **Step 5: Set up inside the air-gapped network**
@@ -302,16 +339,23 @@ Copy via USB, internal artifact repo, or approved transfer method:
docker load -i bentopdf.tar docker load -i bentopdf.tar
# Extract WASM packages # Extract WASM packages
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1 tar xzf bentopdf-pymupdf-wasm-0.11.14.tgz -C ./wasm/pymupdf --strip-components=1
tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1 tar xzf bentopdf-gs-wasm-*.tgz -C ./wasm/gs --strip-components=1
tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1 tar xzf coherentpdf-*.tgz -C ./wasm/cpdf --strip-components=1
TEMP_TESS=$(mktemp -d)
tar xzf tesseract.js-7.0.0.tgz -C "$TEMP_TESS"
cp "$TEMP_TESS/package/dist/worker.min.js" ./wasm/ocr/worker.min.js
rm -rf "$TEMP_TESS"
tar xzf tesseract.js-core-7.0.0.tgz -C ./wasm/ocr/core --strip-components=1
cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
cp ./ocr-fonts/* ./wasm/ocr/fonts/
# Run BentoPDF # Run BentoPDF
docker run -d -p 3000:8080 --restart unless-stopped bentopdf docker run -d -p 3000:8080 --restart unless-stopped bentopdf
``` ```
Make sure the WASM files are accessible at the URLs you configured in Step 2. Make sure the files are accessible at the URLs you configured in Step 2, including `.../ocr/worker.min.js`, `.../ocr/core`, `.../ocr/lang-data`, and `.../ocr/fonts`.
</details> </details>
@@ -322,6 +366,10 @@ Set the variables in `.env.production` before running `npm run build`:
VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/ VITE_WASM_PYMUPDF_URL=https://internal-server.example.com/wasm/pymupdf/
VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/ VITE_WASM_GS_URL=https://internal-server.example.com/wasm/gs/
VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/ VITE_WASM_CPDF_URL=https://internal-server.example.com/wasm/cpdf/
VITE_TESSERACT_WORKER_URL=https://internal-server.example.com/wasm/ocr/worker.min.js
VITE_TESSERACT_CORE_URL=https://internal-server.example.com/wasm/ocr/core
VITE_TESSERACT_LANG_URL=https://internal-server.example.com/wasm/ocr/lang-data
VITE_OCR_FONT_BASE_URL=https://internal-server.example.com/wasm/ocr/fonts
``` ```
::: :::

View File

@@ -13,6 +13,8 @@ set -euo pipefail
# Usage: # Usage:
# bash scripts/prepare-airgap.sh --wasm-base-url https://internal.example.com/wasm # bash scripts/prepare-airgap.sh --wasm-base-url https://internal.example.com/wasm
# bash scripts/prepare-airgap.sh # interactive mode # bash scripts/prepare-airgap.sh # interactive mode
# bash scripts/prepare-airgap.sh --ocr-languages eng,deu,fra
# bash scripts/prepare-airgap.sh --search-ocr-language german
# #
# See --help for all options. # See --help for all options.
# ============================================================ # ============================================================
@@ -54,6 +56,110 @@ DOCKERFILE="Dockerfile"
SKIP_DOCKER=false SKIP_DOCKER=false
SKIP_WASM=false SKIP_WASM=false
INTERACTIVE=false INTERACTIVE=false
OCR_LANGUAGES="eng"
TESSDATA_VERSION="4.0.0_best_int"
LIST_OCR_LANGUAGES=false
SEARCH_OCR_LANGUAGE_TERM=""
TESSERACT_LANGUAGE_CONFIG="src/js/config/tesseract-languages.ts"
FONT_MAPPING_CONFIG="src/js/config/font-mappings.ts"
SUPPORTED_OCR_LANGUAGES_RAW=""
OCR_FONT_MANIFEST_RAW=""
# Populate SUPPORTED_OCR_LANGUAGES_RAW with one "<code><TAB><name>" line per
# entry found in $TESSERACT_LANGUAGE_CONFIG. The result is cached in that
# global, so later calls return immediately. Exits the script with status 1
# when the config file is missing or when no entry could be parsed.
load_supported_ocr_languages() {
# Already loaded by an earlier call; reuse the cached list.
if [ -n "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then
return
fi
if [ ! -f "$TESSERACT_LANGUAGE_CONFIG" ]; then
error "Missing OCR language config: ${TESSERACT_LANGUAGE_CONFIG}"
exit 1
fi
# Scrape lines shaped like  code: 'Name'  from the TypeScript source with a
# regex rather than evaluating the file. Only single-quoted values match.
# NOTE(review): names written with double quotes or containing an apostrophe
# would be skipped or truncated; confirm the config never uses them.
SUPPORTED_OCR_LANGUAGES_RAW=$(node -e "const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const languages = []; const pattern = /^\\s*([a-z0-9_]+):\\s*'([^']+)'/gm; let match; while ((match = pattern.exec(source)) !== null) { languages.push(match[1] + '\\t' + match[2]); } process.stdout.write(languages.join('\\n'));" "$TESSERACT_LANGUAGE_CONFIG")
# An empty result means the regex matched nothing; treat that as fatal.
if [ -z "$SUPPORTED_OCR_LANGUAGES_RAW" ]; then
error "Failed to load supported OCR languages from ${TESSERACT_LANGUAGE_CONFIG}"
exit 1
fi
}
# Succeed (status 0) when $1 equals a code in the first tab-separated column
# of the supported-language table; fail with status 1 otherwise.
is_supported_ocr_language() {
  local wanted="$1"

  load_supported_ocr_languages

  # Scan every row (no early exit) and report the outcome through awk's
  # exit status.
  awk -F '\t' -v wanted="$wanted" '
    BEGIN { status = 1 }
    $1 == wanted { status = 0 }
    END { exit status }
  ' <<< "$SUPPORTED_OCR_LANGUAGES_RAW"
}
# Print the full table of supported OCR language codes and their names,
# followed by a usage example for --ocr-languages.
show_supported_ocr_languages() {
  load_supported_ocr_languages

  printf '\n'
  echo -e "${BOLD}Supported OCR languages:${NC}"
  printf '%s\n' " Use the code in the left column for --ocr-languages." ""
  # One row per language: code left-aligned in a 12-character column, then the name.
  awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }' <<< "$SUPPORTED_OCR_LANGUAGES_RAW"
  printf '%s\n' "" " Example: --ocr-languages eng,deu,fra,spa" ""
}
# Print every supported OCR language whose code or name contains the query
# (case-insensitive substring match).
#   $1 - search term
# Returns 0 when at least one language matched, and 1 when nothing matched
# or the query was empty. An empty query returns instead of exiting so the
# interactive prompt loop survives a bare "search " and can ask again.
show_matching_ocr_languages() {
  local query="$1"

  load_supported_ocr_languages

  if [ -z "$query" ]; then
    error "OCR language search requires a non-empty query."
    return 1
  fi

  # Keep the rows in "<code><TAB><name>" form so they can be formatted below.
  local matches
  matches=$(printf '%s\n' "$SUPPORTED_OCR_LANGUAGES_RAW" | awk -F '\t' -v query="$query" '
    BEGIN {
      normalized = tolower(query)
    }
    {
      code = tolower($1)
      name = tolower($2)
      if (index(code, normalized) || index(name, normalized)) {
        printf "%s\t%s\n", $1, $2
      }
    }
  ')

  echo ""
  echo -e "${BOLD}OCR language search:${NC} ${query}"
  if [ -z "$matches" ]; then
    echo " No supported OCR languages matched that query."
    echo " Tip: run --list-ocr-languages to browse the full list."
    echo ""
    return 1
  fi

  echo " Matching codes for --ocr-languages:"
  echo ""
  printf '%s\n' "$matches" | awk -F '\t' '{ printf " %-12s %s\n", $1, $2 }'
  echo ""
}
# Populate OCR_FONT_MANIFEST_RAW with one "<family><TAB><url><TAB><file name>"
# line per font needed for the languages in $OCR_LANGUAGES, parsed out of
# $FONT_MAPPING_CONFIG. The result is cached in that global, so it reflects
# the value of $OCR_LANGUAGES at the first call. Exits the script with
# status 1 when the config file is missing or the manifest comes back empty.
load_required_ocr_fonts() {
# Already resolved by an earlier call; reuse the cached manifest.
if [ -n "$OCR_FONT_MANIFEST_RAW" ]; then
return
fi
if [ ! -f "$FONT_MAPPING_CONFIG" ]; then
error "Missing OCR font mapping config: ${FONT_MAPPING_CONFIG}"
exit 1
fi
# The node script splits the config at 'export const fontFamilyToUrl': the
# part before it is scraped for  lang: 'Family',  pairs and the part after it
# for  'Family': 'url',  pairs. 'Noto Sans' is always included, and it is also
# the fallback for languages without a mapping and families without a URL.
# Families are sorted; the file name is the last path segment of the URL.
# NOTE(review): the language-key regex is [a-z_]+ while the language list and
# the CLI validation accept [a-z0-9_]+; a code containing a digit would fall
# back to 'Noto Sans' here. Confirm no such key exists in the mapping.
OCR_FONT_MANIFEST_RAW=$(node -e "const fs = require('fs'); const source = fs.readFileSync(process.argv[1], 'utf8'); const selected = (process.argv[2] || '').split(',').map((value) => value.trim()).filter(Boolean); const sections = source.split('export const fontFamilyToUrl'); const languageSection = sections[0] || ''; const fontSection = sections[1] || ''; const languageToFamily = {}; const fontFamilyToUrl = {}; let match; const languagePattern = /^\s*([a-z_]+):\s*'([^']+)',/gm; while ((match = languagePattern.exec(languageSection)) !== null) { languageToFamily[match[1]] = match[2]; } const fontPattern = /^\s*'([^']+)':\s*'([^']+)',/gm; while ((match = fontPattern.exec(fontSection)) !== null) { fontFamilyToUrl[match[1]] = match[2]; } const families = new Set(['Noto Sans']); for (const lang of selected) { families.add(languageToFamily[lang] || 'Noto Sans'); } const lines = Array.from(families).sort().map((family) => { const url = fontFamilyToUrl[family] || fontFamilyToUrl['Noto Sans']; const fileName = url.split('/').pop(); return [family, url, fileName].join('\t'); }); process.stdout.write(lines.join('\n'));" "$FONT_MAPPING_CONFIG" "$OCR_LANGUAGES")
if [ -z "$OCR_FONT_MANIFEST_RAW" ]; then
error "Failed to resolve OCR font assets from ${FONT_MAPPING_CONFIG}"
exit 1
fi
}
# --- Usage --- # --- Usage ---
usage() { usage() {
@@ -80,6 +186,10 @@ OPTIONS:
--brand-name <name> Custom brand name --brand-name <name> Custom brand name
--brand-logo <path> Logo path relative to public/ --brand-logo <path> Logo path relative to public/
--footer-text <text> Custom footer text --footer-text <text> Custom footer text
--ocr-languages <list> Comma-separated OCR languages to bundle
(default: eng)
--list-ocr-languages Print supported OCR language codes and exit
--search-ocr-language Search supported OCR languages by code or name
--skip-docker Skip Docker build and export --skip-docker Skip Docker build and export
--skip-wasm Skip WASM download (reuse existing .tgz files) --skip-wasm Skip WASM download (reuse existing .tgz files)
--help Show this help message --help Show this help message
@@ -91,6 +201,7 @@ EXAMPLES:
# Full automation # Full automation
bash scripts/prepare-airgap.sh \ bash scripts/prepare-airgap.sh \
--wasm-base-url https://internal.example.com/wasm \ --wasm-base-url https://internal.example.com/wasm \
--ocr-languages eng,deu,fra \
--brand-name "AcmePDF" \ --brand-name "AcmePDF" \
--language fr --language fr
@@ -98,6 +209,12 @@ EXAMPLES:
bash scripts/prepare-airgap.sh \ bash scripts/prepare-airgap.sh \
--wasm-base-url https://internal.example.com/wasm \ --wasm-base-url https://internal.example.com/wasm \
--skip-docker --skip-docker
# Show all supported OCR language codes
bash scripts/prepare-airgap.sh --list-ocr-languages
# Search OCR languages by code or human-readable name
bash scripts/prepare-airgap.sh --search-ocr-language german
EOF EOF
exit 0 exit 0
} }
@@ -115,6 +232,9 @@ while [[ $# -gt 0 ]]; do
--brand-name) BRAND_NAME="$2"; shift 2 ;; --brand-name) BRAND_NAME="$2"; shift 2 ;;
--brand-logo) BRAND_LOGO="$2"; shift 2 ;; --brand-logo) BRAND_LOGO="$2"; shift 2 ;;
--footer-text) FOOTER_TEXT="$2"; shift 2 ;; --footer-text) FOOTER_TEXT="$2"; shift 2 ;;
--ocr-languages) OCR_LANGUAGES="$2"; shift 2 ;;
--list-ocr-languages) LIST_OCR_LANGUAGES=true; shift ;;
--search-ocr-language) SEARCH_OCR_LANGUAGE_TERM="$2"; shift 2 ;;
--dockerfile) DOCKERFILE="$2"; shift 2 ;; --dockerfile) DOCKERFILE="$2"; shift 2 ;;
--skip-docker) SKIP_DOCKER=true; shift ;; --skip-docker) SKIP_DOCKER=true; shift ;;
--skip-wasm) SKIP_WASM=true; shift ;; --skip-wasm) SKIP_WASM=true; shift ;;
@@ -132,6 +252,18 @@ if [ ! -f "package.json" ] || [ ! -f "src/js/const/cdn-version.ts" ]; then
exit 1 exit 1
fi fi
# Informational modes: print the requested OCR language information and
# exit without running the rest of the script.
if [ "$LIST_OCR_LANGUAGES" = true ]; then
  show_supported_ocr_languages
  exit 0
fi

if [ -n "$SEARCH_OCR_LANGUAGE_TERM" ]; then
  # The exit status mirrors the search result: 0 on a match, 1 otherwise.
  show_matching_ocr_languages "$SEARCH_OCR_LANGUAGE_TERM" || exit 1
  exit 0
fi
# --- Check prerequisites --- # --- Check prerequisites ---
check_prerequisites() { check_prerequisites() {
local missing=false local missing=false
@@ -141,6 +273,11 @@ check_prerequisites() {
missing=true missing=true
fi fi
if [ "$SKIP_WASM" = false ] && ! command -v curl &>/dev/null; then
error "curl is required to download OCR language data."
missing=true
fi
if [ "$SKIP_DOCKER" = false ] && ! command -v docker &>/dev/null; then if [ "$SKIP_DOCKER" = false ] && ! command -v docker &>/dev/null; then
error "docker is required but not found (use --skip-docker to skip)." error "docker is required but not found (use --skip-docker to skip)."
missing=true missing=true
@@ -156,9 +293,11 @@ read_versions() {
PYMUPDF_VERSION=$(grep "pymupdf:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'") PYMUPDF_VERSION=$(grep "pymupdf:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
GS_VERSION=$(grep "ghostscript:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'") GS_VERSION=$(grep "ghostscript:" src/js/const/cdn-version.ts | grep -o "'[^']*'" | tr -d "'")
APP_VERSION=$(node -p "require('./package.json').version") APP_VERSION=$(node -p "require('./package.json').version")
TESSERACT_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js'].version")
TESSERACT_CORE_VERSION=$(node -p "require('./package-lock.json').packages['node_modules/tesseract.js-core'].version")
if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ]; then if [ -z "$PYMUPDF_VERSION" ] || [ -z "$GS_VERSION" ] || [ -z "$TESSERACT_VERSION" ] || [ -z "$TESSERACT_CORE_VERSION" ]; then
error "Failed to read WASM versions from src/js/const/cdn-version.ts" error "Failed to read external asset versions from the repository metadata"
exit 1 exit 1
fi fi
} }
@@ -175,6 +314,8 @@ interactive_mode() {
echo " PyMuPDF: ${PYMUPDF_VERSION}" echo " PyMuPDF: ${PYMUPDF_VERSION}"
echo " Ghostscript: ${GS_VERSION}" echo " Ghostscript: ${GS_VERSION}"
echo " CoherentPDF: latest" echo " CoherentPDF: latest"
echo " Tesseract.js: ${TESSERACT_VERSION}"
echo " OCR Data: ${TESSDATA_VERSION}"
echo "" echo ""
# [1] WASM base URL (REQUIRED) # [1] WASM base URL (REQUIRED)
@@ -256,8 +397,35 @@ interactive_mode() {
DOCKERFILE="${input:-$DOCKERFILE}" DOCKERFILE="${input:-$DOCKERFILE}"
echo "" echo ""
# [8] Output directory (optional) # [8] OCR languages (optional)
echo -e "${BOLD}[8/8] Output Directory ${GREEN}(optional)${NC}" echo -e "${BOLD}[8/9] OCR Languages ${GREEN}(optional)${NC}"
echo " Comma-separated traineddata files to bundle for offline OCR."
echo " Enter Tesseract language codes such as: eng,deu,fra,spa"
echo " Type 'list' to print the full supported language list."
echo " Type 'search <term>' to find codes by name or abbreviation."
while true; do
read -r -p " OCR languages [${OCR_LANGUAGES}]: " input
if [ -z "${input:-}" ]; then
break
fi
if [ "$input" = "list" ]; then
show_supported_ocr_languages
continue
fi
if [[ "$input" == search\ * ]]; then
search_query="${input#search }"
if ! show_matching_ocr_languages "$search_query"; then
warn "No OCR language matched '${search_query}'."
fi
continue
fi
OCR_LANGUAGES="$input"
break
done
echo ""
# [9] Output directory (optional)
echo -e "${BOLD}[9/9] Output Directory ${GREEN}(optional)${NC}"
read -r -p " Path [${OUTPUT_DIR}]: " input read -r -p " Path [${OUTPUT_DIR}]: " input
OUTPUT_DIR="${input:-$OUTPUT_DIR}" OUTPUT_DIR="${input:-$OUTPUT_DIR}"
@@ -274,6 +442,7 @@ interactive_mode() {
[ -n "$BRAND_NAME" ] && echo " Brand Logo: ${BRAND_LOGO:-images/favicon-no-bg.svg (default)}" [ -n "$BRAND_NAME" ] && echo " Brand Logo: ${BRAND_LOGO:-images/favicon-no-bg.svg (default)}"
[ -n "$BRAND_NAME" ] && echo " Footer Text: ${FOOTER_TEXT:-(default)}" [ -n "$BRAND_NAME" ] && echo " Footer Text: ${FOOTER_TEXT:-(default)}"
echo " Base URL: ${BASE_URL:-/ (root)}" echo " Base URL: ${BASE_URL:-/ (root)}"
echo " OCR Languages: ${OCR_LANGUAGES}"
echo " Output: ${OUTPUT_DIR}" echo " Output: ${OUTPUT_DIR}"
echo "" echo ""
read -r -p " Proceed? (Y/n): " input read -r -p " Proceed? (Y/n): " input
@@ -321,6 +490,7 @@ filesize() {
check_prerequisites check_prerequisites
read_versions read_versions
load_supported_ocr_languages
# If no WASM base URL provided, go interactive # If no WASM base URL provided, go interactive
if [ -z "$WASM_BASE_URL" ]; then if [ -z "$WASM_BASE_URL" ]; then
@@ -338,6 +508,34 @@ if [ -n "$LANGUAGE" ]; then
fi fi
fi fi
# Normalize the requested OCR languages: strip whitespace, drop empty and
# duplicate entries, and reject codes that are malformed or unsupported.
# The cleaned list is written back to OCR_LANGUAGES (comma-separated) and to
# the NORMALIZED_OCR_LANGUAGES array, then the matching fonts are resolved.
IFS=',' read -r -a OCR_LANGUAGE_ARRAY <<< "$OCR_LANGUAGES"
NORMALIZED_OCR_LANGUAGES=()
# Comma-delimited record of accepted codes, used for duplicate detection
# without relying on associative arrays.
SEEN_OCR_LANGUAGES=","
# The ${array[@]+...} form keeps `set -u` from aborting on an empty array in
# bash versions older than 4.4, so the friendly error below is still reached.
for raw_lang in ${OCR_LANGUAGE_ARRAY[@]+"${OCR_LANGUAGE_ARRAY[@]}"}; do
  lang=$(echo "$raw_lang" | tr -d '[:space:]')
  if [ -z "$lang" ]; then
    continue
  fi
  if [[ ! "$lang" =~ ^[a-z0-9_]+$ ]]; then
    error "Invalid OCR language code: ${lang}"
    error "Use comma-separated Tesseract codes such as eng,deu,fra,chi_sim"
    exit 1
  fi
  # Skip repeats so each language is downloaded and advertised only once.
  case "$SEEN_OCR_LANGUAGES" in
    *",${lang},"*) continue ;;
  esac
  if ! is_supported_ocr_language "$lang"; then
    error "Unsupported OCR language code: ${lang}"
    error "Run with --list-ocr-languages or --search-ocr-language <term> to find supported Tesseract codes."
    exit 1
  fi
  SEEN_OCR_LANGUAGES="${SEEN_OCR_LANGUAGES}${lang},"
  NORMALIZED_OCR_LANGUAGES+=("$lang")
done

if [ ${#NORMALIZED_OCR_LANGUAGES[@]} -eq 0 ]; then
  error "At least one OCR language must be included."
  exit 1
fi

OCR_LANGUAGES=$(IFS=','; echo "${NORMALIZED_OCR_LANGUAGES[*]}")
load_required_ocr_fonts
# Validate WASM base URL format # Validate WASM base URL format
if [[ ! "$WASM_BASE_URL" =~ ^https?:// ]]; then if [[ ! "$WASM_BASE_URL" =~ ^https?:// ]]; then
error "WASM base URL must start with http:// or https://" error "WASM base URL must start with http:// or https://"
@@ -353,11 +551,15 @@ WASM_BASE_URL="${WASM_BASE_URL%/}"
WASM_PYMUPDF_URL="${WASM_BASE_URL}/pymupdf/" WASM_PYMUPDF_URL="${WASM_BASE_URL}/pymupdf/"
WASM_GS_URL="${WASM_BASE_URL}/gs/" WASM_GS_URL="${WASM_BASE_URL}/gs/"
WASM_CPDF_URL="${WASM_BASE_URL}/cpdf/" WASM_CPDF_URL="${WASM_BASE_URL}/cpdf/"
OCR_TESSERACT_WORKER_URL="${WASM_BASE_URL}/ocr/worker.min.js"
OCR_TESSERACT_CORE_URL="${WASM_BASE_URL}/ocr/core"
OCR_TESSERACT_LANG_URL="${WASM_BASE_URL}/ocr/lang-data"
OCR_FONT_BASE_URL="${WASM_BASE_URL}/ocr/fonts"
echo "" echo ""
echo -e "${BOLD}============================================================${NC}" echo -e "${BOLD}============================================================${NC}"
echo -e "${BOLD} BentoPDF Air-Gapped Bundle Preparation${NC}" echo -e "${BOLD} BentoPDF Air-Gapped Bundle Preparation${NC}"
echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION}${NC}" echo -e "${BOLD} App: v${APP_VERSION} | PyMuPDF: ${PYMUPDF_VERSION} | GS: ${GS_VERSION} | OCR: ${TESSERACT_VERSION}${NC}"
echo -e "${BOLD}============================================================${NC}" echo -e "${BOLD}============================================================${NC}"
# --- Phase 1: Prepare output directory --- # --- Phase 1: Prepare output directory ---
@@ -398,6 +600,27 @@ if [ "$SKIP_WASM" = true ]; then
error "Missing: coherentpdf-*.tgz" error "Missing: coherentpdf-*.tgz"
wasm_missing=true wasm_missing=true
fi fi
# Verify that every OCR asset a previous run should have downloaded is present.
# The [0-9] after "tesseract.js-" keeps this glob from also matching
# tesseract.js-core-*.tgz, which would hide a missing worker package.
if ! ls "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz &>/dev/null; then
  error "Missing: tesseract.js-*.tgz"
  wasm_missing=true
fi
if ! ls "$OUTPUT_DIR"/tesseract.js-core-*.tgz &>/dev/null; then
  error "Missing: tesseract.js-core-*.tgz"
  wasm_missing=true
fi
# One traineddata archive is expected per selected OCR language.
for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do
  if [ ! -f "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz" ]; then
    error "Missing: tesseract-langdata/${lang}.traineddata.gz"
    wasm_missing=true
  fi
done
# Manifest rows are "<family><TAB><url><TAB><file name>"; the URL is not
# needed for a presence check.
while IFS=$'\t' read -r font_family font_url font_file; do
  [ -z "$font_file" ] && continue
  if [ ! -f "$OUTPUT_DIR/ocr-fonts/${font_file}" ]; then
    error "Missing: ocr-fonts/${font_file} (${font_family})"
    wasm_missing=true
  fi
done <<< "$OCR_FONT_MANIFEST_RAW"
if [ "$wasm_missing" = true ]; then if [ "$wasm_missing" = true ]; then
error "Run without --skip-wasm first to download the packages." error "Run without --skip-wasm first to download the packages."
exit 1 exit 1
@@ -430,8 +653,42 @@ else
exit 1 exit 1
fi fi
info "Downloading tesseract.js@${TESSERACT_VERSION}..."
if ! (cd "$WASM_TMP" && npm pack "tesseract.js@${TESSERACT_VERSION}" --quiet 2>&1); then
error "Failed to download tesseract.js@${TESSERACT_VERSION}"
exit 1
fi
info "Downloading tesseract.js-core@${TESSERACT_CORE_VERSION}..."
if ! (cd "$WASM_TMP" && npm pack "tesseract.js-core@${TESSERACT_CORE_VERSION}" --quiet 2>&1); then
error "Failed to download tesseract.js-core@${TESSERACT_CORE_VERSION}"
exit 1
fi
# Move to output directory # Move to output directory
mv "$WASM_TMP"/*.tgz "$OUTPUT_DIR/" mv "$WASM_TMP"/*.tgz "$OUTPUT_DIR/"
# Fetch the gzipped traineddata file for each selected OCR language from
# jsDelivr into the bundle. Any failed download aborts the script.
mkdir -p "$OUTPUT_DIR/tesseract-langdata"
for lang in "${NORMALIZED_OCR_LANGUAGES[@]}"; do
info "Downloading OCR language data: ${lang}..."
if ! curl -fsSL "https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/${TESSDATA_VERSION}/${lang}.traineddata.gz" -o "$OUTPUT_DIR/tesseract-langdata/${lang}.traineddata.gz"; then
error "Failed to download OCR language data for ${lang}"
error "Check that the language code exists and that the network can reach jsDelivr."
exit 1
fi
done
# Fetch each font listed in the manifest; rows are
# "<family><TAB><url><TAB><file name>" and rows without a file name are skipped.
mkdir -p "$OUTPUT_DIR/ocr-fonts"
while IFS=$'\t' read -r font_family font_url font_file; do
[ -z "$font_file" ] && continue
info "Downloading OCR font: ${font_family}..."
if ! curl -fsSL "$font_url" -o "$OUTPUT_DIR/ocr-fonts/${font_file}"; then
error "Failed to download OCR font '${font_family}'"
error "Check that the network can reach the font URL: ${font_url}"
exit 1
fi
done <<< "$OCR_FONT_MANIFEST_RAW"
rm -rf "$WASM_TMP" rm -rf "$WASM_TMP"
trap - EXIT trap - EXIT
@@ -443,6 +700,10 @@ else
info " PyMuPDF: $(filesize "$OUTPUT_DIR"/bentopdf-pymupdf-wasm-*.tgz)" info " PyMuPDF: $(filesize "$OUTPUT_DIR"/bentopdf-pymupdf-wasm-*.tgz)"
info " Ghostscript: $(filesize "$OUTPUT_DIR"/bentopdf-gs-wasm-*.tgz)" info " Ghostscript: $(filesize "$OUTPUT_DIR"/bentopdf-gs-wasm-*.tgz)"
info " CoherentPDF: $(filesize "$CPDF_TGZ") (v${CPDF_VERSION})" info " CoherentPDF: $(filesize "$CPDF_TGZ") (v${CPDF_VERSION})"
# The [0-9] keeps the worker glob from also matching tesseract.js-core-*.tgz,
# so filesize receives exactly one file.
info " Tesseract.js: $(filesize "$OUTPUT_DIR"/tesseract.js-[0-9]*.tgz)"
info " OCR Core: $(filesize "$OUTPUT_DIR"/tesseract.js-core-*.tgz)"
info " OCR Langs: ${OCR_LANGUAGES}"
# Join the font family names with ", " inside awk. paste -d treats its
# argument as a list of single-character delimiters used in rotation, so it
# cannot produce a two-character separator.
info " OCR Fonts: $(printf '%s\n' "$OCR_FONT_MANIFEST_RAW" | awk -F '\t' 'NF >= 1 { printf "%s%s", sep, $1; sep = ", " }')"
fi fi
# Resolve CPDF version if we skipped download # Resolve CPDF version if we skipped download
@@ -488,6 +749,11 @@ else
BUILD_ARGS+=(--build-arg "VITE_WASM_PYMUPDF_URL=${WASM_PYMUPDF_URL}") BUILD_ARGS+=(--build-arg "VITE_WASM_PYMUPDF_URL=${WASM_PYMUPDF_URL}")
BUILD_ARGS+=(--build-arg "VITE_WASM_GS_URL=${WASM_GS_URL}") BUILD_ARGS+=(--build-arg "VITE_WASM_GS_URL=${WASM_GS_URL}")
BUILD_ARGS+=(--build-arg "VITE_WASM_CPDF_URL=${WASM_CPDF_URL}") BUILD_ARGS+=(--build-arg "VITE_WASM_CPDF_URL=${WASM_CPDF_URL}")
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_WORKER_URL=${OCR_TESSERACT_WORKER_URL}")
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_CORE_URL=${OCR_TESSERACT_CORE_URL}")
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_LANG_URL=${OCR_TESSERACT_LANG_URL}")
BUILD_ARGS+=(--build-arg "VITE_TESSERACT_AVAILABLE_LANGUAGES=${OCR_LANGUAGES}")
BUILD_ARGS+=(--build-arg "VITE_OCR_FONT_BASE_URL=${OCR_FONT_BASE_URL}")
[ -n "$SIMPLE_MODE" ] && BUILD_ARGS+=(--build-arg "SIMPLE_MODE=${SIMPLE_MODE}") [ -n "$SIMPLE_MODE" ] && BUILD_ARGS+=(--build-arg "SIMPLE_MODE=${SIMPLE_MODE}")
[ -n "$BASE_URL" ] && BUILD_ARGS+=(--build-arg "BASE_URL=${BASE_URL}") [ -n "$BASE_URL" ] && BUILD_ARGS+=(--build-arg "BASE_URL=${BASE_URL}")
@@ -503,6 +769,12 @@ else
info " PyMuPDF: ${WASM_PYMUPDF_URL}" info " PyMuPDF: ${WASM_PYMUPDF_URL}"
info " Ghostscript: ${WASM_GS_URL}" info " Ghostscript: ${WASM_GS_URL}"
info " CoherentPDF: ${WASM_CPDF_URL}" info " CoherentPDF: ${WASM_CPDF_URL}"
info "OCR URLs:"
info " Worker: ${OCR_TESSERACT_WORKER_URL}"
info " Core: ${OCR_TESSERACT_CORE_URL}"
info " Lang Data: ${OCR_TESSERACT_LANG_URL}"
info " Font Base: ${OCR_FONT_BASE_URL}"
info " Languages: ${OCR_LANGUAGES}"
echo "" echo ""
info "Building... this may take a few minutes (npm install + Vite build)." info "Building... this may take a few minutes (npm install + Vite build)."
echo "" echo ""
@@ -582,7 +854,7 @@ fi
echo "" echo ""
echo "[2/3] Extracting WASM packages to \${WASM_DIR}..." echo "[2/3] Extracting WASM packages to \${WASM_DIR}..."
mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf" mkdir -p "\${WASM_DIR}/pymupdf" "\${WASM_DIR}/gs" "\${WASM_DIR}/cpdf" "\${WASM_DIR}/ocr/core" "\${WASM_DIR}/ocr/lang-data" "\${WASM_DIR}/ocr/fonts"
# PyMuPDF: package has dist/ and assets/ at root # PyMuPDF: package has dist/ and assets/ at root
echo " Extracting PyMuPDF..." echo " Extracting PyMuPDF..."
@@ -610,12 +882,35 @@ else
fi fi
rm -rf "\${TEMP_CPDF}" rm -rf "\${TEMP_CPDF}"
# Tesseract worker: browser expects a single worker.min.js file
# The [0-9] in the glob keeps it from also matching tesseract.js-core-*.tgz.
# A second archive on the command line would be read by tar as a member name
# to extract from the first one, and the extraction would fail.
echo " Extracting Tesseract worker..."
TEMP_TESS="\$(mktemp -d)"
tar xzf "\${SCRIPT_DIR}"/tesseract.js-[0-9]*.tgz -C "\${TEMP_TESS}"
cp "\${TEMP_TESS}/package/dist/worker.min.js" "\${WASM_DIR}/ocr/worker.min.js"
rm -rf "\${TEMP_TESS}"
# Tesseract core: browser expects the full tesseract.js-core directory
echo " Extracting Tesseract core..."
tar xzf "\${SCRIPT_DIR}"/tesseract.js-core-*.tgz -C "\${WASM_DIR}/ocr/core" --strip-components=1
# OCR language data: copy the bundled traineddata files
echo " Installing OCR language data..."
cp "\${SCRIPT_DIR}"/tesseract-langdata/*.traineddata.gz "\${WASM_DIR}/ocr/lang-data/"
# OCR fonts: copy the bundled font files for searchable text layer rendering
echo " Installing OCR fonts..."
cp "\${SCRIPT_DIR}"/ocr-fonts/* "\${WASM_DIR}/ocr/fonts/"
echo " WASM files extracted to: \${WASM_DIR}" echo " WASM files extracted to: \${WASM_DIR}"
echo "" echo ""
echo " IMPORTANT: Ensure these paths are served by your internal web server:" echo " IMPORTANT: Ensure these paths are served by your internal web server:"
echo " \${WASM_BASE_URL}/pymupdf/ -> \${WASM_DIR}/pymupdf/" echo " \${WASM_BASE_URL}/pymupdf/ -> \${WASM_DIR}/pymupdf/"
echo " \${WASM_BASE_URL}/gs/ -> \${WASM_DIR}/gs/" echo " \${WASM_BASE_URL}/gs/ -> \${WASM_DIR}/gs/"
echo " \${WASM_BASE_URL}/cpdf/ -> \${WASM_DIR}/cpdf/" echo " \${WASM_BASE_URL}/cpdf/ -> \${WASM_DIR}/cpdf/"
echo " \${WASM_BASE_URL}/ocr/worker.min.js -> \${WASM_DIR}/ocr/worker.min.js"
echo " \${WASM_BASE_URL}/ocr/core -> \${WASM_DIR}/ocr/core/"
echo " \${WASM_BASE_URL}/ocr/lang-data -> \${WASM_DIR}/ocr/lang-data/"
echo " \${WASM_BASE_URL}/ocr/fonts -> \${WASM_DIR}/ocr/fonts/"
# --- Step 3: Start BentoPDF --- # --- Step 3: Start BentoPDF ---
echo "" echo ""
@@ -654,6 +949,10 @@ cat > "$OUTPUT_DIR/README.md" <<README_EOF
| \`bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz\` | PyMuPDF WASM module | | \`bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz\` | PyMuPDF WASM module |
| \`bentopdf-gs-wasm-${GS_VERSION}.tgz\` | Ghostscript WASM module | | \`bentopdf-gs-wasm-${GS_VERSION}.tgz\` | Ghostscript WASM module |
| \`coherentpdf-${CPDF_VERSION}.tgz\` | CoherentPDF WASM module | | \`coherentpdf-${CPDF_VERSION}.tgz\` | CoherentPDF WASM module |
| \`tesseract.js-${TESSERACT_VERSION}.tgz\` | Tesseract browser worker package |
| \`tesseract.js-core-${TESSERACT_CORE_VERSION}.tgz\` | Tesseract core runtime package |
| \`tesseract-langdata/\` | OCR language data files (${OCR_LANGUAGES}) |
| \`ocr-fonts/\` | OCR text-layer font files |
| \`setup.sh\` | Automated setup script | | \`setup.sh\` | Automated setup script |
| \`README.md\` | This file | | \`README.md\` | This file |
@@ -664,6 +963,16 @@ The Docker image was built with these WASM URLs:
- **PyMuPDF:** \`${WASM_PYMUPDF_URL}\` - **PyMuPDF:** \`${WASM_PYMUPDF_URL}\`
- **Ghostscript:** \`${WASM_GS_URL}\` - **Ghostscript:** \`${WASM_GS_URL}\`
- **CoherentPDF:** \`${WASM_CPDF_URL}\` - **CoherentPDF:** \`${WASM_CPDF_URL}\`
- **OCR Worker:** \`${OCR_TESSERACT_WORKER_URL}\`
- **OCR Core:** \`${OCR_TESSERACT_CORE_URL}\`
- **OCR Lang Data:** \`${OCR_TESSERACT_LANG_URL}\`
- **OCR Font Base:** \`${OCR_FONT_BASE_URL}\`
Bundled OCR languages: **${OCR_LANGUAGES}**
Bundled OCR fonts:
$(printf '%s\n' "$OCR_FONT_MANIFEST_RAW" | awk -F '\t' 'NF >= 3 { printf "- **%s** -> `%s`\n", $1, $3 }')
These URLs are baked into the app at build time. The user's browser fetches These URLs are baked into the app at build time. The user's browser fetches
WASM files from these URLs at runtime. WASM files from these URLs at runtime.
@@ -694,7 +1003,7 @@ docker load -i bentopdf.tar
Extract to your internal web server's document root: Extract to your internal web server's document root:
\`\`\`bash \`\`\`bash
mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf mkdir -p ./wasm/pymupdf ./wasm/gs ./wasm/cpdf ./wasm/ocr/core ./wasm/ocr/lang-data ./wasm/ocr/fonts
# PyMuPDF # PyMuPDF
tar xzf bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz -C ./wasm/pymupdf --strip-components=1 tar xzf bentopdf-pymupdf-wasm-${PYMUPDF_VERSION}.tgz -C ./wasm/pymupdf --strip-components=1
@@ -710,6 +1019,21 @@ TEMP_CPDF=\$(mktemp -d)
tar xzf coherentpdf-${CPDF_VERSION}.tgz -C \$TEMP_CPDF tar xzf coherentpdf-${CPDF_VERSION}.tgz -C \$TEMP_CPDF
cp -r \$TEMP_CPDF/package/dist/* ./wasm/cpdf/ cp -r \$TEMP_CPDF/package/dist/* ./wasm/cpdf/
rm -rf \$TEMP_CPDF rm -rf \$TEMP_CPDF
# Tesseract worker
TEMP_TESS=\$(mktemp -d)
tar xzf tesseract.js-${TESSERACT_VERSION}.tgz -C \$TEMP_TESS
cp \$TEMP_TESS/package/dist/worker.min.js ./wasm/ocr/worker.min.js
rm -rf \$TEMP_TESS
# Tesseract core
tar xzf tesseract.js-core-${TESSERACT_CORE_VERSION}.tgz -C ./wasm/ocr/core --strip-components=1
# OCR language data
cp ./tesseract-langdata/*.traineddata.gz ./wasm/ocr/lang-data/
# OCR fonts
cp ./ocr-fonts/* ./wasm/ocr/fonts/
\`\`\` \`\`\`
### 3. Configure your web server ### 3. Configure your web server
@@ -721,6 +1045,10 @@ Ensure these paths are accessible at the configured URLs:
| \`${WASM_PYMUPDF_URL}\` | \`./wasm/pymupdf/\` | | \`${WASM_PYMUPDF_URL}\` | \`./wasm/pymupdf/\` |
| \`${WASM_GS_URL}\` | \`./wasm/gs/\` | | \`${WASM_GS_URL}\` | \`./wasm/gs/\` |
| \`${WASM_CPDF_URL}\` | \`./wasm/cpdf/\` | | \`${WASM_CPDF_URL}\` | \`./wasm/cpdf/\` |
| \`${OCR_TESSERACT_WORKER_URL}\` | \`./wasm/ocr/worker.min.js\` |
| \`${OCR_TESSERACT_CORE_URL}\` | \`./wasm/ocr/core/\` |
| \`${OCR_TESSERACT_LANG_URL}\` | \`./wasm/ocr/lang-data/\` |
| \`${OCR_FONT_BASE_URL}\` | \`./wasm/ocr/fonts/\` |
### 4. Run BentoPDF ### 4. Run BentoPDF

View File

@@ -1,37 +1,39 @@
import Tesseract from 'tesseract.js'; import type Tesseract from 'tesseract.js';
import type { ComparePageModel, CompareTextItem } from '../types.ts'; import type { ComparePageModel, CompareTextItem } from '../types.ts';
import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts'; import { mergeIntoLines, sortCompareTextItems } from './extract-page-model.ts';
import { import {
joinCompareTextItems, joinCompareTextItems,
normalizeCompareText, normalizeCompareText,
} from './text-normalization.ts'; } from './text-normalization.ts';
import { createConfiguredTesseractWorker } from '../../utils/tesseract-runtime.js';
type OcrWord = { type OcrWord = Tesseract.Word;
text: string; type OcrRecognizeResult = Tesseract.RecognizeResult;
bbox: { type OcrPageWithWords = Tesseract.Page & { words: OcrWord[] };
x0: number;
y0: number;
x1: number;
y1: number;
};
};
export async function recognizePageCanvas( export async function recognizePageCanvas(
canvas: HTMLCanvasElement, canvas: HTMLCanvasElement,
language: string, language: string,
onProgress?: (status: string, progress: number) => void onProgress?: (status: string, progress: number) => void
): Promise<ComparePageModel> { ): Promise<ComparePageModel> {
const result = await Tesseract.recognize(canvas, language, { const worker = await createConfiguredTesseractWorker(
logger(message) { language,
1,
(message) => {
onProgress?.(message.status, message.progress || 0); onProgress?.(message.status, message.progress || 0);
}, }
}); );
const ocrData = result.data as unknown as { words?: OcrWord[] }; let result: OcrRecognizeResult;
const words = ((ocrData.words || []) as OcrWord[]) try {
result = await worker.recognize(canvas);
} finally {
await worker.terminate();
}
const words = (result.data as OcrPageWithWords).words
.map((word, index) => { .map((word, index) => {
const normalizedText = normalizeCompareText(word.text || ''); const normalizedText = normalizeCompareText(word.text);
if (!normalizedText) return null; if (!normalizedText) return null;
const item: CompareTextItem = { const item: CompareTextItem = {

View File

@@ -1,189 +1,233 @@
/** /**
* Font mappings for OCR text layer rendering * Font mappings for OCR text layer rendering
* Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs * Maps Tesseract language codes to appropriate Noto Sans font families and their CDN URLs
*/ */
export const languageToFontFamily: Record<string, string> = { export const languageToFontFamily: Record<string, string> = {
// CJK Languages // CJK Languages
jpn: 'Noto Sans JP', jpn: 'Noto Sans JP',
chi_sim: 'Noto Sans SC', chi_sim: 'Noto Sans SC',
chi_tra: 'Noto Sans TC', chi_tra: 'Noto Sans TC',
kor: 'Noto Sans KR', kor: 'Noto Sans KR',
// Arabic Script // Arabic Script
ara: 'Noto Sans Arabic', ara: 'Noto Sans Arabic',
fas: 'Noto Sans Arabic', fas: 'Noto Sans Arabic',
urd: 'Noto Sans Arabic', urd: 'Noto Sans Arabic',
pus: 'Noto Sans Arabic', pus: 'Noto Sans Arabic',
kur: 'Noto Sans Arabic', kur: 'Noto Sans Arabic',
// Devanagari Script // Devanagari Script
hin: 'Noto Sans Devanagari', hin: 'Noto Sans Devanagari',
mar: 'Noto Sans Devanagari', mar: 'Noto Sans Devanagari',
san: 'Noto Sans Devanagari', san: 'Noto Sans Devanagari',
nep: 'Noto Sans Devanagari', nep: 'Noto Sans Devanagari',
// Bengali Script // Bengali Script
ben: 'Noto Sans Bengali', ben: 'Noto Sans Bengali',
asm: 'Noto Sans Bengali', asm: 'Noto Sans Bengali',
// Tamil Script // Tamil Script
tam: 'Noto Sans Tamil', tam: 'Noto Sans Tamil',
// Telugu Script // Telugu Script
tel: 'Noto Sans Telugu', tel: 'Noto Sans Telugu',
// Kannada Script // Kannada Script
kan: 'Noto Sans Kannada', kan: 'Noto Sans Kannada',
// Malayalam Script // Malayalam Script
mal: 'Noto Sans Malayalam', mal: 'Noto Sans Malayalam',
// Gujarati Script // Gujarati Script
guj: 'Noto Sans Gujarati', guj: 'Noto Sans Gujarati',
// Gurmukhi Script (Punjabi) // Gurmukhi Script (Punjabi)
pan: 'Noto Sans Gurmukhi', pan: 'Noto Sans Gurmukhi',
// Oriya Script // Oriya Script
ori: 'Noto Sans Oriya', ori: 'Noto Sans Oriya',
// Sinhala Script // Sinhala Script
sin: 'Noto Sans Sinhala', sin: 'Noto Sans Sinhala',
// Thai Script // Thai Script
tha: 'Noto Sans Thai', tha: 'Noto Sans Thai',
// Lao Script // Lao Script
lao: 'Noto Sans Lao', lao: 'Noto Sans Lao',
// Khmer Script // Khmer Script
khm: 'Noto Sans Khmer', khm: 'Noto Sans Khmer',
// Myanmar Script // Myanmar Script
mya: 'Noto Sans Myanmar', mya: 'Noto Sans Myanmar',
// Tibetan Script // Tibetan Script
bod: 'Noto Serif Tibetan', bod: 'Noto Serif Tibetan',
// Georgian Script // Georgian Script
kat: 'Noto Sans Georgian', kat: 'Noto Sans Georgian',
kat_old: 'Noto Sans Georgian', kat_old: 'Noto Sans Georgian',
// Armenian Script // Armenian Script
hye: 'Noto Sans Armenian', hye: 'Noto Sans Armenian',
// Hebrew Script // Hebrew Script
heb: 'Noto Sans Hebrew', heb: 'Noto Sans Hebrew',
yid: 'Noto Sans Hebrew', yid: 'Noto Sans Hebrew',
// Ethiopic Script // Ethiopic Script
amh: 'Noto Sans Ethiopic', amh: 'Noto Sans Ethiopic',
tir: 'Noto Sans Ethiopic', tir: 'Noto Sans Ethiopic',
// Cherokee Script // Cherokee Script
chr: 'Noto Sans Cherokee', chr: 'Noto Sans Cherokee',
// Syriac Script // Syriac Script
syr: 'Noto Sans Syriac', syr: 'Noto Sans Syriac',
// Cyrillic Script (Noto Sans includes Cyrillic) // Cyrillic Script (Noto Sans includes Cyrillic)
bel: 'Noto Sans', bel: 'Noto Sans',
bul: 'Noto Sans', bul: 'Noto Sans',
mkd: 'Noto Sans', mkd: 'Noto Sans',
rus: 'Noto Sans', rus: 'Noto Sans',
srp: 'Noto Sans', srp: 'Noto Sans',
srp_latn: 'Noto Sans', srp_latn: 'Noto Sans',
ukr: 'Noto Sans', ukr: 'Noto Sans',
kaz: 'Noto Sans', kaz: 'Noto Sans',
kir: 'Noto Sans', kir: 'Noto Sans',
tgk: 'Noto Sans', tgk: 'Noto Sans',
uzb: 'Noto Sans', uzb: 'Noto Sans',
uzb_cyrl: 'Noto Sans', uzb_cyrl: 'Noto Sans',
aze_cyrl: 'Noto Sans', aze_cyrl: 'Noto Sans',
// Latin Script (covered by base Noto Sans) // Latin Script (covered by base Noto Sans)
afr: 'Noto Sans', afr: 'Noto Sans',
aze: 'Noto Sans', aze: 'Noto Sans',
bos: 'Noto Sans', bos: 'Noto Sans',
cat: 'Noto Sans', cat: 'Noto Sans',
ceb: 'Noto Sans', ceb: 'Noto Sans',
ces: 'Noto Sans', ces: 'Noto Sans',
cym: 'Noto Sans', cym: 'Noto Sans',
dan: 'Noto Sans', dan: 'Noto Sans',
deu: 'Noto Sans', deu: 'Noto Sans',
ell: 'Noto Sans', ell: 'Noto Sans',
eng: 'Noto Sans', eng: 'Noto Sans',
enm: 'Noto Sans', enm: 'Noto Sans',
epo: 'Noto Sans', epo: 'Noto Sans',
est: 'Noto Sans', est: 'Noto Sans',
eus: 'Noto Sans', eus: 'Noto Sans',
fin: 'Noto Sans', fin: 'Noto Sans',
fra: 'Noto Sans', fra: 'Noto Sans',
frk: 'Noto Sans', frk: 'Noto Sans',
frm: 'Noto Sans', frm: 'Noto Sans',
gle: 'Noto Sans', gle: 'Noto Sans',
glg: 'Noto Sans', glg: 'Noto Sans',
grc: 'Noto Sans', grc: 'Noto Sans',
hat: 'Noto Sans', hat: 'Noto Sans',
hrv: 'Noto Sans', hrv: 'Noto Sans',
hun: 'Noto Sans', hun: 'Noto Sans',
iku: 'Noto Sans', iku: 'Noto Sans',
ind: 'Noto Sans', ind: 'Noto Sans',
isl: 'Noto Sans', isl: 'Noto Sans',
ita: 'Noto Sans', ita: 'Noto Sans',
ita_old: 'Noto Sans', ita_old: 'Noto Sans',
jav: 'Noto Sans', jav: 'Noto Sans',
lat: 'Noto Sans', lat: 'Noto Sans',
lav: 'Noto Sans', lav: 'Noto Sans',
lit: 'Noto Sans', lit: 'Noto Sans',
mlt: 'Noto Sans', mlt: 'Noto Sans',
msa: 'Noto Sans', msa: 'Noto Sans',
nld: 'Noto Sans', nld: 'Noto Sans',
nor: 'Noto Sans', nor: 'Noto Sans',
pol: 'Noto Sans', pol: 'Noto Sans',
por: 'Noto Sans', por: 'Noto Sans',
ron: 'Noto Sans', ron: 'Noto Sans',
slk: 'Noto Sans', slk: 'Noto Sans',
slv: 'Noto Sans', slv: 'Noto Sans',
spa: 'Noto Sans', spa: 'Noto Sans',
spa_old: 'Noto Sans', spa_old: 'Noto Sans',
sqi: 'Noto Sans', sqi: 'Noto Sans',
swa: 'Noto Sans', swa: 'Noto Sans',
swe: 'Noto Sans', swe: 'Noto Sans',
tgl: 'Noto Sans', tgl: 'Noto Sans',
tur: 'Noto Sans', tur: 'Noto Sans',
vie: 'Noto Sans', vie: 'Noto Sans',
dzo: 'Noto Sans', dzo: 'Noto Sans',
uig: 'Noto Sans', uig: 'Noto Sans',
}; };
export const fontFamilyToUrl: Record<string, string> = { export const fontFamilyToUrl: Record<string, string> = {
'Noto Sans JP': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf', 'Noto Sans JP':
'Noto Sans SC': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf', 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Japanese/NotoSansCJKjp-Regular.otf',
'Noto Sans TC': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf', 'Noto Sans SC':
'Noto Sans KR': 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Korean/NotoSansCJKkr-Regular.otf', 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf',
'Noto Sans Arabic': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArabic/NotoSansArabic-Regular.ttf', 'Noto Sans TC':
'Noto Sans Devanagari': 'https://raw.githack.com/googlefonts/noto-fonts/main/unhinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/TraditionalChinese/NotoSansCJKtc-Regular.otf',
'Noto Sans Bengali': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf', 'Noto Sans KR':
'Noto Sans Gujarati': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGujarati/NotoSansGujarati-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-cjk/main/Sans/OTF/Korean/NotoSansCJKkr-Regular.otf',
'Noto Sans Kannada': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKannada/NotoSansKannada-Regular.ttf', 'Noto Sans Arabic':
'Noto Sans Malayalam': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMalayalam/NotoSansMalayalam-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArabic/NotoSansArabic-Regular.ttf',
'Noto Sans Oriya': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansOriya/NotoSansOriya-Regular.ttf', 'Noto Sans Devanagari':
'Noto Sans Gurmukhi': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGurmukhi/NotoSansGurmukhi-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/unhinted/ttf/NotoSansDevanagari/NotoSansDevanagari-Regular.ttf',
'Noto Sans Tamil': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTamil/NotoSansTamil-Regular.ttf', 'Noto Sans Bengali':
'Noto Sans Telugu': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTelugu/NotoSansTelugu-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf',
'Noto Sans Sinhala': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSinhala/NotoSansSinhala-Regular.ttf', 'Noto Sans Gujarati':
'Noto Sans Thai': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGujarati/NotoSansGujarati-Regular.ttf',
'Noto Sans Khmer': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKhmer/NotoSansKhmer-Regular.ttf', 'Noto Sans Kannada':
'Noto Sans Lao': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansLao/NotoSansLao-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKannada/NotoSansKannada-Regular.ttf',
'Noto Sans Myanmar': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMyanmar/NotoSansMyanmar-Regular.ttf', 'Noto Sans Malayalam':
'Noto Sans Hebrew': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansHebrew/NotoSansHebrew-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMalayalam/NotoSansMalayalam-Regular.ttf',
'Noto Sans Georgian': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGeorgian/NotoSansGeorgian-Regular.ttf', 'Noto Sans Oriya':
'Noto Sans Ethiopic': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansEthiopic/NotoSansEthiopic-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansOriya/NotoSansOriya-Regular.ttf',
'Noto Serif Tibetan': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf', 'Noto Sans Gurmukhi':
'Noto Sans Cherokee': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansCherokee/NotoSansCherokee-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGurmukhi/NotoSansGurmukhi-Regular.ttf',
'Noto Sans Armenian': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArmenian/NotoSansArmenian-Regular.ttf', 'Noto Sans Tamil':
'Noto Sans Syriac': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSyriac/NotoSansSyriac-Regular.ttf', 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTamil/NotoSansTamil-Regular.ttf',
'Noto Sans': 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf', 'Noto Sans Telugu':
}; 'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansTelugu/NotoSansTelugu-Regular.ttf',
'Noto Sans Sinhala':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSinhala/NotoSansSinhala-Regular.ttf',
'Noto Sans Thai':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansThai/NotoSansThai-Regular.ttf',
'Noto Sans Khmer':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansKhmer/NotoSansKhmer-Regular.ttf',
'Noto Sans Lao':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansLao/NotoSansLao-Regular.ttf',
'Noto Sans Myanmar':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansMyanmar/NotoSansMyanmar-Regular.ttf',
'Noto Sans Hebrew':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansHebrew/NotoSansHebrew-Regular.ttf',
'Noto Sans Georgian':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansGeorgian/NotoSansGeorgian-Regular.ttf',
'Noto Sans Ethiopic':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansEthiopic/NotoSansEthiopic-Regular.ttf',
'Noto Serif Tibetan':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf',
'Noto Sans Cherokee':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansCherokee/NotoSansCherokee-Regular.ttf',
'Noto Sans Armenian':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansArmenian/NotoSansArmenian-Regular.ttf',
'Noto Sans Syriac':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSansSyriac/NotoSansSyriac-Regular.ttf',
'Noto Sans':
'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf',
};
/**
 * Returns the default download URL for an OCR font family.
 *
 * Only keys defined directly on `fontFamilyToUrl` are honoured. A plain
 * bracket lookup would also resolve inherited members such as
 * `constructor` or `toString` and hand a non-string back to callers.
 *
 * @param fontFamily - Font family name, e.g. `'Noto Sans JP'`.
 * @returns The URL registered for the family, or the `'Noto Sans'` URL when
 *   the family is unknown or its entry is empty.
 */
export function getFontUrlForFamily(fontFamily: string): string {
  const isRegistered = Object.prototype.hasOwnProperty.call(
    fontFamilyToUrl,
    fontFamily
  );
  const registeredUrl = isRegistered ? fontFamilyToUrl[fontFamily] : undefined;

  // Truthiness (not `??`) keeps the original behaviour of treating an empty
  // string entry as "missing".
  return registeredUrl || fontFamilyToUrl['Noto Sans'];
}
/**
 * Derives the asset file name for a font family from the last path segment
 * of the URL returned by `getFontUrlForFamily`.
 *
 * @param fontFamily - Font family name to resolve.
 * @returns The final `/`-separated segment of the family's URL.
 * @throws Error when that final segment is empty (e.g. the URL ends in `/`).
 */
export function getFontAssetFileName(fontFamily: string): string {
  const urlSegments = getFontUrlForFamily(fontFamily).split('/');
  const lastSegment = urlSegments[urlSegments.length - 1];

  if (lastSegment) {
    return lastSegment;
  }

  throw new Error(`Could not resolve a font asset filename for ${fontFamily}`);
}

View File

@@ -4,6 +4,11 @@ import { downloadFile, formatBytes } from '../utils/helpers.js';
import { icons, createIcons } from 'lucide'; import { icons, createIcons } from 'lucide';
import { OcrState } from '@/types'; import { OcrState } from '@/types';
import { performOcr } from '../utils/ocr.js'; import { performOcr } from '../utils/ocr.js';
import {
getAvailableTesseractLanguageEntries,
resolveConfiguredTesseractAvailableLanguages,
UnsupportedOcrLanguageError,
} from '../utils/tesseract-language-availability.js';
const pageState: OcrState = { const pageState: OcrState = {
file: null, file: null,
@@ -80,6 +85,30 @@ function resetState() {
if (processBtn) processBtn.disabled = true; if (processBtn) processBtn.disabled = true;
} }
/**
 * Refreshes the `#lang-availability-note` element.
 *
 * - No configured language list: the note is hidden and its text cleared.
 * - Configured list but no available entries: a rebuild hint is shown.
 * - Otherwise: the names of the available OCR languages are listed.
 *
 * Does nothing when the element is not present in the document.
 */
function updateLanguageAvailabilityNotice() {
  const noticeEl = document.getElementById('lang-availability-note');
  if (!noticeEl) return;

  if (!resolveConfiguredTesseractAvailableLanguages()) {
    noticeEl.classList.add('hidden');
    noticeEl.textContent = '';
    return;
  }

  const entries = getAvailableTesseractLanguageEntries();
  const message =
    entries.length === 0
      ? 'This deployment does not expose any valid OCR languages. Rebuild it with VITE_TESSERACT_AVAILABLE_LANGUAGES set to valid Tesseract codes.'
      : `This deployment bundles OCR for: ${entries.map(([, name]) => name).join(', ')}.`;

  // Both remaining outcomes surface the note; only the text differs.
  noticeEl.classList.remove('hidden');
  noticeEl.textContent = message;
}
async function runOCR() { async function runOCR() {
const selectedLangs = Array.from( const selectedLangs = Array.from(
document.querySelectorAll('.lang-checkbox:checked') document.querySelectorAll('.lang-checkbox:checked')
@@ -142,10 +171,14 @@ async function runOCR() {
if (textOutput) textOutput.value = result.fullText.trim(); if (textOutput) textOutput.value = result.fullText.trim();
} catch (e) { } catch (e) {
console.error(e); console.error(e);
showAlert( if (e instanceof UnsupportedOcrLanguageError) {
'OCR Error', showAlert('OCR Language Not Available', e.message);
'An error occurred during the OCR process. The worker may have failed to load. Please try again.' } else {
); showAlert(
'OCR Error',
'An error occurred during the OCR process. The worker may have failed to load. Please try again.'
);
}
if (toolOptions) toolOptions.classList.remove('hidden'); if (toolOptions) toolOptions.classList.remove('hidden');
if (ocrProgress) ocrProgress.classList.add('hidden'); if (ocrProgress) ocrProgress.classList.add('hidden');
} }
@@ -213,10 +246,21 @@ function populateLanguageList() {
langList.innerHTML = ''; langList.innerHTML = '';
Object.entries(tesseractLanguages).forEach(function ([code, name]) { const availableEntries = getAvailableTesseractLanguageEntries();
if (availableEntries.length === 0) {
const emptyState = document.createElement('p');
emptyState.className = 'text-sm text-yellow-300 p-2';
emptyState.textContent =
'No OCR languages are available in this deployment.';
langList.appendChild(emptyState);
return;
}
availableEntries.forEach(function ([code, name]) {
const label = document.createElement('label'); const label = document.createElement('label');
label.className = label.className =
'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer'; 'flex items-center gap-2 p-2 rounded-md hover:bg-gray-700 cursor-pointer';
label.dataset.search = `${name} ${code}`.toLowerCase();
const checkbox = document.createElement('input'); const checkbox = document.createElement('input');
checkbox.type = 'checkbox'; checkbox.type = 'checkbox';
@@ -253,6 +297,7 @@ document.addEventListener('DOMContentLoaded', function () {
const downloadPdfBtn = document.getElementById('download-searchable-pdf'); const downloadPdfBtn = document.getElementById('download-searchable-pdf');
populateLanguageList(); populateLanguageList();
updateLanguageAvailabilityNotice();
if (backBtn) { if (backBtn) {
backBtn.addEventListener('click', function () { backBtn.addEventListener('click', function () {
@@ -304,9 +349,9 @@ document.addEventListener('DOMContentLoaded', function () {
langSearch.addEventListener('input', function () { langSearch.addEventListener('input', function () {
const searchTerm = langSearch.value.toLowerCase(); const searchTerm = langSearch.value.toLowerCase();
langList.querySelectorAll('label').forEach(function (label) { langList.querySelectorAll('label').forEach(function (label) {
(label as HTMLElement).style.display = label.textContent (label as HTMLElement).style.display = (
?.toLowerCase() label as HTMLElement
.includes(searchTerm) ).dataset.search?.includes(searchTerm)
? '' ? ''
: 'none'; : 'none';
}); });

View File

@@ -1,7 +1,7 @@
import { showAlert } from '../ui.js'; import { showAlert } from '../ui.js';
import { tesseractLanguages } from '../config/tesseract-languages.js';
import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor'; import { createWorkflowEditor, updateNodeDisplay } from '../workflow/editor';
import { executeWorkflow } from '../workflow/engine'; import { executeWorkflow } from '../workflow/engine';
import { getAvailableTesseractLanguageEntries } from '../utils/tesseract-language-availability.js';
import { import {
nodeRegistry, nodeRegistry,
getNodesByCategory, getNodesByCategory,
@@ -1194,7 +1194,7 @@ function showNodeSettings(node: BaseWorkflowNode) {
{ label: 'High (288 DPI)', value: '3.0' }, { label: 'High (288 DPI)', value: '3.0' },
{ label: 'Ultra (384 DPI)', value: '4.0' }, { label: 'Ultra (384 DPI)', value: '4.0' },
], ],
language: Object.entries(tesseractLanguages).map(([code, name]) => ({ language: getAvailableTesseractLanguageEntries().map(([code, name]) => ({
label: name, label: name,
value: code, value: code,
})), })),

View File

@@ -1,281 +1,330 @@
import { languageToFontFamily, fontFamilyToUrl } from '../config/font-mappings.js'; import {
getFontAssetFileName,
const fontCache: Map<string, ArrayBuffer> = new Map(); getFontUrlForFamily,
languageToFontFamily,
const DB_NAME = 'bentopdf-fonts'; } from '../config/font-mappings.js';
const DB_VERSION = 1;
const STORE_NAME = 'fonts'; const fontCache: Map<string, ArrayBuffer> = new Map();
async function openFontDB(): Promise<IDBDatabase> { const DB_NAME = 'bentopdf-fonts';
return new Promise((resolve, reject) => { const DB_VERSION = 1;
const request = indexedDB.open(DB_NAME, DB_VERSION); const STORE_NAME = 'fonts';
request.onerror = () => reject(request.error); type OcrFontEnv = Partial<Pick<ImportMetaEnv, 'VITE_OCR_FONT_BASE_URL'>>;
request.onsuccess = () => resolve(request.result);
function getDefaultFontEnv(): OcrFontEnv {
request.onupgradeneeded = (event) => { return import.meta.env;
const db = (event.target as IDBOpenDBRequest).result; }
if (!db.objectStoreNames.contains(STORE_NAME)) {
db.createObjectStore(STORE_NAME); function normalizeFontBaseUrl(url?: string): string | undefined {
} const trimmed = url?.trim();
};
}); if (!trimmed) {
} return undefined;
}
async function getCachedFontFromDB(fontFamily: string): Promise<ArrayBuffer | null> {
try { return trimmed.replace(/\/+$/, '');
const db = await openFontDB(); }
return new Promise((resolve, reject) => {
const transaction = db.transaction(STORE_NAME, 'readonly'); export function resolveFontUrl(
const store = transaction.objectStore(STORE_NAME); fontFamily: string,
const request = store.get(fontFamily); env: OcrFontEnv = getDefaultFontEnv()
): string {
request.onsuccess = () => resolve(request.result || null); const fontBaseUrl = normalizeFontBaseUrl(env.VITE_OCR_FONT_BASE_URL);
request.onerror = () => reject(request.error);
}); if (fontBaseUrl) {
} catch (error) { return `${fontBaseUrl}/${getFontAssetFileName(fontFamily)}`;
console.warn('IndexedDB read failed:', error); }
return null;
} return getFontUrlForFamily(fontFamily);
} }
async function saveFontToDB(fontFamily: string, fontBuffer: ArrayBuffer): Promise<void> { async function openFontDB(): Promise<IDBDatabase> {
try { return new Promise((resolve, reject) => {
const db = await openFontDB(); const request = indexedDB.open(DB_NAME, DB_VERSION);
return new Promise((resolve, reject) => {
const transaction = db.transaction(STORE_NAME, 'readwrite'); request.onerror = () => reject(request.error);
const store = transaction.objectStore(STORE_NAME); request.onsuccess = () => resolve(request.result);
const request = store.put(fontBuffer, fontFamily);
request.onupgradeneeded = (event) => {
request.onsuccess = () => resolve(); const db = (event.target as IDBOpenDBRequest).result;
request.onerror = () => reject(request.error); if (!db.objectStoreNames.contains(STORE_NAME)) {
}); db.createObjectStore(STORE_NAME);
} catch (error) { }
console.warn('IndexedDB write failed:', error); };
} });
} }
export async function getFontForLanguage(lang: string): Promise<ArrayBuffer> { async function getCachedFontFromDB(
const fontFamily = languageToFontFamily[lang] || 'Noto Sans'; fontFamily: string
): Promise<ArrayBuffer | null> {
if (fontCache.has(fontFamily)) { try {
return fontCache.get(fontFamily)!; const db = await openFontDB();
} return new Promise((resolve, reject) => {
const cachedFont = await getCachedFontFromDB(fontFamily); const transaction = db.transaction(STORE_NAME, 'readonly');
if (cachedFont) { const store = transaction.objectStore(STORE_NAME);
fontCache.set(fontFamily, cachedFont); const request = store.get(fontFamily);
return cachedFont;
} request.onsuccess = () => resolve(request.result || null);
request.onerror = () => reject(request.error);
try { });
const fontUrl = fontFamilyToUrl[fontFamily] || fontFamilyToUrl['Noto Sans']; } catch (error) {
console.warn('IndexedDB read failed:', error);
const fontResponse = await fetch(fontUrl); return null;
}
if (!fontResponse.ok) { }
throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`);
} async function saveFontToDB(
fontFamily: string,
const fontBuffer = await fontResponse.arrayBuffer(); fontBuffer: ArrayBuffer
): Promise<void> {
fontCache.set(fontFamily, fontBuffer); try {
await saveFontToDB(fontFamily, fontBuffer); const db = await openFontDB();
return new Promise((resolve, reject) => {
return fontBuffer; const transaction = db.transaction(STORE_NAME, 'readwrite');
} catch (error) { const store = transaction.objectStore(STORE_NAME);
console.warn(`Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`, error); const request = store.put(fontBuffer, fontFamily);
if (fontFamily !== 'Noto Sans') { request.onsuccess = () => resolve();
return await getFontForLanguage('eng'); request.onerror = () => reject(request.error);
} });
} catch (error) {
throw error; console.warn('IndexedDB write failed:', error);
} }
} }
export function detectScripts(text: string): string[] { export async function getFontForLanguage(lang: string): Promise<ArrayBuffer> {
const scripts = new Set<string>(); const fontFamily = languageToFontFamily[lang] || 'Noto Sans';
// Japanese: Hiragana (\u3040-\u309F) & Katakana (\u30A0-\u30FF) if (fontCache.has(fontFamily)) {
if (/[\u3040-\u309F\u30A0-\u30FF]/.test(text)) { return fontCache.get(fontFamily)!;
scripts.add('jpn'); }
} const cachedFont = await getCachedFontFromDB(fontFamily);
if (cachedFont) {
// Korean: Hangul Syllables (\uAC00-\uD7A3) & Jamo (\u1100-\u11FF) fontCache.set(fontFamily, cachedFont);
if (/[\uAC00-\uD7A3\u1100-\u11FF]/.test(text)) { return cachedFont;
scripts.add('kor'); }
}
try {
// Chinese: CJK Unified Ideographs (\u4E00-\u9FFF) & Ext A (\u3400-\u4DBF) const fontUrl = resolveFontUrl(fontFamily);
if (/[\u4E00-\u9FFF\u3400-\u4DBF]/.test(text)) {
scripts.add('chi_sim'); const fontResponse = await fetch(fontUrl);
}
if (!fontResponse.ok) {
// Check for Arabic throw new Error(`Failed to fetch font file: ${fontResponse.statusText}`);
if (/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/.test(text)) { }
scripts.add('ara');
} const fontBuffer = await fontResponse.arrayBuffer();
// Check for Devanagari (Hindi, Marathi, etc.) fontCache.set(fontFamily, fontBuffer);
if (/[\u0900-\u097F]/.test(text)) scripts.add('hin'); await saveFontToDB(fontFamily, fontBuffer);
// Check for Bengali return fontBuffer;
if (/[\u0980-\u09FF]/.test(text)) scripts.add('ben'); } catch (error) {
console.warn(
// Check for Tamil `Failed to fetch font for ${lang} (${fontFamily}), falling back to default.`,
if (/[\u0B80-\u0BFF]/.test(text)) scripts.add('tam'); error
);
// Check for Telugu
if (/[\u0C00-\u0C7F]/.test(text)) scripts.add('tel'); if (fontFamily !== 'Noto Sans') {
return await getFontForLanguage('eng');
// Check for Kannada }
if (/[\u0C80-\u0CFF]/.test(text)) scripts.add('kan');
throw error;
// Check for Malayalam }
if (/[\u0D00-\u0D7F]/.test(text)) scripts.add('mal'); }
// Check for Gujarati export function detectScripts(text: string): string[] {
if (/[\u0A80-\u0AFF]/.test(text)) scripts.add('guj'); const scripts = new Set<string>();
// Check for Punjabi (Gurmukhi) // Japanese: Hiragana (\u3040-\u309F) & Katakana (\u30A0-\u30FF)
if (/[\u0A00-\u0A7F]/.test(text)) scripts.add('pan'); if (/[\u3040-\u309F\u30A0-\u30FF]/.test(text)) {
scripts.add('jpn');
// Check for Oriya }
if (/[\u0B00-\u0B7F]/.test(text)) scripts.add('ori');
// Korean: Hangul Syllables (\uAC00-\uD7A3) & Jamo (\u1100-\u11FF)
// Check for Sinhala if (/[\uAC00-\uD7A3\u1100-\u11FF]/.test(text)) {
if (/[\u0D80-\u0DFF]/.test(text)) scripts.add('sin'); scripts.add('kor');
}
// Check for Thai
if (/[\u0E00-\u0E7F]/.test(text)) scripts.add('tha'); // Chinese: CJK Unified Ideographs (\u4E00-\u9FFF) & Ext A (\u3400-\u4DBF)
if (/[\u4E00-\u9FFF\u3400-\u4DBF]/.test(text)) {
// Check for Lao scripts.add('chi_sim');
if (/[\u0E80-\u0EFF]/.test(text)) scripts.add('lao'); }
// Check for Khmer // Check for Arabic
if (/[\u1780-\u17FF]/.test(text)) scripts.add('khm'); if (/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/.test(text)) {
scripts.add('ara');
// Check for Myanmar }
if (/[\u1000-\u109F]/.test(text)) scripts.add('mya');
// Check for Devanagari (Hindi, Marathi, etc.)
// Check for Tibetan if (/[\u0900-\u097F]/.test(text)) scripts.add('hin');
if (/[\u0F00-\u0FFF]/.test(text)) scripts.add('bod');
// Check for Bengali
// Check for Georgian if (/[\u0980-\u09FF]/.test(text)) scripts.add('ben');
if (/[\u10A0-\u10FF]/.test(text)) scripts.add('kat');
// Check for Tamil
// Check for Armenian if (/[\u0B80-\u0BFF]/.test(text)) scripts.add('tam');
if (/[\u0530-\u058F]/.test(text)) scripts.add('hye');
// Check for Telugu
// Check for Hebrew if (/[\u0C00-\u0C7F]/.test(text)) scripts.add('tel');
if (/[\u0590-\u05FF]/.test(text)) scripts.add('heb');
// Check for Kannada
// Check for Ethiopic if (/[\u0C80-\u0CFF]/.test(text)) scripts.add('kan');
if (/[\u1200-\u137F]/.test(text)) scripts.add('amh');
// Check for Malayalam
// Check for Cherokee if (/[\u0D00-\u0D7F]/.test(text)) scripts.add('mal');
if (/[\u13A0-\u13FF]/.test(text)) scripts.add('chr');
// Check for Gujarati
// Check for Syriac if (/[\u0A80-\u0AFF]/.test(text)) scripts.add('guj');
if (/[\u0700-\u074F]/.test(text)) scripts.add('syr');
// Check for Punjabi (Gurmukhi)
if (scripts.size === 0 || /[a-zA-Z]/.test(text)) { if (/[\u0A00-\u0A7F]/.test(text)) scripts.add('pan');
scripts.add('eng');
} // Check for Oriya
if (/[\u0B00-\u0B7F]/.test(text)) scripts.add('ori');
return Array.from(scripts);
} // Check for Sinhala
if (/[\u0D80-\u0DFF]/.test(text)) scripts.add('sin');
export function getLanguageForChar(char: string): string {
const code = char.charCodeAt(0); // Check for Thai
if (/[\u0E00-\u0E7F]/.test(text)) scripts.add('tha');
// Latin (Basic + Supplement + Extended)
if (code <= 0x024F) return 'eng'; // Check for Lao
if (/[\u0E80-\u0EFF]/.test(text)) scripts.add('lao');
// Japanese: Hiragana & Katakana
if ( // Check for Khmer
(code >= 0x3040 && code <= 0x309F) || // Hiragana if (/[\u1780-\u17FF]/.test(text)) scripts.add('khm');
(code >= 0x30A0 && code <= 0x30FF) // Katakana
) return 'jpn'; // Check for Myanmar
if (/[\u1000-\u109F]/.test(text)) scripts.add('mya');
// Korean: Hangul Syllables & Jamo
if ( // Check for Tibetan
(code >= 0xAC00 && code <= 0xD7A3) || // Hangul Syllables if (/[\u0F00-\u0FFF]/.test(text)) scripts.add('bod');
(code >= 0x1100 && code <= 0x11FF) // Hangul Jamo
) return 'kor'; // Check for Georgian
if (/[\u10A0-\u10FF]/.test(text)) scripts.add('kat');
// Chinese: CJK Unified Ideographs (Han)
if ( // Check for Armenian
(code >= 0x4E00 && code <= 0x9FFF) || // CJK Unified if (/[\u0530-\u058F]/.test(text)) scripts.add('hye');
(code >= 0x3400 && code <= 0x4DBF) // CJK Ext A
) return 'chi_sim'; // Check for Hebrew
if (/[\u0590-\u05FF]/.test(text)) scripts.add('heb');
// Arabic
if ((code >= 0x0600 && code <= 0x06FF) || (code >= 0x0750 && code <= 0x077F) || (code >= 0x08A0 && code <= 0x08FF)) return 'ara'; // Check for Ethiopic
if (/[\u1200-\u137F]/.test(text)) scripts.add('amh');
// Devanagari
if (code >= 0x0900 && code <= 0x097F) return 'hin'; // Check for Cherokee
if (/[\u13A0-\u13FF]/.test(text)) scripts.add('chr');
// Bengali
if (code >= 0x0980 && code <= 0x09FF) return 'ben'; // Check for Syriac
if (/[\u0700-\u074F]/.test(text)) scripts.add('syr');
// Tamil
if (code >= 0x0B80 && code <= 0x0BFF) return 'tam'; if (scripts.size === 0 || /[a-zA-Z]/.test(text)) {
scripts.add('eng');
// Telugu }
if (code >= 0x0C00 && code <= 0x0C7F) return 'tel';
return Array.from(scripts);
// Kannada }
if (code >= 0x0C80 && code <= 0x0CFF) return 'kan';
export function getLanguageForChar(char: string): string {
// Malayalam const code = char.charCodeAt(0);
if (code >= 0x0D00 && code <= 0x0D7F) return 'mal';
// Latin (Basic + Supplement + Extended)
// Gujarati if (code <= 0x024f) return 'eng';
if (code >= 0x0A80 && code <= 0x0AFF) return 'guj';
// Japanese: Hiragana & Katakana
// Punjabi (Gurmukhi) if (
if (code >= 0x0A00 && code <= 0x0A7F) return 'pan'; (code >= 0x3040 && code <= 0x309f) || // Hiragana
(code >= 0x30a0 && code <= 0x30ff) // Katakana
// Oriya )
if (code >= 0x0B00 && code <= 0x0B7F) return 'ori'; return 'jpn';
// Sinhala // Korean: Hangul Syllables & Jamo
if (code >= 0x0D80 && code <= 0x0DFF) return 'sin'; if (
(code >= 0xac00 && code <= 0xd7a3) || // Hangul Syllables
// Thai (code >= 0x1100 && code <= 0x11ff) // Hangul Jamo
if (code >= 0x0E00 && code <= 0x0E7F) return 'tha'; )
return 'kor';
// Lao
if (code >= 0x0E80 && code <= 0x0EFF) return 'lao'; // Chinese: CJK Unified Ideographs (Han)
if (
// Khmer (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified
if (code >= 0x1780 && code <= 0x17FF) return 'khm'; (code >= 0x3400 && code <= 0x4dbf) // CJK Ext A
)
// Myanmar return 'chi_sim';
if (code >= 0x1000 && code <= 0x109F) return 'mya';
// Arabic
// Tibetan if (
if (code >= 0x0F00 && code <= 0x0FFF) return 'bod'; (code >= 0x0600 && code <= 0x06ff) ||
(code >= 0x0750 && code <= 0x077f) ||
// Georgian (code >= 0x08a0 && code <= 0x08ff)
if (code >= 0x10A0 && code <= 0x10FF) return 'kat'; )
return 'ara';
// Armenian
if (code >= 0x0530 && code <= 0x058F) return 'hye'; // Devanagari
if (code >= 0x0900 && code <= 0x097f) return 'hin';
// Hebrew
if (code >= 0x0590 && code <= 0x05FF) return 'heb'; // Bengali
if (code >= 0x0980 && code <= 0x09ff) return 'ben';
// Ethiopic
if (code >= 0x1200 && code <= 0x137F) return 'amh'; // Tamil
if (code >= 0x0b80 && code <= 0x0bff) return 'tam';
// Cherokee
if (code >= 0x13A0 && code <= 0x13FF) return 'chr'; // Telugu
if (code >= 0x0c00 && code <= 0x0c7f) return 'tel';
// Syriac
if (code >= 0x0700 && code <= 0x074F) return 'syr'; // Kannada
if (code >= 0x0c80 && code <= 0x0cff) return 'kan';
// Default to English (Latin)
return 'eng'; // Malayalam
} if (code >= 0x0d00 && code <= 0x0d7f) return 'mal';
// Gujarati
if (code >= 0x0a80 && code <= 0x0aff) return 'guj';
// Punjabi (Gurmukhi)
if (code >= 0x0a00 && code <= 0x0a7f) return 'pan';
// Oriya
if (code >= 0x0b00 && code <= 0x0b7f) return 'ori';
// Sinhala
if (code >= 0x0d80 && code <= 0x0dff) return 'sin';
// Thai
if (code >= 0x0e00 && code <= 0x0e7f) return 'tha';
// Lao
if (code >= 0x0e80 && code <= 0x0eff) return 'lao';
// Khmer
if (code >= 0x1780 && code <= 0x17ff) return 'khm';
// Myanmar
if (code >= 0x1000 && code <= 0x109f) return 'mya';
// Tibetan
if (code >= 0x0f00 && code <= 0x0fff) return 'bod';
// Georgian
if (code >= 0x10a0 && code <= 0x10ff) return 'kat';
// Armenian
if (code >= 0x0530 && code <= 0x058f) return 'hye';
// Hebrew
if (code >= 0x0590 && code <= 0x05ff) return 'heb';
// Ethiopic
if (code >= 0x1200 && code <= 0x137f) return 'amh';
// Cherokee
if (code >= 0x13a0 && code <= 0x13ff) return 'chr';
// Syriac
if (code >= 0x0700 && code <= 0x074f) return 'syr';
// Default to English (Latin)
return 'eng';
}

View File

@@ -1,7 +1,6 @@
import Tesseract from 'tesseract.js'; import Tesseract from 'tesseract.js';
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib'; import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
import fontkit from '@pdf-lib/fontkit'; import fontkit from '@pdf-lib/fontkit';
import * as pdfjsLib from 'pdfjs-dist';
import { getFontForLanguage } from './font-loader.js'; import { getFontForLanguage } from './font-loader.js';
import { OcrPage, OcrLine } from '@/types'; import { OcrPage, OcrLine } from '@/types';
import { import {
@@ -10,6 +9,7 @@ import {
calculateSpaceTransform, calculateSpaceTransform,
} from './hocr-transform.js'; } from './hocr-transform.js';
import { getPDFDocument } from './helpers.js'; import { getPDFDocument } from './helpers.js';
import { createConfiguredTesseractWorker } from './tesseract-runtime.js';
export interface OcrOptions { export interface OcrOptions {
language: string; language: string;
@@ -134,11 +134,13 @@ export async function performOcr(
const { language, resolution, binarize, whitelist, onProgress } = options; const { language, resolution, binarize, whitelist, onProgress } = options;
const progress = onProgress || (() => {}); const progress = onProgress || (() => {});
const worker = await Tesseract.createWorker(language, 1, { const worker = await createConfiguredTesseractWorker(
logger: function (m: { status: string; progress: number }) { language,
1,
function (m: { status: string; progress: number }) {
progress(m.status, m.progress || 0); progress(m.status, m.progress || 0);
}, }
}); );
await worker.setParameters({ await worker.setParameters({
tessjs_create_hocr: '1', tessjs_create_hocr: '1',

View File

@@ -0,0 +1,132 @@
import { tesseractLanguages } from '../config/tesseract-languages.js';
/** Name of the build-time env variable that lists the OCR languages this build offers. */
export const TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY =
  'VITE_TESSERACT_AVAILABLE_LANGUAGES' as const;
/** The slice of the Vite env read by the availability helpers; the key may be absent. */
type TesseractAvailabilityEnv = Partial<
  Pick<ImportMetaEnv, typeof TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY>
>;
/** A language code that is a key of the `tesseractLanguages` table. */
export type TesseractLanguageCode = keyof typeof tesseractLanguages;
/** Supplies the Vite build-time environment as the default lookup source. */
function getDefaultEnv(): TesseractAvailabilityEnv {
  const viteEnv: TesseractAvailabilityEnv = import.meta.env;
  return viteEnv;
}
/**
 * Turns a language request into a clean list of language codes.
 *
 * A string is split on `+` and `,`; an array is taken as-is. Every entry is
 * trimmed, blank entries are dropped, and duplicates are removed while the
 * first-seen order is kept.
 *
 * @param value - A `+`/`,`-separated string or an array of codes.
 * @returns The trimmed, de-duplicated codes in their original order.
 */
function normalizeLanguageCodes(value: string | string[]): string[] {
  const parts = typeof value === 'string' ? value.split(/[+,]/) : value;
  // A Set iterates in insertion order, so it de-duplicates without reordering.
  const unique = new Set<string>();
  for (const part of parts) {
    const trimmed = part.trim();
    if (trimmed.length > 0) {
      unique.add(trimmed);
    }
  }
  return Array.from(unique);
}
/**
 * Builds the display text for one language code.
 *
 * @param code - The language code to describe.
 * @returns `"<label> (<code>)"` when `tesseractLanguages` has a label for the
 *   code, otherwise the bare code.
 */
function formatLanguageLabel(code: string): string {
  const displayName = tesseractLanguages[code as TesseractLanguageCode];
  if (!displayName) {
    return code;
  }
  return `${displayName} (${code})`;
}
/**
 * Reads the configured list of available OCR languages.
 *
 * @param env - Environment to read from; defaults to the Vite env.
 * @returns The normalized language codes, or `null` when the variable is
 *   unset or blank.
 */
export function resolveConfiguredTesseractAvailableLanguages(
  env: TesseractAvailabilityEnv = getDefaultEnv()
): string[] | null {
  const rawValue = env.VITE_TESSERACT_AVAILABLE_LANGUAGES?.trim();
  return rawValue ? normalizeLanguageCodes(rawValue) : null;
}
/**
 * Lists the `[code, label]` pairs from `tesseractLanguages` that are available.
 *
 * When no language list is configured, every known entry is returned.
 * Otherwise only entries whose code appears in the configured list are kept,
 * in the table's own order.
 *
 * @param env - Environment to read from; defaults to the Vite env.
 * @returns The available language entries.
 */
export function getAvailableTesseractLanguageEntries(
  env: TesseractAvailabilityEnv = getDefaultEnv()
): Array<[TesseractLanguageCode, string]> {
  const allowedCodes = resolveConfiguredTesseractAvailableLanguages(env);
  const knownEntries = Object.entries(tesseractLanguages) as Array<
    [TesseractLanguageCode, string]
  >;
  if (allowedCodes === null) {
    return knownEntries;
  }
  const allowed = new Set(allowedCodes);
  return knownEntries.filter((entry) => allowed.has(entry[0]));
}
/**
 * Finds which of the requested languages are not in the configured list.
 *
 * @param requestedLanguages - A `+`/`,`-separated string or an array of codes.
 * @param env - Environment to read from; defaults to the Vite env.
 * @returns The requested codes missing from the configured list, in request
 *   order; empty when no list is configured.
 */
export function getUnavailableTesseractLanguages(
  requestedLanguages: string | string[],
  env: TesseractAvailabilityEnv = getDefaultEnv()
): string[] {
  const allowedCodes = resolveConfiguredTesseractAvailableLanguages(env);
  if (allowedCodes === null) {
    return [];
  }
  const missing: string[] = [];
  for (const code of normalizeLanguageCodes(requestedLanguages)) {
    if (!allowedCodes.includes(code)) {
      missing.push(code);
    }
  }
  return missing;
}
/**
 * Renders language codes as a comma-separated, human-readable list.
 *
 * @param codes - The language codes to render.
 * @returns The formatted labels joined with `", "`.
 */
export function formatTesseractLanguageList(codes: string[]): string {
  const labels = codes.map((code) => formatLanguageLabel(code));
  return labels.join(', ');
}
/**
 * Composes the user-facing message for a request that includes unavailable
 * OCR languages.
 *
 * @param unavailableLanguages - Requested codes that are not available.
 * @param availableLanguages - Codes that this build does offer.
 * @returns Three sentences joined by single spaces.
 */
function buildUnsupportedLanguageMessage(
  unavailableLanguages: string[],
  availableLanguages: string[]
): string {
  const missingList = formatTesseractLanguageList(unavailableLanguages);
  const bundledList = formatTesseractLanguageList(availableLanguages);
  const bundledSentence = `This BentoPDF build only bundles OCR data for ${bundledList}.`;
  const missingSentence = `The requested OCR language is not available: ${missingList}.`;
  const remedySentence =
    'Choose one of the bundled languages or rebuild the air-gapped bundle with the missing language added to --ocr-languages.';
  return `${bundledSentence} ${missingSentence} ${remedySentence}`;
}
/**
 * Error raised when OCR is requested for a language that is not in the
 * configured list of available languages.
 */
export class UnsupportedOcrLanguageError extends Error {
  /** Requested language codes that are not available. */
  readonly unavailableLanguages: string[];
  /** Language codes that are available in this build. */
  readonly availableLanguages: string[];
  constructor(unavailableLanguages: string[], availableLanguages: string[]) {
    const message = buildUnsupportedLanguageMessage(
      unavailableLanguages,
      availableLanguages
    );
    super(message);
    this.name = 'UnsupportedOcrLanguageError';
    this.unavailableLanguages = unavailableLanguages;
    this.availableLanguages = availableLanguages;
  }
}
/**
 * Verifies that every requested OCR language is available.
 *
 * Does nothing when no language list is configured.
 *
 * @param requestedLanguages - A `+`/`,`-separated string or an array of codes.
 * @param env - Environment to read from; defaults to the Vite env.
 * @throws {UnsupportedOcrLanguageError} When at least one requested language
 *   is missing from the configured list.
 */
export function assertTesseractLanguagesAvailable(
  requestedLanguages: string | string[],
  env: TesseractAvailabilityEnv = getDefaultEnv()
): void {
  const bundled = resolveConfiguredTesseractAvailableLanguages(env);
  if (bundled === null) {
    return;
  }
  const missing = getUnavailableTesseractLanguages(requestedLanguages, env);
  if (missing.length === 0) {
    return;
  }
  throw new UnsupportedOcrLanguageError(missing, bundled);
}

View File

@@ -0,0 +1,130 @@
import Tesseract from 'tesseract.js';
import {
assertTesseractLanguagesAvailable,
TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
} from './tesseract-language-availability.js';
/** Env variables that point at self-hosted Tesseract assets (worker, core, language data). */
const TESSERACT_ENV_KEYS = [
  'VITE_TESSERACT_WORKER_URL',
  'VITE_TESSERACT_CORE_URL',
  'VITE_TESSERACT_LANG_URL',
] as const;
/** The asset URL keys plus the available-languages key. */
const TESSERACT_RUNTIME_ENV_KEYS = [
  ...TESSERACT_ENV_KEYS,
  TESSERACT_AVAILABLE_LANGUAGES_ENV_KEY,
] as const;
/** Union of the env key names listed in `TESSERACT_RUNTIME_ENV_KEYS`. */
type TesseractRuntimeEnvKey = (typeof TESSERACT_RUNTIME_ENV_KEYS)[number];
/** The slice of the Vite env this module reads; every key may be absent. */
export type TesseractAssetEnv = Partial<
  Pick<ImportMetaEnv, TesseractRuntimeEnvKey>
>;
/** Normalized asset locations; a field is undefined when its env variable is unset or blank. */
export interface TesseractAssetConfig {
  workerPath?: string;
  corePath?: string;
  langPath?: string;
}
// Local aliases for the Tesseract.js types used by this module's API.
export type TesseractLoggerMessage = Tesseract.LoggerMessage;
export type TesseractWorkerOptions = Partial<Tesseract.WorkerOptions>;
export type TesseractWorker = Tesseract.Worker;
/** Supplies the Vite build-time environment as the default source of asset settings. */
function getDefaultTesseractAssetEnv(): TesseractAssetEnv {
  const viteEnv: TesseractAssetEnv = import.meta.env;
  return viteEnv;
}
/**
 * Cleans up a directory URL taken from configuration.
 *
 * @param url - Raw configured value, possibly missing or padded with spaces.
 * @returns `undefined` when the value is missing or blank; otherwise the
 *   trimmed value with all trailing slashes removed.
 */
function normalizeDirectoryUrl(url?: string): string | undefined {
  const candidate = (url ?? '').trim();
  return candidate === '' ? undefined : candidate.replace(/\/+$/, '');
}
/**
 * Cleans up a file URL taken from configuration.
 *
 * @param url - Raw configured value, possibly missing or padded with spaces.
 * @returns `undefined` when the value is missing or blank; otherwise the
 *   trimmed value with all trailing slashes removed.
 */
function normalizeFileUrl(url?: string): string | undefined {
  if (url == null) {
    return undefined;
  }
  const withoutPadding = url.trim();
  if (withoutPadding.length === 0) {
    return undefined;
  }
  return withoutPadding.replace(/\/+$/, '');
}
/**
 * Reads the three Tesseract asset URLs from the environment and normalizes them.
 *
 * @param env - Environment to read from; defaults to the Vite env.
 * @returns The worker, core, and language-data locations; a field is
 *   `undefined` when its variable is unset or blank.
 */
export function resolveTesseractAssetConfig(
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): TesseractAssetConfig {
  const workerPath = normalizeFileUrl(env.VITE_TESSERACT_WORKER_URL);
  const corePath = normalizeDirectoryUrl(env.VITE_TESSERACT_CORE_URL);
  const langPath = normalizeDirectoryUrl(env.VITE_TESSERACT_LANG_URL);
  return { workerPath, corePath, langPath };
}
/**
 * Reports whether at least one Tesseract asset location is configured.
 *
 * @param config - Asset configuration; resolved from the default env if omitted.
 * @returns `true` when any of the worker, core, or language paths is set.
 */
export function hasConfiguredTesseractOverrides(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): boolean {
  const { workerPath, corePath, langPath } = config;
  return [workerPath, corePath, langPath].some((path) => Boolean(path));
}
/**
 * Reports whether all three Tesseract asset locations are configured.
 *
 * @param config - Asset configuration; resolved from the default env if omitted.
 * @returns `true` only when the worker, core, and language paths are all set.
 */
export function hasCompleteTesseractOverrides(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): boolean {
  const { workerPath, corePath, langPath } = config;
  return [workerPath, corePath, langPath].every((path) => Boolean(path));
}
/**
 * Names the asset env variables that are still missing from a partial
 * configuration.
 *
 * @param config - Asset configuration; resolved from the default env if omitted.
 * @returns An empty list when nothing is configured; otherwise the env keys
 *   whose path is unset, in `TESSERACT_ENV_KEYS` order.
 */
export function getIncompleteTesseractOverrideKeys(
  config: TesseractAssetConfig = resolveTesseractAssetConfig()
): Array<(typeof TESSERACT_ENV_KEYS)[number]> {
  if (!hasConfiguredTesseractOverrides(config)) {
    return [];
  }
  // Pair each env key with the config field it populates.
  const pathByEnvKey: Record<
    (typeof TESSERACT_ENV_KEYS)[number],
    string | undefined
  > = {
    VITE_TESSERACT_WORKER_URL: config.workerPath,
    VITE_TESSERACT_CORE_URL: config.corePath,
    VITE_TESSERACT_LANG_URL: config.langPath,
  };
  return TESSERACT_ENV_KEYS.filter((key) => !pathByEnvKey[key]);
}
/**
 * Builds the options object passed to `Tesseract.createWorker`.
 *
 * With no asset URLs configured, the result carries only the logger (if any).
 * With all three configured, it also carries the asset paths and `gzip: true`.
 *
 * @param logger - Optional progress logger to forward to the worker.
 * @param env - Environment to read from; defaults to the Vite env.
 * @returns The worker options.
 * @throws {Error} When only some of the three asset URLs are configured.
 */
export function buildTesseractWorkerOptions(
  logger?: TesseractWorkerOptions['logger'],
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): TesseractWorkerOptions {
  const assetConfig = resolveTesseractAssetConfig(env);
  const loggerOptions: TesseractWorkerOptions = logger ? { logger } : {};
  if (!hasConfiguredTesseractOverrides(assetConfig)) {
    return loggerOptions;
  }
  if (hasCompleteTesseractOverrides(assetConfig)) {
    const { workerPath, corePath, langPath } = assetConfig;
    return {
      ...loggerOptions,
      workerPath,
      corePath,
      langPath,
      gzip: true,
    };
  }
  // Partial configuration: fail fast and name the variables still missing.
  const missing = getIncompleteTesseractOverrideKeys(assetConfig).join(', ');
  throw new Error(
    `Self-hosted OCR assets are partially configured. Set ${missing} together with the other Tesseract asset URLs.`
  );
}
/**
 * Creates a Tesseract worker using the configured asset locations.
 *
 * The requested languages are checked against the configured availability
 * list before any worker is created.
 *
 * @param language - Language code(s) to load, as accepted by Tesseract.js.
 * @param oem - OCR engine mode forwarded to `Tesseract.createWorker`.
 * @param logger - Optional progress logger.
 * @param env - Environment to read from; defaults to the Vite env.
 * @returns The created worker.
 * @throws {UnsupportedOcrLanguageError} When a requested language is not available.
 * @throws {Error} When the asset URLs are only partially configured.
 */
export async function createConfiguredTesseractWorker(
  language: string,
  oem: Tesseract.OEM,
  logger?: TesseractWorkerOptions['logger'],
  env: TesseractAssetEnv = getDefaultTesseractAssetEnv()
): Promise<TesseractWorker> {
  assertTesseractLanguagesAvailable(language, env);
  const workerOptions = buildTesseractWorkerOptions(logger, env);
  return Tesseract.createWorker(language, oem, workerOptions);
}

View File

@@ -214,6 +214,10 @@
>None</span >None</span
> >
</p> </p>
<p
id="lang-availability-note"
class="hidden text-xs text-amber-300 mt-2"
></p>
</div> </div>
<!-- Advanced settings --> <!-- Advanced settings -->

View File

@@ -0,0 +1,81 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
// Create the mock through vi.hoisted so it can be referenced from the vi.mock factory below.
const { createConfiguredTesseractWorker } = vi.hoisted(() => ({
  createConfiguredTesseractWorker: vi.fn(),
}));
// Stand-in worker exposing only the methods these tests exercise.
const mockWorker = {
  recognize: vi.fn(),
  terminate: vi.fn(),
};
// Replace the runtime module so no real Tesseract worker is created.
vi.mock('../../js/utils/tesseract-runtime', () => ({
  createConfiguredTesseractWorker,
}));
import { recognizePageCanvas } from '../../js/compare/engine/ocr-page';
// Covers recognizePageCanvas: worker creation, result mapping, progress forwarding, cleanup.
describe('compare OCR page recognition', () => {
  beforeEach(() => {
    // Reset call history and implementations, then make the factory resolve to the stub worker.
    createConfiguredTesseractWorker.mockReset();
    mockWorker.recognize.mockReset();
    mockWorker.terminate.mockReset();
    createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
  });
  it('uses the configured Tesseract worker and maps OCR words into compare text items', async () => {
    const progress = vi.fn();
    // Minimal canvas stand-in; only width and height are provided.
    const canvas = {
      width: 300,
      height: 150,
    } as HTMLCanvasElement;
    mockWorker.recognize.mockResolvedValue({
      data: {
        words: [
          {
            text: 'Hello',
            bbox: { x0: 10, y0: 20, x1: 60, y1: 40 },
          },
          {
            text: 'world',
            bbox: { x0: 70, y0: 20, x1: 120, y1: 40 },
          },
        ],
      },
    });
    const model = await recognizePageCanvas(canvas, 'eng', progress);
    expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
      'eng',
      1,
      expect.any(Function)
    );
    expect(mockWorker.recognize).toHaveBeenCalledWith(canvas);
    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
    expect(model.source).toBe('ocr');
    expect(model.hasText).toBe(true);
    expect(model.plainText).toContain('Hello');
    // Two OCR words in, one text item out — presumably merged into one line; confirm against ocr-page.
    expect(model.textItems).toHaveLength(1);
    // Invoke the logger handed to the factory to check it forwards to the progress callback.
    const logger = createConfiguredTesseractWorker.mock
      .calls[0][2] as (message: { status: string; progress: number }) => void;
    logger({ status: 'recognizing text', progress: 0.5 });
    expect(progress).toHaveBeenCalledWith('recognizing text', 0.5);
  });
  it('terminates the worker when compare OCR fails', async () => {
    const canvas = {
      width: 300,
      height: 150,
    } as HTMLCanvasElement;
    mockWorker.recognize.mockRejectedValueOnce(new Error('compare ocr failed'));
    await expect(recognizePageCanvas(canvas, 'eng')).rejects.toThrow(
      'compare ocr failed'
    );
    // The worker must still be terminated on the failure path.
    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
  });
});

View File

@@ -0,0 +1,28 @@
import { describe, expect, it } from 'vitest';
import { getFontAssetFileName } from '../js/config/font-mappings';
import { resolveFontUrl } from '../js/utils/font-loader';
// Covers font URL resolution with and without a self-hosted font base URL.
describe('font-loader', () => {
  it('uses the default public font URL when no offline font base URL is configured', () => {
    // An empty env object means no VITE_OCR_FONT_BASE_URL is set.
    expect(resolveFontUrl('Noto Sans', {})).toBe(
      'https://raw.githack.com/googlefonts/noto-fonts/main/hinted/ttf/NotoSans/NotoSans-Regular.ttf'
    );
  });
  it('builds a self-hosted font URL when an OCR font base URL is configured', () => {
    // The base URL ends with a slash; the expected result contains no doubled slash.
    expect(
      resolveFontUrl('Noto Sans Arabic', {
        VITE_OCR_FONT_BASE_URL: 'https://internal.example.com/wasm/ocr/fonts/',
      })
    ).toBe(
      'https://internal.example.com/wasm/ocr/fonts/NotoSansArabic-Regular.ttf'
    );
  });
  it('derives the bundled font asset file name from the default font URL', () => {
    expect(getFontAssetFileName('Noto Sans SC')).toBe(
      'NotoSansCJKsc-Regular.otf'
    );
  });
});

185
src/tests/ocr.test.ts Normal file
View File

@@ -0,0 +1,185 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Create the mocks through vi.hoisted so they can be referenced from the vi.mock factories below.
const {
  createConfiguredTesseractWorker,
  getPDFDocument,
  getFontForLanguage,
  parseHocrDocument,
} = vi.hoisted(() => ({
  createConfiguredTesseractWorker: vi.fn(),
  getPDFDocument: vi.fn(),
  getFontForLanguage: vi.fn(),
  parseHocrDocument: vi.fn(),
}));
// Stand-in Tesseract worker exposing only the methods these tests exercise.
const mockWorker = {
  setParameters: vi.fn(),
  recognize: vi.fn(),
  terminate: vi.fn(),
};
// Stand-in source page: fixed 200x100 viewport and a render that resolves immediately.
const mockPdfPage = {
  getViewport: vi.fn(() => ({ width: 200, height: 100 })),
  render: vi.fn(() => ({ promise: Promise.resolve() })),
};
// Stand-in output page returned by mockPdfDoc.addPage.
const mockPdfOutputPage = {
  drawImage: vi.fn(),
  drawText: vi.fn(),
};
// Stand-in output document returned by the mocked PDFDocument.create.
const mockPdfDoc = {
  registerFontkit: vi.fn(),
  embedFont: vi.fn(async () => ({ widthOfTextAtSize: vi.fn(() => 12) })),
  addPage: vi.fn(() => mockPdfOutputPage),
  embedPng: vi.fn(async () => ({ id: 'png' })),
  save: vi.fn(async () => new Uint8Array([1, 2, 3])),
};
// Replace every collaborator module imported by ../js/utils/ocr with the stubs above.
vi.mock('../js/utils/tesseract-runtime', () => ({
  createConfiguredTesseractWorker,
}));
vi.mock('../js/utils/helpers.js', () => ({
  getPDFDocument,
}));
vi.mock('../js/utils/font-loader.js', () => ({
  getFontForLanguage,
}));
vi.mock('../js/utils/hocr-transform.js', () => ({
  parseHocrDocument,
  calculateWordTransform: vi.fn(),
  calculateSpaceTransform: vi.fn(),
}));
// mockPdfDoc is read only when create() is called, not when the factory runs.
vi.mock('pdf-lib', () => ({
  PDFDocument: {
    create: vi.fn(async () => mockPdfDoc),
  },
  StandardFonts: {
    Helvetica: 'Helvetica',
  },
  rgb: vi.fn(() => ({ r: 0, g: 0, b: 0 })),
}));
vi.mock('@pdf-lib/fontkit', () => ({
  default: {},
}));
import { performOcr } from '../js/utils/ocr';
// Covers performOcr: worker creation via the configured runtime, parameters, and cleanup.
describe('performOcr', () => {
  // Keep the real implementations so afterEach can restore them.
  const originalCreateElement = document.createElement.bind(document);
  const originalFileReader = globalThis.FileReader;
  beforeEach(() => {
    // mockReset clears history and implementations; these mocks are re-primed below.
    createConfiguredTesseractWorker.mockReset();
    getPDFDocument.mockReset();
    getFontForLanguage.mockReset();
    parseHocrDocument.mockReset();
    mockWorker.setParameters.mockReset();
    mockWorker.recognize.mockReset();
    mockWorker.terminate.mockReset();
    // mockClear keeps the implementations defined at module level and only drops call history.
    mockPdfPage.getViewport.mockClear();
    mockPdfPage.render.mockClear();
    mockPdfOutputPage.drawImage.mockClear();
    mockPdfOutputPage.drawText.mockClear();
    mockPdfDoc.registerFontkit.mockClear();
    mockPdfDoc.embedFont.mockClear();
    mockPdfDoc.addPage.mockClear();
    mockPdfDoc.embedPng.mockClear();
    mockPdfDoc.save.mockClear();
    createConfiguredTesseractWorker.mockResolvedValue(mockWorker);
    // Single-page source document that always yields mockPdfPage.
    getPDFDocument.mockReturnValue({
      promise: Promise.resolve({
        numPages: 1,
        getPage: vi.fn(async () => mockPdfPage),
      }),
    });
    getFontForLanguage.mockResolvedValue(new Uint8Array([1, 2, 3]));
    mockWorker.recognize.mockResolvedValue({
      data: {
        text: 'Recognized text',
        hocr: '',
      },
    });
    // Swap canvas creation for a stub; every other tag goes to the real createElement.
    document.createElement = ((tagName: string) => {
      if (tagName !== 'canvas') {
        return originalCreateElement(tagName);
      }
      return {
        width: 0,
        height: 0,
        getContext: vi.fn(() => ({
          canvas: { width: 200, height: 100 },
          getImageData: vi.fn(() => ({ data: new Uint8ClampedArray(4) })),
          putImageData: vi.fn(),
        })),
        // Invokes the callback synchronously with a small PNG-typed blob.
        toBlob: vi.fn((callback: (blob: Blob) => void) => {
          callback(
            new Blob([new Uint8Array([1, 2, 3])], { type: 'image/png' })
          );
        }),
      } as unknown as HTMLCanvasElement;
    }) as typeof document.createElement;
    // FileReader stub: result is preset and onload fires synchronously from readAsArrayBuffer.
    globalThis.FileReader = class {
      result: ArrayBuffer = new Uint8Array([1, 2, 3]).buffer;
      onload: null | (() => void) = null;
      onerror: null | (() => void) = null;
      readAsArrayBuffer() {
        this.onload?.();
      }
    } as unknown as typeof FileReader;
  });
  afterEach(() => {
    // Undo the global overrides installed in beforeEach.
    document.createElement = originalCreateElement;
    globalThis.FileReader = originalFileReader;
  });
  it('uses the configured Tesseract worker and terminates it after OCR completes', async () => {
    const result = await performOcr(new Uint8Array([1, 2, 3]), {
      language: 'eng',
      resolution: 2,
      binarize: false,
      whitelist: '',
    });
    expect(createConfiguredTesseractWorker).toHaveBeenCalledWith(
      'eng',
      1,
      expect.any(Function)
    );
    expect(mockWorker.setParameters).toHaveBeenCalledWith({
      tessjs_create_hocr: '1',
      tessedit_pageseg_mode: '3',
    });
    expect(mockWorker.recognize).toHaveBeenCalledTimes(1);
    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
    expect(result.fullText).toContain('Recognized text');
    expect(result.pdfBytes).toBeInstanceOf(Uint8Array);
  });
  it('terminates the Tesseract worker when OCR fails', async () => {
    mockWorker.recognize.mockRejectedValueOnce(new Error('ocr failed'));
    await expect(
      performOcr(new Uint8Array([1, 2, 3]), {
        language: 'eng',
        resolution: 2,
        binarize: false,
        whitelist: '',
      })
    ).rejects.toThrow('ocr failed');
    // The worker must still be terminated on the failure path.
    expect(mockWorker.terminate).toHaveBeenCalledTimes(1);
  });
});

View File

@@ -0,0 +1,128 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
// Create the mock through vi.hoisted so it can be referenced from the vi.mock factory below.
const { createWorker } = vi.hoisted(() => ({
  createWorker: vi.fn(),
}));
// Replace tesseract.js's default export so no real worker is created.
vi.mock('tesseract.js', () => ({
  default: {
    createWorker,
  },
}));
import {
buildTesseractWorkerOptions,
createConfiguredTesseractWorker,
getIncompleteTesseractOverrideKeys,
hasCompleteTesseractOverrides,
hasConfiguredTesseractOverrides,
resolveTesseractAssetConfig,
} from '../js/utils/tesseract-runtime';
import {
assertTesseractLanguagesAvailable,
getAvailableTesseractLanguageEntries,
getUnavailableTesseractLanguages,
UnsupportedOcrLanguageError,
} from '../js/utils/tesseract-language-availability';
// Covers asset URL resolution, worker option building, and language availability checks.
describe('tesseract-runtime', () => {
  beforeEach(() => {
    createWorker.mockReset();
  });
  it('normalizes self-hosted OCR asset URLs', () => {
    // Every input carries a trailing slash; the expected values have it stripped.
    const config = resolveTesseractAssetConfig({
      VITE_TESSERACT_WORKER_URL:
        'https://internal.example.com/ocr/worker.min.js/',
      VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core/',
      VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data/',
    });
    expect(config).toEqual({
      workerPath: 'https://internal.example.com/ocr/worker.min.js',
      corePath: 'https://internal.example.com/ocr/core',
      langPath: 'https://internal.example.com/ocr/lang-data',
    });
    expect(hasConfiguredTesseractOverrides(config)).toBe(true);
    expect(hasCompleteTesseractOverrides(config)).toBe(true);
  });
  it('returns logger-only options when no self-hosted OCR assets are configured', () => {
    const logger = vi.fn();
    expect(buildTesseractWorkerOptions(logger, {})).toEqual({ logger });
    expect(
      hasConfiguredTesseractOverrides(resolveTesseractAssetConfig({}))
    ).toBe(false);
  });
  it('throws on partial OCR asset configuration', () => {
    // The language-data URL is deliberately left out.
    const env = {
      VITE_TESSERACT_WORKER_URL:
        'https://internal.example.com/ocr/worker.min.js',
      VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
    };
    expect(
      getIncompleteTesseractOverrideKeys(resolveTesseractAssetConfig(env))
    ).toEqual(['VITE_TESSERACT_LANG_URL']);
    expect(() => buildTesseractWorkerOptions(undefined, env)).toThrow(
      'Self-hosted OCR assets are partially configured'
    );
  });
  it('passes configured OCR asset URLs to Tesseract.createWorker', async () => {
    const logger = vi.fn();
    createWorker.mockResolvedValue({ id: 'worker' });
    await createConfiguredTesseractWorker('eng', 1, logger, {
      VITE_TESSERACT_WORKER_URL:
        'https://internal.example.com/ocr/worker.min.js',
      VITE_TESSERACT_CORE_URL: 'https://internal.example.com/ocr/core',
      VITE_TESSERACT_LANG_URL: 'https://internal.example.com/ocr/lang-data',
    });
    expect(createWorker).toHaveBeenCalledWith('eng', 1, {
      logger,
      workerPath: 'https://internal.example.com/ocr/worker.min.js',
      corePath: 'https://internal.example.com/ocr/core',
      langPath: 'https://internal.example.com/ocr/lang-data',
      gzip: true,
    });
  });
  it('filters OCR language entries when the build restricts bundled languages', () => {
    expect(
      getAvailableTesseractLanguageEntries({
        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
      })
    ).toEqual([
      ['eng', 'English'],
      ['deu', 'German'],
    ]);
  });
  it('reports unavailable OCR languages for restricted air-gap builds', () => {
    // The request is '+'-separated while the configured list is ','-separated.
    expect(
      getUnavailableTesseractLanguages('eng+fra', {
        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
      })
    ).toEqual(['fra']);
    expect(() =>
      assertTesseractLanguagesAvailable('eng+fra', {
        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
      })
    ).toThrow(UnsupportedOcrLanguageError);
  });
  it('blocks worker creation when OCR requests an unbundled language', async () => {
    await expect(
      createConfiguredTesseractWorker('fra', 1, undefined, {
        VITE_TESSERACT_AVAILABLE_LANGUAGES: 'eng,deu',
      })
    ).rejects.toThrow('This BentoPDF build only bundles OCR data for');
    // The availability check must reject before Tesseract is ever asked for a worker.
    expect(createWorker).not.toHaveBeenCalled();
  });
});

View File

@@ -1 +1,15 @@
/// <reference types="vite/client" />
/** Optional build-time variables for self-hosted OCR assets, declared on the Vite env type. */
interface ImportMetaEnv {
  /** URL of the Tesseract worker script. */
  readonly VITE_TESSERACT_WORKER_URL?: string;
  /** URL of the directory holding the Tesseract core files. */
  readonly VITE_TESSERACT_CORE_URL?: string;
  /** URL of the directory holding the language data. */
  readonly VITE_TESSERACT_LANG_URL?: string;
  /** List of OCR language codes offered by this build, separated by `,` or `+`. */
  readonly VITE_TESSERACT_AVAILABLE_LANGUAGES?: string;
  /** Base URL for self-hosted OCR fonts. */
  readonly VITE_OCR_FONT_BASE_URL?: string;
}
/** Types `import.meta.env` with the declarations above. */
interface ImportMeta {
  readonly env: ImportMetaEnv;
}
declare const __SIMPLE_MODE__: boolean; declare const __SIMPLE_MODE__: boolean;